In [15]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn.metrics import precision_recall_fscore_support, roc_auc_score
from sklearn.decomposition import PCA
import warnings
warnings.filterwarnings('ignore')

In [16]:
df=pd.read_csv('D:\ds_intern\credit card-AssignmentData.csv')

In [17]:
df.drop_duplicates

<bound method DataFrame.drop_duplicates of           Time        V1        V2        V3        V4        V5        V6  \
0            0 -1.359807 -0.072781  2.536347  1.378155 -0.338321  0.462388   
1            0  1.191857  0.266151  0.166480  0.448154  0.060018 -0.082361   
2            1 -1.358354 -1.340163  1.773209  0.379780 -0.503198  1.800499   
3            1 -0.966272 -0.185226  1.792993 -0.863291 -0.010309  1.247203   
4            2 -1.158233  0.877737  1.548718  0.403034 -0.407193  0.095921   
...        ...       ...       ...       ...       ...       ...       ...   
284802  169142 -1.927883  1.125653 -4.518331  1.749293 -1.566487 -2.010494   
284803  169347  1.378559  1.289381 -5.004247  1.411850  0.442581 -1.326536   
284804  169351 -0.676143  1.126366 -2.213700  0.468308 -1.120541 -0.003346   
284805  169966 -3.113832  0.585864 -5.399730  1.817092 -0.840618 -2.943548   
284806  170348  1.991976  0.158476 -2.583441  0.408670  1.151147 -0.096695   

                  V7

In [18]:
df_cleaned = df.copy()
for col in ['V2', 'V7', 'V9', 'V24']:
    df_cleaned[col] = pd.to_numeric(df_cleaned[col], errors='coerce')

In [19]:
df_cleaned.dropna(inplace=True)

In [20]:
scaler = StandardScaler()
df_cleaned[['Time', 'Amount']] = scaler.fit_transform(df_cleaned[['Time', 'Amount']])

In [21]:
pca_features = df_cleaned.drop(columns=['Class']).values

In [22]:
pca = PCA(n_components=2)

In [23]:
pca_transformed = pca.fit_transform(pca_features)

In [24]:
pca_df = pd.DataFrame(data=pca_transformed, columns=['PC1', 'PC2'])
pca_df['Class'] = df_cleaned['Class']

In [25]:
pca_df.dropna(inplace=True)

In [26]:
X = pca_df.drop(columns=['Class']).values
y = pca_df['Class'].values  

In [27]:
# 1. Isolation Forest
iso_forest = IsolationForest(contamination=0.0017, random_state=42)  # Contamination roughly equals to the proportion of frauds
iso_forest.fit(X)
y_pred_iso = iso_forest.predict(X)
y_pred_iso = [1 if pred == -1 else 0 for pred in y_pred_iso]  # Anomalies are labeled as -1

In [28]:
# 2. Local Outlier Factor (LOF)
lof = LocalOutlierFactor(n_neighbors=20, contamination=0.0017,novelty=True)
lof = lof.fit(X)
y_pred_lof=lof.predict(X)
y_pred_lof = [1 if pred == -1 else 0 for pred in y_pred_lof]  # Anomalies are labeled as -1

In [30]:
import pickle

In [31]:
pickle.dump(lof,open('lof.pkl','wb'))

In [32]:
pickle.dump(iso_forest,open('iso_forest.pkl','wb'))

In [33]:
with open("D:/ds_intern/scaler.pkl", 'wb') as file:
    pickle.dump(scaler, file)

with open("D:/ds_intern/pca.pkl", 'wb') as file:
    pickle.dump(pca, file)