In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import pyplot
import seaborn as sns

from numpy import quantile
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn import linear_model
from sklearn.linear_model import SGDOneClassSVM

from sklearn.metrics import classification_report,accuracy_score
from sklearn.metrics import confusion_matrix,f1_score

In [None]:
# Colab-only step: mount Google Drive at /content/drive so later cells can read files from it.
# NOTE(review): the output recorded for this cell is "ValueError: mount failed" —
# confirm the mount succeeds before running the cells that read from the drive.
from google.colab import drive
drive.mount('/content/drive')

ValueError: mount failed

In [None]:
!ls /content/drive/My\ Drive/creditcard.csv

In [None]:
df = pd.read_csv('/content/drive/My Drive/creditcard.csv', on_bad_lines='skip') # Use on_bad_lines='skip' to replace the deprecated 'error_bad_lines=False'

In [None]:
df.head()

In [None]:
#shape of data
df.shape

In [None]:
df.info()

In [None]:
df.isnull().sum()

In [None]:
df['Class'].value_counts()

In [None]:
# Filter the DataFrame to create df_fraud and df_normal
df_fraud = df[df['Class'] == 1]
df_normal = df[df['Class'] == 0]

In [None]:
outlier_fraction = len(df_fraud)/float(len(df_normal))

In [None]:
print(outlier_fraction)
print("Fraud Cases : {}".format(len(df_fraud)))
print("Normal Cases : {}".format(len(df_normal)))

In [None]:
# Rows per class. Series.value_counts replaces the deprecated top-level
# pd.value_counts; sort_index() pins the bar order to the label value (0, then 1)
# so the hard-coded tick labels below always sit under the matching bar.
classes = df['Class'].value_counts().sort_index()
classes.plot(kind='bar', rot=0)
plt.title("Transaction Class Distribution")
plt.xticks(range(2), labels=["Normal", "Fraud"])
plt.xlabel("Class")
plt.ylabel("Frequency");

In [None]:
df[['Class', 'Amount', 'Time', 'V1', 'V2', 'V3', 'V4', 'V5']].describe()

In [None]:
df.hist(figsize=(20, 15), bins=50);

In [None]:
df['Time'] = df['Time'].apply(lambda x : x / 3600)

In [None]:
sns.distplot(df['Time'])
plt.title("Distribution of Time")
sns.despine();

In [None]:
df_normal = df[df['Class']==0]
df_fraud = df[df['Class']==1]

In [None]:
print(df_normal.Amount.describe())
print('----------------')
print(df_fraud.Amount.describe())

In [None]:
# 100 evenly spaced bin edges from 100 to 2000; amounts outside that range are not drawn.
bins = np.linspace(100, 2000, 100)
plt.figure(figsize=(12, 6))
plt.hist(df_normal.Amount, bins, alpha=1, density=True, label='Normal')
plt.hist(df_fraud.Amount, bins, alpha=0.6, density=True, label='Fraud', color='crimson')
plt.legend(loc='upper right')
# Raw string: "\$" is an invalid escape sequence in a normal literal and triggers a
# warning on recent Python versions. The runtime text is unchanged — matplotlib
# still receives the backslash, so the dollar sign is not parsed as mathtext.
plt.title(r"Amount by percentage of transactions (transactions over \$100)")
plt.xlabel("Transaction amount (USD)")
# NOTE(review): density=True draws a normalised density, not a percentage — confirm the label wording.
plt.ylabel("Percentage of transactions (%)");
plt.show()

In [None]:
# Overlay the Time distributions of both classes on shared bin edges spanning 0 to 50.
bins = np.linspace(0, 50, 50)
fig, ax = plt.subplots(figsize=(12, 6))
for frame, name, colour in ((df_normal, 'Normal', 'blue'), (df_fraud, 'Fraud', 'crimson')):
    ax.hist(frame['Time'], bins, alpha=0.6, density=True, label=name, color=colour)
ax.legend(loc='upper right')
ax.set_title("Percentage of transactions by hour")
ax.set_xlabel("Transaction time from first transaction in the dataset (hours)")
ax.set_ylabel("Percentage of transactions (%)")
plt.show()

In [None]:
# Amount against Time for each class; fraud is drawn second so it sits on top.
fig, ax = plt.subplots(figsize=(12, 6))
ax.scatter(df_normal['Time'], df_normal['Amount'], alpha=0.6, label='Normal')
ax.scatter(df_fraud['Time'], df_fraud['Amount'], alpha=0.9, label='Fraud')
ax.set_title("Amount of transaction by hour")
ax.set_xlabel("Transaction time from first transaction in the dataset (hours)")
ax.set_ylabel('Amount (USD)')
ax.legend(loc='upper right')
plt.show()

In [None]:
# Pairwise correlation of every column, shown as an annotated heatmap.
corr = df.corr()

heatmap_options = dict(annot=True, linewidth=0.5, fmt='0.1f', cmap='coolwarm')
plt.figure(figsize=(20, 8))
ax = sns.heatmap(corr.round(2), **heatmap_options)
# Reuse the x-axis extent, reversed, as the y-axis limits (the matrix is square).
x_low, x_high = sorted(ax.get_xlim())
ax.set_ylim(x_high, x_low)
ax.set_title("Correlation Matrix")
plt.show()

In [None]:
df_norm = df.copy()

In [None]:
df_norm['Time'] = StandardScaler().fit_transform(df_norm['Time'].values.reshape(-1, 1))
df_norm['Amount'] = StandardScaler().fit_transform(df_norm['Amount'].values.reshape(-1, 1))

In [None]:
train, test = train_test_split(df_norm, test_size=0.3, random_state = 10)

In [None]:
X_train = train[train['Class'] == 0]
X_train = X_train.drop(['Class'], axis=1)

X_test = test.drop(['Class'], axis=1)
y_test = test['Class']

In [None]:
# Report the sizes of the matrices that go into the models.
print('X_train shape: {};'.format(X_train.shape))
print('X_test shape: {}; y_test shape: {}'.format(X_test.shape, y_test.shape))

In [None]:
model_iF = IsolationForest(n_estimators=20, max_samples='auto',
                           contamination=0.01, random_state=42, verbose=2)
model_iF.fit(X_train)

In [None]:
y_pred = model_iF.predict(X_test)

In [None]:
y_pred[y_pred == 1] = 0
y_pred[y_pred == -1] = 1

In [None]:
print("Accuracy Score :")
print(accuracy_score(y_test, y_pred))
print("Classification Report :")
print(classification_report(y_test, y_pred))

In [None]:
f1_score(y_test, y_pred)

In [None]:
if_cm = confusion_matrix(y_test, y_pred)

In [None]:
def confusion_matrix_plot(cm):
    """Draw a 2x2 confusion matrix as an annotated heatmap.

    Parameters
    ----------
    cm : array-like of shape (2, 2)
        Counts with true classes on the rows and predicted classes on the
        columns, in the order (normal, fraud) on both axes.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    row_names = ['True Normal', 'True Fraud']
    col_names = ['Pred Normal', 'Pred Fraud']
    table = pd.DataFrame(cm, index=row_names, columns=col_names)

    plt.figure(figsize=(6, 4))
    ax = sns.heatmap(table, annot=True, fmt='g', annot_kws={"size": 16})
    # Reuse the x-axis extent, reversed, as the y-axis limits (the matrix is square).
    x_low, x_high = sorted(ax.get_xlim())
    ax.set_ylim(x_high, x_low)
    plt.show()

In [None]:
# Heatmap of the Isolation Forest confusion matrix.
confusion_matrix_plot(cm=if_cm)

In [None]:
model_lf = LocalOutlierFactor(n_neighbors=2, contamination=0.1)
model_lf.fit(X_train)

In [None]:
pred_y = model_lf.fit_predict(X_test)

scores_pred = model_lf.negative_outlier_factor_

In [None]:
scores_pred

In [None]:
thresh = quantile(scores_pred, .03)
print(thresh)

In [None]:
pred_y[pred_y == 1] = 0
pred_y[pred_y == -1] = 1

In [None]:
print("Accuracy Score :")
print(accuracy_score(y_test, pred_y))
print("Classification Report :")
print(classification_report(y_test, pred_y))

In [None]:
lf_cm = confusion_matrix(y_test, pred_y)

In [None]:
confusion_matrix_plot(lf_cm)

In [None]:
model_sgd = SGDOneClassSVM(random_state=42, nu = 0.1, fit_intercept=True, shuffle=True, tol=1e-4)
model_sgd.fit(X_train)

In [None]:
pred = model_sgd.predict(X_test)

In [None]:
pred[pred == 1] = 0
pred[pred == -1] = 1

In [None]:
print("Accuracy Score :")
print(accuracy_score(y_test, pred))
print("Classification Report :")
print(classification_report(y_test, pred))

In [None]:
svm_cm = confusion_matrix(y_test, pred)

In [None]:
confusion_matrix_plot(svm_cm)