In [None]:
# import the necessary packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib import gridspec

from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import BorderlineSMOTE
from imblearn.pipeline import Pipeline 

from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import r2_score, classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score,matthews_corrcoef, roc_auc_score, roc_curve, precision_recall_curve, average_precision_score
from sklearn.metrics import homogeneity_score, silhouette_score
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Load the dataset from 'creditcard.csv' (path relative to the working directory) using pandas
data = pd.read_csv('creditcard.csv')
# Take a peek at the first rows of the data (displayed as the cell output)
data.head()

In [None]:
# Show summary statistics of the loaded DataFrame as the cell output
data.describe()

In [None]:
# Print pandas' concise summary of the DataFrame (see `DataFrame.info`)
data.info()

In [None]:
# Count how many rows carry each value of the 'Class' column
# (the notebook treats these values as fraud / no fraud)
occ = data['Class'].value_counts()
# Display the per-class counts as the cell output
occ

In [None]:
# Turn the per-class counts into fractions of the total number of rows
ratio_cases = occ/len(data.index)
# The entry labelled 1 is reported as the share of fraudulent cases
print(f'Ratio of fraudulent cases: {ratio_cases[1]}')
# "Natural hit rate" is printed as the complement of that share,
# i.e. the fraction of rows that are not labelled 1
print(f'Natural Hit Rate: {1 - ratio_cases[1]}')

In [None]:
# Split the dataset into X (features) and Y (target).
# X is every column except "Class" and "Time", converted to a NumPy array via `.values`
X = data.drop(["Class", "Time"], axis=1).values
# Y is the "Class" column, also as a NumPy array
Y = data["Class"].values
# Sanity check: print both shapes
print(f'X shape: {X.shape}\nY shape: {Y.shape}')

In [None]:
# Define the resampling method.
# A fixed random_state makes the generated synthetic samples reproducible
# across kernel restarts.
resampling = SMOTE(random_state=0)
# Create the resampled feature set.
# `fit_resample` is the supported imbalanced-learn API; the older
# `fit_sample` alias was deprecated and has been removed from current
# releases, where calling it raises AttributeError.
X_resampled, Y_resampled = resampling.fit_resample(X, Y)

In [None]:
# First, try models without SMOTE

In [None]:
# Logistic Regression

In [None]:
# Create the training and testing sets from the original (imbalanced) data
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=0)
# Oversample ONLY the training portion. Running SMOTE before the split and
# then splitting the oversampled data leaks information: synthetic points are
# interpolated from neighbouring minority samples, so near-copies of training
# rows end up in the test set and every score measured on it is inflated.
X_resampled_train, Y_resampled_train = SMOTE(random_state=0).fit_resample(X_train, Y_train)
# Models trained on the oversampled data are evaluated on the untouched
# hold-out split, which keeps the real class distribution and contains no
# synthetic rows. The *_resampled_test names are kept so later cells still run.
X_resampled_test, Y_resampled_test = X_test, Y_test

In [None]:
def evaluate(model_name, actual, prediction):
    """Print a set of classification metrics for one model.

    Parameters
    ----------
    model_name : label printed in the first output line.
    actual : true labels, passed as the first argument to every metric.
    prediction : predicted labels, passed as the second argument.

    Returns
    -------
    None. Everything is written to stdout: accuracy, precision, recall,
    F1, Matthews correlation coefficient, the classification report and
    the confusion matrix, in that order.
    """
    print("the Model used is {}".format(model_name))

    # (output template, metric function) pairs, in the order they are printed
    scalar_metrics = (
        ("The accuracy is {}", accuracy_score),
        ("The precision is {}", precision_score),
        ("The recall is {}", recall_score),
        ("The F1-Score is {}", f1_score),
        ("The Matthews correlation coefficient is {}", matthews_corrcoef),
    )
    for template, metric in scalar_metrics:
        print(template.format(metric(actual, prediction)))

    # Multi-line summaries: classification report first, then confusion matrix
    print("Classification report:\n", classification_report(actual, prediction))
    print("Confusion matrix:\n", confusion_matrix(y_true=actual, y_pred=prediction))

In [None]:
def eval_roc(model, x_test, y_test):
    """Print the ROC AUC score of a fitted probabilistic classifier.

    Parameters
    ----------
    model : object exposing `predict_proba`.
    x_test : features handed to `model.predict_proba`.
    y_test : true labels handed to `roc_auc_score`.

    Returns
    -------
    None. A 'ROC Score:' header and the score are written to stdout.
    """
    # Per-class probabilities for every test row
    class_probabilities = model.predict_proba(x_test)
    print('ROC Score:')
    # Column 1 is used as the score for the ROC AUC computation
    positive_column = class_probabilities[:, 1]
    auc = roc_auc_score(y_test, positive_column)
    print(auc)

In [None]:
# Fit a logistic regression model to the original (non-resampled) data.
# The 'sag' solver shuffles the data while optimising, so random_state is
# pinned to make the fitted model reproducible between runs, as is already
# done for the other classifiers in this notebook.
lr = LogisticRegression(solver='sag', random_state=0)
lr.fit(X_train, Y_train)
# Obtain model predictions on the test split and report the metrics
lr_predicted = lr.predict(X_test)
evaluate("Logistic Regression", Y_test, lr_predicted)

In [None]:
# Fit a logistic regression model to the resampled training data.
# The 'sag' solver shuffles the data while optimising, so random_state is
# pinned to make the fitted model reproducible between runs.
# Note: this rebinds `lr`; later cells use this resampled-data model.
lr = LogisticRegression(solver='sag', random_state=0)
lr.fit(X_resampled_train, Y_resampled_train)
# Obtain model predictions on the matching test split and report the metrics
lr_predicted = lr.predict(X_resampled_test)
evaluate("Logistic Regression", Y_resampled_test, lr_predicted)

In [None]:
# Report the ROC AUC of the most recently fitted logistic regression (`lr`)
eval_roc(model=lr, x_test=X_resampled_test, y_test=Y_resampled_test)

In [None]:
# Random Forest

In [None]:
# Random forest trained on the original (non-resampled) training split
rf = RandomForestClassifier(random_state=0)
rf.fit(X=X_train, y=Y_train)
# Predict the test split and print the evaluation metrics
rf_predicted = rf.predict(X=X_test)
evaluate(model_name="Random Forest", actual=Y_test, prediction=rf_predicted)

In [None]:
# Random forest trained on the resampled training split.
# This rebinds `rf`; later cells use this resampled-data model.
rf = RandomForestClassifier(random_state=12345)
rf.fit(X=X_resampled_train, y=Y_resampled_train)
# Predict the matching test split and print the evaluation metrics
rf_predicted = rf.predict(X=X_resampled_test)
evaluate(model_name="Random Forest", actual=Y_resampled_test, prediction=rf_predicted)

In [None]:
# Report the ROC AUC of the most recently fitted random forest (`rf`)
eval_roc(model=rf, x_test=X_resampled_test, y_test=Y_resampled_test)

In [None]:
# Artificial Neural Network

In [None]:
# Multi-layer perceptron (ANN) trained on the original training split
ann = MLPClassifier(random_state=0)
ann.fit(X=X_train, y=Y_train)
# Predict the test split and print the evaluation metrics
ann_predicted = ann.predict(X=X_test)
evaluate(model_name="Artificial Neural Network", actual=Y_test, prediction=ann_predicted)

In [None]:
# Multi-layer perceptron (ANN) trained on the resampled training split.
# This rebinds `ann`; later cells use this resampled-data model.
ann = MLPClassifier(random_state=0)
ann.fit(X=X_resampled_train, y=Y_resampled_train)
# Predict the matching test split and print the evaluation metrics
ann_predicted = ann.predict(X=X_resampled_test)
evaluate(model_name="Artificial Neural Network", actual=Y_resampled_test, prediction=ann_predicted)

In [None]:
# Report the ROC AUC of the most recently fitted neural network (`ann`)
eval_roc(model=ann, x_test=X_resampled_test, y_test=Y_resampled_test)

In [None]:
# Ensemble Method

In [None]:
# Build a soft-voting ensemble from the lr, rf and ann estimators defined in
# earlier cells.
# NOTE(review): weights=[0, 5, 5] gives 'lr' a weight of zero, so presumably
# it does not influence the averaged probabilities - confirm this is intended.
ensemble_model = VotingClassifier(
    estimators=[('lr', lr), ('rf', rf), ('ann', ann)],
    voting='soft',
    weights=[0, 5, 5],
    flatten_transform=True,
)
ensemble_model.fit(X=X_resampled_train, y=Y_resampled_train)
# Predict the test split and print the evaluation metrics
ensemble_model_predicted = ensemble_model.predict(X=X_resampled_test)
evaluate(model_name="Ensemble Model", actual=Y_resampled_test, prediction=ensemble_model_predicted)