In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn import metrics
from sklearn import ensemble
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.metrics import Precision, Recall, AUC

In [2]:
## Load credit card transaction dataset
df = pd.read_csv("card_transdata(1).csv")

## Store the data as X and y where X stores input features and y stores fraud indication
X = df.drop("fraud", axis=1).values
Y = df["fraud"].values

# seperate out 20% of data as the validation set to ensure no overfitting
X_set,X_valid, Y_set, Y_valid = train_test_split(X, Y, test_size=0.2, random_state=30)

# Split data into train and test sets
X_train, X_test, Y_train, Y_test = train_test_split(X_set, Y_set, test_size=0.2, random_state=30)

## Resample the training set to balance the classes
oversampler = SMOTE(random_state=30)
X_train_resampled, Y_train_resampled = oversampler.fit_resample(X_train, Y_train)


In [3]:
#Random Forest
rfc = ensemble.RandomForestClassifier()
rfc = rfc.fit(X_train_resampled, Y_train_resampled)

y_pred_rfc = rfc.predict(X_test)


print('Test Data:')
print("Accuracy:",metrics.accuracy_score(Y_test, y_pred_rfc))
print("Precision:", metrics.precision_score(Y_test, y_pred_rfc))
print("Recall:", metrics.recall_score(Y_test, y_pred_rfc))
print("F1 score:", metrics.f1_score(Y_test, y_pred_rfc))
print(pd.crosstab(Y_test, y_pred_rfc, rownames=["Actual Fraud"], colnames=["Flagged Fraud"]))

y_pred_rfc_valid = rfc.predict(X_valid)

print("Validation Data:")
print("Validation")
print("Accuracy:",metrics.accuracy_score(Y_valid, y_pred_rfc_valid))
print("Precision:", metrics.precision_score(Y_valid, y_pred_rfc_valid))
print("Recall:", metrics.recall_score(Y_valid, y_pred_rfc_valid))
print("F1 score:", metrics.f1_score(Y_valid, y_pred_rfc_valid))

Test Data:
Accuracy: 0.9999875
Precision: 1.0
Recall: 0.9998573669947226
F1 score: 0.999928678410955
Flagged Fraud       0      1
Actual Fraud                
0              145978      0
1                   2  14020
Validation Data:
Validation
Accuracy: 0.999985
Precision: 1.0
Recall: 0.9998285028297033
F1 score: 0.9999142440614013


In [6]:
## Define the neural network model
model = Sequential()
model.add(Dense(units=32, activation='relu', input_dim=X.shape[1]))
model.add(Dense(units=16, activation='relu'))
model.add(Dense(units=8, activation='relu'))
model.add(Dense(units=1, activation='sigmoid'))

## Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', 
              metrics=[Precision(), Recall(), AUC(), 'accuracy'])

## Train the model
model.fit(X_train_resampled, Y_train_resampled, epochs=10, batch_size=32)

## Make predictions on the test inputs
y_pred_nn = model.predict(X_test)
y_pred_nn = (y_pred_nn > 0.5).astype(int).reshape(-1) # Convert probabilities to binary predictions

## Evaluate using the following performance metrics
print("Test Data:")
print("Accuracy:", metrics.accuracy_score(Y_test, y_pred_nn))
print("Precision:", metrics.precision_score(Y_test, y_pred_nn))
print("Recall:", metrics.recall_score(Y_test, y_pred_nn))
print("F1 score:", metrics.f1_score(Y_test, y_pred_nn))

## Make predictions on the test inputs
y_pred_nn_v = model.predict(X_valid)
y_pred_nn_v = (y_pred_nn_v > 0.5).astype(int).reshape(-1) # Convert probabilities to binary predictions

## Evaluate using the following performance metrics
print("Validation Data:")
print("Accuracy:", metrics.accuracy_score(Y_valid, y_pred_nn_v))
print("Precision:", metrics.precision_score(Y_valid, y_pred_nn_v))
print("Recall:", metrics.recall_score(Y_valid, y_pred_nn_v))
print("F1 score:", metrics.f1_score(Y_valid, y_pred_nn_v))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test Data:
Accuracy: 0.9962
Precision: 0.9588806787082649
Recall: 0.999500784481529
F1 score: 0.9787694671415602
Test Data:
Accuracy: 0.996175
Precision: 0.9586532134239965
Recall: 0.9993711770422454
F1 score: 0.9785888214055808


In [7]:
# Create logistic regression classifier
clf = LogisticRegression()

# Fit the classifier to the training data
clf.fit(X_train_resampled, Y_train_resampled)

# Make predictions on the testing data
Y_pred = clf.predict(X_test)
                     
# Evaluate the performance of the model
print('Test Data:')
print("Accuracy:", metrics.accuracy_score(Y_test, Y_pred))
print("Precision:", metrics.precision_score(Y_test, Y_pred))
print("Recall:", metrics.recall_score(Y_test, Y_pred))
print("F1 score:", metrics.f1_score(Y_test, Y_pred))

# Make predictions on the testing data
Y_pred_v = clf.predict(X_valid)
                     
# Evaluate the performance of the model
print('Validation Data:')
print("Accuracy:", metrics.accuracy_score(Y_valid, Y_pred_v))
print("Precision:", metrics.precision_score(Y_valid, Y_pred_v))
print("Recall:", metrics.recall_score(Y_valid, Y_pred_v))
print("F1 score:", metrics.f1_score(Y_valid, Y_pred_v))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Test Data:
Accuracy: 0.93285
Precision: 0.5705917822379188
Recall: 0.944801026957638
F1 score: 0.7114930182599355
Validation Data:
Accuracy: 0.934225
Precision: 0.5755328040117008
Recall: 0.9447779111644657
F1 score: 0.7153151983379861
