# Introduction
In this project, we utilize the Bank Account Fraud dataset, which was released during NeurIPS 2022. Our objective is to develop a machine learning model capable of identifying fraudulent bank transactions. Additional information about the dataset can be found on Kaggle:
https://www.kaggle.com/datasets/sgpjesus/bank-account-fraud-dataset-neurips-2022

In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

from sklearn.model_selection import train_test_split
from sklearn import metrics

# Data Exploration

In [17]:
# directly load the dataset from kaggle
data = pd.read_csv('/kaggle/input/bank-account-fraud-dataset-neurips-2022/Base.csv')

# dispaly the first few rows of the dataset
data.head().T

Unnamed: 0,0,1,2,3,4
fraud_bool,0,0,0,0,0
income,0.3,0.8,0.8,0.6,0.9
name_email_similarity,0.986506,0.617426,0.996707,0.4751,0.842307
prev_address_months_count,-1,-1,9,11,-1
current_address_months_count,25,89,14,14,29
customer_age,40,20,40,30,40
days_since_request,0.006735,0.010095,0.012316,0.006991,5.742626
intended_balcon_amount,102.453711,-0.849551,-1.490386,-1.863101,47.152498
payment_type,AA,AD,AB,AB,AA
zip_count_4w,1059,1658,1095,3483,2339


In [18]:
# summary info of the dataset
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 32 columns):
 #   Column                            Non-Null Count    Dtype  
---  ------                            --------------    -----  
 0   fraud_bool                        1000000 non-null  int64  
 1   income                            1000000 non-null  float64
 2   name_email_similarity             1000000 non-null  float64
 3   prev_address_months_count         1000000 non-null  int64  
 4   current_address_months_count      1000000 non-null  int64  
 5   customer_age                      1000000 non-null  int64  
 6   days_since_request                1000000 non-null  float64
 7   intended_balcon_amount            1000000 non-null  float64
 8   payment_type                      1000000 non-null  object 
 9   zip_count_4w                      1000000 non-null  int64  
 10  velocity_6h                       1000000 non-null  float64
 11  velocity_24h                      1000

In [19]:
# stats of categorical columns
data.describe(include=["object"]).T

Unnamed: 0,count,unique,top,freq
payment_type,1000000,5,AB,370554
employment_status,1000000,7,CA,730252
housing_status,1000000,7,BC,372143
source,1000000,2,INTERNET,992952
device_os,1000000,5,other,342728


In [20]:
# stats of numerical columns
data.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
fraud_bool,1000000.0,0.011029,0.104438,0.0,0.0,0.0,0.0,1.0
income,1000000.0,0.562696,0.290343,0.1,0.3,0.6,0.8,0.9
name_email_similarity,1000000.0,0.493694,0.289125,1.43455e-06,0.225216,0.492153,0.755567,0.999999
prev_address_months_count,1000000.0,16.718568,44.04623,-1.0,-1.0,-1.0,12.0,383.0
current_address_months_count,1000000.0,86.587867,88.406599,-1.0,19.0,52.0,130.0,428.0
customer_age,1000000.0,33.68908,12.025799,10.0,20.0,30.0,40.0,90.0
days_since_request,1000000.0,1.025705,5.381835,4.03686e-09,0.007193,0.015176,0.026331,78.456904
intended_balcon_amount,1000000.0,8.661499,20.236155,-15.53055,-1.181488,-0.830507,4.984176,112.956928
zip_count_4w,1000000.0,1572.692049,1005.374565,1.0,894.0,1263.0,1944.0,6700.0
velocity_6h,1000000.0,5665.296605,3009.380665,-170.6031,3436.365848,5319.769349,7680.717827,16715.565404


In [21]:
fig = px.pie(values=data['fraud_bool'].value_counts(), names=["Non-Fraud","Fraud"] , width=600, height=400, color_discrete_sequence=["lightblue","red"]
             ,title="Fraud vs Non-Fraud transactions")
fig.show()

## Observations
1. Very imbalanced distribution in target feature fraud_bool
2. Dataset contains several categorical features, can use one-hot encoding to convert them to numerical format

# Data Preprocessing

## Check Missing Values and Drop Duplicates

In [22]:
data.drop_duplicates(inplace=True)
data.isnull().sum()

fraud_bool                          0
income                              0
name_email_similarity               0
prev_address_months_count           0
current_address_months_count        0
customer_age                        0
days_since_request                  0
intended_balcon_amount              0
payment_type                        0
zip_count_4w                        0
velocity_6h                         0
velocity_24h                        0
velocity_4w                         0
bank_branch_count_8w                0
date_of_birth_distinct_emails_4w    0
employment_status                   0
credit_risk_score                   0
email_is_free                       0
housing_status                      0
phone_home_valid                    0
phone_mobile_valid                  0
bank_months_count                   0
has_other_cards                     0
proposed_credit_limit               0
foreign_request                     0
source                              0
session_leng

## Apply One-hot Encoding For Categorical Features

In [23]:
data = pd.get_dummies(data)
data.head().T

Unnamed: 0,0,1,2,3,4
fraud_bool,0,0,0,0,0
income,0.3,0.8,0.8,0.6,0.9
name_email_similarity,0.986506,0.617426,0.996707,0.4751,0.842307
prev_address_months_count,-1,-1,9,11,-1
current_address_months_count,25,89,14,14,29
customer_age,40,20,40,30,40
days_since_request,0.006735,0.010095,0.012316,0.006991,5.742626
intended_balcon_amount,102.453711,-0.849551,-1.490386,-1.863101,47.152498
zip_count_4w,1059,1658,1095,3483,2339
velocity_6h,13096.035018,9223.283431,4471.472149,14431.993621,7601.511579


## Splitting Training and Test Dataset

In [24]:
from sklearn.model_selection import train_test_split

X = data.drop('fraud_bool', axis=1)
y = data['fraud_bool']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42, stratify=y)


## Data Normalization

In [25]:
from sklearn.preprocessing import MinMaxScaler

numeric_features = data.select_dtypes(include='number').columns.to_list()
scaler = MinMaxScaler()
scaler.fit(X_train)

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

871498    0
637378    0
660426    0
146887    0
747635    0
         ..
757886    0
777036    0
941372    0
137263    0
117519    0
Name: fraud_bool, Length: 300000, dtype: int64


## Oversampling Using SMOTE
SMOTE, which stands for "Synthetic Minority Over-sampling Technique," is a technique used in the field of imbalanced machine learning to address class imbalance problems, particularly in classification tasks.

In [26]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(sampling_strategy = 'minority', random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# Model Selection
Next, we will train for different models and use cross validation result to select the best model. Here we want to optimize for recall. In the case of bank fraud detection, we want to avoid false negatives as much as possible. So recall is more important than precision here.

In [27]:
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score

def cross_validation_result(estimator_name, estimator, X_train, y_train):
    folds = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 42)
    score = cross_val_score(estimator, X_train, y_train, scoring = 'recall', cv = folds)
    print("Cross Validation Result for model: ".format(estimator_name))
    print("Cross Validation Recall scores are: {}".format(score))
    print("Mean of scores: {}".format(score.mean()))
    print("Standard Deviation of scores: {}".format(score.mean()))

### Decision Tree

In [28]:
from sklearn.tree import DecisionTreeClassifier

decision_tree = DecisionTreeClassifier()

cross_validation_result('Decision Tree', decision_tree, X_train_smote, y_train_smote)

Cross Validation Result for model: 
Cross Validation Recall scores are: [0.98853065 0.98876177 0.98816953 0.98801063 0.9874545 ]
Mean of scores: 0.9881854163055411
Standard Deviation of scores: 0.9881854163055411


### Random Forest

In [29]:
from sklearn.ensemble import RandomForestClassifier

randome_forest = RandomForestClassifier(n_estimators=20, random_state=13)
cross_validation_result('Random Forest', randome_forest, X_train_smote, y_train_smote)


Cross Validation Result for model: 
Cross Validation Recall scores are: [0.99509591 0.99531259 0.99508869 0.99517536 0.99461201]
Mean of scores: 0.9950569133876466
Standard Deviation of scores: 0.9950569133876466


### Grandient Boosting

In [31]:
from sklearn.ensemble import GradientBoostingClassifier

gradient_boost = GradientBoostingClassifier(n_estimators=20, learning_rate=0.1, max_depth=3, random_state=42)

cross_validation_result('Gradient Boosting', gradient_boost, X_train_smote, y_train_smote)


Cross Validation Result for model: 
Cross Validation Recall scores are: [0.89790258 0.89978766 0.89683365 0.89400243 0.8980687 ]
Mean of scores: 0.8973190038712661
Standard Deviation of scores: 0.8973190038712661


## Hyperparamter Tuning
From the cross validation results above, Random Forest is the best performing model. Next we will use grid search to fine tune model hyperparamters.

In [39]:
from sklearn.model_selection import GridSearchCV

params = {
    'n_estimators': [15, 20, 25],
    'max_depth': [3, 4, 5],
    'random_state': [13]
}

folds = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 42)
best_model = GridSearchCV(rf, param_grid=params, cv=folds, scoring='recall').fit(X_train, y_train)

print('Best parameters:', best_model.best_params_)

Best parameters: {'max_depth': 3, 'n_estimators': 15, 'random_state': 13}


# Model Evaluation
Finally, we evaluate the performance of the model on the test set.

In [40]:
from sklearn.metrics import classification_report, confusion_matrix, recall_score, precision_score, f1_score, accuracy_score

y_pred = best_model.predict(X_test)
print(classification_report(y_test, y_pred, zero_division=0))    

              precision    recall  f1-score   support

           0       0.99      1.00      0.99    296691
           1       0.00      0.00      0.00      3309

    accuracy                           0.99    300000
   macro avg       0.49      0.50      0.50    300000
weighted avg       0.98      0.99      0.98    300000

