In [43]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV

In [20]:
df = pd.read_csv("../data/Fraud_Data_cleaned.csv")

In [21]:
df.head()

Unnamed: 0,step,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud,type_CASH_IN,type_CASH_OUT,type_DEBIT,type_PAYMENT,type_TRANSFER
0,1,9839.64,170136.0,160296.36,0.0,0.0,0,0,0,0,1,0
1,1,1864.28,21249.0,19384.72,0.0,0.0,0,0,0,0,1,0
2,1,181.0,181.0,0.0,0.0,0.0,1,0,0,0,0,1
3,1,181.0,181.0,0.0,21182.0,0.0,1,0,1,0,0,0
4,1,11668.14,41554.0,29885.86,0.0,0.0,0,0,0,0,1,0


In [22]:
df.tail()

Unnamed: 0,step,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud,type_CASH_IN,type_CASH_OUT,type_DEBIT,type_PAYMENT,type_TRANSFER
999995,45,10913.42,576108.8,565195.38,0.0,0.0,0,0,0,0,1,0
999996,45,2014.46,565195.38,563180.92,0.0,0.0,0,0,0,0,1,0
999997,45,18839.45,563180.92,544341.47,0.0,0.0,0,0,0,0,1,0
999998,45,17890.19,544341.47,526451.29,0.0,0.0,0,0,0,0,1,0
999999,45,25411.16,526451.29,501040.13,0.0,0.0,0,0,0,0,1,0


In [23]:
df.shape

(1000000, 12)

In [24]:
#The first step is to seperate the independent variables(the features) and the dependent variable(the label)
#The features will be all columns except isFraud and the label will be isFraud

X = df.drop(["isFraud"], axis = 1)
y = df["isFraud"]

In [25]:
#Using standard scaler, I will standardize all data in X, the features, using standard scaler to make sure it is within the same range. 
#Although it is not required for random forest, I will scale the data to improve model performance

scaler = StandardScaler()
X[X.columns] = scaler.fit_transform(X[X.columns])
X.head(10)

Unnamed: 0,step,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,type_CASH_IN,type_CASH_OUT,type_DEBIT,type_PAYMENT,type_TRANSFER
0,-1.90928,-0.580156,-0.237235,-0.244442,-0.427716,-0.463887,-0.529031,-0.754361,-0.080723,1.425683,-0.299713
1,-1.90928,-0.610918,-0.287157,-0.291112,-0.427716,-0.463887,-0.529031,-0.754361,-0.080723,1.425683,-0.299713
2,-1.90928,-0.617411,-0.294221,-0.297532,-0.427716,-0.463887,-0.529031,-0.754361,-0.080723,-0.701418,3.336523
3,-1.90928,-0.617411,-0.294221,-0.297532,-0.418529,-0.463887,-0.529031,1.325625,-0.080723,-0.701418,-0.299713
4,-1.90928,-0.573103,-0.280348,-0.287634,-0.427716,-0.463887,-0.529031,-0.754361,-0.080723,1.425683,-0.299713
5,-1.90928,-0.587955,-0.276222,-0.282283,-0.427716,-0.463887,-0.529031,-0.754361,-0.080723,1.425683,-0.299713
6,-1.90928,-0.590693,-0.232856,-0.239212,-0.427716,-0.463887,-0.529031,-0.754361,-0.080723,1.425683,-0.299713
7,-1.90928,-0.587786,-0.23524,-0.241816,-0.427716,-0.463887,-0.529031,-0.754361,-0.080723,1.425683,-0.299713
8,-1.90928,-0.602586,-0.293386,-0.297532,-0.427716,-0.463887,-0.529031,-0.754361,-0.080723,1.425683,-0.299713
9,-1.90928,-0.59752,-0.280293,-0.285482,-0.409543,-0.447259,-0.529031,-0.754361,12.388059,-0.701418,-0.299713


In [26]:
#I will also observe the first few rows of y, the label, to get a sense of the data in y
y.head()

0    0
1    0
2    1
3    1
4    0
Name: isFraud, dtype: int64

In [27]:
#Next, I split the data into training and test sets

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3 , random_state=42)

In [28]:
# Training a random forest model

RF_model = RandomForestClassifier(random_state = 42)
RF_model.fit(X_train ,y_train)

In [29]:
#Evaluating the accuracy of the random forest model

RF_predictions = RF_model.predict(X_test)
RF_accuracy = accuracy_score(y_test, RF_predictions)
print("Random Forest Accuracy = ", round((RF_accuracy *100 ), 5) , "%")

Random Forest Accuracy =  99.97767 %


With an accuracy score of 99.97 % the model seems to be overfitting

Although accuracy score is a good measure for model accuracy, in this specific case, we would need another metric to evaluate the performance of the model. In this situation, it would be costly if the model missed a true case of fraud and predicted a false negative, meaning that the model failed to predict fraud in a transaction that actually was fraud.

Because of this, it would be important to focus on Recall for this model. With a higher recall score for the model, it would mean that the model missed fewer cases of fraud in transactions that were fraud. The f1 score would also be an important metric because there is a severe class imbalance in this dataset, as the majority of transactions are not fraud.

In [30]:
#Checking the precision, recall and f1-score for the random forest model

print(classification_report(y_test, RF_predictions))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00    299850
           1       0.91      0.61      0.73       150

    accuracy                           1.00    300000
   macro avg       0.96      0.81      0.87    300000
weighted avg       1.00      1.00      1.00    300000



For transactions that are fradulent, the model has a high precision score of 0.91 which tells us that the model is accurately predicting true positive cases of fraud and minimizing false positives. The medium recall score of 0.61 tells us that the model correctly identifies 61% of fraudulent transactions and there are fewer false negatives. A f1-score of 0.73, is reasonable and it tells us that the model is achieving a reasonable balance between precision and recall. 

In transactions that are not fraudulent, the model has a perfect precision score of 1 and a perfect recall score of 1 along with a perfect f1 score of 1. This tells us that the model is accurately predicting in cases where the transaction is not fraudulent. This may be due to the large class imbalance because most of the transactions in the dataset are not fraudulent

In [33]:
#Creating a DataFrame with a list of all features and their scores 

Feature_Importance = RF_model.feature_importances_
Features = pd.DataFrame({'Name':X.columns.values,'Importance':Feature_Importance})

#Sorting features in descending order
Features_sorted = Features.sort_values('Importance',ascending=False)

#Getting the top 5 features
top_five=list(Features_sorted['Name'].iloc[:5])

print('Top five features: {0}'.format(top_five))


Top five features: ['oldbalanceOrg', 'amount', 'newbalanceDest', 'oldbalanceDest', 'step']


In [60]:
#Using samples of train and test sets for random search cv. My computer can not handle
#1 million rows for search, so using sample of 10000
Sample_X_train = X_train.sample(10000, random_state= 42)
Sample_y_train = y_train.sample(10000, random_state= 42)
Sample_X_test = X_test.sample(10000, random_state= 42)
Sample_y_test = y_test.sample(10000, random_state= 42)


RF_model.fit(Sample_X_train ,Sample_y_train)

In [49]:
#obtaining default hyperparameters for the random forest model
RF_model.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 42,
 'verbose': 0,
 'warm_start': False}

In [50]:
#Choosing 3 hyperparameters for search
n_estimators=[250,500,1000]
max_depth=[10,25,50]
max_features=["log2","sqrt"]

In [51]:
#Creating the hyperparameter grid
param_grid={'n_estimators':n_estimators, 'max_depth':max_depth, 'max_features':max_features}

In [52]:
param_grid

{'n_estimators': [250, 500, 1000],
 'max_depth': [10, 25, 50],
 'max_features': ['log2', 'sqrt']}

In [53]:
#Setting up random search using the hyperparameter grid
rf_grid =RandomizedSearchCV(RF_model,param_grid,cv=5)


In [55]:
#Fitting the random search model
rf_random_search = rf_grid.fit(Sample_X_train ,Sample_y_train)

In [56]:
#getting the best parameters 
rf_best_params=rf_grid.best_params_
rf_best_params

{'n_estimators': 1000, 'max_features': 'log2', 'max_depth': 50}

In [61]:
#Fitting best rf model and computing accuracy
rf_best = RandomForestClassifier(random_state = 42,max_depth=50, max_features="log2", n_estimators=1000)

#Creating new test and training sets with the top 5 features, using the samples of 10,000 rows
new_X_train = Sample_X_train[top_five]
new_X_test = Sample_X_test[top_five]

In [62]:
#Fitting the new x train and test to the model
rf_best.fit(new_X_train ,Sample_y_train)

In [63]:
#Evaluating the accuracy of the  random forest model with best parameters

rf_best_predictions = rf_best.predict(new_X_test)
rf_best_accuracy = accuracy_score(Sample_y_test, rf_best_predictions)
print(" Best Random Forest Accuracy = ", round((rf_best_accuracy *100 ), 5) , "%")

 Best Random Forest Accuracy =  99.94 %


In [65]:
print(classification_report(Sample_y_test, rf_best_predictions,zero_division=np.NaN))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      9994
           1        nan      0.00       nan         6

    accuracy                           1.00     10000
   macro avg       1.00      0.50      1.00     10000
weighted avg       1.00      1.00      1.00     10000



We can see after adjusting the Random Forest Model and using the best hyperparameters and top 5 features, the accuracy actually slightly decreased from 99.97 to 99.94

The recall score for transactions that are actually fradulent decreased from 0.61 to 0. This may show that the original model with default hyperparameters was best.

Both precision and recall for transactions that are not fradulent remained the same, as most transactions are not fradulent.

The issues arising may be due to using the top 5 features, as all features might be neccesary for this specific model since it deals with financial data.