In [6]:
# Import the required modules
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split, GridSearchCV
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from imblearn.metrics import classification_report_imbalanced
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression

In [7]:
# Read in prepped data csv
fraud_data = pd.read_csv(
    Path('prepped_data.csv'))
fraud_data.head()

Unnamed: 0.1,Unnamed: 0,trans_date_trans_timefreq_enc,merchantfreq_enc,categoryfreq_enc,genderfreq_enc,streetfreq_enc,cityfreq_enc,statefreq_enc,jobfreq_enc,dobfreq_enc,amt,zip,lat,long,city_pop,unix_time,merch_lat,merch_long,is_fraud
0,0,1,109,4931,54641,165,165,2368,287,165,4.97,28654,36.0788,-81.1781,3495,1325376018,36.011293,-82.048315,0
1,1,1,192,9533,54641,256,302,1456,390,256,107.23,99160,48.8878,-118.2105,149,1325376044,49.159047,-118.186462,0
2,2,1,128,7179,45359,37,37,452,37,37,220.11,83252,42.1808,-112.262,4154,1325376051,43.150704,-112.154481,0
3,3,1,200,10221,45359,43,43,967,201,43,45.0,59632,46.2306,-112.1138,1939,1325376076,47.034331,-112.561071,0
4,4,1,120,6065,45359,176,176,2312,176,176,41.96,24433,38.4207,-79.4629,99,1325376186,38.674999,-78.632459,0


In [8]:
# Drop Unnamed column
fraud_data = fraud_data.drop(columns=['Unnamed: 0'])
fraud_data.head()

Unnamed: 0,trans_date_trans_timefreq_enc,merchantfreq_enc,categoryfreq_enc,genderfreq_enc,streetfreq_enc,cityfreq_enc,statefreq_enc,jobfreq_enc,dobfreq_enc,amt,zip,lat,long,city_pop,unix_time,merch_lat,merch_long,is_fraud
0,1,109,4931,54641,165,165,2368,287,165,4.97,28654,36.0788,-81.1781,3495,1325376018,36.011293,-82.048315,0
1,1,192,9533,54641,256,302,1456,390,256,107.23,99160,48.8878,-118.2105,149,1325376044,49.159047,-118.186462,0
2,1,128,7179,45359,37,37,452,37,37,220.11,83252,42.1808,-112.262,4154,1325376051,43.150704,-112.154481,0
3,1,200,10221,45359,43,43,967,201,43,45.0,59632,46.2306,-112.1138,1939,1325376076,47.034331,-112.561071,0
4,1,120,6065,45359,176,176,2312,176,176,41.96,24433,38.4207,-79.4629,99,1325376186,38.674999,-78.632459,0


In [9]:
# Assign independent variables to X and target variable to Y
X = fraud_data.drop(columns=['is_fraud'])
y = fraud_data['is_fraud']

In [10]:
# Call train_test_split with a 80/20 split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [11]:
# Assign our model
rf_classifier = RandomForestClassifier(random_state=42)

In [12]:
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split':[2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

In [13]:
grid_search = GridSearchCV(estimator=rf_classifier, param_grid=param_grid, cv=5, verbose=2, n_jobs=-1)

In [14]:
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 108 candidates, totalling 540 fits
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=  14.8s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=  15.1s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=  15.2s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=  15.2s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=  15.3s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=  30.6s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=  31.3s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=  31.7s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=  31.7s
[CV] END m

In [16]:
best_grid = grid_search.best_estimator_
y_pred = best_grid.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print('Best GridSearch Parameters:', grid_search.best_params_)
print('Test Accuracy:', accuracy)

Best GridSearch Parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Test Accuracy: 0.99745


In [17]:
# Fit the model
rf_classifier.fit(X_train, y_train)

In [18]:
# Predict our target variable from X_test
y_pred = rf_classifier.predict(X_test)

In [19]:
# Print our Accuracy Score and Classification Report
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)
print('Accuracy Score:', accuracy)
print('Classification Report:', classification_rep)

Accuracy Score: 0.99755
Classification Report:               precision    recall  f1-score   support

           0       1.00      1.00      1.00     19802
           1       0.94      0.80      0.87       198

    accuracy                           1.00     20000
   macro avg       0.97      0.90      0.93     20000
weighted avg       1.00      1.00      1.00     20000



In [24]:
# Confirm imbalance
training_balance = y_train.value_counts()
training_balance

is_fraud
0    79208
1      792
Name: count, dtype: int64

In [25]:
# Call SMOTE to sythesize and balance data
smote = SMOTE(random_state=42)

In [26]:
# Fit data to SMOTE
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

In [27]:
# Confirm synthetic balance
y_train_smote.value_counts()

is_fraud
0    79208
1    79208
Name: count, dtype: int64

In [28]:
# Refit RF Model with balanced data
rf_classifier.fit(X_train_smote, y_train_smote)

In [15]:
# Predict target variable
y_pred = rf_classifier.predict(X_test)

In [19]:
# Print Accuracy score and Classification Report for balanced data
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)
print('Accuracy Score:', accuracy)
print('Classification Report:', classification_rep)

Accuracy Score: 0.99565
Classification Report:               precision    recall  f1-score   support

           0       1.00      1.00      1.00     19802
           1       0.76      0.83      0.79       198

    accuracy                           1.00     20000
   macro avg       0.88      0.91      0.89     20000
weighted avg       1.00      1.00      1.00     20000



In [17]:
# Reconfirm data is balanced
training_balance = y_train_smote.value_counts()
training_balance

is_fraud
0    79208
1    79208
Name: count, dtype: int64

In [None]:
# Balancing this data had no impact on model scores.