In [1]:
# Import the required modules
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
import matplotlib.pyplot as plt
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from imblearn.metrics import classification_report_imbalanced
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression
from imblearn.pipeline import Pipeline as ImbPipeline

In [2]:
# Read in prepped data csv
fraud_data = pd.read_csv(
    Path('prepped_data.csv'))
fraud_data.head()

Unnamed: 0.1,Unnamed: 0,trans_date_trans_timefreq_enc,merchantfreq_enc,categoryfreq_enc,genderfreq_enc,streetfreq_enc,cityfreq_enc,statefreq_enc,jobfreq_enc,dobfreq_enc,amt,zip,lat,long,city_pop,unix_time,merch_lat,merch_long,is_fraud
0,0,1,109,4931,54641,165,165,2368,287,165,4.97,28654,36.0788,-81.1781,3495,1325376018,36.011293,-82.048315,0
1,1,1,192,9533,54641,256,302,1456,390,256,107.23,99160,48.8878,-118.2105,149,1325376044,49.159047,-118.186462,0
2,2,1,128,7179,45359,37,37,452,37,37,220.11,83252,42.1808,-112.262,4154,1325376051,43.150704,-112.154481,0
3,3,1,200,10221,45359,43,43,967,201,43,45.0,59632,46.2306,-112.1138,1939,1325376076,47.034331,-112.561071,0
4,4,1,120,6065,45359,176,176,2312,176,176,41.96,24433,38.4207,-79.4629,99,1325376186,38.674999,-78.632459,0


In [3]:
# Drop Unnamed column
fraud_data = fraud_data.drop(columns=['Unnamed: 0'])
fraud_data.head()

Unnamed: 0,trans_date_trans_timefreq_enc,merchantfreq_enc,categoryfreq_enc,genderfreq_enc,streetfreq_enc,cityfreq_enc,statefreq_enc,jobfreq_enc,dobfreq_enc,amt,zip,lat,long,city_pop,unix_time,merch_lat,merch_long,is_fraud
0,1,109,4931,54641,165,165,2368,287,165,4.97,28654,36.0788,-81.1781,3495,1325376018,36.011293,-82.048315,0
1,1,192,9533,54641,256,302,1456,390,256,107.23,99160,48.8878,-118.2105,149,1325376044,49.159047,-118.186462,0
2,1,128,7179,45359,37,37,452,37,37,220.11,83252,42.1808,-112.262,4154,1325376051,43.150704,-112.154481,0
3,1,200,10221,45359,43,43,967,201,43,45.0,59632,46.2306,-112.1138,1939,1325376076,47.034331,-112.561071,0
4,1,120,6065,45359,176,176,2312,176,176,41.96,24433,38.4207,-79.4629,99,1325376186,38.674999,-78.632459,0


In [4]:
# Assign independent variables to X and target variable to Y
X = fraud_data.drop(columns=['is_fraud'])
y = fraud_data['is_fraud']

In [5]:
# Call train_test_split with a 80/20 split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [6]:
# Call and define that includes scaling, SMOTE, and RF
pipeline = ImbPipeline([
    ('scaler', StandardScaler()),
    ('smote', SMOTE(random_state=42)),
    ('classifier', RandomForestClassifier(random_state=42))
])

In [7]:
# Define parameters for gridsearch.
param_grid = {
    'classifier__n_estimators': [100, 200],
    'classifier__max_depth': [None, 10, 20],
    'classifier__min_samples_split':[2, 5],
    'classifier__min_samples_leaf': [1, 2]
}

In [8]:
# Initialize StratifiedKFold to prevent oversampling
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [9]:
# Initialize gridsearch with pipeline and defined parameters
grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, 
                           cv=cv, verbose=2, n_jobs=-1, scoring='accuracy')

In [10]:
# Fit the gridsearch
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 24 candidates, totalling 120 fits
[CV] END classifier__max_depth=None, classifier__min_samples_leaf=1, classifier__min_samples_split=2, classifier__n_estimators=100; total time=  44.4s
[CV] END classifier__max_depth=None, classifier__min_samples_leaf=1, classifier__min_samples_split=2, classifier__n_estimators=100; total time=  44.4s
[CV] END classifier__max_depth=None, classifier__min_samples_leaf=1, classifier__min_samples_split=2, classifier__n_estimators=100; total time=  44.8s
[CV] END classifier__max_depth=None, classifier__min_samples_leaf=1, classifier__min_samples_split=2, classifier__n_estimators=100; total time=  45.0s
[CV] END classifier__max_depth=None, classifier__min_samples_leaf=1, classifier__min_samples_split=2, classifier__n_estimators=100; total time=  45.8s
[CV] END classifier__max_depth=None, classifier__min_samples_leaf=1, classifier__min_samples_split=2, classifier__n_estimators=200; total time= 1.5min
[CV] END classifier__max_depth=N

In [11]:
# Print best parameters and cross validation score
print('Best Parameters:', grid_search.best_params_)
print('Best Cross-validation score:', grid_search.best_score_)

Best Parameters: {'classifier__max_depth': None, 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 2, 'classifier__n_estimators': 200}
Best Cross-validation score: 0.9963875


In [12]:
# Predict the training set and evaluate scores
y_train_pred = grid_search.predict(X_train)
train_accuracy = accuracy_score(y_train, y_train_pred)
print('Training Accuracy:', train_accuracy)
print('Classification Report on Training Data:\n', classification_report(y_train, y_train_pred))

Training Accuracy: 1.0
Classification Report on Training Data:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     79208
           1       1.00      1.00      1.00       792

    accuracy                           1.00     80000
   macro avg       1.00      1.00      1.00     80000
weighted avg       1.00      1.00      1.00     80000



In [None]:
# TESTING MODEL ON UNTRAINED DATA:

In [23]:
# Read in prepped test data csv
fraud_test_data = pd.read_csv(
    Path('prepped_fraud_test_dataset.csv'))
fraud_test_data.head()

Unnamed: 0.1,Unnamed: 0,trans_date_trans_timefreq_enc,merchantfreq_enc,categoryfreq_enc,genderfreq_enc,streetfreq_enc,cityfreq_enc,statefreq_enc,jobfreq_enc,dobfreq_enc,amt,zip,lat,long,city_pop,unix_time,merch_lat,merch_long,is_fraud
0,0,1,151,7046,45217,120,120,2291,451,120,2.86,29209,33.9659,-80.9355,333497,1371816865,33.986391,-81.200714,0
1,1,1,130,7046,54783,142,142,839,437,142,29.84,84002,40.3207,-110.436,302,1371816873,39.450498,-109.960431,0
2,2,1,164,6639,54783,193,193,6456,448,193,41.28,11710,40.6729,-73.5365,34496,1371816893,40.49581,-74.196111,0
3,3,1,133,6260,45217,131,131,3319,131,131,60.05,32780,28.5697,-80.8191,54767,1371816915,28.812398,-80.883061,0
4,4,1,73,3139,45217,167,167,3552,167,167,3.19,49632,44.2529,-85.017,1126,1371816917,44.959148,-85.884734,0


In [24]:
# Drop unnamed column
fraud_test_data = fraud_test_data.drop(columns=['Unnamed: 0'])
fraud_test_data.head()

Unnamed: 0,trans_date_trans_timefreq_enc,merchantfreq_enc,categoryfreq_enc,genderfreq_enc,streetfreq_enc,cityfreq_enc,statefreq_enc,jobfreq_enc,dobfreq_enc,amt,zip,lat,long,city_pop,unix_time,merch_lat,merch_long,is_fraud
0,1,151,7046,45217,120,120,2291,451,120,2.86,29209,33.9659,-80.9355,333497,1371816865,33.986391,-81.200714,0
1,1,130,7046,54783,142,142,839,437,142,29.84,84002,40.3207,-110.436,302,1371816873,39.450498,-109.960431,0
2,1,164,6639,54783,193,193,6456,448,193,41.28,11710,40.6729,-73.5365,34496,1371816893,40.49581,-74.196111,0
3,1,133,6260,45217,131,131,3319,131,131,60.05,32780,28.5697,-80.8191,54767,1371816915,28.812398,-80.883061,0
4,1,73,3139,45217,167,167,3552,167,167,3.19,49632,44.2529,-85.017,1126,1371816917,44.959148,-85.884734,0


In [25]:
# Define independant variables for X_new
X_new = fraud_test_data.drop(columns=['is_fraud'])
X_new.head()

Unnamed: 0,trans_date_trans_timefreq_enc,merchantfreq_enc,categoryfreq_enc,genderfreq_enc,streetfreq_enc,cityfreq_enc,statefreq_enc,jobfreq_enc,dobfreq_enc,amt,zip,lat,long,city_pop,unix_time,merch_lat,merch_long
0,1,151,7046,45217,120,120,2291,451,120,2.86,29209,33.9659,-80.9355,333497,1371816865,33.986391,-81.200714
1,1,130,7046,54783,142,142,839,437,142,29.84,84002,40.3207,-110.436,302,1371816873,39.450498,-109.960431
2,1,164,6639,54783,193,193,6456,448,193,41.28,11710,40.6729,-73.5365,34496,1371816893,40.49581,-74.196111
3,1,133,6260,45217,131,131,3319,131,131,60.05,32780,28.5697,-80.8191,54767,1371816915,28.812398,-80.883061
4,1,73,3139,45217,167,167,3552,167,167,3.19,49632,44.2529,-85.017,1126,1371816917,44.959148,-85.884734


In [26]:
# Define target variable for y_new
y_new = fraud_test_data['is_fraud']

In [29]:
# Predict target variable and evaluate scores
y_test_pred = grid_search.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)
print('Test Accuracy:', test_accuracy)
print('Classification Report on Test Data:\n', classification_report(y_test, y_test_pred))

Test Accuracy: 0.99665
Classification Report on Test Data:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     19802
           1       0.80      0.88      0.84       198

    accuracy                           1.00     20000
   macro avg       0.90      0.94      0.92     20000
weighted avg       1.00      1.00      1.00     20000

