In [1]:
import warnings
# Silences every warning for the rest of the session, including library deprecation notices.
warnings.filterwarnings("ignore")

# '../src' is resolved against the kernel's current working directory, not this file's location.
# NOTE(review): the project imports that follow (data.*, models.*) presumably live under ../src —
# confirm, and run the notebook from its own folder.
import sys
sys.path.append('../src')

from data.loader import DataLoader
from data.preparation import DataPreparation
from models.rfc import RFCModel

from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import make_pipeline

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score

import pandas as pd

In [2]:
# Locations of the pre-split train / test CSV files (relative to the notebook's working directory).
TRAIN_CSV = '../data/fraudTrain.csv'
TEST_CSV = '../data/fraudTest.csv'

# Each loader call is unpacked into a (features, labels) pair.
X_train, y_train = DataLoader(TRAIN_CSV).load()
X_test, y_test = DataLoader(TEST_CSV).load()

## Baseline

In [3]:
# Reference pipeline with no resampling step: feature preparation -> standardisation -> classifier.
# `rfc` keeps its own name because a later cell calls rfc.evaluate() on it directly.
prep, scaler, rfc = DataPreparation(), StandardScaler(), RFCModel()
baseline_pipeline = make_pipeline(prep, scaler, rfc)

In [4]:
# Fit the baseline pipeline (preparation -> scaling -> classifier, no resampling) on the training set.
baseline_pipeline.fit(X_train, y_train)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 24 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    6.1s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   29.9s finished


In [5]:
# Predict on the held-out test set.
# NOTE(review): y_pred is not read by any later cell shown here; rfc.evaluate(y_test) receives only
# the labels, so RFCModel presumably caches its most recent predictions — confirm in models/rfc.py.
y_pred = baseline_pipeline.predict(X_test)

[Parallel(n_jobs=24)]: Using backend ThreadingBackend with 24 concurrent workers.
[Parallel(n_jobs=24)]: Done   2 tasks      | elapsed:    0.0s
[Parallel(n_jobs=24)]: Done 100 out of 100 | elapsed:    0.2s finished


In [6]:
# Score the baseline model against the test labels; evaluate() yields (metrics, confusion matrix).
metrics, cm = rfc.evaluate(y_test)

# Label the confusion matrix so rows read as actual classes and columns as predicted classes.
cm_df = pd.DataFrame(
    cm,
    index=['Actual Not Fraud', 'Actual Fraud'],
    columns=['Predicted Not Fraud', 'Predicted Fraud'],
)
cm_df

Unnamed: 0,Predicted Not Fraud,Predicted Fraud
Actual Not Fraud,553520,54
Actual Fraud,653,1492


In [7]:
# Fraud is the positive class, so precision and recall are the metrics we want to maximise.
baseline_label = 'No Under/Oversampling'

# One-row summary table; the label column goes first so rows from other experiments line up.
scores = pd.DataFrame(metrics, index=[0])
scores.insert(loc=0, column='Random Forest with', value=baseline_label)
scores

Unnamed: 0,Random Forest with,accuracy,precision,recall,f1_score
0,No Under/Oversampling,0.998728,0.965071,0.695571,0.808453


## Oversampling with SMOTE

In [8]:
# Same step order as the baseline pipeline, with SMOTE inserted between scaling and the classifier.
# Per the imbalanced-learn docs, a float sampling_strategy is the target minority/majority ratio
# after resampling.
# NOTE(review): SMOTE's n_jobs argument is deprecated in recent imbalanced-learn releases — confirm
# against the installed version (warnings are silenced in this notebook, so it would go unnoticed).
prep_smote, scaler_smote = DataPreparation(), StandardScaler()
smote = SMOTE(sampling_strategy=0.3, random_state=42, n_jobs=-1)
rfc_smote = RFCModel()
smote_pipeline = make_pipeline(prep_smote, scaler_smote, smote, rfc_smote)

In [9]:
# Fit the SMOTE pipeline on the training set (preparation -> scaling -> SMOTE -> classifier).
smote_pipeline.fit(X_train, y_train)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 24 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    9.7s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   46.9s finished


In [10]:
# Predict on the held-out test set with the SMOTE-trained pipeline.
# NOTE(review): this rebinds y_pred from the baseline cell and nothing shown here reads it afterwards;
# rfc_smote.evaluate(y_test) presumably uses predictions cached inside RFCModel — confirm.
y_pred = smote_pipeline.predict(X_test)

[Parallel(n_jobs=24)]: Using backend ThreadingBackend with 24 concurrent workers.
[Parallel(n_jobs=24)]: Done   2 tasks      | elapsed:    0.0s
[Parallel(n_jobs=24)]: Done 100 out of 100 | elapsed:    0.2s finished


In [11]:
# Score the SMOTE-trained model against the test labels; this rebinds `metrics`, `cm` and `cm_df`.
metrics, cm = rfc_smote.evaluate(y_test)

# Rows are actual classes, columns are predicted classes.
cm_df = pd.DataFrame(
    cm,
    index=['Actual Not Fraud', 'Actual Fraud'],
    columns=['Predicted Not Fraud', 'Predicted Fraud'],
)
cm_df

Unnamed: 0,Predicted Not Fraud,Predicted Fraud
Actual Not Fraud,553358,216
Actual Fraud,499,1646


In [12]:
# One-row summary for the SMOTE experiment, shaped like the rows already in `scores`.
smote_score = pd.DataFrame(metrics, index=[0])
smote_score.insert(0, 'Random Forest with', 'SMOTE')

# Drop any SMOTE row left by an earlier run of this cell so re-running it does not append
# duplicates, then renumber the rows (ignore_index) so the index labels are unique.
scores_without_smote = scores[scores['Random Forest with'] != 'SMOTE']
scores = pd.concat([scores_without_smote, smote_score], ignore_index=True)
scores

Unnamed: 0,Random Forest with,accuracy,precision,recall,f1_score
0,No Under/Oversampling,0.998728,0.965071,0.695571,0.808453
0,SMOTE,0.998713,0.883996,0.767366,0.821562


In [13]:
# Feature importances of the SMOTE-trained model, keyed by the feature names reported by prep_smote.
rfc_smote.get_feature_importance(prep_smote.get_feature_names())

{'amt': 0.5570984140623019,
 'hour': 0.16328982509761558,
 'day_of_week': 0.03271897246834442,
 'category_gas_transport': 0.024459595819924042,
 'category_shopping_net': 0.021141344069091444,
 'age': 0.020880677873352998,
 'category_grocery_pos': 0.019985508375160305,
 'month': 0.016695371678989726,
 'gender_M': 0.013306346609409665,
 'city_pop': 0.01083100898814325,
 'unix_time': 0.008444622873186479,
 'category_misc_net': 0.008315310629016762,
 'category_travel': 0.008023496457150205,
 'state': 0.007701623836850731,
 'category_misc_pos': 0.007547997842480841,
 'job': 0.007383134967250382,
 'merchant': 0.007250659059441068,
 'city': 0.007132914802856498,
 'lat': 0.0071250863692353655,
 'long': 0.00688163627761475,
 'category_home': 0.006530800401001886,
 'merch_lat': 0.006022579095559438,
 'merch_long': 0.005976130204695626,
 'category_shopping_pos': 0.005717270343804557,
 'category_food_dining': 0.00528345499822403,
 'category_grocery_net': 0.005207590105460749,
 'category_kids_pets'

In [14]:
# Persist the SMOTE-trained model to disk.
# NOTE(review): the '100_42' suffix presumably encodes n_estimators=100 and random_state=42, and
# only rfc_smote is passed, so the fitted preparation/scaling steps are presumably not saved — confirm.
rfc_smote.save_model('../models/rfc_smote_100_42.pkl')

## Hyperparameter search with GridSearchCV

In [15]:
# Search space for GridSearchCV. Keys have the form '<pipeline step>__<parameter>'.
# make_pipeline names each step after its lowercased class name, so the RFCModel step is
# 'rfcmodel'; the pipeline has no 'randomforestclassifier' step and that key would make
# GridSearchCV.fit raise "Invalid parameter".
# NOTE(review): assumes RFCModel exposes n_estimators through get_params/set_params — confirm
# in models/rfc.py.
grid = {
    'smote__sampling_strategy': [0.1, 0.2, 0.3, 0.5],
    'rfcmodel__n_estimators': [50, 75, 100, 150],
}

In [None]:
# Fresh, unfitted steps for the grid search, so the pipelines fitted above are left untouched.
prep_grid = DataPreparation()
scaler_grid = StandardScaler()
# Bound to `smote_grid` (matching the other *_grid names) so the global `smote` keeps pointing at
# the sampler used inside smote_pipeline. The pipeline step is still called 'smote', because
# make_pipeline derives step names from the lowercased class name, not from the variable name.
# sampling_strategy here is only the starting value; the search overrides it per candidate.
smote_grid = SMOTE(
    random_state=42,
    sampling_strategy=0.3,
    n_jobs=-1)
rfc_grid = RFCModel()
grid_pipeline = make_pipeline(prep_grid, scaler_grid, smote_grid, rfc_grid)

In [None]:
# 3-fold stratified cross-validation, so every fold keeps the class ratio of the training labels.
cv_folds = StratifiedKFold(n_splits=3)

# Candidates are ranked by recall; n_jobs=-1 runs the candidate fits in parallel.
grid_model = GridSearchCV(
    estimator=grid_pipeline,
    param_grid=grid,
    scoring='recall',
    cv=cv_folds,
    n_jobs=-1,
    verbose=1,
)

In [None]:
# Run the search on the training set: 4 x 4 parameter combinations, each cross-validated on 3 folds.
# Every candidate is a full pipeline fit, so expect this cell to take far longer than the single fits above.
grid_model.fit(X_train, y_train)

In [None]:
# Parameter combination with the best mean cross-validated recall.
grid_model.best_params_