In [1]:
import sys
import warnings

# Silence library warnings before the heavy imports below run.
warnings.filterwarnings("ignore")

# Make the project's `src` packages (data, models) importable from the notebook.
sys.path.append('../src')

import pandas as pd

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import make_pipeline

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler

from data.loader import DataLoader
from data.preparation import DataPreparation
from models.rfc import RFCModel

In [2]:
# Read the train and test CSVs through the project loader; each load()
# call is unpacked into a (features, labels) pair.
train_loader = DataLoader('../data/fraudTrain.csv')
test_loader = DataLoader('../data/fraudTest.csv')
X_train, y_train = train_loader.load()
X_test, y_test = test_loader.load()

## Baseline

In [3]:
# Baseline: feature preparation -> standardisation -> random forest,
# with no resampling step in between.
prep, scaler = DataPreparation(), StandardScaler()
rfc = RFCModel(n_estimators=75)
baseline_pipeline = make_pipeline(prep, scaler, rfc)

In [4]:
# Fit every step of the baseline pipeline (preparation, scaler, forest) on the training split.
baseline_pipeline.fit(X_train, y_train)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 24 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    4.9s
[Parallel(n_jobs=-1)]: Done  75 out of  75 | elapsed:   18.2s finished


In [5]:
# Predict on the held-out test features.
# NOTE(review): `y_pred` is not read again in this notebook; `rfc.evaluate(y_test)`
# presumably relies on predictions the model cached during this call — confirm in models/rfc.py.
y_pred = baseline_pipeline.predict(X_test)

[Parallel(n_jobs=24)]: Using backend ThreadingBackend with 24 concurrent workers.
[Parallel(n_jobs=24)]: Done   2 tasks      | elapsed:    0.0s
[Parallel(n_jobs=24)]: Done  75 out of  75 | elapsed:    0.1s finished


In [6]:
# Evaluate the baseline forest against the true test labels. evaluate() only
# receives y_test, so it presumably compares against the predictions made by
# the preceding predict() call — TODO confirm in models/rfc.py.
metrics, cm = rfc.evaluate(y_test)

# Wrap the raw confusion matrix in a labelled frame: rows are the actual
# classes, columns the predicted classes.
actual_labels = ['Actual Not Fraud', 'Actual Fraud']
predicted_labels = ['Predicted Not Fraud', 'Predicted Fraud']
cm_df = pd.DataFrame(cm, index=actual_labels, columns=predicted_labels)
cm_df

Unnamed: 0,Predicted Not Fraud,Predicted Fraud
Actual Not Fraud,553473,101
Actual Fraud,691,1454


In [7]:
# Fraud is the "positive" class, so precision and recall (and therefore the
# F1 score) are the metrics we want to maximise.
# Start the comparison table with a single row for the baseline model.
scores = pd.DataFrame(metrics, index=[0])
scores.insert(loc=0, column='Random Forest with', value='No Under/Oversampling')
scores

Unnamed: 0,Random Forest with,accuracy,precision,recall,f1_score
0,No Under/Oversampling,0.998575,0.935048,0.677855,0.785946


In [8]:
# Pair the baseline forest's feature importances with the feature names reported
# by the preparation step; the displayed mapping runs from most to least important.
rfc.get_feature_importance(prep.get_feature_names())

{'amt': 0.44342290731753836,
 'hour': 0.10672865143290824,
 'grocery_pos': 0.07576262781009635,
 'age': 0.06777018520902203,
 'unix_time': 0.04850324772916683,
 'merchant': 0.03556870989987343,
 'city_pop': 0.0353637792331568,
 'job': 0.030265840743529244,
 'city': 0.02989370273739533,
 'gas_transport': 0.02503033551499131,
 'state': 0.022316066232377346,
 'month': 0.01835774760296797,
 'high_risk_job': 0.017106533866574927,
 'day_of_week': 0.01696865895349737,
 'misc_net': 0.011870339965280809,
 'shopping_net': 0.009530047381657017,
 'shopping_pos': 0.005540618369966766}

## Oversampler

In [9]:
# Same steps as the baseline, with a SMOTE oversampler inserted between the
# scaler and the classifier. Fresh step instances are created so the fitted
# baseline objects are left untouched.
prep_smote, scaler_smote = DataPreparation(), StandardScaler()

# NOTE(review): recent imbalanced-learn releases deprecate SMOTE's `n_jobs`
# argument — confirm against the installed version before upgrading.
smote = SMOTE(sampling_strategy=0.2, random_state=42, n_jobs=-1)

rfc_smote = RFCModel(n_estimators=75)
smote_pipeline = make_pipeline(prep_smote, scaler_smote, smote, rfc_smote)

In [10]:
# Fit the SMOTE pipeline (preparation, scaler, oversampler, forest) on the training split.
smote_pipeline.fit(X_train, y_train)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 24 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    7.8s
[Parallel(n_jobs=-1)]: Done  75 out of  75 | elapsed:   27.2s finished


In [11]:
# Predict on the held-out test features with the SMOTE-trained pipeline.
# NOTE(review): this overwrites the baseline's `y_pred`, and the value is not read again
# in this notebook; `rfc_smote.evaluate(y_test)` presumably uses predictions cached by
# the model during this call — confirm in models/rfc.py.
y_pred = smote_pipeline.predict(X_test)

[Parallel(n_jobs=24)]: Using backend ThreadingBackend with 24 concurrent workers.
[Parallel(n_jobs=24)]: Done   2 tasks      | elapsed:    0.0s
[Parallel(n_jobs=24)]: Done  75 out of  75 | elapsed:    0.1s finished


In [12]:
# Evaluate the SMOTE-trained forest against the true test labels. evaluate()
# only receives y_test, so it presumably compares against the predictions made
# by the preceding predict() call — TODO confirm in models/rfc.py.
metrics, cm = rfc_smote.evaluate(y_test)

# Wrap the raw confusion matrix in a labelled frame: rows are the actual
# classes, columns the predicted classes.
actual_labels = ['Actual Not Fraud', 'Actual Fraud']
predicted_labels = ['Predicted Not Fraud', 'Predicted Fraud']
cm_df = pd.DataFrame(cm, index=actual_labels, columns=predicted_labels)
cm_df

Unnamed: 0,Predicted Not Fraud,Predicted Fraud
Actual Not Fraud,553316,258
Actual Fraud,571,1574


In [13]:
# Build a one-row summary of the SMOTE model's metrics, labelled like the baseline row.
smote_score = pd.DataFrame(metrics, index=[0])
smote_score.insert(0, 'Random Forest with', 'SMOTE')

# Add the row to the running comparison table.
# - Any 'SMOTE' row left by a previous execution of this cell is dropped first,
#   so re-running the cell does not pile up duplicate rows.
# - ignore_index=True renumbers the rows (0, 1, ...) instead of leaving every
#   row with the index label 0.
scores = pd.concat(
    [scores[scores['Random Forest with'] != 'SMOTE'], smote_score],
    ignore_index=True,
)
scores

Unnamed: 0,Random Forest with,accuracy,precision,recall,f1_score
0,No Under/Oversampling,0.998575,0.935048,0.677855,0.785946
0,SMOTE,0.998508,0.85917,0.7338,0.791551


In [14]:
# Pair the SMOTE-trained forest's feature importances with the feature names reported
# by its preparation step; the displayed mapping runs from most to least important.
rfc_smote.get_feature_importance(prep_smote.get_feature_names())

{'amt': 0.5589411814023789,
 'hour': 0.20004787210018754,
 'day_of_week': 0.03318405872326123,
 'gas_transport': 0.024676134801874063,
 'age': 0.023156030417611155,
 'month': 0.023018072391422967,
 'grocery_pos': 0.02259577978628667,
 'shopping_net': 0.01633945594854311,
 'unix_time': 0.015380884345067396,
 'city_pop': 0.014026204999703324,
 'job': 0.01227884306289771,
 'city': 0.012086606663201434,
 'state': 0.011950341943663859,
 'merchant': 0.010470009021275161,
 'misc_net': 0.008558920537964587,
 'high_risk_job': 0.008210925726101005,
 'shopping_pos': 0.005078678128559999}

In [15]:
# Persist the SMOTE-trained forest to disk.
# NOTE(review): only the classifier wrapper is saved here, not the fitted preparation/scaler
# steps of `smote_pipeline` — confirm that whoever loads this file re-applies them.
rfc_smote.save_model('../models/rfc_smote_75_42.pkl')

## GridSearchCV

In [None]:
# Hyper-parameter grid for the search: 4 x 4 = 16 combinations.
# Keys use the `<pipeline step name>__<parameter>` convention; make_pipeline names each
# step after its lower-cased class name (SMOTE -> 'smote',
# RandomForestClassifier -> 'randomforestclassifier').
grid = {
    # Oversampling ratio passed to the SMOTE step.
    'smote__sampling_strategy': [0.1, 0.2, 0.3, 0.5],
    # Number of trees in the forest.
    'randomforestclassifier__n_estimators': [50, 75, 100, 150],
}

In [None]:
# Unfitted pipeline handed to GridSearchCV as the estimator to tune.
# scikit-learn's RandomForestClassifier is used as the final step; make_pipeline
# names that step 'randomforestclassifier', which is what the param-grid keys target.
prep_grid = DataPreparation()
scaler_grid = StandardScaler()

# Named `smote_grid` (not `smote`) so the sampler built in the Oversampler section
# is not shadowed, and to match the other `*_grid` names in this cell.
smote_grid = SMOTE(
    random_state=42,
    sampling_strategy=0.3,  # starting value only; the grid's 'smote__sampling_strategy' overrides it
    n_jobs=-1)

# Fixed seed so that repeated searches rank the parameter combinations identically.
rfc_grid = RandomForestClassifier(random_state=42)

grid_pipeline = make_pipeline(prep_grid, scaler_grid, smote_grid, rfc_grid)

In [None]:
# Exhaustive search over `grid` with 3-fold stratified cross-validation.
# Candidates are ranked by F1 rather than recall alone: the stated goal of this
# notebook is to maximise both precision and recall on the fraud class, and
# recall by itself rewards models that simply flag more transactions as fraud.
grid_model = GridSearchCV(
    grid_pipeline,
    grid,
    cv=StratifiedKFold(n_splits=3),
    scoring='f1',
    n_jobs=-1,
    verbose=1,
)

In [None]:
# Run the search on the training split: 16 parameter combinations x 3 folds = 48 pipeline
# fits (plus GridSearchCV's final refit of the best candidate, which is on by default).
# Expect this cell to be by far the slowest in the notebook.
grid_model.fit(X_train, y_train)

In [None]:
# Author's recorded result: n_estimators=75 with sampling_strategy=0.2 came out best.
# This cell has no saved output, so re-run the search to confirm that finding.
# TODO: extend the grid with other parameters.
grid_model.best_params_