In [1]:
import sys
# Add the sibling ../src directory to the import path; the next cell imports
# `data.preparation` and `models.rfc`, presumably resolved from there.
# NOTE(review): the path is relative, so it depends on the kernel's working
# directory being this notebook's folder.
sys.path.append('../src')

In [2]:
from data.preparation import DataPreparation
from models.rfc import RFCModel


import pandas as pd

In [3]:
# Build the project's preparation helper for the training CSV.
# NOTE(review): what DataPreparation does at construction time (loading,
# encoding, scaling) lives in ../src and is not visible here — confirm there.
dp = DataPreparation('../data/fraudTrain.csv')

In [4]:
# Print a summary of the prepared training data. The recorded output shows
# 1,296,675 rows and 15 numeric columns with no nulls, `is_fraud` being last.
dp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1296675 entries, 0 to 1296674
Data columns (total 15 columns):
 #   Column      Non-Null Count    Dtype  
---  ------      --------------    -----  
 0   merchant    1296675 non-null  int64  
 1   category    1296675 non-null  int64  
 2   amt         1296675 non-null  float64
 3   gender      1296675 non-null  int64  
 4   city        1296675 non-null  int64  
 5   state       1296675 non-null  int64  
 6   lat         1296675 non-null  float64
 7   long        1296675 non-null  float64
 8   city_pop    1296675 non-null  int64  
 9   job         1296675 non-null  int64  
 10  dob         1296675 non-null  int64  
 11  unix_time   1296675 non-null  int64  
 12  merch_lat   1296675 non-null  float64
 13  merch_long  1296675 non-null  float64
 14  is_fraud    1296675 non-null  int64  
dtypes: float64(5), int64(10)
memory usage: 148.4 MB
None
(1296675, 15)


In [5]:
# Fetch the prepared training table from the helper and preview its first rows.
train_df = dp.get_data()
train_df.head()

Unnamed: 0,merchant,category,amt,gender,city,state,lat,long,city_pop,job,dob,unix_time,merch_lat,merch_long,is_fraud
0,514,8,-0.407826,0,526,27,36.0788,-81.1781,3495,370,779,1325376018,36.011293,-82.048315,0
1,241,4,0.230039,0,612,47,48.8878,-118.2105,149,428,607,1325376044,49.159047,-118.186462,0
2,390,0,0.934149,1,468,13,42.1808,-112.262,4154,307,302,1325376051,43.150704,-112.154481,0
3,360,2,-0.158132,1,84,26,46.2306,-112.1138,1939,328,397,1325376076,47.034331,-112.561071,0
4,297,9,-0.177094,1,216,45,38.4207,-79.4629,99,116,734,1325376186,38.674999,-78.632459,0


In [6]:
# Separate the predictors from the `is_fraud` label column.
X_train = train_df.drop(columns='is_fraud')
y_train = train_df['is_fraud']

In [7]:
# Instantiate the project's model wrapper with its default settings.
# NOTE(review): the wrapped estimator appears to be a RandomForestClassifier
# (later pipeline step names say so); its defaults are defined in ../src.
rfc = RFCModel()

In [8]:
# Fit the wrapped model on the full training split (no resampling yet).
rfc.train(X_train, y_train)

In [9]:
# Prepare the held-out test CSV with a second, independent helper instance and
# print its summary (recorded output: 555,719 rows, same 15 columns as train).
# NOTE(review): if DataPreparation fits encoders/scalers per instance, the
# integer codes and scaled values may not line up with the training data —
# confirm in ../src before trusting test metrics.
dp_test = DataPreparation('../data/fraudTest.csv')
dp_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 555719 entries, 0 to 555718
Data columns (total 15 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   merchant    555719 non-null  int64  
 1   category    555719 non-null  int64  
 2   amt         555719 non-null  float64
 3   gender      555719 non-null  int64  
 4   city        555719 non-null  int64  
 5   state       555719 non-null  int64  
 6   lat         555719 non-null  float64
 7   long        555719 non-null  float64
 8   city_pop    555719 non-null  int64  
 9   job         555719 non-null  int64  
 10  dob         555719 non-null  int64  
 11  unix_time   555719 non-null  int64  
 12  merch_lat   555719 non-null  float64
 13  merch_long  555719 non-null  float64
 14  is_fraud    555719 non-null  int64  
dtypes: float64(5), int64(10)
memory usage: 63.6 MB
None
(555719, 15)


In [10]:
# Fetch the prepared test table and split it into predictors and label.
test_df = dp_test.get_data()
X_test = test_df.drop(columns='is_fraud')
y_test = test_df['is_fraud']

In [11]:
# Score the baseline model on the test split; the result feeds the score table
# built in the next cell.
# NOTE(review): the recorded output contains a `_warn_prf` warning line,
# presumably sklearn's undefined-metric warning — consistent with the later
# recorded confusion matrix, whose second column is all zeros.
metrics = rfc.evaluate(X_test, y_test)

  _warn_prf(average, modifier, msg_start, len(result))


In [12]:
# Turn the baseline metrics into a summary table and tag it with the
# experiment label as the leading column.
rf_score = pd.DataFrame(
    metrics,
    columns=['Recall', 'Precision', 'F1 Score', 'Accuracy'],
)
rf_score.insert(loc=0, column='Random Forest with', value='No Under/Oversampling')

In [13]:
# Display the baseline score table (recorded output: recall, precision and F1
# are all 0.0 while accuracy is 0.99614).
rf_score

Unnamed: 0,Random Forest with,Recall,Precision,F1 Score,Accuracy
0,No Under/Oversampling,0.0,0.0,0.0,0.99614


In [14]:
# Show per-feature importances keyed by column name; in the recorded output
# `amt` dominates at roughly 0.76.
rfc.get_feature_importance(X_train)

{'merchant': 0.0006263105874443632,
 'category': 0.07846230897998699,
 'amt': 0.7560244282587812,
 'gender': 0.006533783792897625,
 'city': 0.0009154646471433975,
 'state': 0.00233744394993332,
 'lat': 0.08215638041611112,
 'long': 1.235960350524123e-05,
 'city_pop': 0.007655564727005781,
 'job': 0.0032283512584462093,
 'dob': 0.01610109797584191,
 'unix_time': 0.004600646624103639,
 'merch_lat': 0.03232344392631547,
 'merch_long': 0.00902241525248374}

In [15]:
# Confusion matrix for the most recent evaluation (presumably; the wrapper
# takes no arguments here). Recorded output has an all-zero second column —
# assuming sklearn's rows=true / columns=predicted layout, no row was
# predicted as fraud.
rfc.confusion_matrix()

array([[553574,      0],
       [  2145,      0]])

In [17]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold

# Hyper-parameter grid for the forest; `random_state` has a single candidate
# so the seed is pinned across every combination.
params = {
    'n_estimators': [50, 100, 200],
    'max_depth': [4, 6, 10, 12],
    'random_state': [13],
}

# Two stratified folds taken in row order (no shuffling).
# NOTE(review): `unix_time` in the preview rows looks ascending, so unshuffled
# folds may split the data by time period; the SMOTE cross-validation recorded
# further down shows very different recall per fold — consider whether
# shuffled or time-aware splitting is intended.
kf = StratifiedKFold(n_splits=2, shuffle=False)

# Exhaustive search over the grid, ranking candidates by recall.
grid_rf = GridSearchCV(
    estimator=rfc.get_model(),
    param_grid=params,
    scoring='recall',
    cv=kf,
)
# Trailing semicolon keeps the cell output-free, as with the chained form.
grid_rf.fit(X_train, y_train);

In [18]:
# Report the winning parameter combination and its mean cross-validated score
# (recall, since the search above was created with scoring='recall').
print('Best parameters:', grid_rf.best_params_)
print('Best score:', grid_rf.best_score_)

Best parameters: {'max_depth': 12, 'n_estimators': 50, 'random_state': 13}
Best score: 0.658273381294964


In [19]:
# Swap the wrapper's model for the refit best estimator from the grid search,
# so later `rfc.*` calls presumably operate on the tuned forest.
rfc.replace_model(grid_rf.best_estimator_)

In [20]:
# Re-score on the test split (the wrapper was handed the grid-searched
# estimator in the previous cell), then build and display the labelled table.
metrics = rfc.evaluate(X_test, y_test)
feature_scaling_gridsearch_rf_score = pd.DataFrame(
    metrics,
    columns=['Recall', 'Precision', 'F1 Score', 'Accuracy'],
)
# NOTE(review): the label's spelling is kept exactly as emitted at runtime.
feature_scaling_gridsearch_rf_score.insert(
    loc=0,
    column='Random Forest with',
    value='Feauture Scaling and GridSearchCV',
)
feature_scaling_gridsearch_rf_score

Unnamed: 0,Random Forest with,Recall,Precision,F1 Score,Accuracy
0,Feauture Scaling and GridSearchCV,0.477389,0.769925,0.589353,0.997432


In [21]:
# Confusion matrix after the grid-searched evaluation. Recorded output:
# [[553268, 306], [1121, 1024]] — assuming rows=true / columns=predicted,
# 1024 of 2145 fraud rows are caught.
rfc.confusion_matrix()

array([[553268,    306],
       [  1121,   1024]])

In [22]:
# Persist the wrapper's current model to disk. This runs before the SMOTE
# section, so the file presumably holds the grid-searched forest.
# NOTE(review): the '.pkl' suffix suggests pickle — only load such files from
# trusted sources; confirm the format in RFCModel.save_model.
rfc.save_model('../models/rfc2.pkl')

In [23]:
# Stack the two score tables and display them ranked by recall, best first.
predictions = pd.concat(
    [rf_score, feature_scaling_gridsearch_rf_score],
    sort=False,
    ignore_index=True,
)
predictions.sort_values(by='Recall', ascending=False)

Unnamed: 0,Random Forest with,Recall,Precision,F1 Score,Accuracy
1,Feauture Scaling and GridSearchCV,0.477389,0.769925,0.589353,0.997432
0,No Under/Oversampling,0.0,0.0,0.0,0.99614


## Oversampler

In [24]:
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline, make_pipeline
from sklearn.model_selection import cross_val_score

# Chain SMOTE oversampling (seeded) with the wrapper's current estimator.
# make_pipeline names steps after the lowercased class, which is why the
# forest is addressed as 'randomforestclassifier' further down.
# NOTE(review): `rfc.get_model()` presumably returns the grid-searched
# estimator installed via replace_model in an earlier cell, so this pipeline
# starts from those tuned hyper-parameters rather than library defaults.
smote_pipeline = make_pipeline(SMOTE(random_state=42), rfc.get_model())

In [25]:
# Cross-validate the SMOTE pipeline on recall using the same two folds (`kf`)
# as the grid search, then print per-fold scores and their mean.
# NOTE(review): the recorded fold scores are ~0.886 and ~0.0008, so the mean
# (~0.443) hides a near-total failure on one fold — worth investigating the
# unshuffled split before relying on this number.
score3 = cross_val_score(smote_pipeline, X_train, y_train, scoring='recall', cv=kf)
print("Cross Validation Recall Scores are: {}".format(score3))
print("Average Cross Validation Recall score: {}".format(score3.mean()))

Cross Validation Recall Scores are: [8.85691447e-01 7.99360512e-04]
Average Cross Validation Recall score: 0.44324540367705834


In [26]:
# Prefix every grid key with the pipeline step name so the search routes each
# candidate list to the forest inside the SMOTE pipeline.
new_params = {
    'randomforestclassifier__' + name: candidates
    for name, candidates in params.items()
}

# Same recall-driven search as before, now over the whole pipeline; training
# scores are recorded alongside the validation scores.
smote_rf = GridSearchCV(
    estimator=smote_pipeline,
    param_grid=new_params,
    scoring='recall',
    cv=kf,
    return_train_score=True,
)
smote_rf.fit(X_train, y_train)

In [27]:
# Report the best pipeline parameters and the mean cross-validated recall
# from the SMOTE grid search.
print('Best parameters:', smote_rf.best_params_)
print('Best score:', smote_rf.best_score_)

Best parameters: {'randomforestclassifier__max_depth': 6, 'randomforestclassifier__n_estimators': 200, 'randomforestclassifier__random_state': 13}
Best score: 0.8076205702104983


In [28]:
# Predict test labels with the forest step pulled out of the best pipeline;
# `y_pred` is consumed by the sklearn confusion matrix in the last cell.
# NOTE(review): this bypasses the pipeline object. imblearn documents that
# samplers run only during fit, so `smote_rf.predict(X_test)` should give the
# same labels — confirm and consider using it for clarity.
y_pred = smote_rf.best_estimator_.named_steps['randomforestclassifier'].predict(X_test)

In [29]:
# Hand the whole refit SMOTE pipeline (not just its forest step) to the
# wrapper.
# NOTE(review): wrapper methods that expect a bare forest (e.g. feature
# importances) may not work on a pipeline — confirm in RFCModel.
rfc.replace_model(smote_rf.best_estimator_)

In [30]:
# Score the SMOTE + grid-searched pipeline on the test split and build its
# summary table.
metrics = rfc.evaluate(X_test, y_test)
feature_scaling_gridsearch_smote_rf_score = pd.DataFrame(
    data=metrics,
    columns=['Recall', 'Precision', 'F1 Score', 'Accuracy'],
)
# The label must differ from the non-SMOTE grid-search row so the two runs can
# be told apart when the score tables are compared or concatenated.
feature_scaling_gridsearch_smote_rf_score.insert(
    0, 'Random Forest with', 'Feature Scaling, GridSearchCV and SMOTE'
)
feature_scaling_gridsearch_smote_rf_score

Unnamed: 0,Random Forest with,Recall,Precision,F1 Score,Accuracy
0,Feauture Scaling and GridSearchCV,0.751981,0.063934,0.117849,0.956546


In [31]:
from sklearn.metrics import confusion_matrix

In [32]:
# Confusion matrix for the SMOTE model computed directly with sklearn
# (rows = true class, columns = predicted class). Recorded output:
# [[529958, 23616], [532, 1613]] — 1613 of 2145 fraud rows are caught, at the
# cost of 23,616 false alarms.
confusion_matrix(y_test, y_pred)

array([[529958,  23616],
       [   532,   1613]])