In [1]:
import sys
sys.path.append('../src')

In [29]:
from data.preparation import DataPreparation
from models.rfc import RFCModel
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.model_selection import cross_val_score
import pandas as pd

In [3]:
dp = DataPreparation('../data/fraudTrain.csv')

In [4]:
dp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1296675 entries, 0 to 1296674
Data columns (total 15 columns):
 #   Column      Non-Null Count    Dtype  
---  ------      --------------    -----  
 0   merchant    1296675 non-null  object 
 1   category    1296675 non-null  object 
 2   amt         1296675 non-null  float64
 3   gender      1296675 non-null  object 
 4   city        1296675 non-null  object 
 5   state       1296675 non-null  object 
 6   lat         1296675 non-null  float64
 7   long        1296675 non-null  float64
 8   city_pop    1296675 non-null  int64  
 9   job         1296675 non-null  object 
 10  dob         1296675 non-null  object 
 11  unix_time   1296675 non-null  int64  
 12  merch_lat   1296675 non-null  float64
 13  merch_long  1296675 non-null  float64
 14  is_fraud    1296675 non-null  int64  
dtypes: float64(5), int64(3), object(7)
memory usage: 148.4+ MB
None
(1296675, 15)


In [5]:
train_df = dp.get_data()
train_df.head()

Unnamed: 0,merchant,category,amt,gender,city,state,lat,long,city_pop,job,dob,unix_time,merch_lat,merch_long,is_fraud
0,"fraud_Rippin, Kub and Mann",misc_net,4.97,F,Moravian Falls,NC,36.0788,-81.1781,3495,"Psychologist, counselling",1988-03-09,1325376018,36.011293,-82.048315,0
1,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,F,Orient,WA,48.8878,-118.2105,149,Special educational needs teacher,1978-06-21,1325376044,49.159047,-118.186462,0
2,fraud_Lind-Buckridge,entertainment,220.11,M,Malad City,ID,42.1808,-112.262,4154,Nature conservation officer,1962-01-19,1325376051,43.150704,-112.154481,0
3,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,M,Boulder,MT,46.2306,-112.1138,1939,Patent attorney,1967-01-12,1325376076,47.034331,-112.561071,0
4,fraud_Keeling-Crist,misc_pos,41.96,M,Doe Hill,VA,38.4207,-79.4629,99,Dance movement psychotherapist,1986-03-28,1325376186,38.674999,-78.632459,0


In [6]:
X_train, y_train = train_df.drop('is_fraud', axis=1), train_df['is_fraud']

In [8]:
dp_test = DataPreparation('../data/fraudTest.csv')
dp_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 555719 entries, 0 to 555718
Data columns (total 15 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   merchant    555719 non-null  object 
 1   category    555719 non-null  object 
 2   amt         555719 non-null  float64
 3   gender      555719 non-null  object 
 4   city        555719 non-null  object 
 5   state       555719 non-null  object 
 6   lat         555719 non-null  float64
 7   long        555719 non-null  float64
 8   city_pop    555719 non-null  int64  
 9   job         555719 non-null  object 
 10  dob         555719 non-null  object 
 11  unix_time   555719 non-null  int64  
 12  merch_lat   555719 non-null  float64
 13  merch_long  555719 non-null  float64
 14  is_fraud    555719 non-null  int64  
dtypes: float64(5), int64(3), object(7)
memory usage: 63.6+ MB
None
(555719, 15)


In [9]:
test_df = dp_test.get_data()
X_test, y_test = test_df.drop('is_fraud', axis=1), test_df['is_fraud']

In [24]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder


feature_scaler = StandardScaler() 
one_hot_encoder = OneHotEncoder(handle_unknown='ignore')
object_columns = X_train.select_dtypes(include='object').columns
number_columns = X_train.select_dtypes(include='number').columns

column_transform = make_column_transformer((one_hot_encoder, object_columns),
                                          (feature_scaler, number_columns),
                                           remainder="drop")

## Baseline

In [25]:
kf = StratifiedKFold(n_splits=2, shuffle=False)

In [30]:
rfc = RandomForestClassifier()
baseline_pipeline = make_pipeline(column_transform, rfc)

In [31]:
score3 = cross_val_score(baseline_pipeline, X_train, y_train, scoring='recall', cv=kf)
print("Cross Validation Recall Scores are: {}".format(score3))
print("Average Cross Validation Recall score: {}".format(score3.mean()))

Cross Validation Recall Scores are: [1.         0.31254996]
Average Cross Validation Recall score: 0.6562749800159873


In [44]:
X_train_trans = column_transform.transform(X_train)
X_test_trans = column_transform.transform(X_test)

In [45]:
rfc.fit(X_train_trans, y_train)

In [46]:
rfc_model = RFCModel()
rfc_model.replace_model(rfc)

In [49]:
metrics = rfc_model.evaluate(X_test_trans, y_test)

In [50]:
rf_score = pd.DataFrame(data = metrics, columns=['Recall','Precision','F1 Score', 'Accuracy'])
rf_score.insert(0, 'Random Forest with', 'No Under/Oversampling')

In [51]:
rf_score

Unnamed: 0,Random Forest with,Recall,Precision,F1 Score,Accuracy
0,No Under/Oversampling,0.356177,0.823276,0.497234,0.99722


In [54]:
rfc_model.get_feature_importance(X_train)
#TODO

AttributeError: 'zip' object has no attribute 'items'

In [55]:
rfc_model.confusion_matrix()

array([[553410,    164],
       [  1381,    764]])

In [12]:


params = {
    'n_estimators': [50, 100, 200],
    'max_depth': [4, 6, 10, 12],
    'random_state': [13]
}


In [16]:
kf = StratifiedKFold(n_splits=2, shuffle=False)

In [15]:
grid_rf = GridSearchCV(rfc.get_model(), param_grid=params, cv=kf, 
                          scoring='recall').fit(X_train, y_train)

KeyboardInterrupt: 

In [18]:
print('Best parameters:', grid_rf.best_params_)
print('Best score:', grid_rf.best_score_)

Best parameters: {'max_depth': 12, 'n_estimators': 50, 'random_state': 13}
Best score: 0.658273381294964


In [19]:
rfc.replace_model(grid_rf.best_estimator_)

In [20]:
metrics = rfc.evaluate(X_test, y_test)
feature_scaling_gridsearch_rf_score = pd.DataFrame(data = metrics, columns=['Recall','Precision','F1 Score', 'Accuracy'])
feature_scaling_gridsearch_rf_score.insert(0, 'Random Forest with', 'Feauture Scaling and GridSearchCV')
feature_scaling_gridsearch_rf_score

Unnamed: 0,Random Forest with,Recall,Precision,F1 Score,Accuracy
0,Feauture Scaling and GridSearchCV,0.477389,0.769925,0.589353,0.997432


In [21]:
rfc.confusion_matrix()

array([[553268,    306],
       [  1121,   1024]])

In [22]:
rfc.save_model('../models/rfc2.pkl')

In [23]:
predictions = pd.concat([rf_score, feature_scaling_gridsearch_rf_score], ignore_index=True, sort=False)
predictions.sort_values(by=['Recall'], ascending=False)

Unnamed: 0,Random Forest with,Recall,Precision,F1 Score,Accuracy
1,Feauture Scaling and GridSearchCV,0.477389,0.769925,0.589353,0.997432
0,No Under/Oversampling,0.0,0.0,0.0,0.99614


## Oversampler

In [23]:
smote_pipeline = make_pipeline(column_transform, SMOTE(random_state=42), rfc.get_model())

In [25]:
score3 = cross_val_score(smote_pipeline, X_train, y_train, scoring='recall', cv=kf)
print("Cross Validation Recall Scores are: {}".format(score3))
print("Average Cross Validation Recall score: {}".format(score3.mean()))

Cross Validation Recall Scores are: [8.85691447e-01 7.99360512e-04]
Average Cross Validation Recall score: 0.44324540367705834


In [24]:
new_params = {'randomforestclassifier__' + key: params[key] for key in params}
smote_rf = GridSearchCV(smote_pipeline, param_grid=new_params, cv=kf, scoring='recall',
                        return_train_score=True)
smote_rf.fit(X_train, y_train)

ValueError: 
All the 24 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
24 fits failed with the following error:
Traceback (most recent call last):
  File "/home/benya/edu/continual-fraud/.venv/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/benya/edu/continual-fraud/.venv/lib/python3.8/site-packages/sklearn/base.py", line 1152, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/home/benya/edu/continual-fraud/.venv/lib/python3.8/site-packages/imblearn/pipeline.py", line 322, in fit
    Xt, yt = self._fit(X, y, routed_params)
  File "/home/benya/edu/continual-fraud/.venv/lib/python3.8/site-packages/imblearn/pipeline.py", line 248, in _fit
    X, fitted_transformer = fit_transform_one_cached(
  File "/home/benya/edu/continual-fraud/.venv/lib/python3.8/site-packages/joblib/memory.py", line 312, in __call__
    return self.func(*args, **kwargs)
  File "/home/benya/edu/continual-fraud/.venv/lib/python3.8/site-packages/imblearn/pipeline.py", line 1097, in _fit_transform_one
    res = transformer.fit_transform(X, y, **params.get("fit_transform", {}))
  File "/home/benya/edu/continual-fraud/.venv/lib/python3.8/site-packages/sklearn/utils/_set_output.py", line 157, in wrapped
    data_to_wrap = f(self, X, *args, **kwargs)
  File "/home/benya/edu/continual-fraud/.venv/lib/python3.8/site-packages/sklearn/base.py", line 1152, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/home/benya/edu/continual-fraud/.venv/lib/python3.8/site-packages/sklearn/compose/_column_transformer.py", line 754, in fit_transform
    result = self._fit_transform(X, y, _fit_transform_one)
  File "/home/benya/edu/continual-fraud/.venv/lib/python3.8/site-packages/sklearn/compose/_column_transformer.py", line 681, in _fit_transform
    return Parallel(n_jobs=self.n_jobs)(
  File "/home/benya/edu/continual-fraud/.venv/lib/python3.8/site-packages/sklearn/utils/parallel.py", line 65, in __call__
    return super().__call__(iterable_with_config)
  File "/home/benya/edu/continual-fraud/.venv/lib/python3.8/site-packages/joblib/parallel.py", line 1918, in __call__
    return output if self.return_generator else list(output)
  File "/home/benya/edu/continual-fraud/.venv/lib/python3.8/site-packages/joblib/parallel.py", line 1847, in _get_sequential_output
    res = func(*args, **kwargs)
  File "/home/benya/edu/continual-fraud/.venv/lib/python3.8/site-packages/sklearn/utils/parallel.py", line 127, in __call__
    return self.function(*args, **kwargs)
  File "/home/benya/edu/continual-fraud/.venv/lib/python3.8/site-packages/sklearn/pipeline.py", line 957, in _fit_transform_one
    res = transformer.fit_transform(X, y, **fit_params)
TypeError: fit_transform() takes 2 positional arguments but 3 were given


In [27]:
print('Best parameters:', smote_rf.best_params_)
print('Best score:', smote_rf.best_score_)

Best parameters: {'randomforestclassifier__max_depth': 6, 'randomforestclassifier__n_estimators': 200, 'randomforestclassifier__random_state': 13}
Best score: 0.8076205702104983


In [28]:
y_pred = smote_rf.best_estimator_.named_steps['randomforestclassifier'].predict(X_test)

In [29]:
rfc.replace_model(smote_rf.best_estimator_)

In [30]:
metrics = rfc.evaluate(X_test, y_test)
feature_scaling_gridsearch_smote_rf_score = pd.DataFrame(data = metrics, columns=['Recall','Precision','F1 Score', 'Accuracy'])
feature_scaling_gridsearch_smote_rf_score.insert(0, 'Random Forest with', 'Feauture Scaling and GridSearchCV')
feature_scaling_gridsearch_smote_rf_score

Unnamed: 0,Random Forest with,Recall,Precision,F1 Score,Accuracy
0,Feauture Scaling and GridSearchCV,0.751981,0.063934,0.117849,0.956546


In [31]:
from sklearn.metrics import confusion_matrix

In [32]:
confusion_matrix(y_test, y_pred)

array([[529958,  23616],
       [   532,   1613]])

In [36]:
rfc.save_model('../models/rfc_smote_200_12.pkl')