In [1]:
# Ref :
# https://towardsdatascience.com/how-to-tune-a-decision-tree-f03721801680
# https://medium.com/analytics-vidhya/decisiontree-classifier-working-on-moons-dataset-using-gridsearchcv-to-find-best-hyperparameters-ede24a06b489
# https://www.kaggle.com/fermatsavant/decision-tree-high-acc-using-gridsearchcv
# https://medium.com/analytics-vidhya/decisiontree-classifier-working-on-moons-dataset-using-gridsearchcv-to-find-best-hyperparameters-ede24a06b489

In [10]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import GridSearchCV

random_seed = 100

### Read Data

In [3]:
df = pd.read_csv('creditcard.csv', header=None)
df.columns = ['col_' + str(col) for col in df.columns]
df.sample(10)

Unnamed: 0,col_0,col_1,col_2,col_3,col_4,col_5,col_6,col_7,col_8,col_9,...,col_21,col_22,col_23,col_24,col_25,col_26,col_27,col_28,col_29,col_30
116584,74343.0,-0.713339,1.247239,0.649339,-0.158624,0.103149,-0.330059,0.234299,0.551754,-0.759508,...,-0.20846,-0.727487,-0.038285,-0.412832,-0.075818,0.099272,0.112961,0.012764,11.28,0
183641,125890.0,1.798071,0.250201,-2.965791,2.641648,3.325648,3.695267,0.327103,0.684736,-1.689328,...,0.145563,0.08163,0.014283,0.686716,0.283088,0.085567,-0.079019,-0.054448,96.75,0
56516,47464.0,-1.162452,0.795672,1.953356,1.654216,1.069023,-0.183501,0.093182,0.377642,-1.362161,...,0.25846,0.502164,-0.121908,0.094908,-0.078996,-0.084585,0.09307,0.146178,2.0,0
50494,44516.0,1.191514,-0.063162,0.83064,0.946044,-0.291276,0.810491,-0.639143,0.272633,0.604531,...,-0.078977,-0.018224,-0.170073,-0.807901,0.56449,-0.356721,0.07164,0.017949,9.99,0
81791,59088.0,1.117663,-0.064553,0.815978,1.040835,-0.342201,0.528131,-0.542878,0.200219,0.483516,...,0.102277,0.297337,-0.117374,-0.77026,0.372363,-0.284153,0.07202,0.035901,45.0,0
50793,44654.0,1.430365,-0.393018,0.435842,-0.757851,-0.8811,-0.822212,-0.505574,-0.21171,-0.857026,...,-0.110336,-0.406573,0.081789,0.030504,0.301186,-0.461793,0.013998,0.020811,9.99,0
225536,144288.0,1.769945,-0.358679,-1.868018,0.160267,0.733966,0.752475,-0.226617,0.364024,0.899733,...,-0.110365,-0.122457,0.187898,-1.55092,-0.323842,0.009826,0.02784,-0.039578,59.01,0
118137,74963.0,1.124072,0.094894,0.284698,0.896919,-0.067543,0.0801,-0.103018,0.159892,-0.065671,...,0.008615,-0.040802,-0.053768,-0.3353,0.446987,-0.406179,0.024888,0.011117,27.72,0
115209,73786.0,-0.342535,0.241431,1.805765,-0.259046,0.081243,0.201569,0.369828,-0.124194,0.537466,...,-0.129738,0.24366,-0.275303,0.104926,-0.083382,1.159828,-0.24708,-0.429851,5.0,0
39097,39693.0,-1.184994,0.24309,2.280318,1.316581,0.248879,-0.762975,-0.153772,0.176707,-0.31897,...,0.195686,0.418834,-0.126137,0.380621,0.370709,-0.101154,0.31534,0.166798,28.0,0


## check imblanace of target class

In [4]:
df_features = df.drop(['col_30'], axis=1)
df_target = df['col_30']
df_target.value_counts()

0    284315
1       492
Name: col_30, dtype: int64

## Keep 10% of data aside to validate which technique would really work ?

In [5]:
x, x_real_world, y, y_real_world = train_test_split(df_features, df_target, test_size = .1, random_state=random_seed)

# SMOTE after split

In [7]:
# Train, test split the original data
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = .2, random_state=random_seed)

# Oversample using SMOTE
sm = SMOTE(random_state=random_seed)
x_train_res, y_train_res = sm.fit_resample(x_train, y_train)
print("\nCounts before oversampling : \n", y_train.value_counts())
print("\nCounts after oversampling : \n", y_train_res.value_counts())



Counts before oversampling : 
 0    204705
1       355
Name: col_30, dtype: int64

Counts after oversampling : 
 0    204705
1    204705
Name: col_30, dtype: int64


# Basic Decision Tree Classifier

In [8]:
clf_dt = DecisionTreeClassifier(random_state=random_seed)
clf_dt.fit(x_train_res, y_train_res)
print("Train set f1_score : ", f1_score(y_train_res, clf_dt.predict(x_train_res)))
print("Test set f1_score : ", f1_score(y_test, clf_dt.predict(x_test)))
print("Real World f1_score : ", f1_score(y_real_world, clf_dt.predict(x_real_world)))

Train set f1_score :  1.0
Test set f1_score :  0.549618320610687
Real World f1_score :  0.513888888888889


## Decision Tree Grid CV

In [22]:
# Scoring parameters to use will vary depending on problem statement
# https://scikit-learn.org/stable/modules/model_evaluation.html

param_grid = {'max_features': ['auto', 'sqrt', 'log2'],
              'ccp_alpha': [0.1, .01, .001],
              'max_depth' : [2, 3, 4],
              'criterion' :['gini', 'entropy'],
              'max_leaf_nodes': [2, 3, 4, 5], 
              'min_samples_split': [2, 3, 4, 5, 6, 7, 10]
             }
tree_clas = DecisionTreeClassifier(random_state=1024)
grid_search_cv = GridSearchCV(estimator=tree_clas, param_grid=param_grid, cv=5, scoring='f1', verbose=True)
grid_search_cv.fit(x_train_res, y_train_res)

Fitting 5 folds for each of 864 candidates, totalling 4320 fits


GridSearchCV(cv=5, estimator=DecisionTreeClassifier(random_state=1024),
             param_grid={'ccp_alpha': [0.1, 0.01, 0.001],
                         'criterion': ['gini', 'entropy'],
                         'max_depth': [2, 3, 4],
                         'max_features': ['auto', 'sqrt', 'log2'],
                         'max_leaf_nodes': [2, 3, 4, 5],
                         'min_samples_split': [2, 3, 4, 5]},
             scoring='f1', verbose=True)

In [23]:
grid_search_cv.best_estimator_

DecisionTreeClassifier(ccp_alpha=0.001, max_depth=4, max_features='log2',
                       max_leaf_nodes=5, random_state=1024)

In [25]:
clf_dt = grid_search_cv.best_estimator_.fit(x_train_res, y_train_res)
print("Train set f1_score : ", f1_score(y_train_res, clf_dt.predict(x_train_res))) #Same as best accuracy

print("Test set f1_score : ", f1_score(y_test, clf_dt.predict(x_test)))
print("Real World f1_score : ", f1_score(y_real_world, clf_dt.predict(x_real_world)))

Train set f1_score :  1.0
Test set f1_score :  0.549618320610687
Real World f1_score :  0.513888888888889


## Random Forest

In [28]:
rfc = RandomForestClassifier(n_jobs=-1,max_features= 'sqrt' ,n_estimators=50, oob_score = True) 
param_grid = { 
    'n_estimators': [50,100],
    'max_features': ['auto', 'sqrt', 'log2']
}
CV_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid, cv= 5)
CV_rfc.fit(x_train_res, y_train_res)
print(CV_rfc.best_params_)

KeyboardInterrupt: 