In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import pickle

In [2]:
df = pd.read_csv('dataset/train_clean.csv')

In [3]:
df.head()

Unnamed: 0,sl_no,gender,ssc_p,ssc_b,hsc_p,hsc_b,hsc_s,degree_p,degree_t,workex,etest_p,specialisation,mba_p,status,salary
0,1,0,67.0,0,91.0,0,0,58.0,1,0,55.0,1,58.8,1,270000.0
1,2,0,79.33,1,78.33,0,1,77.48,1,1,86.5,0,66.28,1,200000.0
2,3,0,65.0,1,68.0,1,2,64.0,0,0,75.0,0,57.8,1,250000.0
3,4,0,56.0,1,52.0,1,1,52.0,1,0,66.0,1,59.43,0,288655.405405
4,5,0,85.8,1,73.6,1,0,73.3,0,0,96.8,0,55.5,1,425000.0


In [4]:
df.drop(columns=['sl_no'], inplace=True)

In [5]:
X = df.drop(columns=['status'], axis=1)
y = df['status']

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3)

In [9]:
scaler = pickle.load(open("../model_file/scaling.pkl","rb"))

In [11]:
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [12]:
X_train

array([[-6.85994341e-01, -7.57290748e-01, -1.12815215e+00, ...,
         1.06904497e+00,  5.67423045e-01,  8.61568772e-03],
       [-6.85994341e-01, -6.55851334e-01, -1.12815215e+00, ...,
         1.06904497e+00, -4.73901214e-01,  8.61568772e-03],
       [ 1.45773797e+00, -6.55851334e-01,  8.86405260e-01, ...,
         1.06904497e+00, -1.99674217e-03, -1.25766544e-01],
       ...,
       [ 1.45773797e+00,  9.82785365e-01,  8.86405260e-01, ...,
         1.06904497e+00,  1.72715876e+00,  8.61568772e-03],
       [ 1.45773797e+00,  9.47671722e-01,  8.86405260e-01, ...,
        -9.35414347e-01,  8.54744956e-01, -4.78892714e-01],
       [-6.85994341e-01,  1.19736874e+00, -1.12815215e+00, ...,
         1.06904497e+00,  1.53561082e+00,  2.04343707e+00]])

In [19]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

In [14]:
model_dt = DecisionTreeClassifier()

In [15]:
model_dt.fit(X_train,y_train)

DecisionTreeClassifier()

In [16]:
model_dt.score(X_train,y_train)

1.0

In [17]:
model_dt.score(X_test, y_test)

0.9846153846153847

In [18]:
params = {
    'max_depth': [2, 3, 5, 10, 20],
    'min_samples_leaf': [5, 10, 20, 50, 100],
    'criterion': ["gini", "entropy"]
}

In [25]:
grid_search = GridSearchCV(estimator=model_dt, 
                           param_grid=params, 
                           cv=4, n_jobs=-1, verbose=3, scoring = "precision")

In [26]:
%%time
grid_search.fit(X_train, y_train)

Fitting 4 folds for each of 50 candidates, totalling 200 fits
CPU times: total: 219 ms
Wall time: 247 ms


GridSearchCV(cv=4, estimator=DecisionTreeClassifier(), n_jobs=-1,
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': [2, 3, 5, 10, 20],
                         'min_samples_leaf': [5, 10, 20, 50, 100]},
             scoring='precision', verbose=3)

In [27]:
score_df = pd.DataFrame(grid_search.cv_results_)
score_df.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_depth,param_min_samples_leaf,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,mean_test_score,std_test_score,rank_test_score
0,0.001805,0.000615,0.002551,0.000359,gini,2,5,"{'criterion': 'gini', 'max_depth': 2, 'min_sam...",1.0,1.0,1.0,0.857143,0.964286,0.061859,29
1,0.002666,0.000905,0.001994,2e-06,gini,2,10,"{'criterion': 'gini', 'max_depth': 2, 'min_sam...",1.0,1.0,1.0,0.84,0.96,0.069282,30
2,0.001163,0.000502,0.001981,2.4e-05,gini,2,20,"{'criterion': 'gini', 'max_depth': 2, 'min_sam...",1.0,1.0,1.0,0.84,0.96,0.069282,30
3,0.001496,0.000864,0.002493,0.000499,gini,2,50,"{'criterion': 'gini', 'max_depth': 2, 'min_sam...",1.0,0.956522,1.0,0.84,0.94913,0.065459,36
4,0.00142,0.000491,0.002367,0.000649,gini,2,100,"{'criterion': 'gini', 'max_depth': 2, 'min_sam...",0.710526,0.710526,0.702703,0.702703,0.706615,0.003912,41


In [29]:
score_df.nlargest(5,"mean_test_score")

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_depth,param_min_samples_leaf,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,mean_test_score,std_test_score,rank_test_score
5,0.001599,0.000253,0.002911,0.00013,gini,3,5,"{'criterion': 'gini', 'max_depth': 3, 'min_sam...",1.0,1.0,1.0,1.0,1.0,0.0,1
6,0.001906,0.000155,0.002063,0.000612,gini,3,10,"{'criterion': 'gini', 'max_depth': 3, 'min_sam...",1.0,1.0,1.0,1.0,1.0,0.0,1
10,0.001134,0.000529,0.001746,0.000432,gini,5,5,"{'criterion': 'gini', 'max_depth': 5, 'min_sam...",1.0,1.0,1.0,1.0,1.0,0.0,1
11,0.001326,0.000408,0.001755,0.000138,gini,5,10,"{'criterion': 'gini', 'max_depth': 5, 'min_sam...",1.0,1.0,1.0,1.0,1.0,0.0,1
15,0.001626,0.000332,0.002014,0.000616,gini,10,5,"{'criterion': 'gini', 'max_depth': 10, 'min_sa...",1.0,1.0,1.0,1.0,1.0,0.0,1


In [30]:
grid_search.best_estimator_

DecisionTreeClassifier(max_depth=3, min_samples_leaf=5)

In [31]:
dt_best = grid_search.best_estimator_

In [56]:
from sklearn.metrics import confusion_matrix, accuracy_score, recall_score, precision_score, f1_score
def evaluate_model(dt_classifier):
    print("Train Accuracy :", accuracy_score(y_train, dt_classifier.predict(X_train)))
    print("Train Confusion Matrix:")
    print(confusion_matrix(y_train, dt_classifier.predict(X_train)))
    print("-"*50)
    print("Test Accuracy :", accuracy_score(y_test, dt_classifier.predict(X_test)))
    print("Test Precision :", precision_score(y_test, dt_classifier.predict(X_test)))
    print("Test Recall: ", recall_score(y_test, dt_classifier.predict(X_test)))
    print("Test F1: ", f1_score(y_test, dt_classifier.predict(X_test)))
    print("Test Confusion Matrix:")
    print(confusion_matrix(y_test, dt_classifier.predict(X_test)))

In [41]:
evaluate_model(dt_best)

Train Accuracy : 1.0
Train Confusion Matrix:
[[ 44   0]
 [  0 106]]
--------------------------------------------------
Test Accuracy : 0.9846153846153847
Test Precision : 1.0
Test Recall:  0.9761904761904762
Test Confusion Matrix:
[[23  0]
 [ 1 41]]


## Let's test with scoring as recall and see the results

In [42]:
grid_search = GridSearchCV(estimator=model_dt, 
                           param_grid=params, 
                           cv=4, n_jobs=-1, verbose=3, scoring = "recall")

In [43]:
%%time
grid_search.fit(X_train, y_train)

Fitting 4 folds for each of 50 candidates, totalling 200 fits
CPU times: total: 359 ms
Wall time: 4.37 s


GridSearchCV(cv=4, estimator=DecisionTreeClassifier(), n_jobs=-1,
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': [2, 3, 5, 10, 20],
                         'min_samples_leaf': [5, 10, 20, 50, 100]},
             scoring='recall', verbose=3)

In [44]:
score_df = pd.DataFrame(grid_search.cv_results_)
score_df.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_depth,param_min_samples_leaf,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,mean_test_score,std_test_score,rank_test_score
0,0.002027,0.0007066605,0.002447,0.0003822752,gini,2,5,"{'criterion': 'gini', 'max_depth': 2, 'min_sam...",0.962963,1.0,0.961538,0.923077,0.961895,0.027203,26
1,0.001247,0.0004318461,0.001995,1.521962e-06,gini,2,10,"{'criterion': 'gini', 'max_depth': 2, 'min_sam...",0.962963,1.0,0.961538,0.807692,0.933048,0.073999,31
2,0.001975,0.0007066806,0.002284,0.0004094994,gini,2,20,"{'criterion': 'gini', 'max_depth': 2, 'min_sam...",0.962963,1.0,0.961538,0.807692,0.933048,0.073999,31
3,0.001267,0.000466534,0.001995,5.960464e-07,gini,2,50,"{'criterion': 'gini', 'max_depth': 2, 'min_sam...",0.444444,0.814815,0.615385,0.807692,0.670584,0.153124,41
4,0.000997,5.462856e-07,0.001746,0.0004317769,gini,2,100,"{'criterion': 'gini', 'max_depth': 2, 'min_sam...",1.0,1.0,1.0,1.0,1.0,0.0,1


In [45]:
score_df.nlargest(5,"mean_test_score")

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_depth,param_min_samples_leaf,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,mean_test_score,std_test_score,rank_test_score
4,0.000997,5.462856e-07,0.001746,0.000432,gini,2,100,"{'criterion': 'gini', 'max_depth': 2, 'min_sam...",1.0,1.0,1.0,1.0,1.0,0.0,1
9,0.001064,0.0001157992,0.001429,0.000442,gini,3,100,"{'criterion': 'gini', 'max_depth': 3, 'min_sam...",1.0,1.0,1.0,1.0,1.0,0.0,1
14,0.00152,0.0004418268,0.001246,0.00034,gini,5,100,"{'criterion': 'gini', 'max_depth': 5, 'min_sam...",1.0,1.0,1.0,1.0,1.0,0.0,1
19,0.000642,0.0004082289,0.001474,0.000477,gini,10,100,"{'criterion': 'gini', 'max_depth': 10, 'min_sa...",1.0,1.0,1.0,1.0,1.0,0.0,1
24,0.000876,0.000893137,0.001356,0.000451,gini,20,100,"{'criterion': 'gini', 'max_depth': 20, 'min_sa...",1.0,1.0,1.0,1.0,1.0,0.0,1


In [46]:
grid_search.best_estimator_

DecisionTreeClassifier(max_depth=2, min_samples_leaf=100)

In [47]:
dt_best = grid_search.best_estimator_

In [48]:
evaluate_model(dt_best)

Train Accuracy : 0.7066666666666667
Train Confusion Matrix:
[[  0  44]
 [  0 106]]
--------------------------------------------------
Test Accuracy : 0.6461538461538462
Test Precision : 0.6461538461538462
Test Recall:  1.0
Test Confusion Matrix:
[[ 0 23]
 [ 0 42]]


## Let's try with f1 score as accuracy score

In [49]:
grid_search = GridSearchCV(estimator=model_dt, 
                           param_grid=params, 
                           cv=4, n_jobs=-1, verbose=3, scoring = "f1")

In [50]:
%%time
grid_search.fit(X_train, y_train)

Fitting 4 folds for each of 50 candidates, totalling 200 fits
CPU times: total: 203 ms
Wall time: 253 ms


GridSearchCV(cv=4, estimator=DecisionTreeClassifier(), n_jobs=-1,
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': [2, 3, 5, 10, 20],
                         'min_samples_leaf': [5, 10, 20, 50, 100]},
             scoring='f1', verbose=3)

In [51]:
score_df = pd.DataFrame(grid_search.cv_results_)
score_df.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_depth,param_min_samples_leaf,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,mean_test_score,std_test_score,rank_test_score
0,0.001693,0.0003803582,0.002097,0.000177,gini,2,5,"{'criterion': 'gini', 'max_depth': 2, 'min_sam...",0.981132,1.0,0.980392,0.888889,0.962603,0.043278,24
1,0.001664,0.0003503121,0.002011,0.000705,gini,2,10,"{'criterion': 'gini', 'max_depth': 2, 'min_sam...",0.981132,1.0,0.980392,0.823529,0.946263,0.071295,25
2,0.001861,0.0002736239,0.002024,0.000209,gini,2,20,"{'criterion': 'gini', 'max_depth': 2, 'min_sam...",0.981132,1.0,0.980392,0.823529,0.946263,0.071295,25
3,0.001994,6.391886e-07,0.001745,0.000433,gini,2,50,"{'criterion': 'gini', 'max_depth': 2, 'min_sam...",0.615385,0.88,0.761905,0.823529,0.770205,0.098662,46
4,0.00112,0.0002140517,0.001581,0.000482,gini,2,100,"{'criterion': 'gini', 'max_depth': 2, 'min_sam...",0.830769,0.830769,0.825397,0.825397,0.828083,0.002686,31


In [52]:
score_df.nlargest(5,"mean_test_score")

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_depth,param_min_samples_leaf,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,mean_test_score,std_test_score,rank_test_score
25,0.001358,0.000626,0.002139,0.000235,entropy,2,5,"{'criterion': 'entropy', 'max_depth': 2, 'min_...",0.981132,1.0,0.980392,1.0,0.990381,0.009622,1
26,0.001869,0.000739,0.002003,1.6e-05,entropy,2,10,"{'criterion': 'entropy', 'max_depth': 2, 'min_...",0.981132,1.0,0.980392,1.0,0.990381,0.009622,1
27,0.001132,0.000233,0.002366,0.000425,entropy,2,20,"{'criterion': 'entropy', 'max_depth': 2, 'min_...",0.981132,1.0,0.980392,1.0,0.990381,0.009622,1
30,0.001497,0.000498,0.002369,0.000818,entropy,3,5,"{'criterion': 'entropy', 'max_depth': 3, 'min_...",0.981132,1.0,0.980392,1.0,0.990381,0.009622,1
31,0.002424,0.001025,0.002313,0.000407,entropy,3,10,"{'criterion': 'entropy', 'max_depth': 3, 'min_...",0.981132,1.0,0.980392,1.0,0.990381,0.009622,1


In [53]:
grid_search.best_estimator_

DecisionTreeClassifier(criterion='entropy', max_depth=2, min_samples_leaf=5)

In [54]:
dt_best = grid_search.best_estimator_

In [57]:
evaluate_model(dt_best)

Train Accuracy : 1.0
Train Confusion Matrix:
[[ 44   0]
 [  0 106]]
--------------------------------------------------
Test Accuracy : 0.9846153846153847
Test Precision : 1.0
Test Recall:  0.9761904761904762
Test F1:  0.9879518072289156
Test Confusion Matrix:
[[23  0]
 [ 1 41]]
