In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [None]:
from google.colab import files          #Upload hpt-small.csv from the main folder of the course 
uploaded = files.upload()

Saving hpt-small.csv to hpt-small.csv


In [None]:
df = pd.read_csv('hpt-small.csv')
df.head()

Unnamed: 0,age,wc,education,marital status,race,gender,hours per week,IncomeClass
0,38,Private,HS-grad,Divorced,White,Male,40,<=50K
1,28,Private,Bachelors,Married,Black,Female,40,<=50K
2,37,Private,Masters,Married,White,Female,40,<=50K
3,31,Private,Masters,Never-married,White,Female,50,>50K
4,42,Private,Bachelors,Married,White,Male,40,>50K


In [None]:
#Turn categorical features into dummy variables
df = pd.get_dummies(df, drop_first=True)
df.head()

Unnamed: 0,age,hours per week,wc_ Local-gov,wc_ Private,education_ Doctorate,education_ HS-grad,education_ Masters,education_ Preschool,education_ Prof-school,education_ Some-college,marital status_ Never-married,marital status_ Widowed,marital status_Married,race_ Asian-Pac-Islander,race_ Black,race_ Other,race_ White,gender_ Male,IncomeClass_ >50K
0,38,40,0,1,0,1,0,0,0,0,0,0,0,0,0,0,1,1,0
1,28,40,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0
2,37,40,0,1,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0
3,31,50,0,1,0,0,1,0,0,0,1,0,0,0,0,0,1,0,1
4,42,40,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,1,1


In [None]:
df.shape

(500, 19)

In [None]:
#Specify X (independant variables) and Y(predicted variable or target variable)
X = df.iloc[:, 0:-1]
Y = df.iloc[:, -1]

#Models (Default)
**These models are using their "default", i.e. "Un-Tuned" hyper-parameters**

In [None]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(random_state=1234)

In [None]:
from sklearn.tree import DecisionTreeClassifier
dtc = DecisionTreeClassifier(random_state=1234)

In [None]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(random_state=1234)

In [None]:
from sklearn.svm import SVC
svc = SVC(gamma=0.5)

#Cross Validation

**Let's run cross validation on all these 4 models with their "default" hyper-parameters**

In [None]:
from sklearn.model_selection import cross_validate

In [None]:
#10-Fold Cross Validation for all 4 models
CV_results_lr = cross_validate(lr, X, Y, cv=10, return_train_score=True)
CV_results_dtc = cross_validate(dtc, X, Y, cv=10, return_train_score=True)
CV_results_rfc = cross_validate(rfc, X, Y, cv=10, return_train_score=True)
CV_results_svc = cross_validate(svc, X, Y, cv=10, return_train_score=True)

In [None]:
CV_results_dtc

{'fit_time': array([0.00342202, 0.0029304 , 0.00310588, 0.00337291, 0.00292993,
        0.00290108, 0.00294948, 0.0031395 , 0.00291133, 0.00284243]),
 'score_time': array([0.00105953, 0.00102401, 0.00104904, 0.00105238, 0.0009892 ,
        0.001019  , 0.00105286, 0.00105977, 0.00100374, 0.00100851]),
 'test_score': array([0.72, 0.82, 0.7 , 0.7 , 0.8 , 0.74, 0.8 , 0.72, 0.74, 0.76]),
 'train_score': array([0.98444444, 0.98      , 0.98      , 0.98666667, 0.98444444,
        0.98666667, 0.98666667, 0.98444444, 0.98444444, 0.98222222])}

In [None]:
#Find Average of results for test sets

lr_test_average  = np.average(CV_results_lr['test_score'])
dtc_test_average = np.average(CV_results_dtc['test_score'])
rfc_test_average = np.average(CV_results_rfc['test_score'])
svc_test_average = np.average(CV_results_svc['test_score'])

In [None]:
#Find Average of results for train sets

lr_train_average  = np.average(CV_results_lr['train_score'])
dtc_train_average = np.average(CV_results_dtc['train_score'])
rfc_train_average = np.average(CV_results_rfc['train_score'])
svc_train_average = np.average(CV_results_svc['train_score'])

In [None]:
print('Logostic Regression')
print(lr_test_average)
print(lr_train_average)
print('\n')

print('Decision Tree Classifier')
print(dtc_test_average)
print(dtc_train_average)
print('\n')

print('Random Forest Classifier')
print(rfc_test_average)
print(rfc_train_average)
print('\n')

print('SVC')
print(svc_test_average)
print(svc_train_average)
print('\n')

Logostic Regression
0.8140000000000001
0.8266666666666668


Decision Tree Classifier
0.75
0.984


Random Forest Classifier
0.776
0.9835555555555556


SVC
0.734
0.9548888888888889




Comment:

We can see that using default parameters for these 4 models, there is some extent of overfitting for all models except for lr. Also, we can see that all model's score except for lr is about 75%

#Tuning Models' Parameters and Grid Search


In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_validate


##Grid Search for rfc model

In [None]:
rfc_parameters = {'n_estimators':[10,15,20], 
                  'min_samples_split':[8,16],
                  'min_samples_leaf':[2,3]}      #So this grid will have 3*2*2=12 combinations

rfc_grid = GridSearchCV(estimator=rfc,
                        param_grid= rfc_parameters,
                        scoring='accuracy',
                        cv=3,                     #So, 12*3=36 individual runs will be performed
                        return_train_score= True)                  

rfc_grid_fit = rfc_grid.fit(X,Y)

CV_results_rfc = pd.DataFrame.from_dict( rfc_grid_fit.cv_results_)
CV_results_rfc  
                #12 rows from 0 to 11, each row for one combination of parameters
                #disregards the columns showing "time"
                #Important Columns:
                #split0_test_score    split1_test_score    split2_test_score   mean_test_score    rank_test_score   rank_test_score
                #split0_train_score   split1_train_score   split2_train_score  mean_train_score   rank_train_score
    

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_min_samples_leaf,param_min_samples_split,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,mean_train_score,std_train_score
0,0.018937,0.001365,0.002873,0.000243,2,8,10,"{'min_samples_leaf': 2, 'min_samples_split': 8...",0.754491,0.814371,0.795181,0.788014,0.024966,11,0.891892,0.864865,0.883234,0.879997,0.011269
1,0.024408,0.000915,0.00296,0.000112,2,8,15,"{'min_samples_leaf': 2, 'min_samples_split': 8...",0.760479,0.820359,0.813253,0.79803,0.026711,7,0.891892,0.870871,0.892216,0.884993,0.009987
2,0.030449,0.000564,0.00332,7.7e-05,2,8,20,"{'min_samples_leaf': 2, 'min_samples_split': 8...",0.760479,0.850299,0.801205,0.803994,0.036722,4,0.885886,0.87988,0.901198,0.888988,0.008975
3,0.016299,0.000223,0.002557,0.000155,2,16,10,"{'min_samples_leaf': 2, 'min_samples_split': 1...",0.790419,0.808383,0.807229,0.80201,0.00821,5,0.873874,0.846847,0.835329,0.852017,0.016155
4,0.023685,0.000237,0.003098,0.000309,2,16,15,"{'min_samples_leaf': 2, 'min_samples_split': 1...",0.760479,0.814371,0.801205,0.792018,0.02294,9,0.876877,0.837838,0.847305,0.854007,0.016627
5,0.030081,0.000436,0.003566,8.8e-05,2,16,20,"{'min_samples_leaf': 2, 'min_samples_split': 1...",0.754491,0.808383,0.789157,0.78401,0.0223,12,0.858859,0.840841,0.856287,0.851996,0.007957
6,0.016353,0.00015,0.002575,0.000145,3,8,10,"{'min_samples_leaf': 3, 'min_samples_split': 8...",0.778443,0.826347,0.825301,0.810031,0.02234,2,0.876877,0.864865,0.856287,0.86601,0.008445
7,0.022891,0.000286,0.002874,0.000133,3,8,15,"{'min_samples_leaf': 3, 'min_samples_split': 8...",0.778443,0.814371,0.831325,0.808047,0.022047,3,0.87988,0.861862,0.853293,0.865012,0.01108
8,0.029833,6.5e-05,0.003223,3.7e-05,3,8,20,"{'min_samples_leaf': 3, 'min_samples_split': 8...",0.802395,0.826347,0.825301,0.818015,0.011053,1,0.894895,0.873874,0.859281,0.876017,0.014618
9,0.016319,0.000292,0.002372,0.000138,3,16,10,"{'min_samples_leaf': 3, 'min_samples_split': 1...",0.760479,0.802395,0.807229,0.790034,0.020992,10,0.84985,0.840841,0.823353,0.838015,0.011


In [None]:
                
CV_results_rfc[CV_results_rfc['rank_test_score']==1]      #find the best(i.e. rank==1) combination of parameters  

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_min_samples_leaf,param_min_samples_split,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,mean_train_score,std_train_score
8,0.029833,6.5e-05,0.003223,3.7e-05,3,8,20,"{'min_samples_leaf': 3, 'min_samples_split': 8...",0.802395,0.826347,0.825301,0.818015,0.011053,1,0.894895,0.873874,0.859281,0.876017,0.014618


In [None]:
#To directly get the best parameters:
rfc_grid_fit.best_params_  

{'min_samples_leaf': 3, 'min_samples_split': 8, 'n_estimators': 20}

##Tuning and Grid Search for lr model

In [None]:
lr_parameters = {'C':[0.01, 0.1, 0.5, 1, 2, 5, 10],         #C is 1/lamda in Lasso. So, smaller C: bigger lambda: less overfitting
                  'penalty':['l2'],                         #Lasso Regularization
                  'solver':['liblinear', 'lbfgs', 'saga']}  #So this grid will have 7*1*3=21 combinations

lr_grid = GridSearchCV(estimator=lr,
                        param_grid= lr_parameters,
                        scoring='accuracy',
                        cv=3,                     #So, 21*3=63 individual runs will be performed
                        return_train_score= True)                  

lr_grid_fit = lr_grid.fit(X,Y)

CV_results_lr = pd.DataFrame.from_dict( lr_grid_fit.cv_results_)
#CV_results_lr  
                #21 rows from 0 to 20, each row for one combination of parameters
                #disregards the columns showing "time"
                #Important Columns:
                #split0_test_score    split1_test_score    split2_test_score   mean_test_score    rank_test_score   rank_test_score
                #split0_train_score   split1_train_score   split2_train_score  mean_train_score   rank_train_score

In [None]:
CV_results_lr[CV_results_lr['rank_test_score']==1]      #find the best(i.e. rank==1) combination of parameters 

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_penalty,param_solver,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,mean_train_score,std_train_score
13,0.027045,0.001763,0.001874,4.2e-05,2,l2,lbfgs,"{'C': 2, 'penalty': 'l2', 'solver': 'lbfgs'}",0.808383,0.814371,0.861446,0.828067,0.023729,1,0.840841,0.825826,0.817365,0.828011,0.009708


In [None]:
#To directly get the best parameters:
lr_grid_fit.best_params_  

{'C': 2, 'penalty': 'l2', 'solver': 'lbfgs'}

##Tuning and Grid Search for SVC model

In [None]:
svc_parameters = {'C':[0.01, 0.1, 0.5, 1, 2, 5, 10],    #C is 1/lamda in Lasso. So, smaller C: bigger lambda: less overfitting
                  'kernel':['rbf', 'linear'],
                  'gamma':[0.1, 0.25, 0.5, 1, 5]}       #So this grid will have 7*2*5=70 combinations

svc_grid = GridSearchCV(estimator=svc,
                        param_grid= svc_parameters,
                        scoring='accuracy',
                        cv=3,                     #So, 70*3=210 individual runs will be performed
                        return_train_score= True)                  

svc_grid_fit = svc_grid.fit(X,Y)

CV_results_svc = pd.DataFrame.from_dict( svc_grid_fit.cv_results_)
#CV_results_svc  
                #210 rows from 0 to 209, each row for one combination of parameters
                #disregards the columns showing "time"
                #Important Columns:
                #split0_test_score    split1_test_score    split2_test_score   mean_test_score    rank_test_score   rank_test_score
                #split0_train_score   split1_train_score   split2_train_score  mean_train_score   rank_train_score

#CV_results_svc[CV_results_svc['rank_test_score']==1]      #find the best(i.e. rank==1) combination of parameters

In [None]:
#To directly get the best parameters:
svc_grid_fit.best_params_  

{'C': 10, 'gamma': 0.1, 'kernel': 'linear'}

##Comparison of Models

In [None]:
lr_mean_test_score = CV_results_lr[ CV_results_lr['rank_test_score']==1 ]['mean_test_score'] 
lr_mean_train_score = CV_results_lr[ CV_results_lr['rank_test_score']==1 ]['mean_train_score'] 

rfc_mean_test_score = CV_results_rfc[CV_results_rfc['rank_test_score']==1]['mean_test_score']
rfc_mean_train_score = CV_results_rfc[CV_results_rfc['rank_test_score']==1]['mean_train_score']

svc_mean_test_score = CV_results_svc[CV_results_svc['rank_test_score']==1]['mean_test_score'] 
svc_mean_train_score = CV_results_svc[CV_results_svc['rank_test_score']==1]['mean_train_score'] 

print('lr_mean_test_score:',lr_mean_test_score)
print('lr_mean_train_score:',lr_mean_train_score,'\n')

print('rfc_mean_test_score:',rfc_mean_test_score)
print('rfc_mean_train_score:',rfc_mean_train_score,'\n')

print('svc_mean_test_score:',svc_mean_test_score)
print('svc_mean_train_score:',svc_mean_train_score)

lr_mean_test_score: 13    0.828067
Name: mean_test_score, dtype: float64
lr_mean_train_score: 13    0.828011
Name: mean_train_score, dtype: float64 

rfc_mean_test_score: 8    0.818015
Name: mean_test_score, dtype: float64
rfc_mean_train_score: 8    0.876017
Name: mean_train_score, dtype: float64 

svc_mean_test_score: 61    0.828019
63    0.828019
65    0.828019
67    0.828019
69    0.828019
Name: mean_test_score, dtype: float64
svc_mean_train_score: 61    0.834005
63    0.834005
65    0.834005
67    0.834005
69    0.834005
Name: mean_train_score, dtype: float64


Comments:

Now, we can see that tuning models parameters and performing a grid search has significantly improve the overfitting issue. Also, the average test score is now almost 82% whearas 75% in teh case of default models' parameters

#Tuning Models' Parameters and Random Search
Random search (as opposed to grid search) searches random combinations only and thus, saves a lot of time. The results will still be close to the Grid Search results

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_validate

##Tuning and Grid Search for rfc model

In [None]:
rfc_parameters = {'n_estimators':[10,15,20], 
                  'min_samples_split':[8,16,20],
                  'min_samples_leaf':[1,2,3,4,5,6]}      # So this grid will have 3*3*6=54 combinations

rfc_RandomSearch = RandomizedSearchCV(estimator=rfc,
                        param_distributions= rfc_parameters,
                        scoring='accuracy',
                        n_iter=10,                # How many different combinations do you want out of 54?
                        cv=3,                     # So, 10*3=30 individual runs will be performed
                        return_train_score= True,
                        random_state=1234)                  

rfc_RandomSearch_fit = rfc_RandomSearch.fit(X,Y)

CV_results_rfc_RandomSearch = pd.DataFrame.from_dict( rfc_RandomSearch_fit.cv_results_)
#CV_results_rfc_RandomSearch  
                #10 rows from 0 to 9, each row for one combination of parameters
                #disregards the columns showing "time"
                #Important Columns:
                #split0_test_score    split1_test_score    split2_test_score   mean_test_score    rank_test_score   rank_test_score
                #split0_train_score   split1_train_score   split2_train_score  mean_train_score   rank_train_score
                
CV_results_rfc_RandomSearch[CV_results_rfc_RandomSearch['rank_test_score']==1]      #find the best(i.e. rank==1) combination of parameters      

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_estimators,param_min_samples_split,param_min_samples_leaf,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,mean_train_score,std_train_score
4,0.030864,0.001126,0.003327,0.000165,20,8,4,"{'n_estimators': 20, 'min_samples_split': 8, '...",0.790419,0.844311,0.825301,0.820011,0.022317,1,0.861862,0.855856,0.847305,0.855008,0.005973


In [None]:
#To directly get the best parameters:
rfc_RandomSearch_fit.best_params_  

{'min_samples_leaf': 4, 'min_samples_split': 8, 'n_estimators': 20}

In [None]:
CV_results_rfc_RandomSearch  #Note that it only has 10 random rows (combination)out of 54 possible combinations

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_estimators,param_min_samples_split,param_min_samples_leaf,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,mean_train_score,std_train_score
0,0.017346,0.000898,0.00275,0.000133,10,8,4,"{'n_estimators': 10, 'min_samples_split': 8, '...",0.790419,0.856287,0.807229,0.817979,0.027944,3,0.864865,0.834835,0.853293,0.850998,0.012367
1,0.023477,0.000552,0.002824,6.3e-05,15,16,5,"{'n_estimators': 15, 'min_samples_split': 16, ...",0.784431,0.838323,0.813253,0.812003,0.022019,4,0.861862,0.825826,0.820359,0.836016,0.018412
2,0.031411,0.000764,0.00352,2.8e-05,20,8,3,"{'n_estimators': 20, 'min_samples_split': 8, '...",0.802395,0.826347,0.825301,0.818015,0.011053,2,0.894895,0.873874,0.859281,0.876017,0.014618
3,0.016905,0.00033,0.002306,6.3e-05,10,20,1,"{'n_estimators': 10, 'min_samples_split': 20, ...",0.790419,0.802395,0.777108,0.789974,0.010328,10,0.84985,0.846847,0.859281,0.851993,0.005298
4,0.030864,0.001126,0.003327,0.000165,20,8,4,"{'n_estimators': 20, 'min_samples_split': 8, '...",0.790419,0.844311,0.825301,0.820011,0.022317,1,0.861862,0.855856,0.847305,0.855008,0.005973
5,0.030557,0.000196,0.003283,0.000105,20,20,1,"{'n_estimators': 20, 'min_samples_split': 20, ...",0.772455,0.814371,0.789157,0.791994,0.017229,9,0.861862,0.855856,0.859281,0.859,0.00246
6,0.02405,0.001658,0.002701,8e-05,15,16,4,"{'n_estimators': 15, 'min_samples_split': 16, ...",0.778443,0.820359,0.813253,0.804018,0.018316,5,0.870871,0.828829,0.823353,0.841018,0.021227
7,0.016987,0.001289,0.002251,6.1e-05,10,16,6,"{'n_estimators': 10, 'min_samples_split': 16, ...",0.766467,0.820359,0.813253,0.800026,0.023907,7,0.855856,0.81982,0.811377,0.829018,0.019288
8,0.023514,0.000109,0.002961,0.000237,15,16,2,"{'n_estimators': 15, 'min_samples_split': 16, ...",0.760479,0.814371,0.801205,0.792018,0.02294,8,0.876877,0.837838,0.847305,0.854007,0.016627
9,0.016364,0.000144,0.002298,0.000136,10,16,5,"{'n_estimators': 10, 'min_samples_split': 16, ...",0.778443,0.838323,0.795181,0.803982,0.025226,6,0.864865,0.822823,0.802395,0.830028,0.026007
