# Import necessary libraries 

In [1]:
# system

import os

# data analysis and plotting

import pandas as pd
import numpy as np
from scipy.stats import zscore
from scipy.stats import shapiro

from random import randint

import matplotlib.pyplot as plt 
import seaborn as sns
from xgboost import plot_importance

from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA, KernelPCA

# data processing and model validation

from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, explained_variance_score, confusion_matrix, accuracy_score, classification_report, log_loss
from math import sqrt
from sklearn.model_selection import cross_val_score
from sklearn.utils import shuffle
from sklearn.model_selection import RepeatedStratifiedKFold

# classification libraries

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.gaussian_process.kernels import RBF, DotProduct, WhiteKernel, Matern, RationalQuadratic
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn import svm
from xgboost import XGBClassifier
import lightgbm as lgb

# Importing imputation libs. 

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.impute import KNNImputer

# Hyperparameter optimization

from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from skopt import BayesSearchCV

# exporting the models
import pickle

# parameter settings

%matplotlib inline

# To change scientific numbers to float
np.set_printoptions(formatter={'float_kind':'{:f}'.format})

# Increase the size of sns plots
sns.set(rc={'figure.figsize':(12,10)})

# import sys
# !conda list Check the packages installed

# Displaying all the rows/columns in a data set (the default option is not to show them)

pd.set_option('display.max_columns', None)
#pd.set_option('display.max_rows', None)

# Import and trim data

In [2]:
# Importing the raw data

raw_data_howell = pd.read_csv("datasets/Howell.csv", header = 0, encoding= 'unicode_escape')

In [3]:
raw_data_howell_test = pd.read_csv("datasets/HowellTest.csv", header = 0, encoding= 'unicode_escape')

In [4]:
raw_data_howell.head()

Unnamed: 0,ID,Sex,PopNum,Population,GOL,NOL,BNL,BBH,XCB,XFB,ZYB,AUB,WCB,ASB,BPL,NPH,NLH,JUB,NLB,MAB,MDH,MDB,OBH,OBB,DKB,NDS,WNB,SIS,ZMB,SSS,FMB,NAS,EKB,DKS,IML,XML,MLS,WMH,SOS,GLS,STB,FRC,FRS,FRF,PAC,PAS,PAF,OCC,OCS,OCF,FOL,NAR,SSR,PRR,DKR,ZOR,FMR,EKR,ZMR,AVR,BRR,VRR,LAR,OSR,BAR,NAA,PRA,BAA,NBA,BBA,BRA,SSA,NFA,DKA,NDA,SIA,FRA,PAA,OCA,RFA,RPA,ROA,BSA,SBA,SLA,TBA
0,1,M,1,NORSE,189,185,100,135,143,120,133,119,70,112,96,66,50,118,26,63,31,13,31,42,22,12,9.5,4.9,83,20,100,19,100,8,42,57,13,24,7,4,115,118,25,53,119,26,62,98,30,51,34,96,95,100,84,81,74,73,76,83,0,122,0,0,0,67,74,39,76,58,46,129,138,158,85,88,134,133,117,0,0,0,0,0,0,0
1,2,M,1,NORSE,182,178,102,139,145,120,137,125,66,113,108,64,48,118,25,72,19,13,28,39,21,9,10.8,4.5,101,27,95,17,96,9,32,53,10,23,6,4,117,116,28,55,113,24,59,93,27,39,34,93,102,108,84,84,76,73,74,82,0,124,0,0,0,77,67,35,79,55,46,124,141,153,99,100,128,134,119,0,0,0,0,0,0,0
2,3,M,1,NORSE,191,187,102,123,140,114,134,125,74,112,102,67,53,112,23,65,28,14,33,41,20,13,8.1,4.5,90,24,98,19,97,10,35,56,10,24,6,4,112,107,25,47,118,23,59,88,30,45,41,96,96,102,82,82,77,72,70,82,0,116,0,0,0,71,71,38,72,56,52,124,138,152,75,84,129,137,111,0,0,0,0,0,0,0
3,4,M,1,NORSE,191,188,100,127,141,123,135,127,71,113,95,76,53,114,26,62,25,12,35,40,23,10,8.8,4.4,94,23,98,16,99,8,34,52,11,22,8,3,116,109,26,47,116,24,57,94,34,50,38,92,93,98,81,79,79,73,72,77,0,118,0,0,0,64,71,46,75,56,49,128,144,157,98,90,128,135,108,0,0,0,0,0,0,0
4,5,M,1,NORSE,178,177,97,128,138,117,129,121,69,111,90,67,51,115,24,64,26,14,32,39,21,11,8.9,5.4,91,21,96,18,97,10,35,52,12,27,5,2,116,102,22,45,113,26,62,94,32,40,34,91,92,94,79,79,72,69,71,76,0,118,0,0,0,64,75,42,80,52,48,130,139,150,87,79,133,130,111,0,0,0,0,0,0,0


In [5]:
measured_data_howell = raw_data_howell.loc[:,"GOL":"TBA"]

model_cols_howell = [
    'GOL', 
    'NOL', 
    'BNL', 
    'BBH', 
    'XCB', 
    'XFB', 
    'ZYB', 
    'AUB', 
    'WCB', 
    'ASB',
    'BPL', 
    'NPH', 
    'NLH', 
    'JUB', 
    'NLB', 
    'MAB', 
    'MDH', 
    'MDB', 
    'OBH', 
    'OBB',
    'DKB', 
    'ZMB', 
    'FMB', 
    'EKB', 
    'IML', 
    'XML', 
    'WMH', 
    'STB', 
    'FRC', 
    'PAC', 
    'OCC', 
    'FOL'
]
             
model_data_howell = measured_data_howell.drop(columns=[col for col in measured_data_howell if col not in model_cols_howell])

model_data_howell.shape

(2524, 32)

In [6]:
measured_data_howell_test = raw_data_howell_test.loc[:,"GOL":"TBA"]

model_data_howell_test = measured_data_howell_test.drop(columns=[col for col in measured_data_howell_test if col not in model_cols_howell])

model_data_howell_test.shape

(524, 32)

In [7]:
# Add the Sex column

model_data_howell_test = pd.concat([model_data_howell_test.loc[:,:],raw_data_howell_test.loc[:,"Sex"]],axis=1)

In [8]:
model_data_howell_test

Unnamed: 0,GOL,NOL,BNL,BBH,XCB,XFB,ZYB,AUB,WCB,ASB,BPL,NPH,NLH,JUB,NLB,MAB,MDH,MDB,OBH,OBB,DKB,ZMB,FMB,EKB,IML,XML,WMH,STB,FRC,PAC,OCC,FOL,Sex
0,190,185,100,141,141,119,136,125,82,114,96,71,50,117,26,63,30,18,31,41,20,98,102,101,38,58,27,117,115,126,102,35,M
1,176,174,102,134,126,103,124,113,64,108,100,63,45,111,24,63,26,10,32,40,22,95,100,99,36,51,23,100,102,105,101,34,M
2,179,178,95,123,141,112,129,110,72,108,94,62,50,113,29,65,30,11,35,42,25,99,100,104,32,46,19,108,106,112,96,35,M
3,199,198,101,130,141,125,128,120,80,115,102,63,49,116,28,64,26,11,35,42,26,92,105,105,45,59,19,124,114,130,106,38,F
4,183,182,96,124,137,121,123,115,78,107,91,58,48,110,29,60,25,12,31,39,26,91,100,99,36,49,18,114,115,112,92,41,M
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
519,178,175,98,135,136,117,134,125,74,107,92,67,51,111,23,65,25,10,32,38,22,88,93,92,36,52,20,112,108,103,103,39,M
520,176,170,100,138,143,123,138,128,72,109,97,73,50,120,26,65,30,13,35,40,23,94,100,99,35,54,25,123,117,116,96,32,M
521,181,175,101,132,137,114,139,127,77,105,97,66,46,124,26,62,34,15,33,41,24,99,103,104,36,59,25,108,111,108,95,38,M
522,180,177,109,146,145,122,146,134,76,103,99,76,54,125,25,67,31,12,36,43,26,99,106,104,38,56,28,115,114,104,104,39,M


In [9]:
# Add the Sex column

model_data_howell = pd.concat([model_data_howell.loc[:,:],raw_data_howell.loc[:,"Sex"]],axis=1)

model_data_howell

Unnamed: 0,GOL,NOL,BNL,BBH,XCB,XFB,ZYB,AUB,WCB,ASB,BPL,NPH,NLH,JUB,NLB,MAB,MDH,MDB,OBH,OBB,DKB,ZMB,FMB,EKB,IML,XML,WMH,STB,FRC,PAC,OCC,FOL,Sex
0,189,185,100,135,143,120,133,119,70,112,96,66,50,118,26,63,31,13,31,42,22,83,100,100,42,57,24,115,118,119,98,34,M
1,182,178,102,139,145,120,137,125,66,113,108,64,48,118,25,72,19,13,28,39,21,101,95,96,32,53,23,117,116,113,93,34,M
2,191,187,102,123,140,114,134,125,74,112,102,67,53,112,23,65,28,14,33,41,20,90,98,97,35,56,24,112,107,118,88,41,M
3,191,188,100,127,141,123,135,127,71,113,95,76,53,114,26,62,25,12,35,40,23,94,98,99,34,52,22,116,109,116,94,38,M
4,178,177,97,128,138,117,129,121,69,111,90,67,51,115,24,64,26,14,32,39,21,91,96,97,35,52,27,116,102,113,94,34,M
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2519,159,158,89,125,128,101,117,106,64,91,92,58,45,106,24,55,19,7,32,38,20,87,89,91,36,45,16,98,104,101,89,30,F
2520,156,156,87,123,124,101,113,102,62,92,90,54,43,105,24,57,21,12,31,36,20,89,84,89,34,48,16,95,104,101,88,32,F
2521,160,160,89,121,129,106,117,112,69,99,95,55,42,105,24,60,24,16,32,35,21,88,88,90,35,48,20,102,105,101,90,33,F
2522,172,170,92,118,137,110,114,106,65,95,93,60,46,105,26,57,22,8,33,37,19,86,88,91,31,46,17,104,103,104,89,33,F


# Building the dataset using the chosen features

In [10]:
# Full data

# Convert M and F to 0 and 1

model_data_howell['Sex']= model_data_howell['Sex'].map({'M': 0,'F': 1})

model_data_howell['Sex'] = model_data_howell['Sex'].astype(int)

model_data_howell.describe()


Unnamed: 0,GOL,NOL,BNL,BBH,XCB,XFB,ZYB,AUB,WCB,ASB,BPL,NPH,NLH,JUB,NLB,MAB,MDH,MDB,OBH,OBB,DKB,ZMB,FMB,EKB,IML,XML,WMH,STB,FRC,PAC,OCC,FOL,Sex
count,2524.0,2524.0,2524.0,2524.0,2524.0,2524.0,2524.0,2524.0,2524.0,2524.0,2524.0,2524.0,2524.0,2524.0,2524.0,2524.0,2524.0,2524.0,2524.0,2524.0,2524.0,2524.0,2524.0,2524.0,2524.0,2524.0,2524.0,2524.0,2524.0,2524.0,2524.0,2524.0,2524.0
mean,179.172345,176.886688,99.120048,131.644216,136.847861,113.412837,130.766244,120.591125,70.987322,106.856577,97.782488,65.975832,50.009509,115.171949,26.298732,63.545563,27.353803,12.310618,33.667987,39.486529,21.381537,94.97504,96.973059,97.338748,36.108162,52.676704,22.740491,109.347068,109.538827,110.578051,95.690571,35.784865,0.458003
std,8.535998,7.91752,5.808567,7.239258,7.288233,6.372311,7.78873,7.39026,4.788081,5.69408,6.378397,5.534869,4.000187,6.218169,2.325713,4.041097,3.870529,2.135564,2.228324,2.023695,2.410965,5.738572,4.46629,4.23906,4.031754,4.386508,3.043158,7.867309,5.469376,6.629592,5.95056,2.642542,0.498332
min,151.0,151.0,83.0,107.0,116.0,95.0,105.0,98.0,57.0,88.0,80.0,48.0,36.0,97.0,19.0,52.0,16.0,6.0,26.0,33.0,13.0,79.0,81.0,83.0,20.0,38.0,14.0,81.0,93.0,89.0,79.0,27.0,0.0
25%,173.0,172.0,95.0,127.0,132.0,109.0,125.0,115.0,68.0,103.0,93.0,62.75,47.0,111.0,25.0,61.0,25.0,11.0,32.0,38.0,20.0,91.0,94.0,95.0,33.0,50.0,21.0,104.0,106.0,106.0,92.0,34.0,0.0
50%,179.0,177.0,99.0,131.0,137.0,113.0,131.0,120.0,71.0,107.0,98.0,66.0,50.0,115.0,26.0,63.0,27.0,12.0,34.0,39.0,21.0,95.0,97.0,97.0,36.0,53.0,23.0,109.0,109.0,111.0,95.0,36.0,0.0
75%,185.0,182.0,103.0,137.0,141.0,117.0,136.0,125.0,74.0,111.0,102.0,70.0,53.0,120.0,28.0,66.0,30.0,14.0,35.0,41.0,23.0,99.0,100.0,100.0,39.0,56.0,25.0,115.0,113.0,115.0,100.0,37.0,1.0
max,206.0,200.0,120.0,155.0,167.0,145.0,158.0,149.0,89.0,128.0,123.0,82.0,65.0,138.0,35.0,78.0,39.0,20.0,41.0,46.0,32.0,120.0,112.0,113.0,49.0,69.0,35.0,140.0,128.0,135.0,118.0,50.0,1.0


In [11]:
# Test data

model_data_howell_test['Sex']= model_data_howell_test['Sex'].map({'M': 0,'F': 1})

model_data_howell_test['Sex'] = model_data_howell_test['Sex'].astype(int)

model_data_howell_test.describe()


Unnamed: 0,GOL,NOL,BNL,BBH,XCB,XFB,ZYB,AUB,WCB,ASB,BPL,NPH,NLH,JUB,NLB,MAB,MDH,MDB,OBH,OBB,DKB,ZMB,FMB,EKB,IML,XML,WMH,STB,FRC,PAC,OCC,FOL,Sex
count,524.0,524.0,524.0,524.0,524.0,524.0,524.0,524.0,524.0,524.0,524.0,524.0,524.0,524.0,524.0,524.0,524.0,524.0,524.0,524.0,524.0,524.0,524.0,524.0,524.0,524.0,524.0,524.0,524.0,524.0,524.0,524.0,524.0
mean,180.442748,177.931298,99.812977,132.652672,137.679389,113.992366,133.007634,122.110687,71.730916,108.068702,99.230916,67.041985,50.46374,116.933206,26.496183,64.675573,28.280534,12.860687,33.576336,39.979008,21.812977,96.601145,98.30916,98.545802,36.717557,53.645038,23.167939,109.604962,110.049618,112.017176,96.150763,36.137405,0.303435
std,9.450817,8.843825,6.271071,7.406367,7.722498,6.868885,8.416042,7.75233,5.20462,5.825297,6.943787,6.03924,4.302676,6.962157,2.794762,4.511054,3.906107,2.258966,2.493566,2.385732,2.555567,6.372989,5.459728,5.253407,4.108518,4.632576,2.870441,8.242643,5.836312,7.352221,5.661431,2.915019,0.460181
min,158.0,156.0,80.0,103.0,116.0,95.0,112.0,104.0,60.0,90.0,80.0,49.0,35.0,98.0,20.0,53.0,17.0,5.0,27.0,34.0,16.0,78.0,84.0,86.0,22.0,39.0,15.0,75.0,94.0,89.0,80.0,25.0,0.0
25%,174.0,172.0,96.0,128.0,132.0,109.0,128.0,117.0,68.0,104.0,95.0,63.0,47.0,112.0,25.0,62.0,26.0,11.0,32.0,38.0,20.0,92.75,94.0,95.0,34.0,51.0,21.0,105.0,106.0,107.0,92.0,34.0,0.0
50%,180.0,177.0,100.0,132.0,137.0,114.0,133.0,122.0,71.0,108.0,99.0,67.0,50.0,117.0,26.0,64.0,28.0,13.0,33.5,40.0,22.0,97.0,98.0,99.0,37.0,53.0,23.0,110.0,110.0,112.0,96.0,36.0,0.0
75%,186.0,184.0,103.0,138.0,142.25,119.0,138.0,127.0,75.0,112.0,103.0,71.0,53.0,122.0,28.0,68.0,31.0,14.0,35.0,41.0,24.0,101.0,102.0,102.0,40.0,57.0,25.0,115.0,114.0,117.0,100.0,38.0,1.0
max,212.0,209.0,125.0,159.0,161.0,141.0,157.0,145.0,88.0,133.0,131.0,90.0,68.0,141.0,55.0,86.0,39.0,22.0,43.0,49.0,30.0,117.0,124.0,118.0,50.0,68.0,32.0,141.0,136.0,134.0,113.0,49.0,1.0


# Merge the dataset with the test set and sample

In [12]:
full_set = pd.concat([model_data_howell, model_data_howell_test])

full_set = full_set.sample(frac=1).reset_index(drop=True)

full_set.describe()

Unnamed: 0,GOL,NOL,BNL,BBH,XCB,XFB,ZYB,AUB,WCB,ASB,BPL,NPH,NLH,JUB,NLB,MAB,MDH,MDB,OBH,OBB,DKB,ZMB,FMB,EKB,IML,XML,WMH,STB,FRC,PAC,OCC,FOL,Sex
count,3048.0,3048.0,3048.0,3048.0,3048.0,3048.0,3048.0,3048.0,3048.0,3048.0,3048.0,3048.0,3048.0,3048.0,3048.0,3048.0,3048.0,3048.0,3048.0,3048.0,3048.0,3048.0,3048.0,3048.0,3048.0,3048.0,3048.0,3048.0,3048.0,3048.0,3048.0,3048.0,3048.0
mean,179.390748,177.066273,99.239173,131.817585,136.990814,113.512467,131.151575,120.852362,71.115157,107.064961,98.031496,66.159121,50.087598,115.474738,26.332677,63.739829,27.513123,12.405184,33.652231,39.571194,21.455709,95.254593,97.202756,97.54626,36.212927,52.843176,22.813976,109.391404,109.62664,110.825459,95.769685,35.845472,0.43143
std,8.71169,8.092398,5.895397,7.276989,7.370092,6.462929,7.943846,7.474485,4.869433,5.734159,6.50094,5.638137,4.056688,6.385752,2.413496,4.146904,3.89177,2.166866,2.275954,2.098224,2.441439,5.883522,4.67846,4.452406,4.050907,4.444052,3.018025,7.932312,5.536557,6.779841,5.90351,2.694134,0.495357
min,151.0,151.0,80.0,103.0,116.0,95.0,105.0,98.0,57.0,88.0,80.0,48.0,35.0,97.0,19.0,52.0,16.0,5.0,26.0,33.0,13.0,78.0,81.0,83.0,20.0,38.0,14.0,75.0,93.0,89.0,79.0,25.0,0.0
25%,174.0,172.0,95.0,127.0,132.0,109.0,125.0,116.0,68.0,103.0,94.0,63.0,47.0,111.0,25.0,61.0,25.0,11.0,32.0,38.0,20.0,91.0,94.0,95.0,33.0,50.0,21.0,104.0,106.0,106.0,92.0,34.0,0.0
50%,179.0,177.0,99.0,132.0,137.0,113.0,131.0,121.0,71.0,107.0,98.0,66.0,50.0,115.0,26.0,64.0,28.0,12.0,34.0,40.0,21.0,95.0,97.0,97.0,36.0,53.0,23.0,109.0,109.0,111.0,96.0,36.0,0.0
75%,185.0,182.0,103.0,137.0,141.0,117.25,137.0,126.0,74.0,111.0,102.0,70.0,53.0,120.0,28.0,66.0,30.0,14.0,35.0,41.0,23.0,99.0,100.0,100.0,39.0,56.0,25.0,115.0,113.0,115.0,100.0,38.0,1.0
max,212.0,209.0,125.0,159.0,167.0,145.0,158.0,149.0,89.0,133.0,131.0,90.0,68.0,141.0,55.0,86.0,39.0,22.0,43.0,49.0,32.0,120.0,124.0,118.0,50.0,69.0,35.0,141.0,136.0,135.0,118.0,50.0,1.0


# Training and test sets

In [14]:
X = full_set.drop('Sex', axis = 1).values
y = full_set['Sex']
    
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.7, test_size=0.3, stratify=y)

# Classification without hyperparameter optimization

In [15]:
classifier_names = [
    "Logistic Regression", 
    "Decision Tree Classifier", 
    "Support Vector Machines", 
    "Gaussian Process Classifier", 
    "Gradient Boosting Classifier", 
    "Random Forest Classifier",
    "Ada Boost Classifier", 
    "Extra Trees Classifier", 
    "Gaussian Naive Bayes", 
    "KNNeighbors Classifier",
    "Linear Discriminant Analysis", 
    "Quadratic Discriminant Analysis", 
    "XGBClassifier", 
    "Light Gradient Boosting Classifier"
]


classifiers = [
    LogisticRegression(),
    DecisionTreeClassifier(),
    SVC(),
    GaussianProcessClassifier(1.0 * RBF(1.0)),
    GradientBoostingClassifier(),
    RandomForestClassifier(),
    AdaBoostClassifier(),
    ExtraTreesClassifier(),
    GaussianNB(),
    KNeighborsClassifier(),
    LinearDiscriminantAnalysis(),
    QuadraticDiscriminantAnalysis(),
    XGBClassifier(),
    lgb.LGBMClassifier()
]


In [16]:
dataset_scores_list = []

for name, clf in zip(classifier_names, classifiers):
        
    clf.fit(X_train, y_train)
    score = clf.score(X_test, y_test)*100

    dataset_scores_list.append(score)
    

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [17]:
dataset_scores_list

[81.31147540983606,
 75.84699453551913,
 82.18579234972677,
 76.83060109289617,
 83.49726775956285,
 82.18579234972677,
 82.07650273224044,
 82.40437158469945,
 79.45355191256832,
 80.87431693989072,
 85.46448087431693,
 82.29508196721311,
 83.16939890710383,
 83.82513661202185]

In [18]:
results = pd.DataFrame(index=classifier_names)
results['Howells'] = dataset_scores_list

results

Unnamed: 0,Howells
Logistic Regression,81.311475
Decision Tree Classifier,75.846995
Support Vector Machines,82.185792
Gaussian Process Classifier,76.830601
Gradient Boosting Classifier,83.497268
Random Forest Classifier,82.185792
Ada Boost Classifier,82.076503
Extra Trees Classifier,82.404372
Gaussian Naive Bayes,79.453552
KNNeighbors Classifier,80.874317


In [17]:
# Exporting the table to LateX format

# print(results.to_latex(float_format="%.2f"))

# Hyperparameter optimization

In [19]:
# Logistic regression model

model = LogisticRegression()

model.fit(X_train, y_train)

model.score(X_test, y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.8131147540983606

In [19]:
# Optimizing the logistic regression model

model = LogisticRegression()

parameters = {
    'C': np.logspace(-2,2,5),
    'max_iter': [2500],
    'random_state': [0]
}

clf  = BayesSearchCV(model, parameters, cv=10, return_train_score=False)

clf.fit(X, y)





BayesSearchCV(cv=10, estimator=LogisticRegression(),
              search_spaces={'C': array([0.010000, 0.100000, 1.000000, 10.000000, 100.000000]),
                             'max_iter': [2500], 'random_state': [0]})

In [20]:
clf.best_params_

OrderedDict([('C', 0.01), ('max_iter', 2500), ('random_state', 0)])

In [21]:
model = LogisticRegression(**clf.best_params_)

model.fit(X_train, y_train)

model.score(X_test, y_test)

0.8655737704918033

In [43]:
# If you want to export a model use the following command

# pickle.dump(model, open("logreg_model_howell.dat", "wb"))

In [24]:
# Support vector machines

model = SVC()

model.fit(X_train, y_train)

model.score(X_test, y_test)


0.8218579234972677

In [26]:
# Optimizing the Support Vector Machine model

model = SVC()

parameters = {
    'C': np.logspace(-2,2,5)
     #'kernel': ['rbf','linear']
}

clf  = GridSearchCV(model, parameters, cv=10, return_train_score=False)

clf.fit(X, y)

GridSearchCV(cv=10, estimator=SVC(),
             param_grid={'C': array([0.010000, 0.100000, 1.000000, 10.000000, 100.000000])})

In [27]:
clf.best_params_

{'C': 100.0}

In [28]:
model = SVC(**clf.best_params_, probability=True)

model.fit(X_train, y_train)

model.score(X_test, y_test)

0.8579234972677595

In [29]:
# pickle.dump(model, open("svm_model_howell.dat", "wb"))

In [30]:
# kNN classifier

model = KNeighborsClassifier()

model.fit(X_train, y_train)

model.score(X_test, y_test)

0.8207650273224044

In [31]:
# Optimizing the kNN classifier

model = KNeighborsClassifier()

parameters = {
    'n_neighbors': list(range(1,21)),
#      'weights' : ['uniform', 'distance'],
#       'metric' : ['euclidean', 'manhattan'],
    'leaf_size': list(range(1,20))
             }

clf  = GridSearchCV(model, parameters, cv=10, return_train_score=False)

clf.fit(X, y)

GridSearchCV(cv=10, estimator=KNeighborsClassifier(),
             param_grid={'leaf_size': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
                                       13, 14, 15, 16, 17, 18, 19],
                         'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
                                         13, 14, 15, 16, 17, 18, 19, 20]})

In [32]:
clf.best_params_

{'leaf_size': 1, 'n_neighbors': 15}

In [33]:
model = KNeighborsClassifier(**clf.best_params_)

model.fit(X_train, y_train)

model.score(X_test, y_test)

0.8185792349726776

In [34]:
# Gaussian Naive Bayes

model = GaussianNB()

model.fit(X_train, y_train)

model.score(X_test, y_test)

0.8065573770491803

In [37]:
# Optimizing the Gaussian Naive Bayes classifier

model = GaussianNB()

parameters = {
    'var_smoothing': np.logspace(0,-9, num=100)
             }

clf  = GridSearchCV(model, parameters, cv=10, return_train_score=False)

clf.fit(X, y)

GridSearchCV(cv=10, estimator=GaussianNB(),
             param_grid={'var_smoothing': array([1.000000, 0.811131, 0.657933, 0.533670, 0.432876, 0.351119,
       0.284804, 0.231013, 0.187382, 0.151991, 0.123285, 0.100000,
       0.081113, 0.065793, 0.053367, 0.043288, 0.035112, 0.028480,
       0.023101, 0.018738, 0.015199, 0.012328, 0.010000, 0.008111,
       0.006579, 0.005337, 0.004329, 0.003511, 0.002848, 0.002310,
       0.0...
       0.000004, 0.000003, 0.000002, 0.000002, 0.000002, 0.000001,
       0.000001, 0.000001, 0.000001, 0.000001, 0.000000, 0.000000,
       0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000,
       0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000,
       0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000,
       0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000,
       0.000000, 0.000000, 0.000000, 0.000000])})

In [38]:
clf.best_params_

{'var_smoothing': 0.0657933224657568}

In [39]:
model = GaussianNB(**clf.best_params_)

model.fit(X_train, y_train)

model.score(X_test, y_test)

0.805464480874317

In [40]:
# Linear Discriminant Analysis

model = LinearDiscriminantAnalysis()

model.fit(X_train, y_train)

model.score(X_test, y_test)

0.8644808743169399

In [41]:
# Optimizing the Linear Discriminant Analysis classifier

model = LinearDiscriminantAnalysis()

parameters = {
    'solver' : ['svd', 'lsqr', 'eigen']
             }

clf  = GridSearchCV(model, parameters, cv=10, return_train_score=False)

clf.fit(X, y)

GridSearchCV(cv=10, estimator=LinearDiscriminantAnalysis(),
             param_grid={'solver': ['svd', 'lsqr', 'eigen']})

In [42]:
clf.best_params_

{'solver': 'svd'}

In [43]:
model = LinearDiscriminantAnalysis(**clf.best_params_)

model.fit(X_train, y_train)

model.score(X_test, y_test)

0.8644808743169399

In [44]:
# pickle.dump(model, open("lda_model_howell.dat", "wb"))


In [45]:
# Quadratic Discriminant Analysis

model = QuadraticDiscriminantAnalysis()

model.fit(X_train, y_train)

model.score(X_test, y_test)

0.833879781420765

In [46]:
# Optimizing the Quadratic Discriminant Analysis classifier

model = QuadraticDiscriminantAnalysis()

parameters = {
    'reg_param' : [0., 0.1, 0.2, 0.3, 0.4]
             }

clf  = GridSearchCV(model, parameters, cv=10, return_train_score=False)

clf.fit(X_train,y_train)

GridSearchCV(cv=10, estimator=QuadraticDiscriminantAnalysis(),
             param_grid={'reg_param': [0.0, 0.1, 0.2, 0.3, 0.4]})

In [47]:
clf.best_params_

{'reg_param': 0.1}

In [48]:
# Quadratic Discriminant Analysis

model = QuadraticDiscriminantAnalysis(**clf.best_params_)

model.fit(X_train, y_train)

model.score(X_test, y_test)

0.8327868852459016

In [49]:
# Decision Tree Classifier

model = DecisionTreeClassifier()

model.fit(X_train, y_train)

model.score(X_test, y_test)

0.760655737704918

In [52]:
# Optimizing the Decision Tree Classifier

model = DecisionTreeClassifier()

parameters = {
    'criterion':['entropy','gini'],
    'max_depth':[1,2,3,4,5,6,7,15,20,30,40,120,150]
}

clf  = GridSearchCV(model, parameters, cv=10, return_train_score=False)

clf.fit(X, y)

GridSearchCV(cv=10, estimator=DecisionTreeClassifier(),
             param_grid={'criterion': ['entropy', 'gini'],
                         'max_depth': [1, 2, 3, 4, 5, 6, 7, 15, 20, 30, 40, 120,
                                       150]})

In [53]:
clf.best_params_

{'criterion': 'entropy', 'max_depth': 2}

In [54]:
model = DecisionTreeClassifier(**clf.best_params_)

model.fit(X_train, y_train)

model.score(X_test, y_test)

0.8153005464480875

In [55]:
# Random Forest Classifier

model = RandomForestClassifier()

model.fit(X_train, y_train)

model.score(X_test, y_test)

0.8306010928961749

In [56]:
# Optimizing the Random Forest Classifier

model = RandomForestClassifier()

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]

# Number of features to consider at every split
max_features = ['auto', 'sqrt']

# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)

# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]

# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]

# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the random grid
random_grid = {
    'n_estimators': n_estimators,
    'max_features': max_features,
    'max_depth': max_depth,
    'min_samples_split': min_samples_split,
    'min_samples_leaf': min_samples_leaf,
    'bootstrap': bootstrap
}

clf  = RandomizedSearchCV(model, param_distributions=random_grid, n_iter = 20, cv=10, return_train_score=False)

clf.fit(X, y)

RandomizedSearchCV(cv=10, estimator=RandomForestClassifier(), n_iter=20,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [10, 20, 30, 40, 50, 60,
                                                      70, 80, 90, 100, 110,
                                                      None],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [1, 2, 4],
                                        'min_samples_split': [2, 5, 10],
                                        'n_estimators': [200, 400, 600, 800,
                                                         1000, 1200, 1400, 1600,
                                                         1800, 2000]})

In [57]:
clf.best_params_

{'n_estimators': 2000,
 'min_samples_split': 5,
 'min_samples_leaf': 1,
 'max_features': 'sqrt',
 'max_depth': 40,
 'bootstrap': False}

In [58]:
model = RandomForestClassifier(**clf.best_params_)

model.fit(X_train, y_train)

print(model.score(X_test, y_test))

0.8327868852459016


In [59]:
# XGBoost Classifier

model = XGBClassifier()

model.fit(X_train, y_train)

model.score(X_test, y_test)

0.8426229508196721

In [60]:
# Optimizing the XGBoost Classifier

model = XGBClassifier()

parameters = {
        'min_child_weight': [1, 5, 10],
        'gamma': [0.5, 1, 1.5, 2, 5],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'max_depth': [3, 4, 5]
}

clf  = BayesSearchCV(model, parameters, cv=10, return_train_score=False)

clf.fit(X, y)



BayesSearchCV(cv=10, estimator=XGBClassifier(),
              search_spaces={'colsample_bytree': [0.6, 0.8, 1.0],
                             'gamma': [0.5, 1, 1.5, 2, 5],
                             'max_depth': [3, 4, 5],
                             'min_child_weight': [1, 5, 10],
                             'subsample': [0.6, 0.8, 1.0]})

In [62]:
clf.best_params_

OrderedDict([('colsample_bytree', 1.0),
             ('gamma', 1.5),
             ('max_depth', 4),
             ('min_child_weight', 1),
             ('subsample', 0.6)])

In [64]:
model = XGBClassifier(**clf.best_params_)

model.fit(X_train, y_train)

print(model.score(X_test, y_test))


0.8524590163934426


In [24]:
# Gaussian Process Classifier

model = GaussianProcessClassifier()

model.fit(X_train, y_train)

model.score(X_test, y_test)

0.7683060109289618

In [25]:
# Optimizing the GaussianProcessClassifier

model = GaussianProcessClassifier()

parameters = {
    'kernel' : [1*RBF(), 1*DotProduct(), 1*Matern(),  1*RationalQuadratic(), 1*WhiteKernel()]
}

clf  = GridSearchCV(model, parameters, cv=10, return_train_score=False)

clf.fit(X, y)

Traceback (most recent call last):
  File "/Users/cconsta1/opt/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/cconsta1/opt/anaconda3/lib/python3.9/site-packages/sklearn/gaussian_process/_gpc.py", line 664, in fit
    self.base_estimator_.fit(X, y)
  File "/Users/cconsta1/opt/anaconda3/lib/python3.9/site-packages/sklearn/gaussian_process/_gpc.py", line 212, in fit
    optima = [self._constrained_optimization(obj_func,
  File "/Users/cconsta1/opt/anaconda3/lib/python3.9/site-packages/sklearn/gaussian_process/_gpc.py", line 445, in _constrained_optimization
    opt_res = scipy.optimize.minimize(
  File "/Users/cconsta1/opt/anaconda3/lib/python3.9/site-packages/scipy/optimize/_minimize.py", line 623, in minimize
    return _minimize_lbfgsb(fun, x0, args, jac, bounds,
  File "/Users/cconsta1/opt/anaconda3/lib/python3.9/site-packages/scipy/optimize/lbfgsb.py", line 360

Traceback (most recent call last):
  File "/Users/cconsta1/opt/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/cconsta1/opt/anaconda3/lib/python3.9/site-packages/sklearn/gaussian_process/_gpc.py", line 664, in fit
    self.base_estimator_.fit(X, y)
  File "/Users/cconsta1/opt/anaconda3/lib/python3.9/site-packages/sklearn/gaussian_process/_gpc.py", line 212, in fit
    optima = [self._constrained_optimization(obj_func,
  File "/Users/cconsta1/opt/anaconda3/lib/python3.9/site-packages/sklearn/gaussian_process/_gpc.py", line 445, in _constrained_optimization
    opt_res = scipy.optimize.minimize(
  File "/Users/cconsta1/opt/anaconda3/lib/python3.9/site-packages/scipy/optimize/_minimize.py", line 623, in minimize
    return _minimize_lbfgsb(fun, x0, args, jac, bounds,
  File "/Users/cconsta1/opt/anaconda3/lib/python3.9/site-packages/scipy/optimize/lbfgsb.py", line 360

Traceback (most recent call last):
  File "/Users/cconsta1/opt/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/cconsta1/opt/anaconda3/lib/python3.9/site-packages/sklearn/gaussian_process/_gpc.py", line 664, in fit
    self.base_estimator_.fit(X, y)
  File "/Users/cconsta1/opt/anaconda3/lib/python3.9/site-packages/sklearn/gaussian_process/_gpc.py", line 212, in fit
    optima = [self._constrained_optimization(obj_func,
  File "/Users/cconsta1/opt/anaconda3/lib/python3.9/site-packages/sklearn/gaussian_process/_gpc.py", line 445, in _constrained_optimization
    opt_res = scipy.optimize.minimize(
  File "/Users/cconsta1/opt/anaconda3/lib/python3.9/site-packages/scipy/optimize/_minimize.py", line 623, in minimize
    return _minimize_lbfgsb(fun, x0, args, jac, bounds,
  File "/Users/cconsta1/opt/anaconda3/lib/python3.9/site-packages/scipy/optimize/lbfgsb.py", line 360

Traceback (most recent call last):
  File "/Users/cconsta1/opt/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/cconsta1/opt/anaconda3/lib/python3.9/site-packages/sklearn/gaussian_process/_gpc.py", line 664, in fit
    self.base_estimator_.fit(X, y)
  File "/Users/cconsta1/opt/anaconda3/lib/python3.9/site-packages/sklearn/gaussian_process/_gpc.py", line 212, in fit
    optima = [self._constrained_optimization(obj_func,
  File "/Users/cconsta1/opt/anaconda3/lib/python3.9/site-packages/sklearn/gaussian_process/_gpc.py", line 445, in _constrained_optimization
    opt_res = scipy.optimize.minimize(
  File "/Users/cconsta1/opt/anaconda3/lib/python3.9/site-packages/scipy/optimize/_minimize.py", line 623, in minimize
    return _minimize_lbfgsb(fun, x0, args, jac, bounds,
  File "/Users/cconsta1/opt/anaconda3/lib/python3.9/site-packages/scipy/optimize/lbfgsb.py", line 360





GridSearchCV(cv=10, estimator=GaussianProcessClassifier(),
             param_grid={'kernel': [1**2 * RBF(length_scale=1),
                                    1**2 * DotProduct(sigma_0=1),
                                    1**2 * Matern(length_scale=1, nu=1.5),
                                    1**2 * RationalQuadratic(alpha=1, length_scale=1),
                                    1**2 * WhiteKernel(noise_level=1)]})

In [26]:
clf.best_params_

{'kernel': 1**2 * RationalQuadratic(alpha=1, length_scale=1)}

In [27]:
model = GaussianProcessClassifier(**clf.best_params_, max_iter_predict = 1000)

model.fit(X_train, y_train)

print(model.score(X_test, y_test))

0.853551912568306


In [28]:
# Gradient Boosting Classifier

model = GradientBoostingClassifier()

model.fit(X_train, y_train)

model.score(X_test, y_test)

0.8360655737704918

In [29]:
# Optimizing the Gradient Boosting Classifier

model = GradientBoostingClassifier()

parameters = {
    "n_estimators":[5,50,250,500],
    "max_depth":[1,3,5,7,9],
    "learning_rate":[0.01,0.1,1,10,100]
}

clf  = BayesSearchCV(model, parameters, cv=10, return_train_score=False)

clf.fit(X_train,y_train)



BayesSearchCV(cv=10, estimator=GradientBoostingClassifier(),
              search_spaces={'learning_rate': [0.01, 0.1, 1, 10, 100],
                             'max_depth': [1, 3, 5, 7, 9],
                             'n_estimators': [5, 50, 250, 500]})

In [30]:
clf.best_params_

OrderedDict([('learning_rate', 0.1), ('max_depth', 5), ('n_estimators', 500)])

In [31]:
model = GradientBoostingClassifier(**clf.best_params_)

model.fit(X_train, y_train)

print(model.score(X_test, y_test))


0.8491803278688524


In [32]:
# Ada Boost Classifier

model = AdaBoostClassifier()

model.fit(X_train, y_train)

model.score(X_test, y_test)

0.8207650273224044

In [33]:
# Optimizing the Gradient Boosting Classifier

model = AdaBoostClassifier()

parameters = {
    "n_estimators":[5,50,250,500],
    "learning_rate":[0.01,0.1,1,10,100]
}

clf  = GridSearchCV(model, parameters, cv=10, return_train_score=False)

clf.fit(X, y)

  sample_weight *= np.exp(estimator_weight *
  sample_weight /= sample_weight_sum
Traceback (most recent call last):
  File "/Users/cconsta1/opt/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/cconsta1/opt/anaconda3/lib/python3.9/site-packages/sklearn/ensemble/_weight_boosting.py", line 443, in fit
    return super().fit(X, y, sample_weight)
  File "/Users/cconsta1/opt/anaconda3/lib/python3.9/site-packages/sklearn/ensemble/_weight_boosting.py", line 130, in fit
    sample_weight, estimator_weight, estimator_error = self._boost(
  File "/Users/cconsta1/opt/anaconda3/lib/python3.9/site-packages/sklearn/ensemble/_weight_boosting.py", line 503, in _boost
    return self._boost_real(iboost, X, y, sample_weight, random_state)
  File "/Users/cconsta1/opt/anaconda3/lib/python3.9/site-packages/sklearn/ensemble/_weight_boosting.py", line 513, in _boost_real
    estimator.fi

  sample_weight *= np.exp(estimator_weight *
  sample_weight /= sample_weight_sum
Traceback (most recent call last):
  File "/Users/cconsta1/opt/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/cconsta1/opt/anaconda3/lib/python3.9/site-packages/sklearn/ensemble/_weight_boosting.py", line 443, in fit
    return super().fit(X, y, sample_weight)
  File "/Users/cconsta1/opt/anaconda3/lib/python3.9/site-packages/sklearn/ensemble/_weight_boosting.py", line 130, in fit
    sample_weight, estimator_weight, estimator_error = self._boost(
  File "/Users/cconsta1/opt/anaconda3/lib/python3.9/site-packages/sklearn/ensemble/_weight_boosting.py", line 503, in _boost
    return self._boost_real(iboost, X, y, sample_weight, random_state)
  File "/Users/cconsta1/opt/anaconda3/lib/python3.9/site-packages/sklearn/ensemble/_weight_boosting.py", line 513, in _boost_real
    estimator.fi

  sample_weight *= np.exp(estimator_weight *
  sample_weight /= sample_weight_sum
Traceback (most recent call last):
  File "/Users/cconsta1/opt/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/cconsta1/opt/anaconda3/lib/python3.9/site-packages/sklearn/ensemble/_weight_boosting.py", line 443, in fit
    return super().fit(X, y, sample_weight)
  File "/Users/cconsta1/opt/anaconda3/lib/python3.9/site-packages/sklearn/ensemble/_weight_boosting.py", line 130, in fit
    sample_weight, estimator_weight, estimator_error = self._boost(
  File "/Users/cconsta1/opt/anaconda3/lib/python3.9/site-packages/sklearn/ensemble/_weight_boosting.py", line 503, in _boost
    return self._boost_real(iboost, X, y, sample_weight, random_state)
  File "/Users/cconsta1/opt/anaconda3/lib/python3.9/site-packages/sklearn/ensemble/_weight_boosting.py", line 513, in _boost_real
    estimator.fi

GridSearchCV(cv=10, estimator=AdaBoostClassifier(),
             param_grid={'learning_rate': [0.01, 0.1, 1, 10, 100],
                         'n_estimators': [5, 50, 250, 500]})

In [34]:
clf.best_params_

{'learning_rate': 0.1, 'n_estimators': 500}

In [35]:
model = AdaBoostClassifier(**clf.best_params_)

model.fit(X_train, y_train)

print(model.score(X_test, y_test))

0.8349726775956284


In [36]:
# Extra trees regressor

model = ExtraTreesClassifier()

model.fit(X_train, y_train)

model.score(X_test, y_test)

0.8207650273224044

In [37]:
# Optimizing the ExtraTreesClassifier

model = ExtraTreesClassifier()

parameters = {
        'n_estimators': list(range(50,126,25)),
        'min_samples_leaf': list(range(1,20,1)),
        'min_samples_split': list(range(1,20,1))
    }

clf  = RandomizedSearchCV(model, parameters, cv=10, return_train_score=False)

clf.fit(X, y)

RandomizedSearchCV(cv=10, estimator=ExtraTreesClassifier(),
                   param_distributions={'min_samples_leaf': [1, 2, 3, 4, 5, 6,
                                                             7, 8, 9, 10, 11,
                                                             12, 13, 14, 15, 16,
                                                             17, 18, 19],
                                        'min_samples_split': [1, 2, 3, 4, 5, 6,
                                                              7, 8, 9, 10, 11,
                                                              12, 13, 14, 15,
                                                              16, 17, 18, 19],
                                        'n_estimators': [50, 75, 100, 125]})

In [38]:
clf.best_params_

{'n_estimators': 75, 'min_samples_split': 11, 'min_samples_leaf': 1}

In [39]:
model = ExtraTreesClassifier(**clf.best_params_)

model.fit(X_train, y_train)

print(model.score(X_test, y_test))

0.8316939890710382


In [40]:
# Light boosting regressor

model = lgb.LGBMClassifier()

model.fit(X_train, y_train)

model.score(X_test, y_test)

0.8382513661202186

In [41]:
# Optimizing the LGBMClassifier

model = lgb.LGBMClassifier()

parameters = {
    'num_leaves': [5, 10, 20, 31, 50, 100], 
    'min_child_samples': [20, 30, 50 , 100], 
     'min_child_weight': [1e-5, 1e-3, 1e-2, 1e-1, 1],
     'reg_alpha': [0, 1e-1, 1],
    'reg_lambda': [0, 1e-1, 1, 5, 10]
    }

clf  = RandomizedSearchCV(model, parameters, cv=10, return_train_score=False)

clf.fit(X, y)

RandomizedSearchCV(cv=10, estimator=LGBMClassifier(),
                   param_distributions={'min_child_samples': [20, 30, 50, 100],
                                        'min_child_weight': [1e-05, 0.001, 0.01,
                                                             0.1, 1],
                                        'num_leaves': [5, 10, 20, 31, 50, 100],
                                        'reg_alpha': [0, 0.1, 1],
                                        'reg_lambda': [0, 0.1, 1, 5, 10]})

In [42]:
clf.best_params_

{'reg_lambda': 0.1,
 'reg_alpha': 0.1,
 'num_leaves': 50,
 'min_child_weight': 0.001,
 'min_child_samples': 30}

In [43]:
model = lgb.LGBMClassifier(**clf.best_params_)

model.fit(X_train, y_train)

print(model.score(X_test, y_test))

0.839344262295082
