# Import necessary libraries

In [2]:
#importing all the necessary libraries

# system

import os

# data analysis and plotting

import pandas as pd
import numpy as np
from scipy.stats import zscore
from scipy.stats import shapiro
from random import randint
import matplotlib.pyplot as plt 
import seaborn as sns
from xgboost import plot_importance
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA, KernelPCA
from sklearn.utils import shuffle

# data processing and model validation

from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, explained_variance_score, confusion_matrix, accuracy_score, classification_report, log_loss
from math import sqrt
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold


# classification libraries

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.gaussian_process.kernels import RBF, DotProduct, WhiteKernel, Matern, RationalQuadratic
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

from xgboost import XGBClassifier
import lightgbm as lgb


# Importing imputation libs. 

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.impute import KNNImputer

# Hyperparameter optimization

from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from skopt import BayesSearchCV

# web stuff
import pickle

# Various parameter settings

%matplotlib inline

# To install sklearn type "pip install numpy scipy scikit-learn" to the anaconda terminal

# To change scientific numbers to float
np.set_printoptions(formatter={'float_kind':'{:f}'.format})

# Increases the size of sns plots
sns.set(rc={'figure.figsize':(12,10)})

# import sys
# !conda list Check the packages installed

# Displaying all the rows/columns in a data set (the default option is not to show them)

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# Import the data and create the datasets needed in the analysis

In [3]:
# Importing the raw data

raw_data_goldman = pd.read_csv("datasets/Goldman.csv", header = 0, encoding= 'unicode_escape')

In [4]:
raw_data_goldman.describe()

Unnamed: 0,Element:,LHUM,RHUM,LRAD,RRAD,LFEM,RFEM,LTIB,RTIB,OSCX,Metrics:,LHML,LHEB,LHHD,LHMLD,LHAPD,RHML,RHEB,RHHD,RHMLD,RHAPD,LRML,LRMLD,LRAPD,RRML,RRMLD,RRAPD,LFML,LFBL,LFEB,LFAB,LFHD,LFMLD,LFAPD,RFML,RFBL,RFEB,RFAB,RFHD,RFMLD,RFAPD,LTML,LTPB,LTMLD,LTAPD,RTML,RTPB,RTMLD,RTAPD,BIB,LIBL,RIBL,LAcH,RAcH,Derived:,Brachial,Crural,IL UL/LL,IL LL/UL,CBR FHD,McH FHD,GRINE FHD,AVG FHD
count,0.0,1538.0,1538.0,1538.0,1538.0,1538.0,1538.0,1538.0,1538.0,1538.0,0.0,1376.0,1354.0,1368.0,1376.0,1376.0,1403.0,1384.0,1396.0,1403.0,1402.0,1321.0,1321.0,1321.0,1337.0,1340.0,1340.0,1421.0,1416.0,1380.0,1378.0,1421.0,1423.0,1423.0,1426.0,1423.0,1386.0,1390.0,1435.0,1424.0,1424.0,1403.0,1352.0,1399.0,1398.0,1400.0,1349.0,1395.0,1394.0,1469.0,1179.0,1177.0,1371.0,1375.0,0.0,1463.0,1490.0,1418.0,1418.0,1519.0,1519.0,1519.0,1519.0
mean,,0.10078,0.081274,0.140442,0.126788,0.070221,0.064369,0.086476,0.086476,0.020156,,303.759811,57.44277,42.741615,19.884404,19.717754,307.556379,58.194581,43.003059,20.362117,20.462511,233.068887,14.097017,11.188312,234.964099,14.493866,11.335993,427.106967,423.455508,76.067754,66.361168,43.430837,25.767442,27.171792,425.657433,421.620169,76.259019,66.28095,43.497401,25.440857,27.243588,353.078403,69.344305,21.457984,26.35171,352.419429,69.343217,21.985778,25.372317,262.395848,150.980068,151.104503,48.873508,48.92336,,0.765279,0.827734,0.692225,1.445639,60.145433,57.412799,62.073215,59.877149
std,,0.301135,0.273345,0.347558,0.332844,0.255602,0.245489,0.281157,0.281157,0.14058,,23.025881,5.446128,4.134822,2.472297,2.34587,23.116218,5.544286,4.257656,2.57313,2.381074,18.985815,1.898293,1.321821,18.904538,1.938423,1.364567,31.509788,31.597694,6.204072,5.857932,3.972412,2.745311,2.874594,31.751905,31.843808,6.238232,5.832092,4.05379,2.663129,2.915361,28.076348,5.773576,2.376611,3.022335,28.749863,5.775864,2.468692,2.783156,18.274559,10.573547,10.753604,4.052208,4.079195,,0.031805,0.027455,0.018438,0.038447,7.946894,8.914306,9.029766,8.555195
min,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,229.5,36.51,29.58,12.01,13.0,234.0,42.0,32.24,12.25,13.84,179.0,8.74,7.37,180.0,9.16,7.88,345.0,309.5,58.0,49.99,33.64,17.9,18.88,341.5,279.0,58.0,49.83,32.97,17.97,18.34,276.0,52.0,15.22,19.0,237.0,50.0,15.15,18.59,184.0,105.0,106.0,38.64,37.89,,0.677912,0.692025,0.59926,1.29929,36.092754,34.692285,39.05842,38.300225
25%,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,287.375,53.0,39.76,18.23,18.03,291.0,54.0,39.9,18.69,18.7325,219.0,12.78,10.23,221.0,13.1175,10.3275,404.5,401.0,72.0,62.01,40.61,23.83,25.12,403.0,399.0,72.0,61.97,40.51,23.535,25.0975,333.0,65.0,19.78,24.18,332.0,65.0,20.245,23.38,251.0,145.0,145.0,45.995,46.055,,0.744615,0.809212,0.680304,1.421166,54.614902,50.992205,55.56946,53.738995
50%,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,303.5,57.5,42.715,19.855,19.64,307.0,58.0,43.015,20.3,20.46,233.0,14.05,11.11,235.0,14.4,11.27,428.0,424.0,76.0,66.615,43.54,25.73,27.16,426.0,422.0,76.0,66.19,43.52,25.37,27.135,353.0,69.5,21.37,26.235,353.0,69.0,21.89,25.215,263.0,151.0,151.0,49.05,49.03,,0.765331,0.828423,0.691577,1.445971,60.046353,57.47411,62.13532,59.832163
75%,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,319.0,61.0,45.7225,21.46,21.3,323.5,62.0,46.0,22.035,22.1875,247.0,15.32,12.12,248.0,15.7425,12.3125,448.5,445.0,81.0,70.8225,46.15,27.65,29.175,447.5,443.5,81.0,70.605,46.395,27.2425,29.285,372.0,74.0,23.03,28.5375,372.0,74.0,23.79,27.3175,274.0,158.0,158.0,51.715,51.8,,0.785749,0.845936,0.703647,1.469932,65.196007,63.737713,68.48005,65.676422
max,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,,376.0,75.0,56.33,27.22,27.29,383.0,75.0,55.67,30.44,27.19,290.5,22.15,16.28,291.0,23.22,15.68,531.0,530.0,93.5,83.68,57.39,34.54,37.06,532.5,531.0,94.0,82.85,57.86,33.84,37.53,446.0,86.0,31.0,37.94,444.0,85.0,29.83,35.2,324.0,181.0,189.0,62.94,62.99,,1.076923,0.965422,0.769651,1.668724,92.745113,89.122375,94.1935,92.020329


In [5]:
# Define a new data set that contains all the measured data, i.e.,
# not derived data as well as the class label Sex for each measurement

measured_data_goldman = raw_data_goldman.loc[:,"LHML":"RAcH"]

# Fill missing data with zeroes so that we can average between left and right skeletal measurements below

measured_data_goldman = measured_data_goldman.fillna(0)


In [6]:
# Create new columns and take the average between left and right skeletal measurements

target_cols = ['HML', 'HHD', 'RML', 'FML', 'FBL','FHD', 'TML']


for col in target_cols:
    measured_data_goldman[col] = 0.
    
    min_left_col_value = measured_data_goldman["".join(["L",col])][measured_data_goldman["".join(["L",col])] > 0.1].min() - 1
    
    min_right_col_value = measured_data_goldman["".join(["R",col])][measured_data_goldman["".join(["R",col])] > 0.1].min() - 1
    
    measured_data_goldman.loc[(measured_data_goldman["".join(["L",col])] < 0.1) & (measured_data_goldman["".join(["R",col])] > min_right_col_value), 
        col] = measured_data_goldman["".join(["R",col])]

    measured_data_goldman.loc[(measured_data_goldman["".join(["R",col])] < 0.1) & (measured_data_goldman["".join(["L",col])] > min_left_col_value), 
       col] = measured_data_goldman["".join(["L",col])]

    measured_data_goldman.loc[(measured_data_goldman["".join(["R",col])] > min_right_col_value) & (measured_data_goldman["".join(["L",col])] > min_left_col_value), 
       col] = (measured_data_goldman["".join(["L",col])] + measured_data_goldman["".join(["R",col])])/2
    
    
    
# Create a dataset with the features we will use to build our models

model_cols = ['BIB','HML', 'HHD', 'RML', 'FML', 'FBL','FHD', 'TML']

model_data_goldman = measured_data_goldman.drop(columns=[col for col in measured_data_goldman if col not in model_cols])

# Add the Sex column

model_data_goldman = pd.concat([model_data_goldman.loc[:,:],raw_data_goldman.loc[:,"Sex"]],axis=1)

print(model_data_goldman.describe())

               BIB          HML          HHD          RML          FML  \
count  1538.000000  1538.000000  1538.000000  1538.000000  1538.000000   
mean    250.623862   298.197042    41.699971   225.650033   420.892230   
std      57.194643    52.024020     8.025574    46.894808    57.592097   
min       0.000000     0.000000     0.000000     0.000000     0.000000   
25%     249.000000   288.312500    39.520000   218.500000   402.750000   
50%     262.000000   304.500000    42.700000   233.000000   426.375000   
75%     274.000000   320.750000    45.728750   247.000000   447.937500   
max     324.000000   379.500000    55.120000   290.250000   531.750000   

               FBL          FHD          TML  
count  1538.000000  1538.000000  1538.000000  
mean    416.541125    42.925692   343.531632  
std      59.136325     6.222401    63.412299  
min       0.000000     0.000000     0.000000  
25%     399.000000    40.446250   330.812500  
50%     422.500000    43.432500   352.500000  
75% 

In [7]:
# Note that the Sex column is a string, not a value, that's why
# its not printed above. 

# But we take advandage of the fact that its a string to 
# drop the values 1? and 0?

# Get rid of 1? and 0? from sex estimation and then shuffle the dataset
# because otherwise you have 1 and 0 packed together 

model_data_goldman = pd.concat([model_data_goldman.loc[model_data_goldman['Sex']=='1'], model_data_goldman.loc[model_data_goldman['Sex']=='0']])

model_data_goldman = model_data_goldman.sample(frac=1).reset_index(drop=True)

# Now convert Sex from string to int

model_data_goldman["Sex"] = model_data_goldman["Sex"].astype(int) 

model_data_goldman.describe()

Unnamed: 0,BIB,HML,HHD,RML,FML,FBL,FHD,TML,Sex
count,1528.0,1528.0,1528.0,1528.0,1528.0,1528.0,1528.0,1528.0,1528.0
mean,250.54483,298.170681,41.714529,225.744928,420.889234,416.533704,42.939784,343.450524,0.355366
std,57.361899,52.176096,8.047175,46.673734,57.745325,59.293408,6.2375,63.587332,0.478781
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,249.0,288.25,39.53,218.5,402.5625,398.9375,40.47,330.6875,0.0
50%,262.0,304.5,42.735,233.0,426.5,422.625,43.46,352.5,0.0
75%,274.0,320.75,45.7625,247.0,447.8125,444.25,46.2775,372.0,1.0
max,324.0,379.5,55.12,290.25,531.75,530.5,57.625,444.5,1.0


In [8]:
# replace 0.0 back to nan to better handle the dataset within xgboost but
# also to become able to drop the NA entries easily

model_data_goldman[model_cols] = model_data_goldman[model_cols].replace(0.0, np.nan)

In [10]:
model_data_goldman.describe()

Unnamed: 0,BIB,HML,HHD,RML,FML,FBL,FHD,TML,Sex
count,1459.0,1491.0,1487.0,1475.0,1508.0,1506.0,1509.0,1487.0,1528.0
mean,262.393763,305.569953,42.864694,233.856441,426.47132,422.618526,43.480444,352.920242,0.355366
std,18.299914,22.965932,4.148381,18.935423,31.56893,31.525125,3.984079,28.471105,0.478781
min,184.0,234.0,29.58,179.5,343.75,329.75,33.315,271.0,0.0
25%,251.0,289.5,39.805,220.0,404.0,399.8125,40.635,332.5,0.0
50%,263.0,305.0,42.92,234.0,427.0,423.0,43.505,353.0,0.0
75%,274.0,321.25,45.81,247.5,448.0,444.5,46.305,372.25,1.0
max,324.0,379.5,55.12,290.25,531.75,530.5,57.625,444.5,1.0


In [9]:
no_zeroes_model_data_goldman = model_data_goldman.dropna()

In [11]:
no_zeroes_model_data_goldman.describe()

Unnamed: 0,BIB,HML,HHD,RML,FML,FBL,FHD,TML,Sex
count,1338.0,1338.0,1338.0,1338.0,1338.0,1338.0,1338.0,1338.0,1338.0
mean,262.366592,306.079821,42.90102,233.922459,427.14574,423.2571,43.533487,353.314387,0.355755
std,18.255714,22.667387,4.08016,18.783494,31.185288,31.154284,3.947922,28.19453,0.47892
min,184.0,234.0,31.65,179.5,343.75,329.75,33.315,276.5,0.0
25%,251.0,290.0,39.87875,220.5,405.0,401.3125,40.735,333.0,0.0
50%,263.0,305.25,42.9275,234.0,427.0,423.25,43.5225,353.375,0.0
75%,274.0,321.75,45.7975,247.5,448.0,444.75,46.34375,372.25,1.0
max,324.0,379.5,54.76,290.25,531.75,530.5,56.195,444.5,1.0


In [12]:
# Create a new data set using the knn imputer
# Here we use the 3 nearest neighbors to calculate the missing data

knn_imputer = KNNImputer(n_neighbors=3, missing_values=0.0)

sex_column = model_data_goldman['Sex']

temporary_data_set = model_data_goldman.fillna(0.).drop(["Sex"],axis=1)

cols = temporary_data_set.columns

temporary_data_set = knn_imputer.fit_transform(temporary_data_set)

temporary_data_set = pd.DataFrame(data=temporary_data_set, columns=cols)

knn_imputed_data_goldman = pd.concat([temporary_data_set,sex_column],axis=1)

In [13]:
knn_imputed_data_goldman.describe()

Unnamed: 0,BIB,HML,HHD,RML,FML,FBL,FHD,TML,Sex
count,1528.0,1528.0,1528.0,1528.0,1528.0,1528.0,1528.0,1528.0,1528.0
mean,262.323953,305.530574,42.863991,233.829407,426.414921,422.533213,43.476951,352.984337,0.355366
std,18.096663,22.855956,4.113545,18.824,31.476768,31.435141,3.969791,28.39034,0.478781
min,184.0,234.0,29.58,179.5,343.75,329.75,33.315,271.0,0.0
25%,251.0,289.5,39.82,220.4375,404.1875,400.0,40.63875,332.5,0.0
50%,263.0,305.0,42.925,234.0,426.75,423.0,43.5025,353.125,0.0
75%,274.0,321.25,45.78125,247.5,448.0,444.5,46.28625,372.5,1.0
max,324.0,379.5,55.12,290.25,531.75,530.5,57.625,444.5,1.0


In [14]:
# Create a new data set using the iterative imputer

iter_imputer = IterativeImputer(max_iter = 1000, missing_values=0.0)

sex_column = model_data_goldman['Sex']

temporary_data_set = model_data_goldman.fillna(0.).drop(["Sex"],axis=1)

cols = temporary_data_set.columns

temporary_data_set = iter_imputer.fit_transform(temporary_data_set)

temporary_data_set = pd.DataFrame(data=temporary_data_set, columns=cols)

iter_imputed_data_goldman = pd.concat([temporary_data_set,sex_column],axis=1)


In [15]:
iter_imputed_data_goldman.describe()

Unnamed: 0,BIB,HML,HHD,RML,FML,FBL,FHD,TML,Sex
count,1528.0,1528.0,1528.0,1528.0,1528.0,1528.0,1528.0,1528.0,1528.0
mean,262.198501,305.510815,42.852649,233.79795,426.440201,422.562111,43.479954,352.978508,0.355366
std,18.082733,22.862662,4.119429,18.879454,31.496783,31.449895,3.974315,28.378283,0.478781
min,184.0,234.0,29.58,179.5,343.75,329.75,33.315,271.0,0.0
25%,251.0,289.5,39.8075,220.25,404.1875,400.0,40.635,332.5,0.0
50%,263.0,305.0,42.92,234.0,426.75,423.0,43.495,353.25,0.0
75%,274.0,321.0625,45.7725,247.5,448.0,444.5,46.2975,372.487319,1.0
max,324.0,379.5,55.12,290.25,531.75,530.5,57.625,444.5,1.0


# Test sets

In [16]:
# Sample the data

no_zeroes_model_data_goldman = no_zeroes_model_data_goldman.sample(frac=1).reset_index(drop=True)

knn_imputed_data_goldman = knn_imputed_data_goldman.sample(frac=1).reset_index(drop=True)

iter_imputed_data_goldman = iter_imputed_data_goldman.sample(frac=1).reset_index(drop=True)

In [17]:
dataset_list = [
    no_zeroes_model_data_goldman, 
    knn_imputed_data_goldman, 
    iter_imputed_data_goldman
]

# Classification without optimization

In [19]:
classifier_names = [
    "Logistic Regression", 
    "Decision Tree Classifier", 
    "Support Vector Machines", 
    "Gaussian Process Classifier", 
    "Gradient Boosting Classifier", 
    "Random Forest Classifier",
    "Ada Boost Classifier", 
    "Extra Trees Classifier", 
    "Gaussian Naive Bayes", 
    "KNNeighbors Classifier",
    "Linear Discriminant Analysis", 
    "Quadratic Discriminant Analysis", 
    "XGBClassifier", 
    "Light Gradient Boosting Classifier"
]


classifiers = [
    LogisticRegression(),
    DecisionTreeClassifier(),
    SVC(),
    GaussianProcessClassifier(1.0 * RBF(1.0)),
    GradientBoostingClassifier(),
    RandomForestClassifier(),
    AdaBoostClassifier(),
    ExtraTreesClassifier(),
    GaussianNB(),
    KNeighborsClassifier(),
    LinearDiscriminantAnalysis(),
    QuadraticDiscriminantAnalysis(),
    XGBClassifier(),
    lgb.LGBMClassifier()
]

In [22]:
dataset_scores_list = []

for dataset in dataset_list:
    scores = []
    
    X = dataset.drop('Sex', axis = 1).values
    y = dataset['Sex']
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.7, test_size=0.3, stratify=y)
    
    for name, clf in zip(classifier_names, classifiers):
        run_score = []
        
        for i in range(20):
            clf.fit(X_train, y_train)
            score = clf.score(X_test, y_test)*100
            run_score.append(score)
            
            avg_score = np.mean(run_score)
                      
        #print(run_score)    
        scores.append(avg_score)
                  
    dataset_scores_list.append(scores)
       

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [23]:
dataset_scores_list

[[78.85572139303483,
  81.75373134328359,
  76.61691542288557,
  85.82089552238807,
  86.54228855721392,
  85.39800995024876,
  86.31840796019901,
  85.31094527363183,
  79.10447761194027,
  79.85074626865669,
  86.56716417910448,
  86.06965174129351,
  86.31840796019901,
  86.06965174129351],
 [81.04575163398695,
  83.72549019607844,
  77.7777777777778,
  79.73856209150328,
  87.59259259259258,
  87.48366013071896,
  87.14596949891069,
  86.86274509803921,
  81.69934640522874,
  81.2636165577342,
  88.6710239651416,
  87.36383442265793,
  88.23529411764706,
  88.6710239651416],
 [84.96732026143789,
  80.91503267973857,
  80.82788671023965,
  77.7777777777778,
  90.55555555555556,
  88.84531590413943,
  89.54248366013073,
  88.57298474945534,
  82.35294117647057,
  81.04575163398695,
  90.6318082788671,
  90.6318082788671,
  89.54248366013073,
  89.76034858387797]]

In [25]:
results = pd.DataFrame(index=classifier_names)
# results['name'] = names
results['goldman_1'] = dataset_scores_list[0]
results['goldman_2'] = dataset_scores_list[1]
results['goldman_3'] = dataset_scores_list[2]

In [26]:
results

Unnamed: 0,goldman_1,goldman_2,goldman_3
Logistic Regression,78.855721,81.045752,84.96732
Decision Tree Classifier,81.753731,83.72549,80.915033
Support Vector Machines,76.616915,77.777778,80.827887
Gaussian Process Classifier,85.820896,79.738562,77.777778
Gradient Boosting Classifier,86.542289,87.592593,90.555556
Random Forest Classifier,85.39801,87.48366,88.845316
Ada Boost Classifier,86.318408,87.145969,89.542484
Extra Trees Classifier,85.310945,86.862745,88.572985
Gaussian Naive Bayes,79.104478,81.699346,82.352941
KNNeighbors Classifier,79.850746,81.263617,81.045752


In [27]:
# print(results.to_latex(float_format="%.2f"))

# Hyperparameter optimization

In [28]:
X = iter_imputed_data_goldman.drop('Sex', axis = 1).values
y = iter_imputed_data_goldman['Sex']

In [29]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.7, test_size=0.3, stratify=y)

In [32]:
# Logistic regression model

model = LogisticRegression()

model.fit(X_train, y_train)

model.score(X_test, y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.8322440087145969

In [33]:
# Optimizing the logistic regression model

model = LogisticRegression()


parameters = {
    'C': np.logspace(-2,2,40),
    'max_iter': [1000, 1500, 2000],
    'random_state': [0,1,2,3,4,5,6,7,8,9,10,11]
}

clf  = BayesSearchCV(model, parameters, cv=10, return_train_score=False)

clf.fit(X,y)



BayesSearchCV(cv=10, estimator=LogisticRegression(),
              search_spaces={'C': array([0.010000, 0.012664, 0.016037, 0.020309, 0.025719, 0.032570,
       0.041246, 0.052233, 0.066147, 0.083768, 0.106082, 0.134340,
       0.170125, 0.215443, 0.272833, 0.345511, 0.437548, 0.554102,
       0.701704, 0.888624, 1.125336, 1.425103, 1.804722, 2.285464,
       2.894266, 3.665241, 4.641589, 5.878016, 7.443803, 9.426685,
       11.937766, 15.117751, 19.144820, 24.244620, 30.702906, 38.881552,
       49.238826, 62.355073, 78.965229, 100.000000]),
                             'max_iter': [1000, 1500, 2000],
                             'random_state': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
                                              11]})

In [34]:
clf.best_params_

OrderedDict([('C', 0.03257020655659783),
             ('max_iter', 2000),
             ('random_state', 5)])

In [35]:
model = LogisticRegression(**clf.best_params_)
model.fit(X_train, y_train)
result = model.score(X_test, y_test)*100
    
print("{:10.2f}".format(result))

     89.98


In [36]:
# Support vector machines

model = SVC()

model.fit(X_train, y_train)

model.score(X_test, y_test)


0.7821350762527233

In [37]:
# Optimizing the Support Vevtor Machine model

model = SVC()

parameters = {
    'C': np.logspace(-2,2,10),
    'kernel': ['rbf','linear']
}

clf  = RandomizedSearchCV(model, parameters, cv=10, return_train_score=False)

clf.fit(X,y)

RandomizedSearchCV(cv=10, estimator=SVC(),
                   param_distributions={'C': array([0.010000, 0.027826, 0.077426, 0.215443, 0.599484, 1.668101,
       4.641589, 12.915497, 35.938137, 100.000000]),
                                        'kernel': ['rbf', 'linear']})

In [38]:
clf.best_params_

{'kernel': 'linear', 'C': 0.5994842503189409}

In [39]:
model = SVC(**clf.best_params_, probability=True)
model.fit(X_train, y_train)
result = model.score(X_test, y_test)*100
    
print("{:10.2f}".format(result))

     91.29


In [44]:
# kNN classifier

model = KNeighborsClassifier()

model.fit(X_train, y_train)

model.score(X_test, y_test)

0.7973856209150327

In [45]:
# Optimizing the kNN classifier

model = KNeighborsClassifier()

parameters = {
    'n_neighbors': list(range(1,21)),
    'weights' : ['uniform', 'distance'],
    'metric' : ['euclidean', 'manhattan'],
    'leaf_size': list(range(1,10))
}

clf  = RandomizedSearchCV(model, parameters, cv=10, return_train_score=False)

clf.fit(X,y)

RandomizedSearchCV(cv=10, estimator=KNeighborsClassifier(),
                   param_distributions={'leaf_size': [1, 2, 3, 4, 5, 6, 7, 8,
                                                      9],
                                        'metric': ['euclidean', 'manhattan'],
                                        'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8,
                                                        9, 10, 11, 12, 13, 14,
                                                        15, 16, 17, 18, 19,
                                                        20],
                                        'weights': ['uniform', 'distance']})

In [46]:
clf.best_params_

{'weights': 'distance',
 'n_neighbors': 6,
 'metric': 'manhattan',
 'leaf_size': 2}

In [47]:
model = KNeighborsClassifier(**clf.best_params_)
model.fit(X_train, y_train)
result = model.score(X_test, y_test)*100
    
print("{:10.2f}".format(result))

     83.01


In [48]:
# Gaussian Naive Bayes

model = GaussianNB()

model.fit(X_train, y_train)

model.score(X_test, y_test)


0.8387799564270153

In [49]:
# Optimizing the Gaussian Naive Bayes classifier

model = GaussianNB()

parameters = {
    'var_smoothing': np.logspace(0,-9, num=100)
             }

clf  = GridSearchCV(model, parameters, cv=10, return_train_score=False)

clf.fit(X,y)

GridSearchCV(cv=10, estimator=GaussianNB(),
             param_grid={'var_smoothing': array([1.000000, 0.811131, 0.657933, 0.533670, 0.432876, 0.351119,
       0.284804, 0.231013, 0.187382, 0.151991, 0.123285, 0.100000,
       0.081113, 0.065793, 0.053367, 0.043288, 0.035112, 0.028480,
       0.023101, 0.018738, 0.015199, 0.012328, 0.010000, 0.008111,
       0.006579, 0.005337, 0.004329, 0.003511, 0.002848, 0.002310,
       0.0...
       0.000004, 0.000003, 0.000002, 0.000002, 0.000002, 0.000001,
       0.000001, 0.000001, 0.000001, 0.000001, 0.000000, 0.000000,
       0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000,
       0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000,
       0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000,
       0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000,
       0.000000, 0.000000, 0.000000, 0.000000])})

In [50]:
clf.best_params_

{'var_smoothing': 0.001}

In [51]:
model = GaussianNB(**clf.best_params_)
model.fit(X_train, y_train)
result = model.score(X_test, y_test)*100
    
print("{:10.2f}".format(result))

     83.88


In [53]:
# Linear Discriminant Analysis

model = LinearDiscriminantAnalysis()

model.fit(X_train, y_train)

model.score(X_test, y_test)

0.9084967320261438

In [54]:
# Optimizing the Linear Discriminant Analysis classifier

model = LinearDiscriminantAnalysis()

parameters = {
    'solver' : ['svd', 'lsqr', 'eigen']
}

clf  = BayesSearchCV(model, parameters, cv=10, return_train_score=False)

clf.fit(X,y)





BayesSearchCV(cv=10, estimator=LinearDiscriminantAnalysis(),
              search_spaces={'solver': ['svd', 'lsqr', 'eigen']})

In [55]:
clf.best_params_

OrderedDict([('solver', 'svd')])

In [56]:
model = LinearDiscriminantAnalysis(**clf.best_params_)
model.fit(X_train, y_train)
result = model.score(X_test, y_test)*100
    
print("{:10.2f}".format(result))

     90.85


In [35]:
#pickle.dump(model, open("lda_model.dat", "wb"))

In [57]:
# Quadratic Discriminant Analysis

model = QuadraticDiscriminantAnalysis()

model.fit(X_train, y_train)

model.score(X_test, y_test)

0.8736383442265795

In [58]:
# Optimizing the Quadratic Discriminant Analysis classifier

model = QuadraticDiscriminantAnalysis()

parameters = {
    'reg_param' : [0., 0.1, 0.2, 0.3, 0.4]
}

clf  = GridSearchCV(model, parameters, cv=10, return_train_score=False)

clf.fit(X,y)

GridSearchCV(cv=10, estimator=QuadraticDiscriminantAnalysis(),
             param_grid={'reg_param': [0.0, 0.1, 0.2, 0.3, 0.4]})

In [59]:
clf.best_params_

{'reg_param': 0.1}

In [60]:
model = QuadraticDiscriminantAnalysis(**clf.best_params_)
model.fit(X_train, y_train)
result = model.score(X_test, y_test)*100
    
print("{:10.2f}".format(result))

     88.45


In [61]:
# Decision Tree Classifier

model = DecisionTreeClassifier()

model.fit(X_train, y_train)

model.score(X_test, y_test)

0.8082788671023965

In [62]:
# Optimizing the Decision Tree Classifier

model = DecisionTreeClassifier()

parameters = {
    'criterion':['entropy','gini'],
    'max_depth':[1,2,3,4,5,6,7,15,20,30,40,120,150]
}

clf  = GridSearchCV(model, parameters, cv=10, return_train_score=False)

clf.fit(X,y)

GridSearchCV(cv=10, estimator=DecisionTreeClassifier(),
             param_grid={'criterion': ['entropy', 'gini'],
                         'max_depth': [1, 2, 3, 4, 5, 6, 7, 15, 20, 30, 40, 120,
                                       150]})

In [63]:
clf.best_params_

{'criterion': 'gini', 'max_depth': 5}

In [64]:
model = DecisionTreeClassifier(**clf.best_params_)
model.fit(X_train, y_train)
result = model.score(X_test, y_test)*100
    
print("{:10.2f}".format(result))

     85.40


In [65]:
# Random Forest Classifier

model = RandomForestClassifier()

model.fit(X_train, y_train)

model.score(X_test, y_test)

0.8976034858387799

In [66]:
# Optimizing the Random Forest Classifier

model = RandomForestClassifier()

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {
    'n_estimators': n_estimators,
    'max_features': max_features,
    'max_depth': max_depth,
    'min_samples_split': min_samples_split,
    'min_samples_leaf': min_samples_leaf,
    'bootstrap': bootstrap
}

clf  = RandomizedSearchCV(model, param_distributions=random_grid, n_iter = 20, cv=10, return_train_score=False)

clf.fit(X,y)

RandomizedSearchCV(cv=10, estimator=RandomForestClassifier(), n_iter=20,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [10, 20, 30, 40, 50, 60,
                                                      70, 80, 90, 100, 110,
                                                      None],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [1, 2, 4],
                                        'min_samples_split': [2, 5, 10],
                                        'n_estimators': [200, 400, 600, 800,
                                                         1000, 1200, 1400, 1600,
                                                         1800, 2000]})

In [67]:
clf.best_params_

{'n_estimators': 1800,
 'min_samples_split': 5,
 'min_samples_leaf': 1,
 'max_features': 'sqrt',
 'max_depth': 50,
 'bootstrap': False}

In [68]:
model = RandomForestClassifier(**clf.best_params_)
model.fit(X_train, y_train)
result = model.score(X_test, y_test)*100
    
print(result)

88.23529411764706


In [69]:
# XGBoost Classifier

model = XGBClassifier()

model.fit(X_train, y_train)

model.score(X_test, y_test)

0.8976034858387799

In [70]:
# Optimizing the XGBoost Classifier

model = XGBClassifier()

parameters = {
    'min_child_weight': [1, 2, 3, 4, 5, 6, 25],
    'gamma': [0.1, 0.5, 1, 1.5, 2, 5],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'max_depth': [3, 4, 5, 5, 6, 7, 8]
}

clf  = RandomizedSearchCV(model, parameters, cv=10, return_train_score=False)

clf.fit(X,y)

RandomizedSearchCV(cv=10, estimator=XGBClassifier(),
                   param_distributions={'colsample_bytree': [0.6, 0.8, 1.0],
                                        'gamma': [0.1, 0.5, 1, 1.5, 2, 5],
                                        'max_depth': [3, 4, 5, 5, 6, 7, 8],
                                        'min_child_weight': [1, 2, 3, 4, 5, 6,
                                                             25],
                                        'subsample': [0.6, 0.8, 1.0]})

In [71]:
clf.best_params_

{'subsample': 0.8,
 'min_child_weight': 1,
 'max_depth': 5,
 'gamma': 1,
 'colsample_bytree': 0.8}

In [72]:
model = XGBClassifier(**clf.best_params_)
model.fit(X_train, y_train)

print(model.score(X_test, y_test))

0.8954248366013072


In [73]:
# pickle.dump(model, open("pima.pickle.dat", "wb"))

# iter_imputed_data_goldman.columns

In [74]:
# Gaussian Process Classifier

model = GaussianProcessClassifier()

model.fit(X_train, y_train)

model.score(X_test, y_test)

0.7690631808278867

In [75]:
# Optimizing the GaussianProcessClassifier

model = GaussianProcessClassifier()

parameters = {
    'kernel' : [1*RBF(), 1*DotProduct(), 1*Matern(),  1*RationalQuadratic(), 1*WhiteKernel()]
}

clf  = RandomizedSearchCV(model, parameters, cv=10, return_train_score=False)

clf.fit(X,y)

Traceback (most recent call last):
  File "/Users/cconsta1/opt/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/cconsta1/opt/anaconda3/lib/python3.9/site-packages/sklearn/gaussian_process/_gpc.py", line 664, in fit
    self.base_estimator_.fit(X, y)
  File "/Users/cconsta1/opt/anaconda3/lib/python3.9/site-packages/sklearn/gaussian_process/_gpc.py", line 212, in fit
    optima = [self._constrained_optimization(obj_func,
  File "/Users/cconsta1/opt/anaconda3/lib/python3.9/site-packages/sklearn/gaussian_process/_gpc.py", line 445, in _constrained_optimization
    opt_res = scipy.optimize.minimize(
  File "/Users/cconsta1/opt/anaconda3/lib/python3.9/site-packages/scipy/optimize/_minimize.py", line 623, in minimize
    return _minimize_lbfgsb(fun, x0, args, jac, bounds,
  File "/Users/cconsta1/opt/anaconda3/lib/python3.9/site-packages/scipy/optimize/lbfgsb.py", line 360

Traceback (most recent call last):
  File "/Users/cconsta1/opt/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/cconsta1/opt/anaconda3/lib/python3.9/site-packages/sklearn/gaussian_process/_gpc.py", line 664, in fit
    self.base_estimator_.fit(X, y)
  File "/Users/cconsta1/opt/anaconda3/lib/python3.9/site-packages/sklearn/gaussian_process/_gpc.py", line 212, in fit
    optima = [self._constrained_optimization(obj_func,
  File "/Users/cconsta1/opt/anaconda3/lib/python3.9/site-packages/sklearn/gaussian_process/_gpc.py", line 445, in _constrained_optimization
    opt_res = scipy.optimize.minimize(
  File "/Users/cconsta1/opt/anaconda3/lib/python3.9/site-packages/scipy/optimize/_minimize.py", line 623, in minimize
    return _minimize_lbfgsb(fun, x0, args, jac, bounds,
  File "/Users/cconsta1/opt/anaconda3/lib/python3.9/site-packages/scipy/optimize/lbfgsb.py", line 360

Traceback (most recent call last):
  File "/Users/cconsta1/opt/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/cconsta1/opt/anaconda3/lib/python3.9/site-packages/sklearn/gaussian_process/_gpc.py", line 664, in fit
    self.base_estimator_.fit(X, y)
  File "/Users/cconsta1/opt/anaconda3/lib/python3.9/site-packages/sklearn/gaussian_process/_gpc.py", line 212, in fit
    optima = [self._constrained_optimization(obj_func,
  File "/Users/cconsta1/opt/anaconda3/lib/python3.9/site-packages/sklearn/gaussian_process/_gpc.py", line 445, in _constrained_optimization
    opt_res = scipy.optimize.minimize(
  File "/Users/cconsta1/opt/anaconda3/lib/python3.9/site-packages/scipy/optimize/_minimize.py", line 623, in minimize
    return _minimize_lbfgsb(fun, x0, args, jac, bounds,
  File "/Users/cconsta1/opt/anaconda3/lib/python3.9/site-packages/scipy/optimize/lbfgsb.py", line 360

Traceback (most recent call last):
  File "/Users/cconsta1/opt/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/cconsta1/opt/anaconda3/lib/python3.9/site-packages/sklearn/gaussian_process/_gpc.py", line 664, in fit
    self.base_estimator_.fit(X, y)
  File "/Users/cconsta1/opt/anaconda3/lib/python3.9/site-packages/sklearn/gaussian_process/_gpc.py", line 212, in fit
    optima = [self._constrained_optimization(obj_func,
  File "/Users/cconsta1/opt/anaconda3/lib/python3.9/site-packages/sklearn/gaussian_process/_gpc.py", line 445, in _constrained_optimization
    opt_res = scipy.optimize.minimize(
  File "/Users/cconsta1/opt/anaconda3/lib/python3.9/site-packages/scipy/optimize/_minimize.py", line 623, in minimize
    return _minimize_lbfgsb(fun, x0, args, jac, bounds,
  File "/Users/cconsta1/opt/anaconda3/lib/python3.9/site-packages/scipy/optimize/lbfgsb.py", line 360

ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  _check_optimize_result("lbfgs", opt_res)




RandomizedSearchCV(cv=10, estimator=GaussianProcessClassifier(),
                   param_distributions={'kernel': [1**2 * RBF(length_scale=1),
                                                   1**2 * DotProduct(sigma_0=1),
                                                   1**2 * Matern(length_scale=1, nu=1.5),
                                                   1**2 * RationalQuadratic(alpha=1, length_scale=1),
                                                   1**2 * WhiteKernel(noise_level=1)]})

In [76]:
clf.best_params_

{'kernel': 1**2 * RationalQuadratic(alpha=1, length_scale=1)}

In [77]:
model = GaussianProcessClassifier(**clf.best_params_, max_iter_predict = 1000)

model.fit(X_train, y_train)

print(model.score(X_test, y_test))

0.7647058823529411


In [78]:
# Gradient Boosting Classifier

model = GradientBoostingClassifier()

model.fit(X_train, y_train)

model.score(X_test, y_test)

0.8845315904139434

In [79]:
# Optimizing the Gradient Boosting Classifier

model = GradientBoostingClassifier()

parameters = {
    "n_estimators":[5,50,250,500],
    "max_depth":[1,3,5,7,9],
    "learning_rate":[0.01,0.1,1,10,100]
}

clf  = BayesSearchCV(model, parameters, cv=10, return_train_score=False)

clf.fit(X,y)



BayesSearchCV(cv=10, estimator=GradientBoostingClassifier(),
              search_spaces={'learning_rate': [0.01, 0.1, 1, 10, 100],
                             'max_depth': [1, 3, 5, 7, 9],
                             'n_estimators': [5, 50, 250, 500]})

In [80]:
clf.best_params_

OrderedDict([('learning_rate', 0.01), ('max_depth', 5), ('n_estimators', 500)])

In [81]:
model = GradientBoostingClassifier(**clf.best_params_)
model.fit(X_train, y_train)

print(model.score(X_test, y_test))

0.8867102396514162


In [82]:
# Ada Boost Classifier

model = AdaBoostClassifier()

model.fit(X_train, y_train)

model.score(X_test, y_test)

0.8801742919389978

In [83]:
# Optimizing the Gradient Boosting Classifier

model = AdaBoostClassifier()

parameters = {
    "n_estimators":[5,50,250,500],
    "learning_rate":[0.01,0.1,1,10,100]
}

clf  = RandomizedSearchCV(model, parameters, cv=10, return_train_score=False)

clf.fit(X,y)

RandomizedSearchCV(cv=10, estimator=AdaBoostClassifier(),
                   param_distributions={'learning_rate': [0.01, 0.1, 1, 10,
                                                          100],
                                        'n_estimators': [5, 50, 250, 500]})

In [84]:
clf.best_params_

{'n_estimators': 250, 'learning_rate': 0.1}

In [85]:
model = AdaBoostClassifier(**clf.best_params_)
model.fit(X_train, y_train)

print(model.score(X_test, y_test))

0.8932461873638344


In [86]:
#pickle.dump(model, open("ada_boost_model.dat", "wb"))


In [87]:
# Extra trees classifier

model = ExtraTreesClassifier()

model.fit(X_train, y_train)

model.score(X_test, y_test)

0.8932461873638344

In [88]:
# Optimizing the SGDClassifier

model = ExtraTreesClassifier()

parameters = {
    'n_estimators': list(range(50,126,25)),
    'min_samples_leaf': list(range(1,20,1)),
    'min_samples_split': list(range(1,20,1))
}

clf  = RandomizedSearchCV(model, parameters, cv=10, return_train_score=False)

clf.fit(X,y)

Traceback (most recent call last):
  File "/Users/cconsta1/opt/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/cconsta1/opt/anaconda3/lib/python3.9/site-packages/sklearn/ensemble/_forest.py", line 387, in fit
    trees = Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
  File "/Users/cconsta1/opt/anaconda3/lib/python3.9/site-packages/joblib/parallel.py", line 1043, in __call__
    if self.dispatch_one_batch(iterator):
  File "/Users/cconsta1/opt/anaconda3/lib/python3.9/site-packages/joblib/parallel.py", line 861, in dispatch_one_batch
    self._dispatch(tasks)
  File "/Users/cconsta1/opt/anaconda3/lib/python3.9/site-packages/joblib/parallel.py", line 779, in _dispatch
    job = self._backend.apply_async(batch, callback=cb)
  File "/Users/cconsta1/opt/anaconda3/lib/python3.9/site-packages/joblib/_parallel_backends.py", line 208, in apply_async
    result = Immedi

Traceback (most recent call last):
  File "/Users/cconsta1/opt/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/cconsta1/opt/anaconda3/lib/python3.9/site-packages/sklearn/ensemble/_forest.py", line 387, in fit
    trees = Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
  File "/Users/cconsta1/opt/anaconda3/lib/python3.9/site-packages/joblib/parallel.py", line 1043, in __call__
    if self.dispatch_one_batch(iterator):
  File "/Users/cconsta1/opt/anaconda3/lib/python3.9/site-packages/joblib/parallel.py", line 861, in dispatch_one_batch
    self._dispatch(tasks)
  File "/Users/cconsta1/opt/anaconda3/lib/python3.9/site-packages/joblib/parallel.py", line 779, in _dispatch
    job = self._backend.apply_async(batch, callback=cb)
  File "/Users/cconsta1/opt/anaconda3/lib/python3.9/site-packages/joblib/_parallel_backends.py", line 208, in apply_async
    result = Immedi

RandomizedSearchCV(cv=10, estimator=ExtraTreesClassifier(),
                   param_distributions={'min_samples_leaf': [1, 2, 3, 4, 5, 6,
                                                             7, 8, 9, 10, 11,
                                                             12, 13, 14, 15, 16,
                                                             17, 18, 19],
                                        'min_samples_split': [1, 2, 3, 4, 5, 6,
                                                              7, 8, 9, 10, 11,
                                                              12, 13, 14, 15,
                                                              16, 17, 18, 19],
                                        'n_estimators': [50, 75, 100, 125]})

In [89]:
clf.best_params_

{'n_estimators': 125, 'min_samples_split': 12, 'min_samples_leaf': 2}

In [90]:
model = ExtraTreesClassifier(**clf.best_params_)
model.fit(X_train, y_train)

print(model.score(X_test, y_test))

0.8823529411764706


In [91]:
# Light boosting regressor

model = lgb.LGBMClassifier()

model.fit(X_train, y_train)

model.score(X_test, y_test)

0.8649237472766884

In [92]:
# Optimizing the SGDClassifier

model = lgb.LGBMClassifier()

parameters = {
    'num_leaves': [5, 10, 20, 31, 50, 100], 
    'min_child_samples': [20, 30, 50 , 100], 
    'min_child_weight': [1e-5, 1e-3, 1e-2, 1e-1, 1],
    'reg_alpha': [0, 1e-1, 1],
    'reg_lambda': [0, 1e-1, 1, 5, 10]
}

clf  = RandomizedSearchCV(model, parameters, cv=10, return_train_score=False)

clf.fit(X,y)

RandomizedSearchCV(cv=10, estimator=LGBMClassifier(),
                   param_distributions={'min_child_samples': [20, 30, 50, 100],
                                        'min_child_weight': [1e-05, 0.001, 0.01,
                                                             0.1, 1],
                                        'num_leaves': [5, 10, 20, 31, 50, 100],
                                        'reg_alpha': [0, 0.1, 1],
                                        'reg_lambda': [0, 0.1, 1, 5, 10]})

In [93]:
clf.best_params_

{'reg_lambda': 1,
 'reg_alpha': 0.1,
 'num_leaves': 5,
 'min_child_weight': 1e-05,
 'min_child_samples': 30}

In [94]:
model = lgb.LGBMClassifier(**clf.best_params_)
model.fit(X_train, y_train)

print(model.score(X_test, y_test))

0.906318082788671


In [96]:
#pickle.dump(model, open("lgb_model.dat", "wb"))
