# Import necessary libraries 

In [None]:
# system

import os

# data analysis and plotting

import pandas as pd
import numpy as np
from scipy.stats import zscore
from scipy.stats import shapiro

from random import randint

import matplotlib.pyplot as plt 
import seaborn as sns
from xgboost import plot_importance

from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA, KernelPCA

# data processing and model validation

from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, explained_variance_score, confusion_matrix, accuracy_score, classification_report, log_loss
from math import sqrt
from sklearn.model_selection import cross_val_score
from sklearn.utils import shuffle
from sklearn.model_selection import RepeatedStratifiedKFold

# classification libraries

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.gaussian_process.kernels import RBF, DotProduct, WhiteKernel, Matern, RationalQuadratic
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn import svm
from xgboost import XGBClassifier
import lightgbm as lgb

# Importing imputation libs. 

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.impute import KNNImputer

# Hyperparameter optimization

from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from skopt import BayesSearchCV

# exporting the models
import pickle

# parameter settings

%matplotlib inline

# To change scientific numbers to float
np.set_printoptions(formatter={'float_kind':'{:f}'.format})

# Increase the size of sns plots
sns.set(rc={'figure.figsize':(12,10)})

# import sys
# !conda list Check the packages installed

# Displaying all the rows/columns in a data set (the default option is not to show them)

pd.set_option('display.max_columns', None)
#pd.set_option('display.max_rows', None)

# Import and trim data

In [None]:
# Importing the raw data

raw_data_howell = pd.read_csv("datasets/Howell.csv", header = 0, encoding= 'unicode_escape')

In [None]:
raw_data_howell_test = pd.read_csv("datasets/HowellTest.csv", header = 0, encoding= 'unicode_escape')

In [None]:
raw_data_howell.head()

In [None]:
# Keep only the block of columns running from GOL to TBA (inclusive)
measured_data_howell = raw_data_howell.loc[:, "GOL":"TBA"]

# Columns used as model features
model_cols_howell = [
    'GOL', 'NOL', 'BNL', 'BBH', 'XCB', 'XFB', 'ZYB', 'AUB', 'WCB', 'ASB',
    'BPL', 'NPH', 'NLH', 'JUB', 'NLB', 'MAB', 'MDH', 'MDB', 'OBH', 'OBB',
    'DKB', 'ZMB', 'FMB', 'EKB', 'IML', 'XML', 'WMH', 'STB', 'FRC', 'PAC',
    'OCC', 'FOL',
]

# Drop every column of that block that is not a model feature; the
# surviving columns keep the order they have in the data
model_data_howell = measured_data_howell.drop(
    columns=measured_data_howell.columns.difference(model_cols_howell)
)

model_data_howell.shape

In [None]:
# Same column selection for the test file: take the GOL..TBA block, then
# drop everything that is not listed in model_cols_howell
measured_data_howell_test = raw_data_howell_test.loc[:, "GOL":"TBA"]

model_data_howell_test = measured_data_howell_test.drop(
    columns=measured_data_howell_test.columns.difference(model_cols_howell)
)

model_data_howell_test.shape

In [None]:
# Add the Sex column

model_data_howell_test = pd.concat([model_data_howell_test.loc[:,:],raw_data_howell_test.loc[:,"Sex"]],axis=1)

In [None]:
model_data_howell_test

In [None]:
# Add the Sex column

model_data_howell = pd.concat([model_data_howell.loc[:,:],raw_data_howell.loc[:,"Sex"]],axis=1)

model_data_howell

# Building the dataset using the chosen features

In [None]:
# Full data

# Convert M and F to 0 and 1.
# Series.map turns every label that is missing from the mapping into NaN, so
# the mapped values are checked before they replace the column. Without the
# check the integer cast fails with an opaque NaN-conversion error and the
# column is left overwritten with NaN, which makes the cell impossible to
# re-run.

sex_codes = model_data_howell['Sex'].map({'M': 0, 'F': 1})

if sex_codes.isna().any():
    unexpected_labels = model_data_howell.loc[sex_codes.isna(), 'Sex'].unique().tolist()
    raise ValueError(
        "Unexpected values in 'Sex' column (expected 'M' or 'F'): {}".format(unexpected_labels)
    )

model_data_howell['Sex'] = sex_codes.astype(int)

model_data_howell.describe()


In [None]:
# Test data

# Convert M and F to 0 and 1.
# Series.map turns every label that is missing from the mapping into NaN, so
# the mapped values are checked before they replace the column. Without the
# check the integer cast fails with an opaque NaN-conversion error and the
# column is left overwritten with NaN.

sex_codes_test = model_data_howell_test['Sex'].map({'M': 0, 'F': 1})

if sex_codes_test.isna().any():
    unexpected_labels_test = model_data_howell_test.loc[sex_codes_test.isna(), 'Sex'].unique().tolist()
    raise ValueError(
        "Unexpected values in test 'Sex' column (expected 'M' or 'F'): {}".format(unexpected_labels_test)
    )

model_data_howell_test['Sex'] = sex_codes_test.astype(int)

model_data_howell_test.describe()


# Merge the dataset with the test set and sample

In [None]:
full_set = pd.concat([model_data_howell, model_data_howell_test])

full_set = full_set.sample(frac=1).reset_index(drop=True)

full_set.describe()

# Training and test sets

In [None]:
X = full_set.drop('Sex', axis = 1).values
y = full_set['Sex']
    
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.7, test_size=0.3, stratify=y)

# Classification without hyperparameter optimization

In [None]:
# Display name -> unfitted estimator, all with default hyperparameters
# (the Gaussian process is the only one given an explicit kernel).
# Insertion order is the order in which the models are fitted and reported.
baseline_models = {
    "Logistic Regression": LogisticRegression(),
    "Decision Tree Classifier": DecisionTreeClassifier(),
    "Support Vector Machines": SVC(),
    "Gaussian Process Classifier": GaussianProcessClassifier(1.0 * RBF(1.0)),
    "Gradient Boosting Classifier": GradientBoostingClassifier(),
    "Random Forest Classifier": RandomForestClassifier(),
    "Ada Boost Classifier": AdaBoostClassifier(),
    "Extra Trees Classifier": ExtraTreesClassifier(),
    "Gaussian Naive Bayes": GaussianNB(),
    "KNNeighbors Classifier": KNeighborsClassifier(),
    "Linear Discriminant Analysis": LinearDiscriminantAnalysis(),
    "Quadratic Discriminant Analysis": QuadraticDiscriminantAnalysis(),
    "XGBClassifier": XGBClassifier(),
    "Light Gradient Boosting Classifier": lgb.LGBMClassifier(),
}

# Parallel lists consumed by the cells below
classifier_names = list(baseline_models)
classifiers = list(baseline_models.values())


In [None]:
dataset_scores_list = []

for name, clf in zip(classifier_names, classifiers):
        
    clf.fit(X_train, y_train)
    score = clf.score(X_test, y_test)*100

    dataset_scores_list.append(score)
    

In [None]:
dataset_scores_list

In [None]:
results = pd.DataFrame(index=classifier_names)
results['Howells'] = dataset_scores_list

results

In [None]:
# Exporting the table to LaTeX format

# print(results.to_latex(float_format="%.2f"))

# Hyperparameter optimization

In [None]:
# Logistic regression model

model = LogisticRegression()

model.fit(X_train, y_train)

model.score(X_test, y_test)

In [None]:
# Optimizing the logistic regression model

model = LogisticRegression()

parameters = {
    'C': np.logspace(-2,2,5),
    'max_iter': [2500],
    'random_state': [0]
}

clf  = BayesSearchCV(model, parameters, cv=10, return_train_score=False)

clf.fit(X, y)

In [None]:
clf.best_params_

In [None]:
model = LogisticRegression(**clf.best_params_)

model.fit(X_train, y_train)

model.score(X_test, y_test)

In [None]:
# If you want to export a model use the following command

# pickle.dump(model, open("logreg_model_howell.dat", "wb"))

In [None]:
# Support vector machines

model = SVC()

model.fit(X_train, y_train)

model.score(X_test, y_test)


In [None]:
# Optimizing the Support Vector Machine model

model = SVC()

parameters = {
    'C': np.logspace(-2,2,5)
     #'kernel': ['rbf','linear']
}

clf  = GridSearchCV(model, parameters, cv=10, return_train_score=False)

clf.fit(X, y)

In [None]:
clf.best_params_

In [None]:
model = SVC(**clf.best_params_, probability=True)

model.fit(X_train, y_train)

model.score(X_test, y_test)

In [None]:
# pickle.dump(model, open("svm_model_howell.dat", "wb"))

In [None]:
# kNN classifier

model = KNeighborsClassifier()

model.fit(X_train, y_train)

model.score(X_test, y_test)

In [None]:
# Optimizing the kNN classifier

model = KNeighborsClassifier()

parameters = {
    'n_neighbors': list(range(1,21)),
#      'weights' : ['uniform', 'distance'],
#       'metric' : ['euclidean', 'manhattan'],
    'leaf_size': list(range(1,20))
             }

clf  = GridSearchCV(model, parameters, cv=10, return_train_score=False)

clf.fit(X, y)

In [None]:
clf.best_params_

In [None]:
model = KNeighborsClassifier(**clf.best_params_)

model.fit(X_train, y_train)

model.score(X_test, y_test)

In [None]:
# Gaussian Naive Bayes

model = GaussianNB()

model.fit(X_train, y_train)

model.score(X_test, y_test)

In [None]:
# Optimizing the Gaussian Naive Bayes classifier

model = GaussianNB()

parameters = {
    'var_smoothing': np.logspace(0,-9, num=100)
             }

clf  = GridSearchCV(model, parameters, cv=10, return_train_score=False)

clf.fit(X, y)

In [None]:
clf.best_params_

In [None]:
model = GaussianNB(**clf.best_params_)

model.fit(X_train, y_train)

model.score(X_test, y_test)

In [None]:
# Linear Discriminant Analysis

model = LinearDiscriminantAnalysis()

model.fit(X_train, y_train)

model.score(X_test, y_test)

In [None]:
# Optimizing the Linear Discriminant Analysis classifier

model = LinearDiscriminantAnalysis()

parameters = {
    'solver' : ['svd', 'lsqr', 'eigen']
             }

clf  = GridSearchCV(model, parameters, cv=10, return_train_score=False)

clf.fit(X, y)

In [None]:
clf.best_params_

In [None]:
model = LinearDiscriminantAnalysis(**clf.best_params_)

model.fit(X_train, y_train)

model.score(X_test, y_test)

In [None]:
# pickle.dump(model, open("lda_model_howell.dat", "wb"))


In [None]:
# Quadratic Discriminant Analysis

model = QuadraticDiscriminantAnalysis()

model.fit(X_train, y_train)

model.score(X_test, y_test)

In [None]:
# Optimizing the Quadratic Discriminant Analysis classifier

model = QuadraticDiscriminantAnalysis()

parameters = {
    'reg_param' : [0., 0.1, 0.2, 0.3, 0.4]
             }

clf  = GridSearchCV(model, parameters, cv=10, return_train_score=False)

clf.fit(X_train,y_train)

In [None]:
clf.best_params_

In [None]:
# Quadratic Discriminant Analysis

model = QuadraticDiscriminantAnalysis(**clf.best_params_)

model.fit(X_train, y_train)

model.score(X_test, y_test)

In [None]:
# Decision Tree Classifier

model = DecisionTreeClassifier()

model.fit(X_train, y_train)

model.score(X_test, y_test)

In [None]:
# Optimizing the Decision Tree Classifier

model = DecisionTreeClassifier()

parameters = {
    'criterion':['entropy','gini'],
    'max_depth':[1,2,3,4,5,6,7,15,20,30,40,120,150]
}

clf  = GridSearchCV(model, parameters, cv=10, return_train_score=False)

clf.fit(X, y)

In [None]:
clf.best_params_

In [None]:
model = DecisionTreeClassifier(**clf.best_params_)

model.fit(X_train, y_train)

model.score(X_test, y_test)

In [None]:
# Random Forest Classifier

model = RandomForestClassifier()

model.fit(X_train, y_train)

model.score(X_test, y_test)

In [None]:
# Optimizing the Random Forest Classifier

model = RandomForestClassifier()

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]

# Number of features to consider at every split
max_features = ['auto', 'sqrt']

# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)

# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]

# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]

# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the random grid
random_grid = {
    'n_estimators': n_estimators,
    'max_features': max_features,
    'max_depth': max_depth,
    'min_samples_split': min_samples_split,
    'min_samples_leaf': min_samples_leaf,
    'bootstrap': bootstrap
}

clf  = RandomizedSearchCV(model, param_distributions=random_grid, n_iter = 20, cv=10, return_train_score=False)

clf.fit(X, y)

In [None]:
clf.best_params_

In [None]:
model = RandomForestClassifier(**clf.best_params_)

model.fit(X_train, y_train)

print(model.score(X_test, y_test))

In [None]:
# XGBoost Classifier

model = XGBClassifier()

model.fit(X_train, y_train)

model.score(X_test, y_test)

In [None]:
# Optimizing the XGBoost Classifier

model = XGBClassifier()

parameters = {
        'min_child_weight': [1, 5, 10],
        'gamma': [0.5, 1, 1.5, 2, 5],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'max_depth': [3, 4, 5]
}

clf  = BayesSearchCV(model, parameters, cv=10, return_train_score=False)

clf.fit(X, y)

In [None]:
clf.best_params_

In [None]:
model = XGBClassifier(**clf.best_params_)

model.fit(X_train, y_train)

print(model.score(X_test, y_test))


In [None]:
# Gaussian Process Classifier

model = GaussianProcessClassifier()

model.fit(X_train, y_train)

model.score(X_test, y_test)

In [None]:
# Optimizing the GaussianProcessClassifier

model = GaussianProcessClassifier()

parameters = {
    'kernel' : [1*RBF(), 1*DotProduct(), 1*Matern(),  1*RationalQuadratic(), 1*WhiteKernel()]
}

clf  = GridSearchCV(model, parameters, cv=10, return_train_score=False)

clf.fit(X, y)

In [None]:
clf.best_params_

In [None]:
model = GaussianProcessClassifier(**clf.best_params_, max_iter_predict = 1000)

model.fit(X_train, y_train)

print(model.score(X_test, y_test))

In [None]:
# Gradient Boosting Classifier

model = GradientBoostingClassifier()

model.fit(X_train, y_train)

model.score(X_test, y_test)

In [None]:
# Optimizing the Gradient Boosting Classifier

model = GradientBoostingClassifier()

parameters = {
    "n_estimators":[5,50,250,500],
    "max_depth":[1,3,5,7,9],
    "learning_rate":[0.01,0.1,1,10,100]
}

clf  = BayesSearchCV(model, parameters, cv=10, return_train_score=False)

clf.fit(X_train,y_train)

In [None]:
clf.best_params_

In [None]:
model = GradientBoostingClassifier(**clf.best_params_)

model.fit(X_train, y_train)

print(model.score(X_test, y_test))


In [None]:
# Ada Boost Classifier

model = AdaBoostClassifier()

model.fit(X_train, y_train)

model.score(X_test, y_test)

In [None]:
# Optimizing the Gradient Boosting Classifier

model = AdaBoostClassifier()

parameters = {
    "n_estimators":[5,50,250,500],
    "learning_rate":[0.01,0.1,1,10,100]
}

clf  = GridSearchCV(model, parameters, cv=10, return_train_score=False)

clf.fit(X, y)

In [None]:
clf.best_params_

In [None]:
model = AdaBoostClassifier(**clf.best_params_)

model.fit(X_train, y_train)

print(model.score(X_test, y_test))

In [None]:
# Extra trees regressor

model = ExtraTreesClassifier()

model.fit(X_train, y_train)

model.score(X_test, y_test)

In [None]:
# Optimizing the ExtraTreesClassifier

model = ExtraTreesClassifier()

parameters = {
        'n_estimators': list(range(50,126,25)),
        'min_samples_leaf': list(range(1,20,1)),
        'min_samples_split': list(range(1,20,1))
    }

clf  = RandomizedSearchCV(model, parameters, cv=10, return_train_score=False)

clf.fit(X, y)

In [None]:
clf.best_params_

In [None]:
model = ExtraTreesClassifier(**clf.best_params_)

model.fit(X_train, y_train)

print(model.score(X_test, y_test))

In [None]:
# Light boosting regressor

model = lgb.LGBMClassifier()

model.fit(X_train, y_train)

model.score(X_test, y_test)

In [None]:
# Optimizing the LGBMClassifier

model = lgb.LGBMClassifier()

parameters = {
    'num_leaves': [5, 10, 20, 31, 50, 100], 
    'min_child_samples': [20, 30, 50 , 100], 
     'min_child_weight': [1e-5, 1e-3, 1e-2, 1e-1, 1],
     'reg_alpha': [0, 1e-1, 1],
    'reg_lambda': [0, 1e-1, 1, 5, 10]
    }

clf  = RandomizedSearchCV(model, parameters, cv=10, return_train_score=False)

clf.fit(X, y)

In [None]:
clf.best_params_

In [None]:
model = lgb.LGBMClassifier(**clf.best_params_)

model.fit(X_train, y_train)

print(model.score(X_test, y_test))