In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# CSV
import csv


from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import StratifiedKFold, GridSearchCV, cross_validate
from sklearn.model_selection import train_test_split 

import sklearn
import math
from sklearn.metrics import confusion_matrix
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, make_scorer

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer 

from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import LinearSVC

import lightgbm as lgb
import optuna
import xgboost as xgb
from imblearn.pipeline import Pipeline

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
import tensorflow as tf
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier

# Import the SMOTE-NC
from imblearn.over_sampling import SMOTENC

In [2]:
customer_df = pd.read_csv('telco_churn_cleaned.csv')

In [3]:
customer_df

Unnamed: 0,SeniorCitizen,Partner,Dependents,tenure,MultipleLines,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,...,internet_service_dsl,internet_service_fiber_optic,internet_service_no,contract_month_to_month,contract_one_year,contract_two_year,payment_method_bank_transfer,payment_method_credit_card,payment_method_electronic_check,payment_method_mailed_check
0,0,1,0,0.013889,0,0,1,0,0,0,...,1,0,0,1,0,0,0,0,1,0
1,0,0,0,0.472222,0,1,0,1,0,0,...,1,0,0,0,1,0,0,0,0,1
2,0,0,0,0.027778,0,1,1,0,0,0,...,1,0,0,1,0,0,0,0,0,1
3,0,0,0,0.625000,0,1,0,1,1,0,...,1,0,0,0,1,0,1,0,0,0
4,0,0,0,0.027778,0,0,0,0,0,0,...,0,1,0,1,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,0,1,1,0.333333,1,1,0,1,1,1,...,1,0,0,0,1,0,0,0,0,1
7039,0,1,1,1.000000,1,0,1,1,0,1,...,0,1,0,0,1,0,0,1,0,0
7040,0,1,1,0.152778,0,1,0,0,0,0,...,1,0,0,1,0,0,0,0,1,0
7041,1,1,0,0.055556,1,0,0,0,0,0,...,0,1,0,1,0,0,0,0,0,1


In [4]:
X = customer_df.drop(columns={'Churn'})
y = customer_df['Churn']

## SMOTE-NC

In [5]:
#Create the oversampler. For SMOTE-NC we need to pinpoint the column position where is the categorical features are. In this case, 'IsActiveMember' is positioned in the second column we input [1] as the parameter. If you have more than one categorical columns, just input all the columns position
smotenc = SMOTENC([0, 1, 2, 4, 5, 6, 7, 8, 9, 10, 11, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23],random_state = 424)
X, y = smotenc.fit_resample(X, y)

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 424)

## Decision Tree Hyperparameters Tuning

In [11]:
#Criterion
criterion = ["gini", "entropy"]
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(5, 30, num = 6)]
max_depth.append(None)

param_grid = {'criterion': criterion,
               'max_depth': max_depth
              }

print(param_grid)

{'criterion': ['gini', 'entropy'], 'max_depth': [5, 10, 15, 20, 25, 30, None]}


In [12]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
dt_clf = DecisionTreeClassifier()

# Random search of parameters, using 3 fold Stratified KFold cross validation, 
kfold = StratifiedKFold(shuffle=True, n_splits=3, random_state=424)

# search across 100 different combinations, and use all available cores
dt_clf_random = GridSearchCV(dt_clf, param_grid, n_jobs=-1, cv=kfold, scoring="accuracy")
# Fit the random search model
dt_clf_random.fit(X_train, y_train)

In [14]:
cv_score=cross_val_score(dt_clf, X, y, cv=kfold, scoring="accuracy")
avg_score = np.mean(cv_score)
print("Cross Validation Score:", cv_score)
print("Average Cross Validation Score:", avg_score)

Cross Validation Score: [0.77130435 0.76311974 0.78109597]
Average Cross Validation Score: 0.7718400208419999


In [18]:
dt_clf_random.best_params_

{'criterion': 'entropy', 'max_depth': 10}

## Naive Bayes Hyperparameters Tuning 

In [15]:
param_grid_nb = {
    'var_smoothing': np.logspace(0,-9, num=100)
}

In [16]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
nb_clf = GaussianNB()

# Random search of parameters, using 3 fold Stratified KFold cross validation, 
kfold = StratifiedKFold(shuffle=True, n_splits=3, random_state=424)

# search across 100 different combinations, and use all available cores
nb_clf_random = GridSearchCV(nb_clf, param_grid_nb, n_jobs=-1, cv=kfold, scoring="accuracy")
# Fit the random search model
nb_clf_random.fit(X_train, y_train)

In [17]:
cv_score=cross_val_score(nb_clf, X, y, cv=kfold, scoring="accuracy")
avg_score = np.mean(cv_score)
print("Cross Validation Score:", cv_score)
print("Average Cross Validation Score:", avg_score)

Cross Validation Score: [0.77768116 0.77297767 0.76456944]
Average Cross Validation Score: 0.7717427581753725


In [9]:
nb_clf_random.best_params_

{'var_smoothing': 0.008111308307896872}

## Random Forest Hyperparameters Tuning

In [18]:
# Number of trees in random forest
# n_estimators = [int(x) for x in np.linspace(start = 100, stop = 1000, num = 10)]
n_estimators = [100, 200 ,300, 400, 500]

# Maximum number of levels in tree

param_grid = {'n_estimators': n_estimators
              }

print(param_grid)

{'n_estimators': [100, 200, 300, 400, 500]}


In [19]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestClassifier()
# Random search of parameters, using 3 fold Stratified KFold cross validation, 
kfold = StratifiedKFold(shuffle=True, n_splits=3, random_state=424)

# search across 100 different combinations, and use all available cores
rf_random = GridSearchCV(rf, param_grid, n_jobs=-1, cv=kfold, scoring="accuracy")
# Fit the random search model
rf_random.fit(X_train, y_train)

In [20]:
cv_score=cross_val_score(rf, X, y, cv=kfold, scoring="accuracy")
avg_score = np.mean(cv_score)
print("Cross Validation Score:", cv_score)
print("Average Cross Validation Score:", avg_score)

Cross Validation Score: [0.83855072 0.84227312 0.8309655 ]
Average Cross Validation Score: 0.8372631148425015


In [22]:
rf_random.best_params_

{'n_estimators': 500}

## Logistic Regression Hyperparameters Tuning

In [21]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV

solver = ['newton-cg', 'lbfgs', 'liblinear']
penalty = ['none', 'l1', 'l2', 'elasticnet']
max_iter = [100, 200, 300, 400, 500]

param_grid = {'solver': solver,
               'penalty': penalty,
               'max_iter' : max_iter
              }

print(param_grid)

{'solver': ['newton-cg', 'lbfgs', 'liblinear'], 'penalty': ['none', 'l1', 'l2', 'elasticnet'], 'max_iter': [100, 200, 300, 400, 500]}


In [22]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
logreg = LogisticRegression(random_state=424)

kfold = StratifiedKFold(shuffle=True, n_splits=3, random_state=424)

# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
logreg_random = GridSearchCV(logreg, param_grid, n_jobs=-1, cv=kfold, scoring="accuracy")

# Fit the random search model
logreg_random.fit(X_train, y_train)

90 fits failed out of a total of 180.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
15 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Samuel Thong\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Samuel Thong\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1091, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\Users\Samuel Thong\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 78, in _check_solver
    raise ValueError("penalty='none' is not supported for the liblinear solver")
ValueError: penalty='none' is 

In [23]:
cv_score=cross_val_score(logreg, X, y, cv=kfold, scoring="accuracy")
avg_score = np.mean(cv_score)
print("Cross Validation Score:", cv_score)
print("Average Cross Validation Score:", avg_score)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Cross Validation Score: [0.78637681 0.78979414 0.78544506]
Average Cross Validation Score: 0.7872053371207505


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [26]:
logreg_random.best_params_

{'max_iter': 100, 'penalty': 'none', 'solver': 'lbfgs'}

## SVM Hyperparameters Tuning

In [24]:
C = [0.1, 0.5, 1, 10]
max_iter = [1000, 3000, 5000]

param_grid = {
               'C': C,
               'max_iter':max_iter
               }

print(param_grid)

{'C': [0.1, 0.5, 1, 10], 'max_iter': [1000, 3000, 5000]}


In [25]:
from sklearn.svm import LinearSVC

# Use the random grid to search for best hyperparameters
# First create the base model to tune
linearSVC_svm = LinearSVC()

# Random search of parameters, using 3 fold cross validation, 
kfold = StratifiedKFold(shuffle=True, n_splits=3, random_state=424)

# search across 100 different combinations, and use all available cores
linearSVC_svm_random = GridSearchCV(linearSVC_svm, param_grid, n_jobs=-1, cv=kfold, scoring="accuracy")

# Fit the random search model
linearSVC_svm_random.fit(X_train, y_train)

In [26]:
cv_score=cross_val_score(linearSVC_svm, X, y, cv=kfold, scoring="accuracy")
avg_score = np.mean(cv_score)
print("Cross Validation Score:", cv_score)
print("Average Cross Validation Score:", avg_score)

Cross Validation Score: [0.78724638 0.78979414 0.78312554]
Average Cross Validation Score: 0.786722021225784


In [31]:
linearSVC_svm_random.best_params_

{'C': 0.5, 'max_iter': 1000}

## AdaBoost Hyperparameters Tuning

In [32]:
# Number of estimators
n_estimators = [100, 200, 300, 400, 500]

param_grid = {'n_estimators': n_estimators}

print(param_grid)

{'n_estimators': [100, 200, 300, 400, 500]}


In [33]:
model = DecisionTreeClassifier(criterion='gini')  

# Use the random grid to search for best hyperparameters
# First create the base model to tune
dt = AdaBoostClassifier(base_estimator=model, learning_rate=0.1, random_state=424)
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores

kfold = StratifiedKFold(shuffle=True, n_splits=3, random_state=424)

dt_random = GridSearchCV(dt, param_grid, n_jobs=-1, cv=kfold, scoring="accuracy")
# Fit the random search model
dt_random.fit(X_train, y_train)

print(dt_random.best_params_)

{'n_estimators': 400}


In [34]:
model = GaussianNB()

# Use the random grid to search for best hyperparameters
# First create the base model to tune
nb = AdaBoostClassifier(base_estimator=model,learning_rate=0.1, random_state=424)
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores

kfold = StratifiedKFold(shuffle=True, n_splits=3, random_state=424)

nb_random = GridSearchCV(nb, param_grid, n_jobs=-1, cv=kfold, scoring="accuracy")
# Fit the random search model
nb_random.fit(X_train, y_train)

print(nb_random.best_params_)

{'n_estimators': 400}


In [35]:
model = LogisticRegression(solver='lbfgs', penalty = 'none', random_state=424)

# Use the random grid to search for best hyperparameters
# First create the base model to tune
logreg = AdaBoostClassifier(base_estimator=model,learning_rate=0.1, random_state=424)
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores

kfold = StratifiedKFold(shuffle=True, n_splits=3, random_state=424)

logreg_random = GridSearchCV(logreg, param_grid, n_jobs=-1, cv=kfold, scoring="accuracy")
# Fit the random search model
logreg_random.fit(X_train, y_train)

print(logreg_random.best_params_)

{'n_estimators': 100}


In [36]:
model = LinearSVC(C = 10, random_state=424)

# Use the random grid to search for best hyperparameters
# First create the base model to tune
linearSVC = AdaBoostClassifier(base_estimator=model,learning_rate=0.1, random_state=424, algorithm='SAMME')
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores

kfold = StratifiedKFold(shuffle=True, n_splits=3, random_state=424)

linearSVC_random = GridSearchCV(linearSVC, param_grid, n_jobs=-1, cv=kfold, scoring="accuracy")
# Fit the random search model
linearSVC_random.fit(X_train, y_train)

print(linearSVC_random.best_params_)

{'n_estimators': 100}


## Bagging

In [37]:
model = DecisionTreeClassifier(criterion='gini')  

# Use the random grid to search for best hyperparameters
# First create the base model to tune
dt = BaggingClassifier(base_estimator=model, random_state=424)
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores

kfold = StratifiedKFold(shuffle=True, n_splits=3, random_state=424)

dt_random = GridSearchCV(dt, param_grid, n_jobs=-1, cv=kfold, scoring="accuracy")
# Fit the random search model
dt_random.fit(X_train, y_train)

print(dt_random.best_params_)

{'n_estimators': 500}


In [38]:
model = GaussianNB()

# Use the random grid to search for best hyperparameters
# First create the base model to tune
nb = BaggingClassifier(base_estimator=model, random_state=424)
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores

kfold = StratifiedKFold(shuffle=True, n_splits=3, random_state=424)

nb_random = GridSearchCV(nb, param_grid, n_jobs=-1, cv=kfold, scoring="accuracy")
# Fit the random search model
nb_random.fit(X_train, y_train)

print(nb_random.best_params_)

{'n_estimators': 100}


In [39]:
model = LogisticRegression(solver='lbfgs', penalty = 'none', random_state=424)

# Use the random grid to search for best hyperparameters
# First create the base model to tune
logreg = BaggingClassifier(base_estimator=model, random_state=424)
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores

kfold = StratifiedKFold(shuffle=True, n_splits=3, random_state=424)

logreg_random = GridSearchCV(logreg, param_grid, n_jobs=-1, cv=kfold, scoring="accuracy")
# Fit the random search model
logreg_random.fit(X_train, y_train)

print(logreg_random.best_params_)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

{'n_estimators': 100}


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [40]:
model = LinearSVC(C = 10, random_state=424)

# Use the random grid to search for best hyperparameters
# First create the base model to tune
linearSVC = BaggingClassifier(base_estimator=model, random_state=424)
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores

kfold = StratifiedKFold(shuffle=True, n_splits=3, random_state=424)

linearSVC_random = GridSearchCV(linearSVC, param_grid, n_jobs=-1, cv=kfold, scoring="accuracy")
# Fit the random search model
linearSVC_random.fit(X_train, y_train)

print(linearSVC_random.best_params_)











{'n_estimators': 200}




# Light GBM


In [48]:
param_grid= {
        'reg_alpha': [0, 1, 10],
        'reg_lambda': [0, 1, 10],
        'colsample_bytree': [0.5,0.6,0.7],
        'subsample': [0.8, 1.0],
        'learning_rate': [0.01,0.015,0.02],
        'max_depth':  [10,20,50],
        'num_leaves' : [300,500],
        'min_child_samples':  [10, 15, 20]
}

lgbm_clf=lgb.LGBMClassifier(objective='binary')
kfold = StratifiedKFold(shuffle=True, n_splits=3, random_state=424)

# search across 100 different combinations, and use all available cores
lgbm_grid_cv = GridSearchCV(lgbm_clf, param_grid, n_jobs=-1, cv=kfold, scoring="accuracy")

# Fit the random search model
lgbm_grid_cv.fit(X_train, y_train)

print(lgbm_grid_cv.best_params_)

{'colsample_bytree': 0.6, 'learning_rate': 0.02, 'max_depth': 20, 'min_child_samples': 10, 'num_leaves': 300, 'reg_alpha': 0, 'reg_lambda': 0, 'subsample': 0.8}


In [27]:
lgbm_clf=lgb.LGBMClassifier(objective='binary')
kfold = StratifiedKFold(shuffle=True, n_splits=3, random_state=424)
cv_score=cross_val_score(lgbm_clf, X, y, cv=kfold, scoring="accuracy")
avg_score = np.mean(cv_score)
print("Cross Validation Score:", cv_score)
print("Average Cross Validation Score:", avg_score)

Cross Validation Score: [0.83797101 0.83386489 0.81791824]
Average Cross Validation Score: 0.8299180466787964


# Neural Network

In [28]:
def get_mlp_model(hiddenLayerOne=18, hiddenLayerTwo=10,hiddenLayerThree=4,
    dropout=0.2, learnRate=0.01):
    # initialize a sequential model and add layer to flatten the
    # input data
    model = Sequential()
    model.add(Flatten())
    model.add(Dense(hiddenLayerOne, activation="relu",
                input_shape=(24,)))
    model.add(Dense(hiddenLayerTwo, activation="relu"))
    # add a sigmoid layer on top
    model.add(Dense(hiddenLayerThree, activation="sigmoid"))
    # compile the model
    model.compile(optimizer=Adam(learning_rate=learnRate),
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"])
    # return compiled model
    return model

model_mlp = KerasClassifier(build_fn=get_mlp_model, verbose=2)
# define a grid of the hyperparameter search space
hiddenLayerOne = [18, 20, 22]
hiddenLayerTwo = [10, 12, 14]
hiddenLayerThree = [3, 4, 5]
learnRate = [1e-2, 1e-3, 1e-4]
batchSize = [32, 64, 100]
epochs = [100,200,300,400]

# create a dictionary from the hyperparameter grid
grid = dict(
    hiddenLayerOne=hiddenLayerOne,
    hiddenLayerTwo=hiddenLayerTwo,
    hiddenLayerThree=hiddenLayerThree,
    learnRate=learnRate,
    batch_size=batchSize,
    epochs=epochs
)

  model_mlp = KerasClassifier(build_fn=get_mlp_model, verbose=2)


In [29]:
kfold = StratifiedKFold(shuffle=True, n_splits=3, random_state=424)
cv_score=cross_val_score(model_mlp, X, y, cv=kfold, scoring="accuracy")
avg_score = np.mean(cv_score)
print("Cross Validation Score:", cv_score)
print("Average Cross Validation Score:", avg_score)

216/216 - 2s - loss: 0.5034 - accuracy: 0.7618 - 2s/epoch - 9ms/step
216/216 - 1s - loss: 0.5104 - accuracy: 0.7597 - 1s/epoch - 7ms/step
216/216 - 1s - loss: 0.5116 - accuracy: 0.7620 - 1s/epoch - 7ms/step
Cross Validation Score: [0.78637681 0.76949841 0.76978834]
Average Cross Validation Score: 0.7752211871255829


In [54]:
# start the hyperparameter search process
print("[INFO] performing random search...")
searcher = RandomizedSearchCV(estimator=model_mlp, n_jobs=-1, cv=3,
    param_distributions=grid, scoring="accuracy")
searchResults = searcher.fit(X_train, y_train)

# summarize grid search information
bestScore = searchResults.best_score_
bestParams = searchResults.best_params_
print("[INFO] best score is {:.2f} using {}".format(bestScore,
    bestParams))

[INFO] performing random search...
Epoch 1/100
259/259 - 1s - loss: 0.7842 - accuracy: 0.6602 - 769ms/epoch - 3ms/step
Epoch 2/100
259/259 - 0s - loss: 0.4739 - accuracy: 0.7776 - 323ms/epoch - 1ms/step
Epoch 3/100
259/259 - 0s - loss: 0.4609 - accuracy: 0.7815 - 338ms/epoch - 1ms/step
Epoch 4/100
259/259 - 0s - loss: 0.4570 - accuracy: 0.7806 - 358ms/epoch - 1ms/step
Epoch 5/100
259/259 - 0s - loss: 0.4540 - accuracy: 0.7829 - 338ms/epoch - 1ms/step
Epoch 6/100
259/259 - 0s - loss: 0.4492 - accuracy: 0.7869 - 326ms/epoch - 1ms/step
Epoch 7/100
259/259 - 0s - loss: 0.4474 - accuracy: 0.7868 - 356ms/epoch - 1ms/step
Epoch 8/100
259/259 - 0s - loss: 0.4441 - accuracy: 0.7867 - 336ms/epoch - 1ms/step
Epoch 9/100
259/259 - 0s - loss: 0.4424 - accuracy: 0.7885 - 346ms/epoch - 1ms/step
Epoch 10/100
259/259 - 0s - loss: 0.4394 - accuracy: 0.7902 - 366ms/epoch - 1ms/step
Epoch 11/100
259/259 - 0s - loss: 0.4366 - accuracy: 0.7928 - 363ms/epoch - 1ms/step
Epoch 12/100
259/259 - 0s - loss: 0.434

259/259 - 0s - loss: 0.3760 - accuracy: 0.8314 - 438ms/epoch - 2ms/step
Epoch 98/100
259/259 - 0s - loss: 0.3762 - accuracy: 0.8335 - 421ms/epoch - 2ms/step
Epoch 99/100
259/259 - 0s - loss: 0.3766 - accuracy: 0.8327 - 357ms/epoch - 1ms/step
Epoch 100/100
259/259 - 0s - loss: 0.3769 - accuracy: 0.8314 - 344ms/epoch - 1ms/step
[INFO] best score is 0.80 using {'learnRate': 0.001, 'hiddenLayerTwo': 14, 'hiddenLayerThree': 5, 'hiddenLayerOne': 22, 'epochs': 100, 'batch_size': 32}
