<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Introduction" data-toc-modified-id="Introduction-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Introduction</a></span></li><li><span><a href="#Hyperparameter-Optimization" data-toc-modified-id="Hyperparameter-Optimization-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Hyperparameter Optimization</a></span></li></ul></div>

# Introduction

In this module I developed and tested a script to optimize the hyperparameters of the various machine learning algorithms in order to improve their metric scores.

Once I completed the working prototype here, I implemented it in the final modelling scripts for each of the drug outcomes.


# Hyperparameter Optimization

In [15]:
#import libraries
import pandas as pd
from scipy import stats
import seaborn as sns
import matplotlib.pyplot as plt
import re
import numpy as np
%matplotlib inline

In [11]:
#import specific machine learning models
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingRegressor, RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
import numpy as np
import time

In [None]:
#read in dataset
df = pd.read_csv("../drug_consumption_cap_20230505.csv")
#preview only the first rows instead of rendering the whole frame
df.head()

In [None]:
#select only the drug variable columns
# positional slice: everything from column index 13 onwards
# NOTE(review): assumes the first 13 columns are the non-drug fields - confirm against the CSV
df.iloc[:,13:].head()

In [None]:
#remove 'CL' prefix from every drug outcome column
# vectorised .str.replace avoids the per-cell Python regex call made through
# applymap (deprecated since pandas 2.1); astype(str) means the cell does not
# raise if it is re-run after the columns were already converted to integers
drug_cols = df.columns[13:]
df[drug_cols] = df[drug_cols].apply(
    lambda col: col.astype(str).str.replace('CL', '', regex=False)
)
#preview the cleaned columns
df[drug_cols].head()

In [None]:
#recode the drug outcome columns as integer field type
# assign by column label rather than `df.iloc[:, 13:] = ...`: on pandas >= 2.0
# an iloc assignment writes into the existing object-dtype columns in place,
# so the values would become ints but the dtype would stay object
drug_cols = df.columns[13:]
df[drug_cols] = df[drug_cols].astype(int)
#check for field type of outcomes (should be integers)
df.info()

In [12]:
#creating the parameter grid for GridSearchCV - parameters to try to optimize
#each dict swaps a different estimator into the pipeline's 'classifier' step

#regularisation strengths shared by both LogisticRegression sub-grids
logreg_C = np.logspace(-4, 4, 20)

param_grid = [
    #LogisticRegression with the l1 penalty: only liblinear and saga support it,
    #so pairing l1 with newton-cg / lbfgs would just produce failed fits
    {'classifier' : [LogisticRegression()],
     'classifier__penalty' : ['l1'],
     'classifier__C' : logreg_C,
     'classifier__solver' : ['liblinear', 'saga']},

    #LogisticRegression with the l2 penalty: supported by all four solvers
    {'classifier' : [LogisticRegression()],
     'classifier__penalty' : ['l2'],
     'classifier__C' : logreg_C,
     'classifier__solver' : ['liblinear', 'newton-cg', 'lbfgs', 'saga']},

    #Parameters for RandomForestClassifier
    #'auto' is dropped from max_features: for classifiers it was an alias of
    #'sqrt' and it is no longer accepted by scikit-learn >= 1.3
    #'log_loss' is dropped from criterion: it is the same split criterion as 'entropy'
    {'classifier' : [RandomForestClassifier()],
     'classifier__n_estimators' : [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000],
     'classifier__max_features' : ['sqrt', 'log2'],
     'classifier__bootstrap' : [True, False],
     'classifier__max_depth': [5, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
     'classifier__min_samples_leaf': [1, 2, 4],
     'classifier__min_samples_split': [2, 5, 10],
     'classifier__criterion': ['gini', 'entropy']},

    #Parameters for Neural Network
    {'classifier' : [MLPClassifier()],
     'classifier__hidden_layer_sizes': [(50,50,50), (50,100,50), (100,)],
     'classifier__activation': ['tanh', 'relu'],
     'classifier__solver': ['sgd', 'adam'],
     'classifier__alpha': [0.0001, 0.05, 0.1],
     'classifier__learning_rate': ['constant', 'adaptive', 'invscaling'],
     'classifier__max_iter' : [200, 300, 400]},
]

In [13]:
#single-step Pipeline; the step is named 'classifier' so that GridSearchCV
#can address it through 'classifier' / 'classifier__<param>' keys
pipe = Pipeline(
    steps=[
        ('classifier', LogisticRegression()),
    ]
)

In [14]:
#set up the 5-fold cross-validated grid search over param_grid
#(this only builds the search object; nothing is fitted here)
LR_clf = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    cv=5,
    verbose=True,
    n_jobs=-1,  #run the fits in parallel on all available cores
)

In [None]:
#save start
# perf_counter is a monotonic clock, so the elapsed time is not affected by
# system clock adjustments during a long search
start = time.perf_counter()

#fit the model
# NOTE(review): X_train / y_train are not defined in any cell shown above
# (train_test_split is imported but never called) - confirm the split cell
# exists, otherwise this raises NameError on a fresh kernel
best_LR_clf = LR_clf.fit(X_train, y_train)

#print time
print(f"GridSearchCV Total Time in seconds: {time.perf_counter()-start}")