In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from sklearn.preprocessing import MinMaxScaler, MaxAbsScaler, StandardScaler 
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold
from sklearn import preprocessing
from datetime import datetime

In [2]:
df = pd.read_csv('Preprocessed_pca.csv', index_col = 0)
df

Unnamed: 0,PhysHlth,BMI,MentHlth,Income,Diabetes_binary
0,-6.667397,-6.392123,-1.111675,-0.458052,0
1,-3.863936,-7.178304,3.095224,-0.072141,0
2,-0.834798,0.081819,-3.494618,-4.087203,0
3,7.085908,-6.207388,-6.370345,-2.893083,0
4,-4.274580,7.404026,0.413793,1.106787,0
...,...,...,...,...,...
253659,-3.320172,9.505420,0.697311,0.792966,1
253668,-5.022866,1.523446,-0.484450,-2.206810,1
253670,2.529385,-4.995248,11.196980,-6.451818,1
253676,-6.559798,-9.188122,-1.661467,-3.481757,1


In [3]:
X = df[['PhysHlth', 'BMI', 'MentHlth', 'Income']]
y = df.loc[:, 'Diabetes_binary']
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.2, shuffle=True, random_state = 1)

In [4]:
acc_list = []
roc_auc_ovr_List = []
f1_micro_List = []

## Trial 1

In [5]:
start = datetime.now()

In [6]:
# Create a pipeline - RF is a stand in, we will populate the classifier part below
pipe = Pipeline([('std', StandardScaler()),
                 ('classifier', LogisticRegression())])

# Create search space of candidate learning algorithms and their hyperparameters
# note lbfgs can't do l1, and if you pass penalty='none' it expects no C value
search_space = [{'classifier': [LogisticRegression(max_iter=10000)],
                 'classifier__solver': ['saga'],
                 'classifier__penalty': ['l1', 'l2'],
                 'classifier__C': np.logspace(-4, 4, 9)},
                {'classifier': [LogisticRegression(max_iter=10000)],
                 'classifier__solver': ['lbfgs'],
                 'classifier__penalty': ['l2'],
                 'classifier__C': np.logspace(-4, 4, 9)},
                {'classifier': [LogisticRegression(max_iter=10000)],
                 'classifier__solver': ['lbfgs','saga'],
                 'classifier__penalty': ['none']},
                {'classifier': [LogisticRegression(max_iter=10000)]}
                ]

# Create grid search
clf = GridSearchCV(pipe, search_space, cv=StratifiedKFold(n_splits=10),
                   scoring=['accuracy', 'precision', 'f1_micro'], refit=False,
                   verbose=0)

# Fit grid search
best_model = clf.fit(X_train, Y_train)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [7]:
best_model.cv_results_

{'mean_fit_time': array([0.05000498, 0.10266774, 0.11528485, 0.10219669, 0.11557832,
        0.10265124, 0.11612074, 0.10003376, 0.12404246, 0.10399759,
        0.11845224, 0.10547323, 0.12610879, 0.11175604, 0.12384446,
        0.11199782, 0.13726242, 0.12367868, 0.02431118, 0.02323394,
        0.02302573, 0.02340813, 0.02270339, 0.02307572, 0.02314734,
        0.02421052, 0.02304683, 0.02414787, 0.11753125, 0.02328396]),
 'std_fit_time': array([0.00828888, 0.00840852, 0.00507814, 0.0044828 , 0.00749576,
        0.00895794, 0.00782027, 0.00528426, 0.01065537, 0.00778991,
        0.00780167, 0.00986117, 0.0165836 , 0.00927526, 0.01594894,
        0.0055158 , 0.01556274, 0.01288251, 0.00092476, 0.00101769,
        0.00105979, 0.00105457, 0.00052992, 0.00096601, 0.00164706,
        0.00213683, 0.00112265, 0.00170457, 0.00855235, 0.0005618 ]),
 'mean_score_time': array([0.00740664, 0.00781064, 0.00792811, 0.00770681, 0.00820727,
        0.00780694, 0.00790708, 0.00827262, 0.00785735, 0.00

In [8]:
best_model.cv_results_['rank_test_accuracy']

array([30, 28, 27, 25, 24,  6, 12,  3,  1,  4,  4,  8, 18, 13, 13, 13, 13,
        9, 28, 25,  6,  2, 10, 18, 18, 18, 18, 18, 13, 10])

In [9]:
best_model.cv_results_['params'][ np.argmin(best_model.cv_results_['mean_test_accuracy'])]

{'classifier': LogisticRegression(max_iter=10000),
 'classifier__C': 0.0001,
 'classifier__penalty': 'l1',
 'classifier__solver': 'saga'}

In [10]:
results = pd.DataFrame( best_model.cv_results_['params'] )

results['score_acc'] = best_model.cv_results_['mean_test_accuracy']
cols = results.columns.to_series().str.split('__').apply(lambda x: x[-1])
results.columns = cols

results

Unnamed: 0,classifier,C,penalty,solver,score_acc
0,LogisticRegression(max_iter=10000),0.0001,l1,saga,0.600535
1,LogisticRegression(max_iter=10000),0.0001,l2,saga,0.682524
2,LogisticRegression(max_iter=10000),0.001,l1,saga,0.699089
3,LogisticRegression(max_iter=10000),0.001,l2,saga,0.700207
4,LogisticRegression(max_iter=10000),0.01,l1,saga,0.700942
5,LogisticRegression(max_iter=10000),0.01,l2,saga,0.701027
6,LogisticRegression(max_iter=10000),0.1,l1,saga,0.700999
7,LogisticRegression(max_iter=10000),0.1,l2,saga,0.701027
8,LogisticRegression(max_iter=10000),1.0,l1,saga,0.701041
9,LogisticRegression(max_iter=10000),1.0,l2,saga,0.701027


In [11]:
acc_list.append(results[0:1])

In [12]:
best_model.cv_results_['params'][ np.argmin(best_model.cv_results_['rank_test_precision'])]

{'classifier': LogisticRegression(max_iter=10000),
 'classifier__C': 0.0001,
 'classifier__penalty': 'l2',
 'classifier__solver': 'saga'}

In [13]:
results = pd.DataFrame( best_model.cv_results_['params'] )

results['score_precision'] = best_model.cv_results_['mean_test_precision']
cols = results.columns.to_series().str.split('__').apply(lambda x: x[-1])
results.columns = cols


results

Unnamed: 0,classifier,C,penalty,solver,score_precision
0,LogisticRegression(max_iter=10000),0.0001,l1,saga,0.0
1,LogisticRegression(max_iter=10000),0.0001,l2,saga,0.704018
2,LogisticRegression(max_iter=10000),0.001,l1,saga,0.66936
3,LogisticRegression(max_iter=10000),0.001,l2,saga,0.664651
4,LogisticRegression(max_iter=10000),0.01,l1,saga,0.658945
5,LogisticRegression(max_iter=10000),0.01,l2,saga,0.658697
6,LogisticRegression(max_iter=10000),0.1,l1,saga,0.657966
7,LogisticRegression(max_iter=10000),0.1,l2,saga,0.657925
8,LogisticRegression(max_iter=10000),1.0,l1,saga,0.657897
9,LogisticRegression(max_iter=10000),1.0,l2,saga,0.657868


In [14]:
roc_auc_ovr_List.append(results[1:2])

In [15]:
best_model.cv_results_['params'][ np.argmin(best_model.cv_results_['rank_test_f1_micro'])]

{'classifier': LogisticRegression(max_iter=10000),
 'classifier__C': 1.0,
 'classifier__penalty': 'l1',
 'classifier__solver': 'saga'}

In [16]:
results = pd.DataFrame( best_model.cv_results_['params'] )


results['score_f1_micro'] = best_model.cv_results_['mean_test_f1_micro']
cols = results.columns.to_series().str.split('__').apply(lambda x: x[-1])
results.columns = cols

results

Unnamed: 0,classifier,C,penalty,solver,score_f1_micro
0,LogisticRegression(max_iter=10000),0.0001,l1,saga,0.600535
1,LogisticRegression(max_iter=10000),0.0001,l2,saga,0.682524
2,LogisticRegression(max_iter=10000),0.001,l1,saga,0.699089
3,LogisticRegression(max_iter=10000),0.001,l2,saga,0.700207
4,LogisticRegression(max_iter=10000),0.01,l1,saga,0.700942
5,LogisticRegression(max_iter=10000),0.01,l2,saga,0.701027
6,LogisticRegression(max_iter=10000),0.1,l1,saga,0.700999
7,LogisticRegression(max_iter=10000),0.1,l2,saga,0.701027
8,LogisticRegression(max_iter=10000),1.0,l1,saga,0.701041
9,LogisticRegression(max_iter=10000),1.0,l2,saga,0.701027


In [17]:
f1_micro_List.append(results[8:9])

## Trial 2

In [18]:
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.2, shuffle=True, random_state = 2)

In [19]:
# Create a pipeline - RF is a stand in, we will populate the classifier part below
pipe = Pipeline([('std', StandardScaler()),
                 ('classifier', LogisticRegression())])

# Create search space of candidate learning algorithms and their hyperparameters
# note lbfgs can't do l1, and if you pass penalty='none' it expects no C value
search_space = [{'classifier': [LogisticRegression(max_iter=10000)],
                 'classifier__solver': ['saga'],
                 'classifier__penalty': ['l1', 'l2'],
                 'classifier__C': np.logspace(-4, 4, 9)},
                {'classifier': [LogisticRegression(max_iter=10000)],
                 'classifier__solver': ['lbfgs'],
                 'classifier__penalty': ['l2'],
                 'classifier__C': np.logspace(-4, 4, 9)},
                {'classifier': [LogisticRegression(max_iter=10000)],
                 'classifier__solver': ['lbfgs','saga'],
                 'classifier__penalty': ['none']},
                {'classifier': [LogisticRegression(max_iter=10000)]}
                ]

# Create grid search
clf = GridSearchCV(pipe, search_space, cv=StratifiedKFold(n_splits=10),
                   scoring=['accuracy', 'precision', 'f1_micro'], refit=False,
                   verbose=0)

# Fit grid search
best_model = clf.fit(X_train, Y_train)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [20]:
best_model.cv_results_

{'mean_fit_time': array([0.04892576, 0.10178926, 0.11230183, 0.10224841, 0.12948077,
        0.11410449, 0.12701383, 0.10836613, 0.12756827, 0.12151308,
        0.13295162, 0.11428809, 0.12567647, 0.11730998, 0.13091736,
        0.11289821, 0.12977314, 0.11625695, 0.0244159 , 0.02319152,
        0.02306058, 0.02285249, 0.0229697 , 0.02405984, 0.02314618,
        0.02320399, 0.02290359, 0.02288551, 0.121999  , 0.02305312]),
 'std_fit_time': array([0.00562913, 0.00702792, 0.00707462, 0.00574355, 0.01137634,
        0.0085454 , 0.00960634, 0.00646496, 0.0103238 , 0.01030911,
        0.01123925, 0.00764683, 0.01095562, 0.00782899, 0.00990571,
        0.01026722, 0.00877325, 0.0067251 , 0.0014024 , 0.00103188,
        0.00153561, 0.00071822, 0.00075083, 0.00323503, 0.00186818,
        0.00115851, 0.00068914, 0.00085995, 0.01007896, 0.00064953]),
 'mean_score_time': array([0.00742869, 0.00770686, 0.00780714, 0.00790687, 0.0080569 ,
        0.0081054 , 0.00810702, 0.00802884, 0.00810726, 0.00

In [21]:
best_model.cv_results_['rank_test_accuracy']

array([30, 28, 27, 26, 24, 17, 11, 19,  1, 21,  1, 12, 12,  1,  1, 12,  1,
       12, 28, 25, 17, 19, 22,  1,  1,  1,  1,  1, 12, 22])

In [22]:
best_model.cv_results_['params'][ np.argmin(best_model.cv_results_['mean_test_accuracy'])]

{'classifier': LogisticRegression(max_iter=10000),
 'classifier__C': 0.0001,
 'classifier__penalty': 'l1',
 'classifier__solver': 'saga'}

In [23]:
results = pd.DataFrame( best_model.cv_results_['params'] )

results['score_acc'] = best_model.cv_results_['mean_test_accuracy']
cols = results.columns.to_series().str.split('__').apply(lambda x: x[-1])
results.columns = cols

results

Unnamed: 0,classifier,C,penalty,solver,score_acc
0,LogisticRegression(max_iter=10000),0.0001,l1,saga,0.59912
1,LogisticRegression(max_iter=10000),0.0001,l2,saga,0.685141
2,LogisticRegression(max_iter=10000),0.001,l1,saga,0.701069
3,LogisticRegression(max_iter=10000),0.001,l2,saga,0.701861
4,LogisticRegression(max_iter=10000),0.01,l1,saga,0.70271
5,LogisticRegression(max_iter=10000),0.01,l2,saga,0.702767
6,LogisticRegression(max_iter=10000),0.1,l1,saga,0.702767
7,LogisticRegression(max_iter=10000),0.1,l2,saga,0.702753
8,LogisticRegression(max_iter=10000),1.0,l1,saga,0.702781
9,LogisticRegression(max_iter=10000),1.0,l2,saga,0.702753


In [24]:
acc_list.append(results[0:1])

In [25]:
best_model.cv_results_['params'][ np.argmin(best_model.cv_results_['rank_test_precision'])]

{'classifier': LogisticRegression(max_iter=10000),
 'classifier__C': 0.0001,
 'classifier__penalty': 'l2',
 'classifier__solver': 'saga'}

In [26]:
results = pd.DataFrame( best_model.cv_results_['params'] )

results['score_precision'] = best_model.cv_results_['mean_test_precision']
cols = results.columns.to_series().str.split('__').apply(lambda x: x[-1])
results.columns = cols

results

Unnamed: 0,classifier,C,penalty,solver,score_precision
0,LogisticRegression(max_iter=10000),0.0001,l1,saga,0.0
1,LogisticRegression(max_iter=10000),0.0001,l2,saga,0.70864
2,LogisticRegression(max_iter=10000),0.001,l1,saga,0.672481
3,LogisticRegression(max_iter=10000),0.001,l2,saga,0.667369
4,LogisticRegression(max_iter=10000),0.01,l1,saga,0.662204
5,LogisticRegression(max_iter=10000),0.01,l2,saga,0.66179
6,LogisticRegression(max_iter=10000),0.1,l1,saga,0.661162
7,LogisticRegression(max_iter=10000),0.1,l2,saga,0.661132
8,LogisticRegression(max_iter=10000),1.0,l1,saga,0.661136
9,LogisticRegression(max_iter=10000),1.0,l2,saga,0.661077


In [27]:
roc_auc_ovr_List.append(results[1:2])

In [28]:
best_model.cv_results_['params'][ np.argmin(best_model.cv_results_['rank_test_f1_micro'])]

{'classifier': LogisticRegression(max_iter=10000),
 'classifier__C': 1.0,
 'classifier__penalty': 'l1',
 'classifier__solver': 'saga'}

In [29]:
results = pd.DataFrame( best_model.cv_results_['params'] )

results['score_f1_micro'] = best_model.cv_results_['mean_test_f1_micro']
cols = results.columns.to_series().str.split('__').apply(lambda x: x[-1])
results.columns = cols

results

Unnamed: 0,classifier,C,penalty,solver,score_f1_micro
0,LogisticRegression(max_iter=10000),0.0001,l1,saga,0.59912
1,LogisticRegression(max_iter=10000),0.0001,l2,saga,0.685141
2,LogisticRegression(max_iter=10000),0.001,l1,saga,0.701069
3,LogisticRegression(max_iter=10000),0.001,l2,saga,0.701861
4,LogisticRegression(max_iter=10000),0.01,l1,saga,0.70271
5,LogisticRegression(max_iter=10000),0.01,l2,saga,0.702767
6,LogisticRegression(max_iter=10000),0.1,l1,saga,0.702767
7,LogisticRegression(max_iter=10000),0.1,l2,saga,0.702753
8,LogisticRegression(max_iter=10000),1.0,l1,saga,0.702781
9,LogisticRegression(max_iter=10000),1.0,l2,saga,0.702753


In [30]:
f1_micro_List.append(results[8:9])

## Trial 3

In [31]:
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.2, shuffle=True, random_state = 3)

In [32]:
# Create a pipeline - RF is a stand in, we will populate the classifier part below
pipe = Pipeline([('std', StandardScaler()),
                 ('classifier', LogisticRegression())])

# Create search space of candidate learning algorithms and their hyperparameters
# note lbfgs can't do l1, and if you pass penalty='none' it expects no C value
search_space = [{'classifier': [LogisticRegression(max_iter=10000)],
                 'classifier__solver': ['saga'],
                 'classifier__penalty': ['l1', 'l2'],
                 'classifier__C': np.logspace(-4, 4, 9)},
                {'classifier': [LogisticRegression(max_iter=10000)],
                 'classifier__solver': ['lbfgs'],
                 'classifier__penalty': ['l2'],
                 'classifier__C': np.logspace(-4, 4, 9)},
                {'classifier': [LogisticRegression(max_iter=10000)],
                 'classifier__solver': ['lbfgs','saga'],
                 'classifier__penalty': ['none']},
                {'classifier': [LogisticRegression(max_iter=10000)]}
                ]

# Create grid search
clf = GridSearchCV(pipe, search_space, cv=StratifiedKFold(n_splits=10),
                   scoring=['accuracy', 'precision', 'f1_micro'], refit=False,
                   verbose=0)

# Fit grid search
best_model = clf.fit(X_train, Y_train)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [33]:
best_model.cv_results_

{'mean_fit_time': array([0.05211456, 0.10302429, 0.11541595, 0.10655594, 0.12156851,
        0.12497203, 0.13688803, 0.11625423, 0.13023713, 0.12108722,
        0.13497989, 0.12046669, 0.13083308, 0.11973825, 0.1348619 ,
        0.12107439, 0.13136995, 0.12131386, 0.02448106, 0.02340496,
        0.02384844, 0.02312098, 0.02332084, 0.02491109, 0.02287152,
        0.02242005, 0.02292049, 0.02262018, 0.11305141, 0.02292085]),
 'std_fit_time': array([0.01004523, 0.00902707, 0.00885405, 0.01644106, 0.00877499,
        0.01629365, 0.0145322 , 0.00979008, 0.00787046, 0.00621978,
        0.01411478, 0.01108994, 0.01631147, 0.00703108, 0.01265161,
        0.01078701, 0.00931226, 0.00877696, 0.00122502, 0.00126349,
        0.00209977, 0.00144703, 0.00110132, 0.00375906, 0.00128037,
        0.0010207 , 0.00137621, 0.0004902 , 0.00611312, 0.00130118]),
 'mean_score_time': array([0.00727501, 0.00758989, 0.00770669, 0.00801365, 0.00850773,
        0.00810738, 0.00810752, 0.00830746, 0.0078068 , 0.00

In [34]:
best_model.cv_results_['rank_test_accuracy']

array([30, 28, 27, 25, 24, 23,  3,  1,  4,  6, 21,  6,  6,  6,  6,  5, 20,
        6, 28, 26, 22,  1,  6,  6,  6,  6,  6,  6,  6,  6])

In [35]:
best_model.cv_results_['params'][ np.argmin(best_model.cv_results_['mean_test_accuracy'])]

{'classifier': LogisticRegression(max_iter=10000),
 'classifier__C': 0.0001,
 'classifier__penalty': 'l1',
 'classifier__solver': 'saga'}

In [36]:
results = pd.DataFrame( best_model.cv_results_['params'] )

results['score_acc'] = best_model.cv_results_['mean_test_accuracy']
cols = results.columns.to_series().str.split('__').apply(lambda x: x[-1])
results.columns = cols

results

Unnamed: 0,classifier,C,penalty,solver,score_acc
0,LogisticRegression(max_iter=10000),0.0001,l1,saga,0.599601
1,LogisticRegression(max_iter=10000),0.0001,l2,saga,0.683571
2,LogisticRegression(max_iter=10000),0.001,l1,saga,0.699188
3,LogisticRegression(max_iter=10000),0.001,l2,saga,0.700065
4,LogisticRegression(max_iter=10000),0.01,l1,saga,0.701324
5,LogisticRegression(max_iter=10000),0.01,l2,saga,0.701409
6,LogisticRegression(max_iter=10000),0.1,l1,saga,0.701536
7,LogisticRegression(max_iter=10000),0.1,l2,saga,0.70155
8,LogisticRegression(max_iter=10000),1.0,l1,saga,0.701522
9,LogisticRegression(max_iter=10000),1.0,l2,saga,0.701508


In [37]:
acc_list.append(results[0:1])

In [38]:
best_model.cv_results_['params'][ np.argmin(best_model.cv_results_['rank_test_precision'])]

{'classifier': LogisticRegression(max_iter=10000),
 'classifier__C': 0.0001,
 'classifier__penalty': 'l2',
 'classifier__solver': 'saga'}

In [39]:
results = pd.DataFrame( best_model.cv_results_['params'] )

results['score_precision'] = best_model.cv_results_['mean_test_precision']
cols = results.columns.to_series().str.split('__').apply(lambda x: x[-1])
results.columns = cols

results

Unnamed: 0,classifier,C,penalty,solver,score_precision
0,LogisticRegression(max_iter=10000),0.0001,l1,saga,0.0
1,LogisticRegression(max_iter=10000),0.0001,l2,saga,0.706928
2,LogisticRegression(max_iter=10000),0.001,l1,saga,0.670155
3,LogisticRegression(max_iter=10000),0.001,l2,saga,0.665248
4,LogisticRegression(max_iter=10000),0.01,l1,saga,0.660475
5,LogisticRegression(max_iter=10000),0.01,l2,saga,0.66015
6,LogisticRegression(max_iter=10000),0.1,l1,saga,0.659731
7,LogisticRegression(max_iter=10000),0.1,l2,saga,0.659718
8,LogisticRegression(max_iter=10000),1.0,l1,saga,0.659601
9,LogisticRegression(max_iter=10000),1.0,l2,saga,0.659572


In [40]:
roc_auc_ovr_List.append(results[1:2])

In [41]:
best_model.cv_results_['params'][ np.argmin(best_model.cv_results_['rank_test_f1_micro'])]

{'classifier': LogisticRegression(max_iter=10000),
 'classifier__C': 0.1,
 'classifier__penalty': 'l2',
 'classifier__solver': 'saga'}

In [43]:
results = pd.DataFrame( best_model.cv_results_['params'] )

results['score_f1_micro'] = best_model.cv_results_['mean_test_f1_micro']
cols = results.columns.to_series().str.split('__').apply(lambda x: x[-1])
results.columns = cols

results

Unnamed: 0,classifier,C,penalty,solver,score_f1_micro
0,LogisticRegression(max_iter=10000),0.0001,l1,saga,0.599601
1,LogisticRegression(max_iter=10000),0.0001,l2,saga,0.683571
2,LogisticRegression(max_iter=10000),0.001,l1,saga,0.699188
3,LogisticRegression(max_iter=10000),0.001,l2,saga,0.700065
4,LogisticRegression(max_iter=10000),0.01,l1,saga,0.701324
5,LogisticRegression(max_iter=10000),0.01,l2,saga,0.701409
6,LogisticRegression(max_iter=10000),0.1,l1,saga,0.701536
7,LogisticRegression(max_iter=10000),0.1,l2,saga,0.70155
8,LogisticRegression(max_iter=10000),1.0,l1,saga,0.701522
9,LogisticRegression(max_iter=10000),1.0,l2,saga,0.701508


In [42]:
f1_micro_List.append(results[7:8])

## Trial 4

In [44]:
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.2, shuffle=True, random_state = 4)

In [45]:
# Create a pipeline - RF is a stand in, we will populate the classifier part below
pipe = Pipeline([('std', StandardScaler()),
                 ('classifier', LogisticRegression())])

# Create search space of candidate learning algorithms and their hyperparameters
# note lbfgs can't do l1, and if you pass penalty='none' it expects no C value
search_space = [{'classifier': [LogisticRegression(max_iter=10000)],
                 'classifier__solver': ['saga'],
                 'classifier__penalty': ['l1', 'l2'],
                 'classifier__C': np.logspace(-4, 4, 9)},
                {'classifier': [LogisticRegression(max_iter=10000)],
                 'classifier__solver': ['lbfgs'],
                 'classifier__penalty': ['l2'],
                 'classifier__C': np.logspace(-4, 4, 9)},
                {'classifier': [LogisticRegression(max_iter=10000)],
                 'classifier__solver': ['lbfgs','saga'],
                 'classifier__penalty': ['none']},
                {'classifier': [LogisticRegression(max_iter=10000)]}
                ]

# Create grid search
clf = GridSearchCV(pipe, search_space, cv=StratifiedKFold(n_splits=10),
                   scoring=['accuracy', 'precision', 'f1_micro'], refit=False,
                   verbose=0)

# Fit grid search
best_model = clf.fit(X_train, Y_train)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [46]:
best_model.cv_results_

{'mean_fit_time': array([0.05357742, 0.10627964, 0.12110317, 0.1061116 , 0.13594875,
        0.12120986, 0.13368142, 0.11096139, 0.13990142, 0.11657648,
        0.13975701, 0.11654179, 0.13539028, 0.11123261, 0.13176444,
        0.10723565, 0.13515763, 0.10974636, 0.02428343, 0.02259071,
        0.02261226, 0.02259984, 0.02242928, 0.02218862, 0.0229208 ,
        0.0227211 , 0.02361941, 0.02297261, 0.10743001, 0.02244291]),
 'std_fit_time': array([0.01038546, 0.00818391, 0.00897846, 0.00584824, 0.01463121,
        0.01316838, 0.01361176, 0.00867428, 0.01446897, 0.01120118,
        0.01364252, 0.01363537, 0.00833739, 0.00837242, 0.01458079,
        0.01124388, 0.01210242, 0.00798822, 0.00140652, 0.00098355,
        0.00062885, 0.00108545, 0.0004805 , 0.00069655, 0.00083158,
        0.00122564, 0.00190915, 0.00121429, 0.00655359, 0.00061435]),
 'mean_score_time': array([0.0074796 , 0.00750649, 0.0077539 , 0.00820341, 0.00850766,
        0.00800743, 0.00794704, 0.00790691, 0.0080631 , 0.00

In [47]:
best_model.cv_results_['rank_test_accuracy']

array([30, 29, 27, 25, 22, 23, 21, 18, 19, 11, 11, 11,  1,  1, 11,  1,  1,
        1, 28, 25, 23, 20, 15,  1,  1,  1,  1,  1, 17, 15])

In [48]:
best_model.cv_results_['params'][ np.argmin(best_model.cv_results_['mean_test_accuracy'])]

{'classifier': LogisticRegression(max_iter=10000),
 'classifier__C': 0.0001,
 'classifier__penalty': 'l1',
 'classifier__solver': 'saga'}

In [49]:
results = pd.DataFrame( best_model.cv_results_['params'] )

results['score_acc'] = best_model.cv_results_['mean_test_accuracy']
cols = results.columns.to_series().str.split('__').apply(lambda x: x[-1])
results.columns = cols

results

Unnamed: 0,classifier,C,penalty,solver,score_acc
0,LogisticRegression(max_iter=10000),0.0001,l1,saga,0.599332
1,LogisticRegression(max_iter=10000),0.0001,l2,saga,0.683005
2,LogisticRegression(max_iter=10000),0.001,l1,saga,0.699895
3,LogisticRegression(max_iter=10000),0.001,l2,saga,0.700659
4,LogisticRegression(max_iter=10000),0.01,l1,saga,0.702003
5,LogisticRegression(max_iter=10000),0.01,l2,saga,0.701876
6,LogisticRegression(max_iter=10000),0.1,l1,saga,0.702088
7,LogisticRegression(max_iter=10000),0.1,l2,saga,0.70213
8,LogisticRegression(max_iter=10000),1.0,l1,saga,0.702116
9,LogisticRegression(max_iter=10000),1.0,l2,saga,0.702145


In [50]:
acc_list.append(results[0:1])

In [51]:
best_model.cv_results_['params'][ np.argmin(best_model.cv_results_['rank_test_precision'])]

{'classifier': LogisticRegression(max_iter=10000),
 'classifier__C': 0.0001,
 'classifier__penalty': 'l2',
 'classifier__solver': 'saga'}

In [52]:
results = pd.DataFrame( best_model.cv_results_['params'] )

results['score_precision'] = best_model.cv_results_['mean_test_precision']
cols = results.columns.to_series().str.split('__').apply(lambda x: x[-1])
results.columns = cols

results

Unnamed: 0,classifier,C,penalty,solver,score_precision
0,LogisticRegression(max_iter=10000),0.0001,l1,saga,0.0
1,LogisticRegression(max_iter=10000),0.0001,l2,saga,0.704444
2,LogisticRegression(max_iter=10000),0.001,l1,saga,0.670251
3,LogisticRegression(max_iter=10000),0.001,l2,saga,0.664851
4,LogisticRegression(max_iter=10000),0.01,l1,saga,0.660407
5,LogisticRegression(max_iter=10000),0.01,l2,saga,0.659884
6,LogisticRegression(max_iter=10000),0.1,l1,saga,0.659463
7,LogisticRegression(max_iter=10000),0.1,l2,saga,0.659482
8,LogisticRegression(max_iter=10000),1.0,l1,saga,0.659382
9,LogisticRegression(max_iter=10000),1.0,l2,saga,0.65944


In [53]:
roc_auc_ovr_List.append(results[1:2])

In [54]:
best_model.cv_results_['params'][ np.argmin(best_model.cv_results_['rank_test_f1_micro'])]

{'classifier': LogisticRegression(max_iter=10000),
 'classifier__C': 100.0,
 'classifier__penalty': 'l1',
 'classifier__solver': 'saga'}

In [55]:
results = pd.DataFrame( best_model.cv_results_['params'] )

results['score_f1_micro'] = best_model.cv_results_['mean_test_f1_micro']
cols = results.columns.to_series().str.split('__').apply(lambda x: x[-1])
results.columns = cols

results

Unnamed: 0,classifier,C,penalty,solver,score_f1_micro
0,LogisticRegression(max_iter=10000),0.0001,l1,saga,0.599332
1,LogisticRegression(max_iter=10000),0.0001,l2,saga,0.683005
2,LogisticRegression(max_iter=10000),0.001,l1,saga,0.699895
3,LogisticRegression(max_iter=10000),0.001,l2,saga,0.700659
4,LogisticRegression(max_iter=10000),0.01,l1,saga,0.702003
5,LogisticRegression(max_iter=10000),0.01,l2,saga,0.701876
6,LogisticRegression(max_iter=10000),0.1,l1,saga,0.702088
7,LogisticRegression(max_iter=10000),0.1,l2,saga,0.70213
8,LogisticRegression(max_iter=10000),1.0,l1,saga,0.702116
9,LogisticRegression(max_iter=10000),1.0,l2,saga,0.702145


In [56]:
f1_micro_List.append(results[12:13])

## Trial 5

In [57]:
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.2, shuffle=True, random_state = 5)

In [58]:
# Create a pipeline - RF is a stand in, we will populate the classifier part below
pipe = Pipeline([('std', StandardScaler()),
                 ('classifier', LogisticRegression())])

# Create search space of candidate learning algorithms and their hyperparameters
# note lbfgs can't do l1, and if you pass penalty='none' it expects no C value
search_space = [{'classifier': [LogisticRegression(max_iter=10000)],
                 'classifier__solver': ['saga'],
                 'classifier__penalty': ['l1', 'l2'],
                 'classifier__C': np.logspace(-4, 4, 9)},
                {'classifier': [LogisticRegression(max_iter=10000)],
                 'classifier__solver': ['lbfgs'],
                 'classifier__penalty': ['l2'],
                 'classifier__C': np.logspace(-4, 4, 9)},
                {'classifier': [LogisticRegression(max_iter=10000)],
                 'classifier__solver': ['lbfgs','saga'],
                 'classifier__penalty': ['none']},
                {'classifier': [LogisticRegression(max_iter=10000)]}
                ]

# Create grid search
clf = GridSearchCV(pipe, search_space, cv=StratifiedKFold(n_splits=10),
                   scoring=['accuracy', 'precision', 'f1_micro'], refit=False,
                   verbose=0)

# Fit grid search
best_model = clf.fit(X_train, Y_train)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [59]:
best_model.cv_results_

{'mean_fit_time': array([0.05160749, 0.09878242, 0.11398287, 0.10975924, 0.12118802,
        0.10482383, 0.11972671, 0.1045399 , 0.12873909, 0.10793281,
        0.12007725, 0.11292734, 0.12211237, 0.10488939, 0.12972534,
        0.10506749, 0.12798007, 0.11813211, 0.02399919, 0.02280006,
        0.0227169 , 0.02222023, 0.0227412 , 0.02247024, 0.02253256,
        0.02212   , 0.02263489, 0.02248135, 0.11398149, 0.02221994]),
 'std_fit_time': array([0.00938847, 0.00172497, 0.0063941 , 0.01249836, 0.01227626,
        0.00604963, 0.01094269, 0.00865853, 0.01884232, 0.0108797 ,
        0.00930199, 0.00899529, 0.01018915, 0.00699343, 0.00842855,
        0.01053873, 0.011344  , 0.01558889, 0.00114743, 0.00113024,
        0.00119068, 0.00074913, 0.00165372, 0.00131364, 0.00102083,
        0.00083147, 0.00103008, 0.00092705, 0.01918112, 0.00074883]),
 'mean_score_time': array([0.00730603, 0.0077054 , 0.00790722, 0.00765719, 0.00780787,
        0.00840709, 0.00797496, 0.00850725, 0.00832963, 0.00

In [60]:
best_model.cv_results_['rank_test_accuracy']

array([30, 28, 27, 25,  3,  1,  4,  5,  9,  7,  9,  9,  9,  8,  9,  9, 24,
        9, 28, 25,  2,  6,  9,  9,  9,  9,  9,  9,  9,  9])

In [61]:
best_model.cv_results_['params'][ np.argmin(best_model.cv_results_['mean_test_accuracy'])]

{'classifier': LogisticRegression(max_iter=10000),
 'classifier__C': 0.0001,
 'classifier__penalty': 'l1',
 'classifier__solver': 'saga'}

In [62]:
results = pd.DataFrame( best_model.cv_results_['params'] )

results['score_acc'] = best_model.cv_results_['mean_test_accuracy']
cols = results.columns.to_series().str.split('__').apply(lambda x: x[-1])
results.columns = cols

results

Unnamed: 0,classifier,C,penalty,solver,score_acc
0,LogisticRegression(max_iter=10000),0.0001,l1,saga,0.600167
1,LogisticRegression(max_iter=10000),0.0001,l2,saga,0.685523
2,LogisticRegression(max_iter=10000),0.001,l1,saga,0.701932
3,LogisticRegression(max_iter=10000),0.001,l2,saga,0.702187
4,LogisticRegression(max_iter=10000),0.01,l1,saga,0.704026
5,LogisticRegression(max_iter=10000),0.01,l2,saga,0.704068
6,LogisticRegression(max_iter=10000),0.1,l1,saga,0.703969
7,LogisticRegression(max_iter=10000),0.1,l2,saga,0.703969
8,LogisticRegression(max_iter=10000),1.0,l1,saga,0.703941
9,LogisticRegression(max_iter=10000),1.0,l2,saga,0.703955


In [63]:
acc_list.append(results[0:1])

In [64]:
best_model.cv_results_['params'][ np.argmin(best_model.cv_results_['rank_test_precision'])]

{'classifier': LogisticRegression(max_iter=10000),
 'classifier__C': 0.0001,
 'classifier__penalty': 'l2',
 'classifier__solver': 'saga'}

In [65]:
results = pd.DataFrame( best_model.cv_results_['params'] )

results['score_precision'] = best_model.cv_results_['mean_test_precision']
cols = results.columns.to_series().str.split('__').apply(lambda x: x[-1])
results.columns = cols

results

Unnamed: 0,classifier,C,penalty,solver,score_precision
0,LogisticRegression(max_iter=10000),0.0001,l1,saga,0.0
1,LogisticRegression(max_iter=10000),0.0001,l2,saga,0.709146
2,LogisticRegression(max_iter=10000),0.001,l1,saga,0.673036
3,LogisticRegression(max_iter=10000),0.001,l2,saga,0.66698
4,LogisticRegression(max_iter=10000),0.01,l1,saga,0.663063
5,LogisticRegression(max_iter=10000),0.01,l2,saga,0.662806
6,LogisticRegression(max_iter=10000),0.1,l1,saga,0.661996
7,LogisticRegression(max_iter=10000),0.1,l2,saga,0.661981
8,LogisticRegression(max_iter=10000),1.0,l1,saga,0.661856
9,LogisticRegression(max_iter=10000),1.0,l2,saga,0.661885


In [66]:
roc_auc_ovr_List.append(results[1:2])

In [67]:
best_model.cv_results_['params'][ np.argmin(best_model.cv_results_['rank_test_f1_micro'])]

{'classifier': LogisticRegression(max_iter=10000),
 'classifier__C': 0.01,
 'classifier__penalty': 'l2',
 'classifier__solver': 'saga'}

In [68]:
results = pd.DataFrame( best_model.cv_results_['params'] )

results['score_f1_micro'] = best_model.cv_results_['mean_test_f1_micro']
cols = results.columns.to_series().str.split('__').apply(lambda x: x[-1])
results.columns = cols

results

Unnamed: 0,classifier,C,penalty,solver,score_f1_micro
0,LogisticRegression(max_iter=10000),0.0001,l1,saga,0.600167
1,LogisticRegression(max_iter=10000),0.0001,l2,saga,0.685523
2,LogisticRegression(max_iter=10000),0.001,l1,saga,0.701932
3,LogisticRegression(max_iter=10000),0.001,l2,saga,0.702187
4,LogisticRegression(max_iter=10000),0.01,l1,saga,0.704026
5,LogisticRegression(max_iter=10000),0.01,l2,saga,0.704068
6,LogisticRegression(max_iter=10000),0.1,l1,saga,0.703969
7,LogisticRegression(max_iter=10000),0.1,l2,saga,0.703969
8,LogisticRegression(max_iter=10000),1.0,l1,saga,0.703941
9,LogisticRegression(max_iter=10000),1.0,l2,saga,0.703955


In [69]:
f1_micro_List.append(results[5:6])

## Trial 6

In [70]:
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.2, shuffle=True, random_state = 6)

In [71]:
# Create a pipeline - RF is a stand in, we will populate the classifier part below
pipe = Pipeline([('std', StandardScaler()),
                 ('classifier', LogisticRegression())])

# Create search space of candidate learning algorithms and their hyperparameters
# note lbfgs can't do l1, and if you pass penalty='none' it expects no C value
search_space = [{'classifier': [LogisticRegression(max_iter=10000)],
                 'classifier__solver': ['saga'],
                 'classifier__penalty': ['l1', 'l2'],
                 'classifier__C': np.logspace(-4, 4, 9)},
                {'classifier': [LogisticRegression(max_iter=10000)],
                 'classifier__solver': ['lbfgs'],
                 'classifier__penalty': ['l2'],
                 'classifier__C': np.logspace(-4, 4, 9)},
                {'classifier': [LogisticRegression(max_iter=10000)],
                 'classifier__solver': ['lbfgs','saga'],
                 'classifier__penalty': ['none']},
                {'classifier': [LogisticRegression(max_iter=10000)]}
                ]

# Create grid search
clf = GridSearchCV(pipe, search_space, cv=StratifiedKFold(n_splits=10),
                   scoring=['accuracy', 'precision', 'f1_micro'], refit=False,
                   verbose=0)

# Fit grid search
best_model = clf.fit(X_train, Y_train)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [72]:
best_model.cv_results_

{'mean_fit_time': array([0.05646436, 0.10914884, 0.11542802, 0.10506878, 0.11361196,
        0.10064795, 0.11870692, 0.11041274, 0.11254458, 0.10054352,
        0.1151422 , 0.10829477, 0.11674552, 0.10589635, 0.12660825,
        0.10722167, 0.12487049, 0.11238387, 0.02295918, 0.02299523,
        0.02300763, 0.02253525, 0.02256341, 0.02303963, 0.02253766,
        0.02291131, 0.02398825, 0.02251158, 0.11287985, 0.02237499]),
 'std_fit_time': array([0.00902457, 0.006758  , 0.00685167, 0.00914438, 0.01095791,
        0.00615378, 0.0118446 , 0.00447411, 0.00638088, 0.00467788,
        0.00789225, 0.00637137, 0.00893891, 0.00495626, 0.01173085,
        0.00839021, 0.0115005 , 0.0061682 , 0.00065053, 0.00102456,
        0.00141037, 0.00093191, 0.00089289, 0.00131622, 0.00105172,
        0.00114187, 0.00251631, 0.00110324, 0.00768977, 0.00088781]),
 'mean_score_time': array([0.00729456, 0.0076242 , 0.00800705, 0.00790701, 0.00780675,
        0.00780721, 0.00790701, 0.00760682, 0.00760579, 0.00

In [73]:
best_model.cv_results_['rank_test_accuracy']

array([30, 28, 27, 25, 22, 23, 20, 20,  1,  1,  1,  1,  1,  1,  1, 17,  1,
       17, 28, 25, 23, 19,  1,  1,  1,  1,  1,  1,  1,  1])

In [74]:
best_model.cv_results_['params'][ np.argmin(best_model.cv_results_['mean_test_accuracy'])]

{'classifier': LogisticRegression(max_iter=10000),
 'classifier__C': 0.0001,
 'classifier__penalty': 'l1',
 'classifier__solver': 'saga'}

In [75]:
results = pd.DataFrame( best_model.cv_results_['params'] )

results['score_acc'] = best_model.cv_results_['mean_test_accuracy']
cols = results.columns.to_series().str.split('__').apply(lambda x: x[-1])
results.columns = cols

results

Unnamed: 0,classifier,C,penalty,solver,score_acc
0,LogisticRegression(max_iter=10000),0.0001,l1,saga,0.599389
1,LogisticRegression(max_iter=10000),0.0001,l2,saga,0.682736
2,LogisticRegression(max_iter=10000),0.001,l1,saga,0.699004
3,LogisticRegression(max_iter=10000),0.001,l2,saga,0.699853
4,LogisticRegression(max_iter=10000),0.01,l1,saga,0.700801
5,LogisticRegression(max_iter=10000),0.01,l2,saga,0.700787
6,LogisticRegression(max_iter=10000),0.1,l1,saga,0.701098
7,LogisticRegression(max_iter=10000),0.1,l2,saga,0.701098
8,LogisticRegression(max_iter=10000),1.0,l1,saga,0.701169
9,LogisticRegression(max_iter=10000),1.0,l2,saga,0.701169


In [76]:
acc_list.append(results[0:1])

In [77]:
best_model.cv_results_['params'][ np.argmin(best_model.cv_results_['rank_test_precision'])]

{'classifier': LogisticRegression(max_iter=10000),
 'classifier__C': 0.0001,
 'classifier__penalty': 'l2',
 'classifier__solver': 'saga'}

In [78]:
results = pd.DataFrame( best_model.cv_results_['params'] )

results['score_precision'] = best_model.cv_results_['mean_test_precision']
cols = results.columns.to_series().str.split('__').apply(lambda x: x[-1])
results.columns = cols

results

Unnamed: 0,classifier,C,penalty,solver,score_precision
0,LogisticRegression(max_iter=10000),0.0001,l1,saga,0.0
1,LogisticRegression(max_iter=10000),0.0001,l2,saga,0.705512
2,LogisticRegression(max_iter=10000),0.001,l1,saga,0.669719
3,LogisticRegression(max_iter=10000),0.001,l2,saga,0.664769
4,LogisticRegression(max_iter=10000),0.01,l1,saga,0.659544
5,LogisticRegression(max_iter=10000),0.01,l2,saga,0.659224
6,LogisticRegression(max_iter=10000),0.1,l1,saga,0.658982
7,LogisticRegression(max_iter=10000),0.1,l2,saga,0.658925
8,LogisticRegression(max_iter=10000),1.0,l1,saga,0.65896
9,LogisticRegression(max_iter=10000),1.0,l2,saga,0.65896


In [79]:
roc_auc_ovr_List.append(results[1:2])

In [80]:
best_model.cv_results_['params'][ np.argmin(best_model.cv_results_['rank_test_f1_micro'])]

{'classifier': LogisticRegression(max_iter=10000),
 'classifier__C': 1.0,
 'classifier__penalty': 'l1',
 'classifier__solver': 'saga'}

In [81]:
results = pd.DataFrame( best_model.cv_results_['params'] )

results['score_f1_micro'] = best_model.cv_results_['mean_test_f1_micro']
cols = results.columns.to_series().str.split('__').apply(lambda x: x[-1])
results.columns = cols

results

Unnamed: 0,classifier,C,penalty,solver,score_f1_micro
0,LogisticRegression(max_iter=10000),0.0001,l1,saga,0.599389
1,LogisticRegression(max_iter=10000),0.0001,l2,saga,0.682736
2,LogisticRegression(max_iter=10000),0.001,l1,saga,0.699004
3,LogisticRegression(max_iter=10000),0.001,l2,saga,0.699853
4,LogisticRegression(max_iter=10000),0.01,l1,saga,0.700801
5,LogisticRegression(max_iter=10000),0.01,l2,saga,0.700787
6,LogisticRegression(max_iter=10000),0.1,l1,saga,0.701098
7,LogisticRegression(max_iter=10000),0.1,l2,saga,0.701098
8,LogisticRegression(max_iter=10000),1.0,l1,saga,0.701169
9,LogisticRegression(max_iter=10000),1.0,l2,saga,0.701169


In [82]:
f1_micro_List.append(results[8:9])

## Trial 7

In [83]:
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.2, shuffle=True, random_state = 7)

In [84]:
# Create a pipeline - RF is a stand in, we will populate the classifier part below
pipe = Pipeline([('std', StandardScaler()),
                 ('classifier', LogisticRegression())])

# Create search space of candidate learning algorithms and their hyperparameters
# note lbfgs can't do l1, and if you pass penalty='none' it expects no C value
search_space = [{'classifier': [LogisticRegression(max_iter=10000)],
                 'classifier__solver': ['saga'],
                 'classifier__penalty': ['l1', 'l2'],
                 'classifier__C': np.logspace(-4, 4, 9)},
                {'classifier': [LogisticRegression(max_iter=10000)],
                 'classifier__solver': ['lbfgs'],
                 'classifier__penalty': ['l2'],
                 'classifier__C': np.logspace(-4, 4, 9)},
                {'classifier': [LogisticRegression(max_iter=10000)],
                 'classifier__solver': ['lbfgs','saga'],
                 'classifier__penalty': ['none']},
                {'classifier': [LogisticRegression(max_iter=10000)]}
                ]

# Create grid search
clf = GridSearchCV(pipe, search_space, cv=StratifiedKFold(n_splits=10),
                   scoring=['accuracy', 'precision', 'f1_micro'], refit=False,
                   verbose=0)

# Fit grid search
best_model = clf.fit(X_train, Y_train)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [85]:
best_model.cv_results_

{'mean_fit_time': array([0.05603466, 0.10090907, 0.11907024, 0.10900588, 0.11592526,
        0.10350008, 0.11353064, 0.10154216, 0.11814113, 0.10816143,
        0.12410607, 0.103193  , 0.12532997, 0.107128  , 0.11577501,
        0.1080066 , 0.12376661, 0.1053257 , 0.02266173, 0.0223201 ,
        0.02173741, 0.021365  , 0.02136874, 0.02135756, 0.02150311,
        0.02175233, 0.0218199 , 0.02152281, 0.10427606, 0.0215116 ]),
 'std_fit_time': array([0.01506615, 0.00679525, 0.01141412, 0.00885118, 0.01007513,
        0.00692923, 0.0065583 , 0.0055693 , 0.00904185, 0.00739944,
        0.00575316, 0.00918819, 0.01560695, 0.00808041, 0.00933599,
        0.00898281, 0.00577098, 0.006069  , 0.00072762, 0.00127004,
        0.00101701, 0.00044771, 0.00064354, 0.00109659, 0.00068508,
        0.00069443, 0.00107787, 0.00092961, 0.00494824, 0.00054496]),
 'mean_score_time': array([0.00750673, 0.00736382, 0.00750656, 0.00764372, 0.00805891,
        0.00766375, 0.00800679, 0.00781727, 0.00800717, 0.00

In [86]:
best_model.cv_results_['rank_test_accuracy']

array([30, 28, 27, 25,  4, 23,  3,  1, 14, 16,  5, 14, 22, 16, 16,  5,  5,
        5, 29, 25, 23,  1, 16,  5,  5,  5,  5,  5, 16, 16])

In [87]:
best_model.cv_results_['params'][ np.argmin(best_model.cv_results_['mean_test_accuracy'])]

{'classifier': LogisticRegression(max_iter=10000),
 'classifier__C': 0.0001,
 'classifier__penalty': 'l1',
 'classifier__solver': 'saga'}

In [88]:
results = pd.DataFrame( best_model.cv_results_['params'] )

results['score_acc'] = best_model.cv_results_['mean_test_accuracy']
cols = results.columns.to_series().str.split('__').apply(lambda x: x[-1])
results.columns = cols

results

Unnamed: 0,classifier,C,penalty,solver,score_acc
0,LogisticRegression(max_iter=10000),0.0001,l1,saga,0.599785
1,LogisticRegression(max_iter=10000),0.0001,l2,saga,0.682425
2,LogisticRegression(max_iter=10000),0.001,l1,saga,0.698353
3,LogisticRegression(max_iter=10000),0.001,l2,saga,0.699754
4,LogisticRegression(max_iter=10000),0.01,l1,saga,0.700829
5,LogisticRegression(max_iter=10000),0.01,l2,saga,0.700617
6,LogisticRegression(max_iter=10000),0.1,l1,saga,0.700843
7,LogisticRegression(max_iter=10000),0.1,l2,saga,0.700857
8,LogisticRegression(max_iter=10000),1.0,l1,saga,0.700815
9,LogisticRegression(max_iter=10000),1.0,l2,saga,0.700801


In [89]:
acc_list.append(results[0:1])

In [90]:
best_model.cv_results_['params'][ np.argmin(best_model.cv_results_['rank_test_precision'])]

{'classifier': LogisticRegression(max_iter=10000),
 'classifier__C': 0.0001,
 'classifier__penalty': 'l2',
 'classifier__solver': 'saga'}

In [91]:
results = pd.DataFrame( best_model.cv_results_['params'] )

results['score_precision'] = best_model.cv_results_['mean_test_precision']
cols = results.columns.to_series().str.split('__').apply(lambda x: x[-1])
results.columns = cols

results

Unnamed: 0,classifier,C,penalty,solver,score_precision
0,LogisticRegression(max_iter=10000),0.0001,l1,saga,0.0
1,LogisticRegression(max_iter=10000),0.0001,l2,saga,0.705316
2,LogisticRegression(max_iter=10000),0.001,l1,saga,0.668549
3,LogisticRegression(max_iter=10000),0.001,l2,saga,0.66417
4,LogisticRegression(max_iter=10000),0.01,l1,saga,0.659145
5,LogisticRegression(max_iter=10000),0.01,l2,saga,0.658329
6,LogisticRegression(max_iter=10000),0.1,l1,saga,0.658052
7,LogisticRegression(max_iter=10000),0.1,l2,saga,0.658011
8,LogisticRegression(max_iter=10000),1.0,l1,saga,0.657855
9,LogisticRegression(max_iter=10000),1.0,l2,saga,0.657826


In [92]:
roc_auc_ovr_List.append(results[1:2])

In [93]:
best_model.cv_results_['params'][ np.argmin(best_model.cv_results_['rank_test_f1_micro'])]

{'classifier': LogisticRegression(max_iter=10000),
 'classifier__C': 0.1,
 'classifier__penalty': 'l2',
 'classifier__solver': 'saga'}

In [94]:
results = pd.DataFrame( best_model.cv_results_['params'] )

results['score_f1_micro'] = best_model.cv_results_['mean_test_f1_micro']
cols = results.columns.to_series().str.split('__').apply(lambda x: x[-1])
results.columns = cols

results

Unnamed: 0,classifier,C,penalty,solver,score_f1_micro
0,LogisticRegression(max_iter=10000),0.0001,l1,saga,0.599785
1,LogisticRegression(max_iter=10000),0.0001,l2,saga,0.682425
2,LogisticRegression(max_iter=10000),0.001,l1,saga,0.698353
3,LogisticRegression(max_iter=10000),0.001,l2,saga,0.699754
4,LogisticRegression(max_iter=10000),0.01,l1,saga,0.700829
5,LogisticRegression(max_iter=10000),0.01,l2,saga,0.700617
6,LogisticRegression(max_iter=10000),0.1,l1,saga,0.700843
7,LogisticRegression(max_iter=10000),0.1,l2,saga,0.700857
8,LogisticRegression(max_iter=10000),1.0,l1,saga,0.700815
9,LogisticRegression(max_iter=10000),1.0,l2,saga,0.700801


In [95]:
f1_micro_List.append(results[7:8])

## Trial 8

In [96]:
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.2, shuffle=True, random_state = 8)

In [97]:
# Create a pipeline - RF is a stand in, we will populate the classifier part below
pipe = Pipeline([('std', StandardScaler()),
                 ('classifier', LogisticRegression())])

# Create search space of candidate learning algorithms and their hyperparameters
# note lbfgs can't do l1, and if you pass penalty='none' it expects no C value
search_space = [{'classifier': [LogisticRegression(max_iter=10000)],
                 'classifier__solver': ['saga'],
                 'classifier__penalty': ['l1', 'l2'],
                 'classifier__C': np.logspace(-4, 4, 9)},
                {'classifier': [LogisticRegression(max_iter=10000)],
                 'classifier__solver': ['lbfgs'],
                 'classifier__penalty': ['l2'],
                 'classifier__C': np.logspace(-4, 4, 9)},
                {'classifier': [LogisticRegression(max_iter=10000)],
                 'classifier__solver': ['lbfgs','saga'],
                 'classifier__penalty': ['none']},
                {'classifier': [LogisticRegression(max_iter=10000)]}
                ]

# Create grid search
clf = GridSearchCV(pipe, search_space, cv=StratifiedKFold(n_splits=10),
                   scoring=['accuracy', 'precision', 'f1_micro'], refit=False,
                   verbose=0)

# Fit grid search
best_model = clf.fit(X_train, Y_train)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [98]:
best_model.cv_results_

{'mean_fit_time': array([0.04678934, 0.09954183, 0.11104252, 0.10280142, 0.11270027,
        0.11310141, 0.13514037, 0.11243141, 0.12552671, 0.11808836,
        0.13361847, 0.12123921, 0.13190758, 0.11197882, 0.13912077,
        0.11503415, 0.13074541, 0.11524057, 0.02419181, 0.02254026,
        0.02313309, 0.02313685, 0.02262032, 0.02293642, 0.02300429,
        0.0224072 , 0.02342122, 0.02322154, 0.10895872, 0.02293863]),
 'std_fit_time': array([0.00542952, 0.00541426, 0.00795919, 0.0095545 , 0.01043283,
        0.01150007, 0.01140674, 0.00599009, 0.00871672, 0.00903209,
        0.01493289, 0.01327   , 0.00826346, 0.00600628, 0.00867936,
        0.00633688, 0.00694949, 0.00771142, 0.00128885, 0.00065932,
        0.0020432 , 0.00128864, 0.00135762, 0.0010327 , 0.00079909,
        0.0009025 , 0.00135798, 0.00087238, 0.00720119, 0.00082281]),
 'mean_score_time': array([0.00740163, 0.00770652, 0.00780709, 0.00780642, 0.00800881,
        0.00835495, 0.00852909, 0.00815818, 0.00811939, 0.00

In [99]:
best_model.cv_results_['rank_test_accuracy']

array([30, 28, 27, 25, 22, 23, 21, 16, 11,  1, 18,  2, 11, 18,  2,  2, 11,
       18, 29, 26, 23, 16, 11,  2,  2,  2,  2,  2,  2, 11])

In [100]:
best_model.cv_results_['params'][ np.argmin(best_model.cv_results_['mean_test_accuracy'])]

{'classifier': LogisticRegression(max_iter=10000),
 'classifier__C': 0.0001,
 'classifier__penalty': 'l1',
 'classifier__solver': 'saga'}

In [101]:
results = pd.DataFrame( best_model.cv_results_['params'] )

results['score_acc'] = best_model.cv_results_['mean_test_accuracy']
cols = results.columns.to_series().str.split('__').apply(lambda x: x[-1])
results.columns = cols

results

Unnamed: 0,classifier,C,penalty,solver,score_acc
0,LogisticRegression(max_iter=10000),0.0001,l1,saga,0.599969
1,LogisticRegression(max_iter=10000),0.0001,l2,saga,0.682892
2,LogisticRegression(max_iter=10000),0.001,l1,saga,0.699895
3,LogisticRegression(max_iter=10000),0.001,l2,saga,0.700871
4,LogisticRegression(max_iter=10000),0.01,l1,saga,0.701465
5,LogisticRegression(max_iter=10000),0.01,l2,saga,0.701409
6,LogisticRegression(max_iter=10000),0.1,l1,saga,0.701607
7,LogisticRegression(max_iter=10000),0.1,l2,saga,0.701663
8,LogisticRegression(max_iter=10000),1.0,l1,saga,0.701678
9,LogisticRegression(max_iter=10000),1.0,l2,saga,0.701692


In [102]:
acc_list.append(results[0:1])

In [103]:
best_model.cv_results_['params'][ np.argmin(best_model.cv_results_['rank_test_precision'])]

{'classifier': LogisticRegression(max_iter=10000),
 'classifier__C': 0.0001,
 'classifier__penalty': 'l2',
 'classifier__solver': 'saga'}

In [104]:
results = pd.DataFrame( best_model.cv_results_['params'] )

results['score_precision'] = best_model.cv_results_['mean_test_precision']
cols = results.columns.to_series().str.split('__').apply(lambda x: x[-1])
results.columns = cols

results

Unnamed: 0,classifier,C,penalty,solver,score_precision
0,LogisticRegression(max_iter=10000),0.0001,l1,saga,0.0
1,LogisticRegression(max_iter=10000),0.0001,l2,saga,0.705015
2,LogisticRegression(max_iter=10000),0.001,l1,saga,0.670621
3,LogisticRegression(max_iter=10000),0.001,l2,saga,0.665425
4,LogisticRegression(max_iter=10000),0.01,l1,saga,0.659714
5,LogisticRegression(max_iter=10000),0.01,l2,saga,0.659178
6,LogisticRegression(max_iter=10000),0.1,l1,saga,0.658834
7,LogisticRegression(max_iter=10000),0.1,l2,saga,0.658879
8,LogisticRegression(max_iter=10000),1.0,l1,saga,0.658851
9,LogisticRegression(max_iter=10000),1.0,l2,saga,0.658879


In [105]:
roc_auc_ovr_List.append(results[1:2])

In [106]:
best_model.cv_results_['params'][ np.argmin(best_model.cv_results_['rank_test_f1_micro'])]

{'classifier': LogisticRegression(max_iter=10000),
 'classifier__C': 1.0,
 'classifier__penalty': 'l2',
 'classifier__solver': 'saga'}

In [107]:
results = pd.DataFrame( best_model.cv_results_['params'] )

results['score_f1_micro'] = best_model.cv_results_['mean_test_f1_micro']
cols = results.columns.to_series().str.split('__').apply(lambda x: x[-1])
results.columns = cols

results

Unnamed: 0,classifier,C,penalty,solver,score_f1_micro
0,LogisticRegression(max_iter=10000),0.0001,l1,saga,0.599969
1,LogisticRegression(max_iter=10000),0.0001,l2,saga,0.682892
2,LogisticRegression(max_iter=10000),0.001,l1,saga,0.699895
3,LogisticRegression(max_iter=10000),0.001,l2,saga,0.700871
4,LogisticRegression(max_iter=10000),0.01,l1,saga,0.701465
5,LogisticRegression(max_iter=10000),0.01,l2,saga,0.701409
6,LogisticRegression(max_iter=10000),0.1,l1,saga,0.701607
7,LogisticRegression(max_iter=10000),0.1,l2,saga,0.701663
8,LogisticRegression(max_iter=10000),1.0,l1,saga,0.701678
9,LogisticRegression(max_iter=10000),1.0,l2,saga,0.701692


In [108]:
f1_micro_List.append(results[9:10])

## Trial 9

In [109]:
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.2, shuffle=True, random_state = 9)

In [110]:
# Create a pipeline - RF is a stand in, we will populate the classifier part below
pipe = Pipeline([('std', StandardScaler()),
                 ('classifier', LogisticRegression())])

# Create search space of candidate learning algorithms and their hyperparameters
# note lbfgs can't do l1, and if you pass penalty='none' it expects no C value
search_space = [{'classifier': [LogisticRegression(max_iter=10000)],
                 'classifier__solver': ['saga'],
                 'classifier__penalty': ['l1', 'l2'],
                 'classifier__C': np.logspace(-4, 4, 9)},
                {'classifier': [LogisticRegression(max_iter=10000)],
                 'classifier__solver': ['lbfgs'],
                 'classifier__penalty': ['l2'],
                 'classifier__C': np.logspace(-4, 4, 9)},
                {'classifier': [LogisticRegression(max_iter=10000)],
                 'classifier__solver': ['lbfgs','saga'],
                 'classifier__penalty': ['none']},
                {'classifier': [LogisticRegression(max_iter=10000)]}
                ]

# Create grid search
clf = GridSearchCV(pipe, search_space, cv=StratifiedKFold(n_splits=10),
                   scoring=['accuracy', 'precision', 'f1_micro'], refit=False,
                   verbose=0)

# Fit grid search
best_model = clf.fit(X_train, Y_train)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [111]:
best_model.cv_results_

{'mean_fit_time': array([0.04809718, 0.10023692, 0.113309  , 0.10107071, 0.11666787,
        0.10162554, 0.11393485, 0.10478768, 0.11857188, 0.10603652,
        0.1204525 , 0.10413423, 0.11036119, 0.10189722, 0.1181031 ,
        0.10527678, 0.1179261 , 0.10408208, 0.02292995, 0.02170815,
        0.02179921, 0.02157373, 0.02168822, 0.02213433, 0.02208335,
        0.02194805, 0.02187631, 0.02217865, 0.10934405, 0.02152536]),
 'std_fit_time': array([0.00781227, 0.00855313, 0.00706955, 0.00841118, 0.00780082,
        0.0045712 , 0.00651535, 0.00799121, 0.01541881, 0.00830545,
        0.00797919, 0.00526679, 0.00687355, 0.0052886 , 0.00941067,
        0.00723202, 0.00582025, 0.00526648, 0.00063922, 0.00076796,
        0.00060905, 0.00039376, 0.00092781, 0.00095655, 0.00087093,
        0.00068533, 0.00060121, 0.00070549, 0.01021588, 0.00045808]),
 'mean_score_time': array([0.00740767, 0.00770922, 0.00803773, 0.0078552 , 0.00786266,
        0.00770655, 0.00765858, 0.00800626, 0.00781729, 0.00

In [112]:
best_model.cv_results_['rank_test_accuracy']

array([30, 28, 27, 25, 24, 22, 21, 12,  4,  1, 13, 13, 13, 13, 19, 13,  4,
       11, 28, 25, 22, 18,  1,  4,  4,  4,  4,  4, 19,  1])

In [113]:
best_model.cv_results_['params'][ np.argmin(best_model.cv_results_['mean_test_accuracy'])]

{'classifier': LogisticRegression(max_iter=10000),
 'classifier__C': 0.0001,
 'classifier__penalty': 'l1',
 'classifier__solver': 'saga'}

In [114]:
results = pd.DataFrame( best_model.cv_results_['params'] )

results['score_acc'] = best_model.cv_results_['mean_test_accuracy']
cols = results.columns.to_series().str.split('__').apply(lambda x: x[-1])
results.columns = cols

results

Unnamed: 0,classifier,C,penalty,solver,score_acc
0,LogisticRegression(max_iter=10000),0.0001,l1,saga,0.60011
1,LogisticRegression(max_iter=10000),0.0001,l2,saga,0.683401
2,LogisticRegression(max_iter=10000),0.001,l1,saga,0.699117
3,LogisticRegression(max_iter=10000),0.001,l2,saga,0.700023
4,LogisticRegression(max_iter=10000),0.01,l1,saga,0.701197
5,LogisticRegression(max_iter=10000),0.01,l2,saga,0.701211
6,LogisticRegression(max_iter=10000),0.1,l1,saga,0.701296
7,LogisticRegression(max_iter=10000),0.1,l2,saga,0.701324
8,LogisticRegression(max_iter=10000),1.0,l1,saga,0.701338
9,LogisticRegression(max_iter=10000),1.0,l2,saga,0.701352


In [115]:
acc_list.append(results[0:1])

In [116]:
best_model.cv_results_['params'][ np.argmin(best_model.cv_results_['rank_test_precision'])]

{'classifier': LogisticRegression(max_iter=10000),
 'classifier__C': 0.0001,
 'classifier__penalty': 'l2',
 'classifier__solver': 'saga'}

In [117]:
results = pd.DataFrame( best_model.cv_results_['params'] )

results['score_precision'] = best_model.cv_results_['mean_test_precision']
cols = results.columns.to_series().str.split('__').apply(lambda x: x[-1])
results.columns = cols

results

Unnamed: 0,classifier,C,penalty,solver,score_precision
0,LogisticRegression(max_iter=10000),0.0001,l1,saga,0.0
1,LogisticRegression(max_iter=10000),0.0001,l2,saga,0.705634
2,LogisticRegression(max_iter=10000),0.001,l1,saga,0.669054
3,LogisticRegression(max_iter=10000),0.001,l2,saga,0.66405
4,LogisticRegression(max_iter=10000),0.01,l1,saga,0.65935
5,LogisticRegression(max_iter=10000),0.01,l2,saga,0.659043
6,LogisticRegression(max_iter=10000),0.1,l1,saga,0.6585
7,LogisticRegression(max_iter=10000),0.1,l2,saga,0.658488
8,LogisticRegression(max_iter=10000),1.0,l1,saga,0.658446
9,LogisticRegression(max_iter=10000),1.0,l2,saga,0.658461


In [118]:
roc_auc_ovr_List.append(results[1:2])

In [119]:
best_model.cv_results_['params'][ np.argmin(best_model.cv_results_['rank_test_f1_micro'])]

{'classifier': LogisticRegression(max_iter=10000),
 'classifier__C': 1.0,
 'classifier__penalty': 'l2',
 'classifier__solver': 'saga'}

In [120]:
results = pd.DataFrame( best_model.cv_results_['params'] )

results['score_f1_micro'] = best_model.cv_results_['mean_test_f1_micro']
cols = results.columns.to_series().str.split('__').apply(lambda x: x[-1])
results.columns = cols

results

Unnamed: 0,classifier,C,penalty,solver,score_f1_micro
0,LogisticRegression(max_iter=10000),0.0001,l1,saga,0.60011
1,LogisticRegression(max_iter=10000),0.0001,l2,saga,0.683401
2,LogisticRegression(max_iter=10000),0.001,l1,saga,0.699117
3,LogisticRegression(max_iter=10000),0.001,l2,saga,0.700023
4,LogisticRegression(max_iter=10000),0.01,l1,saga,0.701197
5,LogisticRegression(max_iter=10000),0.01,l2,saga,0.701211
6,LogisticRegression(max_iter=10000),0.1,l1,saga,0.701296
7,LogisticRegression(max_iter=10000),0.1,l2,saga,0.701324
8,LogisticRegression(max_iter=10000),1.0,l1,saga,0.701338
9,LogisticRegression(max_iter=10000),1.0,l2,saga,0.701352


In [121]:
f1_micro_List.append(results[9:10])

## Trial 10

In [122]:
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.2, shuffle=True, random_state = 10)

In [123]:
# Create a pipeline - RF is a stand in, we will populate the classifier part below
pipe = Pipeline([('std', StandardScaler()),
                 ('classifier', LogisticRegression())])

# Create search space of candidate learning algorithms and their hyperparameters
# note lbfgs can't do l1, and if you pass penalty='none' it expects no C value
search_space = [{'classifier': [LogisticRegression(max_iter=10000)],
                 'classifier__solver': ['saga'],
                 'classifier__penalty': ['l1', 'l2'],
                 'classifier__C': np.logspace(-4, 4, 9)},
                {'classifier': [LogisticRegression(max_iter=10000)],
                 'classifier__solver': ['lbfgs'],
                 'classifier__penalty': ['l2'],
                 'classifier__C': np.logspace(-4, 4, 9)},
                {'classifier': [LogisticRegression(max_iter=10000)],
                 'classifier__solver': ['lbfgs','saga'],
                 'classifier__penalty': ['none']},
                {'classifier': [LogisticRegression(max_iter=10000)]}
                ]

# Create grid search
clf = GridSearchCV(pipe, search_space, cv=StratifiedKFold(n_splits=10),
                   scoring=['accuracy', 'precision', 'f1_micro'], refit=False,
                   verbose=0)

# Fit grid search
best_model = clf.fit(X_train, Y_train)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [124]:
best_model.cv_results_

{'mean_fit_time': array([0.05068355, 0.10720069, 0.11325662, 0.09956424, 0.11455624,
        0.09883525, 0.11641784, 0.09921119, 0.11348932, 0.09970057,
        0.1185461 , 0.09866321, 0.12117403, 0.10150585, 0.12009635,
        0.10695384, 0.1215899 , 0.10464509, 0.02264526, 0.02177875,
        0.02112539, 0.02155454, 0.02171004, 0.02151945, 0.02172308,
        0.02152593, 0.02165306, 0.02176452, 0.09978416, 0.0219713 ]),
 'std_fit_time': array([0.00498382, 0.0059571 , 0.00700139, 0.00751154, 0.00936902,
        0.00396015, 0.00624293, 0.00233418, 0.00866296, 0.00765867,
        0.00747655, 0.00526311, 0.01195136, 0.00560096, 0.00624616,
        0.01026857, 0.00570143, 0.00581748, 0.00073239, 0.00096278,
        0.0005982 , 0.00045425, 0.00061283, 0.00067146, 0.00063634,
        0.00076445, 0.00059246, 0.00065624, 0.00872789, 0.000807  ]),
 'mean_score_time': array([0.00710642, 0.00780687, 0.00770671, 0.00775456, 0.00775728,
        0.00730643, 0.00760648, 0.00760658, 0.00770683, 0.00

In [125]:
best_model.cv_results_['rank_test_accuracy']

array([30, 28, 27, 25, 24, 22, 21, 16, 15, 19,  2,  2, 17,  2,  2,  2,  2,
       17, 28, 26, 22, 20,  2,  2,  2,  2,  2,  2,  1,  2])

In [126]:
best_model.cv_results_['params'][ np.argmin(best_model.cv_results_['mean_test_accuracy'])]

{'classifier': LogisticRegression(max_iter=10000),
 'classifier__C': 0.0001,
 'classifier__penalty': 'l1',
 'classifier__solver': 'saga'}

In [127]:
results = pd.DataFrame( best_model.cv_results_['params'] )

results['score_acc'] = best_model.cv_results_['mean_test_accuracy']
cols = results.columns.to_series().str.split('__').apply(lambda x: x[-1])
results.columns = cols

results

Unnamed: 0,classifier,C,penalty,solver,score_acc
0,LogisticRegression(max_iter=10000),0.0001,l1,saga,0.600422
1,LogisticRegression(max_iter=10000),0.0001,l2,saga,0.68326
2,LogisticRegression(max_iter=10000),0.001,l1,saga,0.699315
3,LogisticRegression(max_iter=10000),0.001,l2,saga,0.700518
4,LogisticRegression(max_iter=10000),0.01,l1,saga,0.700659
5,LogisticRegression(max_iter=10000),0.01,l2,saga,0.700758
6,LogisticRegression(max_iter=10000),0.1,l1,saga,0.700772
7,LogisticRegression(max_iter=10000),0.1,l2,saga,0.700871
8,LogisticRegression(max_iter=10000),1.0,l1,saga,0.700871
9,LogisticRegression(max_iter=10000),1.0,l2,saga,0.700857


In [128]:
acc_list.append(results[0:1])

In [129]:
best_model.cv_results_['params'][ np.argmin(best_model.cv_results_['rank_test_precision'])]

{'classifier': LogisticRegression(max_iter=10000),
 'classifier__C': 0.0001,
 'classifier__penalty': 'l2',
 'classifier__solver': 'saga'}

In [130]:
results = pd.DataFrame( best_model.cv_results_['params'] )

results['score_precision'] = best_model.cv_results_['mean_test_precision']
cols = results.columns.to_series().str.split('__').apply(lambda x: x[-1])
results.columns = cols

results

Unnamed: 0,classifier,C,penalty,solver,score_precision
0,LogisticRegression(max_iter=10000),0.0001,l1,saga,0.0
1,LogisticRegression(max_iter=10000),0.0001,l2,saga,0.706381
2,LogisticRegression(max_iter=10000),0.001,l1,saga,0.670198
3,LogisticRegression(max_iter=10000),0.001,l2,saga,0.665689
4,LogisticRegression(max_iter=10000),0.01,l1,saga,0.658796
5,LogisticRegression(max_iter=10000),0.01,l2,saga,0.658596
6,LogisticRegression(max_iter=10000),0.1,l1,saga,0.658009
7,LogisticRegression(max_iter=10000),0.1,l2,saga,0.65813
8,LogisticRegression(max_iter=10000),1.0,l1,saga,0.658001
9,LogisticRegression(max_iter=10000),1.0,l2,saga,0.657971


In [131]:
roc_auc_ovr_List.append(results[1:2])

In [132]:
best_model.cv_results_['params'][ np.argmin(best_model.cv_results_['rank_test_f1_micro'])]

{'classifier': LogisticRegression(max_iter=10000),
 'classifier__penalty': 'none',
 'classifier__solver': 'saga'}

In [133]:
results = pd.DataFrame( best_model.cv_results_['params'] )

results['score_f1_micro'] = best_model.cv_results_['mean_test_f1_micro']
cols = results.columns.to_series().str.split('__').apply(lambda x: x[-1])
results.columns = cols

results

Unnamed: 0,classifier,C,penalty,solver,score_f1_micro
0,LogisticRegression(max_iter=10000),0.0001,l1,saga,0.600422
1,LogisticRegression(max_iter=10000),0.0001,l2,saga,0.68326
2,LogisticRegression(max_iter=10000),0.001,l1,saga,0.699315
3,LogisticRegression(max_iter=10000),0.001,l2,saga,0.700518
4,LogisticRegression(max_iter=10000),0.01,l1,saga,0.700659
5,LogisticRegression(max_iter=10000),0.01,l2,saga,0.700758
6,LogisticRegression(max_iter=10000),0.1,l1,saga,0.700772
7,LogisticRegression(max_iter=10000),0.1,l2,saga,0.700871
8,LogisticRegression(max_iter=10000),1.0,l1,saga,0.700871
9,LogisticRegression(max_iter=10000),1.0,l2,saga,0.700857


In [134]:
f1_micro_List.append(results[28:29])

In [135]:
end = datetime.now()
time_taken = end - start
print('Time: ',time_taken) 

Time:  0:08:51.095589


## Results

In [158]:
accuracy = [x['score_acc'][0] for x in acc_list]
accuracy_c = [x['C'][0] for x in acc_list]
accuracy_penalty = [x['penalty'][0] for x in acc_list]
roc = [x['score_precision'].reset_index(drop=True)[0] for x in roc_auc_ovr_List]
roc_c = [x['C'].reset_index(drop=True)[0] for x in roc_auc_ovr_List]
roc_penalty = [x['penalty'].reset_index(drop=True)[0] for x in roc_auc_ovr_List]
f1 = [x['score_f1_micro'].reset_index(drop=True)[0] for x in f1_micro_List]
f1_c = [x['C'].reset_index(drop=True)[0] for x in f1_micro_List]
f1_penalty = [x['penalty'].reset_index(drop=True)[0] for x in f1_micro_List]

In [159]:
data = {'Accuracy': accuracy, 'Accuracy C': accuracy_c, 'Accuracy Penalty': accuracy_penalty,
        'Precision': roc, 'Precision C': roc_c, 'Precision Penalty': roc_penalty,
        'F1_micro':f1, 'F1_micro C': f1_c, 'F1_micro Penalty': f1_penalty}

In [160]:
trainingResults  = pd.DataFrame(data = data)
pd.options.display.max_colwidth = 100
trainingResults

Unnamed: 0,Accuracy,Accuracy C,Accuracy Penalty,Precision,Precision C,Precision Penalty,F1_micro,F1_micro C,F1_micro Penalty
0,0.600535,0.0001,l1,0.704018,0.0001,l2,0.701041,1.0,l1
1,0.59912,0.0001,l1,0.70864,0.0001,l2,0.702781,1.0,l1
2,0.599601,0.0001,l1,0.706928,0.0001,l2,0.70155,0.1,l2
3,0.599332,0.0001,l1,0.704444,0.0001,l2,0.702159,100.0,l1
4,0.600167,0.0001,l1,0.709146,0.0001,l2,0.704068,0.01,l2
5,0.599389,0.0001,l1,0.705512,0.0001,l2,0.701169,1.0,l1
6,0.599785,0.0001,l1,0.705316,0.0001,l2,0.700857,0.1,l2
7,0.599969,0.0001,l1,0.705015,0.0001,l2,0.701692,1.0,l2
8,0.60011,0.0001,l1,0.705634,0.0001,l2,0.701352,1.0,l2
9,0.600422,0.0001,l1,0.706381,0.0001,l2,0.7009,,none


In [161]:
trainingResults.to_csv('LR_pca_trainingResults.csv')

### Testing Model

In [162]:
df = pd.read_csv('Preprocessed_pca.csv', index_col = 0)
trainingResults = pd.read_csv('LR_pca_trainingResults.csv', index_col = 0)
trainingResults

Unnamed: 0,Accuracy,Accuracy C,Accuracy Penalty,Precision,Precision C,Precision Penalty,F1_micro,F1_micro C,F1_micro Penalty
0,0.600535,0.0001,l1,0.704018,0.0001,l2,0.701041,1.0,l1
1,0.59912,0.0001,l1,0.70864,0.0001,l2,0.702781,1.0,l1
2,0.599601,0.0001,l1,0.706928,0.0001,l2,0.70155,0.1,l2
3,0.599332,0.0001,l1,0.704444,0.0001,l2,0.702159,100.0,l1
4,0.600167,0.0001,l1,0.709146,0.0001,l2,0.704068,0.01,l2
5,0.599389,0.0001,l1,0.705512,0.0001,l2,0.701169,1.0,l1
6,0.599785,0.0001,l1,0.705316,0.0001,l2,0.700857,0.1,l2
7,0.599969,0.0001,l1,0.705015,0.0001,l2,0.701692,1.0,l2
8,0.60011,0.0001,l1,0.705634,0.0001,l2,0.701352,1.0,l2
9,0.600422,0.0001,l1,0.706381,0.0001,l2,0.7009,,none


In [163]:
X = df[['PhysHlth', 'BMI', 'MentHlth', 'Income']]
y = df.loc[:, 'Diabetes_binary']
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.2, shuffle=True, random_state = 11)

In [164]:
LR = LogisticRegression(penalty = 'l1', C = 0.0001, solver = 'saga').fit(X_train, Y_train)

In [165]:
from sklearn.metrics import f1_score, precision_score
acc = LR.score(X_test, Y_test)
predicted = LR.predict(X_test)
f1 = f1_score(Y_test, predicted)
precision = precision_score(Y_test, predicted)

In [166]:
print("Accuracy: " + str(acc) + '\n'
      + "Precision: " + str(precision) + '\n'
      + "F1: " + str(f1))

Accuracy: 0.6967124992927064
Precision: 0.6890831652095942
F1: 0.5342370524852278
