# Hyperparameter Tuning

In [1]:
import pandas as pd
import warnings
warnings.filterwarnings(action='ignore')

In [2]:
#import data
df = pd.read_csv('diabetes.csv')

In [3]:
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [4]:
df.Outcome.value_counts() #Class distribution

Outcome
0    500
1    268
Name: count, dtype: int64

In [5]:
#Split data into attributes and class
y = df.Outcome
X = df.drop(columns=["Outcome"])

In [6]:
#perform training and test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42,stratify=df.Outcome)

In [7]:
#Logistic regression
from sklearn.linear_model import LogisticRegression

In [8]:
clf = LogisticRegression()
clf.fit(X_train,y_train)

In [9]:
y_pred = clf.predict(X_test)
# Model Evaluation metrics 
from sklearn.metrics import accuracy_score,recall_score,precision_score,f1_score
print('Accuracy Score : ' + str(accuracy_score(y_test,y_pred)))
print('Precision Score : ' + str(precision_score(y_test,y_pred,average="weighted")))
print('Recall Score : ' + str(recall_score(y_test,y_pred,average="weighted")))
print('F1 Score : ' + str(f1_score(y_test,y_pred,average="weighted")))

Accuracy Score : 0.7291666666666666
Precision Score : 0.7210379340853793
Recall Score : 0.7291666666666666
F1 Score : 0.722049284611855


##### Grid Search to maximize Recall

In [10]:
#Grid Search
from sklearn.model_selection import GridSearchCV

In [11]:
clf = LogisticRegression()

In [12]:
grid_values = {
    'penalty': ['l1', 'l2'],
    'C':[0.001,0.1,1,2,100,1000],
    'solver' : ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
              }

In [13]:
grid_clf_acc = GridSearchCV(clf, param_grid = grid_values,
                            scoring = 'accuracy',
                           verbose=3) # try for accuracy, precision, f1

In [14]:
grid_clf_acc.fit(X_train, y_train)

Fitting 5 folds for each of 60 candidates, totalling 300 fits
[CV 1/5] END C=0.001, penalty=l1, solver=newton-cg;, score=nan total time=   0.0s
[CV 2/5] END C=0.001, penalty=l1, solver=newton-cg;, score=nan total time=   0.0s
[CV 3/5] END C=0.001, penalty=l1, solver=newton-cg;, score=nan total time=   0.0s
[CV 4/5] END C=0.001, penalty=l1, solver=newton-cg;, score=nan total time=   0.0s
[CV 5/5] END C=0.001, penalty=l1, solver=newton-cg;, score=nan total time=   0.0s
[CV 1/5] END ...C=0.001, penalty=l1, solver=lbfgs;, score=nan total time=   0.0s
[CV 2/5] END ...C=0.001, penalty=l1, solver=lbfgs;, score=nan total time=   0.0s
[CV 3/5] END ...C=0.001, penalty=l1, solver=lbfgs;, score=nan total time=   0.0s
[CV 4/5] END ...C=0.001, penalty=l1, solver=lbfgs;, score=nan total time=   0.0s
[CV 5/5] END ...C=0.001, penalty=l1, solver=lbfgs;, score=nan total time=   0.0s
[CV 1/5] END C=0.001, penalty=l1, solver=liblinear;, score=0.655 total time=   0.0s
[CV 2/5] END C=0.001, penalty=l1, solve

In [15]:
#Predict values based on new parameters
y_pred_acc = grid_clf_acc.predict(X_test)

In [16]:
# New Model Evaluation metrics 
print('Accuracy Score : ' + str(accuracy_score(y_test,y_pred_acc)))
print('Precision Score : ' + str(precision_score(y_test,y_pred_acc,average="weighted")))
print('Recall Score : ' + str(recall_score(y_test,y_pred_acc,average="weighted")))
print('F1 Score : ' + str(f1_score(y_test,y_pred_acc,average="weighted")))

Accuracy Score : 0.734375
Precision Score : 0.7269728203781513
Recall Score : 0.734375
F1 Score : 0.7280949210354173


In [17]:
grid_clf_acc.best_params_

{'C': 0.1, 'penalty': 'l2', 'solver': 'lbfgs'}

In [18]:
grid_clf_acc.best_estimator_

In [19]:
model = LogisticRegression(C=0.1, penalty='l2',solver='lbfgs')

In [20]:
model.fit(X_train, y_train)

In [21]:
model.score(X_train, y_train)

0.7916666666666666