In [1]:
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
import pandas as pd
import numpy as np
import os

In [2]:
df = pd.read_csv(os.path.join("..", "Resources", "diabetes.csv"))
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [3]:
target = df["Outcome"]
target_names = ["negative", "positive"]

In [4]:
data = df.drop("Outcome", axis=1)
feature_names = data.columns
data.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6,148,72,35,0,33.6,0.627,50
1,1,85,66,29,0,26.6,0.351,31
2,8,183,64,0,0,23.3,0.672,32
3,1,89,66,23,94,28.1,0.167,21
4,0,137,40,35,168,43.1,2.288,33


In [5]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data, target, random_state=42)

In [6]:
# Create the Logistic Model
model = LogisticRegression(max_iter=1000)
model

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [7]:
# Create the GridSearch estimator along with a parameter object containing the values to adjust
# Try adjusting `C` with values of 0.001, 0.01, 0.1, 1, 10, 100. Adjust `tol` by using .0001, 0.001, and 0.01.
from sklearn.model_selection import GridSearchCV
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100],
              'tol': [0.0001, 0.001, 0.01]}
grid_clf = GridSearchCV(model, param_grid, verbose=3)

In [8]:
# Fit the model by using the grid search estimator. 
# This will take the LogisticRegression model and try each combination of parameters.
grid_clf.fit(X_train, y_train)

Fitting 5 folds for each of 18 candidates, totalling 90 fits
[CV] C=0.001, tol=0.0001 .............................................
[CV] ................. C=0.001, tol=0.0001, score=0.802, total=   0.1s
[CV] C=0.001, tol=0.0001 .............................................
[CV] ................. C=0.001, tol=0.0001, score=0.739, total=   0.0s
[CV] C=0.001, tol=0.0001 .............................................
[CV] ................. C=0.001, tol=0.0001, score=0.809, total=   0.0s
[CV] C=0.001, tol=0.0001 .............................................
[CV] ................. C=0.001, tol=0.0001, score=0.739, total=   0.0s
[CV] C=0.001, tol=0.0001 .............................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s


[CV] ................. C=0.001, tol=0.0001, score=0.765, total=   0.1s
[CV] C=0.001, tol=0.001 ..............................................
[CV] .................. C=0.001, tol=0.001, score=0.802, total=   0.1s
[CV] C=0.001, tol=0.001 ..............................................
[CV] .................. C=0.001, tol=0.001, score=0.739, total=   0.0s
[CV] C=0.001, tol=0.001 ..............................................
[CV] .................. C=0.001, tol=0.001, score=0.809, total=   0.0s
[CV] C=0.001, tol=0.001 ..............................................
[CV] .................. C=0.001, tol=0.001, score=0.739, total=   0.0s
[CV] C=0.001, tol=0.001 ..............................................
[CV] .................. C=0.001, tol=0.001, score=0.765, total=   0.0s
[CV] C=0.001, tol=0.01 ...............................................
[CV] ................... C=0.001, tol=0.01, score=0.802, total=   0.1s
[CV] C=0.001, tol=0.01 ...............................................
[CV] .

[CV] ..................... C=10, tol=0.001, score=0.802, total=   0.0s
[CV] C=10, tol=0.001 .................................................
[CV] ..................... C=10, tol=0.001, score=0.757, total=   0.1s
[CV] C=10, tol=0.001 .................................................
[CV] ..................... C=10, tol=0.001, score=0.800, total=   0.1s
[CV] C=10, tol=0.001 .................................................
[CV] ..................... C=10, tol=0.001, score=0.730, total=   0.0s
[CV] C=10, tol=0.001 .................................................
[CV] ..................... C=10, tol=0.001, score=0.774, total=   0.0s
[CV] C=10, tol=0.01 ..................................................
[CV] ...................... C=10, tol=0.01, score=0.802, total=   0.0s
[CV] C=10, tol=0.01 ..................................................
[CV] ...................... C=10, tol=0.01, score=0.757, total=   0.1s
[CV] C=10, tol=0.01 ..................................................
[CV] .

[Parallel(n_jobs=1)]: Done  90 out of  90 | elapsed:    4.0s finished


GridSearchCV(cv=None, error_score=nan,
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=1000, multi_class='auto',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='lbfgs',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='deprecated', n_jobs=None,
             param_grid={'C': [0.001, 0.01, 0.1, 1, 10, 100],
                         'tol': [0.0001, 0.001, 0.01]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=3)

In [9]:
# List the best parameters for this dataset
print(grid_clf.best_params_)

{'C': 0.1, 'tol': 0.0001}


In [10]:
# List the best score
print(grid_clf.best_score_)

0.7760119940029985


In [11]:
# Create the parameter object for the RandomizedSearchCV estimator
# Try adjusting `C` with values from 0 to 10 and 'tol` with values from 0 to 0.001
param_grid = {
    'C' : np.arange(0, 10, 0.01),
    'tol': np.arange(0, 0.001, 1e-5),
}
param_grid

{'C': array([0.  , 0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1 ,
        0.11, 0.12, 0.13, 0.14, 0.15, 0.16, 0.17, 0.18, 0.19, 0.2 , 0.21,
        0.22, 0.23, 0.24, 0.25, 0.26, 0.27, 0.28, 0.29, 0.3 , 0.31, 0.32,
        0.33, 0.34, 0.35, 0.36, 0.37, 0.38, 0.39, 0.4 , 0.41, 0.42, 0.43,
        0.44, 0.45, 0.46, 0.47, 0.48, 0.49, 0.5 , 0.51, 0.52, 0.53, 0.54,
        0.55, 0.56, 0.57, 0.58, 0.59, 0.6 , 0.61, 0.62, 0.63, 0.64, 0.65,
        0.66, 0.67, 0.68, 0.69, 0.7 , 0.71, 0.72, 0.73, 0.74, 0.75, 0.76,
        0.77, 0.78, 0.79, 0.8 , 0.81, 0.82, 0.83, 0.84, 0.85, 0.86, 0.87,
        0.88, 0.89, 0.9 , 0.91, 0.92, 0.93, 0.94, 0.95, 0.96, 0.97, 0.98,
        0.99, 1.  , 1.01, 1.02, 1.03, 1.04, 1.05, 1.06, 1.07, 1.08, 1.09,
        1.1 , 1.11, 1.12, 1.13, 1.14, 1.15, 1.16, 1.17, 1.18, 1.19, 1.2 ,
        1.21, 1.22, 1.23, 1.24, 1.25, 1.26, 1.27, 1.28, 1.29, 1.3 , 1.31,
        1.32, 1.33, 1.34, 1.35, 1.36, 1.37, 1.38, 1.39, 1.4 , 1.41, 1.42,
        1.43, 1.44, 1.45, 1.46, 1

In [12]:
# Create the RandomizedSearch estimator by using the LogisticRegression model and the parameter grid that you created
from sklearn.model_selection import RandomizedSearchCV
random_clf = RandomizedSearchCV(model, param_grid, random_state=0, verbose=3)

In [13]:
# Fit the model by using the randomized search estimator. 
# This will take the LogisticRegression model and a random sample of combinations of parameters
random_clf.fit(X_train, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] tol=0.00068, C=6.82 .............................................
[CV] ................. tol=0.00068, C=6.82, score=0.802, total=   0.0s
[CV] tol=0.00068, C=6.82 .............................................
[CV] ................. tol=0.00068, C=6.82, score=0.757, total=   0.0s
[CV] tol=0.00068, C=6.82 .............................................
[CV] ................. tol=0.00068, C=6.82, score=0.800, total=   0.1s
[CV] tol=0.00068, C=6.82 .............................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s


[CV] ................. tol=0.00068, C=6.82, score=0.730, total=   0.0s
[CV] tol=0.00068, C=6.82 .............................................
[CV] ................. tol=0.00068, C=6.82, score=0.774, total=   0.0s
[CV] tol=0.00067, C=4.3500000000000005 ...............................
[CV] ... tol=0.00067, C=4.3500000000000005, score=0.802, total=   0.0s
[CV] tol=0.00067, C=4.3500000000000005 ...............................
[CV] ... tol=0.00067, C=4.3500000000000005, score=0.757, total=   0.1s
[CV] tol=0.00067, C=4.3500000000000005 ...............................
[CV] ... tol=0.00067, C=4.3500000000000005, score=0.800, total=   0.1s
[CV] tol=0.00067, C=4.3500000000000005 ...............................
[CV] ... tol=0.00067, C=4.3500000000000005, score=0.730, total=   0.0s
[CV] tol=0.00067, C=4.3500000000000005 ...............................
[CV] ... tol=0.00067, C=4.3500000000000005, score=0.774, total=   0.0s
[CV] tol=0.00013000000000000002, C=4.26 ..............................
[CV] .

[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:    2.3s finished


RandomizedSearchCV(cv=None, error_score=nan,
                   estimator=LogisticRegression(C=1.0, class_weight=None,
                                                dual=False, fit_intercept=True,
                                                intercept_scaling=1,
                                                l1_ratio=None, max_iter=1000,
                                                multi_class='auto', n_jobs=None,
                                                penalty='l2', random_state=None,
                                                solver='lbfgs', tol=0.0001,
                                                verbose=0, warm_start=False),
                   iid='deprecated', n_iter=10, n_jobs=None,
                   param_distributions={...
       6.3e-04, 6.4e-04, 6.5e-04, 6.6e-04, 6.7e-04, 6.8e-04, 6.9e-04,
       7.0e-04, 7.1e-04, 7.2e-04, 7.3e-04, 7.4e-04, 7.5e-04, 7.6e-04,
       7.7e-04, 7.8e-04, 7.9e-04, 8.0e-04, 8.1e-04, 8.2e-04, 8.3e-04,
       8.4e-04, 8.5e-04

In [14]:
# List the best parameters for this dataset
print(random_clf.best_params_)

{'tol': 0.00043000000000000004, 'C': 2.12}


In [15]:
# List the best score
print(random_clf.best_score_)

0.7742578710644676


In [17]:
# Make predictions with the hypertuned model
predictions = random_clf.predict(X_test)

In [18]:
# Calculate the classification report
from sklearn.metrics import classification_report
print(classification_report(y_test, predictions,
                            target_names=target_names))

              precision    recall  f1-score   support

    negative       0.80      0.77      0.79       123
    positive       0.62      0.65      0.63        69

    accuracy                           0.73       192
   macro avg       0.71      0.71      0.71       192
weighted avg       0.73      0.73      0.73       192

