In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import os

In [2]:
df = pd.read_csv(os.path.join("..", "Resources", "diabetes.csv"))
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [3]:
target = df["Outcome"]
target_names = ["negative", "positive"]

In [4]:
data = df.drop("Outcome", axis=1)
feature_names = data.columns
data.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6,148,72,35,0,33.6,0.627,50
1,1,85,66,29,0,26.6,0.351,31
2,8,183,64,0,0,23.3,0.672,32
3,1,89,66,23,94,28.1,0.167,21
4,0,137,40,35,168,43.1,2.288,33


In [5]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data, target, random_state=42)

In [6]:
# KNN classifier
from sklearn.neighbors import KNeighborsClassifier
model = KNeighborsClassifier()

In [7]:
# Create the grid search estimator along with a parameter object containing the values to adjust.
# Try adjusting n_neighbors with values of 1 through 19. Adjust leaf_size by using 10, 50, 100, and 500.
# Include both uniform and distance options for weights.
from sklearn.model_selection import GridSearchCV
param_grid = {
    'n_neighbors': [1, 3, 5, 7, 9, 11, 13, 15, 17, 19],
    'weights': ['uniform', 'distance'],
    'leaf_size': [10, 50, 100, 500]
}
grid_clf = GridSearchCV(model, param_grid, verbose=3)

In [8]:
# Fit the model by using the grid search estimator.
# This will take the KNN model and try each combination of parameters.
grid_clf.fit(X_train, y_train)

Fitting 5 folds for each of 80 candidates, totalling 400 fits
[CV] leaf_size=10, n_neighbors=1, weights=uniform ....................
[CV]  leaf_size=10, n_neighbors=1, weights=uniform, score=0.741, total=   0.0s
[CV] leaf_size=10, n_neighbors=1, weights=uniform ....................
[CV]  leaf_size=10, n_neighbors=1, weights=uniform, score=0.670, total=   0.0s
[CV] leaf_size=10, n_neighbors=1, weights=uniform ....................
[CV]  leaf_size=10, n_neighbors=1, weights=uniform, score=0.765, total=   0.0s
[CV] leaf_size=10, n_neighbors=1, weights=uniform ....................
[CV]  leaf_size=10, n_neighbors=1, weights=uniform, score=0.626, total=   0.0s
[CV] leaf_size=10, n_neighbors=1, weights=uniform ....................
[CV]  leaf_size=10, n_neighbors=1, weights=uniform, score=0.617, total=   0.0s
[CV] leaf_size=10, n_neighbors=1, weights=distance ...................
[CV]  leaf_size=10, n_neighbors=1, weights=distance, score=0.741, total=   0.0s
[CV] leaf_size=10, n_neighbors=1, wei

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s



[CV] leaf_size=10, n_neighbors=3, weights=distance ...................
[CV]  leaf_size=10, n_neighbors=3, weights=distance, score=0.687, total=   0.0s
[CV] leaf_size=10, n_neighbors=5, weights=uniform ....................
[CV]  leaf_size=10, n_neighbors=5, weights=uniform, score=0.759, total=   0.0s
[CV] leaf_size=10, n_neighbors=5, weights=uniform ....................
[CV]  leaf_size=10, n_neighbors=5, weights=uniform, score=0.696, total=   0.0s
[CV] leaf_size=10, n_neighbors=5, weights=uniform ....................
[CV]  leaf_size=10, n_neighbors=5, weights=uniform, score=0.748, total=   0.0s
[CV] leaf_size=10, n_neighbors=5, weights=uniform ....................
[CV]  leaf_size=10, n_neighbors=5, weights=uniform, score=0.678, total=   0.0s
[CV] leaf_size=10, n_neighbors=5, weights=uniform ....................
[CV]  leaf_size=10, n_neighbors=5, weights=uniform, score=0.722, total=   0.0s
[CV] leaf_size=10, n_neighbors=5, weights=distance ...................
[CV]  leaf_size=10, n_neigh

[CV]  leaf_size=10, n_neighbors=19, weights=uniform, score=0.696, total=   0.0s
[CV] leaf_size=10, n_neighbors=19, weights=uniform ...................
[CV]  leaf_size=10, n_neighbors=19, weights=uniform, score=0.739, total=   0.0s
[CV] leaf_size=10, n_neighbors=19, weights=uniform ...................
[CV]  leaf_size=10, n_neighbors=19, weights=uniform, score=0.696, total=   0.0s
[CV] leaf_size=10, n_neighbors=19, weights=uniform ...................
[CV]  leaf_size=10, n_neighbors=19, weights=uniform, score=0.739, total=   0.0s
[CV] leaf_size=10, n_neighbors=19, weights=distance ..................
[CV]  leaf_size=10, n_neighbors=19, weights=distance, score=0.759, total=   0.0s
[CV] leaf_size=10, n_neighbors=19, weights=distance ..................
[CV]  leaf_size=10, n_neighbors=19, weights=distance, score=0.722, total=   0.0s
[CV] leaf_size=10, n_neighbors=19, weights=distance ..................
[CV]  leaf_size=10, n_neighbors=19, weights=distance, score=0.722, total=   0.0s
[CV] leaf_s

[CV]  leaf_size=50, n_neighbors=15, weights=uniform, score=0.722, total=   0.0s
[CV] leaf_size=50, n_neighbors=15, weights=uniform ...................
[CV]  leaf_size=50, n_neighbors=15, weights=uniform, score=0.713, total=   0.0s
[CV] leaf_size=50, n_neighbors=15, weights=uniform ...................
[CV]  leaf_size=50, n_neighbors=15, weights=uniform, score=0.722, total=   0.0s
[CV] leaf_size=50, n_neighbors=15, weights=distance ..................
[CV]  leaf_size=50, n_neighbors=15, weights=distance, score=0.741, total=   0.0s
[CV] leaf_size=50, n_neighbors=15, weights=distance ..................
[CV]  leaf_size=50, n_neighbors=15, weights=distance, score=0.722, total=   0.0s
[CV] leaf_size=50, n_neighbors=15, weights=distance ..................
[CV]  leaf_size=50, n_neighbors=15, weights=distance, score=0.739, total=   0.0s
[CV] leaf_size=50, n_neighbors=15, weights=distance ..................
[CV]  leaf_size=50, n_neighbors=15, weights=distance, score=0.748, total=   0.0s
[CV] leaf_

[CV]  leaf_size=100, n_neighbors=9, weights=uniform, score=0.748, total=   0.0s
[CV] leaf_size=100, n_neighbors=9, weights=uniform ...................
[CV]  leaf_size=100, n_neighbors=9, weights=uniform, score=0.704, total=   0.0s
[CV] leaf_size=100, n_neighbors=9, weights=uniform ...................
[CV]  leaf_size=100, n_neighbors=9, weights=uniform, score=0.774, total=   0.0s
[CV] leaf_size=100, n_neighbors=9, weights=distance ..................
[CV]  leaf_size=100, n_neighbors=9, weights=distance, score=0.759, total=   0.0s
[CV] leaf_size=100, n_neighbors=9, weights=distance ..................
[CV]  leaf_size=100, n_neighbors=9, weights=distance, score=0.722, total=   0.0s
[CV] leaf_size=100, n_neighbors=9, weights=distance ..................
[CV]  leaf_size=100, n_neighbors=9, weights=distance, score=0.757, total=   0.0s
[CV] leaf_size=100, n_neighbors=9, weights=distance ..................
[CV]  leaf_size=100, n_neighbors=9, weights=distance, score=0.713, total=   0.0s
[CV] leaf_

[CV]  leaf_size=500, n_neighbors=1, weights=distance, score=0.741, total=   0.0s
[CV] leaf_size=500, n_neighbors=1, weights=distance ..................
[CV]  leaf_size=500, n_neighbors=1, weights=distance, score=0.670, total=   0.0s
[CV] leaf_size=500, n_neighbors=1, weights=distance ..................
[CV]  leaf_size=500, n_neighbors=1, weights=distance, score=0.765, total=   0.0s
[CV] leaf_size=500, n_neighbors=1, weights=distance ..................
[CV]  leaf_size=500, n_neighbors=1, weights=distance, score=0.626, total=   0.0s
[CV] leaf_size=500, n_neighbors=1, weights=distance ..................
[CV]  leaf_size=500, n_neighbors=1, weights=distance, score=0.617, total=   0.0s
[CV] leaf_size=500, n_neighbors=3, weights=uniform ...................
[CV]  leaf_size=500, n_neighbors=3, weights=uniform, score=0.759, total=   0.0s
[CV] leaf_size=500, n_neighbors=3, weights=uniform ...................
[CV]  leaf_size=500, n_neighbors=3, weights=uniform, score=0.696, total=   0.0s
[CV] leaf

[CV]  leaf_size=500, n_neighbors=15, weights=uniform, score=0.733, total=   0.0s
[CV] leaf_size=500, n_neighbors=15, weights=uniform ..................
[CV]  leaf_size=500, n_neighbors=15, weights=uniform, score=0.739, total=   0.0s
[CV] leaf_size=500, n_neighbors=15, weights=uniform ..................
[CV]  leaf_size=500, n_neighbors=15, weights=uniform, score=0.722, total=   0.0s
[CV] leaf_size=500, n_neighbors=15, weights=uniform ..................
[CV]  leaf_size=500, n_neighbors=15, weights=uniform, score=0.713, total=   0.0s
[CV] leaf_size=500, n_neighbors=15, weights=uniform ..................
[CV]  leaf_size=500, n_neighbors=15, weights=uniform, score=0.722, total=   0.0s
[CV] leaf_size=500, n_neighbors=15, weights=distance .................
[CV]  leaf_size=500, n_neighbors=15, weights=distance, score=0.741, total=   0.0s
[CV] leaf_size=500, n_neighbors=15, weights=distance .................
[CV]  leaf_size=500, n_neighbors=15, weights=distance, score=0.722, total=   0.0s
[CV] 

[Parallel(n_jobs=1)]: Done 400 out of 400 | elapsed:    3.2s finished


GridSearchCV(cv=None, error_score=nan,
             estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                            metric='minkowski',
                                            metric_params=None, n_jobs=None,
                                            n_neighbors=5, p=2,
                                            weights='uniform'),
             iid='deprecated', n_jobs=None,
             param_grid={'leaf_size': [10, 50, 100, 500],
                         'n_neighbors': [1, 3, 5, 7, 9, 11, 13, 15, 17, 19],
                         'weights': ['uniform', 'distance']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=3)

In [9]:
# List the best parameters for this dataset
print(grid_clf.best_params_)

{'leaf_size': 10, 'n_neighbors': 11, 'weights': 'distance'}


In [10]:
# List the best score
print(grid_clf.best_score_)

0.7464767616191904


In [11]:
# Create the parameter object for the randomized search estimator.
# Try adjusting n_neighbors with values of 1 through 19. Adjust leaf_size by using a range from 1 to 500.
# Include both uniform and distance options for weights.
param_grid = {
    'n_neighbors': np.arange(1,20,2),
    'weights': ['uniform', 'distance'],
    'leaf_size': np.arange(1, 500)
}
param_grid

{'n_neighbors': array([ 1,  3,  5,  7,  9, 11, 13, 15, 17, 19]),
 'weights': ['uniform', 'distance'],
 'leaf_size': array([  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,
         14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,
         27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,
         40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,  52,
         53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,  65,
         66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,
         79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,  91,
         92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103, 104,
        105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117,
        118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130,
        131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143,
        144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156,
        1

In [12]:
# Create the randomized search estimator by using the logistic regression model and the parameter grid that you created.
from sklearn.model_selection import RandomizedSearchCV
random_clf = RandomizedSearchCV(model, param_grid, random_state=0, verbose=3)

In [13]:
# Fit the model b using the randomized search estimator.
# This will take the logistic regression model and a random sample of combinations of parameters.
random_clf.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] weights=uniform, n_neighbors=13, leaf_size=137 ..................
[CV]  weights=uniform, n_neighbors=13, leaf_size=137, score=0.733, total=   0.0s
[CV] weights=uniform, n_neighbors=13, leaf_size=137 ..................
[CV]  weights=uniform, n_neighbors=13, leaf_size=137, score=0.704, total=   0.0s

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s



[CV] weights=uniform, n_neighbors=13, leaf_size=137 ..................
[CV]  weights=uniform, n_neighbors=13, leaf_size=137, score=0.704, total=   0.0s
[CV] weights=uniform, n_neighbors=13, leaf_size=137 ..................
[CV]  weights=uniform, n_neighbors=13, leaf_size=137, score=0.713, total=   0.0s
[CV] weights=uniform, n_neighbors=13, leaf_size=137 ..................
[CV]  weights=uniform, n_neighbors=13, leaf_size=137, score=0.739, total=   0.0s
[CV] weights=distance, n_neighbors=5, leaf_size=493 ..................
[CV]  weights=distance, n_neighbors=5, leaf_size=493, score=0.759, total=   0.0s
[CV] weights=distance, n_neighbors=5, leaf_size=493 ..................
[CV]  weights=distance, n_neighbors=5, leaf_size=493, score=0.704, total=   0.0s
[CV] weights=distance, n_neighbors=5, leaf_size=493 ..................
[CV]  weights=distance, n_neighbors=5, leaf_size=493, score=0.757, total=   0.0s
[CV] weights=distance, n_neighbors=5, leaf_size=493 ..................
[CV]  weights=di

[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:    0.4s finished


RandomizedSearchCV(cv=None, error_score=nan,
                   estimator=KNeighborsClassifier(algorithm='auto',
                                                  leaf_size=30,
                                                  metric='minkowski',
                                                  metric_params=None,
                                                  n_jobs=None, n_neighbors=5,
                                                  p=2, weights='uniform'),
                   iid='deprecated', n_iter=10, n_jobs=None,
                   param_distributions={'leaf_size': array([  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,
        14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,
        27,  28,  29,  30...
       456, 457, 458, 459, 460, 461, 462, 463, 464, 465, 466, 467, 468,
       469, 470, 471, 472, 473, 474, 475, 476, 477, 478, 479, 480, 481,
       482, 483, 484, 485, 486, 487, 488, 489, 490, 491, 492, 493, 494,
       495, 496, 497, 498, 

In [14]:
# List the best parameters for this dataset
print(random_clf.best_params_)

{'weights': 'distance', 'n_neighbors': 11, 'leaf_size': 395}


In [15]:
# List the best score
print(random_clf.best_score_)

0.7464767616191904


In [16]:
# Make predictions with the hypertuned model
predictions = random_clf.predict(X_test)

In [17]:
# Calculate the classification report
from sklearn.metrics import classification_report
print(classification_report(y_test, predictions,
                            target_names=target_names))

              precision    recall  f1-score   support

    negative       0.75      0.75      0.75       123
    positive       0.56      0.57      0.56        69

    accuracy                           0.68       192
   macro avg       0.66      0.66      0.66       192
weighted avg       0.68      0.68      0.68       192

