In [1]:
# Standard library
import os
import pickle

# EDA and data handling
import numpy as np
import pandas as pd

# Modeling
from sklearn import metrics
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

## Prepare the dataset

In [2]:
# Load the iris dataset that ships with scikit-learn.
# (load_iris is imported in the import cell at the top of the notebook, so all
# dependencies are visible in one place.)
iris = load_iris()

In [3]:
# The loader returns a 'bunch', which behaves much like a dictionary.
# Show its type, its keys, the class labels and the feature names, one per line.
for summary in (type(iris), iris.keys(), iris.target_names, iris.feature_names):
    print(summary)

<class 'sklearn.utils.Bunch'>
dict_keys(['data', 'target', 'target_names', 'DESCR', 'feature_names', 'filename'])
['setosa' 'versicolor' 'virginica']
['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']


In [4]:
# Peek at the first five rows of the feature matrix and the first five target codes.
print(iris.data[:5], iris.target[:5], sep='\n')

[[5.1 3.5 1.4 0.2]
 [4.9 3.  1.4 0.2]
 [4.7 3.2 1.3 0.2]
 [4.6 3.1 1.5 0.2]
 [5.  3.6 1.4 0.2]]
[0 0 0 0 0]


In [5]:
# Build a dataframe from the measurements using short column names
# (sl/sw = sepal length/width, pl/pw = petal length/width, in the same order as
# iris.feature_names), then attach the integer species code as a 'species' column.
short_names = ['sl', 'sw', 'pl', 'pw']
df = pd.DataFrame(iris.data, columns=short_names).assign(species=iris.target)
df.head()

Unnamed: 0,sl,sw,pl,pw,species
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [10]:
# Count the observations per species code, listed in ascending code order.
species_counts = df['species'].value_counts()
species_counts.sort_index()

0    50
1    50
2    50
Name: species, dtype: int64

## a simple KNN model (with only 2 predictors)
While in practice a 2-predictor model is typically too simple (i.e., it tends to underfit: high bias rather than high variance), for the purposes of building a visualization it's easier to draw a scatterplot when there are only two dimensions to deal with.

In [38]:
# Predictors: the 'sl' and 'pl' columns; target: the integer species code.
X = df[['sl', 'pl']]
y = df['species']

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=52
)
print('length of y-test:', len(y_test))

# Distance-weighted 5-nearest-neighbours classifier with a Euclidean metric,
# fitted on the training portion only.
mymodel = KNeighborsClassifier(n_neighbors=5, weights='distance', metric='euclidean')
mymodel.fit(X_train, y_train)

# Score the model on the held-out rows.
y_preds = mymodel.predict(X_test)
test_accuracy = metrics.accuracy_score(y_test, y_preds)
print('accuracy score: ', round(test_accuracy, 2))

# Display the confusion matrix as a dataframe (rows: true class, columns: predicted class).
confusion = metrics.confusion_matrix(y_test, y_preds)
pd.DataFrame(confusion)

length of y-test: 45
accuracy score:  0.93


Unnamed: 0,0,1,2
0,11,0,0
1,0,14,3
2,0,0,17


## Predict for a new observation

In [12]:
# A made-up flower to classify. The two values follow the model's predictor
# order ('sl', 'pl'), and the outer list makes it a single-row 2-D input.
fake_sl, fake_pl = 4.9, 2.7
new_observation = [[fake_sl, fake_pl]]

In [13]:
# Classify the new observation. predict returns one entry per input row,
# so the first element is the class code for our single row.
prediction = mymodel.predict(new_observation)
predicted_code = prediction[0]
print(predicted_code)

1


In [14]:
# Which species name is associated with that prediction?
# The predicted integer code indexes directly into iris.target_names.
predicted_species = iris.target_names[prediction[0]]
print(predicted_species)

versicolor


In [15]:
# What are the distances from the new observation to its nearest neighbors
# (k=5 for the model fitted above)?
# kneighbors returns (distances, indices); [0] selects the distances and the second
# [0] selects the row belonging to our single query point.
# Rounding on the array and converting with .tolist() yields plain Python floats, so
# the printed list reads [0.36, 0.6, ...] regardless of NumPy version (NumPy >= 2
# would otherwise print each element as np.float64(...)).
neighbor_distances = mymodel.kneighbors(new_observation)[0][0].round(2).tolist()
print('distances:', neighbor_distances)

distances: [0.36, 0.6, 0.61, 1.1, 1.1]


In [16]:
# What are the indices of the neighbors nearest to that new observation
# (k=5 for the model fitted above)?
# These are row positions within the data passed to mymodel.fit, not the original
# dataframe labels. return_distance=False skips returning the distances we do not
# need here, and .tolist() gives plain Python ints so the list prints as
# [9, 35, ...] regardless of NumPy version.
neighbors = mymodel.kneighbors(new_observation, return_distance=False)[0].tolist()
print('indices:', neighbors)

indices: [9, 35, 10, 73, 104]


In [21]:
# Re-attach the species labels to the training features, then renumber the rows
# 0..n-1 so the row labels coincide with the positional indices that
# mymodel.kneighbors reports for the training data.
train = X_train.assign(species=y_train).reset_index(drop=True)

In [22]:
# Keep only the training rows that are the nearest neighbors of the new
# observation, selected by position and in the order kneighbors returned them.
df_neighbors = train.iloc[neighbors]
df_neighbors

Unnamed: 0,sl,pl,species
9,5.1,3.0,1
35,4.9,3.3,1
10,5.0,3.3,1
73,5.0,1.6,0
104,4.8,1.6,0


In [21]:
# Fit one KNN model per neighborhood size and pickle each for use in the plotly dash app.
#
# NOTE: the loop rebinds `mymodel`, so after this cell runs it refers to the k=25
# model. Re-run the model-fitting cell before re-running any of the k=5 cells above.

# Make sure the output folder exists so the cell also works on a fresh checkout.
os.makedirs('resources', exist_ok=True)

for k in [5,10,15,20,25]:
    mymodel = KNeighborsClassifier(n_neighbors=k, weights='distance', metric='euclidean')
    mymodel.fit(X_train, y_train)
    # The context manager closes the file even if pickling raises.
    with open(f'resources/model_k{k}.pkl', 'wb') as file:
        pickle.dump(mymodel, file)