
# KNN -  K Nearest Neighbors - Classification via Scikit-Learn

The Scikit-Learn nearest-neighbors classification example can be found here:
https://scikit-learn.org/stable/auto_examples/neighbors/plot_classification.html

In [5]:
from sklearn import linear_model
import seaborn as sns
import matplotlib.pyplot as plt

import numpy as np
import pandas as pd


#### Load the dataset

In [34]:
from palmerpenguins import load_penguins
# Load the penguins table into `df` and preview its first 20 rows.
# The recorded preview includes rows with missing measurements (e.g. index 3);
# nothing in this cell drops them.
df = load_penguins()
df.head(20)

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,year
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,male,2007
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,female,2007
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,female,2007
3,Adelie,Torgersen,,,,,,2007
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,female,2007
5,Adelie,Torgersen,39.3,20.6,190.0,3650.0,male,2007
6,Adelie,Torgersen,38.9,17.8,181.0,3625.0,female,2007
7,Adelie,Torgersen,39.2,19.6,195.0,4675.0,male,2007
8,Adelie,Torgersen,34.1,18.1,193.0,3475.0,,2007
9,Adelie,Torgersen,42.0,20.2,190.0,4250.0,,2007


In [31]:
# NOTE(review): this repeats the load_penguins() cell above. The ValueError
# recorded under this cell refers to an example dataset named 'penguins',
# which this call never passes, so the saved output looks stale -- confirm
# and consider deleting this cell.
df = load_penguins()
df.head(20)

ValueError: 'penguins' is not one of the example datasets.

In [None]:
# Pull seaborn's "penguins" example dataset, then discard every row that
# contains a missing value before previewing the first 20 rows.
# This rebinds `df`, replacing whatever an earlier cell stored there.
df = sns.load_dataset("penguins")
df = df.dropna()
df.head(20)

#### Let's make a KNN model to classify a given penguin's species based on its bill measurements.

In [None]:
# Bill length against bill depth, with one colour per species.
sns.scatterplot(
    data=df,
    x='bill_length_mm',
    y='bill_depth_mm',
    hue='species',
    alpha=0.9,
)

## Train|Test Split and Scaling Data

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [None]:
# Target: the species label. Features: the two bill measurements,
# named explicitly rather than taken as a label slice.
y = df['species']
X = df[['bill_length_mm', 'bill_depth_mm']]

In [None]:
# Preview the first rows of the feature matrix.
X.head()

In [None]:
# Preview the first rows of the species labels.
y.head()

In [None]:
# Hold out 30% of the rows for testing; the fixed random_state keeps the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=85,
    test_size=0.3,
)

In [None]:
# Unfitted scaler with default settings; it is fitted in a later cell.
scaler = StandardScaler()

In [None]:
# NOTE(review): interactive exploration only -- help() dumps the full
# StandardScaler documentation into the cell output; consider removing
# this cell before sharing the notebook.
help(StandardScaler)

In [None]:
# Fit the scaler on the training features only, then apply those same
# training statistics to the test features. Re-fitting on the test split
# (fit_transform) would scale it by its own mean/std, leaking test-set
# information and putting train and test on inconsistent scales.
scaled_X_train = scaler.fit_transform(X_train)
scaled_X_test = scaler.transform(X_test)

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# Baseline classifier with n_neighbors=5; other K values are swept in the
# elbow-method section further down.
knn_model = KNeighborsClassifier(n_neighbors=5)

In [None]:
# Fit the classifier on the scaled training features and their species labels.
knn_model.fit(scaled_X_train,y_train)

## Model Evaluation

In [None]:
# Predict a species label for every row of the scaled test features.
y_pred = knn_model.predict(scaled_X_test)

In [None]:
# Display the predicted labels.
y_pred

In [None]:
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score

In [None]:
# Fraction of test rows whose predicted species matches the true species.
accuracy_score(y_true=y_test, y_pred=y_pred)

In [None]:
# Per-class breakdown of the test predictions: rows are the true species,
# columns are the predicted species.
confusion_matrix(y_true=y_test, y_pred=y_pred)

## Elbow Method for Choosing Reasonable K Values


In [None]:
# Sweep K from 1 to 19: refit on the scaled training data for each K and
# record the misclassification rate (1 - accuracy) on the scaled test data.
# NOTE(review): the loop rebinds `knn_model`, so once it finishes that name
# refers to the last model fitted here, not the earlier n_neighbors=5 one.
test_error_rates = []

for k in range(1, 20):
    # fit() returns the estimator itself, so construction and fitting chain.
    knn_model = KNeighborsClassifier(n_neighbors=k).fit(scaled_X_train, y_train)
    y_pred_test = knn_model.predict(scaled_X_test)

    test_error = 1 - accuracy_score(y_test, y_pred_test)
    test_error_rates += [test_error]

In [None]:
# Plot the test error recorded for each K in the sweep.
# The x values are derived from how many error rates were recorded (K starts
# at 1 in the sweep), so the axis cannot fall out of step with the sweep's
# range if that range is edited later.
k_values = range(1, len(test_error_rates) + 1)

plt.figure(figsize=(10, 6), dpi=200)
plt.plot(k_values, test_error_rates, label='Test Error')
plt.title('Test Error Rate vs. K')
plt.legend()
plt.ylabel('Error Rate')
plt.xlabel("K Value")