# K Nearest Neighbors with Python

## Import Libraries



In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn import datasets, neighbors

# !pip install mlxtend (you might need this one)
from mlxtend.plotting import plot_decision_regions


import seaborn as sns

%matplotlib inline

## Getting Data

In [None]:
ushape = pd.read_csv('ushape.csv')

In [None]:
ushape.head()

### Seaborn pairplot is very useful. 

In [None]:
sns.pairplot(ushape, hue='class')

In [None]:
# Pass x and y by keyword: recent seaborn releases accept only `data`
# positionally, so the positional form sns.scatterplot('X', 'Y', ...) fails there.
sns.scatterplot(data=ushape, x='X', y='Y', hue='class')

In [None]:
plt.style.use('ggplot')
ushape.plot.scatter('X', 'Y', c=np.where(ushape['class']==1,'tab:orange','tab:purple'))

In [None]:
linearsep = pd.read_csv('linearsep.csv')
linearsep.plot.scatter('X', 'Y', c=np.where(linearsep['class']==1.0,'tab:orange','tab:purple'))

In [None]:
xor = pd.read_csv('xor.csv')
xor.plot.scatter('X', 'Y', c=np.where(xor['class']==1.0,'tab:orange','tab:purple'))

### Decision Region Function

- Here I try to visualize the decision regions for only two attributes. 
- Different parameters of $k$ are used. 

In [None]:
def knn_comparison(data, k):
    """Fit a k-nearest-neighbours classifier on two features and plot its decision regions.

    Parameters
    ----------
    data : object indexable by column name
        Must provide 'X', 'Y' and 'class' columns (the notebook passes
        pandas DataFrames read from CSV).
    k : int
        Number of neighbours given to ``KNeighborsClassifier``; also shown
        in the plot title.

    The figure is displayed with ``plt.show()``; nothing is returned.
    """
    features = data[['X', 'Y']].values
    # Labels are cast to int before fitting/plotting
    # (presumably because plot_decision_regions wants integer labels -- confirm).
    labels = data['class'].astype(int).values

    # Train the classifier on the two plotted attributes only.
    model = neighbors.KNeighborsClassifier(n_neighbors=k)
    model.fit(features, labels)

    # Shade the plane by predicted class and overlay the training points.
    plot_decision_regions(features, labels, clf=model, legend=2)

    # Annotate and display.
    plt.title('Knn with K=' + str(k))
    plt.xlabel('X')
    plt.ylabel('Y')
    plt.show()

In [None]:
for i in range(2, 10, 2):
    knn_comparison(ushape, i)

In [None]:
for i in range(2, 10, 2):
    knn_comparison(linearsep, i)

## Multiple Attributes 
Set index_col=0 to use the first column as the index.

In [None]:
df = pd.read_csv("Classified Data.csv",index_col=0)

In [None]:
df.describe()

In [None]:
# colors = np.where(df["TARGET CLASS"]==1,'g','b')

df.plot.scatter('WTT', 'PTI', c=np.where(df["TARGET CLASS"]==1,'tab:orange','tab:purple'))

In [None]:
df.head()

## Standardize the Variables

Because the KNN classifier predicts the class of a given test observation by identifying the observations that are nearest to it, the scale of the variables matters. Any variables that are on a large scale will have a much larger effect on the distance between the observations, and hence on the KNN classifier, than variables that are on a small scale.

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
scaler = StandardScaler()

In [None]:
scaler.fit(df.drop('TARGET CLASS',axis=1))

In [None]:
scaled_features = scaler.transform(df.drop('TARGET CLASS',axis=1))

In [None]:
scaled_features

In [None]:
df.columns[:-1]

In [None]:
df_feat = pd.DataFrame(scaled_features,columns=df.columns[:-1])
df_feat.head()

## Train Test Split

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
X_train, X_test, y_train, y_test = train_test_split(df_feat, df['TARGET CLASS'],test_size=0.30, random_state = 421)

## Using KNN

Remember that we are trying to come up with a model to predict whether an observation belongs to the TARGET CLASS or not. We'll start with k=1.

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
knn = KNeighborsClassifier(n_neighbors=1)

In [None]:
knn.fit(X_train,y_train)

In [None]:
pred = knn.predict(X_test)

## Predictions and Evaluations

Let's evaluate our KNN model!

In [None]:
from sklearn.metrics import classification_report,confusion_matrix

In [None]:
print(confusion_matrix(y_test,pred))

In [None]:
len(pred)

In [None]:
print(classification_report(y_test,pred))

## Choosing a K Value

Let's go ahead and use the elbow method to pick a good K Value:

In [None]:
# Test-set misclassification rate for each k from 1 through 39,
# stored in order so error_rate[0] corresponds to k=1.
error_rate = []

# Will take some time: one fit + predict per candidate k.
for i in range(1, 40):
    knn = KNeighborsClassifier(n_neighbors=i).fit(X_train, y_train)
    pred_i = knn.predict(X_test)
    # Mean of the boolean mismatch mask == fraction of wrong predictions.
    error_rate.append(np.mean(pred_i != y_test))

In [None]:
error_rate

# not very readable with the list, could use some visualizations

In [None]:
# Use a larger canvas than the default so the individual markers stay readable.
plt.figure(figsize=(10, 6))

# The x values must match the range used when error_rate was built (k = 1..39).
plt.plot(
    range(1, 40),
    error_rate,
    color='blue',
    linestyle='dashed',
    marker='o',
    markerfacecolor='red',
    markersize=10,
)
plt.title('Error Rate vs. K Value')
plt.xlabel('K')
plt.ylabel('Error Rate')

Here we can see that after around K>23 the error rate just tends to hover around 0.05-0.06. Let's retrain the model with a K in that range and check the classification report!

In [None]:
# Baseline: refit the original k=1 model so its report can be compared
# against a larger k.
knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(X_train, y_train)
pred = knn.predict(X_test)

# One print call; sep='\n' reproduces the output of printing each item
# on its own (label, blank spacer, confusion matrix, blank spacer, report).
print(
    'WITH K=1',
    '\n',
    confusion_matrix(y_test, pred),
    '\n',
    classification_report(y_test, pred),
    sep='\n',
)

In [None]:
# NOW WITH K=26
# k lives in one variable so the fitted model and the printed label cannot
# disagree (the label used to read 'WITH K=23' while the model used 26 neighbours).
k = 26
knn = KNeighborsClassifier(n_neighbors=k)

knn.fit(X_train,y_train)
pred = knn.predict(X_test)

print('WITH K=' + str(k))
print('\n')
print(confusion_matrix(y_test,pred))
print('\n')
print(classification_report(y_test,pred))