### iris data - https://archive.ics.uci.edu/ml/datasets/Iris

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Column names for the raw UCI file, which ships without a header row.
names = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'class']
df = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data', header=None, names=names)

# First four columns are the features; `.iloc` is used for the positional
# slice because `DataFrame.ix` was removed in pandas 1.0.
X = np.array(df.iloc[:, 0:4])
y = np.array(df['class'])

# Hold out 33% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [None]:
def knn(k, data, dataClass, inputs):
    """Classify each row of *inputs* by majority vote of its k nearest neighbours.

    Distances are squared Euclidean distances between an input row and every
    row of *data*.

    Args:
        k: Number of neighbours that vote; must be at least 1. If k exceeds
            the number of rows in *data*, every row votes.
        data: 2-D array of reference samples, one sample per row.
        dataClass: 1-D array of numeric class labels, one per row of *data*.
        inputs: 2-D array of samples to classify, one sample per row.

    Returns:
        A 1-D float array with one predicted label per row of *inputs*.
        When several labels tie for the most votes, the smallest label wins.

    Raises:
        ValueError: If k is smaller than 1.
    """
    if k < 1:
        raise ValueError("k must be at least 1")

    data = np.asarray(data)
    dataClass = np.asarray(dataClass)
    inputs = np.asarray(inputs)

    nInputs = np.shape(inputs)[0]
    closest = np.zeros(nInputs)

    for n in range(nInputs):
        # Squared distances preserve the neighbour ordering, so no sqrt is needed.
        distances = np.sum((data - inputs[n, :]) ** 2, axis=1)
        nearest = np.argsort(distances, axis=0)[:k]

        # Tally the votes and store the winning *label*, not its vote count.
        labels, votes = np.unique(dataClass[nearest], return_counts=True)
        closest[n] = labels[np.argmax(votes)]

    return closest

In [None]:
from sklearn.preprocessing import LabelEncoder

# Encode the string class names as integers: knn() stores its predictions in
# a numeric array, so it cannot hold the raw string labels.
le = LabelEncoder()
k = 3
# Reference set: every training row except the first.
data = X_train[1:,]
dataClass = le.fit_transform(y_train[1:,])
# The first training row reshaped to (1, n_features); not used by the
# prediction below.
inputs = np.reshape(X_train[0], (1, X_train[0].shape[0]))

# Pass `k` rather than a literal so changing the setting above takes effect.
predict = knn(k, data, dataClass, X_test)
# knn() returns floats; cast back to int codes before decoding to class names.
predict = le.inverse_transform(predict.astype(int))

In [None]:
from sklearn import metrics
# Accuracy of the hand-written classifier's predictions against the test labels.
metrics.accuracy_score(y_test, predict)

### scikit-learn 

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Reference result from scikit-learn's k-NN classifier with 3 neighbours.
# Bound to `sk_knn` so the hand-written knn() function defined earlier in the
# notebook is not overwritten and its cell can still be re-run.
sk_knn = KNeighborsClassifier(n_neighbors=3)
sk_knn.fit(X_train, y_train)
pred = sk_knn.predict(X_test)
print(metrics.accuracy_score(y_test, pred))

### cross validation

In [None]:
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt
%matplotlib inline

# Candidate neighbourhood sizes: k = 1 .. 9.
neighbors = list(range(1,10))
cv_scores = []

# perform 10-fold cross validation on the training split for each k
for k in neighbors:
    # Named `model` rather than `knn` so the hand-written knn() function
    # defined earlier in the notebook is not overwritten.
    model = KNeighborsClassifier(n_neighbors=k)
    scores = cross_val_score(model, X_train, y_train, cv=10, scoring='accuracy')
    cv_scores.append(scores.mean())

# Misclassification error (1 - mean accuracy) per k. Despite the name, this
# is not a mean squared error.
MSE = [1 - x for x in cv_scores]

# determining best k (the first k reaching the lowest error wins ties)
optimal_k = neighbors[MSE.index(min(MSE))]
print("The optimal number of neighbors is %d" % optimal_k)

# plot misclassification error vs k
plt.plot(neighbors, MSE)
plt.xlabel('Number of Neighbors K')
plt.ylabel('Misclassification Error')