In [1]:
# import important libraries
import numpy as np
import pandas as pd
from sklearn import datasets, neighbors
from sklearn.neighbors import KNeighborsClassifier
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from time import time
%matplotlib inline

# K-nearest neighbors classification

### Load and Prepare Iris Dataset

In [2]:
# Fixed seed so that the row preview and the train/test split (and therefore
# every metric computed later) are identical on each "Restart & Run All".
SEED = 42
TRAIN_FRACTION = 0.75
rng = np.random.RandomState(SEED)

iris = load_iris()

# For simplicity, use only the first two input features (sepal length/width).
# df_iris is a display-only table of those two features plus the class label;
# X and Y are the arrays that are actually split and fed to the classifier.
df_iris = pd.DataFrame(
    data=np.c_[iris['data'], iris['target']],
    columns=iris['feature_names'] + ['target'],
).drop(['petal length (cm)', 'petal width (cm)'], axis=1)
X = iris.data[:, :2]
Y = iris.target

# Preview five distinct, randomly chosen rows (sampled without replacement)
print(df_iris.sample(n=5, random_state=rng))
# Three classes 0,1,2
print(df_iris.target.unique())

number_of_samples = len(Y)

# Shuffle the sample indices once, then slice them into training and test sets
random_indices = rng.permutation(number_of_samples)
num_training_samples = int(number_of_samples * TRAIN_FRACTION)
# Training set
x_train = X[random_indices[:num_training_samples]]
y_train = Y[random_indices[:num_training_samples]]
# Test set
x_test = X[random_indices[num_training_samples:]]
y_test = Y[random_indices[num_training_samples:]]

# Every sample must land in exactly one of the two sets
assert len(x_train) + len(x_test) == number_of_samples
assert len(x_train) == len(y_train) and len(x_test) == len(y_test)



     sepal length (cm)  sepal width (cm)  target
6                  4.6               3.4     0.0
71                 6.1               2.8     1.0
120                6.9               3.2     2.0
32                 5.2               4.1     0.0
65                 6.7               3.1     1.0
[0. 1. 2.]


## K-nearest neighbour classifier algorithm

In [3]:
# Build a K-NN classifier that votes among the 5 closest training samples
# (K = 5), then fit it on the training split. The fit call stays the last
# expression so the notebook still displays the fitted estimator.
model = KNeighborsClassifier(n_neighbors=5)
model.fit(x_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

### Evaluate performance

In [4]:
def evaluate_performance(model, x_test, y_test):
    """Return the percentage of test samples that ``model`` misclassifies.

    Parameters
    ----------
    model : object
        Fitted classifier exposing ``predict(X)`` that returns one label per
        row of ``X``.
    x_test : array-like, shape (n_samples, n_features)
        Feature rows to classify.
    y_test : array-like, shape (n_samples,)
        True labels, in the same order as the rows of ``x_test``.

    Returns
    -------
    float
        Misclassification rate in percent, between 0.0 and 100.0.

    Raises
    ------
    ValueError
        If ``y_test`` is empty, or if the number of predictions differs from
        the number of labels.
    """
    y_true = np.asarray(y_test).ravel()
    if y_true.size == 0:
        raise ValueError("cannot evaluate performance on an empty test set")

    # One batched predict call instead of one call per sample
    predictions = np.asarray(model.predict(x_test)).ravel()
    if predictions.size != y_true.size:
        raise ValueError(
            "got {} predictions for {} labels".format(predictions.size, y_true.size)
        )

    num_misclassified = int(np.count_nonzero(predictions != y_true))
    # 100.0 forces true division regardless of the Python version
    return 100.0 * num_misclassified / y_true.size

In [5]:
# Score the fitted K-NN model on the held-out test split and report the
# share of misclassified samples as a percentage.
print("Evaluating K-NN classifier:")
test_err = evaluate_performance(model, x_test=x_test, y_test=y_test)
print('test misclassification percentage = {}%'.format(test_err))

Evaluating K-NN classifier:
test misclassification percentage = 18.42105263157895%


In [6]:
def classify(model, sepal_length, sepal_width):
    """Predict the class of a single flower from its two sepal measurements.

    The two values are packed into one feature vector, wrapped in a list so
    that ``model.predict`` receives a batch containing exactly one sample.
    Returns whatever ``model.predict`` returns for that batch (index 0 holds
    the label of the queried flower).
    """
    features = np.array([sepal_length, sepal_width])
    return model.predict([features])

In [7]:
# Any (sepal length, sepal width) pair can be passed to classify(); each query
# below was picked to resemble flowers of the class noted beside it.
sample_queries = [
    (7.0, 3.0),  # resembles class 2
    (5.0, 2.5),  # resembles class 1
    (5.0, 3.0),  # resembles class 0
]
for query_length, query_width in sample_queries:
    pred_class = classify(model, query_length, query_width)
    print("Predicted class is : {}".format(pred_class[0]))

Predicted class is : 2
Predicted class is : 1
Predicted class is : 0
