# K-nearest neighbours classification


In [1]:
%matplotlib inline

from sklearn import datasets
import numpy as np
import matplotlib.pyplot as plt

iris = datasets.load_iris()

# View a description of the dataset
print(iris.DESCR)

# X is the (samples x features) matrix, y holds the class target of each sample
X = iris.data
y = iris.target

# Seed NumPy's global generator before drawing anything random, so the noise
# below (and later draws from np.random, such as the index permutation) come
# out the same on every Restart & Run All.
np.random.seed(42)

# Add some random noise to our data to make the task more challenging.
# The sum builds a new array, so iris.data itself is left unmodified.
X = X + np.random.normal(0, 0.4, X.shape)


.. _iris_dataset:

Iris plants dataset
--------------------

**Data Set Characteristics:**

    :Number of Instances: 150 (50 in each of three classes)
    :Number of Attributes: 4 numeric, predictive attributes and the class
    :Attribute Information:
        - sepal length in cm
        - sepal width in cm
        - petal length in cm
        - petal width in cm
        - class:
                - Iris-Setosa
                - Iris-Versicolour
                - Iris-Virginica
                
    :Summary Statistics:

                    Min  Max   Mean    SD   Class Correlation
    sepal length:   4.3  7.9   5.84   0.83    0.7826
    sepal width:    2.0  4.4   3.05   0.43   -0.4194
    petal length:   1.0  6.9   3.76   1.76    0.9490  (high!)
    petal width:    0.1  2.5   1.20   0.76    0.9565  (high!)

    :Missing Attribute Values: None
    :Class Distribution: 33.3% for each of 3 classes.
    :Creator: R.A. Fisher
    :Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)
    :

### View some basic dataset info

In [2]:
# Report the number of samples first, then the full (rows, columns) shape
print(X.shape[0])
print(np.shape(X))

150
(150, 4)


In [3]:
# List the distinct class labels present in the targets
unique_labels = np.unique(y)
print(unique_labels)

[0 1 2]


In [4]:
# Locate the samples labelled as class 1: print how many there are,
# then display their row indices as the cell's result
class1_rows = np.where(y == 1)
print(len(class1_rows[0]))
class1_rows

50


(array([50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66,
        67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83,
        84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99],
       dtype=int64),)

In [5]:
# Display the whole target vector. The output shows the labels stored in
# class order (all 0s, then all 1s, then all 2s).
y

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [6]:
# Display the feature matrix after the random noise was added
# (one row per sample, one column per feature).
X

array([[ 4.69125986e+00,  4.22970680e+00,  1.00529190e+00,
         8.59279285e-01],
       [ 5.24125131e+00,  3.10393920e+00,  1.21731968e+00,
        -9.82576690e-02],
       [ 5.04925674e+00,  3.20858407e+00,  9.64869323e-01,
         6.84771591e-01],
       [ 4.64655559e+00,  2.84063293e+00,  1.62468450e+00,
         2.34040958e-02],
       [ 5.03174143e+00,  3.55207383e+00,  1.39305513e+00,
         1.00412039e-01],
       [ 5.08083144e+00,  3.83969252e+00,  2.22081331e+00,
         5.81570053e-01],
       [ 5.00872392e+00,  3.67124658e+00,  1.09972201e+00,
         1.84008352e-04],
       [ 4.72975202e+00,  2.98982394e+00,  2.00504916e+00,
         1.07037911e-01],
       [ 5.61561283e+00,  2.63212826e+00,  1.22536767e+00,
         6.22903802e-02],
       [ 5.16079000e+00,  3.03762064e+00,  1.08364564e+00,
         8.16726542e-01],
       [ 5.10276326e+00,  3.39503035e+00,  1.53439961e+00,
         1.04499500e-01],
       [ 5.69838699e+00,  3.72015811e+00,  1.61322643e+00,
      

### Generate a list of shuffled indices of our data

In [7]:
# Build the ordered list of row indices first, as a baseline to compare against
n_samples = X.shape[0]
L = list(range(n_samples))
print(L)

# Then draw a random permutation of those same indices and inspect it
L2 = np.random.permutation(range(n_samples))
print(L2)

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149]
[ 34 146  94  38  84 140   3  99  17 115 122  90   7  97  20   5 136  52
  28  54  66  62  96   9  98  63  53 108  81 139  76  59 101 145  72  86
  92  57  49  13 125  87  15 126 147  39   0  70 143  11  88  67 103  46
  55  74 127 138  36 124  93 118  19 148  21  45 106   2  65 144  78 133
 128  51  18   4  69  58  41  82  73  16  80 105 132  30  75  40  8

## Create k-NN classifier

In [8]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

# Hold out 20% of the samples for testing; the rest is used for training
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# k-NN classifier voting over the 10 nearest neighbours under the Euclidean
# distance, fitted on the training split
knn = KNeighborsClassifier(n_neighbors=10, metric='euclidean').fit(X_train, y_train)

# Classify the held-out samples
y_pred = knn.predict(X_test)

# Show ground truth and predictions one above the other for comparison
print(y_test)  # true values
print(y_pred)  # predicted values



[2 2 1 2 0 0 0 0 1 1 1 0 0 0 0 2 2 0 1 1 2 2 0 1 2 0 0 0 0 0]
[2 2 1 2 0 0 0 0 1 1 1 0 0 0 0 2 1 0 2 0 2 2 0 1 2 0 0 0 0 0]


In [9]:
# The same steps written as one chained expression: construct, fit, predict
predictions = (
    KNeighborsClassifier(n_neighbors=10, metric='euclidean')
    .fit(X_train, y_train)
    .predict(X_test)
)
print(predictions)

[2 2 1 2 0 0 0 0 1 1 1 0 0 0 0 2 1 0 2 0 2 2 0 1 2 0 0 0 0 0]


### View accuracy

In [10]:
# Generate overall accuracy scores
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
# Confusion matrix first, overall accuracy on the line after it
print(confusion_matrix(y_test, y_pred), accuracy_score(y_test, y_pred), sep='\n')

[[15  0  0]
 [ 1  5  1]
 [ 0  1  7]]
0.9


In [11]:
# Generate Recall and precision scores
from sklearn.metrics import precision_score, recall_score 

# average=None is passed so each call yields one score per class
print('library precision: %s' % (precision_score(y_test, y_pred, average=None),))
print('library recall: %s' % (recall_score(y_test, y_pred, average=None),))

library precision: [0.9375     0.83333333 0.875     ]
library recall: [1.         0.71428571 0.875     ]


### Create our own functions to replicate the confusion matrix and accuracy

In [12]:
# Define function to generate confusion matrix
def myConfMat(y_test, y_pred, class_nunber):
    """Build a confusion matrix by counting (true label, predicted label) pairs.

    Parameters
    ----------
    y_test : sequence of int
        Ground-truth class labels. Each label is used directly as a row
        index, so labels are expected to be integers in
        ``range(class_nunber)``.
    y_pred : sequence of int
        Predicted class labels, one per entry of ``y_test``; each is used
        directly as a column index.
    class_nunber : int
        Number of classes; the result has shape
        ``(class_nunber, class_nunber)``.

    Returns
    -------
    numpy.ndarray
        Integer matrix whose entry ``[i, j]`` is the number of samples with
        true label ``i`` that were predicted as label ``j``.

    Raises
    ------
    ValueError
        If ``y_test`` and ``y_pred`` have different lengths.
    IndexError
        If a label is ``>= class_nunber``.
    """
    if len(y_test) != len(y_pred):
        raise ValueError(
            "y_test and y_pred must have the same length (got %d and %d)"
            % (len(y_test), len(y_pred))
        )

    # ``np.int`` was only an alias of the builtin ``int`` and has been removed
    # from NumPy; the builtin selects the same default integer dtype.
    confusion = np.zeros((class_nunber, class_nunber), dtype=int)

    # Iterate over the pairs directly rather than by position, so any
    # sequence type works, not only ones that support 0-based ``[i]`` access.
    for true_label, predicted_label in zip(y_test, y_pred):
        confusion[true_label, predicted_label] += 1
    return confusion

# One row/column per distinct label found in the full target vector
print(myConfMat(y_test, y_pred, np.unique(y).size))

[[15  0  0]
 [ 1  5  1]
 [ 0  1  7]]


In [13]:
# Define function to generate accuracy
def myAccuracy(y_test,y_pred):
    """Return the fraction of predictions that equal the true label.

    Parameters
    ----------
    y_test : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels, same shape as ``y_test``.

    Returns
    -------
    numpy.float64
        Number of matching positions divided by the number of samples.

    Raises
    ------
    ValueError
        If the two inputs do not have the same shape.
    ZeroDivisionError
        If the inputs are empty (accuracy is undefined).
    """
    # Convert up front: comparing two plain lists with ``==`` gives a single
    # bool rather than an element-wise result.
    truth = np.asarray(y_test)
    guess = np.asarray(y_pred)
    if truth.shape != guess.shape:
        raise ValueError(
            "y_test and y_pred must have the same shape (got %s and %s)"
            % (truth.shape, guess.shape)
        )

    total = truth.size
    if total == 0:
        raise ZeroDivisionError("accuracy is undefined for empty input")

    # The mean of the boolean match mask is (number correct) / total.
    return np.mean(truth == guess)
    
# Report our own accuracy, rounded to two decimals
print('accuracy: %.2f' % (myAccuracy(y_test, y_pred),))

accuracy: 0.90


### Create our own functions to replicate recall and precision

In [14]:
# Define functions to generate recall and precision

def myPrecision(y_ground, y_pred):
    """Compute per-class precision: correct predictions of a class divided by
    all predictions of that class.

    Parameters
    ----------
    y_ground : array-like of int
        Ground-truth class labels.
    y_pred : array-like of int
        Predicted class labels, one per entry of ``y_ground``.

    Returns
    -------
    numpy.ndarray
        One precision value per class, indexed by class label. A class that
        was never predicted gets 0.0 instead of the nan that 0/0 would give.

    Notes
    -----
    Labels are used directly as array indices, so they are expected to be
    the integers ``0 .. n_classes - 1``.
    """
    # Take the labels from both vectors so the confusion matrix has room for
    # a class that appears in only one of them.
    classes = np.union1d(y_ground, y_pred)
    precision = np.zeros(classes.shape)

    # Build the matrix from this function's own arguments, not from whatever
    # ``y_test`` happens to exist at notebook level.
    confusion = myConfMat(y_ground, y_pred, len(classes))

    # Column i holds every sample that was predicted as class i.
    for i in classes:
        predicted_as_i = confusion[:, i].sum()
        precision[i] = confusion[i, i] / predicted_as_i if predicted_as_i else 0.0

    return precision


def myRecall(y_test, y_pred):
    """Compute per-class recall: correct predictions of a class divided by
    all samples that truly belong to that class.

    Parameters
    ----------
    y_test : array-like of int
        Ground-truth class labels.
    y_pred : array-like of int
        Predicted class labels, one per entry of ``y_test``.

    Returns
    -------
    numpy.ndarray
        One recall value per class, indexed by class label. A class with no
        true samples gets 0.0 instead of the nan that 0/0 would give.

    Notes
    -----
    Labels are used directly as array indices, so they are expected to be
    the integers ``0 .. n_classes - 1``.
    """
    # Take the labels from both vectors. Using only the predicted labels
    # makes the confusion matrix too small whenever a true class is never
    # predicted, and indexing it then fails.
    classes = np.union1d(y_test, y_pred)
    recall = np.zeros(classes.shape)

    confusion = myConfMat(y_test, y_pred, len(classes))

    # Row i holds every sample whose true class is i.
    for i in classes:
        actual_i = confusion[i, :].sum()
        recall[i] = confusion[i, i] / actual_i if actual_i else 0.0

    return recall

# Print the predicted label set, then our per-class precision and recall,
# for comparison with the library scores
print('classes:      %s' % (np.unique(y_pred),))
print('my precision: %s' % (myPrecision(y_test, y_pred),))
print('my recall:    %s' % (myRecall(y_test, y_pred),))


classes:      [0 1 2]
my precision: [0.9375     0.83333333 0.875     ]
my recall:    [1.         0.71428571 0.875     ]
