In [2]:
%matplotlib notebook
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split

In [16]:
# Load the fruit table from the Excel workbook 'data.xls' into a DataFrame.
fruits = pd.read_excel('data.xls')

In [17]:
# Preview the first rows to check which columns were loaded.
fruits.head()

Unnamed: 0,fruit_label,fruit_name,fruit_subtype,mass,width,height,color_score
0,1,apple,granny_smith,192,8.4,7.3,0.55
1,1,apple,granny_smith,180,8.0,6.8,0.59
2,1,apple,granny_smith,176,7.4,7.2,0.6
3,2,mandarin,mandarin,86,6.2,4.7,0.8
4,2,mandarin,mandarin,84,6.0,4.6,0.79


In [18]:
# create a mapping from fruit label value to fruit name to make results easier to interpret
# Label and name are paired row by row, so every label maps to the name stored on its own
# rows; this does not depend on two separate unique() calls happening to line up.
lookupdict = dict(zip(fruits['fruit_label'], fruits['fruit_name']))

In [19]:
# Show the fruit label -> fruit name mapping.
lookupdict

{1: u'apple', 2: u'mandarin', 3: u'orange', 4: u'lemon'}

In [41]:
# Target: the numeric fruit label, selected by column name rather than by position so the
# cell does not depend on the column order of the spreadsheet.
y = fruits['fruit_label']
# Features: the four numeric measurements of each fruit.
X = fruits[['height', 'width', 'mass', 'color_score']]
# random_state=0 fixes the shuffle so the train/test split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

In [42]:
# 3D scatter plot of the training samples (width, height, color_score), coloured by label
# NOTE(review): on older matplotlib releases this import is what makes projection='3d'
# available, so it is kept even though Axes3D is not referenced by name.
from mpl_toolkits.mplot3d import Axes3D

fig = plt.figure()
ax = fig.add_subplot(1, 1, 1, projection='3d')
ax.scatter(
    X_train['width'],
    X_train['height'],
    X_train['color_score'],
    c=y_train,
    marker='o',
    s=100,
)
ax.set_xlabel('width')
ax.set_ylabel('height')
ax.set_zlabel('color_score')
plt.show()

<IPython.core.display.Javascript object>

In [43]:
# Feature matrix for the classifier: mass, width and height of each fruit instance
X = fruits.loc[:, ['mass', 'width', 'height']]
# Target vector: the numeric fruit label
y = fruits.loc[:, 'fruit_label']

# No test_size is passed, so the default 75% / 25% train-test split applies;
# random_state=0 keeps the shuffle repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=0
)

### k-NN classifier

In [44]:
from sklearn.neighbors import KNeighborsClassifier

In [46]:
# Create a k-nearest-neighbours classifier that uses the 5 nearest training samples.
knn = KNeighborsClassifier(n_neighbors = 5)

In [48]:
# Train the classifier (fit the estimator) using the training data.
# fit() returns the estimator itself, which is why the cell output is its repr.
knn.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform')

In [49]:
# Estimate the accuracy of the classifier on future data, using the held-out test data
# (samples that were not used for fitting).
knn.score(X_test,y_test)  # accuracy

0.5333333333333333

### Use the trained k-NN classifier model to classify new, previously unseen objects

In [50]:
# first example: a fruit with mass 20g, width 4.3 cm, height 5.5 cm
# (values follow the feature order used for training: mass, width, height)
newdata = [20, 4.3, 5.5]
# predict() takes one row per sample, hence the extra list around newdata
fruit_prediction = knn.predict([newdata])

In [53]:
# Translate the predicted numeric label of the single sample into its fruit name.
lookupdict[fruit_prediction[0]]

u'mandarin'

In [55]:
# second example: a larger, elongated fruit with mass 100g, width 6.3 cm, height 8.5 cm
# (one row in the mass, width, height order used for training)
fruit_prediction = knn.predict([[100, 6.3, 8.5]])
# Translate the predicted numeric label into its fruit name.
lookupdict[fruit_prediction[0]]

u'lemon'

### How sensitive is k-NN classification accuracy to the choice of the 'k' parameter?



In [68]:
# Sweep k from 1 to 19 and record the test accuracy obtained for each value.
k_range = range(1,20)
scores = []

for i in k_range:
    # A loop-local name is used on purpose: rebinding `knn` here would leave a k=19 model
    # behind, and any cell run afterwards that uses `knn` would silently pick it up.
    knn_k = KNeighborsClassifier(n_neighbors = i)
    knn_k.fit(X_train, y_train)
    scores.append(knn_k.score(X_test, y_test))

# One point per k: x is the number of neighbours, y the accuracy on the test set.
plt.figure()
plt.xlabel('k')
plt.ylabel('accuracy')
plt.scatter(k_range, scores)
plt.xticks([0,5,10,15,20]);

<IPython.core.display.Javascript object>

### How sensitive is k-NN classification accuracy to the train/test split proportion?

In [61]:
# Training-set fractions to evaluate: 0.8 down to 0.2 in steps of 0.1.
t = list(tenths / 10.0 for tenths in range(8, 1, -1))

In [70]:
# Training-set fractions to evaluate.
t = [0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2]

# Build the classifier here instead of reusing whatever `knn` an earlier cell left behind
# (the k sweep rebinds it), so this experiment always runs with k=5.
knn = KNeighborsClassifier(n_neighbors = 5)

# A single seeded generator is shared by all splits: every repetition still draws a
# different split, but the whole experiment is reproducible from run to run.
rng = np.random.RandomState(0)
n_repeats = 1000

# Start a new figure so the points are not drawn onto the previous plot.
plt.figure()

for s in t:

    # Mean test accuracy over many random splits at this training fraction.
    # Note: X_train / X_test / y_train / y_test are overwritten by each resample.
    scores = []
    for i in range(n_repeats):
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 1-s, random_state = rng)
        knn.fit(X_train, y_train)
        scores.append(knn.score(X_test, y_test))
    plt.plot(s, np.mean(scores), 'bo')

# The x values are fractions (0.2 - 0.8), not percentages, so the label carries no '%'.
plt.xlabel('Training set proportion')
plt.ylabel('accuracy');

<IPython.core.display.Javascript object>