# Simple classification

---

_You are currently looking at **version 1.0** of this notebook._

---

### Imports

In [None]:
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

### Data

In [None]:
# List everything under ../.. with the shell and keep the lines whose path
# contains "fruit_data_with_colors" (grep -i: case-insensitive).
# IPython captures the command's output lines into `path`; the loading cell
# below reads the first match, path[0].
path = !find ../.. | grep -i fruit_data_with_colors
path

In [None]:
# Load the tab-separated fruit table from the first path the search returned.
fruits = pd.read_table(path[0], sep='\t')

In [None]:
# Preview the first five rows to check the columns loaded as expected.
fruits.head(5)

### Map fruit labels to fruit names with a dictionary

In [None]:
# Build {fruit_label: fruit_name} from the (label, name) rows; sorting by label
# first means the dictionary is filled, and therefore displayed, in label order.
lookup_fruit_name = dict(
    fruits[['fruit_label', 'fruit_name']]
    .sort_values(by='fruit_label')
    .values
)
lookup_fruit_name

## Exploring the data

### Train-test split for selected features

- By default, `train_test_split` puts 75% of the rows in the training set and 25% in the test set.

In [None]:
# Features used for the exploratory plots, and the label to predict.
X = fruits[['height', 'width', 'mass', 'color_score']]
y = fruits['fruit_label']

# Fixing random_state keeps the split identical across re-runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

### Plot Scatter matrix

#### Set colormap

In [None]:
from matplotlib import cm  # kept in case other cells still reference `cm`

# Colormap and figure size shared by the plots below.
# plt.get_cmap() is used instead of cm.get_cmap(): the latter was deprecated in
# matplotlib 3.7 and removed in 3.9, so the old call fails on current releases.
cmap = plt.get_cmap('gnuplot')
figsize = (10, 10)

#### Plot

In [None]:
# Pairwise scatter plots of the training features, coloured by fruit label.
# Assigning the result keeps the returned axes array out of the cell output.
scatter = pd.plotting.scatter_matrix(
    X_train,
    figsize=figsize,
    marker='o',
    s=40,
    c=y_train,
    cmap=cmap,
    hist_kwds={'bins': 15},
)

### 3D scatter plot

In [None]:
# NOTE(review): Axes3D is never referenced by name; the import is presumably here
# because older matplotlib needs it to register the '3d' projection - confirm.
from mpl_toolkits.mplot3d import Axes3D

fig = plt.figure(figsize=figsize)
ax = fig.add_subplot(1, 1, 1, projection='3d')

# One marker per training sample, coloured by its fruit label.
ax.scatter(
    X_train['width'],
    X_train['height'],
    X_train['color_score'],
    c=y_train,
    marker='o',
    s=100,
)

ax.set_xlabel('width')
ax.set_ylabel('height')
ax.set_zlabel('color_score')
plt.show();

### Train-test split for the classifier features (mass, width, height)

In [None]:
# Features the classifier is trained on (order: mass, width, height) and the target label.
X = fruits[['mass', 'width', 'height']]
y = fruits['fruit_label']

# Same seed as the exploratory split, so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

### k-NN Classifier - KNeighborsClassifier

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# fit() returns the estimator itself, so construction and training can be
# chained; the fitted estimator is then displayed as the cell output.
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)
knn

### Estimate the accuracy of the classifier on test data

In [None]:
# Mean accuracy of the fitted classifier on the held-out test set.
knn.score(X=X_test, y=y_test)

### Use the trained model to classify new (unseen) objects

In [None]:
# first example: a small fruit with mass 20g, width 4.3 cm, height 5.5 cm
# The classifier was fitted on a DataFrame, so the sample is passed as a
# DataFrame with the same column names (and order) as the training features.
# Recent scikit-learn versions warn when predict() receives unnamed data here.
small_fruit = pd.DataFrame([[20, 4.3, 5.5]], columns=['mass', 'width', 'height'])
fruit_prediction = knn.predict(small_fruit)
fruit_prediction[0], lookup_fruit_name[fruit_prediction[0]]

In [None]:
# second example: a larger, elongated fruit with mass 100g, width 6.3 cm, height 8.5 cm
# The classifier was fitted on a DataFrame, so the sample is passed as a
# DataFrame with the same column names (and order) as the training features.
# Recent scikit-learn versions warn when predict() receives unnamed data here.
large_fruit = pd.DataFrame([[100, 6.3, 8.5]], columns=['mass', 'width', 'height'])
fruit_prediction = knn.predict(large_fruit)
fruit_prediction[0], lookup_fruit_name[fruit_prediction[0]]

### Plot the decision boundaries of the k-NN classifier

In [None]:
def plot_fruit_knn(X, y, n_neighbors, weights):
    """Fit a k-NN classifier on height/width and plot its decision regions.

    A fresh KNeighborsClassifier is trained on the 'height' and 'width'
    columns only, its prediction is evaluated on a dense grid covering the
    data range (padded by 1 on every side), and the grid is drawn as a
    coloured background with the training points on top.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table; must contain 'height' and 'width' columns. Any other
        columns are ignored.
    y : pandas.Series
        Class label for each row of X.
    n_neighbors : int
        Number of neighbours used by the classifier.
    weights : str or callable
        Forwarded to KNeighborsClassifier(weights=...), e.g. 'uniform'.

    Returns
    -------
    None
        The figure is displayed with plt.show().
    """
    from matplotlib.colors import ListedColormap
    import matplotlib.patches as mpatches

    # .values replaces .as_matrix(), which was removed in pandas 1.0 and
    # raises AttributeError on current versions.
    X_mat = X[['height', 'width']].values
    y_mat = y.values

    # Light colours for the decision regions, bold ones for the data points.
    cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF', '#AFAFAF'])
    cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF', '#AFAFAF'])

    clf = KNeighborsClassifier(n_neighbors, weights=weights)
    clf.fit(X_mat, y_mat)

    # Plot the decision boundary by assigning a colour in the colour map
    # to each mesh point.
    mesh_step_size = .01  # step size in the mesh
    plot_symbol_size = 50

    x_min, x_max = X_mat[:, 0].min() - 1, X_mat[:, 0].max() + 1
    y_min, y_max = X_mat[:, 1].min() - 1, X_mat[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, mesh_step_size),
                         np.arange(y_min, y_max, mesh_step_size))
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])

    # Put the result into a colour plot.
    Z = Z.reshape(xx.shape)
    plt.figure()
    plt.pcolormesh(xx, yy, Z, cmap=cmap_light)

    # Plot training points.
    plt.scatter(X_mat[:, 0], X_mat[:, 1], s=plot_symbol_size, c=y_mat,
                cmap=cmap_bold, edgecolor='black')
    plt.xlim(xx.min(), xx.max())
    plt.ylim(yy.min(), yy.max())

    # NOTE(review): the legend is hard-coded; it assumes the four labels map, in
    # increasing order, to apple, mandarin, orange, lemon - confirm against the data.
    patch0 = mpatches.Patch(color='#FF0000', label='apple')
    patch1 = mpatches.Patch(color='#00FF00', label='mandarin')
    patch2 = mpatches.Patch(color='#0000FF', label='orange')
    patch3 = mpatches.Patch(color='#AFAFAF', label='lemon')
    plt.legend(handles=[patch0, patch1, patch2, patch3])

    plt.xlabel('height (cm)')
    plt.ylabel('width (cm)')

    plt.show()

In [None]:
# Decision regions for k = 5 with uniformly weighted neighbours.
plot_fruit_knn(X_train, y_train, n_neighbors=5, weights='uniform')

### k-NN classification accuracy with respect to the 'k' parameter

In [None]:
# Test accuracy for every k from 1 to 19 on the current train/test split.
k_range = range(1, 20)
scores = []

for k in k_range:
    # `knn` is rebound on every pass, so it ends the loop as the k = 19 model.
    knn = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)
    scores.append(knn.score(X_test, y_test))

plt.figure()
plt.plot(k_range, scores, 'o-')
plt.xlabel('k')
plt.ylabel('accuracy')
plt.xticks(list(range(0, 21, 5)));

### k-NN classification (test) accuracy with respect to the train/test split

In [None]:
# Fractions of the data used for training; the rest is held out for testing.
train_proportions = [0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2]

knn = KNeighborsClassifier(n_neighbors=5)

plt.figure()

mean_scores = []
for perc in train_proportions:
    scores = []
    # 99 repeated random splits per proportion. The repetition index doubles as
    # the seed, so re-running the cell reproduces the same curve.
    for seed in range(1, 100):
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=1 - perc, random_state=seed)
        knn.fit(X_train, y_train)
        scores.append(knn.score(X_test, y_test))
    mean_scores.append(np.mean(scores))

# x-values are the training fractions; the previous code referenced an
# undefined name `t` here and raised NameError.
plt.plot(train_proportions, mean_scores, 'o-')

# The plotted values are fractions (0.2 - 0.9), not percentages.
plt.xlabel('Training set proportion')
plt.ylabel('accuracy');