In [16]:
%matplotlib notebook
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
import seaborn as sns
sns.set()

# Fruit Classification

In [2]:
fruits = pd.read_csv('./data/fruit_data_with_colors.csv',index_col = 0)
fruits.head()

Unnamed: 0,fruit_label,fruit_name,fruit_subtype,mass,width,height,color_score
0,1,apple,granny_smith,192,8.4,7.3,0.55
1,1,apple,granny_smith,180,8.0,6.8,0.59
2,1,apple,granny_smith,176,7.4,7.2,0.6
3,2,mandarin,mandarin,86,6.2,4.7,0.8
4,2,mandarin,mandarin,84,6.0,4.6,0.79


In [3]:
col_category = fruits.select_dtypes(include=['O']).columns.values.tolist()
col_category


['fruit_name', 'fruit_subtype']

# Data Clean and Visualization

In [4]:
#fill na for numerical veriables, with median()
features= ['height', 'width', 'mass', 'color_score']
target = 'fruit_label'
fruits_clean = fruits[features + [target]]
cat_features = fruits_clean[features].select_dtypes(include=['O']).columns.values.tolist()
num_features = list(set(features)-set(cat_features))

fruits_clean[num_features] = fruits_clean[num_features].fillna(fruits_clean[num_features].median())

#categorical to dummy variables
column_dummies = [pd.get_dummies(fruits_clean[col], prefix=col, prefix_sep='_', dummy_na=True).iloc[:,1:] for col in cat_features]

try:
    all_dummies = pd.concat(column_dummies, axis=1)
    fruits_clean = pd.concat([fruits_clean[features+[target]],all_dummies], axis = 1)
except:
    fruits_clean = fruits_clean[features+[target]]
fruits_clean.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[k1] = value[k2]


Unnamed: 0,height,width,mass,color_score,fruit_label
0,7.3,8.4,192,0.55,1
1,6.8,8.0,180,0.59,1
2,7.2,7.4,176,0.6,1
3,4.7,6.2,86,0.8,2
4,4.6,6.0,84,0.79,2


In [5]:
X = fruits_clean[features]
y = fruits_clean[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

In [6]:
# This is a classification problem; draw a scatter matrix
scatter = pd.scatter_matrix(X_train, c= y_train, marker = 'o', s=40, hist_kwds={'bins':15}, figsize=(5,5))

<IPython.core.display.Javascript object>

In [None]:
# plotting a 3D scatter plot to examine the data
from mpl_toolkits.mplot3d import Axes3D

fig = plt.figure()
ax = fig.add_subplot(111, projection = '3d')
ax.scatter(X_train['height'], X['w'], X['mean symmetry'], c = y, marker = 'o', s=100)
ax.set_xlabel('mean radius')
ax.set_ylabel('mean smoothness')
ax.set_zlabel('mean symmetry')
plt.show()

# Use KNN to train a classifier

In [7]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
# we must apply the scaling to the test set that we computed for the training set
X_test_scaled = scaler.transform(X_test)

In [8]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors = 5)

In [9]:
knn.fit(X_train_scaled, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [10]:
print('Accuracy of K-NN classifier on training set: {:.2f}'
     .format(knn.score(X_train_scaled, y_train)))
print('Accuracy of K-NN classifier on test set: {:.2f}'
     .format(knn.score(X_test_scaled, y_test)))

Accuracy of K-NN classifier on training set: 0.95
Accuracy of K-NN classifier on test set: 1.00


# Grid Search for "best" number of clusters

In [11]:
k_range = range(1,20)
scores = []

for k in k_range:
    knn = KNeighborsClassifier(n_neighbors = k)
    knn.fit(X_train_scaled, y_train)
    scores.append(knn.score(X_test_scaled, y_test))

plt.figure()
plt.xlabel('k')
plt.ylabel('accuracy')
plt.scatter(k_range, scores)
plt.xticks([0,5,10,15,20]);

<IPython.core.display.Javascript object>

# 10-fold cross validation

In [14]:
from sklearn.model_selection import cross_val_score

cv_scores = []
for k in k_range:
    knn = KNeighborsClassifier(n_neighbors=k)
    scores = cross_val_score(knn, X_train_scaled, y_train, cv=10, scoring='accuracy')
    cv_scores.append(scores.mean())


MSE = [1 - x for x in cv_scores]

# determining best k
optimal_k = k_range[MSE.index(min(MSE))]
print("The optimal number of neighbors is %d" % optimal_k)



The optimal number of neighbors is 1




In [15]:
# plot misclassification error vs k
plt.figure()
plt.plot(k_range, MSE)
plt.xlabel('Number of Neighbors K')
plt.ylabel('Misclassification Error')

<IPython.core.display.Javascript object>

<matplotlib.text.Text at 0x124e8921518>