 # Fruit Classification with Random Forest Supervised Learning

In [None]:
%%html
<figure>
<img src="./images/fruitml.png" alt="Fruit Classifier Schema">
<figcaption><b>Fruit Classifier Schema</b></figcaption>
</figure>

In [None]:
import sys
import itertools
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from matplotlib import cm
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score


## Get training data from OpenShift Ceph S3

In [None]:
import os
import boto3

# Connection settings are read from the environment; a missing variable raises KeyError.
s3_endpoint_url = os.environ['S3_ENDPOINT']
s3_access_key = os.environ['AWS_ACCESS_KEY_ID']
s3_secret_key = os.environ['AWS_SECRET_ACCESS_KEY']

# Bucket and object that hold the training data, and the local path it is saved to.
s3_bucket = "fruitml"
s3_object = "fruit_data_with_colors.data"
train_data_file = "./%s" % s3_object

# Create an S3 client and download the train data file.
# NOTE(review): verify=False disables TLS certificate verification for the
# endpoint -- presumably because the cluster uses a self-signed certificate;
# confirm this is acceptable for the target environment.
s3 = boto3.client(
    service_name='s3',
    endpoint_url=s3_endpoint_url,
    aws_access_key_id=s3_access_key,
    aws_secret_access_key=s3_secret_key,
    verify=False,
)
s3.download_file(Bucket=s3_bucket, Key=s3_object, Filename=train_data_file)

 ## Load training data to train the model


In [None]:
# Parse the downloaded file as tab-separated text (read_table's default
# separator, spelled out here) into a DataFrame.
fruits = pd.read_table(train_data_file, sep='\t')

#### Create a mapping from fruit label value to fruit name to make results easier to interpret

In [None]:
# color_score: 1-> Red, 0-> Violet
# Pair the distinct labels with the distinct names, both taken in order of
# first appearance.
# NOTE(review): the pairing is only correct if every label always appears
# with the same single name -- confirm against the data file.
label_values = fruits['fruit_label'].unique()
label_names = fruits['fruit_name'].unique()
lookup_fruit_name = dict(zip(label_values, label_names))
print(lookup_fruit_name)

#### Print training data

In [None]:
# Plain-text dump of the whole training table.
# color_score: 1-> Red, 0-> Violet
print(fruits)

 ## Create train and test datasets from training data


 ### Define features X and labels y
 ### Use the mass, width, height and color_score features (X) of each fruit to predict its label (y)


In [None]:
# Feature matrix: the four measurement columns used as model inputs.
X = fruits.loc[:, ['mass', 'width', 'height', 'color_score']]
# Target vector: the class label of each fruit.
y = fruits.loc[:, 'fruit_label']

 ### Train dataset and test dataset split, using default 75% / 25%


In [None]:
# Hold out 25% of the rows for testing (the train_test_split default, written
# out explicitly); random_state=0 makes the split identical on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=0
)

### Train dataset 3D scatter plot

In [None]:
# NOTE(review): Axes3D is never referenced by name below; presumably it is
# imported for its side effect of enabling projection='3d' -- confirm it is
# still needed with the matplotlib version in use.
from mpl_toolkits.mplot3d import Axes3D

fig = plt.figure(figsize=(8, 8))
ax = fig.add_subplot(1, 1, 1, projection='3d')

# One marker per training sample, coloured by its class label.
ax.scatter(
    X_train['color_score'],
    X_train['width'],
    X_train['height'],
    c=y_train,
    marker='o',
    s=100,
)
ax.set(xlabel='color_score', ylabel='width', zlabel='height')
plt.show()


## Configure, train and test ML algorithm using Random Forest Classifier (RFC)

In [None]:
%%html
<figure>
<img src="./images/random-forest-algorithm.png" width="360" height="360" alt="Random Forest Classifier Schema">
<figcaption><b>Random Forest Classifier Schema</b></figcaption>
</figure>

In [None]:
%%html
<figure>
<img src="./images/fruittree.jpeg" width="360" height="360" alt="Fruit Decision Tree sample">
<figcaption><b>Fruit Decision Tree sample</b></figcaption>
</figure>

### Create RFC object
#### 'n_estimators' -> the number of decision trees in the forest, 'max_depth' -> the maximum depth of each tree in the forest

In [None]:
# A small forest: 5 trees, each limited to a depth of 5.
# random_state is fixed so that the fitted model -- and therefore the
# accuracies, confusion matrix and predictions computed from it -- are the
# same on every Restart & Run All instead of changing from run to run.
clf = RandomForestClassifier(n_estimators=5, max_depth=5, random_state=0)

### Train RFC (fit the estimator) using the train dataset

In [None]:
# Fit the forest on the training split. fit() is the cell's last expression,
# so its return value is what the notebook displays.
clf.fit(X=X_train, y=y_train)

### Test the classifier: estimate its accuracy on future data using the test dataset

In [None]:
# Score the fitted model on the data it was trained on and on the held-out
# test split, then report both to two decimals.
train_accuracy = clf.score(X_train, y_train)
test_accuracy = clf.score(X_test, y_test)
print('Accuracy of RFC classifier on training set: {:.2f}'.format(train_accuracy))
print('Accuracy of RFC classifier on test set: {:.2f}'.format(test_accuracy))

from sklearn.metrics import classification_report, confusion_matrix

# Predicted labels for the test split, used by the confusion-matrix cell.
pred = clf.predict(X_test)

### Confusion matrix

In [None]:
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Print a confusion matrix and draw it as an annotated heat map.

    The matrix is drawn on the current pyplot figure; the caller is
    responsible for creating the figure and calling ``plt.show()``.

    Parameters
    ----------
    cm : array-like, 2-D
        Confusion matrix. Rows are labelled 'True Class' and columns
        'Predicted Class'.
    classes : iterable
        Tick labels, in the same order as the rows/columns of ``cm``.
        Any iterable is accepted (including ``dict.values()``).
    normalize : bool, default False
        When True every row is divided by its sum, so each cell holds the
        fraction of that true class. Rows that sum to zero stay all-zero
        instead of turning into NaN.
    title : str
        Title of the plot.
    cmap : matplotlib colormap
        Colormap used for the heat map.
    """
    cm = np.asarray(cm)
    # Materialise the labels so that views without indexing support work too.
    classes = list(classes)

    # Normalise BEFORE drawing, so the image, the colour bar and the numbers
    # written into the cells all describe the same matrix.
    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        # Replace empty-row totals by 1: 0 / 1 keeps those rows at zero.
        safe_totals = np.where(row_totals == 0, 1, row_totals)
        cm = cm.astype('float') / safe_totals
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # Fractions are shown with two decimals; raw values are shown unchanged.
    cell_format = "{:.2f}" if normalize else "{}"
    # Cells darker than half the maximum get white text for contrast.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, cell_format.format(cm[i, j]),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True Class')
    plt.xlabel('Predicted Class')
    # Lay out last so the axis labels are taken into account.
    plt.tight_layout()
    
# Plot the confusion matrix using the provided functions.
# Fix the class order explicitly: the same sorted label list drives both the
# matrix rows/columns and the tick names, so they stay aligned (and the matrix
# keeps its full size) even if a class is missing from the test split.
label_ids = sorted(lookup_fruit_name)
class_labels = [lookup_fruit_name[label_id] for label_id in label_ids]
model_cm = confusion_matrix(y_true=y_test, y_pred=pred, labels=label_ids)
np.set_printoptions(precision=2)
# Plot non-normalized confusion matrix
plt.figure()
plot_confusion_matrix(model_cm, classes=class_labels, title='Confusion matrix, without normalization', normalize=False)
## Plot normalized confusion matrix
#plt.figure()
#plot_confusion_matrix(model_cm, classes=class_labels,title='Confusion matrix, normalized', normalize=True)
plt.show()


## Classify new unseen data

### First example: a small fruit with mass 90g, width 4.3 cm, height 5.5 cm, color_score=0.79

In [None]:
# Wrap the sample in a one-row DataFrame that carries the training column
# names. The model was fitted on a DataFrame, and recent scikit-learn versions
# warn when predict() then receives input without feature names.
new_fruit = pd.DataFrame([[90, 4.3, 5.5, 0.79]], columns=X.columns)
fruit_prediction = clf.predict(new_fruit)
print ("Prediction for mass=90, width=4.3, height=5.5, color_score=0.79: ==> %s"  %(lookup_fruit_name[fruit_prediction[0]]))


### Second example: a larger, elongated fruit with mass 150g, width 7.3 cm, height 7.5 cm, color_score=0.63

In [None]:
# Wrap the sample in a one-row DataFrame that carries the training column
# names. The model was fitted on a DataFrame, and recent scikit-learn versions
# warn when predict() then receives input without feature names.
new_fruit = pd.DataFrame([[150, 7.3, 7.5, 0.63]], columns=X.columns)
fruit_prediction = clf.predict(new_fruit)
print ("Prediction for mass=150, width=7.3, height=7.5, color_score=0.63: ==> %s" %(lookup_fruit_name[fruit_prediction[0]]))

## Tune RFC algorithm

### How sensitive is RFC classification accuracy to the choice of the 'n_estimators' and 'max_depth' parameters?

In [None]:
# Grid of hyper-parameters to try: every max_depth / n_estimators pair in 1..39.
md_range = range(1,40)
ne_range = range(1,40)
xs = []
ys = []
zs = []

# NOTE: accuracy is measured on the test split, so this plot shows how
# sensitive the score is to the parameters; it is not an unbiased estimate.
for md, ne in itertools.product(md_range, ne_range):
    # Use a throwaway name so the model trained earlier in `clf` is not
    # silently replaced, and fix the seed so the plot is reproducible.
    candidate = RandomForestClassifier(n_estimators=ne, max_depth=md, random_state=0)
    candidate.fit(X_train, y_train)
    xs.append(md)
    ys.append(ne)
    zs.append(candidate.score(X_test, y_test))


from mpl_toolkits.mplot3d import Axes3D
fig = plt.figure(figsize=(8,8))
ax = fig.add_subplot(111, projection='3d')

# One point per parameter pair; height and colour both encode test accuracy.
ax.scatter(xs, ys, zs, c=zs, cmap='Blues', s=100)
ax.set_xlabel('max_depth')
ax.set_ylabel('n_estimators')
ax.set_zlabel('accuracy')

plt.show()

### How sensitive is RFC classification accuracy to the train/test dataset split proportion?

In [None]:
# Training-set fractions to evaluate.
t = [0.9, 0.7, 0.5, 0.3, 0.1]
# Number of random splits averaged per fraction (the original
# range(1, 1000) ran 999).
n_repeats = 1000

# Dedicated, seeded model: the global `clf` is left untouched and the
# experiment gives the same plot on every run.
split_clf = RandomForestClassifier(n_estimators=2, random_state=0)
fig = plt.figure(figsize=(8,8))

for train_fraction in t:

    scores = []
    for repeat in range(n_repeats):
        # Local names keep the notebook-level X_train / X_test / y_train /
        # y_test split intact; the repeat index seeds each split.
        rep_X_train, rep_X_test, rep_y_train, rep_y_test = train_test_split(
            X, y, test_size=1 - train_fraction, random_state=repeat)
        split_clf.fit(rep_X_train, rep_y_train)
        scores.append(split_clf.score(rep_X_test, rep_y_test))
    # The axis is labelled in percent, so convert the fraction before plotting.
    plt.plot(100 * train_fraction, np.mean(scores), 'bo')

plt.xlabel('Training set proportion (%)')
plt.ylabel('accuracy')
plt.show()
