# Test different classification algorithms

In [1]:
# https://scikit-learn.org/stable/auto_examples/classification/plot_classifier_comparison.html

In [2]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:85% !important; }</style>"))

In [45]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_validate # train/test split
from sklearn.preprocessing import StandardScaler # scale data
from sklearn.neighbors import KNeighborsClassifier # knn
from sklearn.metrics import classification_report, confusion_matrix # eval performance
from sklearn.ensemble import (RandomForestClassifier, ExtraTreesClassifier, VotingClassifier, 
                              AdaBoostClassifier, BaggingRegressor)
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.metrics import accuracy_score, make_scorer
from sklearn.svm import SVC # svc
import sklearn

In [46]:
# #test new library for benchmarking df
# # from https://nbviewer.jupyter.org/github/JosPolfliet/pandas-profiling/blob/master/examples/meteorites.ipynb
# import pandas_profiling

In [47]:
pd.set_option('display.height', 1000) # make printing better for the entire set of columns
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

In [48]:
data = pd.DataFrame.from_csv('arabica_to_classify.csv') # check our csv
data.head()


  """Entry point for launching an IPython kernel.


Unnamed: 0,country_of_origin,aroma,flavor,aftertaste,acidity,body,balance,uniformity,clean_cup,sweetness,cupper_points,total_cup_points,moisture,category_one_defects,category_two_defects,altitude_low_meters,altitude_high_meters,altitude_mean_meters,processing_method_Other,processing_method_Pulped natural / honey,processing_method_Semi-washed / Semi-pulped,processing_method_Washed / Wet,region
0,Ethiopia,8.67,8.83,8.67,8.75,8.5,8.42,10.0,10.0,10.0,8.75,90.58,0.12,0,0,1950.0,2200.0,2075.0,0,0,0,1,africa
1,Ethiopia,8.75,8.67,8.5,8.58,8.42,8.42,10.0,10.0,10.0,8.58,89.92,0.12,0,1,1950.0,2200.0,2075.0,0,0,0,1,africa
3,Ethiopia,8.17,8.58,8.42,8.42,8.5,8.25,10.0,10.0,10.0,8.67,89.0,0.11,0,2,1800.0,2200.0,2000.0,0,0,0,0,africa
4,Ethiopia,8.25,8.5,8.25,8.5,8.42,8.33,10.0,10.0,10.0,8.58,88.83,0.12,0,2,1950.0,2200.0,2075.0,0,0,0,1,africa
9,Ethiopia,8.08,8.58,8.5,8.5,7.67,8.42,10.0,10.0,10.0,8.5,88.25,0.1,0,4,1795.0,1850.0,1822.5,0,0,0,0,africa


In [49]:
# drop cols during refinement here
data_trainer = data.drop(['category_one_defects','category_two_defects','total_cup_points'],axis=1)

In [50]:
data_trainer.head() #add range(alt)

Unnamed: 0,country_of_origin,aroma,flavor,aftertaste,acidity,body,balance,uniformity,clean_cup,sweetness,cupper_points,moisture,altitude_low_meters,altitude_high_meters,altitude_mean_meters,processing_method_Other,processing_method_Pulped natural / honey,processing_method_Semi-washed / Semi-pulped,processing_method_Washed / Wet,region
0,Ethiopia,8.67,8.83,8.67,8.75,8.5,8.42,10.0,10.0,10.0,8.75,0.12,1950.0,2200.0,2075.0,0,0,0,1,africa
1,Ethiopia,8.75,8.67,8.5,8.58,8.42,8.42,10.0,10.0,10.0,8.58,0.12,1950.0,2200.0,2075.0,0,0,0,1,africa
3,Ethiopia,8.17,8.58,8.42,8.42,8.5,8.25,10.0,10.0,10.0,8.67,0.11,1800.0,2200.0,2000.0,0,0,0,0,africa
4,Ethiopia,8.25,8.5,8.25,8.5,8.42,8.33,10.0,10.0,10.0,8.58,0.12,1950.0,2200.0,2075.0,0,0,0,1,africa
9,Ethiopia,8.08,8.58,8.5,8.5,7.67,8.42,10.0,10.0,10.0,8.5,0.1,1795.0,1850.0,1822.5,0,0,0,0,africa


In [52]:
data_trainer.corr()

Unnamed: 0,aroma,flavor,aftertaste,acidity,body,balance,uniformity,clean_cup,sweetness,cupper_points,moisture,altitude_low_meters,altitude_high_meters,altitude_mean_meters,processing_method_Other,processing_method_Pulped natural / honey,processing_method_Semi-washed / Semi-pulped,processing_method_Washed / Wet
aroma,1.0,0.730794,0.677853,0.603764,0.551579,0.611399,0.13505,0.197036,0.090089,0.620442,-0.078021,-0.020674,-0.01912,-0.019897,-0.052785,-0.002699,0.032285,-0.051052
flavor,0.730794,1.0,0.848174,0.752697,0.685275,0.734379,0.22184,0.301446,0.180586,0.778922,-0.12065,-0.013684,-0.012656,-0.013171,-0.026602,0.008481,0.046016,-0.139939
aftertaste,0.677853,0.848174,1.0,0.690328,0.685504,0.764795,0.24118,0.319673,0.191644,0.778498,-0.16618,-0.028069,-0.026647,-0.027359,-0.020345,0.056854,0.043226,-0.120681
acidity,0.603764,0.752697,0.690328,1.0,0.617726,0.643872,0.191302,0.167756,0.113706,0.65243,-0.099828,0.002574,0.003814,0.003194,-0.057303,-0.005517,0.008777,-0.021607
body,0.551579,0.685275,0.685504,0.617726,1.0,0.702503,0.131102,0.144744,0.086115,0.637715,-0.200925,-0.019658,-0.01826,-0.018959,-0.05148,0.05064,0.035159,-0.135983
balance,0.611399,0.734379,0.764795,0.643872,0.702503,1.0,0.241331,0.262809,0.156111,0.719986,-0.216984,-0.017328,-0.015992,-0.01666,-0.020347,0.02491,0.047359,-0.122942
uniformity,0.13505,0.22184,0.24118,0.191302,0.131102,0.241331,1.0,0.375281,0.364216,0.206771,-0.005838,-0.012522,-0.012027,-0.012275,-0.039409,0.029326,-0.004501,-0.023793
clean_cup,0.197036,0.301446,0.319673,0.167756,0.144744,0.262809,0.375281,1.0,0.429201,0.281215,-0.031121,-0.005381,-0.005143,-0.005262,-0.002999,0.019659,0.030247,-0.057383
sweetness,0.090089,0.180586,0.191644,0.113706,0.086115,0.156111,0.364216,0.429201,1.0,0.14987,0.030718,-0.023223,-0.022884,-0.023054,-0.034582,0.016212,0.009952,0.030595
cupper_points,0.620442,0.778922,0.778498,0.65243,0.637715,0.719986,0.206771,0.281215,0.14987,1.0,-0.185892,-0.013677,-0.012424,-0.013051,-0.068347,0.029676,0.033898,-0.133631


In [53]:
# add cols for feature engineering
alt_range = data_trainer['altitude_high_meters']- data_trainer['altitude_low_meters']
data_trainer['alt_range']=alt_range
data_trainer.head()

Unnamed: 0,country_of_origin,aroma,flavor,aftertaste,acidity,body,balance,uniformity,clean_cup,sweetness,cupper_points,moisture,altitude_low_meters,altitude_high_meters,altitude_mean_meters,processing_method_Other,processing_method_Pulped natural / honey,processing_method_Semi-washed / Semi-pulped,processing_method_Washed / Wet,region,alt_range
0,Ethiopia,8.67,8.83,8.67,8.75,8.5,8.42,10.0,10.0,10.0,8.75,0.12,1950.0,2200.0,2075.0,0,0,0,1,africa,250.0
1,Ethiopia,8.75,8.67,8.5,8.58,8.42,8.42,10.0,10.0,10.0,8.58,0.12,1950.0,2200.0,2075.0,0,0,0,1,africa,250.0
3,Ethiopia,8.17,8.58,8.42,8.42,8.5,8.25,10.0,10.0,10.0,8.67,0.11,1800.0,2200.0,2000.0,0,0,0,0,africa,400.0
4,Ethiopia,8.25,8.5,8.25,8.5,8.42,8.33,10.0,10.0,10.0,8.58,0.12,1950.0,2200.0,2075.0,0,0,0,1,africa,250.0
9,Ethiopia,8.08,8.58,8.5,8.5,7.67,8.42,10.0,10.0,10.0,8.5,0.1,1795.0,1850.0,1822.5,0,0,0,0,africa,55.0


In [54]:
# reorder cols
cols = list(data_trainer.columns.values)
print(cols)

['country_of_origin', 'aroma', 'flavor', 'aftertaste', 'acidity', 'body', 'balance', 'uniformity', 'clean_cup', 'sweetness', 'cupper_points', 'moisture', 'altitude_low_meters', 'altitude_high_meters', 'altitude_mean_meters', 'processing_method_Other', 'processing_method_Pulped natural / honey', 'processing_method_Semi-washed / Semi-pulped', 'processing_method_Washed / Wet', 'region', 'alt_range']


In [55]:
data_trainer = data_trainer[['region','country_of_origin', 'alt_range', 'aroma', 'flavor', 'aftertaste', 'acidity', 'body', 'balance', 'uniformity', 'clean_cup', 'sweetness', 'cupper_points', 'moisture', 'altitude_low_meters', 'altitude_high_meters', 'altitude_mean_meters', 'processing_method_Other', 'processing_method_Pulped natural / honey', 'processing_method_Semi-washed / Semi-pulped', 'processing_method_Washed / Wet']]
# reformat order of cols 

In [62]:
#add region column
regions_int = { 'United States': 0, #'north_america',
           'Mexico': 0, #'north_america',
           'China' : 1, #'asia',
           'Taiwan': 1, #'asia',
           'Japan':  1, #'asia',
           'El Salvador':                 2, #'central_america_carib',
           'Nicaragua':                   2, #'central_america_carib',
           'Costa Rica':                  2, #'central_america_carib',
           'Guatemala':                   2, #'central_america_carib',
           'Honduras':                    2, #'central_america_carib',
           'Haiti':                       2, #'central_america_carib',
           'United States (Puerto Rico)': 2, #'central_america_carib',
           'Brazil' : 3, #'south_america',
           'Peru':      3, #'south_america',
           'Colombia':  3, #'south_america',
           'Panama':    3, #'south_america',
           'Ecuador':   3, #'south_america',
           'Ethiopia':  4, #'africa',
           'Kenya':     4, #'africa',
           'Burundi':   4, #'africa',
           'Cote d?Ivoire': 4, #'africa',
           'Rwanda':    4, #'africa',
           'Malawi':    4, #'africa',
           'Uganda':    4, #'africa',
           'Tanzania, United Republic Of': 4, #'africa',
           'Zambia':                       4, #'africa',
           'Thailand':         5, #'southeast_asia',
           'Myanmar':          5, #'southeast_asia',
           'Indonesia':        5, #'southeast_asia',
           'Papua New Guinea': 5, #'southeast_asia',
           'Vietnam':          5, #'southeast_asia',
           'Philippines':      5, #'southeast_asia',
           'Laos':             5, #'southeast_asia'
          }


# create a new column for the region as an ID value for each country
region_id_list=[]
for row in data_trainer['country_of_origin']:
    region_id = regions_int[row]
    region_id_list.append(region_id)
    
data_trainer['region_id'] = region_id_list

In [126]:
# X contains the attributes, y contains the labels
# [row_start:row_end, col_start:col_end]
y = data_trainer['region_id'].values # region_id
X = data_trainer.iloc[:,3:-1].values # all features, except region_id

In [127]:
print(cols)

['country_of_origin', 'aroma', 'flavor', 'aftertaste', 'acidity', 'body', 'balance', 'uniformity', 'clean_cup', 'sweetness', 'cupper_points', 'moisture', 'altitude_low_meters', 'altitude_high_meters', 'altitude_mean_meters', 'processing_method_Other', 'processing_method_Pulped natural / honey', 'processing_method_Semi-washed / Semi-pulped', 'processing_method_Washed / Wet', 'region', 'alt_range']


In [None]:
X_cols = data_trainer['country_of_origin', 'aroma', 'flavor', 'aftertaste', 'acidity', 'body', 'balance', 'uniformity', \
          'clean_cup', 'sweetness', 'cupper_points', 'moisture', 'altitude_low_meters', 'altitude_high_meters', \
          'altitude_mean_meters', 'processing_method_Other', 'processing_method_Pulped natural / honey', \
          'processing_method_Semi-washed / Semi-pulped', 'processing_method_Washed / Wet', \
          'alt_range']

# add aftertaste v flavor
# add flavor v aroma
# add acidity v flavor
# add body v flavor
# add aftertaste w balance
# balance v flavor
# balance and body
aftertaste vs acidity
clean cup v flavor 
clean cup v aftertaste
alt * flavor?

#need to add items as interaction terms to feed as cols to the clf below

In [128]:
data_trainer.corr()

Unnamed: 0,alt_range,aroma,flavor,aftertaste,acidity,body,balance,uniformity,clean_cup,sweetness,cupper_points,moisture,altitude_low_meters,altitude_high_meters,altitude_mean_meters,processing_method_Other,processing_method_Pulped natural / honey,processing_method_Semi-washed / Semi-pulped,processing_method_Washed / Wet,region_id
alt_range,1.0,0.111096,0.073492,0.101751,0.08842,0.099967,0.095558,0.035475,0.017014,0.024381,0.089567,-0.043196,-0.017632,-0.003622,-0.010628,-0.047192,-0.029621,-0.038289,0.085856,0.191528
aroma,0.111096,1.0,0.730794,0.677853,0.603764,0.551579,0.611399,0.13505,0.197036,0.090089,0.620442,-0.078021,-0.020674,-0.01912,-0.019897,-0.052785,-0.002699,0.032285,-0.051052,0.188929
flavor,0.073492,0.730794,1.0,0.848174,0.752697,0.685275,0.734379,0.22184,0.301446,0.180586,0.778922,-0.12065,-0.013684,-0.012656,-0.013171,-0.026602,0.008481,0.046016,-0.139939,0.218862
aftertaste,0.101751,0.677853,0.848174,1.0,0.690328,0.685504,0.764795,0.24118,0.319673,0.191644,0.778498,-0.16618,-0.028069,-0.026647,-0.027359,-0.020345,0.056854,0.043226,-0.120681,0.251334
acidity,0.08842,0.603764,0.752697,0.690328,1.0,0.617726,0.643872,0.191302,0.167756,0.113706,0.65243,-0.099828,0.002574,0.003814,0.003194,-0.057303,-0.005517,0.008777,-0.021607,0.184458
body,0.099967,0.551579,0.685275,0.685504,0.617726,1.0,0.702503,0.131102,0.144744,0.086115,0.637715,-0.200925,-0.019658,-0.01826,-0.018959,-0.05148,0.05064,0.035159,-0.135983,0.25096
balance,0.095558,0.611399,0.734379,0.764795,0.643872,0.702503,1.0,0.241331,0.262809,0.156111,0.719986,-0.216984,-0.017328,-0.015992,-0.01666,-0.020347,0.02491,0.047359,-0.122942,0.268938
uniformity,0.035475,0.13505,0.22184,0.24118,0.191302,0.131102,0.241331,1.0,0.375281,0.364216,0.206771,-0.005838,-0.012522,-0.012027,-0.012275,-0.039409,0.029326,-0.004501,-0.023793,0.156405
clean_cup,0.017014,0.197036,0.301446,0.319673,0.167756,0.144744,0.262809,0.375281,1.0,0.429201,0.281215,-0.031121,-0.005381,-0.005143,-0.005262,-0.002999,0.019659,0.030247,-0.057383,0.135357
sweetness,0.024381,0.090089,0.180586,0.191644,0.113706,0.086115,0.156111,0.364216,0.429201,1.0,0.14987,0.030718,-0.023223,-0.022884,-0.023054,-0.034582,0.016212,0.009952,0.030595,0.00417


In [125]:
# cross validate, compare models and changes across models if desired
# this cell does everything the next heading and below do, but better
from sklearn.model_selection import cross_val_score
from sklearn import metrics
from pprint import pprint

def score_func(y_true, y_pred):
    return np.mean(metrics.precision_score(y_true, y_pred, average=None))

scoring = {'precision': make_scorer(score_func)}

clf = SVC(C=3)
clf1 = KNeighborsClassifier(n_neighbors=4)
clf2 = RandomForestClassifier(n_estimators=5) # add more classifiers to find the best one

scores = []
scores.append(cross_validate(clf, X, y, scoring=scoring, cv=5))
scores.append(cross_validate(clf1, X, y, scoring=scoring, cv=5))
scores.append(cross_validate(clf2, X, y, scoring=scoring, cv=5))

print('test_precision avg')
for score_set in scores:
    print(np.mean(score_set['test_precision']))

print('train_precision avg')
for score_set in scores:
    print(np.mean(score_set['train_precision']))


test_precision avg
0.6200068372928169
0.5553911639592046
0.5022802745012646
train_precision avg
0.8079619303339731
0.7005152552148548
0.9611829855566316




## try diff classifiers

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20) # train, test split 

In [None]:
scaler = StandardScaler()  # normalize
scaler.fit(X_train)

X_train = scaler.transform(X_train)  
X_test = scaler.transform(X_test)

### SVC

In [17]:
classifier = SVC(C=3)
classifier.fit(X_train, y_train) 
y_pred = classifier.predict(X_test) 

In [33]:
# evaluate performance  
print(confusion_matrix(y_test, y_pred))  
print(classification_report(y_test, y_pred)) 

[[ 5  0  2  7  1  0]
 [ 1  2  7  3  6  1]
 [ 5  0 41  6  7  1]
 [ 2  0 13 24  5  0]
 [ 4  1 11  0 35  0]
 [ 1  2  2  3  2  1]]
                       precision    recall  f1-score   support

               africa       0.28      0.33      0.30        15
                 asia       0.40      0.10      0.16        20
central_america_carib       0.54      0.68      0.60        60
        north_america       0.56      0.55      0.55        44
        south_america       0.62      0.69      0.65        51
       southeast_asia       0.33      0.09      0.14        11

          avg / total       0.52      0.54      0.51       201



### K nearest neighbors classification

In [None]:
classifier = KNeighborsClassifier(n_neighbors=4)
classifier.fit(X_train, y_train) 
y_pred = classifier.predict(X_test) 

In [None]:
# evaluate performance  
print(confusion_matrix(y_test, y_pred))  
print(classification_report(y_test, y_pred)) 

### Random Forest Classifier

In [None]:
classifier = RandomForestClassifier(n_estimators=5)
classifier.fit(X_train, y_train) 
y_pred = classifier.predict(X_test) 

In [None]:
# evaluate performance  
print(confusion_matrix(y_test, y_pred))  
print(classification_report(y_test, y_pred)) 

### Graph for best k number - irrelevant?

In [None]:
error = []

# Calculating error for K values between 1 and 40

for i in range(1, 40):  
    knn = KNeighborsClassifier(n_neighbors=i)
    knn.fit(X_train, y_train)
    pred_i = knn.predict(X_test)
    error.append(np.mean(pred_i != y_test))

In [None]:
plt.figure(figsize=(17, 7))  
plt.plot(range(1, 40), error, color='black', linestyle='dashed', marker='.', markerfacecolor='green', markersize=7)
plt.title('K Value Error Change')
plt.xlabel('K Value')
plt.ylabel('Mean Error')