# Build Classification Model

In [3]:
import pandas as pd
cuisines_df = pd.read_csv("./cleaned_cuisines.csv")
cuisines_df.head()

Unnamed: 0.1,Unnamed: 0,cuisine,almond,angelica,anise,anise_seed,apple,apple_brandy,apricot,armagnac,...,whiskey,white_bread,white_wine,whole_grain_wheat_flour,wine,wood,yam,yeast,yogurt,zucchini
0,0,indian,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,indian,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,indian,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3,indian,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,4,indian,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [4]:
cuisines_label_df = cuisines_df['cuisine']
cuisines_label_df.head()

0    indian
1    indian
2    indian
3    indian
4    indian
Name: cuisine, dtype: object

In [5]:
cuisines_feature_df = cuisines_df.drop(['Unnamed: 0', 'cuisine'], axis=1)
cuisines_feature_df.head()

Unnamed: 0,almond,angelica,anise,anise_seed,apple,apple_brandy,apricot,armagnac,artemisia,artichoke,...,whiskey,white_bread,white_wine,whole_grain_wheat_flour,wine,wood,yam,yeast,yogurt,zucchini
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [6]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score,precision_score,confusion_matrix,classification_report, precision_recall_curve
import numpy as np

In [7]:
X_train, X_test, y_train, y_test = train_test_split(cuisines_feature_df, cuisines_label_df, test_size=0.3)

In [13]:
C = 10
# Create different classifiers.
classifiers = {
    'Linear SVC': SVC(kernel='linear', C=C, probability=True,random_state=0),
    'KNN classifier': KNeighborsClassifier(C),
    'SVC': SVC(),
    'RFST': RandomForestClassifier(n_estimators=100),
    'ADA': AdaBoostClassifier(n_estimators=100)
}

In [15]:
n_classifiers = len(classifiers)

for index, (name, classifier) in enumerate(classifiers.items()):
    classifier.fit(X_train, np.ravel(y_train))

    y_pred = classifier.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print("Accuracy (train) for %s: %0.1f%% " % (name, accuracy * 100))
    print(classification_report(y_test,y_pred))

Accuracy (train) for Linear SVC: 80.8% 
              precision    recall  f1-score   support

     chinese       0.72      0.72      0.72       220
      indian       0.86      0.90      0.88       237
    japanese       0.80      0.82      0.81       234
      korean       0.88      0.76      0.82       262
        thai       0.78      0.84      0.81       246

    accuracy                           0.81      1199
   macro avg       0.81      0.81      0.81      1199
weighted avg       0.81      0.81      0.81      1199

Accuracy (train) for KNN classifier: 72.2% 
              precision    recall  f1-score   support

     chinese       0.62      0.67      0.64       220
      indian       0.80      0.83      0.81       237
    japanese       0.64      0.83      0.73       234
      korean       0.95      0.56      0.71       262
        thai       0.70      0.74      0.72       246

    accuracy                           0.72      1199
   macro avg       0.74      0.73      0.72    

### Ranking (of best accuracy)

- RFST: 83.2%
- SVC: 82.8%
- Linear SVC: 80.8%
- KNN: 72.2%
- ADA: 68.5%

It makes sense that RFST would come in first as it it an averaging method of solid-accuracy models

Assignment:

To alter the parameters of the classifiers, lets make multiple K-Neighbor Classifiers models with different k lavues to determine the effects!

In [25]:
KNNClassifiers = {}

k = 1

while k <= 50:
  label = "k = " + str(k)
  KNNClassifiers[label] = KNeighborsClassifier(k)
  k += 1

# The k values are being incremented to determine how the accuracy changes with differing C values

for index, (name, classifier) in enumerate(KNNClassifiers.items()):
    classifier.fit(X_train, np.ravel(y_train))

    y_pred = classifier.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print("Accuracy when %s: %0.1f%% " % (name, accuracy * 100))

Accuracy when k = 1: 76.4% 
Accuracy when k = 2: 71.7% 
Accuracy when k = 3: 72.7% 
Accuracy when k = 4: 74.8% 
Accuracy when k = 5: 74.6% 
Accuracy when k = 6: 74.1% 
Accuracy when k = 7: 73.1% 
Accuracy when k = 8: 73.0% 
Accuracy when k = 9: 73.1% 
Accuracy when k = 10: 72.2% 
Accuracy when k = 11: 72.1% 
Accuracy when k = 12: 72.5% 
Accuracy when k = 13: 72.1% 
Accuracy when k = 14: 71.9% 
Accuracy when k = 15: 70.9% 
Accuracy when k = 16: 70.8% 
Accuracy when k = 17: 70.9% 
Accuracy when k = 18: 70.6% 
Accuracy when k = 19: 70.2% 
Accuracy when k = 20: 69.1% 
Accuracy when k = 21: 69.0% 
Accuracy when k = 22: 68.6% 
Accuracy when k = 23: 68.7% 
Accuracy when k = 24: 68.7% 
Accuracy when k = 25: 68.6% 
Accuracy when k = 26: 68.7% 
Accuracy when k = 27: 68.1% 
Accuracy when k = 28: 67.7% 
Accuracy when k = 29: 67.1% 
Accuracy when k = 30: 67.6% 
Accuracy when k = 31: 67.3% 
Accuracy when k = 32: 66.8% 
Accuracy when k = 33: 66.3% 
Accuracy when k = 34: 65.6% 
Accuracy when k = 35: 6

It looks like as K increases initially (up to the 4 - 5 range), the accuracy increases. This manes sense, as the model has more reference points to learn from. However, as K increases more, the accuracy goes way down. This, also, makes sense, as it the model as too many references, the it cant pinpoint with references match which classification. Therefore, KNN classifiers must be used under a sweetspot, a small enough AND big enough K value.