In [1]:
import os
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter
import numpy as np

import sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score, make_scorer

In [2]:
# Directory holding the pickled recipe datasets. The original local path stays
# as the default; set the RECIPE_DATA_DIR environment variable to run the
# notebook on another machine without editing this cell.
FOLDER_PATH = os.environ.get(
    "RECIPE_DATA_DIR",
    "D:\\Google Drive\\Catherning Folder\\THU\\Thesis\\Recipe datasets\\scirep-cuisines-detail",
)
# Pickle file names expected inside FOLDER_PATH.
FILES = ["cleaned_data.pkl", "full_data.pkl"]

In [3]:
# Load the second pickle listed in FILES and turn its index into a regular
# column, leaving a fresh 0..n-1 row index on the frame.
data_file = os.path.join(FOLDER_PATH, FILES[1])
df_train = pd.read_pickle(data_file).reset_index()
df_train

Unnamed: 0,index,cuisine,id,ingredients,all_ingredients
0,0,Canada,0,"[egg, yeast, wheat, milk, lard]",egg;yeast;wheat;milk;lard
1,1,Canada,1,"[pork, carrot, pea, onion, potato]",pork;carrot;pea;onion;potato
2,2,Canada,2,[maple_syrup],maple_syrup
3,3,Canada,3,"[wheat, yeast, almond, honey, oat, date, veget...",wheat;yeast;almond;honey;oat;date;vegetable_oi...
4,4,Canada,4,"[butter, lovage, clam, wheat, onion, thyme, po...",butter;lovage;clam;wheat;onion;thyme;potato;ye...
...,...,...,...,...,...
57686,57686,Italian,2453,"[kiwi, olive_oil, clam, white_wine, orange, sa...",kiwi;olive_oil;clam;white_wine;orange;salmon;f...
57687,57687,Italian,2454,"[tomato, butter, beef, onion, red_wine, black_...",tomato;butter;beef;onion;red_wine;black_pepper...
57688,57688,Italian,2455,"[vegetable, wheat, egg, cheese, olive_oil]",vegetable;wheat;egg;cheese;olive_oil
57689,57689,Italian,2456,"[tomato, clam, black_pepper, parsley, celery, ...",tomato;clam;black_pepper;parsley;celery;macaroni


In [6]:
# Bag-of-words features: learn the token vocabulary from the ingredient
# strings and count token occurrences per recipe in a single pass.
cv = CountVectorizer()
ingredient_docs = df_train['all_ingredients'].values
X = cv.fit_transform(ingredient_docs)

## Processing

In [7]:
# Encode each cuisine name as an integer class id. enc.classes_ keeps the
# names, so an id can be mapped back to its cuisine later.
enc = LabelEncoder()
cuisine_names = df_train['cuisine']
y = enc.fit_transform(cuisine_names)
print(enc.classes_)
enc.classes_.shape

['African' 'American' 'Asian' 'Austria' 'Bangladesh' 'Belgium'
 'Cajun_Creole' 'Canada' 'Caribbean' 'Central_SouthAmerican' 'Chinese'
 'East-African' 'Eastern-Europe' 'English_Scottish' 'French' 'Germany'
 'Greek' 'Indian' 'Indonesia' 'Iran' 'Irish' 'Italian' 'Japanese' 'Jewish'
 'Korean' 'Lebanon' 'Malaysia' 'Mediterranean' 'Mexican' 'MiddleEastern'
 'Moroccan' 'Netherlands' 'North-African' 'Pakistan' 'Philippines'
 'Portugal' 'Scandinavian' 'South-African' 'South-America'
 'Southern_SoulFood' 'Southwestern' 'Spanish_Portuguese' 'Switzerland'
 'Thai' 'Turkey' 'UK-and-Ireland' 'Vietnamese' 'West-African' 'asian'
 'east_asian' 'western']


(51,)

In [8]:
# Hold out 20% of the recipes for evaluation. The split is seeded (same seed as
# the random forest) so the held-out set, and every score computed on it, is
# identical after Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

# Number of held-out recipes per encoded class id, to expose the class imbalance.
unique, counts = np.unique(y_test, return_counts=True)
dict(zip(unique, counts))

{0: 30,
 1: 8026,
 2: 235,
 3: 5,
 4: 1,
 5: 4,
 6: 28,
 7: 158,
 8: 35,
 9: 54,
 10: 84,
 11: 4,
 12: 75,
 13: 46,
 14: 257,
 15: 49,
 16: 52,
 17: 129,
 18: 2,
 19: 4,
 20: 12,
 21: 663,
 22: 55,
 23: 76,
 24: 153,
 25: 10,
 26: 6,
 27: 60,
 28: 455,
 29: 55,
 30: 29,
 31: 6,
 32: 15,
 33: 4,
 34: 8,
 35: 10,
 36: 40,
 37: 4,
 38: 23,
 39: 74,
 40: 21,
 41: 63,
 42: 4,
 43: 67,
 44: 1,
 45: 54,
 46: 20,
 47: 2,
 48: 5,
 49: 172,
 50: 94}

# Logistic Regression

In [None]:
# Search grid: regularisation parameter C crossed with two solvers.
parameters = dict(
    C=[0.1, 0.3, 0.6, 0.9, 1],
    solver=('newton-cg', 'liblinear'),
)
# Candidates are ranked by weighted F1.
scoring = 'f1_weighted'
# Template estimator handed to the search below.
logistic = LogisticRegression(
    class_weight="balanced",
    multi_class="auto",
    max_iter=150,
    warm_start=True,
)

# 4-fold cross-validated grid search, fitted on the training split only.
clf = GridSearchCV(estimator=logistic, param_grid=parameters, scoring=scoring, cv=4)
clf.fit(X_train, y_train)

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


In [None]:
# pd.DataFrame(clf.cv_results_)
print(clf.best_estimator_)
clf.best_score_

In [None]:
# Accuracy of the selected logistic regression on the held-out split.
accuracy_score(y_test, clf.best_estimator_.predict(X_test))

## Results

In [None]:
# Row-normalised confusion matrix of the tuned logistic regression on the test split.
plt.figure(figsize=(10, 10))

# `logistic` is only the template handed to GridSearchCV and is never fitted
# itself, so predictions come from the refitted best model held by `clf`.
# This cell must run while `clf` is still the logistic-regression search.
# Passing every encoded class id keeps the matrix aligned with enc.classes_
# even when a class is absent from both y_test and the predictions.
class_ids = np.arange(len(enc.classes_))
cm = confusion_matrix(y_test, clf.best_estimator_.predict(X_test), labels=class_ids)

# Normalise each row by its number of true samples; rows without any true
# sample stay at zero instead of turning into NaN through a 0/0 division.
row_totals = cm.sum(axis=1, keepdims=True)
cm_normalized = np.divide(
    cm.astype('float'),
    row_totals,
    out=np.zeros(cm.shape, dtype=float),
    where=row_totals != 0,
)

plt.imshow(cm_normalized, interpolation='nearest')
plt.title("confusion matrix")
plt.colorbar(shrink=0.3)
# Cuisine names ordered by frequency (NOT by class id); kept as a notebook-level name.
cuisines = df_train['cuisine'].value_counts().index
tick_marks = class_ids
plt.xticks(tick_marks, enc.classes_, rotation=90)
plt.yticks(tick_marks, enc.classes_)
plt.ylabel('True label')
plt.xlabel('Predicted label')
# Lay out last so the axis labels are taken into account.
plt.tight_layout()

In [None]:
# Per-class precision / recall / F1 of the tuned logistic regression on the test split.
# `logistic` itself is never fitted (GridSearchCV trains clones of it), so predict
# with the refitted best model; this relies on `clf` still being the logistic search.
y_pred = clf.best_estimator_.predict(X_test)

# Derive the label ids from the encoder instead of hard-coding a count, so
# `labels` and `target_names` always have the same length and order.
class_ids = np.arange(len(enc.classes_))
print(classification_report(y_test, y_pred, labels=class_ids, target_names=enc.classes_))

# Random Forest

Much slower than logistic regression, and the results are a bit worse.

In [None]:
# Only the maximum tree depth is tuned for the forest.
parameters = dict(max_depth=(None, 3, 5, 8))
# Candidates are ranked by weighted F1.
scoring = 'f1_weighted'
# Template forest handed to the search below.
rf = RandomForestClassifier(
    class_weight="balanced",
    n_estimators=200,
    random_state=123,
)

# 4-fold cross-validated grid search, fitted on the training split only.
clf = GridSearchCV(estimator=rf, param_grid=parameters, scoring=scoring, cv=4)
clf.fit(X_train, y_train)

In [None]:
# Winning random-forest configuration, followed by its cross-validated score.
best_forest = clf.best_estimator_
print(best_forest)
clf.best_score_

In [None]:
# Accuracy of the selected random forest on the held-out split.
accuracy_score(y_test, clf.best_estimator_.predict(X_test))

## Results

In [None]:
# `rf` is only the template passed to GridSearchCV and is never fitted itself;
# use the refitted best forest held by `clf` (requires that `clf` is still the
# random-forest search when this cell runs).
y_pred = clf.best_estimator_.predict(X_test)

# Class ids and names both come from the label encoder so every report row
# carries the right cuisine; a frequency-ordered name list would mislabel rows.
class_ids = np.arange(len(enc.classes_))
print(classification_report(y_test, y_pred, labels=class_ids, target_names=enc.classes_))

# SGD linear classifiers (SVM and other losses)

In [None]:
# Loss functions to compare for the SGD-trained linear classifier.
parameters = {'loss':('hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron')
              }
# Candidates are ranked by weighted F1.
scoring = 'f1_weighted'
# Seeded like the random forest: SGD shuffles the training data and, with
# early stopping, sets aside a random validation subset, so an unseeded
# estimator gives different scores on every run.
sgd = SGDClassifier(class_weight="balanced", early_stopping=True, warm_start=True, random_state=123)

# 4-fold cross-validated grid search, fitted on the training split only.
clf = GridSearchCV(sgd, parameters, cv=4, scoring=scoring)
clf.fit(X_train, y_train)

In [None]:
# Winning SGD configuration, followed by its cross-validated score.
best_sgd = clf.best_estimator_
print(best_sgd)
clf.best_score_

In [None]:
# Accuracy of the selected SGD classifier on the held-out split.
accuracy_score(y_test, clf.best_estimator_.predict(X_test))