In [34]:
import pandas as pd
import numpy as np

In [35]:
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.multiclass import OneVsRestClassifier

### Upload the initial data so that only ingredients in our dataset would be used

In [117]:
data=pd.read_csv('data/epi_data.csv')

In [118]:
data.drop('Unnamed: 0', axis=1, inplace=True)

In [4]:
data_short.drop('Unnamed: 0', axis=1, inplace=True)

In [5]:
import json
from pprint import pprint

In [13]:
with open('data/train.json') as f:
    train_data = json.load(f)

In [14]:
with open('data/test.json') as f:
    test_data = json.load(f)

In [15]:
train_data[0]

{'cuisine': 'greek',
 'id': 10259,
 'ingredients': ['romaine lettuce',
  'black olives',
  'grape tomatoes',
  'garlic',
  'pepper',
  'purple onion',
  'seasoning',
  'garbanzo beans',
  'feta cheese crumbles']}

In [11]:
ingredients=pd.read_csv('data/ingredients.csv')
ingredients.drop('Column1', axis=1, inplace=True)

In [12]:
len(ingredients['full name'])

3707

In [19]:
train=pd.DataFrame([x['cuisine'] for x in train_data], columns=['cuisine'], index=[x['id'] for x in train_data])

In [23]:
train.head()

Unnamed: 0,cuisine,cremini mushrooms
10259,greek,0
25693,southern_us,0
20130,filipino,0
22213,indian,0
13162,indian,0


In [30]:
for ingr in ingredients['full name']:
    train[ingr]=pd.Series([1 if ingr in x['ingredients'] else 0 for x in train_data], index=[x['id'] for x in train_data])

In [31]:
train.shape

(39774, 3708)

In [33]:
train.to_csv('train.csv', encoding='utf-8')

In [54]:
train_as_text = [' '.join([x.lower() for x in sample['ingredients'] if x in list(ingredients['full name'])]) 
                 for sample in train_data] 
train_cuisine = [sample['cuisine'] for sample in train_data]

In [57]:
train_as_text[0]

'romaine lettuce black olives grape tomatoes garlic pepper seasoning garbanzo beans'

### Split to test and dev

In [93]:
idx_train=[]
idx_dev=[]
for cuis in set(train_cuisine):
    idx=[i for i in range(len(train_cuisine)) if train_cuisine[i]==cuis]
    np.random.shuffle(idx)
    a=int(len(idx)*4/5)
    idx_train.extend(idx[:a])
    idx_dev.extend(idx[a:])

In [96]:
train=[train_as_text[i] for i in idx_train]
dev=[train_as_text[i] for i in idx_dev]

In [97]:
train_y=[train_cuisine[i] for i in idx_train]
dev_y=[train_cuisine[i] for i in idx_dev]

### Tfidf vectorizer for train data

In [98]:
tfidf_enc = TfidfVectorizer(binary=True)
lbl_enc = LabelEncoder()

X = tfidf_enc.fit_transform(train)
X = X.astype('float16')

X_dev = tfidf_enc.transform(dev)
X_dev = X_dev.astype('float16')

y_train = lbl_enc.fit_transform(train_y)
y_dev = lbl_enc.fit_transform(dev_y)

### SVC with OVR model

In [99]:
clf = SVC(C=100, kernel='rbf', degree=3,
          gamma=1, coef0=1, shrinking=True, 
          probability=False, tol=0.001, cache_size=200,
          class_weight=None, verbose=True, max_iter=-1,
          decision_function_shape=None, random_state=None)
model = OneVsRestClassifier(clf, n_jobs=4)
model.fit(X,y_train)

OneVsRestClassifier(estimator=SVC(C=100, cache_size=200, class_weight=None, coef0=1,
  decision_function_shape=None, degree=3, gamma=1, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=True),
          n_jobs=4)

### Test

In [101]:
y_test = model.predict(X_dev)
test_cuisine = lbl_enc.inverse_transform(y_dev)
result=lbl_enc.inverse_transform(y_test)

In [112]:
from sklearn.metrics import confusion_matrix
pd.DataFrame(confusion_matrix(test_cuisine, result))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,50,2,3,0,2,2,1,0,0,6,0,0,0,9,0,1,8,2,7,1
1,0,86,2,3,0,17,1,4,6,7,1,0,1,3,0,1,25,4,0,0
2,0,1,227,0,2,11,0,1,0,14,2,1,0,3,0,0,47,1,0,0
3,0,0,2,465,7,2,0,0,0,4,0,17,10,6,0,1,7,1,9,4
4,5,2,1,15,92,1,1,2,1,3,1,2,0,4,0,0,8,4,5,4
5,0,7,3,0,2,327,8,2,8,102,0,5,0,9,5,6,38,8,0,0
6,0,0,1,2,0,6,156,5,0,42,0,0,0,4,6,2,7,2,2,0
7,0,1,0,2,2,2,6,547,3,7,1,4,1,9,5,1,4,1,5,0
8,0,13,1,0,1,12,2,1,69,10,2,0,0,4,1,2,16,0,0,0
9,2,4,8,0,0,66,21,4,6,1385,0,1,1,12,4,3,40,10,0,1


In [114]:
from sklearn.metrics import accuracy_score
accuracy_score(test_cuisine, result)

0.78384827932680234

In [115]:
from sklearn.metrics import classification_report
print(classification_report(test_cuisine, result))

              precision    recall  f1-score   support

   brazilian       0.75      0.53      0.62        94
     british       0.63      0.53      0.58       161
cajun_creole       0.73      0.73      0.73       310
     chinese       0.79      0.87      0.83       535
    filipino       0.72      0.61      0.66       151
      french       0.61      0.62      0.62       530
       greek       0.72      0.66      0.69       235
      indian       0.88      0.91      0.90       601
       irish       0.60      0.51      0.55       134
     italian       0.80      0.88      0.84      1568
    jamaican       0.87      0.72      0.79       106
    japanese       0.78      0.69      0.73       285
      korean       0.83      0.73      0.78       166
     mexican       0.90      0.92      0.91      1288
    moroccan       0.83      0.79      0.81       165
     russian       0.66      0.51      0.57        98
 southern_us       0.71      0.74      0.73       864
     spanish       0.64    

### Retrain the model on full set

In [151]:
tfidf_enc = TfidfVectorizer(binary=True)
lbl_enc = LabelEncoder()

X = tfidf_enc.fit_transform(train_as_text)
X = X.astype('float16')

y = lbl_enc.fit_transform(train_cuisine)

In [152]:
clf = SVC(C=100, kernel='rbf', degree=3,
          gamma=1, coef0=1, shrinking=True, 
          probability=False, tol=0.001, cache_size=200,
          class_weight=None, verbose=True, max_iter=-1,
          decision_function_shape=None, random_state=None)
model = OneVsRestClassifier(clf, n_jobs=4)
model.fit(X,y)

OneVsRestClassifier(estimator=SVC(C=100, cache_size=200, class_weight=None, coef0=1,
  decision_function_shape=None, degree=3, gamma=1, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=True),
          n_jobs=4)

### Prediction

In [119]:
data.head()

Unnamed: 0,title,cremini mushrooms,caramel,orecchiette,maldon sea salt,baby bok choy,sliced mushrooms,pimenton,radish,mortadella,...,calories,protein,carbs,sodium,steps,breakfast,lunch,dinner,snack,drink
0,"Lentil, Apple, and Turkey Wrap",0,0,0,0,0,0,0,0,0,...,426.0,30.0,60.75,559.0,3,0,0,0,0,0
1,Boudin Blanc Terrine with Red Onion Confit,0,0,0,0,0,0,0,0,0,...,403.0,18.0,31.0,1439.0,5,0,0,0,0,0
2,Potato and Fennel Soup Hodge,0,0,0,0,0,0,0,0,0,...,165.0,6.0,19.5,165.0,2,0,0,0,0,0
3,Mahi-Mahi in Tomato Olive Sauce,0,0,0,0,0,0,0,0,0,...,,,,,2,0,0,1,0,0
4,Spinach Noodle Casserole,0,0,0,0,0,0,0,0,0,...,547.0,20.0,44.75,452.0,1,0,0,0,0,0


In [142]:
def return_string(i):
    x=list(data.loc[i][data.loc[i]==1].index)
    list(filter(lambda y: y in list(ingredients['full name']), x))
    return (' '.join(x))

In [143]:
test_as_text = [return_string(i) for i in data.index]

In [154]:
X_test = tfidf_enc.transform(test_as_text)
X_test = X_test.astype('float16')

In [155]:
y_test = model.predict(X_test)
test_cuisine = lbl_enc.inverse_transform(y_test)

In [156]:
[(x, list(test_cuisine).count(x)) for x in set(test_cuisine)]

[('japanese', 382),
 ('italian', 5217),
 ('mexican', 2030),
 ('jamaican', 106),
 ('brazilian', 197),
 ('greek', 663),
 ('southern_us', 2932),
 ('indian', 816),
 ('spanish', 467),
 ('filipino', 35),
 ('british', 566),
 ('thai', 397),
 ('chinese', 699),
 ('french', 4052),
 ('moroccan', 471),
 ('vietnamese', 108),
 ('russian', 344),
 ('korean', 124),
 ('cajun_creole', 238),
 ('irish', 267)]

In [157]:
test_cuisine[:5]

array(['mexican', 'french', 'irish', 'italian', 'italian'], 
      dtype='<U12')

In [158]:
len(test_cuisine)

20111

In [159]:
pd.DataFrame(test_cuisine).to_csv("cuisine.csv", encoding='UTF-8')