> # Cuisine Predictor: Predicts cuisine based on a list of ingredients

> ## Importing all necessary modules and libraries

In [1]:
import pandas as pd
import numpy as np
# for the word vector representations
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize 
import warnings 
import gensim 
from gensim.models import Word2Vec 
# for model fitting and train-test splits
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split

> # Reading the dataset (json format)

In [2]:
# Location of the competition's training recipes.
train_json_path = '../input/whats-cooking-kernels-only/train.json'

# Open the file explicitly so the encoding is under our control, then let
# pandas parse the JSON from the already-decoded text stream.
with open(train_json_path, encoding='utf-8-sig') as f_input:
    df = pd.read_json(f_input)

df.head()

Unnamed: 0,id,cuisine,ingredients
0,10259,greek,"[romaine lettuce, black olives, grape tomatoes..."
1,25693,southern_us,"[plain flour, ground pepper, salt, tomatoes, g..."
2,20130,filipino,"[eggs, pepper, salt, mayonaise, cooking oil, g..."
3,22213,indian,"[water, vegetable oil, wheat, salt]"
4,13162,indian,"[black pepper, shallots, cornflour, cayenne pe..."


> ## Normalizing ingredient names so that the Word2Vec model reads each multi-word ingredient as a single token

In [3]:
# Ordered (old, new) substitutions applied to every raw ingredient name.
# The order is significant: e.g. '!®' must be handled before the bare '!' and
# '®' rules, and everything runs before whitespace is collapsed into '^'.
_INGREDIENT_REPLACEMENTS = (
    (",", "desc"),
    ('1%', 'one^percent'),
    ('2%', 'two^percent'),
    ('&', 'and'),
    ('!®', '^reserved'),
    ("'s", 's'),
    ("’s", 's'),
    ('(flour)', 'flour'),
    ('®', '^reserved'),
    ('™', '^trademark'),
    ("can't", "can^not"),
    ("(powder)", 'powder'),
    ('95%', 'ninety^percent'),
    ('30%', 'thirty^percent'),
    ('40%', 'forty^percent'),
    ('33%', 'thirtythree^percent'),
    ('25%', 'twentyfive^percent'),
    ('96%', 'ninetysix^percent'),
    ('( oz.)', 'oz'),
    ("(not low fat)", "not low fat"),
    ("!", " "),
    ("''", "^"),
    ("'n", "n"),
    ("(10 oz.)", "ten ozz"),
    ("(14 oz.)", "fourteen ozz"),
    ("(15 oz.)", "fifteen ozz"),
    ("(14.5 oz.)", "fourteenandhalf ozz"),
)


def preprocessing(ing):
    """Turn every ingredient name into a single lower-case '^'-joined token.

    Each name first goes through the ordered substitutions in
    ``_INGREDIENT_REPLACEMENTS``; the result is split on whitespace, re-joined
    with '^' and lower-cased (``"Romaine Lettuce"`` -> ``"romaine^lettuce"``).

    Parameters
    ----------
    ing : sequence of sequences of str
        One inner sequence of raw ingredient names per recipe. May be a list
        or a pandas Series; the recipes are iterated by value, so a Series
        does not need a default 0..n-1 index.

    Returns
    -------
    list of list of str
        The normalized names, in the same order and nesting as ``ing``.

    Side effects
    ------------
    Prints the number of recipes processed.
    """
    ings_ns = []
    for recipe in ing:
        cus_list = []
        for raw_name in recipe:
            name = raw_name
            for old, new in _INGREDIENT_REPLACEMENTS:
                name = name.replace(old, new)
            # Collapse any run of whitespace into a single '^' separator.
            cus_list.append('^'.join(name.split()).lower())
        ings_ns.append(cus_list)
    print(len(ing))
    return ings_ns

In [4]:
# Normalize every recipe's ingredient names with preprocessing() and keep the
# result next to the raw names in a new 'ings' column.
ings = df['ingredients']
ings_u = preprocessing(ings)
df['ings'] = ings_u
df.head()

39774


Unnamed: 0,id,cuisine,ingredients,ings
0,10259,greek,"[romaine lettuce, black olives, grape tomatoes...","[romaine^lettuce, black^olives, grape^tomatoes..."
1,25693,southern_us,"[plain flour, ground pepper, salt, tomatoes, g...","[plain^flour, ground^pepper, salt, tomatoes, g..."
2,20130,filipino,"[eggs, pepper, salt, mayonaise, cooking oil, g...","[eggs, pepper, salt, mayonaise, cooking^oil, g..."
3,22213,indian,"[water, vegetable oil, wheat, salt]","[water, vegetable^oil, wheat, salt]"
4,13162,indian,"[black pepper, shallots, cornflour, cayenne pe...","[black^pepper, shallots, cornflour, cayenne^pe..."


> ## Writing the ingredients to a text file. The cuisine is inserted next to every ingredient so that Word2Vec places the ingredients associated with a particular cuisine closer to that cuisine.

In [5]:
# Build the Word2Vec training corpus: one line per recipe, with the cuisine
# label placed before, between and after the ingredient tokens.
ing = df['ings']
cudi = df['cuisine']
writing = list()
ingsGlove = list()
# Iterate over the values instead of looking rows up by position, so the cell
# does not depend on df having a default 0..n-1 index.
for cuisine_label, recipe_tokens in zip(cudi, ing):
    tokens = [str(elem) for elem in recipe_tokens]

    # '#'-delimited rendering of the recipe; collected in ingsGlove but not
    # written to disk by this cell.
    hash_sep = '#' + cuisine_label + "#"
    hash_line = cuisine_label + '#' + hash_sep.join(tokens).lower() + '#' + cuisine_label + '#'
    ingsGlove.append(hash_line)

    # Space-delimited rendering: this is the line that goes into MyFile.txt.
    space_sep = '  ' + cuisine_label + '  '
    text_line = cuisine_label + '  ' + space_sep.join(tokens).lower() + '  ' + cuisine_label + '\n'
    writing.append(text_line)

# The context manager closes the file even if the write fails.
with open("MyFile.txt", "w") as file1:
    file1.writelines(writing)
df.head()

Unnamed: 0,id,cuisine,ingredients,ings
0,10259,greek,"[romaine lettuce, black olives, grape tomatoes...","[romaine^lettuce, black^olives, grape^tomatoes..."
1,25693,southern_us,"[plain flour, ground pepper, salt, tomatoes, g...","[plain^flour, ground^pepper, salt, tomatoes, g..."
2,20130,filipino,"[eggs, pepper, salt, mayonaise, cooking oil, g...","[eggs, pepper, salt, mayonaise, cooking^oil, g..."
3,22213,indian,"[water, vegetable oil, wheat, salt]","[water, vegetable^oil, wheat, salt]"
4,13162,indian,"[black pepper, shallots, cornflour, cayenne pe...","[black^pepper, shallots, cornflour, cayenne^pe..."


> # Training the word embedding model, with the vocabulary consisting of words that are ingredients used.

In [6]:
# Train a Word2Vec model on the recipe corpus stored in MyFile.txt.

# Tokenizer data used by sent_tokenize / word_tokenize.
nltk.download('punkt')

# Silences every warning raised from here on.
warnings.filterwarnings(action = 'ignore')

# Read the whole corpus; the context manager closes the handle afterwards.
with open("MyFile.txt", "r") as sample:
    s = sample.read()

# Replace the newline characters with spaces.
f = s.replace("\n", " ")

# One list of lower-cased tokens per sentence detected by NLTK.
data = [[token.lower() for token in word_tokenize(sentence)]
        for sentence in sent_tokenize(f)]

# Create CBOW model (gensim's default training algorithm). min_count = 1
# keeps tokens that occur only once.
# NOTE(review): `size` is the gensim 3.x parameter name; gensim >= 4 renamed
# it to `vector_size`.
model1 = gensim.models.Word2Vec(data, min_count = 1,
                              size = 200, window = 5)

# Print results. Similarity is queried through the model's KeyedVectors
# (`model1.wv`), the same object the later cells index for raw vectors.
print("Cosine similarity between 'rajma' " +
               "and 'tumeric' - CBOW : ",
    model1.wv.similarity('rajma', 'tumeric'))

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Cosine similarity between 'rajma' and 'tumeric' - CBOW :  0.8235271


#### Rajma and tumeric are both ingredients associated with Indian cuisine. The Word2Vec model captures this association well: the cosine similarity between the two vectors is about 0.82.

> ## Manually changing some word vectors
* ### Over-specific ingredients hurt the performance of the model; therefore, ingredients that carry brand names and have very few occurrences are given the vector of their generic counterpart

In [7]:
# Give three over-specific ingredient tokens the vector of a generic
# ingredient: the two '(^oz.)^tomato^...' tokens reuse the plain tomato
# sauce / paste vectors and the branded brats token reuses 'sausages'.
# NOTE(review): presumably these exact keys are not produced by the tokenizer
# and therefore need to be inserted by hand - confirm.
model1.wv['(^oz.)^tomato^sauce'] = model1.wv['tomato^sauce']
model1.wv['(^oz.)^tomato^paste'] = model1.wv['tomato^paste']
model1.wv["johnsonville^reserved^hot^'n^spicy^brats"] = model1.wv['sausages']
# Display the length of one vector as a sanity check on the dimensionality.
len(model1.wv['grape^tomatoes'])

200

> # The similarity indices between some ingredients and cuisines to see if the model was able to pick up the details and intricacies.
* The cosine similarities between different breeds of tomatoes ought to be higher, as calculated by the model. 
* Some cuisines are more similar than others, for example *Indian* and *Moroccan* are more similar when compared to *Indian* and *british* 
* Mirin and sake are wines used for cooking; they differ only slightly in alcohol and sugar content. The model captured this similarity with a score of 0.99.
* Milk and Sake, having little or no relation, have a similarity of 0.19. 

In [8]:
# Spot-check a few pairs. Similarity is queried through the KeyedVectors
# object (`model1.wv`), matching how the vectors themselves are accessed
# elsewhere in this notebook.
word_vectors = model1.wv
print("Cosine similarity between 'grape tomatoes' and 'cherry tomatoes' - CBOW : ", word_vectors.similarity('cherry^tomatoes', 'grape^tomatoes'))
print("Cosine similarity between 'moroccan' and 'indian' - CBOW : ", word_vectors.similarity('moroccan', 'indian'))
print("Cosine similarity between 'sake' and 'mirin' - CBOW : ", word_vectors.similarity('mirin', 'sake'))
print("Cosine similarity between 'sake' and 'milk' - CBOW : ", word_vectors.similarity('milk', 'sake'))

Cosine similarity between 'grape tomatoes' and 'cherry tomatoes' - CBOW :  0.93215364
Cosine similarity between 'moroccan' and 'indian' - CBOW :  0.4181894
Cosine similarity between 'sake' and 'mirin' - CBOW :  0.99271655
Cosine similarity between 'sake' and 'milk' - CBOW :  0.19565766


> # Converting the ingredients into vectors
* #### The list of ingredients is parsed through and each ingredient's vector representation is added and then averaged.
* #### This 200-dimensional vector is what we use to train our final classifier (a one-vs-rest SVM)
* #### This vector is generated for each recipe in the dataset

In [9]:
# Represent every recipe by the mean of its ingredients' word vectors.
ingredients = df['ings']
# Read the dimensionality from the trained vectors instead of repeating the
# value that was passed to Word2Vec.
vector_dim = model1.wv.vector_size
recs_vecs = list()
for reciepe_ingreds in ingredients:
    # Accumulate in a float64 array, one ingredient vector at a time.
    reciepe_vector = np.zeros(vector_dim)
    for ind_ing in reciepe_ingreds:
        reciepe_vector = reciepe_vector + model1.wv[ind_ing]
    # A recipe without ingredients keeps the zero vector; dividing by its
    # length would turn every component into NaN.
    if len(reciepe_ingreds) > 0:
        reciepe_vector = reciepe_vector/(len(reciepe_ingreds))
    recs_vecs.append(reciepe_vector)
df['reciepe_vector'] = recs_vecs
df.head()

Unnamed: 0,id,cuisine,ingredients,ings,reciepe_vector
0,10259,greek,"[romaine lettuce, black olives, grape tomatoes...","[romaine^lettuce, black^olives, grape^tomatoes...","[0.2177871995502048, 0.22506931299964586, 0.07..."
1,25693,southern_us,"[plain flour, ground pepper, salt, tomatoes, g...","[plain^flour, ground^pepper, salt, tomatoes, g...","[0.29508479142730887, 0.167970936935903, -0.01..."
2,20130,filipino,"[eggs, pepper, salt, mayonaise, cooking oil, g...","[eggs, pepper, salt, mayonaise, cooking^oil, g...","[0.3714297578359644, 0.17236104545493922, 0.08..."
3,22213,indian,"[water, vegetable oil, wheat, salt]","[water, vegetable^oil, wheat, salt]","[0.34913451597094536, 0.2048464985564351, 0.05..."
4,13162,indian,"[black pepper, shallots, cornflour, cayenne pe...","[black^pepper, shallots, cornflour, cayenne^pe...","[0.4072163846343756, 0.253213085711468, 0.1593..."


### Making X (the dataset) with each row having the 200 entries of the vector associated with that recipe.

In [10]:
# Transpose the per-recipe vectors into 200 per-component lists:
# columns_list[j] holds component j of every recipe, in row order.
columns_list = [[vector[component] for vector in recs_vecs]
                for component in range(200)]

# Store each component as its own column, named attribute1 .. attribute200.
for position, component_values in enumerate(columns_list, start=1):
    df['attribute' + str(position)] = component_values

df.head()

Unnamed: 0,id,cuisine,ingredients,ings,reciepe_vector,attribute1,attribute2,attribute3,attribute4,attribute5,...,attribute191,attribute192,attribute193,attribute194,attribute195,attribute196,attribute197,attribute198,attribute199,attribute200
0,10259,greek,"[romaine lettuce, black olives, grape tomatoes...","[romaine^lettuce, black^olives, grape^tomatoes...","[0.2177871995502048, 0.22506931299964586, 0.07...",0.217787,0.225069,0.075079,0.204171,0.085583,...,0.356368,-0.333636,0.099381,-0.508168,-0.390342,-0.029346,-0.242753,-0.07524,-0.297787,-0.077505
1,25693,southern_us,"[plain flour, ground pepper, salt, tomatoes, g...","[plain^flour, ground^pepper, salt, tomatoes, g...","[0.29508479142730887, 0.167970936935903, -0.01...",0.295085,0.167971,-0.011157,0.210495,-0.025418,...,0.569582,0.023496,0.111426,-0.352402,-0.280119,0.027606,-0.138165,-0.051057,-0.10848,-0.107813
2,20130,filipino,"[eggs, pepper, salt, mayonaise, cooking oil, g...","[eggs, pepper, salt, mayonaise, cooking^oil, g...","[0.3714297578359644, 0.17236104545493922, 0.08...",0.37143,0.172361,0.084359,0.340172,0.025796,...,0.50787,-0.044086,0.111586,-0.403994,-0.334814,-0.00134,-0.145244,-0.100431,-0.207042,-0.235745
3,22213,indian,"[water, vegetable oil, wheat, salt]","[water, vegetable^oil, wheat, salt]","[0.34913451597094536, 0.2048464985564351, 0.05...",0.349135,0.204846,0.053148,0.252364,0.051267,...,0.452536,0.010346,-0.018478,-0.309297,-0.240181,0.01636,-0.158186,-0.105427,-0.146047,-0.18127
4,13162,indian,"[black pepper, shallots, cornflour, cayenne pe...","[black^pepper, shallots, cornflour, cayenne^pe...","[0.4072163846343756, 0.253213085711468, 0.1593...",0.407216,0.253213,0.159369,0.369206,0.01506,...,0.551787,-0.015587,-0.058962,-0.367925,-0.286473,0.067848,-0.040414,-0.286635,-0.086051,-0.040577


In [11]:
# Cuisine names as text (the numeric encoding is produced separately).
Y_text = df['cuisine']
Y_text.head()
# The feature matrix is whatever remains once the identifier, label, raw and
# intermediate columns are removed.
non_feature_columns = ['ings','cuisine','id','ingredients','reciepe_vector']
X = df.drop(columns=non_feature_columns)
X.head()

Unnamed: 0,attribute1,attribute2,attribute3,attribute4,attribute5,attribute6,attribute7,attribute8,attribute9,attribute10,...,attribute191,attribute192,attribute193,attribute194,attribute195,attribute196,attribute197,attribute198,attribute199,attribute200
0,0.217787,0.225069,0.075079,0.204171,0.085583,0.262799,0.13626,-0.105789,0.087794,0.010986,...,0.356368,-0.333636,0.099381,-0.508168,-0.390342,-0.029346,-0.242753,-0.07524,-0.297787,-0.077505
1,0.295085,0.167971,-0.011157,0.210495,-0.025418,0.305173,0.079618,-0.132652,0.233793,-0.047758,...,0.569582,0.023496,0.111426,-0.352402,-0.280119,0.027606,-0.138165,-0.051057,-0.10848,-0.107813
2,0.37143,0.172361,0.084359,0.340172,0.025796,0.296319,-0.069886,-0.116739,0.109865,-0.01585,...,0.50787,-0.044086,0.111586,-0.403994,-0.334814,-0.00134,-0.145244,-0.100431,-0.207042,-0.235745
3,0.349135,0.204846,0.053148,0.252364,0.051267,0.237853,-0.095144,-0.15646,0.219363,0.065569,...,0.452536,0.010346,-0.018478,-0.309297,-0.240181,0.01636,-0.158186,-0.105427,-0.146047,-0.18127
4,0.407216,0.253213,0.159369,0.369206,0.01506,0.407305,-0.007295,-0.132101,0.172809,0.134896,...,0.551787,-0.015587,-0.058962,-0.367925,-0.286473,0.067848,-0.040414,-0.286635,-0.086051,-0.040577


> # Changing the cuisines into numbers
* #### In order to fit the dataset to a model, the output features must be converted to numerical categories.
* #### They are currently in textual categories. The conversion is done through a cleanup dictionary datatype.
* #### The 20 different cuisines have to be encoded as numbers in order for the classifier to fit.
* #### The 20 different cuisines are encoded as integers ranging from 0 to 19 (both included).

In [12]:
def encodeNumerical(df_final):
    """Return a new frame whose 'cuisine' names are replaced by integer codes.

    Rows that contain any missing value are dropped before the replacement.
    The codes are the positions of the cuisine names in the list below
    (italian = 0 ... brazilian = 19). ``df_final`` itself is not modified.
    """
    cuisines_in_code_order = [
        "italian", "mexican", "southern_us", "indian", "chinese",
        "french", "cajun_creole", "thai", "japanese", "greek",
        "spanish", "korean", "vietnamese", "moroccan", "british",
        "filipino", "irish", "jamaican", "russian", "brazilian",
    ]
    code_by_cuisine = {
        cuisine: code for code, cuisine in enumerate(cuisines_in_code_order)
    }
    # The nested dict restricts the replacement to the 'cuisine' column.
    return df_final.dropna().replace({"cuisine": code_by_cuisine})
# Encode the cuisine labels and keep the encoded column as the target Y.
# NOTE(review): encodeNumerical drops rows with missing values, whereas X is
# built from df without that step; if df ever contains NaN, X and Y will no
# longer have the same rows - confirm df has no missing values.
df1_new = encodeNumerical(df)
Y = df1_new['cuisine']
Y.head()

0     9
1     2
2    15
3     3
4     3
Name: cuisine, dtype: int64

> # Multiclass classification (one-vs-rest SVM)
* #### Creating a test dataset with a 20% split
* #### The one-vs-rest classifier trains one binary model per class (that class as positive, all others as negative). For each new example it scores all m classes and predicts the class with the highest score.
* #### Calculating the accuracy of the model on the test set.

In [13]:
# Hold out 20% of the rows (test_size=0.20) as the test set; the fixed
# random_state makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.20, random_state=42)

In [14]:
# One binary SVC (default settings) per cuisine, wrapped in a one-vs-rest
# classifier and fitted on the training rows.
clf1 = OneVsRestClassifier(SVC())
clf1.fit(X_train, Y_train)

# Accuracy on the held-out rows.
score1 = clf1.score(X_test, Y_test)

In [15]:
# Report the held-out accuracy computed by clf1.score.
print('Accuracy on test set with one vs rest classifier: ',score1)

Accuracy on test set with one vs rest classifier:  0.6791954745443117
