# Problem: Predicting the type of cuisine given the recipe (list of ingredients).

In [1]:
# data preprocessing 
import pandas as pd
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import numpy as np

# data vizualizations
import random
import plotly
import plotly.figure_factory as ff
import plotly.plotly as py
from plotly import tools
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot 
init_notebook_mode(connected=True)
import plotly.offline as offline
import plotly.graph_objs as go
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, precision_score, recall_score, classification_report

# classification models
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, VotingClassifier

# Exploratory Data Analysis

We used Plotly, a Python library for data visualizations to create the graphs and charts in our exploratory data analysis, due to its appeal and interactivity.

In [2]:
# import the dataset
train_data = pd.read_json('./train.json')
train_data.head()

Unnamed: 0,id,cuisine,ingredients
0,10259,greek,"[romaine lettuce, black olives, grape tomatoes..."
1,25693,southern_us,"[plain flour, ground pepper, salt, tomatoes, g..."
2,20130,filipino,"[eggs, pepper, salt, mayonaise, cooking oil, g..."
3,22213,indian,"[water, vegetable oil, wheat, salt]"
4,13162,indian,"[black pepper, shallots, cornflour, cayenne pe..."


In [3]:
train_data.shape

(39774, 3)

This dataset has 39774 rows of recipe information and 3 columns consisting of the recipe's cuisine, its unique id and the contents (ingredients) of the recipe.

In [4]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39774 entries, 0 to 39773
Data columns (total 3 columns):
id             39774 non-null int64
cuisine        39774 non-null object
ingredients    39774 non-null object
dtypes: int64(1), object(2)
memory usage: 932.3+ KB


From the information above, we conclude that there aren't any null values.

In [5]:
print("There are {} recipes in total.".format(len(train_data)))
print("Among those recipes, there are {} different ingredients.".format(len(set([i for item in train_data['ingredients'] for i in item]))))

There are 39774 recipes in total.
Among those recipes, there are 6714 different ingredients.


In [6]:
print("There are a total of", str(len(train_data.cuisine.unique())), "classes (cuisines) in the dataset, consisting of:")
for index, cuisine in enumerate(train_data.cuisine.unique()):
    print(str(index+1) + ".", cuisine)

There are a total of 20 classes (cuisines) in the dataset, consisting of:
1. greek
2. southern_us
3. filipino
4. indian
5. jamaican
6. spanish
7. italian
8. mexican
9. chinese
10. british
11. thai
12. vietnamese
13. cajun_creole
14. brazilian
15. french
16. japanese
17. irish
18. korean
19. moroccan
20. russian


In [7]:
# for visualization purposes
def random_colours(n):
    """
    Returns a list of n random color hex codes.
    """
    colors = []
    for i in range(n):
        colors.append('#' + ''.join([random.choice('0123456789ABCDEF') for j in range(6)]))
    return colors

In [8]:
# lists the cuisine starting from the cuisine with the most number of recipes
cuisines_list = [i for i in train_data.cuisine.value_counts().index]
counts_list = [j for j in train_data.cuisine.value_counts()]
total_cuisines = len(train_data)
annot_list = []

# annotations for showing percentages on the bar chart below
for cuisine, count in list(zip(cuisines_list, counts_list))[::-1]:
    percent = round(count/total_cuisines*100, 2)
    annot_list.append(dict(x=cuisine,
                      y=count+500,
                      xref='x',
                      yref='y',
                      showarrow=False,
                      text=str(percent)+'%')) 
    
trace = go.Bar(x=[i for i in train_data.cuisine.value_counts().index][::-1],
               y=train_data.cuisine.value_counts().values[::-1],
               text=train_data.cuisine.value_counts().values[::-1],
               hoverinfo='x+y',
               textposition='outside',
               marker = dict(color = random_colours(20)))


layout = go.Layout(title='Number of Recipes for Each Cuisine',
                   titlefont=dict(size=25),
                   annotations=annot_list,
                   xaxis=dict(title='Cuisines'),
                   yaxis=dict(title='Number of Recipes'),
                   width=1000,
                   height=600)

data = [trace]
figure = dict(data=data, layout=layout)
iplot(figure, filename='num_recipes')

The bar chart represents the total number of recipes in each cuisine. It can be observed that the most common cuisine in this dataset is 'italian', which makes up almost 20% of the data. The top three cuisines alone are ~47% of the data and the bottom six cuisines makes up less than 10% of the data, making this data quite imbalanced.

In [9]:
trace = go.Histogram(x=train_data['ingredients'].str.len(),
                     xbins=dict(start=0,end=70,size=1),
                     marker=dict(color='#654C4F'),
                     opacity=0.75)
data = [trace]
layout = go.Layout(title='Recipe Length Distribution',
                   titlefont=dict(size=25),
                   xaxis=dict(title='Number of Ingredients'),
                   yaxis=dict(title='Number of Recipes'),
                   height=1000,
                   bargap=0.02,
                   bargroupgap=0.02)

figure = go.Figure(data=data, layout=layout)
iplot(figure, filename='recipe_length')

print(train_data['ingredients'].str.len().describe())

count    39774.000000
mean        10.767713
std          4.428978
min          1.000000
25%          8.000000
50%         10.000000
75%         13.000000
max         65.000000
Name: ingredients, dtype: float64


The histogram shows the distribution of recipe length in the dataset. From the histogram and description, it can be observed that most recipes have 9 ingredients and the distribution skews to the right. The distribution has a median of 10 and mean of 10.7, meaning that we will have an average of 10-11 predictors for every recipe.

In [10]:
# list all ingredients
all_ingredients = [i for item in train_data['ingredients'] for i in item]

# count the number of times each ingredient appears
counter = Counter()
for ingredient in all_ingredients:
     counter[ingredient] += 1
        
# retrieve the top 20 most common ingredients for visualization purposes
top_ingredients_count = counter.most_common(20)
top_ingredients = [a[0] for a in top_ingredients_count]
top_count = [a[1] for a in top_ingredients_count]

# create percentage labels for the chart below
percent_labels = []
for i in top_count:
    percent = round(i / len(train_data) * 100, 2)
    percent = str(percent) + '%'
    percent_labels.append(percent)

trace = go.Bar(x=top_count[::-1],
               y=top_ingredients[::-1],
               text=percent_labels[::-1],
               textposition='outside',
               orientation='h',
               marker=dict(color = random_colours(20)))

layout = go.Layout(xaxis=dict(title='Number of occurences in all recipes'),
                   yaxis=dict(title='Ingredients', automargin=True),
                   title='Top 20 Ingredients',
                   titlefont = dict(size=25),
                   width=1000,
                   height=750)

data = [trace]
figure = go.Figure(data=data, layout=layout)
iplot(figure, filename='top20')

This bar chart shows the 20 most common ingredients in the dataset. We observe that salt is the most used ingredient in the dataset (used in more than 45% of all recipes). Therefore, salt will be a very poor predictor. The most commonly used ingredients generally will have a poor predictive power because they tend to appear in most cuisines.

In [11]:
def count_distinct(cuisine):
    '''
    Counts the number of different ingredients used in the given cuisine.
    '''
    ingredients_list = []
    for items in train_data[train_data['cuisine'] == cuisine]['ingredients']:
        for ingredient in items:
            ingredients_list.append(ingredient)
            
    res = (cuisine, len(set(ingredients_list)))        
    return res

In [12]:
labels = [i for i in train_data.cuisine.value_counts().index][::-1]
cuisine_ingredients = [[] for i in range(20)]
for i, j in enumerate(labels):
    for item in train_data[train_data['cuisine'] == j]['ingredients']:
        for ingredient in item:
            cuisine_ingredients[i].append(ingredient)

In [13]:
figure = tools.make_subplots(rows=5,
                             cols=2,
                             subplot_titles=labels[:10],
                             specs=[[{}, {}], [{}, {}], [{}, {}], [{}, {}], [{}, {}]])

traces = []
for i in range(10):
    ingredient_counter = Counter()
    for ingredient in cuisine_ingredients[i]:
         ingredient_counter[ingredient] += 1
    top_ingredients_count = ingredient_counter.most_common(5)
    top_ingredients = [a[0] for a in top_ingredients_count]
    top_count = [a[1] for a in top_ingredients_count]
    
    trace = go.Bar(x=top_count[::-1],
                   y=top_ingredients[::-1],
                   orientation='h',
                   hoverinfo='x+y',
                   marker=dict(color=random_colours(5)))
    traces.append(trace)
    
cols = [1,2]*5
rows = [b for b in range(1,6)]
rows = [1,1,2,2,3,3,4,4,5,5]

for t, row, col in zip(traces, rows, cols):
    figure.append_trace(t, row, col)
    figure['layout'].update(height=1000,
                            width=1600,
                            showlegend=False,
                            title='Most Common Ingredients in Each Cuisine',
                            titlefont=dict(size=25))

iplot(figure, filename='most_common')

This is the format of your plot grid:
[ (1,1) x1,y1 ]    [ (1,2) x2,y2 ]  
[ (2,1) x3,y3 ]    [ (2,2) x4,y4 ]  
[ (3,1) x5,y5 ]    [ (3,2) x6,y6 ]  
[ (4,1) x7,y7 ]    [ (4,2) x8,y8 ]  
[ (5,1) x9,y9 ]    [ (5,2) x10,y10 ]



In [14]:
figure = tools.make_subplots(rows=5,
                             cols=2,
                             subplot_titles=labels[10:],
                             specs=[[{}, {}], [{}, {}], [{}, {}], [{}, {}], [{}, {}]])

traces = []
for i in range(10, 20):
    ingredient_counter = Counter()
    for ingredient in cuisine_ingredients[i]:
         ingredient_counter[ingredient] += 1
    top_ingredients_count = ingredient_counter.most_common(5)
    top_ingredients = [a[0] for a in top_ingredients_count]
    top_count = [a[1] for a in top_ingredients_count]
    
    trace = go.Bar(x=top_count[::-1],
                   y=top_ingredients[::-1],
                   orientation='h',
                   hoverinfo='x+y',
                   marker=dict(color=random_colours(5)))
    traces.append(trace)
    
cols = [1,2]*5
rows = [b for b in range(1,6)]
rows = [1,1,2,2,3,3,4,4,5,5]

for t, row, col in zip(traces, rows, cols):
    figure.append_trace(t, row, col)
    figure['layout'].update(height=1000,
                            width=1600,
                            showlegend=False,
                            title='Most Common Ingredients in Each Cuisine',
                            titlefont=dict(size=25))

iplot(figure, filename='most_common2')

This is the format of your plot grid:
[ (1,1) x1,y1 ]    [ (1,2) x2,y2 ]  
[ (2,1) x3,y3 ]    [ (2,2) x4,y4 ]  
[ (3,1) x5,y5 ]    [ (3,2) x6,y6 ]  
[ (4,1) x7,y7 ]    [ (4,2) x8,y8 ]  
[ (5,1) x9,y9 ]    [ (5,2) x10,y10 ]



The bar charts above represent the 5 most common ingredients in each cuisine. It can be observed that salt is the most common ingredient in most cuisines. Other cuisines have either soy sauce or fish sauce as the most common ingredient. The ingredients listed above are mostly essential ingredients used in many cuisines, which may become poor predictors for the classification.

In [15]:
def unique_ingredients(cuisine):
    """
    Counts the number of ingredients unique to a cuisine.
    """
    others = []
    for other_ingredients in train_data[train_data['cuisine'] != cuisine]['ingredients']:
        for item in other_ingredients:
            others.append(item)
            
    other_ingredients = len(set(others))
    total_ingredients = len(set([i for item in train_data['ingredients'] for i in item]))
            
    return total_ingredients - other_ingredients

In [16]:
unique_list = [unique_ingredients(c) for c in labels]

trace = go.Bar(x=labels,
               y=unique_list,
               text=unique_list,
               textposition='outside',
               hoverinfo='x+y',
               marker=dict(color=random_colours(20)))

layout = go.Layout(xaxis=dict(title='Cuisine'),
                   yaxis=dict(title='Number of unique ingredients', automargin=True),
                   title='Number of Unique Ingredients Used Only in Each Cuisine',
                   titlefont=dict(size=25),
                   width=1000,
                   height=700)

data = [trace]
figure = go.Figure(data=data, layout=layout)
iplot(figure, filename='unique_ingredients')

This chart represents the number of ingredients which are only available exclusively in each cuisine. It can be seen that the less common cuisines have a low number of ingredients unique to them. This can affect some classifier models which rely on unique features to classify data.

# CLASSIFICATION

## Data Preparation

In preparing the data for classification, we vectorized the ingredient words into a matrix of token counts and transformed the words to TF-IDF (Term Frequency-Inverse Document Frequency) representation, which is used as a weighing factor to determine the importance of each ingredient in the recipe, giving the ingredients which rarely appears a higher importance than the more common ingredients.

We tried to combine multiple word ingredients by changing the space to '-', but it reduces the accuracy of the models.

In [17]:
train_df = pd.DataFrame(columns=['id', 'cuisine', 'ingredients'])
train_df['id'] = train_data['id']
train_df['cuisine'] = train_data['cuisine'].astype("category").cat.codes
new_ingredients = []

for item in train_data['ingredients']:
    new_items = []
    for ingredients in item:
        new_items.append(ingredients)
    new_ingredients.append(' '.join(new_items))

train_df['ingredients'] = new_ingredients
train_df.head()

Unnamed: 0,id,cuisine,ingredients
0,10259,6,romaine lettuce black olives grape tomatoes ga...
1,25693,16,plain flour ground pepper salt tomatoes ground...
2,20130,4,eggs pepper salt mayonaise cooking oil green c...
3,22213,7,water vegetable oil wheat salt
4,13162,7,black pepper shallots cornflour cayenne pepper...


In [18]:
# map the category's number to its cuisine name
c = train_data['cuisine'].astype('category')
cat_dic = dict(enumerate(c.cat.categories))
print(cat_dic)

{0: 'brazilian', 1: 'british', 2: 'cajun_creole', 3: 'chinese', 4: 'filipino', 5: 'french', 6: 'greek', 7: 'indian', 8: 'irish', 9: 'italian', 10: 'jamaican', 11: 'japanese', 12: 'korean', 13: 'mexican', 14: 'moroccan', 15: 'russian', 16: 'southern_us', 17: 'spanish', 18: 'thai', 19: 'vietnamese'}


In [19]:
# split the dataset into train and test
train_df, test_df = train_test_split(train_df, test_size=0.25) 

The given test.json does not have the cuisine, so we could not check the accuracy of our models. Therefore, we split the training dataset into training and test dataset, with a test size of 0.25 of the data.

In [20]:
train_ingredients = train_df['ingredients']
test_ingredients = test_df['ingredients']

# convert ingredients into a matrix of token counts and transform it to tf-idf representation 
train_tf = TfidfVectorizer(lowercase=False, preprocessor=None)
train_tf.fit(train_ingredients)

train_X = train_tf.transform(train_ingredients)
train_Y = train_df['cuisine']
test_X = train_tf.transform(test_ingredients)
test_Y = test_df['cuisine']

We use TF-IDF (Term Frequency-Inverse Document Frequency) to label the features (words) with weights. So, the less common words will have more weight on them.

## Classification Models

We did 5 classification models in total, which includes LinearSVC, Logistic Regression, Random Forest, Multinomial Naive Bayes, and a voting classifier which utilizes the first 4 classifiers to gain better results.

We tested a few classifiers regardless of their theoretical performance for text classification, because it all depends on the data itself.

In [21]:
svc = SVC(C=1.5, kernel='linear', probability=True)

# tuning the hyperparameters by checking their cross validation scores
cv_score = cross_val_score(svc, train_X, train_Y, cv=5, n_jobs=-1, verbose=2)
cv_avg = cv_score.mean()
print(cv_score)
print("Average: " + str(cv_avg))

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed: 14.2min remaining: 21.3min


[0.78494804 0.78226617 0.77572913 0.78645659 0.77757291]
Average: 0.7813945692256119


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed: 14.5min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed: 14.5min finished


In [22]:
svc.fit(train_X, train_Y)

SVC(C=1.5, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=True, random_state=None, shrinking=True, tol=0.001,
    verbose=False)

In [23]:
svc_pred = svc.predict(test_X)

# create confusion matrix
svc_cm = confusion_matrix(test_Y, svc_pred)

x=[cat_dic[i] for i in range(20)]
y=[cat_dic[j] for j in range(20)][::-1]

figure = ff.create_annotated_heatmap(svc_cm[::-1], x=x, y=y, showscale=True)
figure['layout']['xaxis'].update(side='bottom', title='Predicted Values')
figure['layout']['yaxis'].update(automargin=True, title='Actual Values')
figure['layout'].update(title='LinearSVC Confusion Matrix', width=1000, height=1000, autosize=False)
iplot(figure, filename='svc_heatmap')

# print out metric scores
print("LinearSVC\n")
print("Precision:", precision_score(test_Y, svc_pred, average='weighted'))
print("Recall:", recall_score(test_Y, svc_pred, average='weighted'))
print("F1 Score:", f1_score(test_Y, svc_pred, average='weighted'))

LinearSVC

Precision: 0.7855528365532397
Recall: 0.7863032984714401
F1 Score: 0.7823435282694887


In [24]:
lr = LogisticRegression(C=2.0, multi_class='multinomial', solver='saga')

# tuning the hyperparameters by checking their cross validation scores
cv_score = cross_val_score(lr, train_X, train_Y, cv=5, verbose=2)
cv_avg = cv_score.mean()
print(cv_score)
print("Average: " + str(cv_avg))

[CV]  ................................................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ................................................. , total=   6.2s
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    6.1s remaining:    0.0s


[CV] ................................................. , total=   6.7s
[CV]  ................................................................
[CV] ................................................. , total=   7.0s
[CV]  ................................................................
[CV] ................................................. , total=   6.7s
[CV]  ................................................................
[CV] ................................................. , total=   6.6s
[0.78813275 0.77774053 0.77053302 0.79198793 0.77908146]
Average: 0.7814951391216896


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   33.0s finished


In [25]:
lr.fit(train_X, train_Y)

LogisticRegression(C=2.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='multinomial', n_jobs=None, penalty='l2',
                   random_state=None, solver='saga', tol=0.0001, verbose=0,
                   warm_start=False)

In [26]:
lr_pred = lr.predict(test_X)

# create confusion matrix
lr_cm = confusion_matrix(test_Y, lr_pred)

x=[cat_dic[i] for i in range(20)]
y=[cat_dic[j] for j in range(20)][::-1]

figure = ff.create_annotated_heatmap(lr_cm[::-1], x=x, y=y, showscale=True)
figure['layout']['xaxis'].update(side='bottom', title='Predicted Values')
figure['layout']['yaxis'].update(automargin=True, title='Actual Values')
figure['layout'].update(title='Logistic Regression Confusion Matrix', width=1000, height=1000, autosize=False)
iplot(figure, filename='lr_heatmap')

# print out metric scores
print("Logistic Regression\n")
print("Precision:", precision_score(test_Y, lr_pred, average='weighted'))
print("Recall:", recall_score(test_Y, lr_pred, average='weighted'))
print("F1 Score:", f1_score(test_Y, lr_pred, average='weighted'))

Logistic Regression

Precision: 0.7838128610134781
Recall: 0.7853982300884956
F1 Score: 0.7805807488970367


In [27]:
rf = RandomForestClassifier(n_estimators=200, max_features='sqrt', min_samples_split=3, bootstrap=False)

# tuning the hyperparameters by checking their cross validation scores
cv_score = cross_val_score(rf, train_X, train_Y, cv=5, verbose=2)
cv_avg = cv_score.mean()
print(cv_score)
print("Average: " + str(cv_avg))

[CV]  ................................................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ................................................. , total= 5.2min


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  5.2min remaining:    0.0s


[CV]  ................................................................
[CV] ................................................. , total= 5.0min
[CV]  ................................................................
[CV] ................................................. , total= 4.1min
[CV]  ................................................................
[CV] ................................................. , total= 3.1min
[CV]  ................................................................
[CV] ................................................. , total= 2.9min
[0.74438485 0.74723433 0.74773718 0.74941334 0.74673148]
Average: 0.747100234663091


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed: 20.4min finished


In [28]:
rf.fit(train_X, train_Y)

RandomForestClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='sqrt',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=3,
                       min_weight_fraction_leaf=0.0, n_estimators=200,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [29]:
rf_pred = rf.predict(test_X)

# create confusion matrix
rf_cm = confusion_matrix(test_Y, rf_pred)

x=[cat_dic[i] for i in range(20)]
y=[cat_dic[j] for j in range(20)][::-1]

figure = ff.create_annotated_heatmap(rf_cm[::-1], x=x, y=y, showscale=True)
figure['layout']['xaxis'].update(side='bottom', title='Predicted Values')
figure['layout']['yaxis'].update(automargin=True, title='Actual Values')
figure['layout'].update(title='Random Forest Classifier Confusion Matrix', width=1000, height=1000, autosize=False)
iplot(figure, filename='rf_heatmap')

# print out metric scores
print("Random Forest Classifier\n")
print("Precision:", precision_score(test_Y, rf_pred, average='weighted'))
print("Recall:", recall_score(test_Y, rf_pred, average='weighted'))
print("F1 Score:", f1_score(test_Y, rf_pred, average='weighted'))

Random Forest Classifier

Precision: 0.7687412074935859
Recall: 0.7587489943684634
F1 Score: 0.7460792370785666


In [30]:
nb = MultinomialNB(alpha=0.02)

# tuning the hyperparameters by checking their cross validation scores
cv_score = cross_val_score(nb, train_X, train_Y, cv=5, verbose=2)
cv_avg = cv_score.mean()
print(cv_score)
print("Average: " + str(cv_avg))

[CV]  ................................................................
[CV] ................................................. , total=   0.1s
[CV]  ................................................................
[CV] ................................................. , total=   0.1s
[CV]  ................................................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s


[CV] ................................................. , total=   0.0s
[CV]  ................................................................
[CV] ................................................. , total=   0.0s
[CV]  ................................................................
[CV] ................................................. , total=   0.0s
[0.73533356 0.72678512 0.71840429 0.743882   0.73080791]
Average: 0.7310425745893395


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.2s finished


In [31]:
nb.fit(train_X, train_Y)

MultinomialNB(alpha=0.02, class_prior=None, fit_prior=True)

In [32]:
nb_pred = nb.predict(test_X)

# create confusion matrix
nb_cm = confusion_matrix(test_Y, nb_pred)

x=[cat_dic[i] for i in range(20)]
y=[cat_dic[j] for j in range(20)][::-1]

figure = ff.create_annotated_heatmap(nb_cm[::-1], x=x, y=y, showscale=True)
figure['layout']['xaxis'].update(side='bottom', title='Predicted Values')
figure['layout']['yaxis'].update(automargin=True, title='Actual Values')
figure['layout'].update(title='Multinomial Naive Bayes Classifier Confusion Matrix', width=1000, height=1000, autosize=False)
iplot(figure, filename='nb_heatmap')

# print out metric scores
print("Multinomial Naive Bayes Classifier\n")
print("Precision:", precision_score(test_Y, nb_pred, average='weighted'))
print("Recall:", recall_score(test_Y, nb_pred, average='weighted'))
print("F1 Score:", f1_score(test_Y, nb_pred, average='weighted'))

Multinomial Naive Bayes Classifier

Precision: 0.7384852700179362
Recall: 0.7343121480289622
F1 Score: 0.7245424556319726


In [33]:
vot = VotingClassifier(estimators=[('svc', svc), ('lr', lr), ('rf', rf), ('nb', nb)], voting='soft', weights=[2,2,1,1])

In [34]:
vot.fit(train_X, train_Y)

VotingClassifier(estimators=[('svc',
                              SVC(C=1.5, break_ties=False, cache_size=200,
                                  class_weight=None, coef0=0.0,
                                  decision_function_shape='ovr', degree=3,
                                  gamma='scale', kernel='linear', max_iter=-1,
                                  probability=True, random_state=None,
                                  shrinking=True, tol=0.001, verbose=False)),
                             ('lr',
                              LogisticRegression(C=2.0, class_weight=None,
                                                 dual=False, fit_intercept=True,
                                                 intercep...
                                                     max_samples=None,
                                                     min_impurity_decrease=0.0,
                                                     min_impurity_split=None,
                                       

In [35]:
vot_pred = vot.predict(test_X)

# create confusion matrix
vot_cm = confusion_matrix(test_Y, vot_pred)

x=[cat_dic[i] for i in range(20)]
y=[cat_dic[j] for j in range(20)][::-1]

figure = ff.create_annotated_heatmap(vot_cm[::-1], x=x, y=y, showscale=True)
figure['layout']['xaxis'].update(side='bottom', title='Predicted Values')
figure['layout']['yaxis'].update(automargin=True, title='Actual Values')
figure['layout'].update(title='Voting Classifier Confusion Matrix', width=1000, height=1000, autosize=False)
iplot(figure, filename='vot_heatmap')

# print out metric scores
print("Voting Classifier\n")
print("Precision:", precision_score(test_Y, vot_pred, average='weighted'))
print("Recall:", recall_score(test_Y, vot_pred, average='weighted'))
print("F1 Score:", f1_score(test_Y, vot_pred, average='weighted'))

Voting Classifier

Precision: 0.7960096986344594
Recall: 0.7962590506838294
F1 Score: 0.7912258917429532


## Results Analysis

We did an analysis on the results to find out why some classifiers perform poorly compared to the others.

In [36]:
print("SVC")
print(classification_report(test_Y, svc_pred, target_names=[cat_dic[i] for i in range(20)]))
print("Logistic Regression")
print(classification_report(test_Y, lr_pred, target_names=[cat_dic[i] for i in range(20)]))
print("Random Forest")
print(classification_report(test_Y, rf_pred, target_names=[cat_dic[i] for i in range(20)]))
print("Naive Bayes")
print(classification_report(test_Y, nb_pred, target_names=[cat_dic[i] for i in range(20)]))
print("Final Model")
print(classification_report(test_Y, vot_pred, target_names=[cat_dic[i] for i in range(20)]))

SVC
              precision    recall  f1-score   support

   brazilian       0.67      0.60      0.63        94
     british       0.51      0.44      0.47       204
cajun_creole       0.77      0.70      0.73       393
     chinese       0.78      0.87      0.82       658
    filipino       0.72      0.63      0.67       179
      french       0.62      0.65      0.63       663
       greek       0.82      0.72      0.77       300
      indian       0.87      0.90      0.89       816
       irish       0.58      0.48      0.53       143
     italian       0.80      0.89      0.84      1916
    jamaican       0.87      0.68      0.76       131
    japanese       0.83      0.66      0.73       345
      korean       0.85      0.77      0.81       198
     mexican       0.91      0.91      0.91      1619
    moroccan       0.86      0.74      0.80       207
     russian       0.73      0.33      0.45       112
 southern_us       0.71      0.80      0.75      1086
     spanish       0.67

Recall score: fraction of total actual positives which are correctly classified (TP/(TP+FN))

Precision score: fraction of total predicted positives which are correctly classified (TP/(TP+FP))

From the metric scores of all classifiers shown above, it is observed that some classifiers does not perform well:
1. The recall scores on the Naive Bayes classifier are relatively very low on most cuisines with a low number of recipes. This is due to the fact that Naive Bayes uses a simple algorithm. Naive Bayes assumes that all words that appear in the data are independent from each other, which is not true in real world data.
2. Random Forest Classifier also performs poorly on this data, having low recall scores on less common cuisines as there aren't many unique features (ingredients) among those cuisines.

Our final model utilizes the first four models to gain a better performance. We found that it performs better when using all models than only using some but not all other models. We think that this is due to the models covering each other's weaknesses.

# Predicting The Cuisine

We created a simple function that asks for an input of ingredients and predicts the cuisine from the given ingredients, in order to visualize our initial objective.

In [37]:
def guess_cuisine():
    recipe = input()
    ingredients = [" ".join(recipe.split(","))]
    
    # transform the inputted ingredients into tf-idf representation
    X = train_tf.transform(ingredients)
    
    pred = vot.predict_proba(X)
    pred = [round(pred[0, z]*100, 2) for z in range(20)]
    maxi = max(pred)
    percentages = [str(a)+'%' for a in pred]
    
    # display results
    trace = go.Bar(x=pred[::-1],
               y=[cat_dic[i] for i in range(20)][::-1],
               text=percentages[::-1],
               textposition='outside',
               hoverinfo='y+text',
               orientation='h',
               marker=dict(color = random_colours(20)))

    layout = go.Layout(
        xaxis=dict(title='Probability (%)'),
        yaxis=dict(title='Cuisines', automargin=True),
        title='Probability of Each Cuisine Given The Ingredients',
        titlefont = dict(size = 25),
        width=1000,
        height=750)
    
    data = [trace]
    figure = go.Figure(data=data, layout=layout)
    iplot(figure, filename='guess_cuisine')
    
    print('Given the ingredients: "'+ recipe + '", we predict that the cuisine is "' + cat_dic[pred.index(maxi)] + '" with a probability of', str(maxi) + '%')
    

In [38]:
# input example: salt, pepper, rice
guess_cuisine()

salt, pepper, rice


Given the ingredients: "salt, pepper, rice", we predict that the cuisine is "cajun_creole" with a probability of 12.73%


# Conclusion

We found that it is not possible to perfectly guess the cuisine given only the list of ingredients. This is due to the fact that there are missing information such as the method of cooking, and also the fact that a lot of cuisines are actually quite similar to each other.

However, using our final model we are ~79% confident that we can predict the cuisine type when given the list of ingredients.