In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib as plt
import json

In [2]:
recipies = pd.read_csv('./data/recipies_features_plain.csv')
recipies.head()

Unnamed: 0,id,cuisine,ingredients,num_ingredients,str_ingredients
0,10259,greek,"['romaine lettuce', 'black olives', 'grape tom...",9,"romaine lettuce, black olives, grape tomatoes,..."
1,25693,southern_us,"['plain flour', 'ground pepper', 'salt', 'toma...",11,"plain flour, ground pepper, salt, tomatoes, gr..."
2,20130,filipino,"['eggs', 'pepper', 'salt', 'mayonaise', 'cooki...",12,"eggs, pepper, salt, mayonaise, cooking oil, gr..."
3,22213,indian,"['water', 'vegetable oil', 'wheat', 'salt']",4,"water, vegetable oil, wheat, salt"
4,13162,indian,"['black pepper', 'shallots', 'cornflour', 'cay...",20,"black pepper, shallots, cornflour, cayenne pep..."


In [3]:
#Make an 80-20 split
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(recipies[['num_ingredients', 'str_ingredients']], recipies['cuisine'], test_size=0.2)

In [6]:
X_train.head()

Unnamed: 0,num_ingredients,str_ingredients
13638,7,"red grape, pomegranate seeds, salt, avocado, p..."
8330,14,"sugar, hoisin sauce, all-purpose flour, onions..."
12811,8,"ground black pepper, extra-virgin olive oil, s..."
33707,8,"mayonaise, bread slices, pepper, iceberg lettu..."
37454,17,"gari, green onions, garlic, beansprouts, soy s..."


In [9]:
y_train

13638       mexican
8330       filipino
12811         greek
33707       italian
37454      japanese
            ...    
16593        french
9913     vietnamese
8262         french
25245       mexican
28859       italian
Name: cuisine, Length: 31819, dtype: object

In [7]:
# Use TFIDF vectorizer on str_ingredients
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(analyzer=lambda d: d.split(', '))
vectors = vectorizer.fit_transform(X_train['str_ingredients'])
feature_names = vectorizer.get_feature_names()
dense = vectors.todense()
denselist = dense.tolist()
tfidf_train = pd.DataFrame(denselist, columns=feature_names)
tfidf_train.head()

Unnamed: 0,"""Best Foods Mayonnaise with Lime Juice""","""Campbells Condensed Cheddar Cheese Soup""","""Campbells Condensed Cream of Chicken Soup""","""Campbells Condensed Cream of Mushroom Soup""","""Campbells Condensed Tomato Soup""","""Colmans Mustard Powder""","""Coxs Orange Pippin""","""Egglands BestÂ® eggs""","""FranksÂ® RedHotÂ® Original Cayenne Pepper Sauce""","""Hellmanns Dijonnaise Creamy Dijon Mustard""",...,yukon gold,yukon gold potatoes,yuzu,yuzu juice,zest,zesty italian dressing,zinfandel,ziti,zucchini,zucchini blossoms
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
vectors = vectorizer.transform(X_test['str_ingredients'])
feature_names = vectorizer.get_feature_names()
dense = vectors.todense()
denselist = dense.tolist()
tfidf_test = pd.DataFrame(denselist, columns=feature_names)
tfidf_test.head()

Unnamed: 0,"""Best Foods Mayonnaise with Lime Juice""","""Campbells Condensed Cheddar Cheese Soup""","""Campbells Condensed Cream of Chicken Soup""","""Campbells Condensed Cream of Mushroom Soup""","""Campbells Condensed Tomato Soup""","""Colmans Mustard Powder""","""Coxs Orange Pippin""","""Egglands BestÂ® eggs""","""FranksÂ® RedHotÂ® Original Cayenne Pepper Sauce""","""Hellmanns Dijonnaise Creamy Dijon Mustard""",...,yukon gold,yukon gold potatoes,yuzu,yuzu juice,zest,zesty italian dressing,zinfandel,ziti,zucchini,zucchini blossoms
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.18089,0.0


In [10]:
X_train_vect = pd.concat([X_train['num_ingredients'].reset_index(drop=True), 
           tfidf_train], axis=1)
X_train_vect.head()

Unnamed: 0,num_ingredients,"""Best Foods Mayonnaise with Lime Juice""","""Campbells Condensed Cheddar Cheese Soup""","""Campbells Condensed Cream of Chicken Soup""","""Campbells Condensed Cream of Mushroom Soup""","""Campbells Condensed Tomato Soup""","""Colmans Mustard Powder""","""Coxs Orange Pippin""","""Egglands BestÂ® eggs""","""FranksÂ® RedHotÂ® Original Cayenne Pepper Sauce""",...,yukon gold,yukon gold potatoes,yuzu,yuzu juice,zest,zesty italian dressing,zinfandel,ziti,zucchini,zucchini blossoms
0,7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,14,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,17,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [133]:
X_test_vect = pd.concat([X_test['num_ingredients'].reset_index(drop=True), 
           tfidf_test], axis=1)
X_test_vect.head()

Unnamed: 0,num_ingredients,"""Best Foods Mayonnaise with Lime Juice""","""Campbells Condensed Cheddar Cheese Soup""","""Campbells Condensed Cream of Chicken Soup""","""Campbells Condensed Cream of Mushroom Soup""","""Campbells Condensed Tomato Soup""","""Colmans Mustard Powder""","""Coxs Orange Pippin""","""Egglands BestÂ® eggs""","""FranksÂ® RedHotÂ® Original Cayenne Pepper Sauce""",...,yukon gold,yukon gold potatoes,yuzu,yuzu juice,zest,zesty italian dressing,zinfandel,ziti,zucchini,zucchini blossoms
0,16,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,12,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,14,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,19,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.18089,0.0


In [35]:
y_test=y_test.reset_index(drop=True)
y_train=y_train.reset_index(drop=True)

In [12]:
#save the vectorized sets for later
export = X_train_vect.to_csv('./data/vectorized_train.csv', header=True, index = False)

In [13]:
#save the vectorized sets for later
export = X_test_vect.to_csv('./data/vectorized_test.csv', header=True, index = False)

In [14]:
#save the vectorized sets for later
export = y_train.to_csv('./data/ytrain.csv', header=False, index = False)

In [15]:
#save the vectorized sets for later
export = y_test.to_csv('./data/ytest.csv', header=False, index = False)

In [16]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import precision_recall_fscore_support as score
import time

In [17]:
#try a random forest
rf = RandomForestClassifier(n_estimators=150, max_depth=30, n_jobs=-1)

start = time.time()
rf_model = rf.fit(X_train_vect, y_train)
end = time.time()
fit_time = (end - start)

start = time.time()
y_pred = rf_model.predict(X_test_vect)
end = time.time()
pred_time = (end - start)

print('Fit time: {} / Predict time: {} ---- Accuracy: {}'.format(
    round(fit_time, 3), round(pred_time, 3), round((y_pred==y_test).sum()/len(y_pred), 3)))

Fit time: 33.441 / Predict time: 1.071 ---- Accuracy: 0.571


In [18]:
#57% accuracy, nice. baseline -  5%? save the model, may make pipeline for later.
import pickle
filename = './models/rfmodel.pkl'
pickle.dump(rf_model, open(filename, 'wb'))

In [21]:
feature_importances = pd.DataFrame(rf_model.feature_importances_,index = X_train_vect.columns,columns=['importance']).sort_values('importance',ascending=False)
feature_importances

Unnamed: 0,importance
grated parmesan cheese,0.025599
garam masala,0.023130
soy sauce,0.022604
olive oil,0.018276
fish sauce,0.017559
...,...
knorr reduc sodium chicken flavor bouillon,0.000000
knorr rice side cheddar broccoli,0.000000
knorr tomato bouillon with chicken flavor,0.000000
korean buckwheat noodles,0.000000


In [22]:
#save the feature importances for study later
export = feature_importances.to_csv('./data/feat_imp.csv', header=True, index = True)

In [30]:
#Let's look at the features individually now
y_test = y_test.astype('category')
cuisine_list = list(y_test.cat.categories)

In [42]:
total_train = pd.concat([X_train_vect,y_train], axis=1)
total_train.head()

Unnamed: 0,num_ingredients,"""Best Foods Mayonnaise with Lime Juice""","""Campbells Condensed Cheddar Cheese Soup""","""Campbells Condensed Cream of Chicken Soup""","""Campbells Condensed Cream of Mushroom Soup""","""Campbells Condensed Tomato Soup""","""Colmans Mustard Powder""","""Coxs Orange Pippin""","""Egglands BestÂ® eggs""","""FranksÂ® RedHotÂ® Original Cayenne Pepper Sauce""",...,yukon gold potatoes,yuzu,yuzu juice,zest,zesty italian dressing,zinfandel,ziti,zucchini,zucchini blossoms,cuisine
0,7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,mexican
1,14,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,filipino
2,8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,greek
3,8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,italian
4,17,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,japanese


In [43]:
total_test = pd.concat([X_test_vect,y_test], axis=1)
total_test.head()

Unnamed: 0,num_ingredients,"""Best Foods Mayonnaise with Lime Juice""","""Campbells Condensed Cheddar Cheese Soup""","""Campbells Condensed Cream of Chicken Soup""","""Campbells Condensed Cream of Mushroom Soup""","""Campbells Condensed Tomato Soup""","""Colmans Mustard Powder""","""Coxs Orange Pippin""","""Egglands BestÂ® eggs""","""FranksÂ® RedHotÂ® Original Cayenne Pepper Sauce""",...,yukon gold potatoes,yuzu,yuzu juice,zest,zesty italian dressing,zinfandel,ziti,zucchini,zucchini blossoms,cuisine
0,16,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,indian
1,7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,indian
2,12,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,indian
3,14,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,southern_us
4,19,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.18089,0.0,italian


In [48]:
# looping over each of the cuisines
for c in cuisine_list:
    print(len(total_test[total_test['cuisine']==c]))

93
164
319
530
156
559
228
660
122
1494
108
298
173
1296
161
89
842
208
289
166


In [53]:
# calculating accuracy per cuisine. really bad for some , good for others. italian baseline 20% - 96%
for c in cuisine_list:
    y_pred = rf_model.predict(total_test[total_test['cuisine']==c].drop('cuisine',axis=1))
    print('Accuracy for cusine {} = {}'.format(c,round((y_pred==c).sum()/len(y_pred), 3)))

Accuracy for cusine brazilian = 0.022
Accuracy for cusine british = 0.0
Accuracy for cusine cajun_creole = 0.354
Accuracy for cusine chinese = 0.804
Accuracy for cusine filipino = 0.135
Accuracy for cusine french = 0.032
Accuracy for cusine greek = 0.07
Accuracy for cusine indian = 0.792
Accuracy for cusine irish = 0.0
Accuracy for cusine italian = 0.958
Accuracy for cusine jamaican = 0.037
Accuracy for cusine japanese = 0.456
Accuracy for cusine korean = 0.225
Accuracy for cusine mexican = 0.87
Accuracy for cusine moroccan = 0.217
Accuracy for cusine russian = 0.022
Accuracy for cusine southern_us = 0.523
Accuracy for cusine spanish = 0.0
Accuracy for cusine thai = 0.595
Accuracy for cusine vietnamese = 0.205


In [55]:
# many of the european countries predicted as italian, asian countries as chinese or thai
y_pred_spanish = rf_model.predict(total_test[total_test['cuisine']=='spanish'].drop('cuisine',axis=1))

In [60]:
unique, counts = np.unique(y_pred_spanish, return_counts=True)
print(np.asarray((unique, counts)).T)

[['french' 1]
 ['indian' 1]
 ['italian' 180]
 ['mexican' 23]
 ['southern_us' 3]]


In [61]:
y_pred_british = rf_model.predict(total_test[total_test['cuisine']=='british'].drop('cuisine',axis=1))
unique, counts = np.unique(y_pred_british, return_counts=True)
print(np.asarray((unique, counts)).T)

[['chinese' 2]
 ['indian' 4]
 ['italian' 113]
 ['mexican' 3]
 ['southern_us' 42]]


In [62]:
y_pred_vietnamese = rf_model.predict(total_test[total_test['cuisine']=='vietnamese'].drop('cuisine',axis=1))
unique, counts = np.unique(y_pred_vietnamese, return_counts=True)
print(np.asarray((unique, counts)).T)

[['chinese' 33]
 ['indian' 3]
 ['italian' 35]
 ['japanese' 1]
 ['mexican' 10]
 ['thai' 50]
 ['vietnamese' 34]]


In [71]:
# test for leave one out models to check for driving features, compare with top 100 features overall
feat_all = feature_importances.head(100)
feat_all

Unnamed: 0,importance
grated parmesan cheese,0.025599
garam masala,0.023130
soy sauce,0.022604
olive oil,0.018276
fish sauce,0.017559
...,...
mustard seeds,0.002304
pepper,0.002260
yoghurt,0.002243
brown sugar,0.002235


In [125]:
temp_model = rf.fit(total_test[total_test['cuisine']!='indian'].drop('cuisine',axis=1),
                    total_test[total_test['cuisine']!='indian']['cuisine'])

In [126]:
temp_importances = pd.DataFrame(temp_model.feature_importances_,index = X_train_vect.columns,columns=['importance']).sort_values('importance',ascending=False)
list(((feature_importances[10:]-temp_importances[10:])).sort_values('importance',ascending=False).head(10).index)

['ground turmeric',
 'cumin seed',
 'curry powder',
 'tumeric',
 'corn starch',
 'flour tortillas',
 'green chilies',
 'ghee',
 'ground coriander',
 'ginger']

In [127]:
# this is how I want to print it out for now, may revisit later
temp2 = list(((feature_importances[10:]-temp_importances[10:])).sort_values('importance',ascending=False).head(10).index)
print('here - {}'.format(temp2))

here - ['ground turmeric', 'cumin seed', 'curry powder', 'tumeric', 'corn starch', 'flour tortillas', 'green chilies', 'ghee', 'ground coriander', 'ginger']


In [132]:
# let's do this in a loop
# no limit, some common items get in here
# common items - grated parm cheese (19), salsa (18), garam masala (20), corn starch (19), 
# extra-virgin olive oil (14), olive oil (16), soy sauce (13), fish sauce (13), flour tortillas (14), tomatillos (19),
# avocado (17) taco seasoning (19) cumin (16) italian seasoning (16) mozzarella (17) 
for c in cuisine_list:
    temp_model = rf.fit(total_test[total_test['cuisine']!=c].drop('cuisine',axis=1),
                    total_test[total_test['cuisine']!=c]['cuisine'])
    temp_importances = pd.DataFrame(temp_model.feature_importances_,index = X_train_vect.columns,columns=['importance']).sort_values('importance',ascending=False)
    changed_feat = list(((feature_importances-temp_importances)).sort_values('importance',ascending=False).head(20).index)
    print('Important features for cusine {} = {}'.format(c,changed_feat))

Important features for cusine brazilian = ['grated parmesan cheese', 'salsa', 'garam masala', 'corn starch', 'extra-virgin olive oil', 'olive oil', 'soy sauce', 'fish sauce', 'flour tortillas', 'tomatillos', 'avocado', 'taco seasoning', 'cumin', 'mirin', 'sesame oil', 'italian seasoning', 'chopped cilantro fresh', 'shredded mozzarella cheese', 'corn tortillas', 'all-purpose flour']
Important features for cusine british = ['grated parmesan cheese', 'garam masala', 'corn starch', 'olive oil', 'salsa', 'chili powder', 'flour tortillas', 'soy sauce', 'cumin', 'corn tortillas', 'avocado', 'taco seasoning', 'buttermilk', 'extra-virgin olive oil', 'parmesan cheese', 'black beans', 'jalapeno chilies', 'italian seasoning', 'shredded mozzarella cheese', 'tomatillos']
Important features for cusine cajun_creole = ['grated parmesan cheese', 'cajun seasoning', 'corn starch', 'salsa', 'olive oil', 'corn tortillas', 'fish sauce', 'avocado', 'jalapeno chilies', 'andouille sausage', 'garam masala', 'flo

In [128]:
# let's do this in a loop
# starting from 10 to exclude common items, looking for more random, representative items
for c in cuisine_list:
    temp_model = rf.fit(total_test[total_test['cuisine']!=c].drop('cuisine',axis=1),
                    total_test[total_test['cuisine']!=c]['cuisine'])
    temp_importances = pd.DataFrame(temp_model.feature_importances_,index = X_train_vect.columns,columns=['importance']).sort_values('importance',ascending=False)
    changed_feat = list(((feature_importances[10:]-temp_importances[10:])).sort_values('importance',ascending=False).head(10).index)
    print('Important features for cusine {} = {}'.format(c,changed_feat))

Important features for cusine brazilian = ['corn starch', 'flour tortillas', 'mirin', 'extra-virgin olive oil', 'taco seasoning', 'cumin', 'ginger', 'shredded mozzarella cheese', 'black beans', 'tomatillos']
Important features for cusine british = ['corn starch', 'flour tortillas', 'chili powder', 'cumin', 'extra-virgin olive oil', 'taco seasoning', 'tomatillos', 'chopped cilantro fresh', 'dry white wine', 'italian seasoning']
Important features for cusine cajun_creole = ['cajun seasoning', 'corn starch', 'andouille sausage', 'chopped cilantro fresh', 'creole seasoning', 'tomatillos', 'italian seasoning', 'taco seasoning', 'extra-virgin olive oil', 'jalapeno chilies']
Important features for cusine chinese = ['corn starch', 'oyster sauce', 'hoisin sauce', 'extra-virgin olive oil', 'Shaoxing wine', 'scallions', 'light soy sauce', 'ginger', 'tomatillos', 'taco seasoning']
Important features for cusine filipino = ['corn starch', 'taco seasoning', 'buttermilk', 'jalapeno chilies', 'curry po

In [129]:
# let's do this in a loop
# starting from 10 to exclude common items, looking for more random, representative items
for c in cuisine_list:
    temp_model = rf.fit(total_test[total_test['cuisine']!=c].drop('cuisine',axis=1),
                    total_test[total_test['cuisine']!=c]['cuisine'])
    temp_importances = pd.DataFrame(temp_model.feature_importances_,index = X_train_vect.columns,columns=['importance']).sort_values('importance',ascending=False)
    changed_feat = list(((feature_importances[20:]-temp_importances[20:])).sort_values('importance',ascending=False).head(10).index)
    print('Important features for cusine {} = {}'.format(c,changed_feat))

Important features for cusine brazilian = ['chopped cilantro fresh', 'curry powder', 'taco seasoning', 'ginger', 'parmesan cheese', 'cumin', 'tomatillos', 'italian seasoning', 'oyster sauce', 'shredded mozzarella cheese']
Important features for cusine british = ['italian seasoning', 'taco seasoning', 'tomatillos', 'cumin', 'curry powder', 'parmesan cheese', 'chopped cilantro fresh', 'shredded mozzarella cheese', 'Shaoxing wine', 'oyster sauce']
Important features for cusine cajun_creole = ['cajun seasoning', 'andouille sausage', 'creole seasoning', 'italian seasoning', 'taco seasoning', 'cumin', 'chopped cilantro fresh', 'oyster sauce', 'parmesan cheese', 'tomatillos']
Important features for cusine chinese = ['oyster sauce', 'hoisin sauce', 'Shaoxing wine', 'light soy sauce', 'italian seasoning', 'taco seasoning', 'chinese five-spice powder', 'fresh ginger', 'scallions', 'rice vinegar']
Important features for cusine filipino = ['chopped cilantro fresh', 'curry powder', 'tomatillos', 'i

In [138]:
# Optimize model
def train_RF(n_est, depth):
    rf = RandomForestClassifier(n_estimators=n_est, max_depth=depth, n_jobs=-1)
    
    start = time.time()
    rf_model.fit(X_train_vect, y_train)
    end = time.time()
    fit_time = end - start
    #scores= -1 * cross_val_score(pipeline_temp, train_X, train_y,cv=5,scoring='neg_mean_absolute_error')
    start = time.time()
    y_pred = rf_model.predict(X_test_vect)
    end = time.time()
    predict_time = end - start
    print('Est: {} / Depth: {} ---- Fit time: {} / Predict time: {} / Accuracy: {}'.format(
        n_est,depth,round(fit_time, 3), round(pred_time, 3), round((y_pred==y_test).sum()/len(y_pred), 3)))

In [139]:
for n_est in [10, 50, 100, 200, 500]:
    for depth in [10, 50, 100, None]:
        train_RF(n_est, depth)

Est: 10 / Depth: 10 ---- Fit time: 43.229 / Predict time: 1.071 / Accuracy: 0.571
Est: 10 / Depth: 50 ---- Fit time: 40.111 / Predict time: 1.071 / Accuracy: 0.577
Est: 10 / Depth: 100 ---- Fit time: 33.04 / Predict time: 1.071 / Accuracy: 0.57
Est: 10 / Depth: None ---- Fit time: 34.471 / Predict time: 1.071 / Accuracy: 0.573
Est: 50 / Depth: 10 ---- Fit time: 37.873 / Predict time: 1.071 / Accuracy: 0.573
Est: 50 / Depth: 50 ---- Fit time: 30.649 / Predict time: 1.071 / Accuracy: 0.578
Est: 50 / Depth: 100 ---- Fit time: 33.48 / Predict time: 1.071 / Accuracy: 0.575
Est: 50 / Depth: None ---- Fit time: 33.238 / Predict time: 1.071 / Accuracy: 0.575
Est: 100 / Depth: 10 ---- Fit time: 34.96 / Predict time: 1.071 / Accuracy: 0.574
Est: 100 / Depth: 50 ---- Fit time: 34.118 / Predict time: 1.071 / Accuracy: 0.576
Est: 100 / Depth: 100 ---- Fit time: 36.366 / Predict time: 1.071 / Accuracy: 0.572
Est: 100 / Depth: None ---- Fit time: 34.577 / Predict time: 1.071 / Accuracy: 0.575
Est: 20