In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
train_df = pd.read_json('../../data/train/trainText-75-25-bow.json', lines=True)

In [2]:
test_df = pd.read_json('../../data/test/testText-75-25-bow.json', lines=True)

In [3]:
train_df.head()

Unnamed: 0,cuisine,id,ingredients,label,numeric_recipe,recipe
0,greek,10259,"[romaine lettuce, black olives, grape tomatoes...",0,romaine lettuce black olives grape tomatoes ga...,romaine lettuce black olives grape tomatoes ga...
1,greek,34471,"[ground pork, finely chopped fresh parsley, on...",0,ground pork finely chopped fresh parsley onion...,ground pork finely chopped fresh parsley onion...
2,greek,4635,"[minced garlic, dried oregano, red wine vinega...",0,minced garlic dried oregano red wine vinegar o...,minced garlic dried oregano red wine vinegar o...
3,greek,5980,"[orange, anise, cinnamon sticks, unflavored ge...",0,orange anise cinnamon sticks unflavored gelati...,orange anise cinnamon sticks unflavored gelati...
4,greek,18031,"[fresh dill, yoghurt, salt, myzithra, large eg...",0,fresh dill yoghurt salt myzithra large eggs ch...,fresh dill yoghurt salt myzithra large eggs ch...


In [4]:
from nltk.stem import WordNetLemmatizer 
  
lemmatizer = WordNetLemmatizer() 
  
def lemmatize(text):
    return ' '.join([lemmatizer.lemmatize(word) for word in text.split(' ')])

In [5]:
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer('english')
def stem(text):
    return ' '.join([stemmer.stem(word) for word in text.split(' ')])

In [6]:
file = open("stopwords.txt", "r")
stopwords = file.read().lower().split('\n')
def remove_stopwords(text):
    filtered = ""
    for word in text.split(' '):
        if not word in stopwords:
            filtered = filtered + ' ' + word
    return filtered

In [7]:
train_df['recipe'] = train_df['recipe'].apply(remove_stopwords)
train_df['recipe'] = train_df['recipe'].apply(lemmatize)

In [8]:
train_df.head()

Unnamed: 0,cuisine,id,ingredients,label,numeric_recipe,recipe
0,greek,10259,"[romaine lettuce, black olives, grape tomatoes...",0,romaine lettuce black olives grape tomatoes ga...,romaine lettuce black olive grape tomato garl...
1,greek,34471,"[ground pork, finely chopped fresh parsley, on...",0,ground pork finely chopped fresh parsley onion...,pork parsley onion salt vinegar caul fat
2,greek,4635,"[minced garlic, dried oregano, red wine vinega...",0,minced garlic dried oregano red wine vinegar o...,minced garlic dried oregano red wine vinegar ...
3,greek,5980,"[orange, anise, cinnamon sticks, unflavored ge...",0,orange anise cinnamon sticks unflavored gelati...,orange anise cinnamon stick unflavored gelati...
4,greek,18031,"[fresh dill, yoghurt, salt, myzithra, large eg...",0,fresh dill yoghurt salt myzithra large eggs ch...,dill yoghurt salt myzithra egg cheese feta ch...


In [9]:
test_df['recipe'] = test_df['recipe'].apply(remove_stopwords)
test_df['recipe'] = test_df['recipe'].apply(lemmatize)

In [10]:
recipes = list(train_df['recipe'])
labels = list(train_df['cuisine'])

In [11]:
X_test = list(test_df['recipe'])
y_test = list(test_df['cuisine'])

In [14]:
vectorizer = CountVectorizer(ngram_range=(1,2), min_df=0.0005, stop_words='english', binary=True)
vectorizer.fit(recipes)
print('Number of Features: %d'%len(vectorizer.get_feature_names()))

Number of Features: 5675


In [15]:
vectorizer.get_feature_names()

[u'active',
 u'active dry',
 u'added',
 u'adobo',
 u'adobo sauce',
 u'agave',
 u'agave nectar',
 u'aged',
 u'ale',
 u'alfredo',
 u'alfredo sauce',
 u'allspice',
 u'allspice berry',
 u'allspice cinnamon',
 u'almond',
 u'almond butter',
 u'almond egg',
 u'almond extract',
 u'almond flour',
 u'almond garlic',
 u'almond meal',
 u'almond milk',
 u'almond paste',
 u'almond salt',
 u'amaretto',
 u'amchur',
 u'american',
 u'american cheese',
 u'amino',
 u'anaheim',
 u'anaheim chile',
 u'ancho',
 u'ancho chile',
 u'ancho powder',
 u'anchovy',
 u'anchovy fillet',
 u'anchovy paste',
 u'andouille',
 u'andouille sausage',
 u'angel',
 u'angel hair',
 u'anise',
 u'anise cinnamon',
 u'anise clove',
 u'anise seed',
 u'apple',
 u'apple cider',
 u'apple juice',
 u'applesauce',
 u'apricot',
 u'apricot jam',
 u'apricot preserve',
 u'arbol',
 u'arborio',
 u'arborio rice',
 u'arrowroot',
 u'arrowroot powder',
 u'artichok',
 u'artichok heart',
 u'artichoke',
 u'artichoke heart',
 u'arugula',
 u'asadero',
 u'a

In [16]:
#import pickle
#filename = 'vectorizer.sav'
#pickle.dump(vectorizer, open(filename,'wb'))

In [17]:
X_train = vectorizer.transform(recipes).toarray()
y_train = labels

In [18]:
from sklearn.tree import DecisionTreeClassifier
clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [19]:
from sklearn.pipeline import Pipeline
DecTree_model = Pipeline([('vectorizer', vectorizer),('LR', clf)])

In [20]:
from sklearn import metrics
y_pred = DecTree_model.predict(X_test)
print(metrics.classification_report(y_test, y_pred))

              precision    recall  f1-score   support

   brazilian       0.55      0.45      0.50       117
     british       0.31      0.22      0.26       201
cajun_creole       0.59      0.50      0.54       387
     chinese       0.87      1.00      0.93      3294
    filipino       0.49      0.34      0.40       189
      french       0.65      0.59      0.62      2205
       greek       0.82      0.79      0.81      1544
      indian       0.87      0.89      0.88      2119
       irish       0.37      0.31      0.34       167
     italian       0.60      0.90      0.72      4310
    jamaican       0.58      0.36      0.44       132
    japanese       0.82      0.71      0.76      1557
      korean       0.61      0.38      0.47       208
     mexican       0.79      0.53      0.63      3600
    moroccan       0.52      0.39      0.45       206
     russian       0.24      0.16      0.20       123
 southern_us       0.58      0.53      0.55      1080
     spanish       0.66    

In [None]:
input_recipe = input("Input Recipe: \n> ")
input_recipe = stem(input_recipe.lower())
X_input = vectorizer.transform([input_recipe]).toarray()
pred = clf.predict(X_input)
classes = clf.classes_

print('\nIdentified Ingredients: \n> %s'%vectorizer.inverse_transform(X_input))

print('\nCuisine Prediction: %s'%pred)

In [21]:
import pickle
filename = 'DecTreeV1.sav'
pickle.dump(DecTree_model, open(filename,'wb'))