In [70]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
# Training split, stored as newline-delimited JSON (one record per line).
train_df = pd.read_json(path_or_buf='../../data/train/new-train.json', lines=True)

In [2]:
# Held-out split, stored as newline-delimited JSON (one record per line).
test_df = pd.read_json(path_or_buf='../../data/test/new-test.json', lines=True)

In [3]:
# Preview the first five training rows.
train_df.head(n=5)

Unnamed: 0,cuisine,ingredients,label
0,greek,"romaine lettuce, black olives, grape tomatoes,...",0
1,greek,"ground pork, finely chopped fresh parsley, oni...",0
2,greek,"minced garlic, dried oregano, red wine vinegar...",0
3,greek,"orange, anise, cinnamon sticks, unflavored gel...",0
4,greek,"fresh dill, yoghurt, salt, myzithra, large egg...",0


In [4]:
from nltk.stem import WordNetLemmatizer

# One shared lemmatizer instance, reused for every string passed to lemmatize().
lemmatizer = WordNetLemmatizer()


def lemmatize(text):
    """Return `text` with every space-separated token replaced by its lemma.

    The string is split on single space characters and re-joined with single
    spaces, so the number and position of tokens is preserved.

    NOTE(review): tokens are not stripped of punctuation before lookup, so a
    token such as "olives," is handed to the lemmatizer with its comma attached.
    """
    lemmas = []
    for token in text.split(' '):
        lemmas.append(lemmatizer.lemmatize(token))
    return ' '.join(lemmas)

In [5]:
# Lemmatize every training ingredient string (overwrites the column in place).
train_df['ingredients'] = train_df['ingredients'].map(lemmatize)

In [6]:
# Lemmatize every test ingredient string (overwrites the column in place).
test_df['ingredients'] = test_df['ingredients'].map(lemmatize)

In [7]:
# Plain Python lists of the training documents and their cuisine labels.
recipes = train_df['ingredients'].tolist()
labels = train_df['cuisine'].tolist()

In [8]:
# Plain Python lists of the test documents (raw strings) and their cuisine labels.
X_test = test_df['ingredients'].tolist()
y_test = test_df['cuisine'].tolist()

In [9]:
import pickle

# Load the pickled corpus that every vectorizer in this notebook is fitted on.
# The context manager closes the file handle, which the bare open() call leaked.
# NOTE(review): pickle.load executes arbitrary code from the file; only load
# data files that come from a trusted source.
with open('../../data/recipes.data', 'rb') as corpus_file:
    corpus = pickle.load(corpus_file)

In [10]:
from nltk.corpus import stopwords

# Project-specific stop words: one entry per line of stopwords.txt, lower-cased.
# The file is read inside a context manager so the handle is closed (the
# original left it open and bound it to `file`, shadowing the Python 2 builtin).
# Empty entries produced by blank lines or a trailing newline are dropped.
with open("stopwords.txt", "r") as stopwords_file:
    sw = [word for word in stopwords_file.read().lower().split('\n') if word]

# Extend the custom list with NLTK's English stop words.
sw = sw + stopwords.words('english')

In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.pipeline import Pipeline

In [51]:
# Experiment: binary bigram-only counts (at most 7000 features) + logistic regression.
# The vocabulary is learned from `corpus`; the classifier is trained on the
# vectorized training recipes.
vectorizer = CountVectorizer(ngram_range=(2,2), max_features = 7000, binary=True, stop_words=sw)
vectorizer.fit(corpus)
X_train = vectorizer.transform(recipes).toarray()
y_train = labels
clf = LogisticRegression()
clf.fit(X_train, y_train)
# Wrap the already-fitted steps so predict() accepts raw recipe strings.
model = Pipeline([('vectorizer', vectorizer),('LR', clf)])
y_pred = model.predict(X_test)
# len(vocabulary_) is the feature count on every scikit-learn version
# (get_feature_names() was removed in 1.2).
print('Number of Features: %d'%len(vectorizer.vocabulary_))
print('Training Accuracy: %f'%metrics.accuracy_score(y_train, model.predict(recipes)))
# accuracy_score takes (y_true, y_pred); the arguments were previously swapped.
print('Testing Accuracy: %f'%metrics.accuracy_score(y_test, y_pred))

Number of Features: 7000
Training Accuracy: 0.784927
Testing Accuracy: 0.743893


In [52]:
# Binary unigram + bigram features (at most 7000), vocabulary learned from `corpus`.
vectorizer = CountVectorizer(
    ngram_range=(1, 2),
    max_features=7000,
    binary=True,
    stop_words=sw
).fit(corpus)

# Training targets and the dense training matrix used by the following cells.
y_train = labels
X_train = vectorizer.transform(recipes).toarray()

In [73]:
# Logistic regression with default settings on the current X_train / y_train,
# evaluated through a pipeline that reuses the current fitted `vectorizer`.
clf = LogisticRegression()
clf.fit(X_train, y_train)
model = Pipeline([('vectorizer', vectorizer),('LR', clf)])
y_pred = model.predict(X_test)
# len(vocabulary_) is the feature count on every scikit-learn version
# (get_feature_names() was removed in 1.2).
print('Number of Features: %d'%len(vectorizer.vocabulary_))
print('Training Accuracy: %f'%metrics.accuracy_score(y_train, model.predict(recipes)))
# accuracy_score takes (y_true, y_pred); the arguments were previously swapped.
print('Testing Accuracy: %f'%metrics.accuracy_score(y_test, y_pred))

Number of Features: 7000
Training Accuracy: 0.870390
Testing Accuracy: 0.832947


In [69]:
# Same evaluation as the default logistic-regression cell, but with the solver
# capped at 9 iterations (max_iter=9).
clf = LogisticRegression(max_iter=9)
clf.fit(X_train, y_train)
model = Pipeline([('vectorizer', vectorizer),('LR', clf)])
y_pred = model.predict(X_test)
# len(vocabulary_) is the feature count on every scikit-learn version
# (get_feature_names() was removed in 1.2).
print('Number of Features: %d'%len(vectorizer.vocabulary_))
print('Training Accuracy: %f'%metrics.accuracy_score(y_train, model.predict(recipes)))
# accuracy_score takes (y_true, y_pred); the arguments were previously swapped.
print('Testing Accuracy: %f'%metrics.accuracy_score(y_test, y_pred))

Number of Features: 7000
Training Accuracy: 0.863504
Testing Accuracy: 0.833598


### import numpy as np
from sklearn.model_selection import validation_curve

train_scores, valid_scores = validation_curve(LogisticRegression(), X_train, y_train, "alpha", train_sizes=[])

In [None]:
# Validation curve for logistic regression over its regularisation parameter C.
# The original call passed only the estimator and could not run; the data, the
# parameter name and the values to sweep are all required.
# NOTE(review): the swept parameter and its range are a guess at the intent --
# adjust `param_name` / `param_range` as needed.
import numpy as np
from sklearn.model_selection import validation_curve

train_scores, valid_scores = validation_curve(
    LogisticRegression(), X_train, y_train,
    param_name='C', param_range=np.logspace(-2, 2, 5), cv=3)

In [26]:
# Experiment: default (unigram) counts with min_df=0.00003, no stop-word list,
# + logistic regression. Vocabulary is learned from `corpus`.
vectorizer = CountVectorizer(min_df=0.00003)
vectorizer.fit(corpus)
X_train = vectorizer.transform(recipes).toarray()
y_train = labels
clf = LogisticRegression()
clf.fit(X_train, y_train)
# Wrap the already-fitted steps so predict() accepts raw recipe strings.
model = Pipeline([('vectorizer', vectorizer),('LR', clf)])
y_pred = model.predict(X_test)
# len(vocabulary_) is the feature count on every scikit-learn version
# (get_feature_names() was removed in 1.2).
print('Number of Features: %d'%len(vectorizer.vocabulary_))
print('Training Accuracy: %f'%metrics.accuracy_score(y_train, model.predict(recipes)))
# accuracy_score takes (y_true, y_pred); the arguments were previously swapped.
print('Testing Accuracy: %f'%metrics.accuracy_score(y_test, y_pred))

Number of Features: 2268
Training Accuracy: 0.842033
Testing Accuracy: 0.781084


In [41]:
# Experiment: 1- to 3-gram counts (at most 7000 features, custom stop words)
# + logistic regression. Vocabulary is learned from `corpus`.
vectorizer = CountVectorizer(ngram_range=(1,3),max_features=7000, stop_words=sw)
vectorizer.fit(corpus)
X_train = vectorizer.transform(recipes).toarray()
y_train = labels
clf = LogisticRegression()
clf.fit(X_train, y_train)
# Wrap the already-fitted steps so predict() accepts raw recipe strings.
model = Pipeline([('vectorizer', vectorizer),('LR', clf)])
y_pred = model.predict(X_test)
# len(vocabulary_) is the feature count on every scikit-learn version
# (get_feature_names() was removed in 1.2).
print('Number of Features: %d'%len(vectorizer.vocabulary_))
print('Training Accuracy: %f'%metrics.accuracy_score(y_train, model.predict(recipes)))
# accuracy_score takes (y_true, y_pred); the arguments were previously swapped.
print('Testing Accuracy: %f'%metrics.accuracy_score(y_test, y_pred))

Number of Features: 7000
Training Accuracy: 0.869414
Testing Accuracy: 0.827704


In [32]:
# Experiment: 1- to 3-gram counts with min_df=0.00003, no stop-word list,
# + logistic regression. Vocabulary is learned from `corpus`.
vectorizer = CountVectorizer(ngram_range=(1,3), min_df=0.00003)
vectorizer.fit(corpus)
X_train = vectorizer.transform(recipes).toarray()
y_train = labels
clf = LogisticRegression()
clf.fit(X_train, y_train)
# Wrap the already-fitted steps so predict() accepts raw recipe strings.
model = Pipeline([('vectorizer', vectorizer),('LR', clf)])
y_pred = model.predict(X_test)
# len(vocabulary_) is the feature count on every scikit-learn version
# (get_feature_names() was removed in 1.2).
print('Number of Features: %d'%len(vectorizer.vocabulary_))
print('Training Accuracy: %f'%metrics.accuracy_score(y_train, model.predict(recipes)))
# accuracy_score takes (y_true, y_pred); the arguments were previously swapped.
print('Testing Accuracy: %f'%metrics.accuracy_score(y_test, y_pred))

Number of Features: 6794
Training Accuracy: 0.870973
Testing Accuracy: 0.795513


In [34]:
# Experiment: unigram counts with min_df=0.00003 and the custom stop-word list
# + logistic regression. Vocabulary is learned from `corpus`.
vectorizer = CountVectorizer(min_df=0.00003, stop_words=sw)
vectorizer.fit(corpus)
X_train = vectorizer.transform(recipes).toarray()
y_train = labels
clf = LogisticRegression()
clf.fit(X_train, y_train)
# Wrap the already-fitted steps so predict() accepts raw recipe strings.
model = Pipeline([('vectorizer', vectorizer),('LR', clf)])
y_pred = model.predict(X_test)
# len(vocabulary_) is the feature count on every scikit-learn version
# (get_feature_names() was removed in 1.2).
print('Number of Features: %d'%len(vectorizer.vocabulary_))
print('Training Accuracy: %f'%metrics.accuracy_score(y_train, model.predict(recipes)))
# accuracy_score takes (y_true, y_pred); the arguments were previously swapped.
print('Testing Accuracy: %f'%metrics.accuracy_score(y_test, y_pred))

Number of Features: 1734
Training Accuracy: 0.833480
Testing Accuracy: 0.814657


In [35]:
# Experiment: unigram + bigram counts with min_df=0.00003 and the custom
# stop-word list + logistic regression. Vocabulary is learned from `corpus`.
vectorizer = CountVectorizer(ngram_range=(1,2), min_df=0.00003, stop_words=sw)
vectorizer.fit(corpus)
X_train = vectorizer.transform(recipes).toarray()
y_train = labels
clf = LogisticRegression()
clf.fit(X_train, y_train)
# Wrap the already-fitted steps so predict() accepts raw recipe strings.
model = Pipeline([('vectorizer', vectorizer),('LR', clf)])
y_pred = model.predict(X_test)
# len(vocabulary_) is the feature count on every scikit-learn version
# (get_feature_names() was removed in 1.2).
print('Number of Features: %d'%len(vectorizer.vocabulary_))
print('Training Accuracy: %f'%metrics.accuracy_score(y_train, model.predict(recipes)))
# accuracy_score takes (y_true, y_pred); the arguments were previously swapped.
print('Testing Accuracy: %f'%metrics.accuracy_score(y_test, y_pred))

Number of Features: 3689
Training Accuracy: 0.856713
Testing Accuracy: 0.823843


In [36]:
# Experiment: unigram + bigram counts with a lower threshold (min_df=0.00001)
# and the custom stop-word list + logistic regression.
vectorizer = CountVectorizer(ngram_range=(1,2), min_df=0.00001, stop_words=sw)
vectorizer.fit(corpus)
X_train = vectorizer.transform(recipes).toarray()
y_train = labels
clf = LogisticRegression()
clf.fit(X_train, y_train)
# Wrap the already-fitted steps so predict() accepts raw recipe strings.
model = Pipeline([('vectorizer', vectorizer),('LR', clf)])
y_pred = model.predict(X_test)
# len(vocabulary_) is the feature count on every scikit-learn version
# (get_feature_names() was removed in 1.2).
print('Number of Features: %d'%len(vectorizer.vocabulary_))
print('Training Accuracy: %f'%metrics.accuracy_score(y_train, model.predict(recipes)))
# accuracy_score takes (y_true, y_pred); the arguments were previously swapped.
print('Testing Accuracy: %f'%metrics.accuracy_score(y_test, y_pred))

Number of Features: 9752
Training Accuracy: 0.878048
Testing Accuracy: 0.830468


In [37]:
# Experiment: unigram + bigram counts with a higher threshold (min_df=0.00005)
# and the custom stop-word list + logistic regression.
vectorizer = CountVectorizer(ngram_range=(1,2), min_df=0.00005, stop_words=sw)
vectorizer.fit(corpus)
X_train = vectorizer.transform(recipes).toarray()
y_train = labels
clf = LogisticRegression()
clf.fit(X_train, y_train)
# Wrap the already-fitted steps so predict() accepts raw recipe strings.
model = Pipeline([('vectorizer', vectorizer),('LR', clf)])
y_pred = model.predict(X_test)
# len(vocabulary_) is the feature count on every scikit-learn version
# (get_feature_names() was removed in 1.2).
print('Number of Features: %d'%len(vectorizer.vocabulary_))
print('Training Accuracy: %f'%metrics.accuracy_score(y_train, model.predict(recipes)))
# accuracy_score takes (y_true, y_pred); the arguments were previously swapped.
print('Testing Accuracy: %f'%metrics.accuracy_score(y_test, y_pred))

Number of Features: 2683
Training Accuracy: 0.849570
Testing Accuracy: 0.819697


In [20]:
# Training-set accuracy of whichever pipeline is currently bound to `model`.
print(metrics.accuracy_score(y_true=y_train, y_pred=model.predict(recipes)))

0.8420332090816672


In [None]:
# Experiment: binary unigram features with min_df=0.00003 and the custom
# stop-word list + logistic regression. Vocabulary is learned from `corpus`.
vectorizer = CountVectorizer(min_df=0.00003, stop_words=sw, binary=True)
vectorizer.fit(corpus)
# len(vocabulary_) is the feature count on every scikit-learn version
# (get_feature_names() was removed in 1.2).
print('Number of Features: %d'%len(vectorizer.vocabulary_))
X_train = vectorizer.transform(recipes).toarray()
y_train = labels
clf = LogisticRegression()
clf.fit(X_train, y_train)
# Wrap the already-fitted steps so predict() accepts raw recipe strings.
model = Pipeline([('vectorizer', vectorizer),('LR', clf)])
y_pred = model.predict(X_test)
print(metrics.accuracy_score(y_test, y_pred))

In [None]:
# Experiment: CountVectorizer with all default settings + logistic regression.
# The original assigned to the misspelled name `vetorizer`, so the rest of the
# cell silently reused the vectorizer left over from an earlier cell.
vectorizer = CountVectorizer()
vectorizer.fit(corpus)
# len(vocabulary_) is the feature count on every scikit-learn version
# (get_feature_names() was removed in 1.2).
print('Number of Features: %d'%len(vectorizer.vocabulary_))
X_train = vectorizer.transform(recipes).toarray()
y_train = labels
clf = LogisticRegression()
clf.fit(X_train, y_train)
# Wrap the already-fitted steps so predict() accepts raw recipe strings.
model = Pipeline([('vectorizer', vectorizer),('LR', clf)])
y_pred = model.predict(X_test)
print(metrics.classification_report(y_test, y_pred))

In [51]:
# Vectorizer used by the saved logistic-regression model: binary unigram +
# bigram features, min_df=0.00003, custom stop words; vocabulary from `corpus`.
vectorizer = CountVectorizer(ngram_range=(1,2), min_df=0.00003, stop_words=sw, binary=True)
vectorizer.fit(corpus)
# len(vocabulary_) is the feature count on every scikit-learn version
# (get_feature_names() was removed in 1.2).
print('Number of Features: %d'%len(vectorizer.vocabulary_))

Number of Features: 3689


In [53]:
# Training targets, and the dense feature matrix built with the current vectorizer.
y_train = labels
X_train = vectorizer.transform(recipes).toarray()

In [54]:
# Default logistic regression trained on the current X_train / y_train.
# The fit() call is left as the last expression so the estimator is displayed.
clf = LogisticRegression()
clf.fit(X=X_train, y=y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [55]:
from sklearn.pipeline import Pipeline

# Bundle the fitted vectorizer and classifier so the model accepts raw strings.
LogReg_model = Pipeline(steps=[('vectorizer', vectorizer), ('LR', clf)])

In [56]:
from sklearn import metrics

# Per-cuisine precision / recall / F1 of the pipeline on the test recipes.
y_pred = LogReg_model.predict(X_test)
print(metrics.classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

   brazilian       0.85      0.48      0.61       117
     british       0.60      0.30      0.40       201
cajun_creole       0.81      0.65      0.72       387
     chinese       0.86      0.92      0.89      3294
    filipino       0.71      0.38      0.50       189
      french       0.70      0.60      0.64      2205
       greek       0.90      0.83      0.86      1544
      indian       0.93      0.95      0.94      2119
       irish       0.71      0.44      0.55       167
     italian       0.76      0.99      0.86      4310
    jamaican       0.82      0.57      0.67       132
    japanese       0.88      0.80      0.84      1557
      korean       0.77      0.59      0.67       208
     mexican       0.90      0.97      0.93      3600
    moroccan       0.83      0.64      0.72       206
     russian       0.65      0.26      0.37       123
 southern_us       0.74      0.65      0.69      1080
     spanish       0.85    

In [62]:
# Persist the fitted pipeline. The context manager flushes and closes the file;
# the original passed a bare open() handle that was never closed.
filename = 'model.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(LogReg_model, model_file)

In [60]:
# Unigram-only counterpart of the saved model: binary features, min_df=0.00003,
# custom stop words; vocabulary learned from `corpus`.
unigramVec = CountVectorizer(min_df=0.00003, stop_words=sw, binary=True)
unigramVec.fit(corpus)
# The original computed the feature count mid-cell without displaying it;
# print it instead. len(vocabulary_) works on every scikit-learn version
# (get_feature_names() was removed in 1.2).
print('Number of Features: %d'%len(unigramVec.vocabulary_))
X_train = unigramVec.transform(recipes).toarray()
y_train = labels
clf = LogisticRegression()
clf.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [61]:
from sklearn import metrics

# Pipeline pairing the unigram vectorizer with the classifier currently in `clf`,
# followed by its per-cuisine report on the test recipes.
LogRegUni = Pipeline(steps=[('vectorizer', unigramVec), ('LR', clf)])
y_pred = LogRegUni.predict(X_test)
print(metrics.classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

   brazilian       0.81      0.41      0.55       117
     british       0.55      0.25      0.35       201
cajun_creole       0.77      0.63      0.69       387
     chinese       0.84      0.91      0.87      3294
    filipino       0.65      0.34      0.45       189
      french       0.68      0.59      0.63      2205
       greek       0.90      0.82      0.86      1544
      indian       0.93      0.96      0.94      2119
       irish       0.73      0.40      0.51       167
     italian       0.76      0.98      0.86      4310
    jamaican       0.82      0.58      0.68       132
    japanese       0.88      0.80      0.84      1557
      korean       0.74      0.53      0.62       208
     mexican       0.89      0.96      0.93      3600
    moroccan       0.83      0.61      0.70       206
     russian       0.71      0.26      0.38       123
 southern_us       0.71      0.64      0.68      1080
     spanish       0.86    

In [258]:
from sklearn.naive_bayes import BernoulliNB

# Bernoulli naive Bayes baseline.
# Rebuild the training matrix with the same `vectorizer` that is placed in the
# pipeline below. The original trained on whatever X_train was left in the
# kernel (the unigram matrix when the notebook is run top to bottom), so the
# classifier and the pipeline's vectorizer did not share a feature space.
X_train = vectorizer.transform(recipes).toarray()
y_train = labels
clf = BernoulliNB()
clf.fit(X_train, y_train)
# The step label 'LR' is a leftover name; it is kept so lookups by step name
# continue to work.
NB_model = Pipeline([('vectorizer', vectorizer),('LR', clf)])
y_pred = NB_model.predict(X_test)
print(metrics.classification_report(y_test, y_pred))

              precision    recall  f1-score   support

   brazilian       0.03      0.18      0.05       117
     british       0.12      0.23      0.16       201
cajun_creole       0.54      0.67      0.60       387
     chinese       0.87      0.85      0.86      3294
    filipino       0.33      0.28      0.30       189
      french       0.41      0.32      0.36      2205
       greek       0.85      0.74      0.79      1544
      indian       0.93      0.89      0.91      2119
       irish       0.13      0.32      0.19       167
     italian       0.73      0.88      0.80      4310
    jamaican       0.24      0.43      0.31       132
    japanese       0.61      0.33      0.43      1557
      korean       0.68      0.51      0.58       208
     mexican       0.87      0.76      0.81      3600
    moroccan       0.36      0.62      0.45       206
     russian       0.10      0.16      0.12       123
 southern_us       0.57      0.56      0.57      1080
     spanish       0.60    

In [259]:
from sklearn.tree import DecisionTreeClassifier

# Decision-tree baseline.
# Rebuild the training matrix with the same `vectorizer` that is placed in the
# pipeline below. The original trained on whatever X_train was left in the
# kernel (the unigram matrix when the notebook is run top to bottom), so the
# classifier and the pipeline's vectorizer did not share a feature space.
X_train = vectorizer.transform(recipes).toarray()
y_train = labels
clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)
# The step label 'LR' is a leftover name; it is kept so lookups by step name
# continue to work.
DT_model = Pipeline([('vectorizer', vectorizer),('LR', clf)])
y_pred = DT_model.predict(X_test)
print(metrics.classification_report(y_test, y_pred))

              precision    recall  f1-score   support

   brazilian       0.48      0.41      0.44       117
     british       0.24      0.18      0.21       201
cajun_creole       0.56      0.51      0.54       387
     chinese       0.86      1.00      0.92      3294
    filipino       0.50      0.32      0.39       189
      french       0.60      0.57      0.59      2205
       greek       0.78      0.78      0.78      1544
      indian       0.86      0.88      0.87      2119
       irish       0.33      0.25      0.28       167
     italian       0.74      0.83      0.79      4310
    jamaican       0.47      0.38      0.42       132
    japanese       0.80      0.68      0.73      1557
      korean       0.59      0.45      0.51       208
     mexican       0.86      0.86      0.86      3600
    moroccan       0.45      0.37      0.40       206
     russian       0.20      0.15      0.17       123
 southern_us       0.56      0.50      0.53      1080
     spanish       0.65    

In [245]:
import numpy as np

# For every cuisine, list the 10 features with the largest logistic-regression
# coefficients, strongest first.
# The fitted steps are taken from LogReg_model instead of the globals
# `clf` / `vectorizer`: when the notebook is run top to bottom `clf` has been
# rebound to a classifier without `coef_` by this point.
lr_vectorizer = LogReg_model.named_steps['vectorizer']
lr_clf = LogReg_model.named_steps['LR']
# Feature names ordered by column index (`features` was never defined before;
# sorting the vocabulary by index works on every scikit-learn version).
features = sorted(lr_vectorizer.vocabulary_, key=lr_vectorizer.vocabulary_.get)
for i in range(0, lr_clf.coef_.shape[0]):
    print('%s top ingredients:' % lr_clf.classes_[i])
    # argsort is ascending, so the last 10 indices are the largest coefficients.
    top_indices = np.argsort(lr_clf.coef_[i])[-10:]
    for j in top_indices[::-1]:
        # print() call form: valid on Python 2 and 3 (was a Python 2 statement).
        print(features[j])
    print('============================')

brazilian top ingredients:
cachaca
granola
brown basmati
crema
starch
instant coffee
palm
cookies
tapioca flour
idaho
british top ingredients:
scotch
crumbles
haddock
chili flake
gingerroot
pound
baked
style plain
port
chicken stock
cajun_creole top ingredients:
creole
cajun
andouille
crawfish
salami
smoked sausage
chicken sausage
fat mayonnaise
okra
pecan
chinese top ingredients:
mandarin
szechwan
chinese
mein
char
conimex
custard
spring onion
yardlong
chili oil
filipino top ingredients:
coriander seed
green mango
tilapia
jackfruit
oregano leaf
blackberry
serrano chile
mung
chili paste
peppercorn
french top ingredients:
fried
herbes
vanilla cake
onion soup
challa
calvados
delicious
creamer
blanc
cognac
greek top ingredients:
greek
phyllo
yogurt
greek seasoning
lamb
yoghurt
tahini
feta
oregano
feta cheese
indian top ingredients:
tandoori
masala
naan
cardamom
curry
urad
chutney
yoghurt
cardamon
yellow cornmeal
irish top ingredients:
irish
color
stout
coconut cream
sweet cherry
sweet sau

In [None]:
# Interactive check: classify a recipe typed by the user and show the
# per-cuisine probabilities of the logistic-regression pipeline.

# raw_input only exists on Python 2; fall back to input on Python 3.
try:
    read_line = raw_input
except NameError:
    read_line = input

input_recipe = read_line("Input Recipe: \n> ")
# Apply the same lemmatization used on the training recipes. The original
# called an undefined `stem` helper.
input_recipe = lemmatize(input_recipe.lower())

# Use the fitted steps of LogReg_model so the vectorizer and classifier are
# guaranteed to belong together, whatever `clf` currently refers to.
input_vectorizer = LogReg_model.named_steps['vectorizer']
input_clf = LogReg_model.named_steps['LR']
X_input = input_vectorizer.transform([input_recipe]).toarray()
prob = input_clf.predict_proba(X_input)[0]
classes = input_clf.classes_

print('\nIdentified Ingredients: \n> %s'%input_vectorizer.inverse_transform(X_input))

print('\nCuisine Probabilities:')
for i in range(len(prob)):
    # Probabilities are shown as whole percentages (truncated, not rounded).
    print('> %s: %s%%'%(classes[i], int(prob[i]*100)))

In [None]:
import pickle

# Persist the fitted pipeline. The context manager flushes and closes the file;
# the original passed a bare open() handle that was never closed.
filename = 'LogReg_v5.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(LogReg_model, model_file)