In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline, FeatureUnion, make_pipeline
from sklearn.preprocessing import Imputer, StandardScaler, RobustScaler
import xgboost as xgb
from sklearn.svm import SVC, LinearSVC
from sklearn.model_selection import RepeatedStratifiedKFold, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.multiclass import OneVsRestClassifier
import datetime
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
import os
import nltk
import gensim

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import re
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Apply seaborn's default plot styling.
sns.set()
# Seed NumPy's global random generator.
np.random.seed(123)

In [9]:
def load_n_prep(name):
    """Load ``<name>.json`` and derive per-recipe ingredient features.

    Parameters
    ----------
    name : str
        File name (or path) without the ``.json`` extension.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``id``. Besides the columns read from the file it holds:

        * ``ingredients_count`` - number of entries in the ingredient list;
        * ``ingredients_word_count`` - list with the whitespace-separated
          word count of every ingredient;
        * ``ingredients`` - the list rendered with ``str`` and with every
          ``[`` / ``]`` character removed, e.g. ``"'salt', 'olive oil'"``.
    """
    df = pd.read_json('{}.json'.format(name)).set_index('id')
    df['ingredients_count'] = df['ingredients'].apply(len)
    df['ingredients_word_count'] = df['ingredients'].apply(
        lambda ingredients: [len(ingredient.split()) for ingredient in ingredients])
    # The list is stringified (quotes and commas kept) rather than joined with
    # ' ', because a plain join loses the boundaries of multi-word ingredients.
    # Raw string: '\[' and '\]' are invalid escapes in a normal string literal.
    df['ingredients'] = df['ingredients'].astype(str).apply(
        lambda ingredients: re.sub(r'\[|\]', '', ingredients))
    return df

In [10]:
# Load the training split and preview its first five rows.
train = load_n_prep(name='train')
train.head(n=5)

Unnamed: 0_level_0,cuisine,ingredients,ingredients_count,ingredients_word_count
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
10259,greek,"'romaine lettuce', 'black olives', 'grape toma...",9,"[2, 2, 2, 1, 1, 2, 1, 2, 3]"
25693,southern_us,"'plain flour', 'ground pepper', 'salt', 'tomat...",11,"[2, 2, 1, 1, 3, 1, 1, 2, 3, 1, 2]"
20130,filipino,"'eggs', 'pepper', 'salt', 'mayonaise', 'cookin...",12,"[1, 1, 1, 1, 2, 2, 3, 2, 2, 2, 1, 2]"
22213,indian,"'water', 'vegetable oil', 'wheat', 'salt'",4,"[1, 2, 1, 1]"
13162,indian,"'black pepper', 'shallots', 'cornflour', 'caye...",20,"[2, 1, 1, 2, 1, 2, 1, 1, 1, 2, 1, 2, 1, 1, 2, ..."


In [85]:
# Flatten every recipe's ingredient list into one list of raw ingredient
# strings (duplicates kept).
# A comprehension replaces `Series.iteritems()`, which was removed in pandas 2.0.
df = pd.read_json('train.json')
total = [ingredient for ingredients in df.ingredients for ingredient in ingredients]

In [120]:
# Strip every character that is neither whitespace nor a word character and
# list the distinct cleaned ingredient names.
# The result is kept under its own name: overwriting `total` would leave the
# punctuation probes further down (which search `total` for '%' and other
# symbols) with nothing to find when the notebook is run top to bottom.
total_clean = [re.sub(r'[^\s\w]', '', t) for t in total]
sorted(set(total_clean))

['    oz tomato sauce',
 '   oz tomato paste',
 '1 lowfat buttermilk',
 '1 lowfat chocolate milk',
 '1 lowfat cottage cheese',
 '1 lowfat milk',
 '10 oz frozen chopped spinach',
 '10 oz frozen chopped spinach thawed and squeezed dry',
 '14 oz sweetened condensed milk',
 '145 oz diced tomatoes',
 '15 oz refried beans',
 '2 12 to 3 lb chicken cut into serving pieces',
 '2 low fat cheddar chees',
 '2 lowfat cottage cheese',
 '2 lowfat greek yogurt',
 '2 milk shredded mozzarella cheese',
 '2 reducedfat milk',
 '25 less sodium chicken broth',
 '33 less sodium cooked deli ham',
 '33 less sodium cooked ham',
 '33 less sodium ham',
 '33 less sodium smoked fully cooked ham',
 '40 less sodium taco seasoning',
 '40 less sodium taco seasoning mix',
 '7 Up',
 '8 ounc ziti pasta cook and drain',
 '95 lean ground beef',
 'A Taste of Thai Rice Noodles',
 'Accent Seasoning',
 'Adobo All Purpose Seasoning',
 'Alaskan king crab legs',
 'Alexia Waffle Fries',
 'Alfredo sauce',
 'Amarena cherries',
 'Amare

In [None]:
# TODO: normalise spelling variants, e.g. 'low fat' -> 'lowfat', 'v 8' -> 'v8'

In [109]:
# Distinct percentage tokens (digits followed by '%') across all ingredients.
# Disabled probe for parenthesised sizes:
# set(re.findall(r'\([\d\soz\.]+\)', ' '.join(total)))
{match for match in re.findall(r'\d+%', ' '.join(total))}

{'1%', '2%', '25%', '33%', '40%', '95%', '96%'}

In [117]:
# Every distinct character in the ingredient strings that is neither
# whitespace nor a word character.
{symbol for symbol in re.findall(r'[^\s\w]', ' '.join(total))}

{'!', '%', '&', "'", '(', ')', ',', '-', '.', '/', '®', '’', '€', '™'}

In [27]:
# Vocabulary of distinct whitespace-separated tokens over all recipes.
# Sorted so that everything derived from this list comes out in the same order
# on every run; the order of a plain list(set(...)) of strings changes between
# kernel restarts because string hashing is randomised per process.
set_ingredients = sorted(set(' '.join(train.ingredients).split()))
len(set_ingredients)

3589

In [58]:
# Imported in this cell because `stopwords` is needed here, before the later
# cell that imports it has run.
from nltk.corpus import stopwords

# Group the vocabulary by token length: {length: [lower-cased tokens]},
# skipping NLTK's English stop words. Punctuation is deliberately left in.
stops = set(stopwords.words("english"))
word_len_sort = {}
for w in set_ingredients:
    w = w.lower()
    if w not in stops:
        word_len_sort.setdefault(len(w), []).append(w)

word_len_sort.keys()

dict_keys([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 18, 19])

In [80]:
# Alphabetical listing of the distinct three-character tokens.
three_char_tokens = set(word_len_sort[3])
sorted(three_char_tokens)

['(10',
 '(14',
 '(15',
 '1/2',
 '100',
 '25%',
 '33%',
 '40%',
 '95%',
 '96%',
 'age',
 'ahi',
 'aka',
 'ale',
 'bag',
 'bai',
 'bar',
 'bay',
 'bbq',
 'bee',
 'big',
 'bob',
 'bok',
 'boy',
 'bun',
 'cai',
 'cod',
 'con',
 'cut',
 'dal',
 'day',
 'dew',
 'dip',
 'dog',
 'dr.',
 'dri',
 'dry',
 'ear',
 'eau',
 'eel',
 'egg',
 'eye',
 'fat',
 'fig',
 'fri',
 'fry',
 'gai',
 'gao',
 'gel',
 'gem',
 'gin',
 'gum',
 'guy',
 'ham',
 'hen',
 'hip',
 'hog',
 'hoi',
 'hot',
 'ice',
 'imo',
 "it'",
 'jam',
 'jif',
 'jus',
 'key',
 'kha',
 'kim',
 'lan',
 'lap',
 'lb.',
 'lea',
 'leg',
 'lop',
 'low',
 'lox',
 'mae',
 'mam',
 'mex',
 'mie',
 'min',
 'mix',
 'msg',
 'nam',
 'new',
 'ngo',
 'non',
 'nut',
 'oat',
 'oil',
 'old',
 'one',
 'opo',
 'pad',
 'pak',
 'pam',
 'pan',
 'pao',
 'pat',
 'pea',
 'pho',
 'pie',
 'pig',
 'pit',
 'pla',
 'pod',
 'poi',
 'pop',
 'pot',
 'puy',
 'qua',
 'ras',
 'raw',
 'red',
 'rib',
 'roe',
 'rub',
 'rum',
 'rye',
 'san',
 'sea',
 'sec',
 'sel',
 'siu',
 'soi',


In [72]:
# Ingredient strings that contain a hyphen surrounded by spaces (' - ').
has_spaced_hyphen = train.ingredients.str.contains(' - ')
train.loc[has_spaced_hyphen, 'ingredients'].values

array(['grated parmesan cheese medium shrimp Knorr® Pasta Sides™ - Alfredo baby spinach cherry tomatoes oil',
       "corn kernels purple onion Knorr® Fiesta Sides™ - Mexican Rice large tomato boneless sirloin steak I Can't Believe It's Not Butter!® Spread",
       'Knorr® Pasta Sides™ - Alfredo tomatoes fresh basil leaves provolone cheese vegetable oil boneless skinless chicken breast halves',
       'boneless chicken skinless thigh Knorr® Pasta Sides™ - Chicken flavor cumin nonfat plain greek yogurt red bell pepper olive oil lemon juice chili powder onions',
       "corn kernels boneless sirloin steak I Can't Believe It's Not Butter!® Spread Knorr® Fiesta Sides™ - Mexican Rice tomatoes purple onion",
       'tomatoes cream cheese, soften shredded mozzarella cheese Knorr® Pasta Sides™ - Butter & Herb cut up cooked chicken frozen chopped spinach, thawed and squeezed dry'],
      dtype=object)

In [77]:
# Print "<token length> <number of tokens>" for every length bucket.
for length, words in word_len_sort.items():
    print(length, len(words))

1 12
2 26
3 169
4 483
5 589
6 642
7 551
8 444
9 286
10 188
11 85
12 39
13 24
14 7
15 5
16 5
18 1
19 2


In [126]:
# Inspect the tokens that are exactly 16 characters long.
word_len_sort[16]

['chocolatecovered',
 'butter-margarine',
 'hickory-flavored',
 'vegetable-filled',
 'chicken-flavored']

In [5]:
import multiprocessing
def model_run(model, X_train, X_test, y_train, y_test):
    """Fit ``model`` on the training split and score it on both splits.

    Returns
    -------
    tuple
        ``(train_score, test_score)``, each computed with
        ``metrics.accuracy_score`` on the model's predictions.
    """
    model.fit(X_train, y_train)

    splits = ((X_train, y_train), (X_test, y_test))
    # Predict on the training data first, then on the held-out data.
    predictions = [model.predict(features) for features, _ in splits]
    train_score, test_score = [
        metrics.accuracy_score(labels, predicted)
        for (_, labels), predicted in zip(splits, predictions)
    ]
    return train_score, test_score

def test_on_train(model, X, y):
    """Estimate train/test accuracy of ``model`` with 5-fold stratified CV.

    Every fold is fitted and scored by ``model_run`` in a worker process of a
    ``multiprocessing.Pool``; the elapsed wall-clock time is printed.

    Parameters
    ----------
    model : object
        Passed unchanged to ``model_run`` for every fold.
    X, y : objects supporting positional ``.iloc`` indexing
        Features and labels (e.g. pandas Series / DataFrame).

    Returns
    -------
    pandas.Series
        Mean ``train_score`` and ``test_score`` over the folds.
    """
    start = datetime.datetime.now()

    # No `random_state` here: without shuffle=True the seed never influenced
    # the folds, and newer scikit-learn releases raise ValueError for that
    # combination. To shuffle reproducibly, pass shuffle=True together with a
    # random_state instead.
    splitter = StratifiedKFold(n_splits=5)
    input_to_multi = [
        [model, X.iloc[train_index], X.iloc[test_index],
         y.iloc[train_index], y.iloc[test_index]]
        for train_index, test_index in splitter.split(X, y)
    ]

    with multiprocessing.Pool() as p:
        KFold_results = p.starmap(model_run, input_to_multi)

    print(datetime.datetime.now() - start)
    return pd.DataFrame(KFold_results,
                        columns=['train_score', 'test_score']).mean(axis=0)

In [None]:
class MeanEmbeddingVectorizer(object):
    """Represent each document as the mean of its words' embedding vectors.

    Parameters
    ----------
    word2vec : mapping from word to 1-D vector
        Must support ``w in word2vec`` and ``word2vec[w]``. A plain ``dict``
        works; an object exposing a ``vector_size`` attribute is accepted as
        well, in which case that attribute supplies the dimensionality.

    Raises
    ------
    ValueError
        If the dimensionality cannot be determined because the mapping is empty.
    """

    def __init__(self, word2vec):
        self.word2vec = word2vec
        # if a text is empty we should return a vector of zeros
        # with the same dimensionality as all the other vectors
        self.dim = self._embedding_dim(word2vec)

    @staticmethod
    def _embedding_dim(word2vec):
        """Return the length of the embedding vectors held by ``word2vec``."""
        size = getattr(word2vec, "vector_size", None)
        if size is not None:
            return int(size)
        # Python 3 replacement for the Python 2 only `itervalues().next()`.
        first = next(iter(word2vec.values()), None)
        if first is None:
            raise ValueError("word2vec must contain at least one vector")
        return len(first)

    def fit(self, X, y=None):
        """No-op; ``y`` is optional so the class can be fitted without labels."""
        return self

    def transform(self, X):
        """Return an array with one mean vector per document in ``X``.

        ``X`` is an iterable of token sequences. Tokens missing from the
        embedding are ignored; a document without any known token becomes a
        zero vector of length ``self.dim``.
        """
        rows = []
        for words in X:
            vectors = [self.word2vec[w] for w in words if w in self.word2vec]
            rows.append(np.mean(vectors or [np.zeros(self.dim)], axis=0))
        return np.array(rows)

In [81]:
from nltk.stem.snowball import SnowballStemmer
from nltk import tokenize
from nltk.corpus import stopwords
# import nltk
# nltk.download('stopwords')
    
class StemmedCountVectorizer(TfidfVectorizer):
    """``TfidfVectorizer`` with punctuation stripping, stop-word removal and stemming.

    Despite the name this extends ``TfidfVectorizer``. The analyzer it builds
    removes every character that is neither a word character nor whitespace
    from the document, runs the parent analyzer on the result, drops tokens
    found in NLTK's English stop-word list and passes each remaining token
    through the English Snowball stemmer.
    """

    def build_analyzer(self):
        """Return a callable mapping a document to its list of stemmed tokens."""
        analyzer = super(StemmedCountVectorizer, self).build_analyzer()
        stemmer = SnowballStemmer("english")
        stops = set(stopwords.words("english"))
        # Raw string: '\w' and '\s' are invalid escapes in a normal string
        # literal. Compiled once here instead of being re-parsed per document.
        non_word = re.compile(r"[^\w\s]")

        def analyze(doc):
            return [stemmer.stem(w) for w in analyzer(non_word.sub("", doc)) if w not in stops]

        return analyze
    
# Features are the flattened ingredient strings, the target is the cuisine label.
# Disabled experiment: drop recipes with more than 40 ingredients, i.e.
#   out_index = train[train.ingredients_count > 40].index
#   X = train.ingredients.drop(out_index); y = train.cuisine.drop(out_index)
X = train['ingredients']
y = train['cuisine']

# Stemmed TF-IDF over unigrams and bigrams feeding an L2-penalised logistic
# regression. Other final estimators that were tried in this slot and are
# currently disabled: OneVsRestClassifier(LogisticRegression),
# RandomForestClassifier, OneVsRestClassifier(SVC), GradientBoostingClassifier
# and xgb.XGBClassifier.
model = Pipeline(steps=[
    ('bag_of_words', StemmedCountVectorizer(stop_words='english', ngram_range=(1, 2), lowercase=True)),
    ('logreg', LogisticRegression(C=10, penalty='l2', max_iter=10000)),
])

In [82]:
# Single stratified hold-out evaluation of `model`; wall-clock timestamps are
# printed before and after. `test_on_train(model, X, y)` is the cross-validated
# alternative.
np.random.seed(123)
print(datetime.datetime.now())

x_train, x_test, y_train, y_test = train_test_split(X, y, stratify=y)
model.fit(x_train, y_train)
train_pred = model.predict(x_train)
test_pred = model.predict(x_test)

# Training accuracy first, then hold-out accuracy.
for truth, predicted in ((y_train, train_pred), (y_test, test_pred)):
    print(metrics.accuracy_score(truth, predicted))

print(datetime.datetime.now())

2018-07-27 19:25:55.664556
0.9837412001340932
0.791331456154465
2018-07-27 19:27:30.764309


In [133]:
# Vectorise the whole corpus, ignoring terms that occur in more than 30% of
# the recipes. fit_transform learns the vocabulary and builds the matrix in a
# single pass, so each recipe is tokenised and stemmed once instead of twice.
vec = StemmedCountVectorizer(lowercase=True, ngram_range=(1, 2), stop_words='english', max_df=0.3)
x_trans = vec.fit_transform(X)
x_trans

<39774x77936 sparse matrix of type '<class 'numpy.float64'>'
	with 1369399 stored elements in Compressed Sparse Row format>

In [128]:
# Re-display the transformed matrix.
# NOTE(review): this cell's execution count is lower than that of the cell
# which assigns x_trans, so the saved output may come from an earlier run.
x_trans

<39774x77943 sparse matrix of type '<class 'numpy.float64'>'
	with 1496944 stored elements in Compressed Sparse Row format>

# model feature selection

In [135]:
# Random forest on the stemmed TF-IDF features, fitted on all data in order to
# read off per-feature importances.
np.random.seed(123)
importance_model = Pipeline([
    ('bag_of_words', StemmedCountVectorizer(lowercase=True, ngram_range=(1, 2), stop_words='english')),
    # max_features='sqrt' is what the former 'auto' alias meant for
    # classifiers; newer scikit-learn releases no longer accept 'auto'.
    ('forest', RandomForestClassifier(n_estimators=100, criterion='gini', max_depth=None, min_samples_split=2,
                                      min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features='sqrt',
                                      bootstrap=True, oob_score=False, n_jobs=-1,
                                      warm_start=False, class_weight=None))

])
importance_model.fit(X, y)

Pipeline(memory=None,
     steps=[('bag_of_words', StemmedCountVectorizer(analyzer='word', binary=False, decode_error='strict',
            dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
            lowercase=True, max_df=1.0, max_features=None, min_df=1,
            ngram_range=(1, 2), norm='l2', preprocesso..._jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))])

In [138]:
# Importances of the fitted forest (last pipeline step) and the vocabulary
# terms of the vectorizer (first pipeline step), in matching order.
score = importance_model.steps[-1][-1].feature_importances_
vectorizer = importance_model.steps[0][-1]
# `get_feature_names` was replaced by `get_feature_names_out` in newer
# scikit-learn releases; use whichever this installation provides.
if hasattr(vectorizer, 'get_feature_names_out'):
    columns = vectorizer.get_feature_names_out()
else:
    columns = vectorizer.get_feature_names()

In [148]:
# Pair every importance with its feature name and list the features in
# reverse alphabetical order of the name.
feature_scores = list(zip(score, columns))
importance = pd.DataFrame(feature_scores, columns=['scores', 'f_name'])
importance = importance.sort_values(by='f_name', ascending=False)
importance.head(n=50)

Unnamed: 0,scores,f_name
77942,8.670534e-07,épices shallot
77941,2.498326e-06,épices salt
77940,7.809187e-07,épices raisin
77939,1.32432e-06,épices larg
77938,4.037788e-06,épice
77937,5.574334e-08,zucchini zucchini
77936,7.452059e-07,zucchini yukon
77935,1.330238e-05,zucchini yellow
77934,0.0,zucchini worcestershir
77933,5.834038e-07,zucchini wishbon


In [156]:
# The 1000th-largest distinct importance value.
distinct_scores = sorted(set(score))
distinct_scores[-1000]

0.00013847669723380642

### Count the number of words in each ingredient for the n-grams //Done
# Try to find outliers in the number of ingredients
# Try playing with the CountVectorizer/TF-IDF params to drop outlier ingredients
# Drop correlated features
# Param CV search
### Tokenizer + stemmer //Done
# Word2Vec: http://nadbordrozd.github.io/blog/2016/05/20/text-classification-with-word2vec/

In [9]:
# gensim's downloader API; `api.info()` returns a dict describing the
# downloadable datasets (it has 'corpora' and 'models' entries).
import gensim.downloader as api
v = api.info()
# Display the whole catalogue.
v

{'corpora': {'20-newsgroups': {'checksum': 'c92fd4f6640a86d5ba89eaad818a9891',
   'description': 'The notorious collection of approximately 20,000 newsgroup posts, partitioned (nearly) evenly across 20 different newsgroups.',
   'fields': {'data': '',
    'id': 'original id inferred from folder name',
    'set': "marker of original split (possible values 'train' and 'test')",
    'topic': 'name of topic (20 variant of possible values)'},
   'file_name': '20-newsgroups.gz',
   'file_size': 14483581,
   'license': 'not found',
   'num_records': 18846,
   'parts': 1,
   'read_more': ['http://qwone.com/~jason/20Newsgroups/'],
   'reader_code': 'https://github.com/RaRe-Technologies/gensim-data/releases/download/20-newsgroups/__init__.py',
   'record_format': 'dict'},
  '__testing_matrix-synopsis': {'checksum': '1767ac93a089b43899d54944b07d9dc5',
   'description': '[THIS IS ONLY FOR TESTING] Synopsis of the movie matrix.',
   'file_name': '__testing_matrix-synopsis.gz',
   'parts': 1,
   're

In [12]:
import json

# Pretty-print only the downloadable models: keys sorted, 4-space indent.
models_listing = json.dumps(v['models'], sort_keys=True, indent=4)
print(models_listing)

{
    "__testing_word2vec-matrix-synopsis": {
        "checksum": "534dcb8b56a360977a269b7bfc62d124",
        "description": "[THIS IS ONLY FOR TESTING] Word vecrors of the movie matrix.",
        "file_name": "__testing_word2vec-matrix-synopsis.gz",
        "parameters": {
            "dimensions": 50
        },
        "parts": 1,
        "preprocessing": "Converted to w2v using a preprocessed corpus. Converted to w2v format with `python3.5 -m gensim.models.word2vec -train <input_filename> -iter 50 -output <output_filename>`.",
        "read_more": []
    },
    "conceptnet-numberbatch-17-06-300": {
        "base_dataset": "ConceptNet, word2vec, GloVe, and OpenSubtitles 2016",
        "checksum": "fd642d457adcd0ea94da0cd21b150847",
        "description": "ConceptNet Numberbatch consists of state-of-the-art semantic vectors (also known as word embeddings) that can be used directly as a representation of word meanings or as a starting point for further machine learning. ConceptNet Numb

In [17]:
# api.load(..., return_path=True) returns only the path of the downloaded file
# (a string), so its result cannot be unpacked into (model, path). The path
# and the loaded vectors are therefore fetched with two separate calls.
# NOTE(review): `model` rebinds the name that an earlier cell uses for the
# scikit-learn Pipeline; re-run that cell before fitting the pipeline again.
model_path = api.load('conceptnet-numberbatch-17-06-300', return_path=True)
model = api.load('conceptnet-numberbatch-17-06-300')
model.most_similar("cat")



KeyboardInterrupt: 

In [16]:
# Display the path obtained from api.load(..., return_path=True).
model_path

NameError: name 'model_path' is not defined