In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import re
%matplotlib inline
# Apply seaborn's default plot styling to subsequent figures.
sns.set()
# Seed NumPy's global random number generator so that code drawing from it is repeatable.
np.random.seed(123)

In [None]:
def load_n_prep(name):
    """Read ``<name>.json`` and return a recipe frame indexed by ``id``.

    Two columns are derived from the original ``ingredients`` lists:

    * ``ingredients_count`` - how many ingredients the row has.
    * ``ingredients_word_count`` - a list holding the number of
      whitespace-separated words in each ingredient.

    The ``ingredients`` column itself is then replaced by a single string
    with all ingredients joined by spaces.
    """
    recipes = pd.read_json('{}.json'.format(name)).set_index('id')

    # Keep a handle on the original lists; every derived column is built from them.
    ingredient_lists = recipes['ingredients']

    recipes['ingredients_count'] = ingredient_lists.map(len)
    recipes['ingredients_word_count'] = ingredient_lists.map(
        lambda items: [len(item.split()) for item in items]
    )
    # Joining on spaces erases the boundaries between multi-word ingredients.
    recipes['ingredients'] = ingredient_lists.map(' '.join)
    return recipes

In [None]:
# Load train.json (path resolved relative to the working directory) and preview the first rows.
train = load_n_prep('train')
train.head()

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline, FeatureUnion, make_pipeline
from sklearn.preprocessing import Imputer, StandardScaler, RobustScaler
import xgboost as xgb
from sklearn.svm import SVC, LinearSVC
from sklearn.model_selection import RepeatedStratifiedKFold, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.multiclass import OneVsRestClassifier
import datetime
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
import os
import nltk
import gensim

In [None]:
import multiprocessing
def model_run(model, X_train, X_test, y_train, y_test):
    """Fit ``model`` on the training split and score it on both splits.

    The model passed in is fitted in place. Returns a
    ``(train_score, test_score)`` tuple of accuracies computed with
    ``metrics.accuracy_score``.
    """
    model.fit(X_train, y_train)

    # Predict both splits before scoring (train first, then test).
    predicted = {
        'train': model.predict(X_train),
        'test': model.predict(X_test),
    }

    return (
        metrics.accuracy_score(y_train, predicted['train']),
        metrics.accuracy_score(y_test, predicted['test']),
    )

def test_on_train(model, X, y, n_splits=5, random_state=123):
    """Cross-validate ``model`` with stratified K-fold, fitting the folds in parallel.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit`` and ``predict``. It is handed to ``model_run``
        once per fold through ``multiprocessing``, so it must be picklable.
    X, y : pandas objects
        Features and labels; both are sliced positionally with ``.iloc``.
    n_splits : int, default 5
        Number of stratified folds.
    random_state : int, default 123
        Seed used to shuffle the samples before they are assigned to folds.

    Returns
    -------
    pandas.Series
        Mean ``train_score`` and ``test_score`` over all folds.

    Side effects
    ------------
    Prints the elapsed wall-clock time of the whole cross-validation.
    """
    start = datetime.datetime.now()

    # shuffle=True is what makes random_state meaningful: without it the seed
    # is ignored, and recent scikit-learn releases raise a ValueError when a
    # seed is supplied for an unshuffled splitter.
    splitter = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    # One argument tuple per fold, in the positional order model_run expects.
    fold_args = [
        (model, X.iloc[train_index], X.iloc[test_index], y.iloc[train_index], y.iloc[test_index])
        for train_index, test_index in splitter.split(X, y)
    ]

    with multiprocessing.Pool() as pool:
        fold_scores = pool.starmap(model_run, fold_args)

    print(datetime.datetime.now() - start)
    return pd.DataFrame(fold_scores,
                        columns=['train_score', 'test_score']).mean(axis=0)

In [None]:
from nltk.stem.snowball import SnowballStemmer
from nltk import tokenize
from nltk.corpus import stopwords
import nltk
# Make sure the NLTK stop-word corpus is available; stopwords.words("english") in the
# analyzer defined below reads it.
nltk.download('stopwords')
    
class StemmedCountVectorizer(TfidfVectorizer):
    """Vectorizer whose analyzer strips punctuation, drops NLTK stop words and stems terms.

    Despite its name, this class subclasses ``TfidfVectorizer``, not
    ``CountVectorizer``; only the analyzer is customised here.
    """

    def build_analyzer(self):
        """Return a callable that maps one raw document to a list of stemmed terms.

        The callable removes every character that is neither a word character
        nor whitespace, runs the parent class's analyzer on the result, discards
        terms found in NLTK's English stop-word list, and stems what remains
        with the English Snowball stemmer.
        """
        base_analyzer = super().build_analyzer()
        stemmer = SnowballStemmer("english")
        stops = set(stopwords.words("english"))
        # Raw string: "\w" and "\s" are regex classes, not string escapes. The
        # pattern is compiled once per analyzer instead of once per document.
        strip_punctuation = re.compile(r"[^\w\s]").sub

        def analyze(doc):
            terms = base_analyzer(strip_punctuation("", doc))
            # NOTE(review): when ngram_range goes beyond (1, 1) the parent analyzer
            # presumably also yields multi-word n-grams, which are stop-word-checked
            # and stemmed here as single strings - confirm that this is intended.
            return [stemmer.stem(term) for term in terms if term not in stops]

        return analyze
    
# Model inputs: the space-joined ingredient text is the only feature and the
# cuisine label is the target. Dropping recipes with more than 40 ingredients
# was tried and is currently disabled:
# out_index = train[train.ingredients_count > 40].index
y = train['cuisine'] #.drop(out_index)
X = train['ingredients'] #.drop(out_index)

model = Pipeline(steps=[
    # --- text features: stemmed uni- and bi-grams ---
    ('bag_of_words', StemmedCountVectorizer(stop_words='english', ngram_range=(1, 2))),
    # Vectorizer alternatives that were tried:
#     ('bag_of_words', CountVectorizer(tokenizer=text_process(), stop_words='english')),
#     ('tfidf', TfidfVectorizer(stop_words='english')),

    # --- classifier: L1-penalised logistic regression using the saga solver ---
    ('logreg', LogisticRegression(solver='saga', penalty='l1', C=10, max_iter=10000)),
    # Classifier alternatives that were tried:
#     ('logreg', OneVsRestClassifier(LogisticRegression(penalty='l2',C=10, max_iter=1000)))
#     ('forest', RandomForestClassifier(n_estimators=10, criterion='gini', max_depth=None, min_samples_split=2,
#                                       min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features='auto',
#                                       max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None,
#                                       bootstrap=True, oob_score=False, n_jobs=1, random_state=None, verbose=0,
#                                       warm_start=False, class_weight=None))
#     ('svm', OneVsRestClassifier(SVC(C=100, coef0=1)))
#     ('boost', GradientBoostingClassifier()) #best scores
#     ('xgb', xgb.XGBClassifier(max_depth=6, learning_rate=0.01, n_estimators=100,
#                               objective='multi:softmax', booster='gbtree', n_jobs=-1))
])

In [None]:
# Reseed NumPy's global RNG immediately before the split and the fit.
np.random.seed(123)
# Start timestamp; together with the one printed at the end of the cell it brackets the run time.
print(datetime.datetime.now())
# test_on_train(model, X, y)
# x_train, x_test, y_train, y_test = train_test_split(X, pd.get_dummies(y), stratify= pd.get_dummies(y))
# Single hold-out split, stratified on the cuisine label (train_test_split's default test size applies).
x_train, x_test, y_train, y_test = train_test_split(X, y, stratify= y)
model.fit(x_train, y_train)
train_pred = model.predict(x_train) 
test_pred = model.predict(x_test)

# Accuracy on the training split first, then on the held-out split.
print(metrics.accuracy_score(y_train, train_pred))
print(metrics.accuracy_score(y_test, test_pred))


# End timestamp.
print(datetime.datetime.now())

### Count the number of words in each ingredient for the n-grams // Done
# Try to find outliers in the number of ingredients
# Try playing with the CountVectorizer/TF-IDF params to drop outlier ingredients
# Drop correlated features
# Param CV search
### Tokenizer + stemmer // Done
# Word2Vec: http://nadbordrozd.github.io/blog/2016/05/20/text-classification-with-word2vec/