In [49]:
import ast
import json
import os
import pickle

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split 
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB, ComplementNB
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils.class_weight import compute_sample_weight, compute_class_weight
from xgboost import XGBClassifier

## Load data, split into train/dev/test

In [2]:
recipe_df = pd.read_csv("/Users/Carol/Google Drive/nlp_data/recipe_data/20200524_categories/20200524_gf_k_v_food_terms_extracted.tsv", sep="\t")

In [3]:
# split into training, dev, and test
train, devtest = train_test_split(recipe_df, test_size=0.4, stratify=recipe_df[['vegan', "kosher", "gluten_free"]]) 
dev, test = train_test_split(devtest, test_size=0.5, stratify=devtest[['vegan', "kosher", "gluten_free"]]) 

In [200]:
recipe_df.shape

(11022, 8)

In [23]:
for name, df in zip(["train", "dev", "test"], [train, dev, test]):
    shape = df.shape
    print(name)
    gf = df.gluten_free.tolist()
    print("gluten free: {}".format(sum(gf)/shape[0]))
    print(sum(gf))
    k = df.kosher.tolist()
    print("kosher: {}".format(sum(k)/shape[0]))
    print(sum(k))
    v = df.vegan.tolist()
    print("vegan: {}".format(sum(v)/shape[0]))
    print(sum(v))



train
gluten free: 0.3952820202631181
2614
kosher: 0.5000756086496295
3307
vegan: 0.13927113261757146
921
dev
gluten free: 0.3951905626134301
871
kosher: 0.5
1102
vegan: 0.1397459165154265
308
test
gluten free: 0.39546485260770975
872
kosher: 0.4997732426303855
1102
vegan: 0.13877551020408163
306


In [7]:
# write out the training, dev, and test sets
outpath = "/Users/Carol/Google Drive/nlp_data/recipe_data/20200524_categories"
for name, df in zip(["train", "dev", "test"], [train, dev, test]):
    outfile = os.path.join(outpath, name + ".tsv")
    df.to_csv(outfile, sep="\t", index=False)

In [3]:
# reload the data later
inpath = "/Users/Carol/Google Drive/nlp_data/recipe_data/20200524_categories"
train = pd.read_csv(os.path.join(inpath, "train.tsv"), sep="\t")
dev = pd.read_csv(os.path.join(inpath, "dev.tsv"), sep="\t")
test = pd.read_csv(os.path.join(inpath, "test.tsv"), sep="\t")

In [4]:
train.head(2)

Unnamed: 0,title,ingredients,directions,gluten_free,kosher,vegan,food_terms,joined_ingredients
0,Bratwurst with Creamy Apple Compote,"['2 tablespoons unsalted butter', '1 tablespoo...","['Preheat broiler.', 'Heat butter and oil in a...",True,False,False,"['butter', 'oil', '-', 'onion', 'apples', 'bay...",2 tablespoons unsalted butter. 1 tablespoon ve...
1,The Ultimate Bolognese Sauce,"['2 bay leaves', '3 whole cloves', '2 teaspoon...",['Toast spices in a dry small skillet over med...,False,False,False,"['spices', 'flavor', 'spice', 'oil', '-', 'liv...",2 bay leaves. 3 whole cloves. 2 teaspoons fenn...


## Feature vectorization methods

In [5]:
def get_tokens(input_cell):
    return ast.literal_eval(input_cell)

def get_unique_tokens(input_cell):
    return list(set(ast.literal_eval(input_cell)))

In [6]:
count_unique = CountVectorizer(tokenizer=get_unique_tokens)
count = CountVectorizer(tokenizer=get_tokens)
tfidf = TfidfVectorizer(tokenizer=get_tokens)

## Calculate baseline performance

In [150]:
# baseline_labels
for category in ["gluten_free", "kosher", "vegan"]:
    print(category)
    baseline_labels = [0]*len(dev[category])
    labels = dev[category].tolist()
    print(classification_report(labels, baseline_labels))
    print("=====================")

gluten_free
              precision    recall  f1-score   support

       False       0.60      1.00      0.75      1333
        True       0.00      0.00      0.00       871

    accuracy                           0.60      2204
   macro avg       0.30      0.50      0.38      2204
weighted avg       0.37      0.60      0.46      2204

kosher
              precision    recall  f1-score   support

       False       0.50      1.00      0.67      1102
        True       0.00      0.00      0.00      1102

    accuracy                           0.50      2204
   macro avg       0.25      0.50      0.33      2204
weighted avg       0.25      0.50      0.33      2204

vegan
              precision    recall  f1-score   support

       False       0.86      1.00      0.92      1896
        True       0.00      0.00      0.00       308

    accuracy                           0.86      2204
   macro avg       0.43      0.50      0.46      2204
weighted avg       0.74      0.86      0.80      

  'precision', 'predicted', average, warn_for)


## Try various model types, vectorization methods, and hyperparameters

In [156]:
# logistic regression
def fit_lr(X_train, Y_train, X_dev, y_dev, custom_vec):
    lr_param_grid = [{"vect__ngram_range": [(1,1)], 
                   'clf__penalty':['l1', 'l2'], 
                   'clf__C':[1]}]

    pipeline = Pipeline([('vect', custom_vec), ('clf', LogisticRegression(class_weight="balanced")),])
    lr_gs = GridSearchCV(pipeline, lr_param_grid, scoring="f1_macro", cv=5, verbose=0, n_jobs=-1)
    lr_gs.fit(train.food_terms, y_train)
    print("Logistic regression: ")
    print(lr_gs.best_params_)
    clf = lr_gs.best_estimator_
    print(clf.score(X_dev, y_dev))
    
# fit_lr(X_train, y_train, X_dev, y_dev)

In [157]:
# multinomial naive bayes
def fit_mnb(X_train, y_train, X_dev, y_dev, custom_vec):
    
    nb_param_grid = [{"vect__ngram_range": [(1,1)], 
                   'clf__alpha': (1, 0.1, 0.01, 0.001, 0.0001, 0.00001)}]
    weights = compute_sample_weight("balanced", y_train)
    pipeline = Pipeline([('vect', custom_vec), ('clf', MultinomialNB())])
    nb_gs = GridSearchCV(pipeline, nb_param_grid, scoring="f1_macro", cv=5, verbose=0, n_jobs=-1)
    nb_gs.fit(X_train, y_train, clf__sample_weight=weights)
    print("Mutinomial Naive Bayes: ")
    print(nb_gs.best_params_)
    clf = nb_gs.best_estimator_
    print(clf.score(X_dev, y_dev))
    
# fit_mnb(X_train, y_train, X_dev, y_dev)

In [158]:
# complement naive bayes
def fit_cnb(X_train, y_train, X_dev, y_dev, custom_vec):
    nb_param_grid = [{"vect__ngram_range": [(1,1)], 
                   'clf__alpha': (1, 0.1, 0.01, 0.001, 0.0001)}]
    weights = compute_sample_weight("balanced", y_train)
    pipeline = Pipeline([('vect', custom_vec), ('clf', ComplementNB())])
    nb_gs = GridSearchCV(pipeline, nb_param_grid, scoring="f1_macro", cv=5, verbose=0, n_jobs=-1)
    nb_gs.fit(X_train, y_train, clf__sample_weight=weights)
    print("Complement Naive Bayes: ")
    print(nb_gs.best_params_)
    clf = nb_gs.best_estimator_
    print(clf.score(X_dev, y_dev))
  
# fit_cnb(X_train, y_train, X_dev, y_dev)

In [159]:
# random forest
def fit_rf(X_train, y_train, X_dev, y_dev, custom_vec):
    param_grid = {
    "vect__ngram_range": [(1,1),],
    'clf__bootstrap': [True],
    'clf__max_depth': [100],
    'clf__max_features': ['auto'],
    'clf__min_samples_leaf': [1],
    'clf__min_samples_split': [2],
    'clf__n_estimators': [100, 200]
}

    pipeline = Pipeline([('vect', custom_vec), ('clf', RandomForestClassifier())])
    rf_gs = GridSearchCV(pipeline, param_grid, scoring="f1_macro", cv=5, verbose=0, n_jobs=-1)
    rf_gs.fit(X_train, y_train)
    print("Random Forest: ")
    print(rf_gs.best_params_)
    clf = rf_gs.best_estimator_
    print(clf.score(X_dev, y_dev))
    
# fit_rf(X_train, y_train, X_dev, y_dev)

In [174]:
# multilayer perceptron
def fit_mlp(X_train, y_train, X_dev, y_dev, custom_vec):
    param_grid = [{"clf__hidden_layer_sizes": [(50), (100,), (200,), (50, 50)]}]
    pipeline = Pipeline([('vect', custom_vec), ('clf', MLPClassifier(early_stopping=True))])
    mlp_gs = GridSearchCV(pipeline, param_grid, scoring="f1_macro", cv=5, verbose=0, n_jobs=-1)
    mlp_gs.fit(X_train, y_train)
    print("Multilayer perceptron: ")
    print(mlp_gs.best_params_)
    clf = mlp_gs.best_estimator_
    print(clf.score(X_dev, y_dev))
  

X_train = train.food_terms
y_train = train["vegan"]
X_dev = dev.food_terms
y_dev = dev["vegan"]

# fit_mlp(X_train, y_train, X_dev, y_dev, count_unique)

In [7]:
# xgboost
def fit_xgb(X_train, y_train, X_dev, y_dev, custom_vec):
    param_grid = [{"clf__n_estimators": [100, 200]}]
    pipeline = Pipeline([('vect', custom_vec), ('clf', XGBClassifier())])
    gs = GridSearchCV(pipeline, param_grid, scoring="f1_macro", cv=5, verbose=0, n_jobs=-1)
    gs.fit(train.food_terms, y_train)
    print("XGBoost: ")
    print(gs.best_params_)
    clf = gs.best_estimator_
    print(clf.score(X_dev, y_dev))
    
#xgboost with weighted gradients   
def fit_xgb_weighted(X_train, y_train, X_dev, y_dev, custom_vec):
    param_grid = [{"clf__n_estimators": [100, 200]}]
    weights = compute_class_weight("balanced", [0, 1], y_train)
    pos_weight = weights[1]/weights[0]
    pipeline = Pipeline([('vect', custom_vec), ('clf', XGBClassifier(scale_pos_weight=pos_weight))])
    gs = GridSearchCV(pipeline, param_grid, scoring="f1_macro", cv=5, verbose=0, n_jobs=-1)
    gs.fit(train.food_terms, y_train)
    print("XGBoost: ")
    print(gs.best_params_)
    clf = gs.best_estimator_
    print(clf.score(X_dev, y_dev))   


In [175]:
for category in ["gluten_free", "kosher", "vegan"]:
    print(category)
    X_train = train.food_terms
    y_train = train[category]
    X_dev = dev.food_terms
    y_dev = dev[category]
    

    for name, vectorizer in zip(["counts", "unique", "tfidf"], [count, count_unique, tfidf]):
        print(name)
        fit_mlp(X_train, y_train, X_dev, y_dev, vectorizer)

gluten_free
counts
Multilayer perceptron: 
{'clf__hidden_layer_sizes': (100,)}
0.7590744101633394
unique
Multilayer perceptron: 
{'clf__hidden_layer_sizes': 50}
0.7581669691470054
tfidf
Multilayer perceptron: 
{'clf__hidden_layer_sizes': (200,)}
0.7558983666061706
kosher
counts
Multilayer perceptron: 
{'clf__hidden_layer_sizes': (100,)}
0.7368421052631579
unique
Multilayer perceptron: 
{'clf__hidden_layer_sizes': 50}
0.7313974591651543
tfidf
Multilayer perceptron: 
{'clf__hidden_layer_sizes': 50}
0.7291288566243194
vegan
counts
Multilayer perceptron: 
{'clf__hidden_layer_sizes': (200,)}
0.8906533575317604
unique
Multilayer perceptron: 
{'clf__hidden_layer_sizes': (100,)}
0.8933756805807622
tfidf
Multilayer perceptron: 
{'clf__hidden_layer_sizes': (200,)}
0.8901996370235935


In [161]:
for category in ["gluten_free", "kosher", "vegan"]:
    print(category)
    X_train = train.food_terms
    y_train = train[category]
    X_dev = dev.food_terms
    y_dev = dev[category]
    
    for model in fit_lr, fit_cnb, fit_rf:
        for name, vectorizer in zip(["counts", "unique", "tfidf"], [count, count_unique, tfidf]):
            print(name)
            model(X_train, y_train, X_dev, y_dev, vectorizer)

gluten_free
counts




Logistic regression: 
{'clf__C': 1, 'clf__penalty': 'l1', 'vect__ngram_range': (1, 1)}
0.7490925589836661
unique




Logistic regression: 
{'clf__C': 1, 'clf__penalty': 'l1', 'vect__ngram_range': (1, 1)}
0.7431941923774955
tfidf




Logistic regression: 
{'clf__C': 1, 'clf__penalty': 'l1', 'vect__ngram_range': (1, 1)}
0.7486388384754991
counts
Complement Naive Bayes: 
{'clf__alpha': 0.1, 'vect__ngram_range': (1, 1)}
0.7182395644283122
unique
Complement Naive Bayes: 
{'clf__alpha': 0.1, 'vect__ngram_range': (1, 1)}
0.7196007259528131
tfidf
Complement Naive Bayes: 
{'clf__alpha': 0.1, 'vect__ngram_range': (1, 1)}
0.73502722323049
counts
Random Forest: 
{'clf__bootstrap': True, 'clf__max_depth': 100, 'clf__max_features': 'auto', 'clf__min_samples_leaf': 1, 'clf__min_samples_split': 2, 'clf__n_estimators': 200, 'vect__ngram_range': (1, 1)}
0.7663339382940109
unique
Random Forest: 
{'clf__bootstrap': True, 'clf__max_depth': 100, 'clf__max_features': 'auto', 'clf__min_samples_leaf': 1, 'clf__min_samples_split': 2, 'clf__n_estimators': 200, 'vect__ngram_range': (1, 1)}
0.7617967332123412
tfidf
Random Forest: 
{'clf__bootstrap': True, 'clf__max_depth': 100, 'clf__max_features': 'auto', 'clf__min_samples_leaf': 1, 'clf__mi



Logistic regression: 
{'clf__C': 1, 'clf__penalty': 'l1', 'vect__ngram_range': (1, 1)}
0.721415607985481
unique




Logistic regression: 
{'clf__C': 1, 'clf__penalty': 'l1', 'vect__ngram_range': (1, 1)}
0.7186932849364791
tfidf




Logistic regression: 
{'clf__C': 1, 'clf__penalty': 'l1', 'vect__ngram_range': (1, 1)}
0.7295825771324864
counts
Complement Naive Bayes: 
{'clf__alpha': 0.1, 'vect__ngram_range': (1, 1)}
0.6923774954627949
unique
Complement Naive Bayes: 
{'clf__alpha': 1, 'vect__ngram_range': (1, 1)}
0.6869328493647913
tfidf
Complement Naive Bayes: 
{'clf__alpha': 1, 'vect__ngram_range': (1, 1)}
0.6982758620689655
counts
Random Forest: 
{'clf__bootstrap': True, 'clf__max_depth': 100, 'clf__max_features': 'auto', 'clf__min_samples_leaf': 1, 'clf__min_samples_split': 2, 'clf__n_estimators': 200, 'vect__ngram_range': (1, 1)}
0.7323049001814882
unique
Random Forest: 
{'clf__bootstrap': True, 'clf__max_depth': 100, 'clf__max_features': 'auto', 'clf__min_samples_leaf': 1, 'clf__min_samples_split': 2, 'clf__n_estimators': 200, 'vect__ngram_range': (1, 1)}
0.7377495462794919
tfidf
Random Forest: 
{'clf__bootstrap': True, 'clf__max_depth': 100, 'clf__max_features': 'auto', 'clf__min_samples_leaf': 1, 'clf__min_



Logistic regression: 
{'clf__C': 1, 'clf__penalty': 'l1', 'vect__ngram_range': (1, 1)}
0.8734119782214156
unique




Logistic regression: 
{'clf__C': 1, 'clf__penalty': 'l1', 'vect__ngram_range': (1, 1)}
0.8706896551724138
tfidf




Logistic regression: 
{'clf__C': 1, 'clf__penalty': 'l1', 'vect__ngram_range': (1, 1)}
0.8484573502722323
counts
Complement Naive Bayes: 
{'clf__alpha': 0.01, 'vect__ngram_range': (1, 1)}
0.8511796733212341
unique
Complement Naive Bayes: 
{'clf__alpha': 0.1, 'vect__ngram_range': (1, 1)}
0.8548094373865699
tfidf
Complement Naive Bayes: 
{'clf__alpha': 0.1, 'vect__ngram_range': (1, 1)}
0.8448275862068966
counts
Random Forest: 
{'clf__bootstrap': True, 'clf__max_depth': 100, 'clf__max_features': 'auto', 'clf__min_samples_leaf': 1, 'clf__min_samples_split': 2, 'clf__n_estimators': 200, 'vect__ngram_range': (1, 1)}
0.8802177858439202
unique
Random Forest: 
{'clf__bootstrap': True, 'clf__max_depth': 100, 'clf__max_features': 'auto', 'clf__min_samples_leaf': 1, 'clf__min_samples_split': 2, 'clf__n_estimators': 200, 'vect__ngram_range': (1, 1)}
0.8806715063520871
tfidf
Random Forest: 
{'clf__bootstrap': True, 'clf__max_depth': 100, 'clf__max_features': 'auto', 'clf__min_samples_leaf': 1, 'clf_

In [199]:
for category in ["gluten_free", "kosher", "vegan"]:
    print(category)
    X_train = train.food_terms
    y_train = train[category]
    X_dev = dev.food_terms
    y_dev = dev[category]
    

    for name, vectorizer in zip(["counts", "unique", "tfidf"], [count, count_unique, tfidf]):
        print(name)
        fit_xgb(X_train, y_train, X_dev, y_dev, vectorizer)

gluten_free
counts
XGBoost: 
{'clf__n_estimators': 100}
0.7676950998185118
unique
XGBoost: 
{'clf__n_estimators': 100}
0.7672413793103449
tfidf
XGBoost: 
{'clf__n_estimators': 100}
0.75
kosher
counts
XGBoost: 
{'clf__n_estimators': 100}
0.7386569872958257
unique
XGBoost: 
{'clf__n_estimators': 200}
0.73502722323049
tfidf
XGBoost: 
{'clf__n_estimators': 100}
0.721415607985481
vegan
counts
XGBoost: 
{'clf__n_estimators': 200}
0.9065335753176044
unique
XGBoost: 
{'clf__n_estimators': 200}
0.8997277676950998
tfidf
XGBoost: 
{'clf__n_estimators': 200}
0.8947368421052632


In [8]:
for category in ["gluten_free", "kosher", "vegan"]:
    print(category)
    X_train = train.food_terms
    y_train = train[category]
    X_dev = dev.food_terms
    y_dev = dev[category]

    for name, vectorizer in zip(["counts", "unique", "tfidf"], [count, count_unique, tfidf]):
        print(name)
        fit_xgb_weighted(X_train, y_train, X_dev, y_dev, vectorizer)

gluten_free
counts
XGBoost: 
{'clf__n_estimators': 200}
0.7622504537205081
unique
XGBoost: 
{'clf__n_estimators': 200}
0.7518148820326679
tfidf
XGBoost: 
{'clf__n_estimators': 100}
0.7463702359346642
kosher
counts
XGBoost: 
{'clf__n_estimators': 200}
0.7291288566243194
unique
XGBoost: 
{'clf__n_estimators': 100}
0.7400181488203267
tfidf
XGBoost: 
{'clf__n_estimators': 100}
0.7368421052631579
vegan
counts
XGBoost: 
{'clf__n_estimators': 200}
0.8901996370235935
unique
XGBoost: 
{'clf__n_estimators': 200}
0.8911070780399274
tfidf
XGBoost: 
{'clf__n_estimators': 100}
0.8747731397459165


## Compare Xgboost with and without class weights

In [14]:
X_train = train.food_terms
y_train = train["vegan"]
X_dev = dev.food_terms
y_dev = dev["vegan"]

# xgboost
clf =  XGBClassifier(n_estimators=200)
X_train = count.fit_transform(X_train)
clf.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=200, n_jobs=0, num_parallel_tree=1,
              objective='binary:logistic', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [15]:
X_dev = count.transform(X_dev)
ypred_dev = clf.predict(X_dev)

In [17]:
print(classification_report(y_dev, ypred_dev))

              precision    recall  f1-score   support

       False       0.94      0.96      0.95      1896
        True       0.69      0.60      0.64       308

    accuracy                           0.91      2204
   macro avg       0.81      0.78      0.79      2204
weighted avg       0.90      0.91      0.90      2204



In [20]:
weights = compute_class_weight("balanced", [0, 1], y_train)
pos_weight = weights[1]/weights[0]
weighted_clf =  XGBClassifier(n_estimators=200, scale_pos_weight=pos_weight)
weighted_clf.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=200, n_jobs=0, num_parallel_tree=1,
              objective='binary:logistic', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=6.180238870792617, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [21]:
ypred_dev_weighted = weighted_clf.predict(X_dev)

In [22]:
print(classification_report(y_dev, ypred_dev_weighted))

              precision    recall  f1-score   support

       False       0.98      0.89      0.93      1896
        True       0.57      0.87      0.69       308

    accuracy                           0.89      2204
   macro avg       0.77      0.88      0.81      2204
weighted avg       0.92      0.89      0.90      2204



In [41]:
X_train = train.food_terms
y_train = train["gluten_free"]
X_dev = dev.food_terms
y_dev = dev["gluten_free"]

# xgboost
clf =  XGBClassifier(n_estimators=200)
X_train = count.fit_transform(X_train)
clf.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=200, n_jobs=0, num_parallel_tree=1,
              objective='binary:logistic', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [42]:
X_dev = count.transform(X_dev)
ypred_dev = clf.predict(X_dev)

In [43]:
print(classification_report(y_dev, ypred_dev))

              precision    recall  f1-score   support

       False       0.83      0.77      0.80      1333
        True       0.68      0.76      0.72       871

    accuracy                           0.76      2204
   macro avg       0.76      0.76      0.76      2204
weighted avg       0.77      0.76      0.77      2204



In [38]:
weights = compute_class_weight("balanced", [0, 1], y_train)
pos_weight = weights[1]/weights[0]
weighted_clf =  XGBClassifier(n_estimators=200, scale_pos_weight=pos_weight)
weighted_clf.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=200, n_jobs=0, num_parallel_tree=1,
              objective='binary:logistic', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1.5298393267023718, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [39]:
ypred_dev_weighted = weighted_clf.predict(X_dev)

In [40]:
print(classification_report(y_dev, ypred_dev_weighted))

              precision    recall  f1-score   support

       False       0.88      0.71      0.78      1333
        True       0.65      0.85      0.74       871

    accuracy                           0.76      2204
   macro avg       0.77      0.78      0.76      2204
weighted avg       0.79      0.76      0.76      2204



## Final model training with combined training + dev sets

In [25]:
traindev = pd.concat([train, dev])
traindev.shape

(8817, 8)

In [68]:
X_train = traindev.food_terms
X_test = test.food_terms
X_train = count.fit_transform(X_train)
X_test = count.transform(X_test)

In [50]:
def train_save_test(X_train, X_test, y_train, y_test, outfile):
    weights = compute_class_weight("balanced", [0, 1], y_train)
    pos_weight = weights[1]/weights[0]
    clf =  XGBClassifier(n_estimators=200, scale_pos_weight=pos_weight)
    clf.fit(X_train, y_train)
    with open(outfile, "wb") as out:
        pickle.dump(clf, out)
    ypred_test = clf.predict(X_test)
    print(classification_report(ypred_test, y_test))
    

for category in ["gluten_free", "vegan", "kosher"]:
    outfile = "/Users/Carol/Google Drive/nlp_data/models/20200613_{}.p".format(category)
    y_train = traindev[category]
    y_test = test[category]
    print(category)
    train_save_test(X_train, X_test, y_train, y_test, outfile)

gluten_free
              precision    recall  f1-score   support

       False       0.70      0.88      0.78      1064
        True       0.85      0.65      0.73      1141

    accuracy                           0.76      2205
   macro avg       0.77      0.76      0.76      2205
weighted avg       0.78      0.76      0.75      2205

vegan
              precision    recall  f1-score   support

       False       0.91      0.98      0.95      1767
        True       0.89      0.62      0.73       438

    accuracy                           0.91      2205
   macro avg       0.90      0.80      0.84      2205
weighted avg       0.91      0.91      0.90      2205

kosher
              precision    recall  f1-score   support

       False       0.68      0.78      0.72       954
        True       0.81      0.71      0.76      1251

    accuracy                           0.74      2205
   macro avg       0.74      0.75      0.74      2205
weighted avg       0.75      0.74      0.74      

In [104]:
# save the vectorizer
outfile = "/Users/Carol/Google Drive/nlp_data/models/20200613_vectorizer.p".format(category)

with open(outfile, "wb") as out:
    pickle.dump(count, out)

## Error analysis

In [93]:
# False positives
for category in ["gluten_free", "vegan", "kosher"]:
    print(category.upper())
    saved_model = "/Users/Carol/Google Drive/nlp_data/models/20200613_{}.p".format(category)
    clf = pickle.load(open(saved_model, "rb"))
    ypred_test = clf.predict(X_test)
    y_test = test[category]
    FP = np.where((ypred_test == True) & (y_test==False))[0].tolist()
    fp = test.food_terms[FP]
    for item in fp:
        print(category)
        print(item)
        print("=========")





GLUTEN_FREE
gluten_free
['steak', '-', 'peppercorns', 'allspice', 'berries', 'peppercorn', 'steaks', 'steaks', 'butter', 'oil', 'fat', 'steaks', 'seasoned', 'salt', 'meat', 'steaks', 'fat', 'butter', 'white', 'scallion', 'scallion', 'wine', 'bits', 'wine', 'wine', 'glaze', 'butter', 'scallion green', 'salt', 'black pepper', 'red - cabbage', 'confit', 'venison', 'steak', 'sauce', 'steak']
gluten_free
['crabmeat', '-', 'cakes', 'oil', 'crab', 'cakes', 'bottoms', 'cakes', 'greens', 'vinaigrette', 'crab', 'cakes', 'lemon']
gluten_free
['375ºF.', 'oil', 'prosciutto', 'cheese', 'prosciutto', 'egg', 'yolk', 'cream', 'egg', 'thyme', 'salt', 'pepper', 'whites', 'prosciutto']
gluten_free
['wholemilk', 'Greek yogurt', 'tahini', 'powdered sugar', 'tahini', 'toasted', 'coconut', 'cacao nibs']
gluten_free
['rind', 'fat', 'ham', '-', 'fat', 'ham', 'onions', 'mangos', 'nectar', 'ham', 'juices', 'ham', 'mangos', 'onions', 'ham', 'mangos', 'onions', 'juices', 'ham', 'mangos', 'onions', 'juices']
gluten_

vegan
['sugar', 'sugar', 'zest', 'sugar syrup', 'syrup', 'pitcher', 'lemon juice', 'mint', 'lemonade', 'lemonade', 'lemonade', 'beer', 'beer']
vegan
['tomato juice', 'dill', 'pickle juice', 'fresh', 'lemon juice', 'horseradish', 'sauce', 'curry powder', 'Sriracha', 'kosher', 'salt', 'celery seeds', 'black pepper', 'pitcher', 'vodka', 'pitcher', 'ice', 'Bloody', 'cocktail', 'celery stalk', 'lemon']
vegan
['leek', 'roots', 'roots', '-', 'green', 'leek', 'leeks', 'vinegar', 'leeks', 'seasoned', 'broth', 'broth', 'oil', 'cilantro sprigs', 'garlic', 'cinnamon', 'cumin', 'curry powder', 'saffron', 'salt', 'potato', 'zucchini', 'zucchini', 'leeks', 'carrots', 'potatoes', 'turnips', 'onions', 'broth', 'zucchini', 'tomatoes', 'prunes', 'raisins', 'vegetables', 'leeks', 'roots', 'white bulb', 'vegetables', 'broth', 'cilantro', 'couscous', 'vegetables', 'broth', 'cilantro']
vegan
['sugar', 'sugar', 'rhubarb', 'raspberries', 'Sauce', 'sauce']
vegan
['-', 'zucchini', 'oil', 'salt', 'black pepper', 

['corn', 'broth', 'butter', '-', 'corn', 'broth', '-', 'potatoes', 'cream', 'chowder', 'salt', 'pepper', 'onions']
kosher
['Cream', 'oz', 'butter', 'herbs', 'parsley', 'chervil', 'tarragon', 'creamed', 'butter', 'parsley', 'chervil', 'white', 'mushrooms', 'butter', 'herbed', 'butter', 'shallots', 'pepper', 'salt', 'chicken', 'herbed', 'butter', 'meat', 'chicken', '350#&176;F', 'chicken', 'chicken', 'juices', 'juices', 'chervil', 'parsley', 'white', 'onions', 'onions', 'onion', 'onions', 'onion', 'onions', 'onions', 'olive', 'oil', 'salt', 'pepper', 'celery', 'pine', 'nuts', 'onions', 'diced celery', 'pine', 'nuts', 'ricotta', 'pork fat', 'sage', 'parmesan', 'parsley', 'salt', 'pepper', 'onions', 'onions', 'olive', 'oil', 'onions', 'juices', 'tasty', 'fondant', 'onions', 'chicken', 'juices', 'freshly', 'pepper', 'tomatoes', 'tomato', 'stalk', 'tomatoes', 'Salt', 'pepper', 'tomatoes', 'tomato', 'stuffing', 'tomatoes', 'insides', 'tomato', 'confit', 'diced', 'olive', 'oil', 'unpeeled', 'g

In [102]:
# False negatives
for category in ["gluten_free", "vegan", "kosher"]:
    print(category.upper())
    saved_model = "/Users/Carol/Google Drive/nlp_data/models/20200613_{}.p".format(category)
    clf = pickle.load(open(saved_model, "rb"))
    ypred_test = clf.predict(X_test)
    y_test = test[category]
    FN = np.where((ypred_test == False) & (y_test==True))[0].tolist()
    fn_title = test.title[FN]
    fn_ing = test.joined_ingredients[FN]
    for title, ing in zip(fn_title, fn_ing):
        print(category)
        print(title)
        print(ing)
        print("=========")



GLUTEN_FREE
gluten_free
Asparagus, Fingerling Potato, and Goat Cheese Pizza 
5 ounces fingerling potatoes. Cornmeal (for sprinkling). Pizza Dough. 2 tablespoons extra-virgin olive oil, divided. 1 garlic clove, pressed. 4 green onions, thinly sliced, divided. 1 1/3 cups grated whole-milk mozzarella cheese (about 6 ounces). 4 ounces soft fresh goat cheese, crumbled. 8 ounces asparagus, trimmed, each spear cut in half lengthwise, then crosswise into 2- to 3-inch pieces. 1/2 cup grated Parmesan cheese. 
gluten_free
Citrus Marinated Shrimp Cocktail 
1 cup orange juice. 1 cup fresh lemon juice. 3/4 cup ketchup. 1/3 cup vodka. 1/4 teaspoon hot pepper sauce. 1/4 cup olive oil. 1 1/2 pounds cooked peeled large shrimp. 1 small red onion, thinly sliced (about 1 3/4 cups). 1 cup finely chopped fresh cilantro. 
gluten_free
Sauteed Shrimp on Parmesan Grits with Redeye Gravy 
6 tablespoons (3/4 stick) butter. 4 ounces ham, diced (about 1 cup). 1 1/2 cups sliced stemmed shiitake mushrooms. 1 cup finel

kosher
Spinach-Pineapple-Mint Juice 
1/2 lemon. 3 cups spinach. 2 Granny Smith apples, sliced. 2 cups cubed pineapple. 8 mint leaves. 
kosher
Braised Chicken with Dates and Moroccan Spices 
3 1/2 pounds chicken breast halves, thighs, and drumsticks. 1 tablespoon all purpose flour. 1 tablespoon extra-virgin olive oil. 2 pounds shallots (about 11 large), peeled. 3 cinnamon sticks. 1 1/2 teaspoons ground ginger. 1 teaspoon ground cumin. 1/2 teaspoon turmeric. 1/8 teaspoon cayenne pepper. 3 cups low-salt chicken broth. 5 tablespoons fresh lemon juice, divided. 12 dates, pitted, halved. 1/4 cup almonds, toasted, coarsely chopped. 1/4 cup chopped fresh cilantro. 
kosher
Southwestern Slaw 
2 cups fine-shredded green cabbage. 2 tsp lime juice. 2 tsp honey. 2 tbsp minced red onion. 2 tsp minced jalapeños. 2 tsp chopped cilantro. Salt, to taste. 
kosher
Mahogany Beef Stew with Red Wine and Hoisin Sauce 
4 tablespoons olive oil. 3 1/2 pounds boneless beef chuck roast, trimmed, cut into 2 1/2-inch