# Imports

In [206]:
# sql database connection
from config import USERNAME, PASSWORD, HOST_PORT, DB_NAME
from sqlalchemy import create_engine

# data cleaning and wrangling tools
import pandas as pd
import ast
import re
import string

# visualization tools
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pyLDAvis
import pyLDAvis.gensim
import pyLDAvis.sklearn
from pprint import pprint

# nlp tools
import spacy
from nltk import word_tokenize, pos_tag
from nltk.stem import WordNetLemmatizer, PorterStemmer
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD, NMF
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

# Enable logging for gensim to keep track of the training process
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

# pickle
import pickle

# disable warnings that bring up a deprecation warning
import warnings
warnings.filterwarnings("ignore")

In [2]:
# create sqlalchemy engine for connecting to postgresql db
engine = create_engine(f"postgresql+psycopg2://{USERNAME}:{PASSWORD}@localhost:{HOST_PORT}/{DB_NAME}")

# Natural Language Processing

## Prepare Stop Words

In [115]:
# prepare stopwords
from nltk.corpus import stopwords
stop_words = stopwords.words("english")
user_defined_stops = ["12", "14", "15", "16", "from", "subject", "edu", "re", "edu", "use", "tablespoon", "tbsp", "tbs", "tbl", "teaspoon", "tsp", "ounce", "ounces", "fluid ounce", "fluid ounces", "oz", "fluid oz", "fl oz", "gill", "cup", "c", "C", "pint", "pt", "fluid pint", "fl pt", "quart", "qt", "fluid quart", "fl qt", "gallon", "liter", "litre", "L", "milliliter", "millilitre", "mL", "ml", "deciliter", "dl", "dL", "decilitre", "gal", "gram", "gramme", "g", "pound", "lb", "milligram", "mg", "decigram", "dg", "kilogram", "kg", "kilogramme", "millimeter", "millimetre", "mm", "decimeter", "decimetre", "dm", "meter", "metre", "m", "kilometer", "kilometre", "kilo", "km", "centimeter", "centimetre", "cm", "inch", "in", "cubic meter", "cm3", "m3", "mm3", "km3", "celsius", "Celsius", "Fahrenheit", "F", "pinch", "handful", "loaf", "dash", "Dash", "stick", "recipe", "recipe follows", "follows", "fluid", "large", "little", "medium", "a", "an", "is", "of", "glug", "good" "accompaniment", "as an accompaniment", "dusting", "a good glug of", "for", "at", "room", "temperature", "room temperature", "loosely", "packed", "loosely packed", "package", "bags", "bag", "thinly", "thin", "sliced", "slice", "ground", "container", "ontainer", "cored", "stoned", "instant", "thickly", "thick", "plu", "inche", "box", "inches", "good", "freshly", "ground", "desired", "long", "lengthwise", "halve", "halved", "love", "kosher", "extra", "virgin", "cracked", "salt", "black pepper", "oil", "olive oil", "water", "sprig", "chopped", "l", "minced", "smashed", "small", "turn", "pan"]
stop_words.extend(user_defined_stops + list(string.punctuation))

In [116]:
len(stop_words)

373

## Import Ingredients Data

In [117]:
# read in ingredients data
ingredients = pd.read_pickle("./data/foodnetwork_ingredients.pkl")

In [118]:
ingredients.shape

(91839, 6)

In [119]:
ingredients.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 91839 entries, 0 to 91838
Data columns (total 6 columns):
recipe_id          91839 non-null int64
level_1            91839 non-null int64
ingredients        91839 non-null object
ingredient_qty     91839 non-null object
ingredient_unit    58325 non-null object
ingredient         78828 non-null object
dtypes: int64(2), object(4)
memory usage: 4.2+ MB


In [120]:
ingredients.head()

Unnamed: 0,recipe_id,level_1,ingredients,ingredient_qty,ingredient_unit,ingredient
0,1,0,1 loaf French bread (13 to 16 ounces),1,loaf,French bread
1,1,1,8 large eggs,8,,large eggs
2,1,2,2 cups half-and-half,2,cup,halfandhalf
3,1,3,1 cup milk,1,cup,milk
4,1,4,2 tablespoons granulated sugar,2,tablespoon,granulated sugar


In [121]:
ingredients.isnull().sum()

recipe_id              0
level_1                0
ingredients            0
ingredient_qty         0
ingredient_unit    33514
ingredient         13011
dtype: int64

In [122]:
# function to parse out ingredients
def ingredients_parser(ingredient):
    # define list of units
    units = ["tablespoon", "tbsp", "tbs", "tbl", "teaspoon", "tsp", "ounce", "ounces", "fluid ounce", "fluid ounces", "oz", 
                "fluid oz", "fl oz", "gill", "cup", "c", "C", "pint", "pt", "fluid pint", "fl pt", "quart", "qt", "fluid quart", "fl qt", 
                "gallon", "liter", "litre", "L", "milliliter", "millilitre", "mL", "ml", "deciliter", "dl", "dL", "decilitre", "gal", "gram", 
                "gramme", "g", "pound", "lb", "milligram", "mg", "decigram", "dg", "kilogram", "kg", "kilogramme", "millimeter", 
                "millimetre", "mm", "decimeter", "decimetre", "dm", "meter", "metre", "m", "kilometer", "kilometre", "kilo", "km", 
                "centimeter", "centimetre", "cm", "inch", "in", "cubic meter", "cm3", "m3", "mm3", "km3", "celsius", "Celsius", "Fahrenheit", 
                "F", "pinch", "handful", "loaf", "dash", "Dash", "stick"]
    # join each unit with an or operator
    anyUnitRE = '|'.join(units)
    # remove all text between parentheses ingredients
    parsed = re.sub("([(][^(]+[$)])", " ", ingredient)
    # remove punctuation
    parsed = re.sub('[%s]' % re.escape(string.punctuation), " ", parsed)
    # define pattern to match with input
    m = re.match(
                        r'(?P<amount>\d{1,3})\s*'
                        r'(?P<unit>(' + anyUnitRE + r')?)\s*'
                        r'(?P<preposition>(of)?)\s*'
                        r'(?P<name>.*$)', parsed)
    if m:
        return m.groupdict()["name"]
    else:
        return None

In [123]:
# run above parsing function through each ingredient in the ingredient_comment column
ingredients["ingredient_parsed"] = ingredients.ingredients.apply(lambda x: ingredients_parser(x))

In [124]:
ingredients.ingredient_parsed

0                                           French bread  
1                                               large eggs
2                                          s half and half
3                                                     milk
4                                       s granulated sugar
5                                          vanilla extract
6                               4 teaspoon ground cinnamon
7                                 4 teaspoon ground nutmeg
8                                                     None
9                                                     None
10                                                    None
11                                        2 pound   butter
12                                packed light brown sugar
13                                          chopped pecans
14                                      s light corn syrup
15                              2 teaspoon ground cinnamon
16                                2 teaspoon ground nutm

In [125]:
ingredients.isnull().sum()

recipe_id                0
level_1                  0
ingredients              0
ingredient_qty           0
ingredient_unit      33514
ingredient           13011
ingredient_parsed    13018
dtype: int64

In [126]:
# for ingredients in the ingredient column that is null, fill with info from ingredient_comment
ingredients.loc[ingredients.ingredient.isnull() == True, "ingredient_parsed"] = ingredients[ingredients.ingredient.isnull() == True]["ingredients"]

In [127]:
ingredients.isnull().sum()

recipe_id                0
level_1                  0
ingredients              0
ingredient_qty           0
ingredient_unit      33514
ingredient           13011
ingredient_parsed        7
dtype: int64

In [128]:
ingredients[ingredients.ingredient_parsed.isnull() == True]

Unnamed: 0,recipe_id,level_1,ingredients,ingredient_qty,ingredient_unit,ingredient,ingredient_parsed
29696,2183,3,"*1 egg yolk, at room temperature",1,,egg yolk at room temperature,
32081,2365,3,"*1 egg yolk, at room temperature",1,,egg yolk at room temperature,
68271,5019,18,.38 ounces yeast,38,ounce,yeast,
69646,5148,1,.75 oz. Godiva Cappuccino Liqueur,75,oz,Godiva Cappuccino Liqueur,
73658,5391,16,*2 egg yolks,2,,egg yolks,
73665,5391,23,*2 egg yolks,2,,egg yolks,
77231,5620,10,.25 grams saffron threads,25,gram,saffron threads,


In [129]:
ingredients.head()

Unnamed: 0,recipe_id,level_1,ingredients,ingredient_qty,ingredient_unit,ingredient,ingredient_parsed
0,1,0,1 loaf French bread (13 to 16 ounces),1,loaf,French bread,French bread
1,1,1,8 large eggs,8,,large eggs,large eggs
2,1,2,2 cups half-and-half,2,cup,halfandhalf,s half and half
3,1,3,1 cup milk,1,cup,milk,milk
4,1,4,2 tablespoons granulated sugar,2,tablespoon,granulated sugar,s granulated sugar


In [130]:
ingredients.tail()

Unnamed: 0,recipe_id,level_1,ingredients,ingredient_qty,ingredient_unit,ingredient,ingredient_parsed
91834,6652,10,2 1/2 ounces (70 grams) sugar,2 1/2,ounce,12 ounce sugar,1 2 ounces sugar
91835,6652,11,A large pinch salt,,pinch,,A large pinch salt
91836,6652,12,4 1/2 fluid ounces (130 milliliters) buttermilk,4 1/2,,12 fluid ounce buttermilk,1 2 fluid ounces buttermilk
91837,6652,13,"A little sugar, for dusting",,,,"A little sugar, for dusting"
91838,6652,14,"Vanilla ice cream, as an accompaniment",,,,"Vanilla ice cream, as an accompaniment"


In [131]:
# create function to further clean each ingredient string of the trailing s
def clean_ingredients(ingredient):
    if ingredient != None:
        lower_case = ingredient.lower()
        cleaned = lower_case.replace("s ", " ").strip(" ")
        parsed = re.sub('[^a-z\s]', " ", cleaned.lower()) 
        word_tokens = word_tokenize(parsed)
        filtered = [word.lower() for word in word_tokens if not word in stop_words]
        return " ".join(filtered)

In [132]:
# further clean the ingredients
ingredients.loc[:, "ingredient_parsed"] = ingredients.ingredient_parsed.apply(lambda x: clean_ingredients(x))

In [133]:
ingredients.head()

Unnamed: 0,recipe_id,level_1,ingredients,ingredient_qty,ingredient_unit,ingredient,ingredient_parsed
0,1,0,1 loaf French bread (13 to 16 ounces),1,loaf,French bread,french bread
1,1,1,8 large eggs,8,,large eggs,eggs
2,1,2,2 cups half-and-half,2,cup,halfandhalf,half half
3,1,3,1 cup milk,1,cup,milk,milk
4,1,4,2 tablespoons granulated sugar,2,tablespoon,granulated sugar,granulated sugar


In [134]:
ingredients.tail()

Unnamed: 0,recipe_id,level_1,ingredients,ingredient_qty,ingredient_unit,ingredient,ingredient_parsed
91834,6652,10,2 1/2 ounces (70 grams) sugar,2 1/2,ounce,12 ounce sugar,sugar
91835,6652,11,A large pinch salt,,pinch,,
91836,6652,12,4 1/2 fluid ounces (130 milliliters) buttermilk,4 1/2,,12 fluid ounce buttermilk,buttermilk
91837,6652,13,"A little sugar, for dusting",,,,sugar
91838,6652,14,"Vanilla ice cream, as an accompaniment",,,,vanilla ice cream accompaniment


In [135]:
ingredients.ingredient_parsed

0                                             french bread
1                                                     eggs
2                                                half half
3                                                     milk
4                                         granulated sugar
5                                          vanilla extract
6                                                 cinnamon
7                                                   nutmeg
8                                                         
9                                          praline topping
10                                             maple syrup
11                                                  butter
12                                       light brown sugar
13                                                  pecans
14                                        light corn syrup
15                                                cinnamon
16                                                  nutm

In [136]:
ingredients.isnull().sum()

recipe_id                0
level_1                  0
ingredients              0
ingredient_qty           0
ingredient_unit      33514
ingredient           13011
ingredient_parsed        7
dtype: int64

In [137]:
ingredients[ingredients.ingredient_parsed.isnull() == True]

Unnamed: 0,recipe_id,level_1,ingredients,ingredient_qty,ingredient_unit,ingredient,ingredient_parsed
29696,2183,3,"*1 egg yolk, at room temperature",1,,egg yolk at room temperature,
32081,2365,3,"*1 egg yolk, at room temperature",1,,egg yolk at room temperature,
68271,5019,18,.38 ounces yeast,38,ounce,yeast,
69646,5148,1,.75 oz. Godiva Cappuccino Liqueur,75,oz,Godiva Cappuccino Liqueur,
73658,5391,16,*2 egg yolks,2,,egg yolks,
73665,5391,23,*2 egg yolks,2,,egg yolks,
77231,5620,10,.25 grams saffron threads,25,gram,saffron threads,


In [138]:
# fill in null values with information from ingredient column
ingredients.loc[ingredients.ingredient_parsed.isnull() == True, "ingredient_parsed"] = ingredients.loc[ingredients.ingredient_parsed.isnull() == True].ingredient.apply(lambda x: x.lower())

In [139]:
# manually clean out at room temperature from listing
ingredients.ingredient_parsed[29696] = ingredients.ingredient_parsed[29696].replace("at room temperature", "").strip(" ")

In [140]:
ingredients.ingredient_parsed[32081]

'egg yolk at room temperature'

In [141]:
# manually clean out at room temperature from listing
ingredients.ingredient_parsed[32081] = ingredients.ingredient_parsed[32081].replace("at room temperature", "").strip(" ")

In [142]:
ingredients.ingredient_parsed[32081]

'egg yolk'

In [143]:
ingredients.ingredient_parsed[29696]

'egg yolk'

In [144]:
ingredients.ingredient_parsed[69646]

'godiva cappuccino liqueur'

In [145]:
ingredients.isnull().sum()

recipe_id                0
level_1                  0
ingredients              0
ingredient_qty           0
ingredient_unit      33514
ingredient           13011
ingredient_parsed        0
dtype: int64

In [146]:
# drop unnecessary columns
ingredients_dropped = ingredients.drop(["level_1", "ingredient_qty", "ingredient_unit", "ingredient"], axis=1)

In [147]:
ingredients_dropped.shape

(91839, 3)

In [148]:
ingredients_dropped = ingredients_dropped[(ingredients_dropped.ingredient_parsed != '') | (ingredients_dropped.ingredient_parsed != ' ')]

In [149]:
ingredients_dropped.shape

(91839, 3)

In [150]:
# pickle non grouped version of dataframe
ingredients_dropped.to_pickle("./data/foodnetwork_ingred_ungrp.pkl")

In [151]:
# group to recipe level of all ingredients
ingredients_grouped = ingredients.groupby("recipe_id").agg({"ingredients": ", ".join, "ingredient_parsed":", ".join}).reset_index()

In [152]:
ingredients_grouped.head()

Unnamed: 0,recipe_id,ingredients,ingredient_parsed
0,1,"1 loaf French bread (13 to 16 ounces), 8 large...","french bread, eggs, half half, milk, granulate..."
1,2,"1 (14-ounce) can sweetened condensed milk, 1 (...","sweetened condensed milk, frozen whipped toppi..."
2,3,"1 (18 1/4-ounce) package yellow cake mix, 1 eg...","yellow cake mix, egg, butter melted, cream che..."
3,4,"1 (15 1/4-ounce) can whole kernel corn, draine...","whole kernel corn drained, cream style corn, c..."
4,5,"4 skinless chicken breast halves, with ribs, 2...","skinles chicken breast ribs, skinles chicken t..."


In [153]:
ingredients_grouped.shape

(6652, 3)

In [154]:
# # pickle dataset
# ingredients_grouped.to_pickle("./data/foodnetwork_ingred_grp2.pkl")

## Data Preprocessing for Topic Modeling

In [155]:
# read in pickled grouped ingredients data
ingredients_grouped = pd.read_pickle("./data/foodnetwork_ingred_grp2.pkl")

In [156]:
ingredients_grouped.head()

Unnamed: 0,recipe_id,ingredients,ingredient_parsed
0,1,"1 loaf French bread (13 to 16 ounces), 8 large...","french bread, eggs, half half, milk, granulate..."
1,2,"1 (14-ounce) can sweetened condensed milk, 1 (...","sweetened condensed milk, frozen whipped toppi..."
2,3,"1 (18 1/4-ounce) package yellow cake mix, 1 eg...","yellow cake mix, egg, butter melted, cream che..."
3,4,"1 (15 1/4-ounce) can whole kernel corn, draine...","whole kernel corn drained, cream style corn, c..."
4,5,"4 skinless chicken breast halves, with ribs, 2...","skinles chicken breast ribs, skinles chicken t..."


In [157]:
ingredients_grouped.ingredient_parsed[0]

'french bread, eggs, half half, milk, granulated sugar, vanilla extract, cinnamon, nutmeg, , praline topping, maple syrup, butter, light brown sugar, pecans, light corn syrup, cinnamon, nutmeg'

In [158]:
# convert string to list for each value in the ingredient_parsed column
ingredients_grouped.loc[:, "ingredient_parsed"] = ingredients_grouped.loc[:, "ingredient_parsed"].apply(lambda x: x.split(", "))

In [159]:
ingredients_grouped[(ingredients_grouped.ingredient_parsed == '') | (ingredients_dropped.ingredient_parsed == ' ')]

Unnamed: 0,recipe_id,ingredients,ingredient_parsed


In [160]:
# pull out the collection of parsed ingredients into a list - this will be the corpus for topic modeling
preprocessed_ingredients = ingredients_grouped["ingredient_parsed"].tolist()

In [161]:
preprocessed_ingredients[0]

['french bread',
 'eggs',
 'half half',
 'milk',
 'granulated sugar',
 'vanilla extract',
 'cinnamon',
 'nutmeg',
 '',
 'praline topping',
 'maple syrup',
 'butter',
 'light brown sugar',
 'pecans',
 'light corn syrup',
 'cinnamon',
 'nutmeg']

In [162]:
# Create Dictionary
id2word = corpora.Dictionary(preprocessed_ingredients)

# Create Corpus
texts = preprocessed_ingredients

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

# View
print(corpus[:1])

[[(0, 1), (1, 1), (2, 2), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 2), (12, 1), (13, 1), (14, 1)]]


In [163]:
# Human readable format of corpus (term-frequency)
[[(id2word[id], freq) for id, freq in cp] for cp in corpus[:1]]

[[('', 1),
  ('butter', 1),
  ('cinnamon', 2),
  ('eggs', 1),
  ('french bread', 1),
  ('granulated sugar', 1),
  ('half half', 1),
  ('light brown sugar', 1),
  ('light corn syrup', 1),
  ('maple syrup', 1),
  ('milk', 1),
  ('nutmeg', 2),
  ('pecans', 1),
  ('praline topping', 1),
  ('vanilla extract', 1)]]

## Building the Topic Model with Gensim

In [164]:
# Build LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=10, 
                                           random_state=100,
                                           update_every=3,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [165]:
# Print the Keyword in the 10 topics
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.043*"garlic" + 0.039*"black pepper" + 0.038*"" + 0.030*"canola" + '
  '0.029*"soy sauce" + 0.024*"cumin" + 0.018*"honey" + 0.014*"ginger" + '
  '0.013*"olive" + 0.012*"vegetable"'),
 (1,
  '0.040*"olive" + 0.015*"balsamic vinegar" + 0.014*"" + 0.014*"pepper" + '
  '0.012*"red wine vinegar" + 0.009*"black pepper" + 0.005*"dill" + '
  '0.005*"garlic finely" + 0.005*"saffron" + 0.004*"confectioner sugar"'),
 (2,
  '0.036*"pepper" + 0.031*"olive" + 0.025*"lime juiced" + 0.017*"fresh '
  'cilantro leaves" + 0.013*"" + 0.013*"black pepper" + 0.012*"cumin" + '
  '0.011*"cilantro leaves" + 0.010*"garlic" + 0.009*"parsley leaves"'),
 (3,
  '0.027*"garlic" + 0.025*"fish sauce" + 0.017*"vegetable" + 0.017*"fresh '
  'cilantro" + 0.015*"rice wine vinegar" + 0.014*"brown sugar" + 0.012*"sugar" '
  '+ 0.011*"sesame" + 0.010*"rice vinegar" + 0.009*""'),
 (4,
  '0.109*"" + 0.047*"black pepper" + 0.036*"sugar" + 0.030*"olive" + '
  '0.026*"eggs" + 0.026*"purpose flour" + 0.024*"unsalted butte

In [166]:
# Compute Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score - higher the better
coherence_model_lda = CoherenceModel(model=lda_model, texts=preprocessed_ingredients, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -8.58447046691459

Coherence Score:  0.3886314859726213


In [167]:
# Visualize the topics
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
vis

## NLP: Topic Modeling with Sklearn 1

In [168]:
def display_topics(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print ("Topic %d:" % (topic_idx))
        print (" ".join([feature_names[i] for i in topic.argsort()[:-no_top_words - 1:-1]]))

In [169]:
# read in dataframe
ingredients_ungrp = pd.read_pickle("./data/foodnetwork_ingred_ungrp.pkl")

In [170]:
ingredients_ungrp.head()

Unnamed: 0,recipe_id,ingredients,ingredient_parsed
0,1,1 loaf French bread (13 to 16 ounces),french bread
1,1,8 large eggs,eggs
2,1,2 cups half-and-half,half half
3,1,1 cup milk,milk
4,1,2 tablespoons granulated sugar,granulated sugar


In [171]:
# add underscore between each word to keep them together for later in the process
ingredients_ungrp.loc[:, "ingredient_parsed"] = ingredients_ungrp.loc[:, "ingredient_parsed"].apply(lambda x: x.replace(" ", "_"))

In [172]:
ingredients_ungrp.head()

Unnamed: 0,recipe_id,ingredients,ingredient_parsed
0,1,1 loaf French bread (13 to 16 ounces),french_bread
1,1,8 large eggs,eggs
2,1,2 cups half-and-half,half_half
3,1,1 cup milk,milk
4,1,2 tablespoons granulated sugar,granulated_sugar


In [173]:
# group ingredients to recipe level
ingredients_grouped2 = ingredients_ungrp.groupby("recipe_id").agg({"ingredients": ", ".join, "ingredient_parsed":" ".join})

In [174]:
# define documents
documents = ingredients_grouped2.ingredient_parsed

In [175]:
documents[:5]

recipe_id
1    french_bread eggs half_half milk granulated_su...
2    sweetened_condensed_milk frozen_whipped_toppin...
3    yellow_cake_mix egg butter_melted cream_cheese...
4    whole_kernel_corn_drained cream_style_corn cor...
5    skinles_chicken_breast_ribs skinles_chicken_th...
Name: ingredient_parsed, dtype: object

In [176]:
# define number of max features
no_features = 1000

In [177]:
# define number of topics
no_topics = 10

In [178]:
# NMF is able to use tf-idf
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words='english')
tfidf = tfidf_vectorizer.fit_transform(documents)
tfidf_feature_names = tfidf_vectorizer.get_feature_names()

In [179]:
# Run NMF
nmf = NMF(n_components=no_topics, random_state=1, alpha=.1, l1_ratio=.5, init='nndsvd').fit(tfidf)

In [180]:
# LDA can only use raw term counts because it is a probabilistic graphical model
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words='english')
tf = tf_vectorizer.fit_transform(documents)
tf_feature_names = tf_vectorizer.get_feature_names()

In [181]:
tf.shape

(6652, 1000)

In [182]:
# Run LDA
lda = LatentDirichletAllocation(n_components=no_topics, max_iter=5, learning_method='online', learning_offset=50.,random_state=0).fit(tf)

In [183]:
# define number of top words to display
no_top_words = 10

In [184]:
# display topics for NMF
print(display_topics(nmf, tfidf_feature_names, no_top_words))

Topic 0:
black_pepper mayonnaise dijon_mustard lemon_juiced honey red_wine_vinegar garlic_powder olive_drizzling arlic_clove cayenne_pepper
Topic 1:
sugar vanilla_extract baking_powder fish_sauce egg_yolks milk rice_vinegar purpose_flour egg_beaten cold
Topic 2:
soy_sauce sesame garlic rice_wine_vinegar honey ginger peanut rice_vinegar cornstarch hoisin_sauce
Topic 3:
pepper red_onion red_wine_vinegar dijon_mustard lemon_juiced garlic_finely garlic_coarsely mayonnaise chicken_stock olive_drizzling
Topic 4:
olive garlic arlic_clove onion grated_parmesan red_wine_vinegar dry_white_wine shallot balsamic_vinegar fresh_lemon_juice
Topic 5:
butter heavy_cream milk flour chicken_stock garlic onion purpose_flour white_wine eggs
Topic 6:
vegetable cumin garlic turmeric coriander onion chili_powder garam_masala fresh_cilantro cayenne_pepper
Topic 7:
canola garlic black_pepper_taste ginger chicken_stock lime_juiced honey low_sodium_soy_sauce turmeric ancho_chile_powder
Topic 8:
unsalted_butter pu

In [185]:
# display topics from LDA
print(display_topics(lda, tf_feature_names, no_top_words))

Topic 0:
egg_beaten vegetable_frying cornstarch sugar warm purpose_flour cold vegetable hot fresh_orange_juice
Topic 1:
black_pepper unsalted_butter heavy_cream vegetable coconut_milk sugar eggs purpose_flour panko onion_finely
Topic 2:
black_pepper garlic honey vegetable mayonnaise garlic_powder canola cayenne_pepper lemon_juice ketchup
Topic 3:
lemon_juiced crushed_red_pepper_flakes chicken_stock pepper coarse coarse_black_pepper parsley_leaves garlic_finely olive couscous
Topic 4:
soy_sauce garlic sugar fish_sauce sesame ginger rice_wine_vinegar rice_vinegar peanut vegetable
Topic 5:
fresh_basil_leaves butter black_pepper hot_sauce brown_sugar grated_parmesan half_half cayenne_pepper eggs butter_melted
Topic 6:
cumin olive garlic black_pepper canola pepper fresh_lime_juice lime_juiced black_pepper_taste fresh_cilantro_leaves
Topic 7:
sugar purpose_flour eggs cinnamon milk baking_powder unsalted_butter vanilla_extract granulated_sugar whole_milk
Topic 8:
turmeric cumin_seeds garam_ma

In [186]:
# visualize LDA model with pyLDAvis
pyLDAvis.sklearn.prepare(lda, tf, tf_vectorizer)

## NLP: Topic Modeling with Sklearn 2

In [187]:
ingredients_grouped2.head()

Unnamed: 0_level_0,ingredients,ingredient_parsed
recipe_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,"1 loaf French bread (13 to 16 ounces), 8 large...",french_bread eggs half_half milk granulated_su...
2,"1 (14-ounce) can sweetened condensed milk, 1 (...",sweetened_condensed_milk frozen_whipped_toppin...
3,"1 (18 1/4-ounce) package yellow cake mix, 1 eg...",yellow_cake_mix egg butter_melted cream_cheese...
4,"1 (15 1/4-ounce) can whole kernel corn, draine...",whole_kernel_corn_drained cream_style_corn cor...
5,"4 skinless chicken breast halves, with ribs, 2...",skinles_chicken_breast_ribs skinles_chicken_th...


In [188]:
# define documents
docs = ingredients_grouped2.ingredient_parsed

In [189]:
docs[:5]

recipe_id
1    french_bread eggs half_half milk granulated_su...
2    sweetened_condensed_milk frozen_whipped_toppin...
3    yellow_cake_mix egg butter_melted cream_cheese...
4    whole_kernel_corn_drained cream_style_corn cor...
5    skinles_chicken_breast_ribs skinles_chicken_th...
Name: ingredient_parsed, dtype: object

In [190]:
# define number of max features
no_features = 1000

In [191]:
# define number of topics
no_topics = 20

In [192]:
# NMF is able to use tf-idf
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words='english')
tfidf = tfidf_vectorizer.fit_transform(docs)
tfidf_feature_names = tfidf_vectorizer.get_feature_names()

In [193]:
# Run NMF
nmf = NMF(n_components=no_topics, random_state=1, alpha=.1, l1_ratio=.5, init='nndsvd').fit(tfidf)

In [194]:
# LDA can only use raw term counts for LDA because it is a probabilistic graphical model
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words='english')
tf = tf_vectorizer.fit_transform(docs)
tf_feature_names = tf_vectorizer.get_feature_names()

In [195]:
tf.shape

(6652, 1000)

In [196]:
# Run LDA
lda = LatentDirichletAllocation(n_components=no_topics, max_iter=5, learning_method='online', learning_offset=50.,random_state=0).fit(tf)

In [197]:
# define number of top words to display
no_top_words = 10

In [198]:
# display topics for NMF
print(display_topics(nmf, tfidf_feature_names, no_top_words))

Topic 0:
black_pepper garlic_powder grated_parmesan olive_drizzling lemon_juiced fresh_flat_leaf_parsley fresh_basil_leaves fresh_thyme_leaves fresh_parsley_leaves arlic_clove
Topic 1:
sugar vanilla_extract egg_yolks rice_vinegar milk rice_wine_vinegar cold butter_melted pure_vanilla_extract ice
Topic 2:
soy_sauce sesame rice_wine_vinegar rice_vinegar cornstarch ginger mirin toasted_sesame hoisin_sauce peanut
Topic 3:
pepper garlic_coarsely chicken_stock garlic_finely olive_drizzling lemon_juiced spanish_onion_finely cilantro fresh_parsley tomato_sauce
Topic 4:
olive balsamic_vinegar arlic_clove dijon_mustard fresh_lemon_juice grated_parmesan shallot dry_white_wine red_pepper_flakes dried_oregano
Topic 5:
butter flour chicken_stock milk white_wine pepper_taste sour_cream egg_beaten red_pepper_flakes dry_white_wine
Topic 6:
cumin coriander turmeric chili_powder cayenne_pepper garam_masala paprika cayenne lime_juiced fresh_cilantro_leaves
Topic 7:
canola black_pepper_taste ginger low_sod

In [199]:
# display topics from LDA
print(display_topics(lda, tf_feature_names, no_top_words))

Topic 0:
pepper scallion fresh_cilantro_leaves red_onion clove_garlic fresh_cilantro fresh_lime_juice vegetable olive sugar
Topic 1:
unsalted_butter eggs heavy_cream purpose_flour black_pepper sugar whole_milk flour fresh_parsley baking_powder
Topic 2:
balsamic_vinegar parsley_leaves grated_parmigiano_reggiano bread_crumbs juice_limes marinara_sauce edium_onion_diced spanish_onion_cut_dice stalk_celery_diced eggs
Topic 3:
fresh_italian_parsley_leaves cherry_tomatoes fresh_cilantro_leave_garnish japanese_eggplant olive_tablespoons pitted_kalamata_olives shredded_cheddar finely_fresh_oregano_leaves drained_capers sour_cream_serving
Topic 4:
onion arlic_clove garlic orn_tortillas bay_leaf sea_black_pepper stalk_celery arrot tomato whole_black_peppercorns
Topic 5:
sugar milk eggs vanilla_extract butter purpose_flour cinnamon granulated_sugar egg_yolks baking_powder
Topic 6:
saffron shredded_mozzarella rice russian_dressing vegetable_cooking_spray ripe_tomatoe_diced fusilli_pasta garlic_pas

In [200]:
# visualize LDA model with pyLDAvis
pyLDAvis.sklearn.prepare(lda, tf, tf_vectorizer)

In [201]:
# # pickle dataframe for sklearn processing
# ingredients_grouped2.to_pickle("./data/foodnetwork_ingred_grp_sklearn.pkl")

In [207]:
# define function to pick model after it is fit
def pickle_model(model_name, model):
    model_pickle_path = f'./foodnetwork_{model_name}.pkl'
    model_pickle = open(model_pickle_path, 'wb')
    pickle.dump(model, model_pickle)
    model_pickle.close()

In [209]:
# # pickle models from gensim and sklearn
# pickle_model("tf_sklearn", tf)
# pickle_model("tfidf_sklearn", tfidf)
# pickle_model("lda_gensim", lda)