# Imports

In [36]:
# sql database connection
from config import USERNAME, PASSWORD, HOST_PORT, DB_NAME
from sqlalchemy import create_engine

# data cleaning and wrangling tools
import pandas as pd
import ast
import re
import string

# visualization tools
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pyLDAvis
import pyLDAvis.gensim
import pyLDAvis.sklearn
from pprint import pprint

# nlp tools
import spacy
from nltk import word_tokenize, pos_tag
from nltk.stem import WordNetLemmatizer, PorterStemmer
import gensim
from gensim.corpora import Dictionary
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
from stopwords import stop_words

# Configure logging for gensim; level=ERROR means only errors are shown (switch to logging.INFO to follow training progress)
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

# pickle
import pickle

# silence all warnings (e.g. deprecation notices) to keep the notebook output readable
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Build the SQLAlchemy engine for the PostgreSQL database (psycopg2 driver, server on localhost).
# NOTE(review): the credentials from config are interpolated verbatim -- this assumes they
# contain no URL-reserved characters such as "@" or "/"; confirm.
engine = create_engine(
    f"postgresql+psycopg2://{USERNAME}:{PASSWORD}@localhost:{HOST_PORT}/{DB_NAME}"
)

# Natural Language Processing

## Import Ingredients Data

In [3]:
# define query to pull in ingredients data for topic modeling:
# one row per recipe/ingredient link, carrying the recipe id and title plus the
# ingredient id and name; links without an ingredient_id are excluded
query = """SELECT recipes_ingredients.recipe_id, recipes.title, recipes_ingredients.ingredient_id, ingredients.ingredient 
                FROM food.recipes_ingredients
                LEFT OUTER JOIN food.recipes ON recipes_ingredients.recipe_id = recipes.recipe_id
                LEFT OUTER JOIN food.ingredients ON recipes_ingredients.ingredient_id = ingredients.ingredient_id
                WHERE recipes_ingredients.ingredient_id IS NOT NULL;"""

In [4]:
# run the query against the database and load the result set into a DataFrame
data = pd.read_sql_query(sql=query, con=engine)

In [5]:
# cast every ingredient_id to its string form so the ids can be joined as text later
data["ingredient_id"] = data["ingredient_id"].map(str)

In [6]:
# replace spaces inside each ingredient name with underscores
# so a multi-word ingredient stays a single whitespace-delimited token
data["ingredient"] = data["ingredient"].str.replace(" ", "_")

In [7]:
# collapse to one row per (recipe_id, title): ingredient ids and ingredient names
# are each concatenated into a single space-separated string
data_grouped = (
    data
    .groupby(["recipe_id", "title"])
    .agg({"ingredient_id": " ".join, "ingredient": " ".join})
    .reset_index()
)

In [8]:
# preview the first five recipe-level rows
data_grouped.head(n=5)

Unnamed: 0,recipe_id,title,ingredient_id,ingredient
0,1,Baked French Toast Casserole with Maple Syrup,12109 12111 11890 12088 11972 11843 11974 1205...,french_bread eggs halfandhalf milk granulated_...
1,2,Not Yo' Mama's Banana Pudding,10652 2782 2525 8739 12088 2781 40,sweetened_condensed_milk whipped_topping_thawe...
2,3,Pumpkin Gooey Butter Cakes,2532 41 242 40 10044 41 11920 242 11395 11974 ...,cake_mix egg butter cream_cheese pumpkin egg v...
3,4,Corn Casserole,5 235 6711 12016 242 11717,corn creamstyle_corn corn_muffin_mix sour_crea...
4,5,Roman-Style Chicken,3930 3931 12102 12095 30 12066 11940 29 7 1130...,skinless_chicken_breast_halves skinless_chicke...


In [9]:
# (rows, columns) of the recipe-level frame
data_grouped.shape

(6650, 4)

## Data Preprocessing for Topic Modeling

In [17]:
# pull the per-recipe ingredient strings out of the frame as a plain Python list
ingredients = data_grouped["ingredient"].tolist()

In [18]:
# tokenize each recipe's ingredient string with NLTK -> one list of tokens per recipe
tokens = list(map(word_tokenize, ingredients))

In [37]:
# Create Dictionary mapping each token to an integer id
id2word = Dictionary(tokens)

# Drop tokens that occur in fewer than 5 documents or in more than 80% of documents
id2word.filter_extremes(no_below=5, no_above=0.8)

# Bag-of-words corpus: one list of (token_id, count) pairs per recipe.
# The loop variable itself is passed to doc2bow; the previous version referenced an
# undefined name (`text`), which fails on a fresh kernel or silently reuses stale state.
corpus = [id2word.doc2bow(doc_tokens) for doc_tokens in tokens]

# View
print(corpus[:1])

[[(0, 1), (1, 2), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 2), (11, 1), (12, 1)]]


In [38]:
# Show the first document's bag-of-words with token strings in place of integer ids
[
    [(id2word[token_id], count) for token_id, count in doc_bow]
    for doc_bow in corpus[:1]
]

[[('butter', 1),
  ('cinnamon', 2),
  ('eggs', 1),
  ('french_bread', 1),
  ('granulated_sugar', 1),
  ('halfandhalf', 1),
  ('light_brown_sugar', 1),
  ('light_corn_syrup', 1),
  ('maple_syrup', 1),
  ('milk', 1),
  ('nutmeg', 2),
  ('pecans', 1),
  ('vanilla_extract', 1)]]

## Building the Topic Model with Gensim

In [53]:
# Fit a 10-topic LDA model on the bag-of-words corpus
lda_model = gensim.models.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=10,
    passes=10,
    chunksize=100,
    update_every=3,
    alpha='auto',
    random_state=42,           # fixed seed
    per_word_topics=True,
    minimum_probability=0.0,
)

In [54]:
# apply the fitted model to the whole corpus
doc_lda = lda_model[corpus]

# pretty-print the weighted keywords of the fitted topics
pprint(lda_model.print_topics())

[(0,
  '0.073*"mint_leaves" + 0.051*"carrots" + 0.034*"golden_raisins" + '
  '0.031*"sprigs_thyme" + 0.029*"lowsodium_chicken_broth" + '
  '0.028*"baby_spinach" + 0.025*"spanish_onions" + 0.025*"red_onions" + '
  '0.025*"celery" + 0.024*"ears_corn"'),
 (1,
  '0.066*"lime_juice" + 0.055*"fish_sauce" + 0.054*"scallion" + 0.022*"mint" + '
  '0.020*"curry_powder" + 0.018*"cilantro_leaves" + 0.018*"shrimp" + '
  '0.017*"thai_basil_leaves" + 0.015*"unsweetened_coconut_milk" + '
  '0.015*"lime_wedges"'),
 (2,
  '0.119*"butter" + 0.091*"sugar" + 0.089*"allpurpose_flour" + 0.077*"egg" + '
  '0.059*"milk" + 0.052*"eggs" + 0.034*"cinnamon" + 0.028*"baking_powder" + '
  '0.026*"nutmeg" + 0.024*"water"'),
 (3,
  '0.084*"butter" + 0.062*"onion" + 0.051*"lemon_juice" + 0.051*"clove_garlic" '
  '+ 0.047*"heavy_cream" + 0.046*"parsley" + 0.032*"carrot" + 0.027*"chives" + '
  '0.025*"tomato_paste" + 0.023*"coconut_milk"'),
 (4,
  '0.101*"ginger" + 0.090*"garlic" + 0.071*"soy_sauce" + 0.059*"sugar" + '
 

In [55]:
# Value returned by gensim's log_perplexity on the training corpus.
# NOTE(review): gensim documents this as a per-word likelihood bound (perplexity itself
# being 2 ** -bound), so for the number printed here higher (closer to zero) should be
# better, not lower -- confirm against the gensim docs.
print('\nPerplexity: ', lda_model.log_perplexity(corpus))

# Topic coherence using the 'c_v' measure - higher the better
coherence_model_lda = CoherenceModel(
    coherence='c_v',
    dictionary=id2word,
    texts=tokens,
    model=lda_model,
)
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -6.095902900141955

Coherence Score:  0.4457366347717423


In [56]:
# Build the pyLDAvis view of the fitted topics and display it inline in the notebook
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(topic_model=lda_model, corpus=corpus, dictionary=id2word)
vis

In [64]:
# # save visualization as HTML file
# pyLDAvis.save_html(vis, 'lda_gensim_vis.html')

In [60]:
# Score every recipe with the fitted LDA model: split the ingredient string on whitespace,
# convert it to bag-of-words with the trained dictionary, and store the model's output.
# NOTE(review): each stored value is a tuple whose first element is the list of
# (topic_id, probability) pairs (the model was built with per_word_topics=True) --
# downstream code presumably indexes [0]; verify.
# NOTE(review): tokenization here is str.split(), while the training tokens came from
# word_tokenize -- assumed equivalent for these underscore-joined names; confirm.
data_grouped["topic_distribution"] = data_grouped["ingredient"].apply(
    lambda ingredient_text: lda_model[id2word.doc2bow(ingredient_text.split())]
)

In [61]:
# preview the first five rows, now including the topic_distribution column
data_grouped.head(n=5)

Unnamed: 0,recipe_id,title,ingredient_id,ingredient,topic_distribution
0,1,Baked French Toast Casserole with Maple Syrup,12109 12111 11890 12088 11972 11843 11974 1205...,french_bread eggs halfandhalf milk granulated_...,"([(0, 0.01706039), (1, 0.025358025), (2, 0.532..."
1,2,Not Yo' Mama's Banana Pudding,10652 2782 2525 8739 12088 2781 40,sweetened_condensed_milk whipped_topping_thawe...,"([(0, 0.031982645), (1, 0.04749314), (2, 0.383..."
2,3,Pumpkin Gooey Butter Cakes,2532 41 242 40 10044 41 11920 242 11395 11974 ...,cake_mix egg butter cream_cheese pumpkin egg v...,"([(0, 0.02165293), (1, 0.032152902), (2, 0.513..."
3,4,Corn Casserole,5 235 6711 12016 242 11717,corn creamstyle_corn corn_muffin_mix sour_crea...,"([(0, 0.031985626), (1, 0.047493123), (2, 0.11..."
4,5,Roman-Style Chicken,3930 3931 12102 12095 30 12066 11940 29 7 1130...,skinless_chicken_breast_halves skinless_chicke...,"([(0, 0.018641932), (1, 0.028349612), (2, 0.03..."


In [62]:
# (rows, columns) after adding the topic_distribution column
data_grouped.shape

(6650, 5)

In [63]:
# # pickle dataframe with topic distribution of each recipe
# data_grouped.to_pickle("./data/recipes_gensim_topics_dist.pkl")

In [48]:
# # pickle dictionary that was trained on text
# pickle.dump(id2word, open('data/gensim_lda_dictionary.pkl', 'wb'))

In [51]:
# # pickle corpus
# pickle.dump(corpus, open('data/gensim_lda_corpus.pkl', 'wb'))

In [49]:
# define function to pickle model after it is fit
def pickle_model(model_name, model):
    """Serialize a fitted model to ``./foodnetwork_<model_name>.pkl``.

    Parameters
    ----------
    model_name : str
        Label interpolated into the output file name.
    model : object
        Any picklable object (e.g. a fitted topic model).

    Returns
    -------
    None
        The function only writes the pickle file, relative to the current
        working directory.
    """
    model_pickle_path = f'./foodnetwork_{model_name}.pkl'
    # the context manager closes the file handle even if pickle.dump raises
    with open(model_pickle_path, 'wb') as model_pickle:
        pickle.dump(model, model_pickle)

In [59]:
# # pickle models from gensim
# pickle_model("lda_gensim2", lda_model)
# lda_model.save("./gensim_lda_model")