# Latent Dirichlet Allocation (LDA) content-based recommender system

LDA is a form of unsupervised learning that views documents as bags of words. In this notebook, the ingredients of each recipe are treated as a document. The number of topics is set to 30 and 100, respectively. A given query returns the top 5 recommendations ranked by similarity score.

In [1]:
#Import Libraries
import ast

import pandas as pd
import numpy as np
import gensim
from gensim.models.ldamodel import LdaModel
from gensim import models, similarities
from gensim.corpora import Dictionary

In [2]:
# Load the raw recipes table from CSV (the path is relative to the working directory)
recipes = pd.read_csv('data/RAW_recipes.csv')

In [3]:
# confirm there are 231,637 recipes and 12 columns
recipes.shape

(231637, 12)

In [4]:
# The CSV stores each ingredient list as its string repr (e.g. "['salt', 'pepper']").
# Parse it back into a real Python list. ast.literal_eval only accepts literals,
# so a malformed or malicious cell cannot execute arbitrary code the way eval() would.
recipes['ingredients'] = recipes['ingredients'].apply(ast.literal_eval)

In [5]:
# take a look at the first few recipes and understand the data structure
recipes.head(3)

Unnamed: 0,name,id,minutes,contributor_id,submitted,tags,nutrition,n_steps,steps,description,ingredients,n_ingredients
0,arriba baked winter squash mexican style,137739,55,47892,2005-09-16,"['60-minutes-or-less', 'time-to-make', 'course...","[51.5, 0.0, 13.0, 0.0, 2.0, 0.0, 4.0]",11,"['make a choice and proceed with recipe', 'dep...",autumn is my favorite time of year to cook! th...,"[winter squash, mexican seasoning, mixed spice...",7
1,a bit different breakfast pizza,31490,30,26278,2002-06-17,"['30-minutes-or-less', 'time-to-make', 'course...","[173.4, 18.0, 0.0, 17.0, 22.0, 35.0, 1.0]",9,"['preheat oven to 425 degrees f', 'press dough...",this recipe calls for the crust to be prebaked...,"[prepared pizza crust, sausage patty, eggs, mi...",6
2,all in the kitchen chili,112140,130,196586,2005-02-25,"['time-to-make', 'course', 'preparation', 'mai...","[269.8, 22.0, 32.0, 48.0, 39.0, 27.0, 5.0]",6,"['brown ground beef in large pot', 'add choppe...",this modified version of 'mom's' chili was a h...,"[ground beef, yellow onions, diced tomatoes, t...",13


In [6]:
# Build the gensim dictionary (token <-> integer id mapping) from every recipe's
# ingredient list. Each ingredient string is treated as one token, and these
# tokens are what the recommendations are based on.
dictionary = Dictionary(recipes['ingredients'].tolist())

In [7]:
# check how many words in the dictionary
len(dictionary)

14942

In [8]:
# Check that each entry of the ingredients column is now a list of ingredient strings
# (an ingredient can be a multi-word phrase, not just a single word).
recipes['ingredients'][3]

['spreadable cheese with garlic and herbs',
 'new potatoes',
 'shallots',
 'parsley',
 'tarragon',
 'olive oil',
 'red wine vinegar',
 'salt',
 'pepper',
 'red bell pepper',
 'yellow bell pepper']

In [9]:
# Build the corpus: one bag-of-words vector per recipe, obtained by running
# each recipe's ingredient list through the dictionary.
corpus = list(map(dictionary.doc2bow, recipes['ingredients']))

In [10]:
# The corpus holds one bag-of-words entry per recipe, so its length equals the number of recipes
len(corpus)

231637

In [11]:
# Fit an LDA model with 30 topics over the full ingredient corpus.
# random_state is fixed so that repeated runs give the same topics.
ldamodel = LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=30,
    passes=10,
    random_state=42,
)

In [12]:
# Build a dense similarity index over the LDA topic distribution of every recipe.
# num_features is given explicitly as the number of topics: otherwise gensim has to
# scan the whole transformed corpus once just to infer the vector width (an extra
# round of LDA inference over every recipe), and the inferred width can come out
# smaller than the topic count if the highest-numbered topics never show up in the
# sparse per-recipe output.
index = similarities.MatrixSimilarity(ldamodel[corpus], num_features=ldamodel.num_topics)

In [13]:
# User query: a string of candidate ingredients separated by commas
# Alternative example query:
# query = 'chocolate cookies, baked potatoes, eggs, cheese, chicken'
query = 'winter squash, mexican seasoning, mixed spice, honey, butter, olive oil, salt'

In [14]:
# Convert the user's comma-separated ingredients into a bag of words.
# Split on bare commas and strip each piece so that inputs such as "salt,butter"
# or "salt ,  butter" are tokenized the same way as "salt, butter"; empty pieces
# (e.g. from a trailing comma) are dropped.
query_bow = dictionary.doc2bow(
    [piece.strip() for piece in query.lower().split(',') if piece.strip()]
)

In [15]:
# Infer the query's topic distribution with the 30-topic LDA model. The result is a
# sparse list of (topic_id, probability) pairs rather than a dense 30-element vector.
query_vector = ldamodel[query_bow]

In [16]:
query_vector

[(13, 0.48639968), (24, 0.13133611), (27, 0.26975682)]

In [17]:
# Compute the cosine similarity between the query's topic vector and each of the 231637 indexed recipes
sims = index[query_vector]

In [18]:
# check similarity array, values are in [0,1]
sims

array([1., 0., 0., ..., 0., 0., 0.], dtype=float32)

In [19]:
# Pair every similarity score with its recipe position, then order the pairs
# from most to least similar so the most relevant recipes come first.
sim_rank = list(enumerate(sims))
sim_rank.sort(key=lambda pair: pair[1], reverse=True)

In [20]:
sim_rank[0]

(0, 1.0)

In [21]:
# Keep the five highest-ranked (recipe position, similarity score) pairs
top_5 = sim_rank[:5]

In [22]:
# Show the top 5 as (recipe position, similarity score) pairs; the best match scores 1.0
top_5

[(0, 1.0),
 (91318, 0.9725976),
 (150490, 0.97194046),
 (67211, 0.9719329),
 (143749, 0.9719295)]

In [23]:
# Collect the recipe positions of the top matches. Unpacking the pairs directly
# (instead of indexing with range(5)) also works when fewer than 5 matches exist.
top_select_number = [recipe_pos for recipe_pos, _score in top_5]

In [24]:
# show the top 5 recipes with the highest similarity scores from the 231637 recipes
recipes.iloc[top_select_number]

Unnamed: 0,name,id,minutes,contributor_id,submitted,tags,nutrition,n_steps,steps,description,ingredients,n_ingredients
0,arriba baked winter squash mexican style,137739,55,47892,2005-09-16,"['60-minutes-or-less', 'time-to-make', 'course...","[51.5, 0.0, 13.0, 0.0, 2.0, 0.0, 4.0]",11,"['make a choice and proceed with recipe', 'dep...",autumn is my favorite time of year to cook! th...,"[winter squash, mexican seasoning, mixed spice...",7
91318,german erdbeer bowle strawberry wine punch,34268,180,27783,2002-07-15,"['weeknight', 'time-to-make', 'course', 'main-...","[462.4, 0.0, 158.0, 0.0, 1.0, 0.0, 16.0]",9,"['2-3 hours before your guests arrive , wash t...",germans often serve a delicious bowle at summe...,"[fresh strawberries, powdered sugar, lemon bal...",5
150490,outback croutons,157524,60,296349,2006-02-24,"['60-minutes-or-less', 'time-to-make', 'course...","[437.0, 58.0, 7.0, 22.0, 7.0, 118.0, 7.0]",18,"['take any amount of bread', 'get an electric ...",hi there folks. i used to be a hot-side and co...,"[bread, butter, garlic]",3
67211,crunchy garlic croutons,385865,7,967561,2009-08-17,"['15-minutes-or-less', 'time-to-make', 'course...","[44.7, 0.0, 2.0, 3.0, 2.0, 0.0, 2.0]",6,"['toast the bread', 'crush the garlic and mix ...",these are great with soups and salads and it a...,"[bread, garlic clove, butter]",3
143749,nif s bread crackers in a pinch,470749,27,65502,2011-12-29,"['30-minutes-or-less', 'time-to-make', 'course...","[80.5, 5.0, 3.0, 5.0, 3.0, 10.0, 3.0]",11,['preheat oven to 400f line a cookie sheet wit...,i was making appetizers for a work christmas p...,"[bread, butter, garlic clove]",3


## Repeat the model with 100 topics

In [25]:
# Fit a second LDA model on the same corpus, this time with 100 topics;
# all other settings match the 30-topic model.
ldamodel_100 = LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=100,
    passes=10,
    random_state=42,
)

In [26]:
# Build the similarity index over the 100-topic distributions of every recipe.
# num_features is given explicitly as the number of topics: otherwise gensim has to
# scan the whole transformed corpus once just to infer the vector width, and the
# inferred width can come out smaller than the topic count if the highest-numbered
# topics never show up in the sparse per-recipe output.
index_100 = similarities.MatrixSimilarity(ldamodel_100[corpus], num_features=ldamodel_100.num_topics)

In [27]:
# Query for the 100-topic model: a string of candidate ingredients separated by commas
query_100 = 'winter squash, mexican seasoning, mixed spice, honey, butter, olive oil, salt'

In [28]:
# Convert the comma-separated query into a bag of words.
# Split on bare commas and strip each piece so that irregular spacing around the
# commas does not produce unmatched tokens; empty pieces are dropped.
query_bow_100 = dictionary.doc2bow(
    [piece.strip() for piece in query_100.lower().split(',') if piece.strip()]
)

In [29]:
# Infer the query's topic distribution with the 100-topic LDA model
query_vector_100 = ldamodel_100[query_bow_100]

In [30]:
query_vector_100

[(65, 0.268383), (79, 0.568281)]

In [31]:
# Score the query against every indexed recipe with the 100-topic index, then
# order the (recipe position, similarity score) pairs from most to least similar.
sims_100 = index_100[query_vector_100]
sim_rank_100 = list(enumerate(sims_100))
sim_rank_100.sort(key=lambda pair: pair[1], reverse=True)

In [32]:
# Keep the five highest-ranked (recipe position, similarity score) pairs
top_5_100 = sim_rank_100[:5]

In [33]:
# Collect the recipe positions of the top matches and display those recipes.
# Unpacking the pairs directly (instead of indexing with range(5)) also works
# when fewer than 5 matches exist.
top_select_number_100 = [recipe_pos for recipe_pos, _score in top_5_100]
recipes.iloc[top_select_number_100]

Unnamed: 0,name,id,minutes,contributor_id,submitted,tags,nutrition,n_steps,steps,description,ingredients,n_ingredients
0,arriba baked winter squash mexican style,137739,55,47892,2005-09-16,"['60-minutes-or-less', 'time-to-make', 'course...","[51.5, 0.0, 13.0, 0.0, 2.0, 0.0, 4.0]",11,"['make a choice and proceed with recipe', 'dep...",autumn is my favorite time of year to cook! th...,"[winter squash, mexican seasoning, mixed spice...",7
224913,way too easy char siu chicken wings,223082,37,279205,2007-04-16,"['60-minutes-or-less', 'time-to-make', 'course...","[326.3, 36.0, 0.0, 4.0, 53.0, 32.0, 0.0]",13,"['pre heat oven to 400f', 'microwave char siu ...",sometime i don't have time to make a marinade ...,"[chicken wings, char siu sauce, water, toasted...",4
84578,five star house salad dressing,23478,10,30503,2002-03-28,"['15-minutes-or-less', 'time-to-make', 'course...","[68.3, 4.0, 35.0, 2.0, 3.0, 2.0, 3.0]",2,['combine all ingredients and mix well with wi...,an absolutely delicious salad dressing that is...,"[apple cider vinegar, garlic clove, dijon must...",8
18713,basil mint infused honey,223053,15,58104,2007-04-16,"['15-minutes-or-less', 'time-to-make', 'course...","[65.7, 0.0, 69.0, 0.0, 0.0, 0.0, 5.0]",6,['place honey and mint in a double boiler with...,you can make straight mint or straight basil.\...,"[honey, mint leaves, basil leaves]",3
104569,heirloom tomato salad,432866,10,1651152,2010-07-21,"['weeknight', '15-minutes-or-less', 'time-to-m...","[193.5, 28.0, 18.0, 9.0, 3.0, 12.0, 2.0]",11,['start with the vinaigrette to give the flavo...,multi-variety heirloom tomato salad with fresh...,"[heirloom tomatoes, fresh goat cheese, fresh t...",12


In [34]:
top_5_100

[(0, 0.99999994),
 (224913, 0.9997021),
 (84578, 0.9603593),
 (18713, 0.9413549),
 (104569, 0.91810584)]

### Conclusion: the 30-topic model looks more relevant than the 100-topic model for this example.