In [63]:
import pandas as pd
import numpy as np
import gensim
import random
from sklearn.feature_extraction.text import CountVectorizer
from gensim.matutils import Sparse2Corpus
from gensim.models.ldamodel import LdaModel
from gensim import models, similarities
from gensim.corpora import Dictionary
from gensim.models import TfidfModel

seed = 0

In [12]:
data = pd.read_pickle('data/raw_df.pkl')

In [13]:
def treat_ingredients(ing_list):
    output = []
    for ingredient in ing_list:
        ingredient_list = ingredient.split(' ')
        output.append("_".join(ingredient_list))
    return output

In [34]:
def get_similarity(lda, query_vector):
    index = similarities.MatrixSimilarity(lda[corpus])
    sims = index[query_vector]
    return sims

In [15]:
def train_lda_model(ingredients,num_topics = 100,passes = 15,random_state = seed):
    # get list of ingredients transformed
    ingredients_all = ingredients.apply(lambda x: treat_ingredients(x))
    
    #build dict (for Gensim vectorizer)
    dictionary = Dictionary([ing for ing in list(ingredients_all)])
    
    # build corpus: BOW
    corpus = [dictionary.doc2bow(text) for text in list(ingredients_all)]
    
    #train lda
    ldamodel = LdaModel(corpus,num_topics = num_topics, passes = passes,random_state = random_state, id2word = dictionary)
    return ldamodel,dictionary,corpus

In [17]:
sample_w = "butter, salt, pepper"

In [29]:
# define treat input function, returning a list of tokenized ingredients
def treat_words (words):
    list_words = words.split(",")
    output = []
    for w in list_words:
        output.append("_".join(w.strip().split(" ")))
    return output

In [37]:
def calculate_similarity(query,ldamodel,dct):
    # treat input words
    words_bow = dct.doc2bow(treat_words(query))
    query_vector = ldamodel[words_bow]
    
    #calculate ranking
    sim_rank = get_similarity(lda = ldamodel, query_vector = query_vector)
    sim_rank = sorted(enumerate(sim_rank), key=lambda item: -item[1])
    
    return sim_rank

In [20]:
def calculate_recommendation(sim_rank,groups,n_reco = 10):
    results = [sim_rank[0][0]]
    results_prob = [sim_rank[0][1]]
    result_group = [sim_rank[0][1]]
        
    for recipe,group in zip(sim_rank[1:],groups[1:]):
        if group not in set(result_group):
            results.append(recipe[0])
            result_group.append(group)
            results_prob.append(recipe[1])
        if len(results) == n_reco:
            break
    print(result_group,"\n",results_prob)
    return results

In [32]:
# this is a wrapper function for calculate simu and calculate reco
def get_similarity_reco (query,ldamodel,dct,corpus,n_reco = 10):
    #calculate rank
    sim_rank = calculate_similarity(query,ldamodel,dct)
    #find groups according to lda model
    groups = []
    for l in ldamodel[corpus]:
        try:
            groups.append(l[0][0])
        except:
            groups.append(random.randint(1, 100))
            
    return calculate_recommendation(sim_rank,groups,n_reco)

In [23]:
def print_reco(results):
    return data.iloc[results]

In [24]:
def pretty_name (name):
    return " ".join([ word.capitalize() for word in name.split(" ") if word != ""])

In [25]:
ldamodel,dictionary,corpus = train_lda_model(data.ingredients)

In [43]:
query = "flour, chocolate"

In [44]:
results = get_similarity_reco (query, ldamodel, dct = dictionary, corpus = corpus,n_reco = 10)

[1.0, 60, 10, 55, 2, 41, 14, 43, 19, 42] 
 [1.0, 0.95025396, 0.95025265, 0.8892121, 0.8861892, 0.8858435, 0.88212824, 0.88162076, 0.8612087, 0.8576546]


In [45]:
print_reco(results)

Unnamed: 0,name,id,minutes,contributor_id,submitted,tags,nutrition,n_steps,steps,description,ingredients,n_ingredients
27419,bowl of sin,48499,105,34122,2002-12-12,"[weeknight, time-to-make, course, main-ingredi...","[665.2, 46.0, 260.0, 33.0, 11.0, 75.0, 29.0]",9,[prepare cake according to directions in a 9x1...,i got this recipe from the newspaper in richmo...,"[chocolate cake mix, coffee liqueur, instant c...",5
82376,fast and easy black forest pie,80300,15,103876,2004-01-06,"[15-minutes-or-less, time-to-make, course, mai...","[1814.4, 126.0, 481.0, 99.0, 43.0, 203.0, 83.0]",8,[spread 1 cup of whipped topping on bottom of ...,this is a great pie to make during the week wh...,"[chocolate graham wafer pie crust, whipped top...",5
155904,peanut butter chocolate pudding,79686,15,89831,2003-12-29,"[15-minutes-or-less, time-to-make, course, mai...","[178.6, 12.0, 42.0, 14.0, 9.0, 18.0, 7.0]",8,"[in a small bowl , mix 2 tbsp milk with the pe...",a wonderful and easy to prepare ending to any ...,"[milk, chunky peanut butter, whipped topping, ...",4
23701,biscota thipla me marmelada,135019,25,209255,2005-08-26,"[30-minutes-or-less, time-to-make, course, cui...","[320.3, 20.0, 68.0, 7.0, 14.0, 30.0, 14.0]",14,"[sift flour with baking powder into a bowl, ma...","yield is a guess, it depends on what size of c...","[flour, baking powder, eggs, sugar, vanilla, b...",12
85466,fondants au chocolat,179842,27,141569,2006-07-31,"[30-minutes-or-less, time-to-make, course, mai...","[400.4, 55.0, 61.0, 8.0, 14.0, 108.0, 7.0]",12,"[melt the chocolate, add 80 g of butter to the...","as i am a chocolate-lover, this has to be one ...","[chocolate, butter, sugar, eggs, salt]",5
142979,nearly no fat fudge brownies,429805,30,798353,2010-06-15,"[30-minutes-or-less, time-to-make, course, pre...","[107.9, 5.0, 40.0, 1.0, 6.0, 10.0, 6.0]",5,"[preheat oven to 350 degrees f, mix chocolate ...","this came from the top of the 'astro"" yogurt c...","[chocolate, plain fat-free yogurt, sugar, flou...",7
53102,churros from spain authentic,398614,40,1430306,2009-11-09,"[weeknight, 60-minutes-or-less, time-to-make, ...","[1055.9, 53.0, 5.0, 54.0, 62.0, 99.0, 56.0]",11,"[we heat water with salt in a large saucepan, ...","delicious fried churros, a tradition from spain.","[flour, baking powder, water, salt, eggs, choc...",6
27173,bounty cake,217580,60,444015,2007-03-20,"[60-minutes-or-less, time-to-make, course, mai...","[538.0, 68.0, 109.0, 7.0, 16.0, 153.0, 11.0]",10,"[for the sponge-cake:, separate egg whites and...",my favourite cake. just like the candy bar.,"[eggs, sugar, coconut, chocolate, butter]",5
51817,chocolate toffee trifle,224474,6,251917,2007-04-24,"[15-minutes-or-less, time-to-make, course, mai...","[674.1, 54.0, 159.0, 35.0, 20.0, 105.0, 27.0]",7,[cut up frozen pound cake into bite sized piec...,this is my cousin teri's recipe. very easy to ...,"[butter pound cake, instant chocolate pudding ...",5
95523,grandmother s sugar cake,179512,55,293946,2006-07-27,"[60-minutes-or-less, time-to-make, course, pre...","[472.0, 39.0, 137.0, 15.0, 12.0, 25.0, 18.0]",12,"[grease 6 cup fluted tube pan, sprinkle entire...",this is from my mom's recipe cards. i have no...,"[breadcrumbs, flour, baking powder, eggs, suga...",9


Let's try the same flow but with our LDA 25 topics model

In [52]:
lda_25,dict_25,corpus_25 = train_lda_model(data.ingredients,num_topics = 25)

In [53]:
results_25 = get_similarity_reco (query, lda_25, dct = dict_25, corpus = corpus_25,n_reco = 10)

[1.0000001, 8, 1, 6, 12, 11, 7, 0, 13, 4] 
 [1.0000001, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]


In [54]:
print_reco(results_25)

Unnamed: 0,name,id,minutes,contributor_id,submitted,tags,nutrition,n_steps,steps,description,ingredients,n_ingredients
145494,nut candy,30799,10,22973,2002-06-10,"[15-minutes-or-less, time-to-make, course, mai...","[4071.0, 482.0, 817.0, 116.0, 170.0, 443.0, 10...",4,"[melt chocolate chips in a double boiler, stir...","easy to make, extra delicious candy treat. go...","[chocolate chips, mixed nuts]",2
15594,baker s friend homemade pan release,443310,10,253974,2010-12-01,"[weeknight, 15-minutes-or-less, time-to-make, ...","[136.0, 21.0, 0.0, 0.0, 0.0, 19.0, 1.0]",2,"[simply mix the two ingredients together, keep...",here's just a really simple recipe to keep on ...,"[vegetable shortening, flour]",2
31304,bugle candies,270343,20,66448,2007-12-07,"[30-minutes-or-less, time-to-make, course, pre...","[221.1, 31.0, 8.0, 4.0, 15.0, 41.0, 3.0]",4,"[pipe peanut butter into the bugles, melt choc...",everyone who tries this recipe loves it! \r\ns...,"[bugles original flavor snacks, peanut butter,...",3
57391,cookies frosting,26491,6,227671,2002-04-27,"[15-minutes-or-less, time-to-make, course, mai...","[284.0, 41.0, 10.0, 5.0, 19.0, 55.0, 4.0]",4,[first mix the chocolate and peanut butter tog...,"hi, my name is jane and this recipie will make...","[peanut butter, chocolate, chocolate cookies]",3
90205,garlic dipping sauce,20839,6,1533,2002-02-28,"[15-minutes-or-less, time-to-make, course, pre...","[1626.7, 279.0, 1.0, 88.0, 4.0, 157.0, 1.0]",3,"[combine ingredients in a small bowl, microwav...",,"[margarine, garlic powder]",2
106347,homemade blackberry jam,40235,25,27643,2002-09-15,"[30-minutes-or-less, time-to-make, course, mai...","[841.4, 0.0, 818.0, 0.0, 2.0, 0.0, 72.0]",12,"[carefully measure out the berries , put them ...",posted by request.,"[blackberries, sugar, dry pectin]",3
106374,homemade butterfinger candy bars,475389,15,383853,2012-03-01,"[15-minutes-or-less, time-to-make, course, pre...","[2667.4, 351.0, 167.0, 86.0, 227.0, 238.0, 29.0]",8,[melt candy corn in microwave on high 1 minute...,found this online...have to try it out!,"[candy corn, peanut butter, chocolate-flavored...",3
136087,mini cheese dog wraps,373870,23,166642,2009-05-23,"[30-minutes-or-less, time-to-make, course, mai...","[217.7, 20.0, 8.0, 20.0, 17.0, 30.0, 5.0]",10,"[heat oven to 375 degrees f, unroll crescent r...",these are a great snack or lunch. recipe is fr...,"[refrigerated crescent dinner rolls, beef hot ...",3
155632,peanut butter and bacon sandwich,74475,15,58844,2003-10-30,"[bacon, 15-minutes-or-less, time-to-make, cour...","[319.0, 25.0, 18.0, 29.0, 34.0, 21.0, 8.0]",2,"[spread peanut butter on one slice of toast, t...",don't say ooohhhh gross! try it; it's addicting.,"[peanut butter, whole wheat bread, crisp bacon]",3
213973,toasted peanut butter nutella sandwich,316625,10,182809,2008-07-30,"[15-minutes-or-less, time-to-make, course, mai...","[717.2, 70.0, 117.0, 25.0, 47.0, 45.0, 20.0]",6,"[toast bread to your liking , but make sure yo...",have been trying to cut down on my peanut butt...,"[peanut butter, nutella, whole wheat bread]",3


In [55]:
def save_file_to_pickle(item, file_name, file_type = 'obj'):
    file = open(f'output/{file_name}.{file_type}', 'wb') 
    pickle.dump(item, file)
    file.close()

In [56]:
save_file_to_pickle(ldamodel,"lda_100")

In [57]:
save_file_to_pickle(dictionary,"lda_100_dct")

In [58]:
save_file_to_pickle(corpus,"lda_100_corp")

In [61]:
ldamodel.top_topics(corpus)

[([(0.18288301, 'red_pepper_flakes'),
   (0.16054481, 'whipping_cream'),
   (0.15387756, 'monterey_jack_cheese'),
   (0.14872397, 'sharp_cheddar_cheese'),
   (0.05478826, 'velveeta_cheese'),
   (0.047360882, 'celery_salt'),
   (0.046294626, 'butter'),
   (0.041176748, 'peaches'),
   (0.034478165, 'dried_marjoram'),
   (0.031177634, 'boneless_chicken_breasts'),
   (0.025489423, 'eggs'),
   (0.012157958, 'sugar'),
   (0.009072225, 'onion'),
   (0.0071312995, 'colby_cheese'),
   (0.0056141037, 'olive_oil'),
   (0.005190932, 'water'),
   (0.004966151, 'onions'),
   (0.004279048, 'salt'),
   (0.0038793923, 'garlic_cloves'),
   (0.0037462262, 'garlic')],
  -3.8117004586148666),
 ([(0.13333829, 'tomato_sauce'),
   (0.12930943, 'onion'),
   (0.12916473, 'green_pepper'),
   (0.108814776, 'diced_tomatoes'),
   (0.053502414, 'avocado'),
   (0.04516997, 'spinach'),
   (0.04429881, 'garlic_cloves'),
   (0.044281807, 'jalapeno_pepper'),
   (0.03755554, 'kidney_beans'),
   (0.03348021, 'spaghetti'),


In [62]:
len(ldamodel.top_topics(corpus))

100

In [64]:
def train_lda_model_tfidf(ingredients,num_topics = 100,passes = 15,random_state = seed):
    # get list of ingredients transformed
    ingredients_all = ingredients.apply(lambda x: treat_ingredients(x))
    
    #build dict (for Gensim vectorizer)
    dictionary = Dictionary([ing for ing in list(ingredients_all)])
    
    # build corpus: BOW
    corpus = [dictionary.doc2bow(text) for text in list(ingredients_all)]
    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]
    
    #train lda
    ldamodel = LdaModel(corpus_tfidf, num_topics = num_topics, passes = passes,random_state = random_state, id2word = dictionary)
    return ldamodel,dictionary,corpus

In [65]:
lda_100_tf,dictionary_tf,corpus_tf = train_lda_model_tfidf(data.ingredients)

In [73]:
query = "flour, chocolate, milk"

In [72]:
results_100tf = get_similarity_reco (query, ldamodel = lda_100_tf, dct = dictionary_tf, corpus = corpus_tf,n_reco = 10)

ValueError: cannot index a corpus with zero features (you must specify either `num_features` or a non-empty corpus in the constructor)

<gensim.interfaces.TransformedCorpus at 0x12d576633c8>