In [1]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import decomposition
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

%matplotlib inline

## Load recipe data

In [111]:
# Location of the Epicurious "full_format_recipes.json" dump.
# Set the RECIPE_FILE environment variable to point at a local copy; the
# original author's path is kept as the fallback so existing setups still work.
recipe_file = os.environ.get(
    "RECIPE_FILE",
    "/Users/Carol/Dropbox/epicurious-recipes-with-rating-and-nutrition/full_format_recipes.json",
)

In [112]:
# Load the recipe dump; orient='records' means the file is a JSON list with
# one object per recipe.
recipe_df = pd.read_json(recipe_file, orient='records')
# Show the first two rows as a quick check of the loaded columns.
recipe_df.head(2)

Unnamed: 0,directions,fat,date,categories,calories,desc,protein,rating,title,ingredients,sodium
0,"[1. Place the stock, lentils, celery, carrot, ...",7.0,2006-09-01 04:00:00+00:00,"[Sandwich, Bean, Fruit, Tomato, turkey, Vegeta...",426.0,,30.0,2.5,"Lentil, Apple, and Turkey Wrap","[4 cups low-sodium vegetable or chicken stock,...",559.0
1,[Combine first 9 ingredients in heavy medium s...,23.0,2004-08-20 04:00:00+00:00,"[Food Processor, Onion, Pork, Bake, Bastille D...",403.0,This uses the same ingredients found in boudin...,18.0,4.375,Boudin Blanc Terrine with Red Onion Confit,"[1 1/2 cups whipping cream, 2 medium onions, c...",1439.0


## Preprocess recipes

In [117]:
# Clean the loaded frame in one chain:
#   1. discard recipes that have no directions,
#   2. flatten each recipe's list of steps into one space-separated string,
#   3. keep only the first recipe for any repeated directions text.
recipe_df = (
    recipe_df
    .dropna(subset=['directions'])
    .assign(joined_directions=lambda frame: frame['directions'].map(" ".join))
    .drop_duplicates(subset="joined_directions", keep="first")
)
print(recipe_df.shape)

(18097, 12)


In [120]:
# Plain Python list of the joined direction strings, in row order; every
# matrix built below has one row per entry of this list.
recipes = list(recipe_df["joined_directions"])

In [203]:
# Two vectorizers with identical settings (built-in English stop-word list):
# the TF-IDF weights feed NMF and LSA below, the raw term counts feed LDA.
vectorizer = TfidfVectorizer(stop_words='english')
count_vectorizer = CountVectorizer(stop_words='english')

In [122]:
# Fit TF-IDF on the recipe texts and keep the result as a SciPy sparse matrix.
# Densifying with .todense() would allocate the full documents x vocab grid
# (well over a gigabyte here) as an np.matrix, which recent scikit-learn
# estimators refuse to accept; NMF and TruncatedSVD consume sparse input directly.
vectors = vectorizer.fit_transform(recipes)  # (documents, vocab)
vectors.shape

(18097, 10190)

In [204]:
# Raw term counts for LDA, kept sparse: LatentDirichletAllocation accepts
# sparse input, and .todense() would build a very large np.matrix that recent
# scikit-learn versions reject.
count_vectors = count_vectorizer.fit_transform(recipes)  # (documents, vocab)

In [206]:
# Shape of the term-count matrix, for comparison with the TF-IDF matrix.
count_vectors.shape

(18097, 10190)

In [123]:
# Vocabulary as a NumPy string array, ordered by feature (column) index.
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); use whichever this installation provides.
_feature_names = getattr(vectorizer, "get_feature_names_out", None) or vectorizer.get_feature_names
# Going through list() keeps a fixed-width string dtype on either code path.
vocab = np.array(list(_feature_names()))

In [187]:
# Vocabulary as a plain Python list, ordered by feature (column) index.
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); use whichever this installation provides.
_feature_names = getattr(vectorizer, "get_feature_names_out", None) or vectorizer.get_feature_names
terms = list(_feature_names())

In [201]:
# Fitted mapping from each term to its feature (column) index.
vocab_dict = vectorizer.vocabulary_

In [190]:
# Number of terms in the fitted vocabulary.
len(terms)

10190

In [192]:
# Sanity check: the term -> index mapping has one entry per vocabulary term.
# (This cell used to reference `vocab_2`, which is never defined in the
# notebook and raises NameError on a fresh kernel.)
len(vocab_dict)

10190

In [199]:
# Spot-check: the term stored at feature index 1631.
terms[1631]

'cake'

In [124]:
# The vocabulary array is 1-D, with one entry per feature.
vocab.shape

(10190,)

In [125]:
# Peek at ten consecutive vocabulary entries.
vocab[1000:1010]

array(['bakeware', 'baking', 'baklava', 'balance', 'balanced',
       'balancing', 'balinese', 'ball', 'baller', 'balloon'], dtype='<U26')

# Non-negative matrix factorization in sklearn

In [126]:
# Factorize the TF-IDF matrix into 10 non-negative topics; random_state is
# fixed so reruns start from the same initialization.
clf = decomposition.NMF(n_components=10, random_state=1)

# W1: per-document topic weights (one row per recipe, one column per topic).
W1 = clf.fit_transform(vectors)
# H1: per-topic term weights, shape (topics, vocab).
H1 = clf.components_

In [165]:
def show_topics(a, num_top_words, vocab):
    """Summarize each topic as a string of its highest-weighted words.

    Parameters
    ----------
    a : iterable of 1-D weight vectors
        One vector per topic, e.g. the rows of a (topics, vocab) matrix.
    num_top_words : int
        How many words to keep per topic.
    vocab : indexable
        Maps a column index of ``a`` to its word.

    Returns
    -------
    list of str
        One entry per topic: its top words, strongest first, joined by spaces.
    """
    summaries = []
    for weights in a:
        # argsort is ascending, so flip it and keep the leading entries.
        strongest_first = np.argsort(weights)[::-1][:num_top_words]
        summaries.append(' '.join(vocab[idx] for idx in strongest_first))
    return summaries

def show_docs_by_topic(a, num_top_docs, topics, docs):
    """Build a table of the highest-scoring documents for every topic.

    Parameters
    ----------
    a : 2-D array, shape (documents, topics)
        Per-document topic weights; column j scores every document on topic j.
    num_top_docs : int
        Number of documents to list per topic. (An earlier version overwrote
        this with a hard-coded 3 inside the function; it is now honoured.)
    topics : iterable
        Column labels, one per topic, in the same order as the columns of ``a``.
    docs : indexable
        Document texts, indexed by the row positions of ``a``.

    Returns
    -------
    pandas.DataFrame
        ``num_top_docs`` rows and one column per topic; row 0 holds each
        topic's highest-scoring document, row 1 the next one, and so on.
    """
    # Sort ascending down each column, then read the tail in reverse so that
    # row r holds, for every topic, the index of its r-th best document.
    top_docs = np.argsort(a, axis=0)[:-num_top_docs-1:-1]
    top_docs_text = [[docs[i] for i in rank] for rank in top_docs]
    # Passing the labels to the constructor keeps the columns in place even
    # when no rows were selected (num_top_docs == 0).
    return pd.DataFrame(top_docs_text, columns=list(topics))

In [149]:
# Shape of the NMF components matrix: (topics, vocab).
H1.shape

(10, 10190)

In [137]:
# Label each NMF topic with its 8 highest-weighted terms.
topics = show_topics(a=H1, num_top_words=8, vocab=vocab)
topics

['skillet heat add minutes cook sauce oil sauté',
 'cake pan beat batter butter flour bowl oven',
 'chicken cooked skin sauce minutes breast marinade skillet',
 'dressing bowl toss pepper salad season whisk salt',
 'sugar ice cream mixture syrup saucepan juice stir',
 'dough crust inch baking sheet oven pie bake',
 'pasta water pot cooking boiling salted drain cook',
 'potatoes potato tender oven drain minutes dish sweet',
 'simmer add heat minutes soup pot broth rice',
 'grill pork lamb marinade pan minutes turning meat']

In [134]:
# Top 3 recipes per NMF topic, with the topic word summaries as column labels.
show_docs_by_topic(a=W1, num_top_docs=3, topics=topics, docs=recipes)

Unnamed: 0,skillet heat add minutes cook sauce oil sauté,cake pan beat batter butter flour bowl oven,chicken cooked skin sauce minutes breast marinade skillet,dressing bowl toss pepper salad season whisk salt,sugar ice cream mixture syrup saucepan juice stir,dough crust inch baking sheet oven pie bake,pasta water pot cooking boiling salted drain cook,potatoes potato tender oven drain minutes dish sweet,simmer add heat minutes soup pot broth rice,grill pork lamb marinade pan minutes turning meat
0,Pat halibut dry and season with salt and peppe...,Preheat oven to 350°F. Butter and flour 10-inc...,Melt butter with olive oil in heavy medium ski...,Whisk first 4 ingredients in medium bowl to bl...,"Boil berries, 1/4 cup sugar, 1 tablespoon lemo...","Whisk flour, sugar, and salt in large bowl to ...",Cook pasta in large pot of boiling salted wate...,In a kettle combine potatoes with enough salte...,Bring broth and 1 cup water to simmer in mediu...,"Stir together yogurt, garlic, rosemary, and pe..."
1,Pat scallops dry and sprinkle with pepper and ...,Preheat oven to 375°F with rack in middle. But...,Combine first 6 ingredients in heavy medium ov...,Whisk first 5 ingredients in small bowl to ble...,"Combine heavy whipping cream, whole milk, 1 cu...",Cut butter into bits and in a bowl with a past...,Cook pasta in an 8-quart pot of boiling salted...,Peel potatoes and cut into 2-inch pieces. (If ...,Heat oil in heavy large pot over medium heat. ...,"Combine oil, lemon juice, garlic, oregano, sal..."
2,Season fish with salt and pepper. Dust lightly...,Preheat oven to 350°F. Butter 8-inch-diameter ...,Sprinkle chicken with salt and pepper. Heat oi...,Whisk first 3 ingredients in small bowl to ble...,In a small saucepan boil the water with the su...,"Combine flour, sugar and salt in processor. Us...",Cook pasta in a 6- to 8-quart pot of boiling s...,Peel potatoes and cut into 1/2-inch cubes. In ...,Heat oil in heavy large pot over medium-high h...,Season lamb with salt and pepper and place eac...


# LDA in scikit-learn

In [207]:
# Fit a 10-topic LDA model on the raw term counts.
lda_model = decomposition.LatentDirichletAllocation(
    n_components=10,            # number of topics
    learning_method='online',
    random_state=0,
    n_jobs=-1,                  # use all available CPUs
)
# doc_matrix: per-document topic weights; topic_matrix: per-topic term weights.
doc_matrix = lda_model.fit_transform(count_vectors)
topic_matrix = lda_model.components_

In [208]:
# Label each LDA topic with its 8 highest-weighted terms. `vocab` comes from
# the TF-IDF vectorizer; reusing it here relies on the count vectorizer having
# been fitted with the same settings on the same texts.
lda_topics = show_topics(a=topic_matrix, num_top_words=8, vocab=vocab)

In [209]:
# Top 3 recipes per LDA topic; kept in `df` for the printing loop below.
df = show_docs_by_topic(a=doc_matrix, num_top_docs=3, topics=lda_topics, docs=recipes)
df

Unnamed: 0,beat cake bowl cream chocolate mixer sugar egg,minutes grill heat fish oil medium pepper lamb,cut water knife fennel noodles lengthwise beets use,minutes oven pan mixture butter baking bake bowl,sugar ice juice mixture water saucepan stir bowl,dough inch minutes sheet baking oven roll surface,bowl salt oil pepper season large toss add,heat minutes add cook skillet stirring salt medium,chicken pan oven roast minutes turkey roasting juices,leaves sweet artichokes goat place make artichoke little
0,"Beat cream, sugar and peppermint extract in la...",Stir together all ingredients for beef with 1 ...,"In large stockpot, bring 4 gallons water to ro...",Preheat oven to 300°F. Lightly butter four 3/4...,"Combine tea bags, 3 lemon verbena sprigs, and ...",Stir yeast and sugar into warm water in measur...,"Whisk mayonnaise, oil, lemon zest, and lemon j...",Combine all broth ingredients in a 4- to 6-qua...,Position rack in bottom third of oven and preh...,Stir together all ingredients in a small bowl....
1,In large bowl of a standing electric mixer bea...,Whisk together first 6 ingredients in shallow ...,Whisk together all ingredients except coarse s...,"Pulse together flour, almonds, brown sugar, an...","Combine cherries, kirsch, and sugar in medium ...","Blend together butter, cheese, and yolk in a f...",Combine cucumber and buttermilk in blender. Ch...,Cook bacon in heavy 8-quart Dutch oven over hi...,Preheat oven to 425°F. Gently loosen skin from...,"A matter of preference, naturally. A ratio of ..."
2,"In a large bowl with an electric mixer, beat t...",Combine first 4 ingredients in shallow baking ...,Preheat the oven to 250°F. Line the bottom of ...,Preheat oven to 350°F. Lightly butter 10-inch-...,Combine 1/4 cup clover honey and 2 tablespoon ...,"Sift together flour, baking powder, and salt i...",Mix first 4 ingredients in small bowl. Season ...,Heat oil in a large heavy pot over medium-high...,Set a rack inside a large heavy roasting pan. ...,Pour in the order listed so that each ingredie...


In [210]:
# Print the full text of each LDA topic's top recipes (the table view above
# truncates them), with a banner per topic and a rule after every recipe.
for t in lda_topics:
    print("======{}=========".format(t))
    t_recipes = list(df[t])
    for r in t_recipes:
        print(r, "-----------", sep="\n")

Beat cream, sugar and peppermint extract in large bowl until stiff peaks form.
-----------
In large bowl of a standing electric mixer beat together butter, sugar, and salt until light and fluffy. Beat in yolks, 1 at a time, and vanilla and beat until smooth. Beat in flour gradually, beating dough until just combined well.
-----------
In a large bowl with an electric mixer, beat the egg yolks with the sugar until the mixture is thick and pale with ribbons when the beaters are lifted. In another large bowl, beat two thirds of the egg whites until they hold soft peaks. In another large bowl beat the cream until it holds soft peaks. Fold the whites into the yolk mixture until the mixture is combined well and fold in the whipped cream gently but thoroughly. Pour the mixture into a large punch bowl, whisk in the bourbon, the rum, the vanilla, the milk, and the salt. Chill the eggnog, covered, for at least four hours, or until it is cold. Just before serving, in a bowl, beat the remaining thr

# LSA, a.k.a. truncated SVD, in scikit-learn

In [175]:
# Rank-100 truncated SVD (LSA) of the TF-IDF matrix.
# TruncatedSVD uses a randomized solver by default, so the seed is pinned to
# make the components reproducible across runs, as in the NMF and LDA cells.
lsa_model = decomposition.TruncatedSVD(n_components=100, random_state=0)
# lsa_doc_matrix: documents projected onto the components, one column each.
lsa_doc_matrix = lsa_model.fit_transform(vectors)
# lsa_topic_matrix: one row of term weights per component.
lsa_topic_matrix = lsa_model.components_

In [180]:
# Singular values of the fitted components (the output below lists them in
# descending order).
svs = lsa_model.singular_values_
svs

array([36.93578438, 18.77197347, 14.31073601, 13.96327485, 13.17321857,
       12.09472998, 11.72729893, 11.37863432, 11.06909287, 10.51728921,
       10.26305928,  9.97901522,  9.83926985,  9.62415371,  9.54220295,
        9.29550904,  9.13215189,  9.01091991,  8.90056216,  8.88657433,
        8.71396525,  8.60052358,  8.55362881,  8.50312302,  8.30273489,
        8.08804098,  7.97735891,  7.93193665,  7.78289947,  7.64980216,
        7.51535926,  7.4887354 ,  7.36281693,  7.34825746,  7.25074456,
        7.21793358,  7.15974012,  7.12235371,  7.04567302,  7.03064274,
        6.89794457,  6.83198432,  6.82136576,  6.78753477,  6.7655953 ,
        6.73150518,  6.68594192,  6.6564103 ,  6.5981569 ,  6.5607608 ,
        6.53372885,  6.51350405,  6.45988773,  6.44297395,  6.41993884,
        6.35595586,  6.29680796,  6.23625118,  6.21734277,  6.18539489,
        6.1658299 ,  6.1565343 ,  6.14583207,  6.07212394,  6.01423112,
        5.98756475,  5.94594301,  5.90504102,  5.88044914,  5.86

In [182]:
# Label the first 10 LSA components with their 8 highest-weighted terms.
lsa_topics = show_topics(a=lsa_topic_matrix[:10], num_top_words=8, vocab=vocab)
lsa_topics

['minutes heat add medium bowl salt pepper cook',
 'dough cake sugar beat pan baking cream flour',
 'chicken grill dressing oil pepper marinade baking toss',
 'dressing toss bowl salad ingredients season whisk pepper',
 'chicken cake sugar ingredients cream saucepan syrup ice',
 'dough chicken crust ice water floured pie soup',
 'pasta chicken cake potatoes toss dressing salted drain',
 'potatoes soup oven tender cover vegetables potato roast',
 'medium sauté ahead add cover mix skillet ingredients',
 'pasta lamb grill pork pot cake pan turkey']

In [186]:
# Top 3 recipes for each of the first 10 LSA components.
show_docs_by_topic(a=lsa_doc_matrix[:, :10], num_top_docs=3, topics=lsa_topics, docs=recipes)

Unnamed: 0,minutes heat add medium bowl salt pepper cook,dough cake sugar beat pan baking cream flour,chicken grill dressing oil pepper marinade baking toss,dressing toss bowl salad ingredients season whisk pepper,chicken cake sugar ingredients cream saucepan syrup ice,dough chicken crust ice water floured pie soup,pasta chicken cake potatoes toss dressing salted drain,potatoes soup oven tender cover vegetables potato roast,medium sauté ahead add cover mix skillet ingredients,pasta lamb grill pork pot cake pan turkey
0,Boil broth in small saucepan until reduced to ...,Spray 9-inch-diameter springform pan with 3-in...,"Preheat oven to 450°F. Whisk oil, vinegar, and...",Whisk first 4 ingredients in medium bowl to bl...,Combine first 4 ingredients in a small bowl; w...,"Mix flour, sugar and salt in processor. Add bu...",Cook pasta in large pot of boiling salted wate...,In a kettle combine potatoes with enough salte...,Combine all ingredients in heavy large saucepa...,Preheat oven to 325°F. Heat oil in heavy large...
1,Heat 2 tablespoons oil in heavy large pot over...,"Mix flour, cake flour, 1 tablespoon sugar and ...",Combine chicken and 1/4 cup oil in large bowl....,Whisk first 5 ingredients in small bowl to ble...,Mix first 5 ingredients in large bowl. Add chi...,Cut butter into bits and in a bowl with a past...,Cook pasta in large pot of boiling salted wate...,Preheat oven to 450°F. and line a baking sheet...,Mix all ingredients in medium bowl. Season wit...,Heat olive oil in heavy large skillet over med...
2,Melt butter with oil in heavy large skillet ov...,Preheat oven to 350°F. Butter 10-inch-diameter...,Combine first 5 ingredients in processor. Proc...,Whisk first 3 ingredients in small bowl to ble...,Bring 4 cups water to a simmer with chicken br...,Blend first 4 ingredients in processor. Add bu...,Cook pasta in an 8-quart pot of boiling salted...,Butter 6- to 8-cup ovenproof dish. Cook potato...,Mix all ingredients in medium bowl. DO AHEAD: ...,Cook pasta in an 8-quart pot of boiling salted...
