# Imports and configuration.

In [134]:
%matplotlib inline

In [135]:
import lda
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
import psycopg2
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import text

import common

import functions as f



In [136]:
matplotlib.style.use('ggplot')

# Load data

In [137]:
con = common.make_engine()

df_ep = pd.read_sql_table('recipes_recipe', con)
df_ep = df_ep[['title','ingredient_txt','url','image']]
df_ep['source'] = 'epicurious.com'
print('Loaded %s records from epicurious.com' % df_ep.shape[0])

df_ar = pd.read_sql_table('allrecipes', con)
df_ar = df_ar[['data-name','ingredients','url','data-imageurl']]
df_ar.columns = ['title','ingredient_txt','url','image']
df_ar['source'] = 'allrecipes.com'

df_ar = df_ar.drop_duplicates('url')
df_ar.reset_index()

print('Loaded %s records from allrecipes.com' % df_ar.shape[0])

df = pd.concat([df_ep, df_ar], ignore_index=True)

print('Loaded %s records in total' % df.shape[0])

## TODO: fix unicode here.



Loaded 16539 records in total


Loaded 4506 records from epicurious.com
Loaded 12033 records from allrecipes.com

## Examining and cleaning data

In [138]:
df = df[df['ingredient_txt'].str.len() > 20]
print df['ingredient_txt'].str.len().describe()
df['ingredient_txt'].str.len().plot(kind='hist').set_title('Ingredients character count')
df = df.reset_index()




<matplotlib.figure.Figure at 0x7fd86331ef90>

count    16526.000000
mean       296.671366
std        127.103774
min         23.000000
25%        206.000000
50%        280.000000
75%        371.000000
max       1087.000000
Name: ingredient_txt, dtype: float64


In [139]:
## clean up quoting.
pattern = "[\"\']"
for k in ['title', 'ingredient_txt', 'url', 'image']:
    df[k] = df[k].str.replace(pattern, '')

## formatting ingredients.
df['ingredient_txt'] = df['ingredient_txt'].str.replace('\n',' ')


In [140]:
# Extract features
vectorizer = CountVectorizer(
             stop_words='english'
           , ngram_range=(1, 2)
           , token_pattern='[A-Za-z]+'
           , min_df = 10
           , max_df = 0.25
)

features = vectorizer.fit_transform(df.ingredient_txt)
## features is a document x term matrix.

wc = f.feature_counts(vectorizer, features)

wc.sort('count').tail(25).plot('word','count', kind='bar')


<matplotlib.figure.Figure at 0x7fd86315b3d0>

<matplotlib.axes._subplots.AxesSubplot at 0x7fd862af2d10>

In [141]:
m = lda.LDA(n_topics=40, random_state=0, n_iter=200)
m.fit(features)
print('Finished running model')


Finished running model


# Evaluating the model.
## Convergence

In [142]:
plt.plot(m.loglikelihoods_, '-')
plt.title('Loglikelihood')


<matplotlib.figure.Figure at 0x7fd872a43e90>

<matplotlib.text.Text at 0x7fd86316db90>

# Assessing topics

In [143]:
## Extracting topic data.

## most probable words by topic.
## TODO: check if these are properly sorted within each topic.
w = f.most_probable_words(m, vectorizer.get_feature_names(), 10)

## most probable documents by topic.
# np.apply_along_axis(lambda i: df.iloc[i]['title'], 1, doc_ids)

doc_ids = np.argsort(m.doc_topic_, axis=0)[-4:-1,:].T

doc_probs = np.sort(m.doc_topic_, axis=0)[-4:-1,:].T




In [144]:
## TODO: store one set of results for each run.


## massage document ids and probabilities into form suitable for database.
di = pd.DataFrame(doc_ids)
di['topic'] = di.index
di = pd.melt(di, id_vars='topic')
di.columns = ['topic','rank','recipe_key']

dp = pd.DataFrame(doc_probs)
dp['topic'] = dp.index
dp = pd.melt(dp, id_vars='topic')
dp.columns = ['topic','rank','prob']

dd = pd.merge(di, dp)
dd.to_sql('doc_prob', con, if_exists='replace')

# store recipes
df['key'] = df.index
df.to_sql('clean_recipes', con, if_exists='replace')

# store words
w.columns = ['rank','topic','word','prob']
w.to_sql('word_probs', con, if_exists='replace')


In [145]:
xx = pd.merge(df, dd, left_on='key', right_on='recipe_key', how='right')
for n, g in xx.groupby('topic'):
    print 'topic: %s' % n
    print g[['title','prob']].sort('prob').to_string()


topic: 39
                        title      prob
19           Kung Pao Chicken  0.805000
81  Japanese Broiled Mackerel  0.809677
49         Sesame Seared Tuna  0.820455



                                 title      prob
18  Calamari with Roasted Tomato Sauce  0.504545
95            Mediterranean Fried Rice  0.544068
58                     Pasta Siciliano  0.555357
topic: 27
                             title      prob
114  Simple Classic Roasted Turkey  0.478947
104          Cube Steak with Gravy  0.503846
90         Venison Salisbury Steak  0.551667
topic: 28
                          title      prob
3            Green Banana Shake  0.641667
7  Raspberry Lemon Splash Shake  0.658621
6              Hot Vanilla Chai  0.736667
topic: 29
                                                title      prob
86                                   Orange Salmon II  0.569565
72  Sweet Dijon Basa Swai Fish (or a fish of your ...  0.636667
54                         Lemon-Orange Orange Roughy  0.671429
topic: 30
                             title      prob
26         Charred Tomatillo Salsa  0.771795
8   Thai Celery Salad with Peanuts  0.801818
11           Tomatillo S


                              title      prob
108            Marsha\u0027s Garoni  0.656522
48   Three Packet Slow Cooker Roast  0.702000
102             Onion Ranch Burgers  0.703333
topic: 14
                  title      prob
22   Blue Crab Beignets  0.360714
16  Arugula Salsa Verde  0.360937
79      Filipino Lumpia  0.391667
topic: 15
                   title      prob
59    Quiche Lorraine II  0.592308
64        Crab Quiche II  0.592308
91  Chicken Pecan Quiche  0.597872
topic: 16
                                                title      prob
25       Chocolate-Dipped Salted Caramel Marshmallows  0.765278
34  Maple-Gingerbread Layer Cake with Salted Maple...  0.811538
5                        Bourbon-Vanilla Marshmallows  0.816279
topic: 17
                  title      prob
30       Peach Prosecco  0.783333
37        Ipanema Punch  0.786486
17  Corpse Reviver 3000  0.794737
topic: 18
                                title      prob
78          Rajma (Kidney Bean Curry)  0.743299
4

topic: 0
                                                title      prob
14  Poached Seckel Pear with Pomegranate, Cabrales...  0.522917
24                                 Thiebaud Pink Cake  0.531579
35                     Mushroom Goat Cheese Pan Sauce  0.531579
topic: 1
                                     title      prob
29                              Bún Bò Hue  0.452500
103  Classic Smoked Sausage \u0026 Peppers  0.474545
113                             Dinengdeng  0.526190
topic: 2
                                    title      prob
111                     Cretons de Quebec  0.459091
70         Roasted Garlic Flat Iron Steak  0.478947
115  Coffee-Crusted Beef Tenderloin Steak  0.515385
topic: 3
                             title      prob
110  Pork Tenderloin (Gluten-Free)  0.565625
82             Roasted Sherry Duck  0.583871
45     Mediterranean Lemon Chicken  0.604000
topic: 4
                                                title      prob
94                              Man

In [146]:

print('='*70)
for t in range(m.n_topics):
    print('topic: %s' % t)
    print('documents:')
    print pd.DataFrame([df.iloc[doc_ids[t,:]]['title'].values, doc_probs[t,:]]).T.sort(1, ascending=False).to_string(header=False, index=False)
#    print('\n'.join(df.iloc[doc_ids[t,:]]['title']))
    print('-----'.join(df.iloc[doc_ids[t,:]]['ingredient_txt']))
    print('-'*70)
    print w[w['topic']==t][['word','prob']].sort('prob', ascending=False).T.to_string(index=False, header=False, float_format=lambda x: '% 4.3f' % x)
    print('='*70)




topic: 38
documents:
 Scott Ure\u0027s Clams And Garlic   0.669697
               The Best Clam Sauce  0.6393939
               Smoked Herb Chicken  0.5896552
1 (4 pound) whole chicken 3 tablespoons butter 1 tablespoon chopped fresh parsley 1 tablespoon chopped fresh oregano 1 tablespoon chopped fresh basil 1 tablespoon fresh chives, finely chopped-----1 (16 ounce) package dry linguini 1 onion, chopped 6 cloves garlic, chopped 3 tablespoons olive oil 4 (6.5 ounce) cans minced clams 1/2 cup butter salt and pepper to taste 2 tablespoons dry white wine-----50 small clams in shell, scrubbed 2 tablespoons extra virgin olive oil 6 cloves garlic, minced 1 cup white wine 2 tablespoons butter 1/2 cup chopped fresh parsley
----------------------------------------------------------------------
 parsley  white   wine  fresh parsley  white wine    dry  minced  tablespoons chopped  tablespoon chopped
   0.045  0.042  0.041          0.038       0.035  0.029   0.026                0.025              


topic: 34
documents:
 Easy Weeknight Tuna Pot Pie   0.862
  Fern\u0027s Tuna Casserole  0.8525
      One Pot Tuna Casserole    0.85
1 (16 ounce) package egg noodles 1 (10 ounce) package frozen green peas, thawed 1/4 cup butter 1 (10.75 ounce) can  condensed cream of mushroom soup 1 (6 ounce) can tuna, drained 1/4 cup milk 1 cup shredded Cheddar cheese-----1/2 (8 ounce) package egg noodles 1 (10.75 ounce) can  condensed cream of mushroom soup 3/4 cup whole milk 1 (6 ounce) can tuna 2 slices processed cheese food (such as Velveeta®) 1/2 cup crushed potato chips, or as needed-----2 tablespoons butter 1 small onion, diced 2 (6 ounce) cans tuna, drained 1 (10 ounce) package frozen mixed vegetables 1 (10.75 ounce) can  condensed cream of mushroom soup 1/2 cup shredded Cheddar cheese 1 (8 ounce) package refrigerated crescent rolls
----------------------------------------------------------------------
 condensed  cream  ounce condensed  condensed cream  mushroom  mushroom soup  ounce package 


topic: 30
documents:
          Tomatillo Salsa Verde  0.8135135
 Thai Celery Salad with Peanuts  0.8018182
        Charred Tomatillo Salsa  0.7717949
 1/2 pound tomatillos, husked, rinsed, patted dry 1/2 large white onion, cut into 8 wedges 1 jalapeño or serrano chile, halved, seeded 2 tablespoons (or more) fresh lime juice Kosher salt ----- 3 tablespoons vegetable oil 2 tablespoons fresh lime juice 2 teaspoons fish sauce 6 celery stalks, thinly sliced on a diagonal 3 thinly sliced scallions 1 thinly sliced red chile, such as Fresno 1 cup fresh cilantro leaves with tender stems 1/4 cup chopped roasted, salted peanuts ----- 1 pound quartered rinsed husked tomatillos 1/2 coarsely chopped medium onion 1 smashed garlic clove 1 coarsely chopped serrano chile (seeds removed if desired) 1/4 cup fresh cilantro leaves with tender stems Salt 
----------------------------------------------------------------------
 thinly  thinly sliced  removed  peeled  kosher  kosher salt  coarsely  leaves  cil


topic: 26
documents:
                    Pasta Siciliano  0.5553571
           Mediterranean Fried Rice  0.5440678
 Calamari with Roasted Tomato Sauce  0.5045455
 2 cups whole peeled tomatoes 2 cloves garlic, chopped 2 tablespoons fine sea salt 2 tablespoons crushed red pepper -----2 tablespoons olive oil 1 clove garlic, minced 1 1/2 cups cooked rice 1 (10 ounce) package frozen chopped spinach, thawed and drained 1 (6 ounce) jar marinated artichoke hearts, drained and quartered 1 (4 ounce) jar roasted red peppers, drained and chopped 1/2 cup crumbled feta cheese with herbs-----1 (16 ounce) package uncooked farfalle pasta 1/4 cup olive oil 3 cloves chopped garlic 1 teaspoon crushed red pepper flakes 2 tablespoons lemon juice 1/2 cup pine nuts 1 (2.25 ounce) can sliced black olives 1/2 cup chopped sun-dried tomatoes 1 cup crumbled feta cheese salt and pepper to taste
----------------------------------------------------------------------
 red pepper  flakes  pepper flakes  crushed  crush


topic: 22
documents:
       Chopstick Ready Rice       0.61
 Cucumber and Avocado Sushi  0.5923077
      Kimbop (Korean Sushi)       0.54
1 cup uncooked glutinous white rice (sushi rice) 1 1/2 cups water 1 tablespoon sesame oil salt, to taste 2 eggs, beaten 4 sheets sushi nori (dry seaweed) 1 cucumber, cut into thin strips 1 carrot, cut into thin strips 4 slices American processed cheese, cut into thin strips 4 slices cooked ham, cut into thin strips 2 teaspoons sesame oil-----1 1/4 cups water 1 cup uncooked glutinous white rice (sushi rice) 3 tablespoons rice vinegar 1 pinch salt 4 sheets nori (dry seaweed) 1/2 cucumber, sliced into thin strips 1 avocado - peeled, pitted and sliced----- 2 cups shortgrain white rice 2 cups water 
----------------------------------------------------------------------
 strips  water    cut  white  uncooked  cups water  white rice  cut strips  grain
  0.053  0.043  0.035  0.035     0.029       0.028       0.027       0.026  0.019
topic: 23
documents:
   


topic: 18
documents:
          Indian Dahl with Spinach      0.762
 Indian Chicken Curry (Murgh Kari)  0.7536082
         Rajma (Kidney Bean Curry)   0.743299
2 cups dry red kidney beans 1 large onion, chopped 4 cloves garlic, chopped 1 (2 inch) piece fresh ginger root, chopped 2 tablespoons vegetable oil 2 teaspoons ghee (clarified butter) 2 dried red chile peppers, broken into pieces 1 teaspoon cumin seeds 6 whole cloves 1 teaspoon ground turmeric 1 teaspoon ground cumin 1 teaspoon ground coriander 2 tomatoes, chopped 2 cups water 1 teaspoon white sugar salt to taste 2 teaspoons garam masala 1 teaspoon ground red pepper 1/4 cup cilantro leaves, chopped-----2 pounds skinless, boneless chicken breast halves 2 teaspoons salt 1/2 cup cooking oil 1 1/2 cups chopped onion 1 tablespoon minced garlic 1 1/2 teaspoons minced fresh ginger root 1 tablespoon curry powder 1 teaspoon ground cumin 1 teaspoon ground turmeric 1 teaspoon ground coriander 1 teaspoon cayenne pepper 1 tablespoon water 1 


 1/2 small shallot, finely chopped 6 ounces fresh blue or other lump crabmeat, picked over 1/3 cup mascarpone 1 tablespoon finely chopped fresh chives Kosher salt ----- 2 plum tomatoes, finely chopped (optional) Finely grated zest of 1 small lemon 1 garlic clove, finely chopped 3 cups arugula leaves, finely chopped 1/4 cup coarse fresh breadcrumbs 1 tablespoon finely chopped capers 1 tablespoon red wine vinegar 3/4 cup extra-virgin olive oil plus more 1/4 cup finely grated Parmesan -----1 lumpia wrappers 1 pound ground beef 1/2 pound ground pork 1/3 cup finely chopped onion 1/3 cup finely chopped green bell pepper 1/3 cup finely chopped carrot 1 quart oil for frying
----------------------------------------------------------------------
 finely chopped  cup finely  balsamic  onion finely  balsamic vinegar  vinegar  chopped fresh  tablespoons finely  grated
          0.151       0.040     0.027         0.026             0.025    0.024          0.023               0.018   0.017
topic: 15


topic: 11
documents:
   Quinoa Stuffed Peppers  0.6741379
 Superb Sausage Casserole  0.6346939
                     Tava  0.6089286
2 pounds beef stew meat, cut into 1/2 inch pieces 1 small red onion, diced 1 medium green bell pepper, seeded and diced 2 fresh tomatoes, diced 1 green chile pepper, seeded and diced (optional) 1/2 teaspoon chile paste-----6 links pork sausage 1 large red bell pepper, chopped 1 large orange bell pepper, chopped 1 large yellow bell pepper, chopped 1 large onion, diced 1/2 cup ketchup 3/4 cup hot water salt and pepper to taste-----1 cup quinoa, rinsed and drained 2 cups water 2 tablespoons olive oil 1 small onion, diced 2 cloves garlic, minced 1 zucchini, diced 1 small eggplant, diced 1 tomato, diced 1 cup tomato sauce salt and ground black pepper to taste 6 bell peppers, tops cut off and seeded 1 cup shredded mozzarella cheese, or more to taste
----------------------------------------------------------------------
 bell pepper  diced  green    red  green b


topic: 7
documents:
 Fresh Herb, Grapefruit, and Fennel Salad     0.6275
                 Caprese-Stuffed Zucchini  0.5772727
          Kale Pesto with Toasted Walnuts  0.5673913
 2 cups packed torn kale leaves, stems removed 1 cup packed fresh basil leaves 1 teaspoon sea salt 1/4 cup extra virgin olive oil 1/4 cup toasted walnuts 4 cloves garlic, chopped 1/2 cup grated Parmesan cheese -----2 large zucchinis, halved lengthwise and seeded sea salt to taste freshly ground black pepper to taste extra-virgin olive oil, or to taste 2 (1/2 inch thick) slices eggplant 1 cup cherry tomatoes, quartered 1 clove garlic, minced 1 cup fresh basil leaves, divided 1 teaspoon sea salt 1/2 teaspoon ground black pepper 3 tablespoons extra-virgin olive oil, plus more for drizzling 1 tablespoon balsamic vinegar----- 1 large ruby-red grapefruit or 3 blood oranges 2 small fresh fennel bulbs, trimmed, halved vertically, sliced paper-thin 1 cup fresh Italian parsley leaves 1/4 cup fresh mint leaves Extra-vir


2 cups rice-bread crumbs (such as PaneRiso™) 1/2 cup olive oil 1 teaspoon dried oregano 1 teaspoon dried basil 1 teaspoon dried rosemary 1 teaspoon garlic powder 2 pork tenderloins-----1 (4 pound) frozen duck 6 tablespoons olive oil 2 cups dry sherry 3 teaspoons dried oregano 3 teaspoons dried rosemary 3 teaspoons dried basil-----1 lemon 2 teaspoons dried oregano 3 cloves garlic, minced 1 tablespoon olive oil 1/4 teaspoon salt 1/4 teaspoon ground black pepper 6 chicken legs
----------------------------------------------------------------------
 teaspoon dried  oregano  dried oregano  teaspoons  dried basil  thyme  basil  dried thyme  minced
          0.090    0.049          0.040      0.027        0.024  0.024  0.024        0.022   0.022
topic: 4
documents:
                          Fried Chicken with Gravy      0.724
 Witchs Finger Bread Sticks with Maple Mustard Dip  0.7033333
                             Manicotti Pancakes II        0.7
3 eggs 1 cup milk 1 cup all-purpose flour----

topic: 0
documents:
                    Mushroom Goat Cheese Pan Sauce  0.5315789
                                Thiebaud Pink Cake  0.5315789
 Poached Seckel Pear with Pomegranate, Cabrales...  0.5229167
 3 cups (750 g/26.4 oz) whole milk 1 cup plus 1 Tbsp (250 g/8.8 oz) heavy cream 3 Tbsp plus 1 tsp (20 g/0.7 oz) Szechuan peppercorns 8 1/2 large (160 g/5.6 oz) egg yolks 3/4 cup plus 2 Tbsp (180 g/6.3 oz) granulated sugar ----- 1/2 pound (227 g) fresh strawberries 1/2 cup (4.3 oz / 120 g) water 1/4 cup (1.8 oz / 50 g) sugar ----- 1/4 cup canned low-sodium chicken broth 1/4 cup full-bodied red wine 
----------------------------------------------------------------------
 broth  chicken broth  celery    low      g  stock  sodium  cups chicken  cup chicken
 0.055          0.041   0.032  0.026  0.023  0.018   0.018         0.016        0.016
topic: 1
documents:
                            Dinengdeng  0.5261905
 Classic Smoked Sausage \u0026 Peppers  0.4745455
                            B