# Imports and configuration.

In [747]:
%matplotlib inline

In [748]:
import lda
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
import psycopg2
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import text

import common

import functions as f



In [749]:
matplotlib.style.use('ggplot')

# Load data

In [750]:
con = common.make_engine()

df_ep = pd.read_sql_table('recipes_recipe', con)
df_ep = df_ep[['title','ingredient_txt','url','image']]
df_ep['source'] = 'epicurious.com'
print('Loaded %s records from epicurious.com' % df_ep.shape[0])

df_ar = pd.read_sql_table('allrecipes', con)
df_ar = df_ar[['data-name','ingredients','url','data-imageurl']]
df_ar.columns = ['title','ingredient_txt','url','image']
df_ar['source'] = 'allrecipes.com'

print('Loaded %s records from allrecipes.com' % df_ar.shape[0])

df = pd.concat([df_ep, df_ar], ignore_index=True)

print('Loaded %s records in total' % df.shape[0])

## TODO: fix unicode here.



Loaded 17227 records in total


Loaded 4506 records from epicurious.com
Loaded 12721 records from allrecipes.com

## Examining and cleaning data

In [751]:
df = df[df['ingredient_txt'].str.len() > 20]
print df['ingredient_txt'].str.len().describe()
df['ingredient_txt'].str.len().plot(kind='hist').set_title('Ingredients character count')
df = df.reset_index()




<matplotlib.figure.Figure at 0x7fd054baf5d0>

count    17214.000000
mean       297.450854
std        127.100378
min         23.000000
25%        206.000000
50%        281.000000
75%        372.000000
max       1087.000000
Name: ingredient_txt, dtype: float64


In [752]:
## clean up quoting.
pattern = "[\"\']"
for k in ['title', 'ingredient_txt', 'url', 'image']:
    df[k] = df[k].str.replace(pattern, '')

## formatting ingredients.
df['ingredient_txt'] = df['ingredient_txt'].str.replace('\n',' ')


In [753]:
# Extract features
vectorizer = CountVectorizer(
             stop_words='english'
           , ngram_range=(1, 2)
           , token_pattern='[A-Za-z]+'
           , min_df = 10
           , max_df = 0.25
)

features = vectorizer.fit_transform(df.ingredient_txt)
## features is a document x term matrix.

wc = f.feature_counts(vectorizer, features)

wc.sort('count').tail(25).plot('word','count', kind='bar')


<matplotlib.figure.Figure at 0x7fd05ed2c150>

<matplotlib.axes._subplots.AxesSubplot at 0x7fd051ea0ad0>

In [754]:
m = lda.LDA(n_topics=40, random_state=0, n_iter=200)
m.fit(features)
print('Finished running model')


Finished running model


# Evaluating the model.
## Convergence

In [755]:
plt.plot(m.loglikelihoods_, '-')
plt.title('Loglikelihood')


<matplotlib.figure.Figure at 0x7fd05491b510>

<matplotlib.text.Text at 0x7fd051ce6890>

# Assessing topics

In [756]:
## Extracting topic data.

## most probable words by topic.
## TODO: check if these are properly sorted within each topic.
w = f.most_probable_words(m, vectorizer.get_feature_names(), 10)

## most probable documents by topic.
# np.apply_along_axis(lambda i: df.iloc[i]['title'], 1, doc_ids)

doc_ids = np.argsort(m.doc_topic_, axis=0)[-4:-1,:].T

doc_probs = np.sort(m.doc_topic_, axis=0)[-4:-1,:].T




In [757]:
## TODO: store one set of results for each run.


## massage document ids and probabilities into form suitable for database.
di = pd.DataFrame(doc_ids)
di['topic'] = di.index
di = pd.melt(di, id_vars='topic')
di.columns = ['topic','rank','recipe_key']

dp = pd.DataFrame(doc_probs)
dp['topic'] = dp.index
dp = pd.melt(dp, id_vars='topic')
dp.columns = ['topic','rank','prob']

dd = pd.merge(di, dp)
dd.to_sql('doc_prob', con, if_exists='replace')

# store recipes
df['key'] = df.index
df.to_sql('clean_recipes', con, if_exists='replace')

# store words
w.columns = ['rank','topic','word','prob']
w.to_sql('word_probs', con, if_exists='replace')


In [758]:
xx = pd.merge(df, dd, left_on='key', right_on='recipe_key', how='right')
for n, g in xx.groupby('topic'):
    print 'topic: %s' % n
    print g[['title','prob']].sort('prob').to_string()


topic: 31
                                      title      prob
24           Mushroom Goat Cheese Pan Sauce  0.394444
5                        24th Street Spritz  0.406667
109  Curried Scallops with Angel Hair Pasta  0.407813
topic: 32
                          title      prob
30  Fast and Friendly Meatballs  0.656522
48     The Perfect Basic Burger  0.686364
88             Grilled Kingfish  0.740000
topic: 33
                                             title      prob
1   Crispy Jerusalem Artichokes with Aged Balsamic  0.700000
12    Frascatelli with Pecorino and Mustard Greens  0.719565
15           Shellfish and Potatoes à la Marinière  0.751087
topic: 34
                 title      prob
96    Spicy Baked Tofu  0.660526
31  Asian Beef Skewers  0.685366
61  Garlic Ginger Tofu  0.717857
topic: 35
                                            title      prob
77                               Rich Cheese Pie   0.597297
56  Mom\u0027s Easy Creamed Chipped Beef on Toast  0.633333
78       


topic: 18
                 title      prob
50  Mango Salsa Salmon  0.617949
82     Onion Spaghetti  0.645238
81     Onion Spaghetti  0.669048
topic: 19
                           title      prob
40       Quick Brownbag Burritos  0.784375
39             Mexican Casserole  0.789286
90  Super-Simple Dorito(R) Tacos  0.789362
topic: 20
                      title      prob
62            Chicken Voila  0.560465
86  Salt and Pepper Chicken  0.569565
87  Salt and Pepper Chicken  0.569565
topic: 21
                          title      prob
66         Roast Pork Loin Chop  0.576190
100  Easy Tangy Pork Loin Chops  0.624138
101  Easy Tangy Pork Loin Chops  0.624138
topic: 22
                                            title      prob
21                             Limoncello Sparkle  0.623077
19                            Campari-Orange Pops  0.636842
27  Mackerel with Cauliflower Couscous and Tahini  0.700000
topic: 23
                                    title      prob
107  Pineapple, Black B


topic: 5
                                        title      prob
55               Moist Garlic Roasted Chicken  0.623810
114                  Peri Peri Chicken Livers  0.644000
73   The Attention-Hungry Turkey of Moistness  0.719048
topic: 6
             title      prob
91      Pluto Pups  0.850943
0   No-Knead Bread  0.855882
17         Qatayef  0.855882
topic: 7
                                   title      prob
22  Apple Torte with Breadcrumb-Hazelnut  0.578788
35                   Oven Baked BBQ Ribs  0.593182
79         Slow Cooker Potluck Spareribs  0.594444
topic: 8
                                          title      prob
111  Spence\u0027s Secret Thai Red Shrimp Curry  0.529213
7            Red Curry of Lobster and Pineapple  0.553125
60                        Shrimp Red Thai Curry  0.577500
topic: 9
                                     title      prob
80                  Grilled Cheese De Mayo  0.766667
85  Turkey Sandwiches with Cranberry Sauce  0.775000
44                 

topic: 0
                                         title      prob
99                      No Bean Veggie Burgers  0.472549
8   Red, White, and Blue Potato and Beet Chips  0.488571
26                                    Po-Tacos  0.512500
topic: 1
                           title      prob
97  Lamb Merguez Sausage Patties  0.615909
98  Lamb Merguez Sausage Patties  0.615909
58    Middle Eastern Turkey Dogs  0.628125
topic: 2
                                    title      prob
41                Chicken Kabobs Mexicana  0.517188
95  Classic Smoked Sausage \u0026 Peppers  0.519643
64                         Chicken Kabobs  0.542000
topic: 3
                                      title      prob
2   Charred Romaine with Tomatillo Dressing  0.820455
16                         Watermelon Sugar  0.831034
4                     Tomatillo Salsa Verde  0.840541
topic: 4
                                        title      prob
119                          Mushroom Sliders  0.551667
116                

In [759]:

print('='*70)
for t in range(m.n_topics):
    print('topic: %s' % t)
    print('documents:')
    print pd.DataFrame([df.iloc[doc_ids[t,:]]['title'].values, doc_probs[t,:]]).T.sort(1, ascending=False).to_string(header=False, index=False)
#    print('\n'.join(df.iloc[doc_ids[t,:]]['title']))
    print('-----'.join(df.iloc[doc_ids[t,:]]['ingredient_txt']))
    print('-'*70)
    print w[w['topic']==t][['word','prob']].sort('prob', ascending=False).T.to_string(index=False, header=False, float_format=lambda x: '% 4.3f' % x)
    print('='*70)




topic: 37
documents:
   Mom Schroer\u0027s Beef Tenderloin  0.6302326
                  Roasted Sherry Duck   0.616129
 Louise\u0027s Herbed Beef Tenderloin  0.6138889
1 (3 pound) beef tenderloin; 2 teaspoons olive oil; 2 cloves garlic, minced; 2 teaspoons dried basil; 1 1/2 teaspoons dried rosemary, crushed; 1 teaspoon sea salt; fresh ground black pepper to taste-----1 (4 pound) frozen duck; 6 tablespoons olive oil; 2 cups dry sherry; 3 teaspoons dried oregano; 3 teaspoons dried rosemary; 3 teaspoons dried basil-----3 tablespoons olive oil; 1 tablespoon lemon juice; 2 cloves garlic, minced; 2 teaspoons salt; 1 1/2 teaspoons dried oregano; 1 1/2 teaspoons dried basil; 1 teaspoon dried rosemary; 1/8 teaspoon ground black pepper; 6 pounds beef tenderloin
----------------------------------------------------------------------
 teaspoon dried  oregano  dried oregano  minced  thyme  parsley  teaspoons  dried basil  basil
          0.077    0.040          0.034   0.029  0.026    0.022      0


topic: 33
documents:
          Shellfish and Potatoes à la Marinière   0.751087
   Frascatelli with Pecorino and Mustard Greens  0.7195652
 Crispy Jerusalem Artichokes with Aged Balsamic        0.7
; 2 tablespoons olive oil; 2 pounds small Jerusalem artichokes (sunchokes), scrubbed, quartered; Kosher salt, freshly ground pepper; 4 sprigs rosemary; 1/4 cup (1/2 stick) unsalted butter; 3 tablespoons aged balsamic vinegar; -----; 2 cups semolina flour (pasta flour); Kosher salt; 1/4 cup (1/2 stick) unsalted butter; 1 bunch mustard greens (about 6 ounces), center ribs and stems removed, leaves torn into pieces (about 6 cups); Freshly ground black pepper; 1/4 cup grated Pecorino or Parmesan; -----; 1 1/4 pounds small new or baby Yukon Gold potatoes; 1 tablespoon kosher salt plus more; 2 tablespoons olive oil; 1 fennel bulb, trimmed, diced; 2 shallots, finely chopped; 4 garlic cloves, finely chopped; 3/4 cup dry white wine; 1 pound littleneck clams, scrubbed; 1 pound mussels, scrubbed, debe


topic: 29
documents:
             Lemon-Orange Orange Roughy  0.7666667
         Foil Barbecued Trout with Wine  0.7088235
 Jamaican-Seasoned Sauteed Swai Fillets  0.6714286
1 tablespoon olive oil; 2 tablespoons butter; 1 tablespoon Jamaican jerk seasoning; 1 teaspoon salt; 5 (3 ounce) fillets swai fish-----2 trout, cleaned and head removed; 1/4 cup dry white wine; 2 tablespoons butter, melted; 1 tablespoon lemon juice; 2 tablespoons chopped fresh parsley; salt and pepper to taste-----1 tablespoon olive oil; 4 (4 ounce) fillets orange roughy; 1 orange, juiced; 1 lemon, juiced; 1/2 teaspoon lemon pepper
----------------------------------------------------------------------
 juice  fillets  lemon juice  juiced  pepper taste  salmon   lime  white  chopped fresh
 0.045    0.037        0.023   0.022         0.020   0.019  0.017  0.012          0.012
topic: 30
documents:
              Green Turkey and Cheese  0.5954545
       Mild Cheesy Chicken Enchiladas  0.5552632
 Quick Chicken and Stuf


topic: 25
documents:
         Irish Boiled Dinner (Corned Beef)  0.8515152
 Chef John\u0027s Corned Beef and Cabbage   0.8410714
                    Sausage Steamed Dinner  0.8042254
2 1/2 pounds smoked sausage; 2 pounds ham hocks; 4 large potatoes, peeled and cut into large chunks; 1 rutabaga, peeled and cut into large chunks; 6 turnips, peeled and cut into large chunks; 6 carrots, peeled and cut into large chunks; 2 onions, cut into large chunks; 1 large head cabbage, quartered; 1 teaspoon salt, or to taste; 1/2 teaspoon ground black pepper, or to taste; 1 quart water-----1 (4 pound) corned beef brisket with spice packet; 3 quarts water; 1 onion, quartered; 3 carrots, cut into large chunks; 3 stalks celery, cut into 2-inch pieces; 1 teaspoon salt; 2 pounds red potatoes, halved; 1 small head cabbage, cut into eighths-----1 (5 1/2 pound) corned beef brisket; 2 large onions; 15 small white (Irish) potatoes; 10 carrots, cut into 1 inch pieces; 2 heads cabbage, cored and cut into wedges



topic: 21
documents:
 Easy Tangy Pork Loin Chops  0.6241379
 Easy Tangy Pork Loin Chops  0.6241379
       Roast Pork Loin Chop  0.5761905
4 thick cut pork chops; salt and pepper to taste; 1 large onion, peeled and sliced; 1 cup water-----2 tablespoons canola oil; 1 1/2 pounds pork tenderloin, sliced 1 1/2 inch thick; 1 cup ketchup; 1 cup cola-flavored carbonated beverage (such as Coca-Cola®); ground black pepper to taste-----2 tablespoons canola oil; 1 1/2 pounds pork tenderloin, sliced 1 1/2 inch thick; 1 cup ketchup; 1 cup cola-flavored carbonated beverage (such as Coca-Cola®); ground black pepper to taste
----------------------------------------------------------------------
 boneless  chops  roast  pepper taste  pork chops  vegetable oil  vegetable  boneless pork  pounds
    0.035  0.033  0.032         0.028       0.025          0.024      0.023          0.022   0.021
topic: 22
documents:
 Mackerel with Cauliflower Couscous and Tahini        0.7
                           Campari-


topic: 17
documents:
                  Easy Passover Lasagna  0.7965517
                 Slow Cooker Lasagna II    0.79375
 Spinach Manicotti with Italian Sausage  0.7930233
1 (9 ounce) bag fresh spinach; 1 pound bulk Italian sausage; 1 (24 ounce) carton small curd cottage cheese; 1 (12 ounce) package shredded mozzarella cheese, divided; 12 manicotti shells; 2 (24 ounce) jars spaghetti sauce-----1 (16 ounce) package lasagna noodles; 1 pound lean ground beef; 1 1/2 (26 ounce) jars spaghetti sauce; 2 cups shredded mozzarella cheese; 1/2 cup grated Parmesan cheese; 1 (8 ounce) container ricotta cheese; 2 eggs; 2 cups shredded mozzarella cheese-----28 ounces ricotta cheese; 3 eggs; 8 matzah sheets, or more as needed; 3 (32 ounce) jars marinara sauce (such as Classico® Tomato and Basil); 2 (16 ounce) packages shredded mozzarella cheese
----------------------------------------------------------------------
 mozzarella  mozzarella cheese  package  ounce package  shredded mozzarella  parmesan


topic: 13
documents:
    Chili Dog Casserole I  0.6333333
         Russian Dressing  0.5509804
 Baked Hot Dog Sandwiches  0.5484848
8 hot dogs, chopped; 2/3 cup shredded Cheddar cheese; 3 tablespoons pickle relish; 3 tablespoons ketchup; 2 teaspoons prepared mustard; 3 tablespoons chopped onion; 8 hot dog buns-----; 1 tablespoon finely chopped onion ; 1 cup purchased or homemade mayonnaise; 1/4 cup ketchup-style chili sauce or ketchup; 4 teaspoons bottled (regular; not packed in beet juice) horseradish, or to taste; 1 teaspoon hot sauce, preferably Franks Red Hot Sauce; 1 teaspoon Worcestershire sauce; 1/4 teaspoon sweet paprika; Fine sea salt; -----8 hot dog buns; 8 hot dogs; 1 (15 ounce) can chili; 1/4 cup chopped onion; 1 tablespoon prepared mustard; 2 cups shredded Cheddar cheese
----------------------------------------------------------------------
 worcestershire sauce  worcestershire    hot  dijon  dijon mustard  prepared  teaspoons  pepper sauce  ketchup
                0.048 


topic: 9
documents:
                                    BLT  0.7772727
 Turkey Sandwiches with Cranberry Sauce      0.775
                 Grilled Cheese De Mayo  0.7666667
1 tablespoon mayonnaise, divided; 2 slices white bread; 2 slices American cheese; 1 slice pepperjack cheese-----1 loaf French bread; 4 tablespoons margarine; 8 ounces sliced deli turkey meat; 8 slices provolone cheese; 8 slices precooked bacon; 4 tablespoons mayonnaise; 4 tablespoons jellied cranberry sauce; 8 slices fresh tomatoes; 4 lettuce leaves-----4 slices bacon; 2 leaves lettuce; 2 slices tomato; 2 slices bread, toasted; 1 tablespoon mayonnaise
----------------------------------------------------------------------
 bacon  bread  slices bacon  split  mayonnaise  crumbled   buns  ounces  rolls
 0.042  0.032         0.017  0.016       0.013     0.012  0.011   0.010  0.010
topic: 10
documents:
          Pot Roast in Foil  0.8115385
 Fern\u0027s Tuna Casserole     0.8025
       Marsha\u0027s Garoni  0.7434783
2 s


topic: 5
documents:
 The Attention-Hungry Turkey of Moistness  0.7190476
                 Peri Peri Chicken Livers      0.644
             Moist Garlic Roasted Chicken  0.6238095
1 (4 pound) whole chicken; salt and pepper to taste; 1 large lemon, sliced; 6 cloves garlic, sliced; 6 sprigs thyme-----2 tablespoons olive oil; 1 large onion, chopped; 1 teaspoon cayenne pepper, or to taste; 1/2 pound chicken livers, rinsed and trimmed; salt and ground black pepper to taste; 1 tablespoon brandy-----1 (18 pound) whole turkey, neck and giblets removed; 8 cups prepared stuffing; 1/2 cup softened butter; salt and pepper to taste
----------------------------------------------------------------------
 pepper taste  needed  salt ground  optional  water  cloves  salt taste  cloves garlic  pinch salt
        0.049   0.042        0.021     0.018  0.017   0.016       0.016          0.016       0.013
topic: 6
documents:
        Qatayef  0.8558824
 No-Knead Bread  0.8558824
     Pluto Pups  0.8509434
1 q


topic: 1
documents:
   Middle Eastern Turkey Dogs   0.628125
 Lamb Merguez Sausage Patties  0.6159091
 Lamb Merguez Sausage Patties  0.6159091
1 teaspoon salt; 1/4 teaspoon fennel seeds; 1 teaspoon ground cumin; 1/2 teaspoon ground cinnamon; 1/2 teaspoon ground coriander; 1/4 teaspoon ground turmeric; 3 cloves garlic, peeled; 2 tablespoons harissa, or to taste (see Ingredient note); 1 tablespoon tomato paste; 1 pound lean ground lamb; 1 tablespoon olive oil-----1 teaspoon salt; 1/4 teaspoon fennel seeds; 1 teaspoon ground cumin; 1/2 teaspoon ground cinnamon; 1/2 teaspoon ground coriander; 1/4 teaspoon ground turmeric; 3 cloves garlic, peeled; 2 tablespoons harissa, or to taste (see Ingredient note); 1 tablespoon tomato paste; 1 pound lean ground lamb; 1 tablespoon olive oil-----1 teaspoon ground cumin; 1 teaspoon ground coriander seed; 1 teaspoon ground ginger; 1 teaspoon ground cinnamon; 1 teaspoon fresh ground black pepper ; 1/4 teaspoon kosher salt; 1 pound ground turkey
----------


1 medium potato, peeled and diced; 1 small onion, diced; 1/2 cup chopped walnuts (optional); 1 cup oats; 2 cups water; 2 tablespoons vegetable oil; 1 cup bread crumbs; 2 teaspoons salt; 2 tablespoons soy sauce; 1 teaspoon dried sage; 1 tablespoon vegetable oil-----; 1/2 pound baking potatoes, such as russet or Idaho (about 2 small potatoes); 1/2 pound purple potatoes (about 4 small potatoes) ; 1/2 pound beets, peeled, and greens removed (about 2 small beets); 6 cups vegetable or canola oil; Kosher or sea salt; -----; 4 sweet or russet potatoes; 
----------------------------------------------------------------------
 potatoes  peeled  cooking  cubed  spray  cooking spray  cup diced  onion diced  large
    0.040   0.033    0.033  0.029  0.027          0.027      0.020        0.019  0.017

topic: 0
documents:
                                   Po-Tacos     0.5125
 Red, White, and Blue Potato and Beet Chips  0.4885714
                     No Bean Veggie Burgers   0.472549