# Imports and configuration.

In [19]:
%matplotlib inline

In [648]:
import lda
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
import psycopg2
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import text

import common

import functions as f



In [649]:
matplotlib.style.use('ggplot')

# Load data

In [650]:
con = common.make_engine()

df_ep = pd.read_sql_table('recipes_recipe', con)
df_ep = df_ep[['title','ingredient_txt','url','image']]
df_ep['source'] = 'epicurious.com'
print('Loaded %s records from epicurious.com' % df_ep.shape[0])

df_ar = pd.read_sql_table('allrecipes', con)
df_ar = df_ar[['data-name','ingredients','url','data-imageurl']]
df_ar.columns = ['title','ingredient_txt','url','image']
df_ar['source'] = 'allrecipes.com'

print('Loaded %s records from allrecipes.com' % df_ar.shape[0])

df = pd.concat([df_ep, df_ar], ignore_index=True)

print('Loaded %s records in total' % df.shape[0])

## TODO: fix unicode here.



Loaded 17227 records in total


Loaded 4506 records from epicurious.com
Loaded 12721 records from allrecipes.com

## Examining and cleaning data

In [710]:
df = df[df['ingredient_txt'].str.len() > 20]
print df['ingredient_txt'].str.len().describe()
df['ingredient_txt'].str.len().plot(kind='hist').set_title('Ingredients character count')
df = df.reset_index()


<matplotlib.figure.Figure at 0x7fd05859c6d0>

count    17214.000000
mean       297.450854
std        127.100378
min         23.000000
25%        206.000000
50%        281.000000
75%        372.000000
max       1087.000000
Name: ingredient_txt, dtype: float64


In [711]:
# Extract features
vectorizer = CountVectorizer(
             stop_words='english'
           , ngram_range=(1, 2)
           , token_pattern='[A-Za-z]+'
           , min_df = 10
           , max_df = 0.25
)

features = vectorizer.fit_transform(df.ingredient_txt)
## features is a document x term matrix.

wc = f.feature_counts(vectorizer, features)

wc.sort('count').tail(25).plot('word','count', kind='bar')


<matplotlib.figure.Figure at 0x7fd0596933d0>

<matplotlib.axes._subplots.AxesSubplot at 0x7fd054a90650>

In [712]:
m = lda.LDA(n_topics=40, random_state=0, n_iter=200)
m.fit(features)
print('Finished running model')


Finished running model


# Evaluating the model.
## Convergence

In [713]:
plt.plot(m.loglikelihoods_, '-')
plt.title('Loglikelihood')


<matplotlib.figure.Figure at 0x7fd054d0b450>

<matplotlib.text.Text at 0x7fd054b35810>

# Assessing topics

In [715]:
## Extracting topic data.

## most probable words by topic.
## TODO: check if these are properly sorted within each topic.
w = f.most_probable_words(m, vectorizer.get_feature_names(), 10)

## most probable documents by topic.
# np.apply_along_axis(lambda i: df.iloc[i]['title'], 1, doc_ids)

doc_ids = np.argsort(m.doc_topic_, axis=0)[-4:-1,:].T

doc_probs = np.sort(m.doc_topic_, axis=0)[-4:-1,:].T




In [716]:
## TODO: store one set of results for each run.


## massage document ids and probabilities into form suitable for database.
di = pd.DataFrame(doc_ids)
di['topic'] = di.index
di = pd.melt(di, id_vars='topic')
di.columns = ['topic','rank','recipe_key']

dp = pd.DataFrame(doc_probs)
dp['topic'] = dp.index
dp = pd.melt(dp, id_vars='topic')
dp.columns = ['topic','rank','prob']

dd = pd.merge(di, dp)
dd.to_sql('doc_prob', con, if_exists='replace')

# store recipes
df['key'] = df.index
df.to_sql('clean_recipes', con, if_exists='replace')

# store words
w.columns = ['rank','topic','word','prob']
w.to_sql('word_probs', con, if_exists='replace')


In [657]:
xx = pd.merge(df, dd, left_on='key', right_on='recipe_key', how='right')
for n, g in xx.groupby('topic'):
    print 'topic: %s' % n
    print g[['title','prob']].sort('prob').to_string()


topic: 37
                          title      prob
75      "Honey Glazed Scallops"  0.596296
96         "Apricot Glazed Ham"  0.596875
27  Grandma Douglas's Schnecken  0.613043
topic: 38
                                               title      prob
5                         Pan-Seared Pork Blade Chop  0.705556
9                                Garlicky Mayonnaise  0.742105
14  Jerusalem Artichoke and Artichoke Heart Linguine  0.743103
topic: 39
                                             title      prob
6   Shaved Broccoli Stalk Salad with Lime & Cotija  0.675510
8                                Peaches in Lillet  0.679032
28                                 Butter Crackers  0.742105



topic: 24
                                   title      prob
2                        Berries and Rum  0.831034
3                 Strawberries and Vodka  0.831034
101  "Black Bean and Soy Veggie Burgers"  0.871053
topic: 25
                                  title      prob
49                    "Accidental Fish"  0.532099
30         "Chicken Noodle Casserole I"  0.562000
112  "Tasty Turkey Meatloaf With Sauce"  0.593878
topic: 26
                            title      prob
92    "Meaty Barbeque Sandwiches"  0.673333
93                "Pesto Zoodles"  0.673333
98  "Delicious Spinach Manicotti"  0.743478
topic: 27
                              title      prob
41     "Sour Cream Chicken Paprika"  0.455000
51  "Creamy Zucchini with Linguine"  0.462162
7                  Kippers and Bits  0.482609
topic: 28
                                       title      prob
104       "Braised Cabernet Beef Short Ribs"  0.796552
94          "Restaurant-Style Fried Chicken"  0.830435
64   "Grilled Chicke


topic: 13
                                         title      prob
119  "Frank Zappa\u0027s Breakfast Club Pasta"  0.528571
95               "Tuna Cheese Whirl Casserole"  0.545833
18                     Indian Clarified Butter  0.555000
topic: 14
                                                 title      prob
111  "Croatian Dalmatian Pot Roast (Dalmatinska Pas...  0.504167
78                        "Cranberry Glazed Pork Ribs"  0.526087
36                          "Creamy Chipotle Chicken "  0.580769
topic: 15
                                                 title      prob
110              "Quick and Easy Spaghetti and Spices"  0.688571
99   "Grilled Pattypan Squash with Hot Chorizo Vina...  0.689474
73                              "Garlic Pepper Seitan"  0.709756
topic: 16
                              title      prob
17      Sorghum-Glazed Baby Carrots  0.575000
20                      Salsa Verde  0.587805
4   Spring Pea And Pea Shoot Omelet  0.598077
topic: 17
                 


topic: 1
                                    title      prob
50         "Grilled Lemon Yogurt Chicken"  0.720513
102  "Tacos in Pasta Shells with Veggies"  0.721875
58                  "Never Fail Meatloaf"  0.745161
topic: 2
                                         title      prob
83                 "Glazed Grilled Pork Chops"  0.471875
113  "Oriental Tacos with Black and Blue Tuna"  0.524000
106             "Sweet and Spicy Lime Chicken"  0.526087
topic: 3
                                      title      prob
82  "Grandma Kay\u0027s Chicken Enchiladas"  0.641667
84    "Chicken and Asparagus in Cream Soup"  0.641667
37                         "Chile Verde II"  0.682143
topic: 4
                                            title      prob
56              "Chicken Meatballs and Spaghetti"  0.420833
23                 Ready for Guests Roasted Pears  0.421739
0   Red Rice Salad with Pecans, Fennel, and Herbs  0.435000
topic: 5
                                      title      prob
55  "Fir

topic: 0
                           title      prob
40  "Corned Beef and Cabbage II"  0.678378
79      "Pork Tenderloin Canton"  0.678378
34               "Greek Chicken"  0.686364

In [658]:

print('='*70)
for t in range(m.n_topics):
    print('topic: %s' % t)
    print('documents:')
    print pd.DataFrame([df.iloc[doc_ids[t,:]]['title'].values, doc_probs[t,:]]).T.sort(1, ascending=False).to_string(header=False, index=False)
#    print('\n'.join(df.iloc[doc_ids[t,:]]['title']))
    print('-----'.join(df.iloc[doc_ids[t,:]]['ingredient_txt']))
    print('-'*70)
    print w[w['topic']==t][['word','prob']].sort('prob', ascending=False).T.to_string(index=False, header=False, float_format=lambda x: '% 4.3f' % x)
    print('='*70)




topic: 37
documents:
   Tomato, Corn, and Avocado Salsa  0.6130435
 "Suegra\u0027s Tomatillo Chicken"   0.596875
            "Lengua (Beef Tongue)"  0.5962963
1 beef tongue
1 large onion, chopped, divided
2 cloves garlic
3 tablespoons salt
1 whole jalapeno pepper, stemmed
3 whole tomatoes
2 whole jalapeno peppers, stemmed
2 tablespoons vegetable oil-----1 (3 1/2) pound whole chicken, cut into 6 pieces
1 pound fresh tomatillos, husks removed
2 dried California chile pods
3 dried red chile peppers
2 tablespoons olive oil
salt to taste-----
Ripe tomatoes
Raw corn kernels
Diced avocado
Chopped cilantro
Fresh lime juice
Minced serrano chile
Salt

----------------------------------------------------------------------
  lime  fresh cilantro  lime juice  seeded  juice  chopped fresh  chile  peeled  peppers
 0.042           0.029       0.025   0.025  0.024          0.023  0.022   0.018    0.018
topic: 38
documents:
 Slow-Roasted Green Beans with Sage  0.7431034
              Chicken Under a Br


----------------------------------------------------------------------
   soy  sesame  sugar   rice  white  green  vegetable  vegetable oil  sesame oil
 0.028   0.022  0.020  0.019  0.018  0.017      0.017          0.016       0.014
topic: 34
documents:
 "Margarita Beef Skewers"  0.5157143
             "Dinengdeng"  0.4785714
   "Rabbit Italian Style"       0.45
1 (2 pound) rabbit, cut into small pieces
1/4 pound salt pork
1 (750 milliliter) bottle sweet sherry-----2 (8 ounce) fillets milkfish (bangus)
1 tomato, quartered
1 onion, chopped
2 tablespoons shrimp paste (bagoong)
1 cup water
salt and pepper to taste
1/2 pound long beans, cut into bite-size pieces
1/2 pound zucchini, cut into bite-size pieces
1/2 pound fresh okra-----1 cup margarita mix
1/2 teaspoon salt
1 tablespoon white sugar
2 cloves garlic, minced
1/4 cup vegetable oil
1 pound top sirloin steak, cut into 1 1/2-inch cubes
8 metal skewers, or bamboo skewers soaked in water for 30 minutes
16 mushrooms, stems trimmed
1 oni


topic: 30
documents:
         Petits Farcis  0.5954545
 "Pepper Steak Packet"  0.5705882
    "Mushroom Sliders"  0.5683333
1 pound lean ground beef, or more to taste
1 large egg
1 small onion, finely chopped
1 cup finely chopped mushrooms
1 teaspoon ground black pepper
1/2 teaspoon garlic salt
salt to taste
6 portobello mushrooms, or more to taste
1 green bell pepper, halved and seeded
1 red bell pepper, halved and seeded
1 yellow bell pepper, halved and seeded-----1 (1/2 pound) sirloin steak, cut into bite size strips
1 red bell pepper, chopped
1 green bell pepper, chopped
1 yellow bell pepper, chopped
1 sweet onion, chopped
1 pint cherry tomatoes
1 zucchini, chopped
2 tablespoons butter
1/4 cup steak sauce-----
4 small new onions, with tops attached 
4 small pattypan squashes
4 small tomatoes
4 small eggplants
4 bell peppers
4 small zucchini

----------------------------------------------------------------------
 bell pepper  green    red  green bell  red bell  cup chopped  pepper c


4 (10 ounce) 1 1/2 inch thick rib-eye steaks
1 cup single malt Scotch whiskey-----4 (10 ounce) 1 1/2 inch thick rib-eye steaks
1 cup single malt Scotch whiskey-----4 smoked turkey legs
4 cups hot water
2 teaspoons chicken bouillon granules
2 cups barbeque sauce
----------------------------------------------------------------------
 water  fluid  pepper sauce  bottle  hot pepper  bouillon  ounce bottle  fluid ounce  pineapple
 0.030  0.025         0.023   0.023       0.023     0.023         0.021        0.019      0.018
topic: 27
documents:
      Calamari with Roasted Tomato Sauce  0.4826087
                      "Garlic Crab Legs"  0.4621622
 "Baked Tilapia in Garlic and Olive Oil"      0.455
4 (4 ounce) fillets tilapia
4 cloves crushed garlic
3 tablespoons olive oil
1 onion, chopped
1/4 teaspoon cayenne pepper-----3 1/2 pounds Alaskan king crab legs with shell
6 ears fresh corn
1 1/2 cups butter
3 teaspoons minced garlic
1/8 teaspoon crushed red pepper flakes
1 teaspoon Old Bay Seaso


topic: 23
documents:
                            "Tuscan Chicken"  0.5305556
                       "Calico Veggie Beans"  0.5109091
 "Gracie\u0027s Amazing Vegetarian Burritos"  0.4878378
1 tablespoon olive oil
1 onion, chopped
1 tablespoon minced garlic
1 tablespoon chili powder, divided
2 teaspoons ground black pepper, divided
2 teaspoons cayenne pepper, divided
1 teaspoon ground cumin
1 cup canned pinto beans, rinsed and drained
1 cup canned black beans, rinsed and drained
1 cup frozen corn, thawed, drained
1 cup cooked brown rice
6 burrito-size flour tortillas-----1 (15 ounce) can black beans
1 (15 ounce) can kidney beans, drained
1 (15 ounce) can butter beans
1 (15 ounce) can lima beans, drained
1 cup packed brown sugar
1 cup chopped onion
1/2 cup fresh tomato sauce
1/2 cup chopped celery
3 tablespoons white wine vinegar
1 teaspoon mustard powder-----4 skinless, boneless chicken breast halves
1 (4 ounce) can sliced black olives, drained
1 (15 ounce) can cannellini beans, rinsed 


1/2 cup wine
1/2 cup soy sauce
1/4 cup olive oil
1/4 cup brown sugar
1/4 cup grated fresh ginger root
2 cloves garlic, crushed
1 teaspoon ground black pepper
1 1/2 pounds beef flank steak-----1/4 cup soy sauce
2 tablespoons olive oil
1 tablespoon dry sherry
1 tablespoon orange juice
1 teaspoon minced garlic
1 teaspoon minced fresh ginger root
2 teaspoons Chinese five-spice powder
1 1/2 pounds skinless, boneless chicken breast, thinly sliced-----3 tablespoons hoisin sauce
3 tablespoons sherry
1/4 cup soy sauce
1 teaspoon barbeque sauce
2 green onions, chopped
2 cloves garlic, minced
1 tablespoon minced fresh ginger root
1 1/2 pounds flank steak
skewers
----------------------------------------------------------------------
 minced garlic    soy  soy sauce  minced fresh  teaspoons  ginger  tablespoon minced  teaspoon minced  tablespoons minced
         0.045  0.032      0.030         0.028      0.028   0.024              0.024            0.021               0.021
topic: 20
documents:
  C


topic: 16
documents:
 Spring Pea And Pea Shoot Omelet  0.5980769
                    Cacio e Pepe  0.5878049
 Corn Griddle Cakes with Sausage      0.575

1/2 cup (1 stick) unsalted butter
1/2 cup honey
1 1/2 tablespoons finely grated orange zest
Kosher salt, freshly ground pepper
-----
Kosher salt
6 ounces pasta (such as egg tagliolini, bucatini, or spaghetti)
3 tablespoons unsalted butter, cubed, divided
1 teaspoon freshly cracked black pepper
3/4 cup finely grated Grana Padano or Parmesan
1/3 cup finely grated Pecorino
-----
1 teaspoon unsalted butter 
2 large eggs, whisked
Fine sea salt
Freshly ground black pepper
1/4 cup pea shoots plus more for garnish
1 rounded tablespoon fresh shelled peas plus more for garnish
1 tablespoon goat cheese, crumbled
1 teaspoon finely chopped fresh chives, plus more for garnish 
1/2 teaspoon fresh thyme leaves

----------------------------------------------------------------------
 finely chopped  grated  freshly    sea  sea salt  freshly ground  fi


1 tablespoon olive oil
2 tablespoons butter
1 tablespoon Jamaican jerk seasoning
1 teaspoon salt
5 (3 ounce) fillets swai fish-----3 1/2 gallons peanut oil for frying
1 (10 pound) whole turkey, neck and giblets removed
1 tablespoon salt, or to taste
1 tablespoon ground black pepper, or to taste-----
2 cups Indian parboiled rice, Uncle Ben's Converted Rice, or long-grain white rice

----------------------------------------------------------------------
 fillets  white  water  uncooked  white rice  salmon  pepper taste  cups water  grain
   0.045  0.033  0.028     0.026       0.025   0.023         0.021       0.021  0.017
topic: 14
documents:
       "Spicy Lime Grilled Shrimp"  0.5807692
 "Bubble \u0027n\u0027 Squeak II "   0.526087
           "Garlic Grilled Shrimp"  0.5041667
4 skewers
4 cloves garlic
kosher salt to taste
1/4 cup olive oil
1 pound large shrimp, peeled and deveined
1/4 teaspoon ground black pepper, or to taste-----1 (16 ounce) package farfalle (bow tie) pasta
1 medium 



2 slices of whole-grain bread
1 ounce turkey bacon
2 lettuce leaves
2 slices tomato
1/4 sliced avocado
1 tablespoon mayo
A medium apple
-----1 tablespoon mayonnaise, divided
2 slices white bread
2 slices American cheese
1 slice pepperjack cheese-----4 slices bacon
2 leaves lettuce
2 slices tomato
2 slices bread, toasted
1 tablespoon mayonnaise
----------------------------------------------------------------------
 bacon  bread  split  slices bacon   buns  mayonnaise  hamburger  tomato  hamburger buns
 0.039  0.031  0.018         0.017  0.016       0.016      0.012   0.012           0.011
topic: 11
documents:
     "Easy Baked Chicken"  0.5761905
 "Rachael\u0027s Chicken"   0.526087
 "Salsa Simmered Chicken"  0.5222222
6 skinless, boneless chicken breast halves
salt and ground black pepper to taste
2 tablespoons olive oil
1 (15 ounce) jar mild picante salsa
1 (14 ounce) can chicken broth-----2 skinless, boneless chicken breast halves
1/4 cup butter
1/2 teaspoon dried thyme
1 (14.5 ounc


2 (10 inch) flour tortillas
1 cup shredded Cheddar cheese-----1 (16 ounce) can refried beans
3/4 onion, diced
5 (10 inch) flour tortillas
1 cup salsa
2 cups shredded Cheddar or Colby Jack cheese-----1 (16 ounce) container sour cream
12 ounces shredded Monterey Jack cheese
1 (20 ounce) can green enchilada sauce
18 (6 inch) corn tortillas
1 (2 ounce) can chopped black olives
----------------------------------------------------------------------
 cheddar  cheddar cheese  cup shredded  shredded cheddar  cups shredded   inch  tortillas  green   jack
   0.057           0.054         0.036             0.034          0.021  0.016      0.016  0.016  0.014
topic: 8
documents:
 "German Spaghettini"  0.5911765
     "Simple Golabki"  0.5323529
        "Slumgullion"  0.5139535
1 (16 ounce) package elbow macaroni
1 pound lean ground beef
1 large onion, chopped
2 cups tomato sauce
2 (4.5 ounce) cans mushrooms, drained
2 teaspoons minced garlic
salt and pepper to taste
1 (14.5 ounce) can stewed tomato


topic: 4
documents:
    Red Rice Salad with Pecans, Fennel, and Herbs      0.435
 Linguine with Bay Scallops, Fennel, and Tomatoes  0.4217391
                               "Bum\u0027s Lunch"  0.4208333
4 (4 ounce) cube steaks
4 medium potatoes, thinly sliced 
1 large onion, thinly sliced
4 teaspoons margarine
salt and pepper to taste-----
8 ounces linguine
3 tablespoons extra-virgin olive oil, divided
1 medium fennel bulb, halved, very thinly sliced, plus 1 tablespoon chopped fennel fronds
1 medium onion, halved, thinly sliced
1 pound bay scallops
1 6-ounce container cherry tomatoes, halved if large
1 tablespoon Pernod or other anise-flavored liqueur
4 tablespoons chopped fresh parsley, divided
1 lemon, cut into 4 wedges
-----
1 cup red rice
1 small fennel bulb, very thinly sliced
1/4 medium red onion, thinly sliced
3 tablespoons fresh lime juice, divided
2/3 cup pecans, divided
1/4 cup olive oil
1/2 cup cilantro leaves and finely chopped tender stems
Kosher salt, freshly ground pepp


3 cups cooked, finely chopped chicken meat
1 1/2 cups seasoned dry bread crumbs
2 eggs, lightly beaten
2 cups sauteed chopped onion
1 tablespoon chopped fresh parsley
1 teaspoon salt
1/2 teaspoon ground black pepper-----1 pound ground pork sausage
4 eggs, beaten
1 (15 ounce) can cream-style corn
1 cup soft bread crumbs
1/4 teaspoon ground black pepper
1/3 cup cracker crumbs
2 tablespoons chopped fresh parsley-----1 (14.75 ounce) can canned salmon
1 egg
1/4 cup chopped onion
1/2 cup seasoned dry bread crumbs
1 tablespoon olive oil
----------------------------------------------------------------------
 crumbs  bread crumbs  beaten    egg   eggs    dry  dry bread  eggs beaten  crumbs cup
  0.056         0.052   0.045  0.041  0.033  0.021      0.019        0.017       0.015
topic: 1
documents:
                           "Steamed Mussels II"  0.7451613
 "Paleo Roasted Whitefish with Leeks and Bacon"   0.721875
                            "Mussels Mariniere"  0.7205128
4 quarts mussels, cle

topic: 0
documents:
            "Salmon Patties I"  0.6863636
 "Leftover Chicken Croquettes"  0.6783784
  "Sausage and Corn Casserole"  0.6783784