# Imports and configuration.

In [1]:
%matplotlib inline

In [9]:
import lda
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
import psycopg2
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import text


import functions as f

In [3]:
matplotlib.style.use('ggplot')

# Load data

In [6]:
con = psycopg2.connect("host='localhost' dbname='explore' user='explore' password='Ln2bOYAVCG6utNUSaSZaIVMH'")
cursor = con.cursor()

cursor.execute('select category, count(*) from recipes_recipe group by category')

df = pd.read_sql_query('select * from recipes_recipe', con=con)
print('Loaded %s records' % df.shape[0])


Loaded 4506 records


# Extract features

In [338]:
vectorizer = CountVectorizer(
             stop_words='english'
           , ngram_range=(2, 2)
           , token_pattern='[A-Za-z]+'
)

features = vectorizer.fit_transform(df.ingredient_txt)
## features is a document x term matrix.

wc = f.feature_counts(vectorizer, features)

wc.sort('count').tail(25).plot('word','count', kind='bar')


<matplotlib.figure.Figure at 0x7f17f42e6290>

<matplotlib.axes._subplots.AxesSubplot at 0x7f17f43f79d0>

In [339]:
m = lda.LDA(n_topics=40, random_state=0, n_iter=100)
m.fit(features)
print('Finished running model')


Finished running model




# Evaluating the model.
## Convergence

In [340]:
plt.plot(m.loglikelihoods_, '-')
plt.title('Loglikelihood')


<matplotlib.figure.Figure at 0x7f17f43a7c10>

<matplotlib.text.Text at 0x7f17eeb31dd0>

# Assessing topics

In [542]:
## Extracting topic data.

## most probable words by topic.
## TODO: check if these are properly sorted within each topic.
w = f.most_probable_words(m, vectorizer.get_feature_names(), 10)

## most probable documents by topic.
np.apply_along_axis(lambda i: df.iloc[i]['title'], 1, doc_ids)

doc_ids = np.argsort(m.doc_topic_, axis=0)[-4:-1,:].T

print('='*70)
for t in range(m.n_topics):
    print('topic: %s' % t)
    print('documents:')
    print('\n'.join(df.iloc[doc_ids[t,:]]['title']))
    print('-----'.join(df.iloc[doc_ids[t,:]]['ingredient_txt']))
    print('-'*70)
    print(', '.join(w[w['label']==t]['word']))
    print('='*70)




1/2 cup reduced-sodium soy sauce
1/3 cup finely grated Asian pear with juices
2 scallions, thinly sliced
2 garlic cloves, minced
1 tablespoon raw or brown sugar
2 teaspoons grated peeled ginger
1 pound thinly sliced (1/8") boneless beef rib-eye steak or short ribs
-----
8 garlic cloves, peeled, crushed
1 2" piece ginger, peeled, sliced
1/2 cup dry sake
1/2 cup gochujang (Korean hot pepper paste)
1/2 cup mirin (sweet Japanese rice wine)
1/4 cup vegetable oil, plus more for grilling
1 1/2 pounds skinless, boneless pork shoulder (Boston butt), sliced 3/4" thick
-----
1 1/2 cups Napa cabbage kimchi (a 12-ounce jar; do not drain), chopped
2 tablespoons gochujang (Korean hot pepper paste)

----------------------------------------------------------------------
thinly sliced, finely grated, garlic cloves, sodium soy, grated peeled, tablespoons vegetable, cup vegetable, peeled ginger, soy sauce
topic: 23
documents:
Homemade Fresh Chorizo
Bacon-Molasses Breakfast Sausage
Braised Brisket with B

topic: 0
documents:
Roasted Fingerling Potatoes with Chive Pesto
Fava Bean and Pea Salad with Poppy Seed Dressing
Green Garlic and Pea Soup with Whipped Cream

1 3/4 pounds fingerling potatoes, halved lengthwise
1 tablespoon plus 1/2 cup extra-virgin olive oil
Kosher salt and freshly ground black pepper
1/2 cup (packed) chopped fresh chives, plus more for garnish
1/2 cup (packed) chopped fresh flat-leaf parsley
2 tablespoons slivered almonds, chopped walnuts, or pine nuts
1 garlic clove 
2 teaspoons fresh lemon juice
-----
2 cups shelled fava beans (from about 2 pounds pods) or frozen fava beans, thawed
Kosher salt
2 cups shelled fresh peas (from about 2 pounds pods) or frozen peas, thawed
1/3 cup sour cream
1 1/2 ounces fresh goat cheese, crumbled
2 tablespoons (or more) buttermilk or whole milk
1 tablespoon fresh lemon juice
Freshly ground black pepper
2 heads of Bibb lettuce, leaves separated (about 8 cups)
4 cups pea shoots (tendrils)
1 teaspoon poppy seeds
-----
2 bunches green ga