# Imports and configuration.

In [1]:
%matplotlib inline

In [9]:
import lda
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
import psycopg2
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import text


import functions as f

In [3]:
matplotlib.style.use('ggplot')

# Load data

In [6]:
con = psycopg2.connect("host='localhost' dbname='explore' user='explore' password='Ln2bOYAVCG6utNUSaSZaIVMH'")
cursor = con.cursor()

cursor.execute('select category, count(*) from recipes_recipe group by category')

df = pd.read_sql_query('select * from recipes_recipe', con=con)
print('Loaded %s records' % df.shape[0])


Loaded 4506 records


# Extract features

In [551]:
vectorizer = CountVectorizer(
             stop_words=f.get_stop_words()
           , ngram_range=(1, 1)
           , token_pattern='[A-Za-z]+'
)

features = vectorizer.fit_transform(df.ingredient_txt)
## features is a document x term matrix.

wc = f.feature_counts(vectorizer, features)

wc.sort('count').tail(25).plot('word','count', kind='bar')


<matplotlib.figure.Figure at 0x7f17f4085c90>

<matplotlib.axes._subplots.AxesSubplot at 0x7f17f41a3b10>

In [553]:
m = lda.LDA(n_topics=40, random_state=0, n_iter=100)
m.fit(features)
print('Finished running model')


Finished running model




# Evaluating the model.
## Convergence

In [554]:
plt.plot(m.loglikelihoods_, '-')
plt.title('Loglikelihood')


<matplotlib.figure.Figure at 0x7f17ee841a50>

<matplotlib.text.Text at 0x7f17ee807850>

# Assessing topics

In [555]:
## Extracting topic data.

## most probable words by topic.
## TODO: check if these are properly sorted within each topic.
w = f.most_probable_words(m, vectorizer.get_feature_names(), 10)

## most probable documents by topic.
np.apply_along_axis(lambda i: df.iloc[i]['title'], 1, doc_ids)

doc_ids = np.argsort(m.doc_topic_, axis=0)[-4:-1,:].T

print('='*70)
for t in range(m.n_topics):
    print('topic: %s' % t)
    print('documents:')
    print('\n'.join(df.iloc[doc_ids[t,:]]['title']))
    print('-----'.join(df.iloc[doc_ids[t,:]]['ingredient_txt']))
    print('-'*70)
    print(', '.join(w[w['label']==t]['word']))
    print('='*70)




1 pineapple 
4 cups water 
6 cups superfine sugar
-----
3 cups 2% plain organic Greek yogurt
Granulated stevia extract equivalent to 1/2 cup sugar
2 tablespoons coconut milk
1 teaspoon ground cinnamon
1 teaspoon coconut extract
1/2 cup unsweetened shredded coconut
-----
Vegetable oil for brushing pan
About 1 cup confectioners' sugar for coating pan and marshmallows
3 tablespoons unsweetened cocoa powder
3 tablespoons malted milk powder
1 cup plus 3 tablespoons light corn syrup
3 (1/4-ounce) envelopes powdered unflavored gelatin
1 1/2 cups granulated sugar
1/4 teaspoon salt

----------------------------------------------------------------------
syrup, water, powder, sugar, tablespoons, unsweetened, cups, milk, coconut
topic: 23
documents:
Butterfinger Truffles
Chocolate Truffles
Frozen Chocolate-Peppermint Bùche de Noël

10 ounces semisweet chocolate (do not exceed 61% cacao), chopped
1 tablespoon unsalted butter
1 cup heavy cream
1 1/2 cups chopped Butterfinger candy bars (about 8 ou

topic: 0
documents:
Melon Lime Slushy
Mint Caipirinha Ice Pops
Tangerine Soufflé with Citrus Coulis

6 cups diced honeydew, cantaloupe or watermelon 
1/4 cup fresh lime juice, plus more to taste 
1/2 cup sugar, plus more to taste 
1 teaspoon fresh thyme leaves, plus sprigs for garnish 
1/2 cup silver tequila or mezcal (optional)
-----
1 cup mint leaves
3/4 cup fresh lime juice
1 cup water
3/4 cup superfine granulated sugar
1/3 cup cachaça
-----
6 tangerines, peeled
1/4 cup fresh tangerine juice
3 tablespoons sugar
1 tablespoon (or more) fresh lemon juice

----------------------------------------------------------------------
lime, tablespoon, plus, sugar, lemon, tablespoons, cups, juice, cup
topic: 1
documents:
Cherry Vodka
Bagels
Apple Crisp

1 1/8 lb/510 g fresh sour cherries (or black currants or jagody)
25 oz/750 ml clear vodka
1 to 2 tbsp sugar (optional)
-----
1 tablespoon (0.75 oz / 21 g) barley malt syrup, honey, or rice syrup, or 1 teaspoon (0.25 oz / 7 g) diastatic malt powde