In [1]:
import psycopg2
from sqlalchemy import create_engine, Column, Integer, String, Float, ForeignKey
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import relationship, sessionmaker

In [2]:
Base = declarative_base()

In [3]:
class Recipe(Base):
    __tablename__ = 'recipes'
    
    id = Column(Integer, primary_key=True)
    name = Column(String)
    url = Column(String, unique=True)
    calories = Column(Integer)
    fat = Column(Float)
    carbs = Column(Float)
    protein = Column(Float)
    cholesterol = Column(Float)
    sodium = Column(Float)
    servings = Column(Integer)
    #ingredients = 
    #__table_args__ = {'extend_existing': True}
    
    def __repr__(self):
        return "<Recipe(name='%s', url='%s')>" % (
            self.name, self.url)
    
class Ingredient(Base):
    __tablename__ = 'ingredients'
    id = Column(Integer, primary_key = True)
    ingredient = Column(String, nullable = False)
    recipe_id = Column(Integer, ForeignKey('recipes.id'))
    
    recipe = relationship(Recipe, back_populates = 'ingredients')
    #__table_args__ = {'extend_existing': True}
    def __repr__(self):
        return "<Ingredient(ingredient='%s')>" % self.ingredient

Recipe.ingredients = relationship("Ingredient", order_by=Ingredient.id, back_populates="recipe")

class Restaurant(Base):
    __tablename__ = 'restaurants'
    id = Column(Integer, primary_key=True)
    name = Column(String)
    url = Column(String)
    zomatoID = Column(Integer, unique=True)
    costfortwo = Column(Float)
    featured_image = Column(String)
    photos = Column(String)
    menu_url = Column(String)
    price_range = Column(Integer)
    latitude = Column(Float)
    longitude = Column(Float)
    address = Column(String)
    
    def __repr__(self):
        return "<Restaurant(name='%s', url='%s')>" % (
            self.name, self.url)

class MenuItem(Base):
    __tablename__ = 'menuitems'
    id = Column(Integer, primary_key=True)
    menuitem = Column(String, nullable = False)
    description = Column(String)
    restaurant_id = Column(Integer, ForeignKey('restaurants.id'))
    price = Column(String)
    
    restaurant = relationship(Restaurant, back_populates = 'menuitems')
    __table_args__ = {'extend_existing': True}
    def __repr__(self):
        return "<Restaurant(name='%s', url='%s')>" % (
            self.name, self.url)
    
Restaurant.menuitems = relationship("MenuItem", order_by=MenuItem.id, back_populates="restaurant")

In [4]:
dbname = 'restaurants'
username = 'andylane'
engine = create_engine('postgres://%s@localhost/%s'%(username,dbname))
print(engine.url)

postgres://andylane@localhost/restaurants


In [5]:
con = None
con = psycopg2.connect(database = "restaurants", user = "andylane")
Session = sessionmaker(bind=engine)
session = Session()

In [6]:
query = session.query(Recipe).filter(Recipe.name.ilike("%chicken cacciatore%"))

##Get recipe names into a vector

In [191]:
names = session.query(Recipe).all()

In [213]:
[clean_up_ingredient(i.ingredient) for i in names[3].ingredients]

[{'ingredient': u'large bananas, unpeeled, stems removed',
  'quantity': u'6',
  'unit': 'each'},
 {'ingredient': u'semisweet chocolate chips',
  'quantity': u'2',
  'unit': u'cups'},
 {'ingredient': u'(10.5 ounce) package miniature marshmallows',
  'quantity': u'1',
  'unit': 'each'}]

In [205]:
ingredient_line = names[4].ingredients[0].ingredient

In [8]:
titles=[]
for item in names:
    titles.append(item.name)

##Get ingredient lists into vector, order matching names

In [175]:
# original attempt: leave all measurements and nonalpha chars in before
# treating as bag o words
synopses = []
for item in names:
    synopses.append(" ".join([a.ingredient.encode("ascii") for a in item.ingredients]))

In [227]:
synopses = []
for item in names:
    synopses.append(" ".join([clean_up_ingredient(a.ingredient)["ingredient"] for a in item.ingredients]))

In [228]:
synopses[0:2]

[u'milk white vinegar all-purpose flour white sugar baking powder baking soda salt egg butter, melted spray',
 u'canola oil ground beef clove garlic, minced kidney beans, undrained chopped onion green bell pepper, chopped tomato sauce chili powder salt']

In [174]:
measurements = ("femtogram", "gigagram", "gram", "hectogram", "kilogram", \
                "long", "ton", "mcg", "megagram", "metric", "ton", "metric"\
                "tonne", "microgram", "milligram","nanogram", "ounce", \
                "lb", "oz", "each", "pound", "short", "Gram", "Ounce", "Pint", "Quart",\
                "Tablespoon", "Teaspoon", "Tablespoons", "Teaspoons", "Cups", "cup","Fluid Ounce", "fl oz", "Gallon", "Ounce", \
                "Pint", "Quart", "Tablespoon", "Teaspoon", "liter", "litre", "L", "ml", "fluid ounces", "can", "cans")

In [187]:
def find_measurement_words(i):
    ingredient_line = i
    for item in measurements:
        meas = str(" " + item.lower() + " ")
        a = re.search(meas, ingredient_line)
        if a > 0:
            unit = ingredient_line[a.start():a.end()].strip()
            quantity = ingredient_line[:a.start()].strip()
            ingredient = ingredient_line[a.end():].strip()
            break
        else:
            ilist = i.split(" ")
            quantity = ilist[0]
            unit = "each"
            ingredient = " ".join(ilist[1:])
    newitem = {"ingredient": ingredient,
    "unit": unit,
    "quantity":  quantity}
    try:
        fracsplit = ([float(k) for k in newitem["quantity"].split("/")])
        if len(fracsplit) >= 2:
            newitem["quantity"] = fracsplit[0] / fracsplit[1]
    except:
        None
    #print(newitem)
    return newitem

In [157]:
newitem = {"quantity": "1/2"}

In [212]:
def clean_up_ingredient(ingredient_line):
    ingredient_line = re.sub("\[u\'", "", ingredient_line)
    ingredient_line = re.sub("\']", "", ingredient_line)
    return find_measurement_words(ingredient_line)

In [172]:
for ingredient_list in synopses[:2]:
    for index, ingredient_line in enumerate(ingredient_list):
        ingredient_list[index] = re.sub("\[u\'", "", ingredient_line)
        ingredient_list[index] = re.sub("\']", "", ingredient_list[index])
        find_measurement_words(ingredient_line)

{'quantity': 0.75, 'unit': u'cup', 'ingredient': u'milk'}
{'quantity': u'2', 'unit': u'tablespoons', 'ingredient': u'white vinegar'}
{'quantity': u'1', 'unit': u'cup', 'ingredient': u'all-purpose flour'}
{'quantity': u'2', 'unit': u'tablespoons', 'ingredient': u'white sugar'}
{'quantity': u'1', 'unit': u'teaspoon', 'ingredient': u'baking powder'}
{'quantity': 0.5, 'unit': u'teaspoon', 'ingredient': u'baking soda'}
{'quantity': 0.5, 'unit': u'teaspoon', 'ingredient': u'salt'}
{'quantity': u'1', 'unit': 'each', 'ingredient': u'egg'}
{'quantity': u'2', 'unit': u'tablespoons', 'ingredient': u'butter, melted'}
{'quantity': u'cooking', 'unit': 'each', 'ingredient': u'spray'}
{'quantity': u'2', 'unit': u'tablespoons', 'ingredient': u'canola oil'}
{'quantity': u'1', 'unit': u'pound', 'ingredient': u'ground beef'}
{'quantity': 0.5, 'unit': 'each', 'ingredient': u'clove garlic, minced'}
{'quantity': u'1 (15 ounce)', 'unit': u'can', 'ingredient': u'kidney beans, undrained'}
{'quantity': 0.5, 'uni

In [216]:
calories=[]
for item in names:
    calories.append(item.calories)

http://brandonrose.org/clustering

In [218]:
import numpy as np
import pandas as pd
import nltk
import re
import os
import codecs
from sklearn import feature_extraction
import mpld3

In [219]:
stopwords = nltk.corpus.stopwords.words('english')

In [220]:
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english")

In [229]:
# here I define a tokenizer and stemmer which returns the set of stems in the text that it is passed

def tokenize_and_stem(text):
    # first tokenize by sentence, then by word to ensure that punctuation is caught as its own token
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    stems = [stemmer.stem(t) for t in filtered_tokens]
    return stems


In [230]:
def tokenize_only(text):
    # first tokenize by sentence, then by word to ensure that punctuation is caught as it's own token
    tokens = [word.lower() for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    return filtered_tokens

In [231]:
#not super pythonic, no, not at all.
#use extend so it's a big flat list of vocab
totalvocab_stemmed = []
totalvocab_tokenized = []
for i in synopses:
    allwords_stemmed = tokenize_and_stem(i) #for each item in 'synopses', tokenize/stem
    totalvocab_stemmed.extend(allwords_stemmed) #extend the 'totalvocab_stemmed' list
    
    allwords_tokenized = tokenize_only(i)
    totalvocab_tokenized.extend(allwords_tokenized)

In [232]:
vocab_frame = pd.DataFrame({'words': totalvocab_tokenized}, index = totalvocab_stemmed)
print('there are ' + str(vocab_frame.shape[0]) + ' items in vocab_frame')

there are 117798 items in vocab_frame


In [233]:
totalvocab_stemmed[0]

u'milk'

In [234]:
print(vocab_frame.head())

                  words
milk               milk
white             white
vinegar         vinegar
all-purpos  all-purpose
flour             flour


In [235]:
from sklearn.feature_extraction.text import TfidfVectorizer

#define vectorizer parameters
tfidf_vectorizer = TfidfVectorizer(max_df=0.8, max_features=200000,
                                 min_df=0.2, stop_words='english',
                                 use_idf=True, tokenizer=tokenize_and_stem, ngram_range=(1,3))

%time tfidf_matrix = tfidf_vectorizer.fit_transform(synopses) #fit the vectorizer to synopses

print(tfidf_matrix.shape)

CPU times: user 3.31 s, sys: 24 ms, total: 3.33 s
Wall time: 3.32 s
(3941, 36)


In [236]:
tokenize_and_stem(synopses[0])

[u'milk',
 u'white',
 u'vinegar',
 u'all-purpos',
 u'flour',
 u'white',
 u'sugar',
 u'bake',
 u'powder',
 u'bake',
 u'soda',
 u'salt',
 u'egg',
 u'butter',
 u'melt',
 u'spray']

In [237]:
terms = tfidf_vectorizer.get_feature_names()

In [238]:
from sklearn.metrics.pairwise import cosine_similarity
dist = 1 - cosine_similarity(tfidf_matrix)

In [239]:
from sklearn.cluster import KMeans
num_clusters = 100
km = KMeans(n_clusters=num_clusters)
%time km.fit(tfidf_matrix)
clusters = km.labels_.tolist()

CPU times: user 4.3 s, sys: 0 ns, total: 4.3 s
Wall time: 4.3 s


In [240]:
from sklearn.externals import joblib

#uncomment the below to save your model 
#since I've already run my model I am loading from the pickle

#joblib.dump(km,  'doc_cluster.pkl')

#km = joblib.load('doc_cluster.pkl')
clusters = km.labels_.tolist()

In [241]:
recipes = {'title': titles, 'synopsis': synopses,'calories': calories, 'cluster': clusters}

frame = pd.DataFrame(recipes, index = [clusters] , columns = ['title', 'cluster', 'calories'])

In [242]:
frame['cluster'].value_counts()

79    71
39    70
44    68
88    63
75    63
61    62
74    59
34    58
70    57
67    56
52    53
85    53
43    53
25    52
0     52
40    51
10    51
54    50
13    50
24    49
21    49
32    48
63    48
80    48
28    47
73    47
26    46
99    46
92    45
38    45
      ..
97    32
1     32
9     32
42    31
72    31
94    31
22    30
29    30
65    30
58    29
36    28
89    28
3     28
91    26
93    26
47    25
6     25
17    24
16    24
20    24
5     24
31    23
12    23
87    23
66    22
50    21
64    20
37    20
15    14
33    13
Name: cluster, dtype: int64

In [243]:
grouped = frame['calories'].groupby(frame['cluster']) #groupby cluster for aggregation purposes

grouped.mean() #average rank (1 to 100) per cluster

cluster
0     335.365385
1     451.281250
2     411.195122
3     358.464286
4     366.727273
5     453.916667
6     405.160000
7     490.421053
8     275.272727
9     415.968750
10    394.529412
11    439.214286
12    344.478261
13    434.340000
14    404.790698
15    479.428571
16    384.458333
17    411.083333
18    382.916667
19    428.775000
20    408.708333
21    490.306122
22    412.866667
23    532.432432
24    423.795918
25    440.269231
26    453.695652
27    379.914286
28    375.553191
29    330.400000
         ...    
70    527.771930
71    423.305556
72    488.774194
73    434.957447
74    359.593220
75    392.460317
76    416.235294
77    472.232558
78    337.837209
79    281.535211
80    467.541667
81    362.619048
82    436.702703
83    456.880952
84    402.500000
85    399.358491
86    344.333333
87    426.695652
88    370.523810
89    365.214286
90    376.564103
91    329.038462
92    393.155556
93    529.692308
94    375.838710
95    483.923077
96    439.195122
97    

In [244]:
from __future__ import print_function

print("Top terms per cluster:")
print()
#sort cluster centers by proximity to centroid
order_centroids = km.cluster_centers_.argsort()[:, ::-1] 

for i in range(num_clusters):
    print("Cluster %d words:" % i, end='')
    
    for ind in order_centroids[i, :6]: #replace 6 with n words per cluster
        print(' %s' % vocab_frame.ix[terms[ind].split(' ')].values.tolist()[0][0].encode('utf-8', 'ignore'), end=',')
    print() #add whitespace
    print() #add whitespace
    
    print("Cluster %d titles:" % i, end='')
    for title in frame.ix[i]['title'].values.tolist()[:20]:
        print(' %s,' % title, end='')
    print() #add whitespace
    print() #add whitespace
    
print()
print()

Top terms per cluster:

Cluster 0 words: ground, minced, ground, ground, black, black,

Cluster 0 titles: Baked Teriyaki Chicken, Red Pepper Potato Soup, Mushroom Meatloaf, Slow Cooker Pernil Pork, Lemon-Garlic Shrimp and Grits, Easy Slow Cooker Chicken Tikka Masala, Gourmet Pub Burgers, Slow Cooker Baby Back Ribs, Cajun Scallop Chowder, Chardonnay Rosemary Pork Roast, Italian Sausage Soup, Italian BBQ Pork Chops, Couscous-Stuffed Pork Chops, Pork Chops with Tangy Honey Sauce, Pork Chops with Apple Cider Glaze, Maple-Garlic Marinated Pork Tenderloin, Steamed Fresh Green Beans with Garlic Dill Hollandaise Sauce, Margarita Shrimp Salad from Swanson®, Parsley and Parmesan Crusted Pork Tenderloin, Southern Sweet Grilled Pork Chops,

Cluster 1 words: chicken, chicken, breasts, clove, minced, clove,

Cluster 1 titles: Spinach Stuffed Chicken Breasts, Mexican Chicken Corn Chowder, One Dish Bourbon Chicken, Easy Lemon Garlic Chicken, Mozzarella Mushroom Chicken, Chicken Piccata with Artichoke 

## Multidimensional Scaling

In [245]:
import os  # for os.path.basename
import matplotlib.pyplot as plt
import matplotlib as mpl

from sklearn.manifold import MDS

MDS()

# convert two components as we're plotting points in a two-dimensional plane
# "precomputed" because we provide a distance matrix
# we will also specify `random_state` so the plot is reproducible.
mds = MDS(n_components=2, dissimilarity="precomputed", random_state=1)

pos = mds.fit_transform(dist)  # shape (n_components, n_samples)

xs, ys = pos[:, 0], pos[:, 1]
print()
print()





In [247]:
#define custom toolbar location
class TopToolbar(mpld3.plugins.PluginBase):
    """Plugin for moving toolbar to top of figure"""

    JAVASCRIPT = """
    mpld3.register_plugin("toptoolbar", TopToolbar);
    TopToolbar.prototype = Object.create(mpld3.Plugin.prototype);
    TopToolbar.prototype.constructor = TopToolbar;
    function TopToolbar(fig, props){
        mpld3.Plugin.call(this, fig, props);
    };

    TopToolbar.prototype.draw = function(){
      // the toolbar svg doesn't exist
      // yet, so first draw it
      this.fig.toolbar.draw();

      // then change the y position to be
      // at the top of the figure
      this.fig.toolbar.toolbar.attr("x", 150);
      this.fig.toolbar.toolbar.attr("y", 400);

      // then remove the draw function,
      // so that it is not called again
      this.fig.toolbar.draw = function() {}
    }
    """
    def __init__(self):
        self.dict_ = {"type": "toptoolbar"}

In [248]:
#create data frame that has the result of the MDS plus the cluster numbers and titles
df = pd.DataFrame(dict(x=xs, y=ys, label=clusters, title=titles)) 

#group by cluster
groups = df.groupby('label')

#define custom css to format the font and to remove the axis labeling
css = """
text.mpld3-text, div.mpld3-tooltip {
  font-family:Arial, Helvetica, sans-serif;
}

g.mpld3-xaxis, g.mpld3-yaxis {
display: none; }

svg.mpld3-figure {
margin-left: -200px;}
"""

# Plot 
fig, ax = plt.subplots(figsize=(14,6)) #set plot size
ax.margins(0.03) # Optional, just adds 5% padding to the autoscaling

#iterate through groups to layer the plot
#note that I use the cluster_name and cluster_color dicts with the 'name' lookup to return the appropriate color/label
for name, group in groups:
#     points = ax.plot(group.x, group.y, marker='o', linestyle='', ms=18, 
#                      label=cluster_names[name], mec='none', 
#                      color=cluster_colors[name])
    points = ax.plot(group.x, group.y, marker='o', linestyle='', ms=5, 
                     mec='none')
    ax.set_aspect('auto')
    labels = [i for i in group.title]
    
    #set tooltip using points, labels and the already defined 'css'
    tooltip = mpld3.plugins.PointHTMLTooltip(points[0], labels,
                                       voffset=10, hoffset=10, css=css)
    #connect tooltip to fig
    mpld3.plugins.connect(fig, tooltip, TopToolbar())    
    
    #set tick marks as blank
    ax.axes.get_xaxis().set_ticks([])
    ax.axes.get_yaxis().set_ticks([])
    
    #set axis as blank
    ax.axes.get_xaxis().set_visible(False)
    ax.axes.get_yaxis().set_visible(False)

    
ax.legend(numpoints=1) #show legend with only one dot

mpld3.display() #show the plot

#uncomment the below to export to html
#html = mpld3.fig_to_html(fig)
#print(html)

In [251]:
# #strip any proper names from a text...unfortunately right now this is yanking the first word from a sentence too.
import string
def strip_proppers(text):
    # first tokenize by sentence, then by word to ensure that punctuation is caught as it's own token
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent) if word.islower()]
    return "".join([" "+i if not i.startswith("'") and i not in string.punctuation else i for i in tokens]).strip()

In [252]:
# #strip any proper nouns (NNP) or plural proper nouns (NNPS) from a text
from nltk.tag import pos_tag

def strip_proppers_POS(text):
    tagged = pos_tag(text.split()) #use NLTK's part of speech tagger
    non_propernouns = [word for word,pos in tagged if pos != 'NNP' and pos != 'NNPS']
    return non_propernouns

In [253]:
from gensim import corpora, models, similarities 

#remove proper names
%time preprocess = [strip_proppers(doc) for doc in synopses]

#tokenize
%time tokenized_text = [tokenize_and_stem(text) for text in preprocess]

#remove stop words
%time texts = [[word for word in text if word not in stopwords] for text in tokenized_text]

CPU times: user 1.01 s, sys: 0 ns, total: 1.01 s
Wall time: 1.01 s
CPU times: user 2.64 s, sys: 20 ms, total: 2.66 s
Wall time: 2.63 s
CPU times: user 436 ms, sys: 8 ms, total: 444 ms
Wall time: 430 ms


In [254]:
#create a Gensim dictionary from the texts
dictionary = corpora.Dictionary(texts)

#remove extremes (similar to the min/max df step used when creating the tf-idf matrix)
dictionary.filter_extremes(no_below=1, no_above=0.8)

#convert the dictionary to a bag of words corpus for reference
corpus = [dictionary.doc2bow(text) for text in texts]

In [255]:
%time lda = models.LdaModel(corpus, num_topics=5, id2word=dictionary, update_every=5, chunksize=10000, passes=100)

CPU times: user 11min 1s, sys: 28 ms, total: 11min 1s
Wall time: 11min 1s


In [256]:
lda.show_topics()

[(0,
  u'0.047*ground + 0.046*pepper + 0.030*dri + 0.030*black + 0.029*pound + 0.028*salt + 0.027*garlic + 0.027*tast + 0.025*pork + 0.022*chop'),
 (1,
  u'0.047*chop + 0.042*fresh + 0.042*pepper + 0.031*oliv + 0.029*oil + 0.029*tast + 0.028*garlic + 0.026*ounc + 0.022*chees + 0.020*minc'),
 (2,
  u'0.043*chop + 0.036*pepper + 0.031*onion + 0.027*cut + 0.025*sauc + 0.021*slice + 0.020*oil + 0.020*ground + 0.019*green + 0.018*garlic'),
 (3,
  u'0.052*sugar + 0.051*egg + 0.040*white + 0.037*flour + 0.035*ground + 0.033*all-purpos + 0.031*butter + 0.027*salt + 0.026*milk + 0.025*bake'),
 (4,
  u'0.057*ounc + 0.053*chicken + 0.039*breast + 0.036*chees + 0.036*boneless + 0.035*skinless + 0.031*cream + 0.030*packag + 0.030*halv + 0.024*slice')]

In [107]:
topics_matrix = lda.show_topics(formatted=False, num_words=20)
topics_matrix = np.array(topics_matrix)

topic_words = topics_matrix[:,:,1]
for i in topic_words:
    print([str(word) for word in i])
    print()

ValueError: setting an array element with a sequence

In [257]:
topics_matrix

NameError: name 'topics_matrix' is not defined