In [1]:
import psycopg2
from sqlalchemy import create_engine, Column, Integer, String, Float, ForeignKey
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import relationship, sessionmaker

In [2]:
Base = declarative_base()

In [260]:
class Recipe(Base):
    __tablename__ = 'recipes'
    
    id = Column(Integer, primary_key=True)
    name = Column(String)
    url = Column(String, unique=True)
    calories = Column(Integer)
    fat = Column(Float)
    carbs = Column(Float)
    protein = Column(Float)
    cholesterol = Column(Float)
    sodium = Column(Float)
    servings = Column(Integer)
    #ingredients = 
    #__table_args__ = {'extend_existing': True}
    
    def __repr__(self):
        return "<Recipe(name='%s', url='%s')>" % (
            self.name, self.url)
    
class Ingredient(Base):
    __tablename__ = 'ingredients'
    id = Column(Integer, primary_key = True)
    ingredient = Column(String, nullable = False)
    recipe_id = Column(Integer, ForeignKey('recipes.id'))
    
    recipe = relationship(Recipe, back_populates = 'ingredients')
    #__table_args__ = {'extend_existing': True}
    def __repr__(self):
        return "<Ingredient(ingredient='%s')>" % self.ingredient

Recipe.ingredients = relationship("Ingredient", order_by=Ingredient.id, back_populates="recipe")

class Restaurant(Base):
    __tablename__ = 'restaurants'
    id = Column(Integer, primary_key=True)
    name = Column(String)
    url = Column(String)
    zomatoID = Column(Integer, unique=True)
    costfortwo = Column(Float)
    featured_image = Column(String)
    photos = Column(String)
    menu_url = Column(String)
    price_range = Column(Integer)
    latitude = Column(Float)
    longitude = Column(Float)
    address = Column(String)
    
    def __repr__(self):
        return "<Restaurant(name='%s', url='%s')>" % (
            self.name, self.url)

class MenuItem(Base):
    __tablename__ = 'menuitems'
    id = Column(Integer, primary_key=True)
    menuitem = Column(String, nullable = False)
    description = Column(String)
    restaurant_id = Column(Integer, ForeignKey('restaurants.id'))
    price = Column(String)
    
    restaurant = relationship(Restaurant, back_populates = 'menuitems')
    __table_args__ = {'extend_existing': True}
    def __repr__(self):
        return "<MenuItem(name='%s', description='%s')>" % (
            self.menuitem, self.description)
    
Restaurant.menuitems = relationship("MenuItem", order_by=MenuItem.id, back_populates="restaurant")

  item.__name__


InvalidRequestError: Table 'recipes' is already defined for this MetaData instance.  Specify 'extend_existing=True' to redefine options and columns on an existing Table object.

In [4]:
dbname = 'restaurants'
username = 'andylane'
engine = create_engine('postgres://%s@localhost/%s'%(username,dbname))
print(engine.url)

postgres://andylane@localhost/restaurants


In [5]:
con = None
con = psycopg2.connect(database = "restaurants", user = "andylane")
Session = sessionmaker(bind=engine)
session = Session()

##Get recipe names into a vector

In [7]:
names = session.query(Recipe).all()

In [8]:
titles=[]
for item in names:
    titles.append(item.name)

##Get ingredient lists into vector, order matching names

In [9]:
calories=[]
for item in names:
    calories.append(item.calories)

http://brandonrose.org/clustering

In [10]:
import numpy as np
import pandas as pd
import nltk
import re
import os
import codecs
from sklearn import feature_extraction
import mpld3

In [11]:
stopwords = nltk.corpus.stopwords.words('english')

In [12]:
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english")

In [78]:
# here I define a tokenizer and stemmer which returns the set of stems in the text that it is passed
def tokenize_and_stem(title):
    stemmed_titles = []
    new_title=[]
    for word in nltk.word_tokenize(title):
        new_title.append(stemmer.stem(word))
    stemmed_titles.extend(new_title)
    return [i for i in stemmed_titles]

In [79]:
def tokenize_only(title):
    tokenized_titles = []
    new_title=[]
    for word in nltk.word_tokenize(title):
        new_title.append(word)
    tokenized_titles.extend(new_title)
    return [i for i in tokenized_titles]

In [80]:
#not super pythonic, no, not at all.
#use extend so it's a big flat list of vocab
totalvocab_stemmed = []
totalvocab_tokenized = []
for i in titles:
    totalvocab_stemmed.extend([j for j in tokenize_and_stem(i)])
    totalvocab_tokenized.extend([k for k in tokenize_only(i)])

In [81]:
vocab_frame = pd.DataFrame({'words': totalvocab_tokenized}, index = totalvocab_stemmed)
print('there are ' + str(vocab_frame.shape[0]) + ' items in vocab_frame')

there are 18382 items in vocab_frame


In [82]:
print(vocab_frame.head())

           words
fluffi    Fluffy
pancak  Pancakes
joan        Joan
's            's
quick      Quick


In [236]:
from sklearn.feature_extraction.text import TfidfVectorizer

#define vectorizer parameters
tfidf_vectorizer = TfidfVectorizer(max_df=0.9, max_features=200000,
                                 min_df=0.01, stop_words='english',
                                 use_idf=True, tokenizer=tokenize_and_stem, ngram_range=(1,3))

%time tfidf_matrix = tfidf_vectorizer.fit_transform(titles) #fit the vectorizer to synopses

print(tfidf_matrix.shape)

CPU times: user 764 ms, sys: 32 ms, total: 796 ms
Wall time: 765 ms
(3941, 96)


In [237]:
terms = tfidf_vectorizer.get_feature_names()

In [238]:
terms

[u"'s",
 u"'s pie",
 u'(',
 u')',
 u',',
 u'appl',
 u'bacon',
 u'bake',
 u'bake chicken',
 u'bbq',
 u'beef',
 u'best',
 u'bread',
 u'breast',
 u'broccoli',
 u'burger',
 u'casserol',
 u'chees',
 u'chef',
 u'chef john',
 u"chef john 's",
 u'chicken',
 u'chicken breast',
 u'chip',
 u'chocol',
 u'chocol chip',
 u'chop',
 u'coconut',
 u'cooker',
 u'cooker chicken',
 u'cooki',
 u'cranberri',
 u'cream',
 u'creami',
 u'curri',
 u'easi',
 u'egg',
 u'fri',
 u'garlic',
 u'glaze',
 u'grill',
 u'ham',
 u'herb',
 u'honey',
 u'ii',
 u'italian',
 u'john',
 u"john 's",
 u'lemon',
 u'lobster',
 u'loin',
 u'meat',
 u'mushroom',
 u'onion',
 u'orang',
 u'parmesan',
 u'pasta',
 u'pecan',
 u'pepper',
 u'pie',
 u'pork',
 u'pork chop',
 u'pork loin',
 u'pork tenderloin',
 u'pot',
 u'potato',
 u'pull',
 u'pumpkin',
 u'quick',
 u'rice',
 u'roast',
 u'roast chicken',
 u'salad',
 u'salmon',
 u'sauc',
 u'sausag',
 u'shepherd',
 u"shepherd 's",
 u"shepherd 's pie",
 u'shrimp',
 u'slow',
 u'slow cooker',
 u'slow cook

In [239]:
from sklearn.metrics.pairwise import cosine_similarity
dist = 1 - cosine_similarity(tfidf_matrix)

In [240]:
from sklearn.cluster import KMeans
num_clusters = 380
km = KMeans(n_clusters=num_clusters)
%time km.fit(tfidf_matrix)
clusters = km.labels_.tolist()

CPU times: user 14.8 s, sys: 72 ms, total: 14.8 s
Wall time: 4.63 s


In [247]:
# Predict with models
X_new = tfidf_vectorizer.transform(["Pad Thai"])
prediction = km.predict(X_new)

In [248]:
prediction[0]

31

In [250]:
frame.ix[31]['title']

31                                 Thai Pork Satay
31                                Classic Pad Thai
31                            Thai Peanut Dressing
31                                  SPAM® Pad Thai
31                              Thai Chicken Satay
31      Easy and Spicy Thai Basil Chicken with Egg
31                       Thai Ground Chicken Basil
31                Thai Chicken with Basil Stir Fry
31                             Zen Garden Pad Thai
31       NP's Spicy Thai Basil Chicken and Veggies
31                              Thai Chicken Pizza
31    Thai Chicken Pizza with Carrots and Cilantro
31                             Thai Peanut Chicken
Name: title, dtype: object

In [246]:
order_centroids = km.cluster_centers_.argsort()[:, ::-1] 

for ind in order_centroids[prediction, :10]: #replace 6 with n words per cluster
    print(' %s' % vocab_frame.ix[terms[ind].split(' ')].values.tolist()[0][0].encode('utf-8', 'ignore'), end=',')
print() #add whitespace

print("Cluster %d titles:" %  prediction, end='')
for title in frame.ix[prediction]['title'].values.tolist()[:20]:
    print(' %s,' % title, end='')

TypeError: only integer arrays with one element can be converted to an index

In [113]:
from sklearn.externals import joblib

#uncomment the below to save your model 
#since I've already run my model I am loading from the pickle

#joblib.dump(km,  'doc_cluster.pkl')

#km = joblib.load('doc_cluster.pkl')
clusters = km.labels_.tolist()

In [244]:
recipes = {'title': titles, 'calories': calories, 'cluster': clusters}

frame = pd.DataFrame(recipes, index = [clusters] , columns = ['title', 'cluster', 'calories'])

In [115]:
frame['cluster'].value_counts()

7      171
1      135
19      94
22      80
12      66
18      54
5       52
17      51
13      48
45      41
9       41
34      38
26      37
36      33
84      29
99      28
158     28
8       28
258     28
16      26
53      25
61      25
113     25
24      25
10      25
142     25
11      24
0       24
54      23
27      22
      ... 
85       2
149      2
147      2
338      2
161      2
299      2
218      2
326      2
316      2
291      2
362      2
82       2
237      2
297      2
307      2
311      2
376      2
213      2
323      2
368      1
355      1
178      1
68       1
184      1
41       1
365      1
95       1
177      1
145      1
334      1
Name: cluster, dtype: int64

In [116]:
grouped = frame['calories'].groupby(frame['cluster']) #groupby cluster for aggregation purposes
grouped.mean() #average rank (1 to 100) per cluster

cluster
0      375.375000
1      417.607407
2      538.800000
3      455.714286
4      507.380952
5      375.769231
6      448.000000
7      395.286550
8      328.035714
9      483.951220
10     575.080000
11     445.208333
12     426.166667
13     431.770833
14     397.764706
15     517.625000
16     384.692308
17     445.294118
18     359.814815
19     343.542553
20     220.700000
21     466.153846
22     457.337500
23     467.615385
24     421.840000
25     383.333333
26     446.216216
27     240.818182
28     351.000000
29     403.562500
          ...    
350    260.666667
351    564.500000
352    644.750000
353    388.666667
354    322.750000
355    255.000000
356    538.333333
357    435.857143
358    337.666667
359    378.333333
360    336.631579
361    509.600000
362    225.000000
363    497.250000
364    272.285714
365    319.000000
366    417.000000
367    282.625000
368    212.000000
369    608.000000
370    121.500000
371    578.000000
372    371.222222
373     94.500000
37

In [117]:
from __future__ import print_function

print("Top terms per cluster:")
print()
#sort cluster centers by proximity to centroid
order_centroids = km.cluster_centers_.argsort()[:, ::-1] 

for i in range(num_clusters):
    print("Cluster %d words:" % i, end='')
    
    for ind in order_centroids[i, :6]: #replace 6 with n words per cluster
        print(' %s' % vocab_frame.ix[terms[ind].split(' ')].values.tolist()[0][0].encode('utf-8', 'ignore'), end=',')
    print() #add whitespace
    print() #add whitespace
    
    print("Cluster %d titles:" % i, end='')
    for title in frame.ix[i]['title'].values.tolist()[:20]:
        print(' %s,' % title, end='')
    print() #add whitespace
    print() #add whitespace
    
print()
print()

Top terms per cluster:

Cluster 0 words: Steak, Tomato, Spicy, Rice, Bake, Easy,

Cluster 0 titles: Baked Fake Steak with Gravy, Beer and Brown Sugar Steak Marinade, Cube Steak with Gravy, Smothered Hamburger Steak, Salisbury Steak, Marinated Flank Steak, Charred Corn and Heirloom Tomato Steak Fajitas from Mission®, Round Steak and Gravy I, Easy Flat Iron Steak in Wine Sauce, My Country Style Steak, Tropical Steak Sandwich, Southwest Steak Bites, Flat Iron Steaks Marinated in Red Wine, Original Steak Tartare, Sorta Salisbury Steak, Less-Butter Steak Diane, No Fun Steaks, China Lake Barbequed Steak, Marinated Flank Steak, Balsamic Marinated Flank Steak,

Cluster 1 words: Chicken, Wraps, Chocolate, Chops, Coconut, Cooker,

Cluster 1 titles: Mexican Chicken Corn Chowder, Chicken Marsala, Simple Chicken Cacciatore, Caper Chicken Cacciatore, Chicken Cacciatore Delight, One Dish Bourbon Chicken, Chicken Asparagus Roll-Ups, Chicken Piccata with Artichoke Hearts, Chicken Dijon, Chicken Stuff, 

AttributeError: 'unicode' object has no attribute 'values'

## Multidimensional Scaling

In [118]:
import os  # for os.path.basename

import matplotlib.pyplot as plt
import matplotlib as mpl

from sklearn.manifold import MDS

MDS()

# convert two components as we're plotting points in a two-dimensional plane
# "precomputed" because we provide a distance matrix
# we will also specify `random_state` so the plot is reproducible.
mds = MDS(n_components=2, dissimilarity="precomputed", random_state=1)

pos = mds.fit_transform(dist)  # shape (n_components, n_samples)

xs, ys = pos[:, 0], pos[:, 1]
print()
print()





In [145]:
#set up colors per clusters using a dict
cluster_colors = {0: '#1b9e77', 1: '#d95f02', 2: '#7570b3', 3: '#e7298a', 4: '#66a61e'}

#set up cluster names using a dict
cluster_names = {0: 'teaspoon, sugar, flour', 
                 1: 'tablespoons, chopped, pepper', 
                 2: 'teaspoon, ground, teaspoon', 
                 3: 'teaspoon, baking', 
                 4: 'ounce, chopped, pound, onion, beef'}

In [136]:
plt.close()

In [119]:
#define custom toolbar location
class TopToolbar(mpld3.plugins.PluginBase):
    """Plugin for moving toolbar to top of figure"""

    JAVASCRIPT = """
    mpld3.register_plugin("toptoolbar", TopToolbar);
    TopToolbar.prototype = Object.create(mpld3.Plugin.prototype);
    TopToolbar.prototype.constructor = TopToolbar;
    function TopToolbar(fig, props){
        mpld3.Plugin.call(this, fig, props);
    };

    TopToolbar.prototype.draw = function(){
      // the toolbar svg doesn't exist
      // yet, so first draw it
      this.fig.toolbar.draw();

      // then change the y position to be
      // at the top of the figure
      this.fig.toolbar.toolbar.attr("x", 150);
      this.fig.toolbar.toolbar.attr("y", 400);

      // then remove the draw function,
      // so that it is not called again
      this.fig.toolbar.draw = function() {}
    }
    """
    def __init__(self):
        self.dict_ = {"type": "toptoolbar"}

In [120]:
#create data frame that has the result of the MDS plus the cluster numbers and titles
df = pd.DataFrame(dict(x=xs, y=ys, label=clusters, title=titles)) 

#group by cluster
groups = df.groupby('label')

#define custom css to format the font and to remove the axis labeling
css = """
text.mpld3-text, div.mpld3-tooltip {
  font-family:Arial, Helvetica, sans-serif;
}

g.mpld3-xaxis, g.mpld3-yaxis {
display: none; }

svg.mpld3-figure {
margin-left: -200px;}
"""

# Plot 
fig, ax = plt.subplots(figsize=(14,6)) #set plot size
ax.margins(0.03) # Optional, just adds 5% padding to the autoscaling

#iterate through groups to layer the plot
#note that I use the cluster_name and cluster_color dicts with the 'name' lookup to return the appropriate color/label
for name, group in groups:
#     points = ax.plot(group.x, group.y, marker='o', linestyle='', ms=18, 
#                      label=cluster_names[name], mec='none', 
#                      color=cluster_colors[name])
    points = ax.plot(group.x, group.y, marker='o', linestyle='', ms=5, 
                     mec='none')
    ax.set_aspect('auto')
    labels = [i for i in group.title]
    
    #set tooltip using points, labels and the already defined 'css'
    tooltip = mpld3.plugins.PointHTMLTooltip(points[0], labels,
                                       voffset=10, hoffset=10, css=css)
    #connect tooltip to fig
    mpld3.plugins.connect(fig, tooltip, TopToolbar())    
    
    #set tick marks as blank
    ax.axes.get_xaxis().set_ticks([])
    ax.axes.get_yaxis().set_ticks([])
    
    #set axis as blank
    ax.axes.get_xaxis().set_visible(False)
    ax.axes.get_yaxis().set_visible(False)

    
ax.legend(numpoints=1) #show legend with only one dot

mpld3.display() #show the plot

#uncomment the below to export to html
#html = mpld3.fig_to_html(fig)
#print(html)

In [258]:
menu_items = session.query(MenuItem).filter(MenuItem.restaurant_id == 82).all()

In [268]:
for item in menu_items:
    print(item.menuitem)
    X_new = tfidf_vectorizer.transform([item.menuitem])
    prediction = km.predict(X_new)[0]
    print(km.score(X_new))
    print(frame.ix[prediction]['title'])
    print
    print

Moroccan Spiced Sweet Potato & Apple Soup
-0.306133204168
355                    Spicy Sweet Potato Soup
355    Butternut Squash and Sweet Potato Soup 
Name: title, dtype: object
Baby Kale & Pink Lady Apple Salad
-0.524359236295
20                     Fennel and Watercress Salad
20         New Year Three-Bean and Artichoke Salad
20                                    Smiley Salad
20                                    Tahini Salad
20                                   Harvest Salad
20              Persian-Style Tomato Avocado Salad
20                              Winter Green Salad
20              Seared Scallop and Asparagus Salad
20               German Potato Salad from Swanson®
20                          Caesar Salad Pinwheels
20    Blackened Steak Salad with Berry Vinaigrette
20       Tuna Nicoise Salad with Dijon Vinaigrette
20    Crispy Seafood Salad with Citrus Vinaigrette
20                     Veggie Chick'n Caesar Salad
20                              Asian Salmon Salad
20    