# Restaurant Menu

In [180]:
from lxml import html
import requests
from bs4 import BeautifulSoup
import re
import time
import pandas as pd
import pickle
import numpy as np
from nltk.corpus import stopwords
import nltk
import string

from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse
import random 

from matplotlib import pyplot as plt
import seaborn as sns
% matplotlib inline

## Scraping the data

In [35]:
t0 = time.time()

# Browser-like User-Agent sent with every request made by this notebook.
# Note: the backslash continuation keeps the next line's leading spaces inside the header value.
headers={'User-Agent' : 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_1) AppleWebKit/537.36 (KHTML, like Gecko) \
        Chrome/64.0.3282.167 Safari/537.36'}
food_categories = ['american','american-new','german','crepes','french','burgers','deli','indian']
base_url='https://www.allmenus.com/ny/buffalo/'

# Parallel lists: resturant_url_list[i] was found on the listing page of category_list[i].
resturant_url_list=[]
category_list=[]

for fcat in food_categories:
    # A timeout prevents one stalled connection from blocking the whole scrape indefinitely.
    r = requests.get(base_url + '-/' + fcat + '/', headers=headers, timeout=30)
    soup = BeautifulSoup(r.text, 'lxml')
    # attrs has to be a dict. A set such as {'class', 'name'} is not a dict, so
    # BeautifulSoup treats it as a collection of acceptable CSS class names instead.
    resturant_links = soup.find_all('h4', {'class': 'name'})

    for resturant in resturant_links:
        link = resturant.find('a')
        # Skip headings without a usable link instead of failing on None['href'].
        if link is None or not link.get('href'):
            continue
        resturant_url_list.append(link['href'])
        category_list.append(fcat)

In [162]:
def _text_or_default(node, default=''):
    """Return ``node.text``, or ``default`` when ``node`` is ``None`` (element not found)."""
    if node is None:
        return default
    return node.text


def scrape_menus(resturant_url_list,category_list):
    """Download each restaurant page and flatten its menu into one record per item.

    Args:
        resturant_url_list: Site-relative restaurant URLs; each one is appended
            to ``https://allmenus.com`` to build the request URL.
        category_list: Cuisine label for each URL, paired positionally via ``zip``.

    Returns:
        A 3-tuple ``(item_list, restaurant_names, restaurant_address)``.
        ``item_list`` is a list of dicts with the keys ``resturant``,
        ``item_name``, ``item_description``, ``item_price``, ``category_name``,
        ``address`` and ``resturant_category``. The other two lists receive one
        name and one address per URL processed.

    Raises:
        requests.exceptions.RequestException: propagated from ``requests.get``
            (for example when the 30 second timeout expires).

    Notes:
        Reads the module-level ``headers`` dict for the HTTP request headers.
        Any name, address, category title, description or price that is absent
        from the page is recorded as ``''``. Each field is looked up on its
        own, so one missing field no longer blanks its neighbour. Menu entries
        without a title element are skipped.
    """
    item_list=[]
    restaurant_names=[]
    restaurant_address=[]
    for url,fcat in zip(resturant_url_list,category_list):
        # The timeout keeps a single unresponsive page from stalling the whole run.
        page=requests.get('https://allmenus.com'+url,headers=headers,timeout=30)
        soup=BeautifulSoup(page.text,"lxml")
        all_categories=soup.find_all('li',{'class':'menu-category'})

        # An empty string (rather than a list) keeps these values hashable,
        # which DataFrame.drop_duplicates requires for its subset columns.
        summary=soup.find('div',{'class':'restaurant-summary'})
        name=_text_or_default(summary.find('h1') if summary is not None else None)
        info=soup.find('ul',{'class':'info-list'})
        address=_text_or_default(info.find('a') if info is not None else None)

        for cat in all_categories:
            category_name=_text_or_default(cat.find('div',{'class':'category-name'}))
            all_menu_items_in_category = cat.find_all('li',{"class":"menu-items"})
            for menu in all_menu_items_in_category:
                title=menu.find('span',{'class':'item-title'})
                if title is None:
                    # Nothing to identify the item by; skip it rather than abort the scrape.
                    continue
                item_name=title.text
                item_description=_text_or_default(menu.find('p',{'class':'description'}))
                item_price=_text_or_default(menu.find('span',{'class':'item-price'})).strip()
                new_item={'resturant':name,'item_name':item_name,'item_description':item_description,
                          'item_price':item_price,'category_name':category_name,'address':address,
                          'resturant_category':fcat}
                item_list.append(new_item)
        restaurant_names.append(name)
        restaurant_address.append(address)
    return item_list,restaurant_names,restaurant_address

        

In [163]:
# Fetch every collected restaurant page and unpack the three result lists.
item_list, restaurant_names, restaurant_address = scrape_menus(
    resturant_url_list, category_list
)

In [171]:
# One row per scraped menu item; the dict keys become the columns.
df = pd.DataFrame(data=item_list)

In [170]:
# scrape_menus appends one address per URL, so this is the number of pages processed.
len(restaurant_address)

599

In [172]:
# Correct the misspelled column names produced by the scraper; reassigning
# (instead of inplace=True) keeps the step chainable.
df = df.rename(columns={'resturant':'restaurant','resturant_category':'restaurant_category'})

In [173]:
# Keep the first occurrence of each (restaurant, item_name) pair.
df = df.drop_duplicates(['restaurant','item_name'])

In [174]:
# The context manager flushes and closes the file handle once the dump finishes.
with open('indian.p','wb') as fh:
    pickle.dump(df,fh)

## Data Preprocessing

In [175]:
def lowerCase(df,col_name):
    """Return the ``col_name`` column of ``df`` with every value lower-cased.

    Each value must provide a ``.lower()`` method; ``df`` itself is not modified.
    """
    def _to_lower(text):
        return text.lower()

    return df[col_name].apply(_to_lower)

In [176]:
def removePunctuation(df,col_name):
    """Strip punctuation from the ``col_name`` column and split it into tokens.

    Every character that is neither a word character nor whitespace is deleted
    (not replaced by a space), then each string is split on whitespace.
    Returns a Series of token lists; ``df`` itself is not modified.
    """
    def _strip_punctuation(text):
        return re.sub(r'[^\w\s]', '', text)

    cleaned = df[col_name].apply(_strip_punctuation)
    return cleaned.str.split()

In [181]:
# NLTK's English stop-word list, held as a set for membership tests.
stop = set(stopwords.words('english'))

def removeStopWords(df,col_name):
    """Return the ``col_name`` column with stop words removed from each token list.

    Each value is iterated and items found in the module-level ``stop`` set are
    dropped; ``df`` itself is not modified.
    """
    def _drop_stop_words(tokens):
        return [token for token in tokens if token not in stop]

    return df[col_name].apply(_drop_stop_words)

In [182]:
# Join name and description (both coerced to str) into one text field per item.
df['item_name_and_description'] = (
    df['item_name'].map(str) + " " + df['item_description'].map(str)
)

In [183]:
# Run the cleaning steps in order: lower-case, strip punctuation and tokenise, drop stop words.
for _clean_step in (lowerCase, removePunctuation, removeStopWords):
    df['item_name_and_description'] = _clean_step(df, 'item_name_and_description')

In [184]:
# Show the first row to check the tokenised item_name_and_description column.
df.head(1)

Unnamed: 0,address,category_name,item_description,item_name,item_price,restaurant,restaurant_category,item_name_and_description
0,"20 Lake Ave, Buffalo, NY 14219",Appetizers,fresh house made mozzarella with beef steak&co...,Mozzarella Caprese,$7.95,Omarl's,american,"[mozzarella, caprese, fresh, house, made, mozz..."


In [185]:
# Checkpoint the tokenised frame; the context manager closes the file handle.
with open('processed.p','wb') as fh:
    pickle.dump(df,fh)

In [186]:
# Reload the checkpoint written by this notebook (only unpickle files you trust).
df = pd.read_pickle('processed.p')

In [187]:
# Plain Python list of the per-item token lists.
token_list = df['item_name_and_description'].tolist()

In [188]:
from nltk.stem.wordnet import WordNetLemmatizer

lmtzr = WordNetLemmatizer()


def _lemmatize_tokens(words):
    """Lemmatize every token of one item with the shared ``lmtzr`` instance."""
    return [lmtzr.lemmatize(word) for word in words]


# A list is assigned by position, so row i receives the lemmas of token_list[i].
df['item_desc_lem'] = list(map(_lemmatize_tokens, token_list))

In [189]:
# Lemmatized token lists, one per menu item, as input for the vectorizer.
tokens = df['item_desc_lem'].tolist()

In [190]:
from sklearn.feature_extraction.text import TfidfVectorizer


def _passthrough(pre_tokenized):
    """Hand the already-tokenised document back unchanged."""
    return pre_tokenized


# The documents are token lists already, so tokenisation is a no-op and
# lower-casing is switched off.
tfidf = TfidfVectorizer(tokenizer=_passthrough, lowercase=False)
tfs = tfidf.fit_transform(tokens)

In [191]:
# Store each row of the TF-IDF matrix alongside its menu item (assigned by position).
df['tf_idf'] = list(tfs)

In [192]:
# Overwrite the checkpoint now that the lemma and TF-IDF columns exist;
# the context manager closes the file handle.
with open('processed.p','wb') as fh:
    pickle.dump(df,fh)

In [294]:
from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse

# Score row 10000 of the TF-IDF matrix against every row, flattened to 1-D.
query_row = tfs[10000]
cosine_similarities = cosine_similarity(query_row, tfs).flatten()

# Highest scores first, truncated to nine entries. The query row is part of the
# matrix it is compared with, so it can appear in its own results.
# NOTE(review): this keeps 9 rows, not 10 — confirm that is the intended count.
ranked_idcs = cosine_similarities.argsort()[::-1]
related_food_idcs = ranked_idcs[:9]

df.iloc[related_food_idcs][["item_name","item_description","restaurant"]]

Unnamed: 0,item_name,item_description,restaurant
10000,Cheese Ravioli,pasta filled with romano and parmesan cheese w...,Cameo's Restaurant & Catering
11827,Ravioli,Homemade Pasta filled with seasoned ricotta ch...,Gullo's Macaroni Grill
12626,Cheese Ravioli,,Bob And John's La Hacienda
6132,Ravioli,Filled with ricotta cheese,Anchor Bar
18764,Four Cheese Pasta,"Penne Pasta, Mozzarella, Ricotta, Romano and P...",The Cheesecake Factory
16918,Pasta Marinara,,Pizza Plant
13743,Cheese Ravioli,cheese ravioli topped with mama rose's homemad...,My Tomato Pie
10565,Ravioli,spinach and cheese ravioli simmered in marinar...,The Buffalo House Restaurant
10622,Ravioli,spinach and cheese ravioli simmered in marinar...,The Buffalo House Cafe


In [296]:
# (rows, columns) of the processed frame.
df.shape

(20516, 10)

In [194]:
# Replace the index with a fresh 0..n-1 range and discard the old one;
# reassigning (instead of inplace=True) keeps the step chainable.
df = df.reset_index(drop=True)

In [196]:
# Persist the re-indexed frame; the context manager closes the file handle.
with open('processed.p','wb') as fh:
    pickle.dump(df,fh)

In [15]:
# NOTE(review): absolute, machine-specific path — this cell only runs on the author's machine.
rest2 = pd.read_pickle('/Users/Phaneendra/Website/name_and_addr_list.p')

In [11]:
# NOTE(review): absolute, machine-specific path — this cell only runs on the author's machine.
rest1 = pd.read_pickle('/Users/Phaneendra/Website/all_menus.p')

In [21]:
# NOTE(review): absolute, machine-specific path — this cell only runs on the author's machine.
rest3 = pd.read_pickle('/Users/Phaneendra/Website/name_and_addr_list_split.p')

In [23]:
# NOTE(review): absolute, machine-specific path — this cell only runs on the author's machine.
rest4 = pd.read_pickle('/Users/Phaneendra/Website/tfidf_mat_menus.p')

In [290]:
# Show a bounded preview instead of rendering the whole frame.
df.head(10)

Unnamed: 0,address,category_name,item_description,item_name,item_price,restaurant,restaurant_category,item_name_and_description,item_desc_lem,tf_idf
0,"20 Lake Ave, Buffalo, NY 14219",Appetizers,fresh house made mozzarella with beef steak&co...,Mozzarella Caprese,$7.95,Omarl's,american,"[mozzarella, caprese, fresh, house, made, mozz...","[mozzarella, caprese, fresh, house, made, mozz...","(0, 5445)\t0.309005264793\n (0, 1522)\t0.28..."
1,"20 Lake Ave, Buffalo, NY 14219",Appetizers,served with mango sauce,Jamaican Beef Wellington,$7.95,Omarl's,american,"[jamaican, beef, wellington, served, mango, sa...","[jamaican, beef, wellington, served, mango, sa...","(0, 902)\t0.275690326029\n (0, 7239)\t0.180..."
2,"20 Lake Ave, Buffalo, NY 14219",Appetizers,served with roasted red pepper cream sauce,Smith Island Crab Cakes,$8.95,Omarl's,american,"[smith, island, crab, cakes, served, roasted, ...","[smith, island, crab, cake, served, roasted, r...","(0, 6828)\t0.265576373268\n (0, 7239)\t0.16..."
3,"20 Lake Ave, Buffalo, NY 14219",Appetizers,served with toasted crostini,Warm Lobster Dip,$6.95,Omarl's,american,"[warm, lobster, dip, served, toasted, crostini]","[warm, lobster, dip, served, toasted, crostini]","(0, 7239)\t0.187411561108\n (0, 8292)\t0.34..."
4,"20 Lake Ave, Buffalo, NY 14219",Appetizers,sauteed shrimp served over mayport grits with ...,Shrimp Scampi,$7.95,Omarl's,american,"[shrimp, scampi, sauteed, shrimp, served, mayp...","[shrimp, scampi, sauteed, shrimp, served, mayp...","(0, 7239)\t0.142936178397\n (0, 7070)\t0.16..."
5,"20 Lake Ave, Buffalo, NY 14219",Soup,,Soup Du Jour,$2.95+,Omarl's,american,"[soup, du, jour]","[soup, du, jour]","(0, 7592)\t0.418264281633\n (0, 2798)\t0.63..."
6,"20 Lake Ave, Buffalo, NY 14219",Soup,,Seafood Chowder,$3.95+,Omarl's,american,"[seafood, chowder]","[seafood, chowder]","(0, 7160)\t0.652758831426\n (0, 1899)\t0.75..."
7,"20 Lake Ave, Buffalo, NY 14219",Salad,marinated chicken over a bed of greens,Jerk Chicken Caesar,$7.95,Omarl's,american,"[jerk, chicken, caesar, marinated, chicken, be...","[jerk, chicken, caesar, marinated, chicken, be...","(0, 4375)\t0.530720220251\n (0, 1785)\t0.40..."
8,"20 Lake Ave, Buffalo, NY 14219",Salad,,Omaris' House Salad,$3.95,Omarl's,american,"[omaris, house, salad]","[omaris, house, salad]","(0, 4132)\t0.44179903531\n (0, 5722)\t0.841..."
9,"20 Lake Ave, Buffalo, NY 14219",Entrees,served with sweet potato plantain mashed with ...,Caribbean Pork Loin,$14.95,Omarl's,american,"[caribbean, pork, loin, served, sweet, potato,...","[caribbean, pork, loin, served, sweet, potato,...","(0, 6828)\t0.222086323011\n (0, 7239)\t0.13..."


In [32]:
# NOTE(review): `df1` is not defined anywhere in the visible notebook — confirm
# which frame is meant before re-running on a fresh kernel.
# The context manager closes the file handle once the dump finishes.
with open('all_menus.p','wb') as fh:
    pickle.dump(df1,fh)

In [82]:
# Fetch one restaurant page for interactive inspection; the timeout keeps a
# stalled connection from hanging the kernel.
r=requests.get('https://www.allmenus.com/ny/buffalo/245696-india-gate/menu/',headers=headers,timeout=30)

In [108]:
# Parse the fetched page with the lxml parser.
soup = BeautifulSoup(r.text, "lxml")

In [158]:
# The restaurant name is the <h1> inside the 'restaurant-summary' block.
summary_div = soup.find('div', {'class': 'restaurant-summary'})
name = summary_div.find('h1').text

In [159]:
# Display the extracted restaurant name.
name

'India Gate'

In [222]:
# `d` is not created anywhere in the visible notebook, so build it here to let
# the cell run on a fresh kernel. The later `new.restaurant_names` lookup needs a
# 'restaurant_names' key.
# NOTE(review): presumably that key held the scraper's `restaurant_names` list — confirm.
d = {'restaurant_names': restaurant_names}
d['restaurant_address']=restaurant_address

In [223]:
# Build a frame from the dict `d`; its keys become the columns.
new = pd.DataFrame(d)

In [229]:
# "<name>, <address>" label per restaurant, both parts coerced to str.
new['name_and_address_list'] = (
    new['restaurant_names'].map(str) + ", " + new['restaurant_address'].map(str)
)

In [248]:
# Plain Python list of the combined labels, ready for pickling.
name_and_address_list = new['name_and_address_list'].tolist()

In [249]:
# The context manager flushes and closes the file handle once the dump finishes.
with open('name_and_addr_list.p','wb') as fh:
    pickle.dump(name_and_address_list,fh)

In [291]:
# NOTE(review): this writes 'tfidf_mat_muls.p', while another cell in this notebook
# reads 'tfidf_mat_menus.p' — confirm which filename is intended.
# The context manager closes the file handle once the dump finishes.
with open('tfidf_mat_muls.p','wb') as fh:
    pickle.dump(tfs,fh)

In [261]:
# Spot-check one loaded entry; per the recorded output, element [1] is an address string.
rest3[300][1]

'1500 S Van Ness Ave, San Francisco, CA, 94110'

In [280]:
def _name_address_pair(row):
    """Return ``[name, address]`` for one row of ``new``."""
    return [row.restaurant_names, row.restaurant_address]


# Row-wise apply: each cell of 'merged_list' holds a two-element list.
new['merged_list'] = new.apply(_name_address_pair, axis=1)

In [283]:
# List of [name, address] pairs, one per restaurant.
name_and_addr_list_split = new['merged_list'].tolist()

In [285]:
# The context manager flushes and closes the file handle once the dump finishes.
with open('name_and_addr_list_split.p','wb') as fh:
    pickle.dump(name_and_addr_list_split,fh)

In [287]:
# NOTE(review): the pickle was written from a plain list, so this rebinds `new`
# from a DataFrame to whatever that file contains — a distinct name would be clearer.
new = pd.read_pickle('name_and_addr_list_split.p')

In [293]:
# Show a bounded preview instead of rendering the whole frame.
df.head(10)

Unnamed: 0,address,category_name,item_description,item_name,item_price,restaurant,restaurant_category,item_name_and_description,item_desc_lem,tf_idf
0,"20 Lake Ave, Buffalo, NY 14219",Appetizers,fresh house made mozzarella with beef steak&co...,Mozzarella Caprese,$7.95,Omarl's,american,"[mozzarella, caprese, fresh, house, made, mozz...","[mozzarella, caprese, fresh, house, made, mozz...","(0, 5445)\t0.309005264793\n (0, 1522)\t0.28..."
1,"20 Lake Ave, Buffalo, NY 14219",Appetizers,served with mango sauce,Jamaican Beef Wellington,$7.95,Omarl's,american,"[jamaican, beef, wellington, served, mango, sa...","[jamaican, beef, wellington, served, mango, sa...","(0, 902)\t0.275690326029\n (0, 7239)\t0.180..."
2,"20 Lake Ave, Buffalo, NY 14219",Appetizers,served with roasted red pepper cream sauce,Smith Island Crab Cakes,$8.95,Omarl's,american,"[smith, island, crab, cakes, served, roasted, ...","[smith, island, crab, cake, served, roasted, r...","(0, 6828)\t0.265576373268\n (0, 7239)\t0.16..."
3,"20 Lake Ave, Buffalo, NY 14219",Appetizers,served with toasted crostini,Warm Lobster Dip,$6.95,Omarl's,american,"[warm, lobster, dip, served, toasted, crostini]","[warm, lobster, dip, served, toasted, crostini]","(0, 7239)\t0.187411561108\n (0, 8292)\t0.34..."
4,"20 Lake Ave, Buffalo, NY 14219",Appetizers,sauteed shrimp served over mayport grits with ...,Shrimp Scampi,$7.95,Omarl's,american,"[shrimp, scampi, sauteed, shrimp, served, mayp...","[shrimp, scampi, sauteed, shrimp, served, mayp...","(0, 7239)\t0.142936178397\n (0, 7070)\t0.16..."
5,"20 Lake Ave, Buffalo, NY 14219",Soup,,Soup Du Jour,$2.95+,Omarl's,american,"[soup, du, jour]","[soup, du, jour]","(0, 7592)\t0.418264281633\n (0, 2798)\t0.63..."
6,"20 Lake Ave, Buffalo, NY 14219",Soup,,Seafood Chowder,$3.95+,Omarl's,american,"[seafood, chowder]","[seafood, chowder]","(0, 7160)\t0.652758831426\n (0, 1899)\t0.75..."
7,"20 Lake Ave, Buffalo, NY 14219",Salad,marinated chicken over a bed of greens,Jerk Chicken Caesar,$7.95,Omarl's,american,"[jerk, chicken, caesar, marinated, chicken, be...","[jerk, chicken, caesar, marinated, chicken, be...","(0, 4375)\t0.530720220251\n (0, 1785)\t0.40..."
8,"20 Lake Ave, Buffalo, NY 14219",Salad,,Omaris' House Salad,$3.95,Omarl's,american,"[omaris, house, salad]","[omaris, house, salad]","(0, 4132)\t0.44179903531\n (0, 5722)\t0.841..."
9,"20 Lake Ave, Buffalo, NY 14219",Entrees,served with sweet potato plantain mashed with ...,Caribbean Pork Loin,$14.95,Omarl's,american,"[caribbean, pork, loin, served, sweet, potato,...","[caribbean, pork, loin, served, sweet, potato,...","(0, 6828)\t0.222086323011\n (0, 7239)\t0.13..."
