In [226]:
import io, time, json
import requests
from bs4 import BeautifulSoup
import time

In [222]:
def read_api_key(filepath):
    """
    Read the Yelp API Key from file.
    
    Args:
        filepath (string): File containing API Key
    Returns:
        api_key (string): The API Key
    """
    
    # feel free to modify this function if you are storing the API Key differently
    with open('api_key.txt', 'r') as f:
        return f.read().replace('\n','')

In [223]:
api_key = read_api_key('api_key.txt')

In [224]:
def yelp_search(api_key, cities):
    """
    Make an authenticated request to the Yelp API.

    Args:
        query (string): Search term

    Returns:
        total (integer): total number of businesses on Yelp corresponding to the query
        businesses (list): list of dicts representing each business
    """
    
    # Write solution here
    headers = {
    "authorization": 'Bearer %s' % api_key, # for the yelp API 
    }
    result = dict()
    result['total'] = 0
    result['businesses'] = []
    for city in cities:
        params = {
            "location": city,  
        }
        response = requests.get('https://api.yelp.com/v3/businesses/search',
                            headers=headers, params=params)
        new_result = response.json() #converts json into a python dictionary
        result['total'] += new_result['total']
        result['businesses'].append(new_result['businesses'])
    return result


In [225]:
def get_data_for_state(key, state, cities):
    """
    Get the yelp data for a state, and write it to a file

    Args:
        state (string): Name of the state
        cities (list): List of the top 3 populated cities in the state

    """
    with open('data/'+state+'.txt', 'w') as f:
        data = yelp_search(key, cities)
        json.dump(data, f, ensure_ascii=False)

In [131]:
states = {'alabama':['Birmingham, AL', 'Montgomery, AL', 'Mobile, AL'],
          'alaska':['Anchorage, AK'],
          'arizona':['Phoenix, AZ', 'Tucson, AZ', 'Mesa, AZ'],
          'arkansas':['Little Rock, AS', 'Fort Smith, AS', 'Fayetteville, AS'],
          'california':['Los Angeles, CA', 'San Diego, CA', 'San Jose, CA', 'San Francisco, CA'],
          'colorado':['Denver, CO', 'Colorado Springs, CO', 'Aurora, CO'],
          'connecticut':['Bridgeport, CT', 'New Haven, CT', 'Hartford, CT'],
          'delaware':['Wilmington, DE'],
          'dc':['Washington, DC'],
          'florida':['Jacksonville, FL', 'Miami, FL', 'Tampa, FL'],
          'georgia':['Atlanta, GA','Augusta-Richmond County, GA','Columbus, GA'],
          'hawaii':['Honolulu, HI'],
          'idaho':['Boise City, ID', 'Nampa, ID', 'Meridian, ID'],
          'illinois':['Chicago, IL','Aurora, IL', 'Rockford, IL'],
          'indiana':['Indianapolis, IN', 'Fort Wayne, IN', 'Evansville, IN'],
          'iowa':['Des Moines, IA', 'Cedar Rapids, IA', 'Davenport, IA'],
          'kansas':['Wichita, KS', 'Overland Park, KS', 'Kansas City, KS'],
          'louisiana':['New Orleans, LA', 'Baton Rouge, LA', 'Shreveport, LA'],
          'maine':['Portland, ME'],
          'maryland':['Baltimore, MD'],
          'massachusetts':['Boston, MA','Worcester, MA', 'Springfield, MA'],
          'michigan':['Detroit, MI', 'Grand Rapids, MI', 'Warren, MI'],
          'minnesota':['Minneapolis, MN', 'St. Paul, MN', 'Rochester, MN'],
          'mississippi':['Jackson, MS', 'Gulfport, MS'],
          'missouri':['Kansas City, MO', 'St. Louis, MO', 'Springfield, MO'],
          'montana':['Billings, MT', 'Missoula, MT'],
          'nebraska':['Omaha, NE', 'Lincoln, NE'],
          'nevada':['Las Vegas, NV', 'Henderson, NV', 'Reno, NV'],
          'newHampshire':['Manchester, NH', 'Nashua, NH'],
          'newJersey':['Newark, NJ', 'Jersey City, NJ', 'Paterson, NJ'],
          'newMexico':['Albuquerque, NM', 'Las Cruces, NM', 'Rio Rancho, NM'],
          'newYork':['New York, NY', 'Buffalo, NY', 'Rochester, NY'],
          'northCarolina':['Charlotte, NC', 'Raleigh, NC', 'Greensboro, NC'],
          'northDakota':['Fargo, ND'],
          'ohio':['Columbus, OH', 'Cleveland, OH', 'Cincinnati, OH'],
          'oklahoma':['Oklahoma City, OK', 'Tulsa, OK', 'Norman, OK'],
          'oregon':['Portland, OR', 'Eugene, OR', 'Salem, OR'],
          'pennsylvania':['Philadelphia, PA', 'Pittsburgh, PA', 'Allentown, PA'],
          'rhodeIsland':['Providence, RI', 'Warwick, RI', 'Cranston, RI'],
          'southCarolina':['Columbia, SC', 'Charleston, SC', 'North Charleston, SC'],
          'southDakota':['Sioux Falls, SD', 'Rapid City, SD'],
          'tennessee':['Memphis, TN', 'Nashville-Davidson, TN', 'Knoxville, TN'],
          'texas':['Houston, TX', 'San Antonio, TX', 'Dallas, TX'],
          'utah':['Salt Lake City, UT', 'West Valley City, UT', 'Provo, UT'],
          'vermont':['Burlington, VT'],
          'virginia':['Virginia Beach, VA', 'Norfolk, VA', 'Chesapeake, VA'],
          'washington':['Seattle, WA', 'Spokane, WA', 'Tacoma, WA'],
          'westVirginia':['Charleston, WV'],
          'wisconsin':['Milwaukee, WI', 'Madison, WI', 'Green Bay, WI'],
          'wyoming':['Cheyenne, WY'],
         }

In [7]:
for state in states:
    get_data_for_state(api_key, state, states[state])

In [8]:
def parse_page(html):
    """
    Parse the reviews on a single page of a restaurant.
    
    Args:
        html (string): String of HTML corresponding to a Yelp restaurant

    Returns:
        tuple(list, string): a tuple of two elements
            first element: list of dictionaries corresponding to the extracted review information
            second element: URL for the next page of reviews (or None if it is the last page)
    """
    res = []
    soup = BeautifulSoup(html, 'html.parser')
    for head in soup.find_all("div", class_="review review--with-sidebar"):
        reviewid = head.get("data-review-id")
        userid = head.get("data-signup-object")
        userid = userid[8:]
        rating = head.find("div", class_="i-stars")
        rating = rating.get("title")
        rating = float(rating[0:3])
        date = head.find("span", class_="rating-qualifier").get_text().strip()
        text = head.find("p").get_text()
        temp = dict()
        temp["review_id"] = reviewid
        temp["user_id"] = userid
        temp["rating"] = rating
        temp["date"] = date
        temp["text"] = text
        res += [temp]
    url = soup.find("a", class_="u-decoration-none next pagination-links_anchor")
    if (url == None): return (res, None)
    return (res, url["href"])




In [13]:
#AUTOLAB_IGNORE_START
response = requests.get("https://www.yelp.com/biz/meat-and-potatoes-pittsburgh")
soup = parse_page(response.content)
print(soup)
#AUTOLAB_IGNORE_STOP

([{'review_id': '-TLKqeXrpcM_IQK_u2meNg', 'user_id': 'M7-K7GwC-vUUPVHfsv_xvQ', 'rating': 5.0, 'date': '5/7/2018', 'text': "Was in town to celebrate my brothers birthday which fortunately/unfortunately fell on Easter this year. Not many places were open for dinner and after walking aimlessly around downtown Pittsburgh for an hour we literally stumbled upon this place. We walked in scanned the menu quickly and were seated right after. The drink options/specials were reasonably priced at $7 to $9 and the ambiance was great. All the hostess and waiters seemed welcoming. The food. The food. The food. Unfortunately, the menu item I ordered isn't listed on M&P's website. It doesn't seem like they've updated since 2017 but I can say that my dish was I believe a lamb chop with either quinoa or couscous. There were a variety of vegetables and sauces and the flavors were SO GOOD. The flavor I cannot speak enough about. I really did savor every bite. My brother ordered the pappardelle and said alt

In [1]:
import pandas as pd
import os

price_converter = {
        "$"   : 1,
        "$$"  : 2,
        "$$$" : 3,
        "$$$$": 4,
        '€'    : 1,
        '€€'   : 2,
        '€€€'  : 3,
        '€€€€' : 4
    }

def state_to_frame(state_path):
    state = pd.read_json(state_path)
    business_list = [pd.DataFrame(state["businesses"][i]) for i in range(len(state["businesses"]))]
    state_frame = pd.concat(business_list, ignore_index=True)
    state_frame.drop(['alias', 'display_phone', 'id',
       'image_url', 'is_closed', 'transactions'], axis=1, inplace=True)
    state_frame.dropna(inplace=True)
    state_frame["categories"] = state_frame["categories"].apply(lambda x : [x[i]["title"] for i in range(len(x))])
    state_frame["price"] = state_frame["price"].apply(lambda x: price_converter[x])
    return state_frame

def read_statefiles(pathname="./data"):
    state_dict = {}
    for filename in os.listdir(pathname):
        state_dict[filename[:-4]] = state_to_frame("./data/" + filename)
    return state_dict

In [156]:
states = read_statefiles()

AttributeError: 'dict' object has no attribute 'reset_index'

In [18]:
a = states["alabama"]

In [158]:
a.reset_index()

Unnamed: 0,index,categories,coordinates,distance,location,name,phone,price,rating,review_count,url
0,0,[Mexican],"{'latitude': 33.516636, 'longitude': -86.802849}",5386.333367,"{'address1': '2211 2nd Ave N', 'address2': '',...",El Barrio,12058683737.0,2,4.5,421,https://www.yelp.com/biz/el-barrio-birmingham?...
1,1,"[Barbeque, Seafood, Burgers]","{'latitude': 33.52419, 'longitude': -86.77383}",5621.07715,"{'address1': '215 41st St S', 'address2': None...",Saw's Soul Kitchen,12055911409.0,2,4.5,525,https://www.yelp.com/biz/saws-soul-kitchen-bir...
2,2,"[Cocktail Bars, Gastropubs, Pubs]","{'latitude': 33.5171239295806, 'longitude': -8...",5236.715503,"{'address1': '2430 Morris Ave', 'address2': ''...",Carrigan's Public House,12054402430.0,2,4.5,396,https://www.yelp.com/biz/carrigans-public-hous...
3,3,"[French, Bars]","{'latitude': 33.500550020261, 'longitude': -86...",3461.784536,"{'address1': '2007 11th Ave S', 'address2': ''...",Chez Fonfon,12059393221.0,2,4.5,246,https://www.yelp.com/biz/chez-fonfon-birmingha...
4,4,"[Cafes, Sandwiches, Burgers]","{'latitude': 33.51761, 'longitude': -86.80194}",5430.687514,"{'address1': '2320 2nd Ave N', 'address2': '',...",Urban Standard,12052508200.0,1,4.5,293,https://www.yelp.com/biz/urban-standard-birmin...
5,5,"[Southern, Bars]","{'latitude': 33.500572, 'longitude': -86.795577}",3457.727326,"{'address1': '2011 11th Ave S', 'address2': ''...",Highlands Bar & Grill,12059391400.0,3,4.5,208,https://www.yelp.com/biz/highlands-bar-and-gri...
6,6,"[American (New), Gluten-Free, Comfort Food]","{'latitude': 33.51766, 'longitude': -86.80183}",5423.439805,"{'address1': '2328 2nd Ave N', 'address2': Non...",Yo' Mama's,12059576545.0,2,4.5,231,https://www.yelp.com/biz/yo-mamas-birmingham-2...
7,7,"[Breakfast & Brunch, Italian]","{'latitude': 33.5155820048533, 'longitude': -8...",5410.999938,"{'address1': '207A 20th St N', 'address2': '',...",Trattoria Centrale,12052025612.0,2,4.5,210,https://www.yelp.com/biz/trattoria-centrale-bi...
8,8,"[Latin American, Tapas Bars, Tacos]","{'latitude': 33.5117073059082, 'longitude': -8...",4363.541574,"{'address1': '2808 7th Ave S', 'address2': 'St...",Babalu Tapas & Tacos,12052970200.0,2,4.0,365,https://www.yelp.com/biz/babalu-tapas-and-taco...
9,9,"[Sushi Bars, Asian Fusion]","{'latitude': 33.5172233581543, 'longitude': -8...",5420.257261,"{'address1': '2212 2nd Ave N', 'address2': '',...",Bamboo on 2nd,12057030551.0,2,4.5,214,https://www.yelp.com/biz/bamboo-on-2nd-birming...


In [139]:
def get_reviews(key, state, url, name):
    """
    Get the yelp data for a state, and write it to a file

    Args:
        state (string): Name of the state
        cities (list): List of the top 3 populated cities in the state

    """
    with open('reviews/'+state+name+'.txt', 'w') as f:
        response = requests.get(url)
        soup = parse_page(response.content)
        json.dump(soup, f, ensure_ascii=False)

In [239]:
# for state in states:
#     get_reviews(api_key, state, states[state]['url'][0])
# for state in states:
#     states[state] = states[state].reset_index(drop = True)
#     states[state]['name'] = states[state]['name'].str.replace("/", "")
#     for i in range(len(states[state]['url'])):
#         get_reviews(api_key, state, states[state]['url'][i], states[state]['name'][i])

state = 'michigan'
states[state] = states[state].reset_index(drop = True)
states[state]['name'] = states[state]['name'].str.replace("/", "")
for i in range(len(states[state]['url'])):
    get_reviews(api_key, state, states[state]['url'][i], states[state]['name'][i])
    time.sleep(0.2)

In [167]:
print(states)

{'alabama':     level_0  index                                         categories  \
0         0      0                                          [Mexican]   
1         1      1                       [Barbeque, Seafood, Burgers]   
2         2      2                  [Cocktail Bars, Gastropubs, Pubs]   
3         3      3                                     [French, Bars]   
4         4      4                       [Cafes, Sandwiches, Burgers]   
5         5      5                                   [Southern, Bars]   
6         6      6        [American (New), Gluten-Free, Comfort Food]   
7         7      7                      [Breakfast & Brunch, Italian]   
8         8      8                [Latin American, Tapas Bars, Tacos]   
9         9      9                         [Sushi Bars, Asian Fusion]   
10       10     10                      [Italian, Seafood, Wine Bars]   
11       11     11  [Southern, American (Traditional), Chicken Wings]   
12       12     12                     

In [94]:
import nltk
from nltk.corpus import wordnet as wn
from nltk.stem.wordnet import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [203]:
food = wn.synset('food.n.01')
foods = list(set([w for s in food.closure(lambda s:s.hyponyms()) for w in s.lemma_names()]))
food = wn.synset('food.n.02')
foods.extend(list(set([w for s in food.closure(lambda s:s.hyponyms()) for w in s.lemma_names()])))
for i in range(len(foods)):
    foods[i] = foods[i].replace('_', ' ')
    foods[i] = foods[i].replace('-', ' ')
    foods[i] = foods[i].lower()
#foods = [lemmatizer.lemmatize(t) for t in foods]
foods.remove('a')
foods.remove('b')
foods.remove('c')
foods.remove('d')
foods.remove('e')
foods.remove('breakfast')
foods.remove('lunch')
foods.remove('dinner')
foods.remove('brunch')
foods.remove('menu')
foods.remove('meal')
foods.remove('plate')
foods.remove('table')
foods = [' ' + food + ' ' for food in foods]

In [172]:
print(foods)

[' intermixture ', " saint john's bread ", ' pastry ', ' caramelized sugar ', ' gulyas ', ' lemon oil ', ' jewish rye bread ', ' chocolate pudding ', ' onion butter ', ' garlic sauce ', ' pina colada ', ' adobo ', ' lemonade mix ', ' rechewed food ', ' canned meat ', ' ravigote ', ' tomato sauce ', ' hungarian goulash ', ' peppermint ', ' salad nicoise ', ' dodger ', ' refection ', ' soul food ', ' vitamin k ', ' pigwash ', ' moussaka ', ' veal parmesan ', ' anchovy butter ', ' proof spirit ', ' bourbon ', ' pastry dough ', ' bordeaux wine ', ' kabob ', ' open face sandwich ', ' turkey stew ', ' allemande ', ' knish ', ' stuffed cabbage ', ' drip coffee ', ' loaf of bread ', ' wonton ', ' fare ', ' weissbier ', ' flour ', ' lasagna ', ' fish mousse ', ' peanut bar ', ' liquorice ', ' codfish ball ', ' coquilles saint jacques ', ' stuffing ', ' cinnamon snail ', ' pyridoxal ', ' slaw ', ' low calorie diet ', ' philadelphia pepper pot ', ' petfood ', ' cough drop ', ' garlic butter ', ' 

In [128]:
with open('reviews/delaware.txt', 'r') as f:
    data = json.load(f)

In [129]:
for i in range(len(data[0])):
    whole_review = data[0][i]
    review = whole_review['text']
    tokens = nltk.word_tokenize(review)
#     lem = [lemmatizer.lemmatize(t) for t in tokens]
#     review = lemmatizer.lemmatize(review)
    review = review.lower()
print(review)

i came here for my sister's graduation and absolutely loved it! it's a cute cozy mexican restaurant downtown and they had a long wait, but they text you when your table opens up, so we had a drink at a bar nearby until our table was ready. the chips and salsa are good and the queso is amazing! everyone at the table really enjoyed their food as well. the waitresses were wonderful and they managed to sit our very large group all together. can't recommend this place highly enough.


In [173]:
for food in foods:
    if food in review:
        print(food)

 salsa 
 table 
 drink 
 chips 
 chips 


In [145]:
states[state]['url'][18]

'https://www.yelp.com/biz/perrys-steakhouse-and-grille-birmingham-birmingham?adjust_creative=TpLShpR2csZ2AtLGvsRWXw&utm_campaign=yelp_api_v3&utm_medium=api_v3_business_search&utm_source=TpLShpR2csZ2AtLGvsRWXw'

In [174]:
for state in states:
    for i in range(len(states[state]['url'])):
        with open('reviews/'+state+states[state]['name'][i]+'.txt', 'r') as f:
            data = json.load(f)
print(data)

[[], None]


In [220]:
res = {}
for i in range(len(states['alaska']['url'])):
    with open('reviews/'+'alaska'+states['alaska']['name'][i]+'.txt', 'r') as f:
        data = json.load(f)
        for j in range(len(data[0])):
            whole_review = data[0][j]
            review = whole_review['text']
            tokens = nltk.word_tokenize(review)
            review = review.lower()
            for food in foods:
                if food in review:
                    if food in res:
                        res[food] += 1
                    else:
                        res[food] = 1
print(res)

{' cheese ': 74, ' salsa ': 5, ' garlic ': 17, ' beer ': 32, ' drink ': 18, ' pizza ': 22, ' ale ': 4, ' delicious ': 48, ' apple ': 5, ' gyro ': 2, ' sauce ': 31, ' bread ': 60, ' picnic ': 1, ' salad ': 33, ' crisp ': 7, ' side ': 31, ' chicken ': 41, ' mix ': 4, ' wheat ': 4, ' sweet ': 34, ' brew ': 3, ' raspberry ': 6, ' chipotle ': 6, ' salmon ': 33, ' steak ': 24, ' veggie ': 7, ' barbecue ': 2, ' water ': 7, ' pie ': 9, ' shrimp ': 8, ' apricot ': 1, ' spread ': 5, ' meat ': 19, ' smoked salmon ': 4, ' must ': 14, ' soda ': 6, ' date ': 10, ' sausage ': 8, ' special ': 21, ' split ': 5, ' chili ': 16, ' mushroom ': 5, ' coffee ': 14, ' cup ': 6, ' eggs benedict ': 2, ' eggs ': 12, ' ricotta ': 2, ' potato ': 18, ' hash ': 10, ' bacon ': 17, ' pancake ': 2, ' canadian bacon ': 1, ' lemon ': 4, ' burrito ': 4, ' rice ': 8, ' sandwich ': 21, ' fried egg ': 1, ' egg ': 10, ' veg ': 1, ' dietary ': 2, ' gravy ': 4, ' patty ': 4, ' course ': 7, ' cream ': 17, ' cake ': 6, ' mocha ': 

In [218]:
top5 = sorted(res, key=res.get, reverse=True)[:10]

In [219]:
print(top5)

[' cheese ', ' chicken ', ' pork ', ' delicious ', ' potato ', ' salad ', ' sweet ', ' bread ', ' sauce ', ' fries ']


In [236]:
foodstr = ""
for key in res:
    for i in range(res[key]):
        foodstr = foodstr+key

In [237]:
print(foodstr)

 cheese  cheese  cheese  cheese  cheese  cheese  cheese  cheese  cheese  cheese  cheese  cheese  cheese  cheese  cheese  cheese  cheese  cheese  cheese  cheese  cheese  cheese  cheese  cheese  cheese  cheese  cheese  cheese  cheese  cheese  cheese  cheese  cheese  cheese  cheese  cheese  cheese  cheese  cheese  cheese  cheese  cheese  cheese  cheese  cheese  cheese  cheese  cheese  cheese  cheese  cheese  cheese  cheese  cheese  cheese  cheese  cheese  cheese  cheese  cheese  cheese  cheese  cheese  cheese  cheese  cheese  cheese  cheese  cheese  cheese  cheese  cheese  cheese  cheese  salsa  salsa  salsa  salsa  salsa  garlic  garlic  garlic  garlic  garlic  garlic  garlic  garlic  garlic  garlic  garlic  garlic  garlic  garlic  garlic  garlic  garlic  beer  beer  beer  beer  beer  beer  beer  beer  beer  beer  beer  beer  beer  beer  beer  beer  beer  beer  beer  beer  beer  beer  beer  beer  beer  beer  beer  beer  beer  beer  beer  beer  drink  drink  drink  drink  drink  drink  dr