# Food Map
#### Authors: 
##### Arrido Arfiadi, Christine Nguyen

In [1]:
# Web scraping, pickle imports
import requests
from bs4 import BeautifulSoup
import pickle
import pandas as pd
from datetime import datetime
import string
from functools import reduce

# Web Scraping

## Constants

In [2]:
SECTION_CLASS_NAME = "lemon--section__373c0__fNwDM margin-t4__373c0__1TRkQ padding-t4__373c0__3hVZ3 border--top__373c0__3gXLy border-color--default__373c0__3-ifU"
REVIEW_LIST_NAME = "lemon--ul__373c0__1_cxs undefined list__373c0__3GI_T"
REVIEW_LIST_ITEM_NAME = "lemon--li__373c0__1r9wz margin-b3__373c0__q1DuY padding-b3__373c0__342DA border--bottom__373c0__3qNtD border-color--default__373c0__3-ifU"
COMMENT_SECTION = "lemon--p__373c0__3Qnnj text__373c0__2Kxyz comment__373c0__3EKjH text-color--normal__373c0__3xep9 text-align--left__373c0__2XGa-"
DATE_SECTION = "lemon--span__373c0__3997G text__373c0__2Kxyz text-color--mid__373c0__jCeOG text-align--left__373c0__2XGa-"
CUT_OFF_YEAR = datetime.now().year - 2
NUMBER_OF_COMMENTS_PER_PAGE = 20
NUMBER_OF_RESTAURANTS_PER_PAGE = 10

#REVIEWS, RATINGS AND OTHER ATTRIBUTES OF A RESTUARANT
RESTAURANT_CLASS_NAME = "lemon--h1__373c0__2ZHSL heading--h1__373c0__dvYgw undefined heading--inline__373c0__10ozy"
RESTAURANT_STAR_RATING_CLASS_NAME = "lemon--div__373c0__1mboc i-stars__373c0__1T6rz i-stars--large-4__373c0__1d6HV border-color--default__373c0__3-ifU overflow--hidden__373c0__2y4YK"
RESTAURANT_NUMBER_OF_RATING_CLASS_NAME = "lemon--p__373c0__3Qnnj text__373c0__2Kxyz text-color--mid__373c0__jCeOG text-align--left__373c0__2XGa- text-size--large__373c0__3t60B"
RESTAURANT_PRICE_RANGE_CLASS_NAME = "lemon--span__373c0__3997G text__373c0__2Kxyz text-color--normal__373c0__3xep9 text-align--left__373c0__2XGa- text-bullet--after__373c0__3fS1Z text-size--large__373c0__3t60B"
RESTAURANT_HEADER_CLASS_NAME = "lemon--div__373c0__1mboc arrange__373c0__2C9bH gutter-4__373c0__3s8bL border-color--default__373c0__3-ifU"
RESTAURANT_TYPE_CLASS_NAME = "lemon--span__373c0__3997G text__373c0__2Kxyz text-color--black-extra-light__373c0__2OyzO text-align--left__373c0__2XGa- text-size--large__373c0__3t60B"
RESTAURANT_ADDRESS_CLASS_NAME = "lemon--span__373c0__3997G raw__373c0__3rcx7"
RESTAURANT_LOCATION_CLASS_NAME = "lemon--div__373c0__1mboc padding-t2__373c0__11Iek padding-r2__373c0__28zpp padding-b2__373c0__34gV1 padding-l2__373c0__1Dr82 border-color--default__373c0__3-ifU"

#SEARCH RESULTS
SEARCH_RESULTS_RESTAURANT_NAME_CONTAINER = "container__09f24__21w3G hoverable__09f24__2nTf3 margin-t3__09f24__5bM2Z margin-b3__09f24__1DQ9x padding-t3__09f24__-R_5x padding-r3__09f24__1pBFG padding-b3__09f24__1vW6j padding-l3__09f24__1yCJf border--top__09f24__1H_WE border--right__09f24__28idl border--bottom__09f24__2FjZW border--left__09f24__33iol border-color--default__09f24__R1nRO"
SEARCH_RESULTS_RESTAURANT_LINK = "link__09f24__1kwXV link-color--inherit__09f24__3PYlA link-size--inherit__09f24__2Uj95"

In [3]:
restaurantsRating_df = pd.DataFrame(columns = ["RestaurantID", "ReviewDate", "StarRating","UserComment"]) 
restaurantsInfo_df = pd.DataFrame(columns = ["RestaurantID", "RestaurantName", "StarRating","NumberOfRating", "PriceRange","FoodTypes", "Address"]) 

In [24]:
# get_ratings_and_reviews function takes a URL link and a dataframe as an argument. The dataframe must contain 3 columns: review date, star rating, and user commment.
#it will go to the 'review' section and search for 'recommended reviews', and from there it will find the review date, star rating, and user comment. 
#There are 5 constants in the function that will jump straight to the section needed to grab the data and appends it to a dataframe.
#After it completes scraping the data on the page, it will return return the dataframe.
def get_ratings_and_reviews(url, df):
    page = requests.get(url).text
    soup = BeautifulSoup(page, "lxml") 
    reviewSection = None
    for header in soup.select("h4"):
        if "Recommended Reviews" in header.get_text():
            reviewSection = header.find_parent(class_ = SECTION_CLASS_NAME)
            break
    reviewList = reviewSection.find(class_ = REVIEW_LIST_NAME) #the review section
    reviews = reviewList.find_all(class_ = REVIEW_LIST_ITEM_NAME) #contains all the everything, containing images, star ratings, comments and date, probably user, etc.

    for review in reviews:
        reviewDate = review.find(class_ = DATE_SECTION).get_text()
        reviewYear = datetime.strptime(reviewDate, '%m/%d/%Y').year
        if reviewYear >= CUT_OFF_YEAR:
            df = df.append({"RestaurantID": len(df) + 1,"ReviewDate":reviewDate
                      ,"StarRating":review.select_one("[aria-label*=rating]")["aria-label"]
                      ,"UserComment":review.find(class_ = COMMENT_SECTION).get_text()}, ignore_index = True)
        else:
            break
    return df         

In [5]:
#page_iterator_for_reviews is a function takes a URL link and a dataframe as an argument. The dataframe must contain 3 columns: review date, star rating, and user commment.
#NUMBER_OF_COMMENTS_PER_PAGE is a constant set to 20 because it represents the maximum number of reviews per page on yelp. 
#BASE_URL is a constant that takes the original URL link and adds '&sort_by=date_desc' that sorts the reviews by year in descending order.
#'&start=' is added, which in yelp is a parameter, to specify which review index to start 
#The function iterates through pages of reviews per restaurant link. It calls another funciton, get_data_from_url and passes in a URL link and dataframe.
#After it iterates through all the pages it needs to, itt returns a dataframe.
#definition of this function might change the next time we meet...

def page_iterator_for_restaurant_reviews(url, df):
    page = 0
    baseUrl = url+'&sort_by=date_desc'+'&start='
    
    while page*NUMBER_OF_COMMENTS_PER_PAGE == len(df):
        df = get_data_from_url(baseUrl+str(page*NUMBER_OF_COMMENTS_PER_PAGE),df).copy()
        page+=1
    return df    

In [6]:
#get_restaurant_info_from_url is a function takes a URL link and a dataframe as an argument. The dataframe must contain 3 columns: "RestaurantName", "StarRating","NumberOfRating", "PriceRange","FoodTypes", "Address"
#The function parses through the page to get the information from that restaurant. It consist of inner functions get_food_type and merge_address that helps in parsing the data by allowing it to be used in map and reduce 
#After it finishes, it returns a dataframe.

def get_restaurant_info_from_url(url, df):
    #COMMENT HERE
    def get_food_type(htmlObject):
        return htmlObject.get_text().translate(str.maketrans('', '', string.punctuation)).strip()
    
    def merge_address(x1, x2): 
        return x1.get_text() + ", " + x2.get_text()
    
    page = requests.get(url).text
    soup = BeautifulSoup(page, "lxml") 
    restaurantName = soup.find(class_ = RESTAURANT_CLASS_NAME).get_text()
    starRating = soup.find(class_ = RESTAURANT_STAR_RATING_CLASS_NAME)["aria-label"]
    numberOfRating = soup.find(class_ = RESTAURANT_NUMBER_OF_RATING_CLASS_NAME).get_text()
    priceRange = soup.find(class_ = RESTAURANT_PRICE_RANGE_CLASS_NAME).get_text()
    foodTypes = list(map(get_food_type,soup.find(class_ = RESTAURANT_HEADER_CLASS_NAME).find_all(class_ = RESTAURANT_TYPE_CLASS_NAME)))
    address = reduce(merge_address,soup.find(class_ = RESTAURANT_LOCATION_CLASS_NAME).find_all(class_ = RESTAURANT_ADDRESS_CLASS_NAME))
    
    df = df.append({"RestaurantName": restaurantName,
                    "StarRating":starRating,
                    "NumberOfRating": numberOfRating,
                    "PriceRange": priceRange,
                    "FoodTypes": foodTypes,
                    "Address": address
                   }, ignore_index = True)
    return df


# restaurantsInfo_df = get_restaurant_info_from_url('https://www.yelp.com/biz/a-hong-kong-kitchen-seattle?osq=chinese',restaurantsInfo_df)
# restaurantsInfo_df = get_restaurant_info_from_url('https://www.yelp.com/biz/fogo-de-ch%C3%A3o-brazilian-steakhouse-washington-8', restaurantsInfo_df)
restaurantsInfo_df.head()

Unnamed: 0,RestaurantID,RestaurantName,StarRating,NumberOfRating,PriceRange,FoodTypes,Address


In [7]:
testUrl = 'https://www.yelp.com/biz/a-hong-kong-kitchen-seattle?osq=chinese'
yelp_data = page_iterator(testUrl, yelp_data).copy()

NameError: name 'page_iterator' is not defined

In [None]:
yelp_data.tail()

In [None]:
test = yelp_data.copy()

In [None]:
#TODO
#Scrape more restaurants
#Design Data Warehouse
#Test Data Warehouse

In [8]:
testUrl = 'https://www.yelp.com/search?cflt=chinese&find_loc=Seattle%2C%20WA'

In [18]:
def page_iterator_for_search_results(url):
    page = 0
    baseUrl = url+'&start='
    restuarantURLs = []
    
    while page*NUMBER_OF_RESTAURANTS_PER_PAGE == len(restuarantURLs):
        restuarantURLs += get_urls_on_search_page(baseUrl+str(page*NUMBER_OF_RESTAURANTS_PER_PAGE))
        page+=1
    return restuarantURLs   

In [19]:
def get_urls_on_search_page(url):
    page = requests.get(url).text
    soup = BeautifulSoup(page, "lxml") 
    URLs = []
    restaurantSections = soup.find_all(class_ = SEARCH_RESULTS_RESTAURANT_NAME_CONTAINER)

    for restaurant in restaurantSections:
        URL = restaurant.find(class_ = SEARCH_RESULTS_RESTAURANT_LINK)["href"]
        if URL.__contains__("/adredir") == False:
            URLs.append(URL)
    return URLs

In [20]:
links = page_iterator_for_search_results(testUrl)

240


['/biz/a-hong-kong-kitchen-seattle',
 '/biz/tyger-tyger-seattle',
 '/biz/dumpling-the-noodle-seattle',
 '/biz/harbor-city-restaurant-seattle',
 '/biz/lionhead-seattle-2',
 '/biz/chengdu-taste-%E6%BB%8B%E5%91%B3%E6%88%90%E9%83%BD-seattle-3',
 '/biz/tian-fu-seattle-4',
 '/biz/biang-biang-noodles-seattle-2',
 '/biz/magic-dragon-seattle-3',
 '/biz/mei-mei-cafe-seattle-2',
 '/biz/fu-shen-seattle',
 '/biz/qin-xian-noodles-seattle-3',
 '/biz/chef-liao-asian-fusion-cuisine-seattle',
 '/biz/xian-noodles-seattle-4',
 '/biz/chef-king-seattle-4',
 '/biz/taste-of-xian-seattle',
 '/biz/little-chengdu-seattle-2',
 '/biz/spiceup-szechuan-cuisine-seattle-2',
 '/biz/lucky-chinese-restaurant-seattle',
 '/biz/tai-tung-chinese-restaurant-seattle',
 '/biz/sichuanese-cuisine-seattle',
 '/biz/six-pack-foods-seattle',
 '/biz/kau-kau-barbeque-seattle',
 '/biz/green-tree-seattle-2',
 '/biz/mikes-noodle-house-seattle',
 '/biz/qian-noodles-seattle-2',
 '/biz/king-noodle-seattle',
 '/biz/ballard-mandarin-seattle',


In [30]:
def get_yelp_data(url,restaurantsInfo,restaurantsRating):
    links = page_iterator_for_search_results(url)
    for link in links:
        fullLink = "https://yelp.com" + link
        print(fullLink)
        restaurantsInfo_df = get_restaurant_info_from_url(fullLink, restaurantsInfo)
        #get_ratings_and_reviews(fullLink,restaurantsRating)
        
get_yelp_data(testUrl,restaurantsInfo_df, restaurantsRating_df)    

0


In [28]:
restaurantsInfo_df.head()

Unnamed: 0,RestaurantID,RestaurantName,StarRating,NumberOfRating,PriceRange,FoodTypes,Address


In [None]:
#function takes URL of search reuslts, and two dataframe

#instantiate restauraunts links array, and use some kind of function to populate it per page

#iterate through restauraunt links array :
#function finds link to first restaurant, then calls get_restaurant_info. Pass in restaurant link & pass in restaurantsInfo_df  .
#function calls get_ratings_and_reviews and passes in restauraunt link & restaurantsRating_df

