# Food Map
#### Authors: 
##### Arrido Arfiadi, Christine Nguyen

In [48]:
# Web scraping, pickle imports
import requests
from bs4 import BeautifulSoup
import pickle
import pandas as pd
from datetime import datetime
import string
from functools import reduce

# Web Scraping

## Constants

In [57]:
# CSS class strings used with BeautifulSoup's class_ filter to locate elements
# on a Yelp business page.
# NOTE(review): the hashed suffixes look build-generated, so these selectors
# presumably break whenever Yelp redeploys its front end -- confirm before reuse.

# --- Review section (used by get_data_from_url) ---
# Container that is the parent of the "Recommended Reviews" heading.
SECTION_CLASS_NAME = "lemon--section__373c0__fNwDM margin-t4__373c0__1TRkQ padding-t4__373c0__3hVZ3 border--top__373c0__3gXLy border-color--default__373c0__3-ifU"
# List element holding the reviews inside that section.
REVIEW_LIST_NAME = "lemon--ul__373c0__1_cxs undefined list__373c0__3GI_T"
# One list item per review.
REVIEW_LIST_ITEM_NAME = "lemon--li__373c0__1r9wz margin-b3__373c0__q1DuY padding-b3__373c0__342DA border--bottom__373c0__3qNtD border-color--default__373c0__3-ifU"
# Element holding the text of a review.
COMMENT_SECTION = "lemon--p__373c0__3Qnnj text__373c0__2Kxyz comment__373c0__3EKjH text-color--normal__373c0__3xep9 text-align--left__373c0__2XGa-"
# Element holding the review date (parsed as month/day/year).
DATE_SECTION = "lemon--span__373c0__3997G text__373c0__2Kxyz text-color--mid__373c0__jCeOG text-align--left__373c0__2XGa-"
# Reviews from years before this one are not collected. Evaluated once, when this cell runs.
CUT_OFF_YEAR = datetime.now().year - 2
# Page size used by page_iterator to compute the 'start' offset of each page.
NUMBER_OF_COMMENTS_PER_PAGE = 20

# --- Restaurant header (used by get_restaurant_info_from_url) ---
# Heading element with the restaurant name.
RESTAURANT_CLASS_NAME = "lemon--h1__373c0__2ZHSL heading--h1__373c0__dvYgw undefined heading--inline__373c0__10ozy"
# Star-rating element; the rating text is read from its aria-label attribute.
# NOTE(review): the class contains "i-stars--large-4", so it may only match 4-star pages -- confirm.
RESTAURANT_STAR_RATING_CLASS_NAME = "lemon--div__373c0__1mboc i-stars__373c0__1T6rz i-stars--large-4__373c0__1d6HV border-color--default__373c0__3-ifU overflow--hidden__373c0__2y4YK"
# Element whose text is the number of ratings (e.g. "545 reviews").
RESTAURANT_NUMBER_OF_RATING_CLASS_NAME = "lemon--p__373c0__3Qnnj text__373c0__2Kxyz text-color--mid__373c0__jCeOG text-align--left__373c0__2XGa- text-size--large__373c0__3t60B"
# Element whose text is the price range (e.g. "$$$").
RESTAURANT_PRICE_RANGE_CLASS_NAME = "lemon--span__373c0__3997G text__373c0__2Kxyz text-color--normal__373c0__3xep9 text-align--left__373c0__2XGa- text-bullet--after__373c0__3fS1Z text-size--large__373c0__3t60B"
# Header container that is searched for the food-type elements.
RESTAURANT_HEADER_CLASS_NAME = "lemon--div__373c0__1mboc arrange__373c0__2C9bH gutter-4__373c0__3s8bL border-color--default__373c0__3-ifU"
# One element per food type / category inside the header.
RESTAURANT_TYPE_CLASS_NAME = "lemon--span__373c0__3997G text__373c0__2Kxyz text-color--black-extra-light__373c0__2OyzO text-align--left__373c0__2XGa- text-size--large__373c0__3t60B"
# One element per address fragment inside the location container.
RESTAURANT_ADDRESS_CLASS_NAME = "lemon--span__373c0__3997G raw__373c0__3rcx7"
# Location container that is searched for the address fragments.
RESTAURANT_LOCATION_CLASS_NAME = "lemon--div__373c0__1mboc padding-t2__373c0__11Iek padding-r2__373c0__28zpp padding-b2__373c0__34gV1 padding-l2__373c0__1Dr82 border-color--default__373c0__3-ifU"

In [3]:
# Empty table that accumulates the scraped reviews, one row per review.
yelp_data = pd.DataFrame(
    columns=["ReviewDate", "StarRating", "UserComment"],
)

In [4]:
# get_data_from_url takes a URL and a dataframe as arguments. The dataframe must contain 3 columns: review date, star rating, and user comment.
# It goes to the reviews section of the page, looks for 'Recommended Reviews', and from there reads the review date, star rating, and user comment of each review.
# Five class-name constants are used to jump straight to the elements that hold this data, which is then appended to the dataframe.
# After it finishes scraping the data on the page, it returns the dataframe.
def get_data_from_url(url, df):
    """Scrape one page of Yelp reviews and add the recent ones to ``df``.

    Args:
        url: Address of one page of reviews for a Yelp business.
        df: DataFrame with "ReviewDate", "StarRating" and "UserComment"
            columns. It is not modified.

    Returns:
        A DataFrame with the rows of ``df`` followed by one row per review
        on the page whose year is CUT_OFF_YEAR or later. ``df`` itself is
        returned when the page contributes no rows (no "Recommended Reviews"
        section, no review list, or the first review is already too old).

    Raises:
        ValueError: if a review date is not in month/day/year format.
    """
    page = requests.get(url).text
    soup = BeautifulSoup(page, "lxml")

    # The reviews live in the section that contains the
    # "Recommended Reviews" heading.
    reviewSection = None
    for header in soup.select("h4"):
        if "Recommended Reviews" in header.get_text():
            reviewSection = header.find_parent(class_=SECTION_CLASS_NAME)
            break
    if reviewSection is None:
        # Nothing to scrape on this page; previously this crashed with
        # AttributeError on None.
        return df

    reviewList = reviewSection.find(class_=REVIEW_LIST_NAME)
    if reviewList is None:
        return df

    # Each list item holds everything about one review (images, star rating,
    # comment, date, ...). Rows are collected first and concatenated once:
    # DataFrame.append was removed in pandas 2.0 and copied the whole frame
    # on every call.
    rows = []
    for review in reviewList.find_all(class_=REVIEW_LIST_ITEM_NAME):
        reviewDate = review.find(class_=DATE_SECTION).get_text()
        reviewYear = datetime.strptime(reviewDate, '%m/%d/%Y').year
        if reviewYear < CUT_OFF_YEAR:
            # Stop at the first too-old review; this relies on the page
            # being sorted newest-first (page_iterator requests
            # sort_by=date_desc).
            break
        rows.append({
            "ReviewDate": reviewDate,
            "StarRating": review.select_one("[aria-label*=rating]")["aria-label"],
            "UserComment": review.find(class_=COMMENT_SECTION).get_text(),
        })

    if not rows:
        return df
    return pd.concat([df, pd.DataFrame(rows)], ignore_index=True)

In [5]:
# page_iterator takes a URL and a dataframe as arguments. The dataframe must contain 3 columns: review date, star rating, and user comment.
# NUMBER_OF_COMMENTS_PER_PAGE is a constant set to 20 because it represents the maximum number of reviews per page on Yelp.
# baseUrl is the original URL with the 'sort_by=date_desc' parameter added, which sorts the reviews by date in descending order.
# The 'start' parameter is added as well; on Yelp it specifies the index of the review to start from.
# The function iterates through the pages of reviews for one restaurant link. It calls another function, get_data_from_url, and passes in a URL and the dataframe.
# After it has iterated through all the pages it needs to, it returns the dataframe.
# The definition of this function might change the next time we meet...

def page_iterator(url, df):
    """Scrape successive review pages of one restaurant into ``df``.

    Pages are requested newest-first, NUMBER_OF_COMMENTS_PER_PAGE reviews at
    a time, by adding ``sort_by=date_desc`` and ``start=<offset>`` to ``url``.
    Scraping continues while every page fetched so far contributed exactly
    NUMBER_OF_COMMENTS_PER_PAGE rows; a shorter page ends the loop.

    Args:
        url: Yelp business URL, with or without an existing query string.
        df: DataFrame with "ReviewDate", "StarRating" and "UserComment"
            columns. It may already contain rows.

    Returns:
        The DataFrame returned by the last get_data_from_url call, i.e. the
        rows of ``df`` followed by the newly scraped reviews.
    """
    # Start the query string with '?' when the URL has none yet; the old
    # code always used '&', which is only valid after an existing '?'.
    separator = '&' if '?' in url else '?'
    baseUrl = url + separator + 'sort_by=date_desc' + '&start='

    # Count rows relative to the starting size so that a frame that already
    # holds reviews is still processed (the old check compared against the
    # absolute length and therefore never ran for a non-empty frame).
    initialRows = len(df)
    page = 0
    while len(df) - initialRows == page * NUMBER_OF_COMMENTS_PER_PAGE:
        offset = page * NUMBER_OF_COMMENTS_PER_PAGE
        df = get_data_from_url(baseUrl + str(offset), df)
        page += 1
    return df

In [56]:
# get_restaurant_info_from_url takes a URL and a dataframe as arguments. The dataframe must contain 6 columns:
# "RestaurantName", "StarRating", "NumberOfRating", "PriceRange", "FoodTypes", "Address".
# The function parses the page to get the information for that restaurant and returns the dataframe with one row added.

def get_restaurant_info_from_url(url, df):
    """Scrape the header of a Yelp business page and add it to ``df`` as a row.

    Args:
        url: Address of the Yelp business page.
        df: DataFrame with "RestaurantName", "StarRating", "NumberOfRating",
            "PriceRange", "FoodTypes" and "Address" columns. It is not
            modified.

    Returns:
        A new DataFrame with the rows of ``df`` followed by one row for this
        restaurant. "FoodTypes" holds a list of strings; "Address" holds the
        address fragments joined with ", ".

    Raises:
        AttributeError or TypeError: if an expected element is missing from
            the page (BeautifulSoup's find returns None).
    """
    # Translation table that deletes every ASCII punctuation character.
    stripPunctuation = str.maketrans('', '', string.punctuation)

    def get_food_type(htmlObject):
        # Text of one food-type element, without punctuation or outer spaces.
        return htmlObject.get_text().translate(stripPunctuation).strip()

    page = requests.get(url).text
    soup = BeautifulSoup(page, "lxml")
    restaurantName = soup.find(class_=RESTAURANT_CLASS_NAME).get_text()
    starRating = soup.find(class_=RESTAURANT_STAR_RATING_CLASS_NAME)["aria-label"]
    numberOfRating = soup.find(class_=RESTAURANT_NUMBER_OF_RATING_CLASS_NAME).get_text()
    priceRange = soup.find(class_=RESTAURANT_PRICE_RANGE_CLASS_NAME).get_text()

    headerSection = soup.find(class_=RESTAURANT_HEADER_CLASS_NAME)
    foodTypes = [
        get_food_type(tag)
        for tag in headerSection.find_all(class_=RESTAURANT_TYPE_CLASS_NAME)
    ]

    # Join the text of every address fragment. The previous reduce() called
    # .get_text() on its own string result as soon as there were three or more
    # fragments, and returned a raw Tag when there was exactly one.
    locationSection = soup.find(class_=RESTAURANT_LOCATION_CLASS_NAME)
    address = ", ".join(
        tag.get_text()
        for tag in locationSection.find_all(class_=RESTAURANT_ADDRESS_CLASS_NAME)
    )

    # DataFrame.append was removed in pandas 2.0; build a one-row frame and
    # concatenate it instead.
    newRow = pd.DataFrame([{
        "RestaurantName": restaurantName,
        "StarRating": starRating,
        "NumberOfRating": numberOfRating,
        "PriceRange": priceRange,
        "FoodTypes": foodTypes,
        "Address": address,
    }])
    return pd.concat([df, newRow], ignore_index=True)

# Start from an empty table with the six columns get_restaurant_info_from_url fills in.
restaurantsInfo_df = pd.DataFrame(columns = ["RestaurantName", "StarRating","NumberOfRating", "PriceRange","FoodTypes", "Address"]) 
# Scrape two sample restaurants; each call returns the table with one more row.
restaurantsInfo_df = get_restaurant_info_from_url('https://www.yelp.com/biz/a-hong-kong-kitchen-seattle?osq=chinese',restaurantsInfo_df)
restaurantsInfo_df = get_restaurant_info_from_url('https://www.yelp.com/biz/fogo-de-ch%C3%A3o-brazilian-steakhouse-washington-8', restaurantsInfo_df)
# Display the first rows as the cell output.
restaurantsInfo_df.head()

Unnamed: 0,RestaurantName,StarRating,NumberOfRating,PriceRange,FoodTypes,Address
0,A + Hong Kong Kitchen,4 star rating,545 reviews,$,"[Cantonese, Hong Kong Style Cafe]","419 6th Ave S, Seattle, WA 98104"
1,Fogo de Chão Brazilian Steakhouse,4 star rating,1855 reviews,$$$,"[Brazilian, Steakhouses]","1101 Pennsylvania Ave NW, Washington, DC 20004"


In [7]:
# Sample restaurant used to try out the review scraper.
testUrl = 'https://www.yelp.com/biz/a-hong-kong-kitchen-seattle?osq=chinese'
# Scrape every review page for the sample restaurant into yelp_data.
yelp_data = page_iterator(testUrl, yelp_data).copy()

In [8]:
# Display the last scraped rows, i.e. the oldest reviews that were kept.
yelp_data.tail()

Unnamed: 0,ReviewDate,StarRating,UserComment
270,1/5/2018,5 star rating,"The mala tang is amazing!! Warning though, it'..."
271,1/4/2018,3 star rating,Waited 45 mins and it was sub-par. Lots of ite...
272,1/3/2018,4 star rating,Most authentic HK style food Ive had outside o...
273,1/3/2018,5 star rating,Found this little place on Yelp! When in the I...
274,1/1/2018,5 star rating,"Great, tiny and unassuming hole in a wall in I..."


In [9]:
#TODO
#Scrape more restaurants
#Design Data Warehouse
#Test Data Warehouse