# Food Map
#### Authors: 
##### Arrido Arfiadi, Christine Nguyen

In [78]:
# Web scraping, pickle imports
import requests
from bs4 import BeautifulSoup
import pickle
import pandas as pd
from datetime import datetime

# Web Scraping

In [106]:
# Empty accumulator for the scraped reviews: one row per review, three columns.
yelp_data = pd.DataFrame(columns=["ReviewDate", "StarRating", "UserComment"])

In [107]:
# get_data_from_url takes a URL link and a dataframe as arguments. The dataframe must contain 3 columns: review date, star rating, and user comment.
# It goes to the 'review' section by searching for 'Recommended Reviews', and from there it finds the review date, star rating, and user comment of each review.
# There are 5 HTML class-name constants in the function that jump straight to the sections needed to grab the data, which is then appended to a dataframe.
# After it finishes scraping the data on the page, it returns the dataframe.
def get_data_from_url(url, df):
    """Scrape one Yelp review page and return ``df`` extended with its recent reviews.

    The page at ``url`` is downloaded, the section headed "Recommended Reviews"
    is located, and the review items inside it are read in page order.  A
    review is accepted when its year is at least the current year minus two;
    scanning stops at the first older review, which assumes the page lists
    reviews newest-first.

    Args:
        url: Address of the Yelp business page to scrape.
        df: DataFrame to extend; expected to have the columns ``ReviewDate``,
            ``StarRating`` and ``UserComment``.

    Returns:
        A new DataFrame holding the rows of ``df`` followed by one row per
        accepted review (index renumbered from 0).  ``df`` itself is returned
        unchanged when the page yields no accepted review, including when the
        review section or review list cannot be found on the page.

    Raises:
        requests.RequestException: On a timeout, a connection failure, or an
            HTTP error status.
        ValueError: If a review date is not in ``%m/%d/%Y`` form.
    """
    # HTML class names, matched as whole class-attribute strings.
    # NOTE(review): these look auto-generated, so they presumably stop
    # matching whenever Yelp changes its markup -- confirm before reuse.
    SECTION_CLASS_NAME = "lemon--section__373c0__fNwDM margin-t4__373c0__1TRkQ padding-t4__373c0__3hVZ3 border--top__373c0__3gXLy border-color--default__373c0__3-ifU"
    REVIEW_LIST_NAME = "lemon--ul__373c0__1_cxs undefined list__373c0__3GI_T"
    REVIEW_LIST_ITEM_NAME = "lemon--li__373c0__1r9wz margin-b3__373c0__q1DuY padding-b3__373c0__342DA border--bottom__373c0__3qNtD border-color--default__373c0__3-ifU"
    COMMENT_SECTION = "lemon--p__373c0__3Qnnj text__373c0__2Kxyz comment__373c0__3EKjH text-color--normal__373c0__3xep9 text-align--left__373c0__2XGa-"
    DATE_SECTION = "lemon--span__373c0__3997G text__373c0__2Kxyz text-color--mid__373c0__jCeOG text-align--left__373c0__2XGa-"
    CUT_OFF_YEAR = datetime.now().year - 2
    REQUEST_TIMEOUT_SECONDS = 30

    # A timeout keeps a stalled connection from hanging the scrape forever, and
    # raise_for_status surfaces HTTP errors instead of parsing an error page.
    response = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "lxml")

    # Locate the enclosing section of the "Recommended Reviews" header.
    review_section = None
    for header in soup.select("h4"):
        if "Recommended Reviews" in header.get_text():
            review_section = header.find_parent(class_=SECTION_CLASS_NAME)
            break
    if review_section is None:
        # No review section on this page: nothing to add.
        return df

    review_list = review_section.find(class_=REVIEW_LIST_NAME)
    if review_list is None:
        return df

    # Each list item holds one review (date, star rating, comment, and more).
    rows = []
    for review in review_list.find_all(class_=REVIEW_LIST_ITEM_NAME):
        date_text = review.find(class_=DATE_SECTION).get_text()
        if datetime.strptime(date_text, "%m/%d/%Y").year < CUT_OFF_YEAR:
            # Everything after the first too-old review is skipped.
            break
        rows.append({
            "ReviewDate": date_text,
            "StarRating": review.select_one("[aria-label*=rating]")["aria-label"],
            "UserComment": review.find(class_=COMMENT_SECTION).get_text(),
        })

    if not rows:
        return df
    # DataFrame.append no longer exists in pandas 2.x; concatenate once instead
    # of growing the frame row by row.
    return pd.concat([df, pd.DataFrame(rows)], ignore_index=True)

In [102]:
#page_iterator is a function that takes a URL link and a dataframe as arguments. The dataframe must contain 3 columns: review date, star rating, and user comment.
#NUMBER_OF_COMMENTS_PER_PAGE is a constant set to 20 because it represents the maximum number of reviews per page on Yelp.
#BASE_URL is a constant that takes the original URL link and adds the 'sort_by=date_desc' parameter, which sorts the reviews by date in descending order.
#The 'start=' parameter is also added; on Yelp it specifies which review index to start from.
#The function iterates through the pages of reviews for one restaurant link. It calls another function, get_data_from_url, and passes in a URL link and the dataframe.
#After it iterates through all the pages it needs to, it returns a dataframe.
#definition of this function might change the next time we meet...

def page_iterator(url, df):
    """Scrape successive review pages of one Yelp business into ``df``.

    Pages are requested newest-first (``sort_by=date_desc``), 20 reviews at a
    time via the ``start`` offset, by calling ``get_data_from_url`` for each
    page.  Paging continues only while every page fetched so far contributed a
    full 20 rows; the first page that adds fewer ends the loop.

    Args:
        url: Yelp business page address, with or without a query string.
        df: DataFrame to extend; may already contain rows from earlier calls.

    Returns:
        A DataFrame with the rows of ``df`` followed by the scraped reviews.
    """
    NUMBER_OF_COMMENTS_PER_PAGE = 20
    # Start the query string with '?' when the URL has none yet; otherwise
    # extend the existing one with '&'.
    separator = '&' if '?' in url else '?'
    BASE_URL = url + separator + 'sort_by=date_desc' + '&start='

    # Count only rows added by this call, so a frame that already holds
    # reviews is still paginated correctly.
    rows_before = len(df)
    page = 0
    while len(df) - rows_before == page * NUMBER_OF_COMMENTS_PER_PAGE:
        # get_data_from_url can hand back its input frame when a page adds
        # nothing; the copy keeps the result independent of the caller's frame.
        df = get_data_from_url(BASE_URL + str(page * NUMBER_OF_COMMENTS_PER_PAGE), df).copy()
        page += 1
    return df

In [None]:
# Yelp business page used as the scraping test case.
testUrl = 'https://www.yelp.com/biz/a-hong-kong-kitchen-seattle?osq=chinese'
# NOTE(review): url_to_transcript is not defined in the visible part of this file, so
# this line raises NameError unless it is defined elsewhere. Presumably a leftover
# name for get_data_from_url (same (url, df) call shape) -- confirm, then rename or
# delete this cell.
yelp_data = url_to_transcript(testUrl, yelp_data).copy()

In [108]:
# Scrape the review pages for testUrl and rebind yelp_data to a copy of the result.
yelp_data = page_iterator(testUrl, yelp_data).copy()

0
1
2
3
4
5
6
7
8
9
10
11
12
13


In [109]:
# Display the last rows of the scraped reviews as a quick sanity check.
yelp_data.tail()

Unnamed: 0,ReviewDate,StarRating,UserComment
270,1/5/2018,5 star rating,"The mala tang is amazing!! Warning though, it'..."
271,1/4/2018,3 star rating,Waited 45 mins and it was sub-par. Lots of ite...
272,1/3/2018,4 star rating,Most authentic HK style food Ive had outside o...
273,1/3/2018,5 star rating,Found this little place on Yelp! When in the I...
274,1/1/2018,5 star rating,"Great, tiny and unassuming hole in a wall in I..."


In [None]:
#TODO
#get restaurant data: Restaurant Name, Star Rating, Dollar Sign, Cuisine Type, Restaurant Address
#code cleaning