## Crawler for Yelp Reviews

In [1]:
import time
import pandas as pd
import os

# packages to scrape content from web pages
import selenium
from selenium import webdriver
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# interpretation of html 
from bs4 import BeautifulSoup

#### entities to crawl

In [2]:
# Restaurant chains to search for. Each entry is substituted verbatim into the
# `find_desc` query parameter of URL_BODY (some entries are already URL-encoded).
l_search_entities = [
    "Chick-fil-A",
    "Chipotle Mexican Grill",
    "Burger King",
    "KFC",
    "McDonald’s",
    "Subway Restaurants",
    "Taco Bell",
    "Wendy’s",
    "Dunkin’",
    "Papa John’s Pizza",
    "Pizza+Hut",
    "Starbucks",
    "Applebee’s+Grill+%2B+Bar",
    "Olive Garden Italian Restaurant",
    "Shake shack",
]

# City label -> value substituted into the `find_loc` query parameter of URL_BODY.
# Insertion order is the order in which the cities are crawled.
d_places = {
    "New York": "New+York%2C+NY%2C+Vereinigte+Staaten",
    "Los Angeles": "Los Angeles%2C CA",
    "Chicago": "Chicago%2C IL",
    "Houston": "Houston%2C TX",
    "Phoenix": "Phoenix%2C AZ",
    "Philadelphia": "Philadelphia%2C PA",
    "San Antonio": "San Antonio%2C TX",
    "San Diego": "San Diego%2C CA",
    "Dallas": "Dallas%2C TX",
    "San Jose": "San Jose%2C CA",
    "Austin": "Austin%2C TX",
    "Jacksonville": "Jacksonville%2C FL",
    "Fort Worth": "Fort Worth%2C TX",
    "Columbus": "Columbus%2C OH",
    "San Francisco": "San Francisco%2C CA",
    "Charlotte": "Charlotte%2C NC",
    "Indianapolis": "Indianapolis%2C IN",
    "Seattle": "Seattle%2C WA",
    "Denver": "Denver%2C CO",
    "Washington": "Washington%2C DC",
    "Boston": "Boston%2C MA",
    "El Paso": "El Paso%2C TX",
    "Detroit": "Detroit%2C MI",
    "Nashville": "Nashville%2C TN",
    "Portland": "Portland%2C OR",
    "Memphis": "Memphis%2C TN",
    "Oklahoma City": "Oklahoma City%2C OK",
    "Las Vegas": "Las Vegas%2C NV",
    "Louisville": "Louisville%2C KY",
    "Baltimore": "Baltimore%2C MD",
    "Milwaukee": "Milwaukee%2C WI",
    "Albuquerque": "Albuquerque%2C NM",
    "Tucson": "Tucson%2C AZ",
    "Fresno": "Fresno%2C CA",
    "Mesa": "Mesa%2C AZ",
    "Sacramento": "Sacramento%2C CA",
    "Atlanta": "Atlanta%2C GA",
    "Kansas City": "Kansas City%2C MO",
    "Colorado Springs": "Colorado Springs%2C CO",
    "Miami": "Miami%2C FL",
    "Raleigh": "Raleigh%2C NC",
    "Omaha": "Omaha%2C NE",
    "Long Beach": "Long Beach%2C CA",
    "Virginia Beach": "Virginia Beach%2C VA",
    "Oakland": "Oakland%2C CA",
    "Minneapolis": "Minneapolis%2C MN",
    "Tulsa": "Tulsa%2C OK",
    "Tampa": "Tampa%2C FL",
    "New Orleans": "New Orleans%2C LA",
}

# Search URL template: "$1" is replaced by the entity, "$2" by the location.
URL_BODY = "https://www.yelp.com/search?find_desc=$1&find_loc=$2"

# Directory prefix and file name of the CSV that collects the crawled reviews.
RESULT_PATH = "./"
RESULT_FILE = "yelp_restaurants.csv"

#### init driver 

In [3]:
# Location of the chromedriver executable. Set the CHROMEDRIVER_PATH environment
# variable to override it; otherwise the original hard-coded location is used.
CHROMEDRIVER_PATH = os.environ.get("CHROMEDRIVER_PATH", r"C:/chromedriver.exe")
driver = webdriver.Chrome(CHROMEDRIVER_PATH)

# Let element look-ups wait up to 5 seconds before raising NoSuchElementException.
driver.implicitly_wait(5)


#### Function to crawl restaurant links from the search overview page
**input:** name of restaurant like 'Subway' (**string**), location like 'Boston' (**string**) <br>
**output:** list of links to all found items

In [50]:
from tqdm import tqdm
from selenium.common.exceptions import StaleElementReferenceException, ElementClickInterceptedException

def get_items(item_name, place):
    """Collect the links of all search results on Yelp that match a restaurant name.

    Opens the search page built from URL_BODY and walks through the result
    pages by clicking the "next" link until no matching result or no further
    page is found.

    Args:
        item_name: Search term substituted into the `find_desc` query
            parameter, e.g. "Subway" (may already be URL-encoded, e.g.
            "Pizza+Hut").
        place: Location substituted into the `find_loc` query parameter,
            e.g. "Boston%2C MA".

    Returns:
        De-duplicated list of `href` values of the matching result anchors
        (order is not preserved).
    """
    from urllib.parse import unquote_plus

    # Errors that mean "there is no usable next page". They must be given as a
    # tuple: `except A or B or C` evaluates to `except A` and silently ignores
    # the other two exception types.
    navigation_errors = (
        NoSuchElementException,
        ElementClickInterceptedException,
        StaleElementReferenceException,
    )

    # Some search terms are stored URL-encoded ("Pizza+Hut"); decode them before
    # comparing against the anchor's `name` attribute, which presumably holds the
    # human-readable name. Plain names without "+" or "%" are unaffected.
    wanted_name = unquote_plus(item_name).lower()

    l_links = []
    driver.get(URL_BODY.replace("$1", item_name).replace("$2", place))
    while True:
        # give the page time to render before reading its source
        time.sleep(1.5)
        soup = BeautifulSoup(driver.page_source, features="html.parser")

        anchors = soup.select('a[class="css-1422juy"]')
        matches = [a for a in anchors
                   if a.has_attr('name') and wanted_name in a['name'].lower()]
        if not matches:
            break

        l_links += [a['href'] for a in matches]

        try:
            # find_element(By..., ...) replaces find_element_by_css_selector,
            # which was removed in Selenium 4.
            next_page = driver.find_element(
                By.CSS_SELECTOR,
                'a[class*="next-link navigation-button__09f24__m9qRz css-1pxws0l"]')
            next_page.click()
        except navigation_errors:
            break

    return list(set(l_links))
    

#### Testing (not mandatory to execute)

In [51]:
restaurants = get_items("McDonald’s", 'Houston')
print(restaurants)

['/biz/mcdonalds-houston-4?osq=McDonald%27s', '/biz/mcdonalds-houston-158?osq=McDonald%27s', '/biz/mcdonalds-houston-25?osq=McDonald%27s', '/biz/mcdonalds-houston-147?osq=McDonald%27s', '/biz/mcdonalds-houston-44?osq=McDonald%27s']


#### Function to crawl the reviews of a single item
**input:** link to a certain restaurant/ item (**string**) (provided by function 'get_items()' above) <br>
**output:** list of three parallel lists, in this order: ratings, dates, comments

In [59]:
def handle_reviews(link):
    """Crawl all reviews of one restaurant, newest first.

    Opens the restaurant page sorted by date and walks through the review
    pages by clicking the "next" link until a page without a review section
    is reached or no further page can be opened.

    Args:
        link: Site-relative link to a restaurant as returned by `get_items()`,
            e.g. "/biz/mcdonalds-houston-4?osq=McDonald%27s".

    Returns:
        A list of three parallel lists of strings: `[ratings, dates, comments]`.
        A field that could not be found on the page is stored as ''.
    """
    # Errors that mean "there is no usable next page". They must be given as a
    # tuple: `except A or B or C` evaluates to `except A` and silently ignores
    # the other two exception types.
    navigation_errors = (
        NoSuchElementException,
        ElementClickInterceptedException,
        StaleElementReferenceException,
    )

    # Append the sort parameter correctly whether or not the link already
    # carries a query string.
    separator = '&' if '?' in link else '?'
    driver.get("https://www.yelp.com%s%ssort_by=date_desc" % (link, separator))

    l_ratings = []
    l_comments = []
    l_dates = []

    while True:
        # give the page time to render before reading its source
        time.sleep(1.5)
        soup = BeautifulSoup(driver.page_source, features="html.parser")
        review_sections = soup.select('section[class="margin-t4__09f24__G0VVf padding-t4__09f24__Y6aGL border--top__09f24__exYYb border-color--default__09f24__NPAKY"]')

        if not review_sections:
            print('no selection_ratings')
            break

        # Select directly on the parsed tags; re-parsing each tag's HTML into a
        # new BeautifulSoup object yields the same matches at extra cost.
        for section in review_sections:
            for review in section.select('div[class="review__09f24__oHr9V border-color--default__09f24__NPAKY"]'):
                stars = review.select_one('div[class*="i-stars__09f24__foihJ"]')
                rating = stars.get('aria-label', '') if stars else ''

                comment_tag = review.select_one('span[class*="raw__09f24__T4Ezm"]')
                comment = comment_tag.text if comment_tag else ''

                date_tag = review.select_one('span[class*="css-1e4fdj9"]')
                date = date_tag.text if date_tag else ''

                l_ratings.append(str(rating))
                l_comments.append(str(comment))
                l_dates.append(str(date))

        try:
            # find_element(By..., ...) replaces find_element_by_css_selector,
            # which was removed in Selenium 4.
            next_comment_page = driver.find_element(
                By.CSS_SELECTOR,
                'a[class*="next-link navigation-button__09f24__m9qRz"]')
            next_comment_page.click()
        except navigation_errors:
            break

    return [l_ratings, l_dates, l_comments]

In [60]:
test_review = handle_reviews(restaurants[1])
print(test_review[2][-1])

I mean...does anyone actually look at reviews of fast food places on Yelp? I"m not sure, but this is the happiest crew I've ever seen. They deserved a shout.This McD's is brand spankin' new. The interior is bright and modern. Around 10am on a Saturday, it was buzzing. I interacted with 3-4 crew members, all of whom must have mixed Red Bull in their coffee that morning and were exceptionally perky and happy. Greeted when I entered, pleasant to order, order brought to the table. Food hot and tasty as usual (biscuit sandwich with bacon - mmm).I'm still going to knock them for limiting their breakfast hours to 10am and for their merely passable offerings of "healthy" foods, but salads, oatmeal, and apple slices are on the menu to their credit. We'll see if it keeps up the excellence.


#### Function to read already handled links

In [61]:
def get_list_of_handled_links(path=None):
    """Return the restaurant links that are already stored in the result CSV.

    Args:
        path: CSV file to read. Defaults to `RESULT_PATH + RESULT_FILE`, the
            file the main loop appends to.

    Returns:
        List of the values in the "link" column, or None when the file does
        not exist, is empty, or has no "link" column.
    """
    if path is None:
        path = RESULT_PATH + RESULT_FILE

    if not os.path.exists(path):
        return None

    try:
        # Read the same path whose existence was just checked.
        df = pd.read_csv(path, header=0, encoding="ISO-8859-1")
    except pd.errors.EmptyDataError:
        # file exists but contains nothing yet
        return None

    if "link" not in df:
        return None

    links = df["link"]
    # Appending with a header each time leaves repeated header rows in the
    # file; those show up as the literal value "link" and are not real links.
    return links[links != "link"].tolist()

#### Main 

In [None]:
import json

# JSON file that records, per city, which restaurant chains are fully crawled,
# so an interrupted run can be resumed.
FINISHED_JSON_FILE = 'finished_cities_and_restaurants.json'
result_csv = RESULT_PATH + RESULT_FILE

handled_links = get_list_of_handled_links()
if handled_links:
    print(f'Handled reviews length: {len(handled_links)}')
else:
    print(handled_links)
# set for constant-time membership tests while filtering
handled_set = set(handled_links or [])

# Load the progress record once; create it on the first run.
if not os.path.exists(FINISHED_JSON_FILE):
    with open(FINISHED_JSON_FILE, 'w') as f:
        json.dump({}, f)
with open(FINISHED_JSON_FILE, 'r') as f:
    finished = json.load(f)

for item in l_search_entities:
    for place in d_places:
        if item in finished.get(place, []):
            print(f'Already finished crawling {item} in {place}, passed.')
            continue

        l_link = []
        l_restaurant = []
        l_place = []
        l_rating = []
        l_date = []
        l_text = []

        # first, get all restaurant links
        print(f'Start crawling {item} lists in {place}')
        links = get_items(item, d_places[place])
        links_filtered = [link for link in links if link not in handled_set]

        print(f'Now, crawl detailed {item} information in {place}')
        for link in tqdm(links_filtered):
            try:
                # then, get [ratings, dates, comments]
                l_results = handle_reviews(link)
            except IndexError:
                continue

            n_reviews = len(l_results[0])
            l_link += [link] * n_reviews
            l_restaurant += [item] * n_reviews
            l_place += [place] * n_reviews
            l_rating += l_results[0]
            l_date += l_results[1]
            l_text += l_results[2]

        print("%s - %s, %d restaurants (%d already handled)"
              % (item, place, len(links), len(links) - len(links_filtered)))

        df_grid = pd.DataFrame({
            "link": l_link,
            "restaurant": l_restaurant,
            "place": l_place,
            "rating": l_rating,
            "date": l_date,
            "comment": l_text,
        })
        # Write the header only when starting a new file; otherwise every
        # append would insert another header row into the data.
        write_header = not os.path.exists(result_csv) or os.path.getsize(result_csv) == 0
        df_grid.to_csv(result_csv, mode='a', header=write_header)

        # Links stored now count as handled for the rest of this run, exactly
        # as they would after a restart.
        handled_set.update(l_link)

        # The progress record maps place -> list of finished items; lists have
        # append(), not add().
        finished.setdefault(place, []).append(item)
        with open(FINISHED_JSON_FILE, 'w') as f:
            json.dump(finished, f)


None
Start crawling Chick-fil-A lists in New York
Now, crawl detailed Chick-fil-A information in New York


 20%|██        | 1/5 [00:30<02:03, 30.99s/it]

#### Building DataFrame of crawled data