In [None]:
#pip install selenium

In [1]:
import csv
import sys
import re
import time
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException 

In [2]:
def check_element_exist(driver, selector, by = 'css'):
    """
    helper function to check if an element exists in the website, 
    default search by css selector
    """
    try:
        if by == 'css':
            driver.find_element(By.CSS_SELECTOR, selector)
        elif by == 'class':
            driver.find_element(By.CLASS_NAME, selector)
    except NoSuchElementException:
        return False
    return True

In [14]:
def get_restaurant_list(url):
    """
    given a tripadvisor restaurant keyword search result,
    return the restaurant page urls from the first page 
    to the second last page of the search result, 
    only including the resturants with more than 10 reviews,
    exculding the sponsored (advertising) result
    """
    driver = webdriver.Safari()
    driver.get(url)
    links = []
    page = 1
    page_link_count = 0
    selector = 'a.nav.next.rndBtn.ui_button.primary.taLnk'
    while check_element_exist(driver, 'a.nav.next.rndBtn.ui_button.primary.taLnk'):
        chunks = driver.find_elements(By.CSS_SELECTOR,'#component_2 > div > div > span')
        for c in chunks:
            if check_element_exist(c, 'span.NoCoR'):
                review_count = int(re.findall(r'\d+',c.find_element(By.CSS_SELECTOR,'span.NoCoR').text)[0])
                if (check_element_exist(c, 'div.bXDMz') == False) and (review_count >= 10):
                    restaurant = c.find_element(By.CSS_SELECTOR,'div.OhCyu > span > a')
                    links.append(restaurant.get_attribute('href'))
                    page_link_count += 1
        driver.find_element(By.CSS_SELECTOR, selector).send_keys("\n")
        print('page'+str(page)+': '+str(page_link_count)+' links')
        page += 1
        time.sleep(4)
    driver.close()
    return links

In [7]:
def get_restaurant_info(links, keyword, filename = 'restaurant.csv'):
    """
    from the list of the restaurant urls, get the information
    of the restaurants, write to a csv file
    columns:
        keyword (str): link list search by the keyword 
            (for example neighborhood name or cuisine type)
        restaurant_name (str)
        link (str): url
        price_class: $ = 1, $$-$$$ = 2.5, $$$$ = 4
        price_range: [lower bound, upper bound] if specified
        category: list of the Trip Advisor official categories
        michelin: (bool)
        ranking: (int) ranking out of all restaurant in Chicago
        avg_rating: (int) the overall ranking
        food_/service_/value_/atomosphere_rating: (int)
            sub categories of ratings
        rate5_/rate4_/rate3_/rate2_/rate1_count: (int)
            distribution of customer ratings
    """
    driver = webdriver.Safari()
    csv_file = open(filename, 'a', encoding="utf-8")
    csv_writer = csv.writer(csv_file)
    csv_writer.writerow(['keyword', 'restaurant_name', 'link', 
                         'price_class', 'price_range', 'category', 'michelin', 'ranking', 
                         'avg_rating', 'food_rating', 'service_rating', 'value_rating', 'atmosphere_rating',
                         'review_count', 'rate5_count', 'rate4_count', 'rate3_count', 'rate2_count', 'rate1_count'])
    restaurant_df = pd.DataFrame()
    for index, link in enumerate(links):
        time.sleep(2)
        driver.get(link)

        restaurant_name = driver.find_element(By.CSS_SELECTOR,'div.eTnlN._W.w.O > h1').text
        
        if check_element_exist(driver, 'cfvAV', by = 'class'):
            pri_ran = driver.find_element(By.CLASS_NAME,'cfvAV')
            price_range = [int(i) for i in re.findall(r'\d+', pri_ran.text)]
        elif check_element_exist(driver, 'bYIkW', by = 'class'):
            pri_ran = driver.find_element(By.CLASS_NAME,'bYIkW')
            price_range = [int(i) for i in re.findall(r'\d+', pri_ran.text)]
        else:
            price_range = []

        pri_cat = driver.find_elements(By.CSS_SELECTOR,'span.dyeJW.VRlVV > a')
        price_class = (pri_cat[0].text).count('$')
        if price_class == 5:
            price_class = 2.5
        category = [cat.text for cat in pri_cat[1:]]
        
        michelin = check_element_exist(driver, 'div.eabcK.Pc > div > div > div')
        ranking = int(re.findall(r'\d+', 
                    driver.find_element(By.CSS_SELECTOR,'div.fYCpi > b > span').text)[0])
        
        review_count = int(re.findall(r'\d+', 
                        driver.find_element(By.CSS_SELECTOR,'div.eEwDq > a').text)[0])
        avg_rating = int(re.findall(r'\d+', 
                        driver.find_element(By.CSS_SELECTOR,'span.fdsdx').text)[0])

        cat_ratings = driver.find_elements(By.CLASS_NAME,'cwxUN')
        if len(cat_ratings) == 4:
            food_element = cat_ratings[0].find_element(By.CSS_SELECTOR,"span")
            food_rating = int(re.findall(r'\d+', food_element.get_attribute('class'))[0])/10
            service_element = cat_ratings[1].find_element(By.CSS_SELECTOR,"span")
            service_rating = int(re.findall(r'\d+', service_element.get_attribute('class'))[0])/10
            value_element = cat_ratings[2].find_element(By.CSS_SELECTOR,"span")
            value_rating = int(re.findall(r'\d+', value_element.get_attribute('class'))[0])/10
            atmosphere_element = cat_ratings[2].find_element(By.CSS_SELECTOR,"span")
            atmosphere_rating = int(re.findall(r'\d+', atmosphere_element.get_attribute('class'))[0])/10
        else:
            food_rating, service_rating, value_rating, atmosphere_rating = 'NA', 'NA', 'NA', 'NA'

        rate_distri = driver.find_elements(By.CSS_SELECTOR,'span.row_num.is-shown-at-tablet')
        rate5_count = int(rate_distri[0].text)
        rate4_count = int(rate_distri[1].text)
        rate3_count = int(rate_distri[2].text)
        rate2_count = int(rate_distri[3].text)
        rate1_count = int(rate_distri[4].text)
        
        csv_writer.writerow([keyword, restaurant_name, link, 
                             price_class, price_range, category, michelin, ranking, 
                             avg_rating, food_rating, service_rating, value_rating, atmosphere_rating,
                             review_count, rate5_count, rate4_count, rate3_count, rate2_count, rate1_count])
        print(str(index), restaurant_name)
    driver.close()    

In [8]:
def check_next_page(driver, selector):
    """
    helper function to check if there is next page to scrape
    """
    if check_element_exist(driver, selector):
        next_page_url = driver.find_element(By.CSS_SELECTOR, selector)\
                        .get_attribute('href')
        if next_page_url != '':
            if next_page_url != None:
                return 'Next'
            else:
                return 'Last'
    return 'None'

In [25]:
def get_restaurant_reviews(links, filename = 'reviews.csv'):
    """
    from the list of the restaurant urls, get all reviews
    of the restaurants, write to a csv file
    columns:
        restaurant_name(str)
        link(str): url
        date: (Month DD, YYYY)
        rating(int): ranging from 1-5
        title(str)
        review(str)
    """
    driver = webdriver.Safari()
    csv_file = open(filename, 'a', encoding="utf-8")
    csv_writer = csv.writer(csv_file)
    csv_writer.writerow(['restaurant_name', 'link', 'date', 'rating', 'title', 'review'])
    for index, link in enumerate(links):
        driver.get(link)
        restaurant_name = driver.find_element(By.CSS_SELECTOR,'div.eTnlN._W.w.O > h1').text
        print(str(index)+' '+restaurant_name +': ')
        page = 0
        review_count = 0
        next_page = link
        while check_next_page(driver, 'a.nav.next.ui_button.primary') != 'None':
            time.sleep(1)
            driver.get(next_page)
            time.sleep(1)
            reviews = driver.find_elements(By.XPATH,".//div[@class='review-container']")
            if check_element_exist(driver, 'span.taLnk.ulBlueLinks'):
                driver.find_element(By.XPATH,"//span[@class='taLnk ulBlueLinks']").click()
                time.sleep(2)
            for review in reviews:
                title = review.find_element(By.CSS_SELECTOR,'div.quote').text
                date = review.find_element(By.CSS_SELECTOR, 'span.ratingDate')\
                             .get_attribute("title")
                rating = int(review.find_element\
                            (By.XPATH,".//span[contains(@class, 'ui_bubble_rating bubble_')]")\
                             .get_attribute("class").split("_")[3])//10
                review = review.find_element\
                            (By.CSS_SELECTOR,"div.prw_rup.prw_reviews_text_summary_hsx > div > p")\
                             .text.replace("\n", " ")
                
                csv_writer.writerow([restaurant_name, link, date, rating, title, review])
                review_count += 1
            if check_next_page(driver, 'a.nav.next.ui_button.primary') == 'Last':
                print('\t' + str(review_count)+' reviews')
                break
            page += 10
            next_page = re.sub(r'(Reviews-)', r'\1or'+str(page)+'-', link)
    driver.close()

In [16]:
keyword = 'Chinatown'
url = "https://www.tripadvisor.com/Restaurants-g35805-zfn7778526-Chicago_Illinois.html"
links = get_restaurant_list(url)

page1: 30 links
page2: 49 links
page3: 50 links


In [27]:
get_restaurant_info(links, keyword, 'restaurants_Chinatown.csv')
restaurants_df = pd.read_csv('restaurants_Chinatown.csv')
restaurants_df.head()

0 MingHin Cuisine
1 Connie's Pizza
2 Joy Yee Noodles
3 Triple Crown Restaurant
4 Moon Palace Restaurant
5 Qing Xiang Yuan Dumplings
6 Emperor's Choice
7 Happy Lamb Hot Pot, Chicago
8 Strings Ramen Shop
9 Phoenix Restaurant
10 Chiu Quon Bakery
11 Go 4 Food
12 Ken Kee Restaurant
13 Lao Sze Chuan
14 Mccb
15 Legend Tasty House
16 Lawrence's Fish & Shrimp
17 Chi Cafe
18 Slurp Slurp
19 Daebak Korean Street Food & Chatime
20 Dolo Restaurant & Bar
21 Cai
22 Joy Yee Noodle
23 Hing Kee Restaurant
24 BBQ King House
25 The Noodle
26 Golden Bull Restaurant
27 My Place
28 Sze Chuan Cuisine
29 Saint's Alp Teahouse
30 Three Happiness Restaurant
31 Saint Anna Bakery & Cafe
32 Ahjoomah's Apron
33 Seven Treasures Cantonese Restaurant
34 House Of Fortune
35 Wentworth Seafood House
36 Evergreen Family Restaurant
37 China Cafe
38 Great Wall Restaurant
39 Yan Bang Cai
40 Xi'an
41 Little Lamb
42 Cafe Hoang
43 Mayflower Chinese Restaurant
44 Bonchon
45 Sweet Station
46 Lao Beijing
47 Lee Wing Wah
48 Original T

Unnamed: 0,keyword,restaurant_name,link,price_class,price_range,category,michelin,ranking,avg_rating,food_rating,service_rating,value_rating,atmosphere_rating,review_count,rate5_count,rate4_count,rate3_count,rate2_count,rate1_count
0,Chinatown,MingHin Cuisine,https://www.tripadvisor.com/Restaurant_Review-...,2.5,"[10, 18]","['Chinese', 'Asian', 'Cantonese']",True,180,4,4.5,4.0,4.0,4.0,443,183,150,56,15,7
1,Chinatown,Connie's Pizza,https://www.tripadvisor.com/Restaurant_Review-...,2.5,"[15, 30]","['Italian', 'Pizza', 'Vegetarian Friendly']",False,275,4,4.5,4.5,4.5,4.5,177,96,55,12,7,2
2,Chinatown,Joy Yee Noodles,https://www.tripadvisor.com/Restaurant_Review-...,2.5,[],"['Chinese', 'Asian', 'Hong Kong']",False,331,4,4.5,4.0,4.5,4.5,208,97,64,22,8,6
3,Chinatown,Triple Crown Restaurant,https://www.tripadvisor.com/Restaurant_Review-...,2.5,"[10, 30]","['Chinese', 'Asian', 'Cantonese']",False,345,4,4.0,3.5,4.0,4.0,245,85,80,30,12,8
4,Chinatown,Moon Palace Restaurant,https://www.tripadvisor.com/Restaurant_Review-...,2.5,"[10, 30]","['Chinese', 'Asian', 'Shanghai']",False,365,4,4.5,4.5,4.5,4.5,115,56,37,12,0,2


In [28]:
get_restaurant_reviews(links, 'reviews_Chinatown.csv')
reviews_df = pd.read_csv('reviews_Chinatown.csv')
reviews_df.head()

0 MingHin Cuisine: 
621 reviews
1 Connie's Pizza: 
262 reviews
2 Joy Yee Noodles: 
197 reviews
3 Triple Crown Restaurant: 
215 reviews
4 Moon Palace Restaurant: 
107 reviews
5 Qing Xiang Yuan Dumplings: 
79 reviews
6 Emperor's Choice: 
87 reviews
7 Happy Lamb Hot Pot, Chicago: 
77 reviews
8 Strings Ramen Shop: 
87 reviews
9 Phoenix Restaurant: 
214 reviews
10 Chiu Quon Bakery: 
57 reviews
11 Go 4 Food: 
74 reviews
12 Ken Kee Restaurant: 
64 reviews
13 Lao Sze Chuan: 
182 reviews
14 Mccb: 
27 reviews
15 Legend Tasty House: 
24 reviews
16 Lawrence's Fish & Shrimp: 
91 reviews
17 Chi Cafe: 
49 reviews
18 Slurp Slurp: 
22 reviews
19 Daebak Korean Street Food & Chatime: 
31 reviews
20 Dolo Restaurant & Bar: 
48 reviews
21 Cai: 
85 reviews
22 Joy Yee Noodle: 
53 reviews
23 Hing Kee Restaurant: 
45 reviews
24 BBQ King House: 
42 reviews
25 The Noodle: 
42 reviews
26 Golden Bull Restaurant: 
24 reviews
27 My Place: 
16 reviews
28 Sze Chuan Cuisine: 
28 reviews
29 Saint's Alp Teahouse: 
39 revi

Unnamed: 0,restaurant_name,link,date,rating,title,review
0,MingHin Cuisine,https://www.tripadvisor.com/Restaurant_Review-...,"December 29, 2021",2,Disappointing Dimsum,We chose Minghin for dimsum because reviews we...
1,MingHin Cuisine,https://www.tripadvisor.com/Restaurant_Review-...,"November 20, 2021",5,Delicious and filling,Hot tea is always the way to start an evening ...
2,MingHin Cuisine,https://www.tripadvisor.com/Restaurant_Review-...,"October 24, 2021",3,Order pizza,Cuisine? $300 of food picked up for a family g...
3,MingHin Cuisine,https://www.tripadvisor.com/Restaurant_Review-...,"September 19, 2021",5,favorite dimsum!!,came here with family and some big expectation...
4,MingHin Cuisine,https://www.tripadvisor.com/Restaurant_Review-...,"August 30, 2021",4,Good dim sum and the restaurant was not a hole...,Highly recommend the dim sum. Comes out nice a...


In [29]:
reviews_df.tail()

Unnamed: 0,restaurant_name,link,date,rating,title,review
3422,Lee Wing Wah,https://www.tripadvisor.com/Restaurant_Review-...,"August 12, 2012",1,Better Luck Somewhere Else!,The service was terrible. I have never had a s...
3423,Lee Wing Wah,https://www.tripadvisor.com/Restaurant_Review-...,"May 13, 2012",4,Good Solid Cantonese,"After a night out, we stopped in at Lee Wing W..."
3424,Lee Wing Wah,https://www.tripadvisor.com/Restaurant_Review-...,"April 7, 2012",5,Don't miss the Spicy.salty shrimp,Salty spicy shrimpand oysters with bean/cilant...
3425,Lee Wing Wah,https://www.tripadvisor.com/Restaurant_Review-...,"June 18, 2008",5,The best restaurant in china town. This is wh...,The best restaurant in china town. This is wh...
3426,Lee Wing Wah,https://www.tripadvisor.com/Restaurant_Review-...,"January 3, 2008",5,great family restaurant,great family restaurant
