In [None]:
# Prerequisite (run once if selenium is not installed): %pip install selenium

In [1]:
import csv
import sys
import re
import time
import requests
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.keys import Keys

In [2]:
def get_restaurant_links(filename, neighborhood):
    '''
    To scrape restaurant links

    For every data row of the input file, a Google search for
    "Zomato <restaurant name> Chicago <neighborhood>" is submitted in a
    Safari WebDriver session. The first result link is kept only when it
    starts with the Zomato Chicago prefix; a trailing '/menu', '/reviews'
    or '/photos' is trimmed from it.

    Input:
    filename: restaurants_XXXX.csv (file scraped from TripAdvisor); the first
        row is skipped as a header and the first column is used as the
        restaurant name
    neighborhood: neighborhood name (string) (to make the search results more accurate)

    Return:
    a list of tuples (restaurant name, link)
    '''
    zomato_prefix = 'https://www.zomato.com/chicago/'
    remove_lst = ['/menu', '/reviews', '/photos']
    restaurant_links = []

    with open(filename, newline='') as f:
        reader = csv.reader(f)
        next(reader, None)  # skip the header row
        driver = webdriver.Safari()
        try:
            for idx, row in enumerate(reader):
                if not row:
                    # csv.reader yields [] for blank lines; nothing to search for
                    continue
                time.sleep(1)
                driver.get("https://www.google.com/")
                # By-based locators replace the deprecated find_element_by_* helpers
                search_box = driver.find_element(By.NAME, "q")
                search_box.send_keys("Zomato " + row[0] + " Chicago " + neighborhood)
                search_box.send_keys(Keys.ENTER)
                time.sleep(3)  # wait for the result page before reading it
                elems = driver.find_elements(By.CSS_SELECTOR, ".yuRUbf [href]")
                if not elems:
                    # no result matched the selector: skip instead of raising IndexError
                    continue
                link = elems[0].get_attribute('href')
                if not link or not link.startswith(zomato_prefix):
                    continue
                for word in remove_lst:
                    if link.endswith(word):
                        # drop only the trailing suffix, not every occurrence
                        link = link[:-len(word)]
                restaurant_links.append((row[0], link))
                print(idx, row[0], link)
        finally:
            # release the browser session even when a search fails midway
            driver.quit()
    return restaurant_links

In [3]:
def get_restaurant_info(results, filename):
    '''
    To scrape restaurant information and append it to a CSV file:
    restaurant name (str),
    link (str),
    average rating (str, as shown on the page),
    restaurant category (list of string),
    average cost per person (float, or 'NA' when no cost is found),
    review count (int),
    location (str)

    Restaurants whose rating is '-' or that have no reviews are omitted.
    A page that cannot be parsed is reported as 'Wrong link' and skipped.

    Input:
    results: list of tuples (restaurant name, link)
    filename: output filename; rows are appended, and the header row is
        written only when the file is empty

    Return:
    a list of tuples (restaurant name, link) for scraping reviews
    '''
    final_links = []
    # newline='' lets the csv module control line endings itself
    with open(filename, 'a', encoding="utf-8", newline='') as csv_file:
        csv_writer = csv.writer(csv_file)
        if csv_file.tell() == 0:
            # append mode starts at end-of-file, so 0 means the file is empty;
            # this avoids a duplicated header row when the cell is re-run
            csv_writer.writerow(['trip_res_name', 'restaurant_name', 'link','avg_rating',
                                 'category', 'avgerage_cost', 'review_count', 'location'])

        driver = webdriver.Safari()
        try:
            for index, (trip_res_name, link) in enumerate(results):
                time.sleep(3)
                driver.get(link)
                time.sleep(3)
                try:
                    restaurant_name = driver.find_element(By.CSS_SELECTOR, 'section > div > div > div > h1').text
                    avg_rating = driver.find_element(By.CSS_SELECTOR, 'main > div > section > section > section > div > div > div > section > div > div> div > div > div').text
                    # the review count and the cost are both parsed from this text
                    main_text = driver.find_element(By.CSS_SELECTOR, "div > main").text
                    review_count = int(re.findall(r'fill(.*?)Reviews', main_text)[0].replace(',', ''))
                    # Omit restaurants with no rating or no reviews
                    if avg_rating != '-' and review_count >= 1:
                        category_lst = [c.text for c in driver.find_elements(
                            By.CSS_SELECTOR, 'div > section.sc-iELTvK.hOWEjC > div > span > a')]
                        # the page quotes the cost for two people; halve it per person
                        ac = re.findall(r'Cost\$(.*?)for two people', main_text)
                        if ac == []:
                            avg_cost = 'NA'
                        else:
                            avg_cost = int(ac[0].replace(',', ''))/2
                        location = driver.find_element(By.CSS_SELECTOR, 'main > div > section> section > article >  section >p').text
                        final_links.append((trip_res_name, link))
                        csv_writer.writerow([trip_res_name, restaurant_name, link, avg_rating,
                                             category_lst, avg_cost, review_count, location])
                        print(str(index), restaurant_name)
                except Exception:
                    # best-effort scraping: skip pages that do not parse, but let
                    # KeyboardInterrupt / SystemExit propagate
                    print(str(index), 'Wrong link')
        finally:
            # release the browser session even if the loop is interrupted
            driver.quit()
    print('Restaurant count: {}'.format(len(final_links)))
    return final_links

In [4]:
def get_restaurant_reviews(final_links, filename):
    '''
    To scrape reviews from restaurants in final_links and append them to a
    CSV file. One row is written per review:
    restaurant_name (str, first item of the input tuple),
    link (str, restaurant link + '/reviews'),
    date (from 'timestamp'),
    rating (from 'ratingV2'),
    review (from 'reviewText', with '<br/>' replaced by a space),
    user (from 'userName'),
    user_id (from 'reviewUserId'),
    review_id (from 'reviewId'),
    order_type (from 'ratingV2Text')

    For each restaurant the page is opened in Safari to read the restaurant
    id, then the 'loadMore' reviews endpoint is paged through until a page
    lists no review ids or a review id repeats.

    Input:
    final_links: a list of tuples (restaurant name, link)
    filename: output filename; rows are appended, and the header row is
        written only when the file is empty

    Return:
    None
    '''
    # NOTE(review): this header embeds a captured browser session cookie
    # (PHPSESSID, csrf, ...). Hardcoded session data should not live in a
    # shared notebook; consider loading it from an environment variable.
    header = {'accept-language': 'zh-CN,zh;q=0.9',
              'content-type': 'application/json',
              'cookie':'_fbp=fb.1.1646076444853.38015706; _ga=GA1.2.692007043.1646076444; _gid=GA1.2.1682652160.1646076444; fre=0; rd=1380000; AWSALBTG=n7L+L2M5Z655cdRDAGFSBg5ki/qiaYHBK4zhnPfHEEleSYuub5Sv1C4+VchvTe/oTTWs84baG5yeIahZ46CRYSmoLTeRKPFw2lkTSUc2o8ChbcUiUrqOXgVCxW1biXKdTsvvA6x+rivHm/L/w2IL+K8SrUCAV3lndEqplCp45wzC; AWSALBTGCORS=n7L+L2M5Z655cdRDAGFSBg5ki/qiaYHBK4zhnPfHEEleSYuub5Sv1C4+VchvTe/oTTWs84baG5yeIahZ46CRYSmoLTeRKPFw2lkTSUc2o8ChbcUiUrqOXgVCxW1biXKdTsvvA6x+rivHm/L/w2IL+K8SrUCAV3lndEqplCp45wzC; locus=%7B%22addressId%22%3A0%2C%22lat%22%3A41.852198%2C%22lng%22%3A-87.634752%2C%22cityId%22%3A292%2C%22ltv%22%3A113407%2C%22lty%22%3A%22subzone%22%2C%22fetchFromGoogle%22%3Afalse%2C%22dszId%22%3A0%7D; ltv=113407; lty=113407; _gat_city=1; _gat_country=1; _gat_global=1; __gads=ID=c713a5e3970ec22e-22787faa30d10021:T=1646113246:RT=1646238273:S=ALNI_MZEj8LTBXd500RMriMM2dYZP-vZ-A; PHPSESSID=424a46ba96c34db84f947901372ad2e7; orange=6273008; squeeze=ab1f04ad139fccc40f90c58d783dbd32; zhli=1; G_ENABLED_IDPS=google; csrf=3f3f06e7e4a399fc9bb038fb11aeb209; __gpi=UID=0000010a6bc0cd60:T=1646129797:RT=1646158233:S=ALNI_Mbq8G5LLiZOsKFNMyqjWJnGQ13lHg; _gcl_au=1.1.1123516614.1646076443; fbcity=292; fbtrack=f219fff44c5039d147111e91d317c940; zl=en',
              'referer': 'https://www.zomato.com/chicago/mollys-cupcakes-lincoln-park/reviews',
              'sec-ch-ua': '" Not;A Brand";v="99", "Google Chrome";v="97", "Chromium";v="97"',
              'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.99 Safari/537.36'}

    # newline='' lets the csv module control line endings itself
    with open(filename, 'a', encoding = "utf-8", newline='') as csv_file:
        csv_writer = csv.writer(csv_file)
        if csv_file.tell() == 0:
            # append mode starts at end-of-file, so 0 means the file is empty;
            # this avoids a duplicated header row when the cell is re-run
            csv_writer.writerow(['restaurant_name', 'link', 'date', 'rating',
                                 'review', 'user', 'user_id', 'review_id', 'order_type'])

        driver = webdriver.Safari()
        try:
            for index, link in enumerate(final_links):
                review_count = 0
                id_collect = set()  # review ids already written for this restaurant
                time.sleep(3)
                driver.get(link[1])
                try:
                    res_id_url = driver.find_element(By.XPATH,
                                                     "//*[@id='root']/div[2]/main/div/section[4]/section/section/article[1]/div/div[2]/a").get_attribute('href')
                except NoSuchElementException:
                    # one page without the expected anchor should not abort the whole run
                    print(str(index) + ' ' + link[0] + ': restaurant id link not found, skipped')
                    continue
                # the restaurant id is taken to be the digits of that href
                res_id = ''.join(filter(str.isdigit, res_id_url))

                for i in range(1,1000):
                    if i%5 == 0:
                        # longer pause every fifth page
                        time.sleep(10)
                    url = 'https://www.zomato.com/webroutes/reviews/loadMore?sort=dd&filter=reviews-dd&res_id=' + res_id + f'&page={i}'
                    # timeout keeps a stalled connection from hanging the notebook forever
                    res = requests.get(url, headers=header, timeout=30).json()

                    # start from an empty list so a page with no section entities
                    # cannot leave entity_ids unbound or stale from the previous page
                    entity_ids = []
                    for entity in res['page_data']['sections']['SECTION_REVIEWS']["entities"]:
                        entity_ids = entity["entity_ids"]
                    if not entity_ids:
                        break

                    reviews = res['entities']['REVIEWS']
                    repeated = False
                    for item in reviews:
                        entry = reviews[item]
                        review_id = entry['reviewId']
                        if review_id in id_collect:
                            # an already-seen id means no new reviews are coming
                            repeated = True
                            break
                        id_collect.add(review_id)
                        date = entry['timestamp']
                        review = entry['reviewText']
                        user = entry['userName']
                        user_id = entry['reviewUserId']
                        rating = entry['ratingV2']
                        order_type = entry['ratingV2Text']
                        if '<br/>' in review:
                            review = review.replace("<br/>", " ")
                        csv_writer.writerow([link[0], link[1] + '/reviews', date, rating,
                                             review, user, user_id, review_id, order_type])
                        review_count += 1
                        print(str(index) + ' ' + link[0] + ': ' + str(review_count) + 'reviews')
                    if repeated:
                        break
        finally:
            # release the browser session; the original never closed it
            driver.quit()

In [5]:
# Step 1: look up the Zomato link of each restaurant in the TripAdvisor file.
# Bridgeview is used as the example neighborhood.
results = get_restaurant_links(
    filename="restaurants_Bridgeview.csv",
    neighborhood="Bridgeview",
)

  m = driver.find_element_by_name("q")
  elems = driver.find_elements_by_css_selector(".yuRUbf [href]")


0 Al Bawadi Restaurant https://www.zomato.com/chicago/al-bawadi-restaurant-bridgeview
1 The Patio https://www.zomato.com/chicago/the-patio-bridgeview
2 Mama Luigi's Restaurant and Banquets https://www.zomato.com/chicago/mama-luigis-restaurant-bridgeview
3 White Castle https://www.zomato.com/chicago/white-castle-bridgeview
4 Lindy's Chili Gerties ice cream of Bridgeview https://www.zomato.com/chicago/lindys-chili-gertie-s-ice-cream-bridgeview
5 Ing's Palace https://www.zomato.com/chicago/ings-palace-bridgeview
6 Southern Belles https://www.zomato.com/chicago/southern-belles-bedford-park
7 View https://www.zomato.com/chicago/the-view-restaurant-bridgeview
8 The Nile Restaurant https://www.zomato.com/chicago/the-nile-restaurant-bridgeview
9 IHOP https://www.zomato.com/chicago/ihop-bridgeview
10 Taqueria Los Comales https://www.zomato.com/chicago/taqueria-los-comales-bridgeview


In [6]:
# Step 2: scrape the details of every matched restaurant into a CSV file,
# then load that file back to inspect it.
final_links = get_restaurant_info(
    results,
    filename='z_restaurants_bridgeview.csv',
)
restaurants_df = pd.read_csv(filepath_or_buffer='z_restaurants_bridgeview.csv')
restaurants_df

0 Al Bawadi Restaurant
1 The Patio
2 Mama Luigi's Restaurant
7 The View Restaurant
9 IHOP
Restaurant count: 5


Unnamed: 0,trip_res_name,restaurant_name,link,avg_rating,category,avgerage_cost,review_count,location
0,Al Bawadi Restaurant,Al Bawadi Restaurant,https://www.zomato.com/chicago/al-bawadi-resta...,4.3,['Middle Eastern'],25.0,85,"7216 W 87th Street, Bridgeview 60455"
1,The Patio,The Patio,https://www.zomato.com/chicago/the-patio-bridg...,3.6,"['American', 'BBQ']",17.5,34,9100 South Harlem Avenue 60455
2,Mama Luigi's Restaurant and Banquets,Mama Luigi's Restaurant,https://www.zomato.com/chicago/mama-luigis-res...,2.9,"['Italian', 'Pizza']",22.5,10,7500 South Harlem Avenue 60455
3,View,The View Restaurant,https://www.zomato.com/chicago/the-view-restau...,3.4,['Greek'],25.0,6,"8930 S Harlem Avenue, Bridgeview 60455"
4,IHOP,IHOP,https://www.zomato.com/chicago/ihop-bridgeview,3.1,['American'],12.5,6,7240 W. 79th Street 60455


In [7]:
# Step 3: scrape the reviews of the retained restaurants into a CSV file,
# then preview the first rows.
get_restaurant_reviews(
    final_links,
    filename='z_reviews_bridgeview.csv',
)
reviews_df = pd.read_csv(filepath_or_buffer='z_reviews_bridgeview.csv')
reviews_df.head()

0 Al Bawadi Restaurant: 1reviews
0 Al Bawadi Restaurant: 2reviews
0 Al Bawadi Restaurant: 3reviews
0 Al Bawadi Restaurant: 4reviews
0 Al Bawadi Restaurant: 5reviews
0 Al Bawadi Restaurant: 6reviews
0 Al Bawadi Restaurant: 7reviews
0 Al Bawadi Restaurant: 8reviews
0 Al Bawadi Restaurant: 9reviews
0 Al Bawadi Restaurant: 10reviews
0 Al Bawadi Restaurant: 11reviews
0 Al Bawadi Restaurant: 12reviews
0 Al Bawadi Restaurant: 13reviews
0 Al Bawadi Restaurant: 14reviews
0 Al Bawadi Restaurant: 15reviews
0 Al Bawadi Restaurant: 16reviews
0 Al Bawadi Restaurant: 17reviews
0 Al Bawadi Restaurant: 18reviews
0 Al Bawadi Restaurant: 19reviews
0 Al Bawadi Restaurant: 20reviews
0 Al Bawadi Restaurant: 21reviews
0 Al Bawadi Restaurant: 22reviews
0 Al Bawadi Restaurant: 23reviews
0 Al Bawadi Restaurant: 24reviews
0 Al Bawadi Restaurant: 25reviews
0 Al Bawadi Restaurant: 26reviews
0 Al Bawadi Restaurant: 27reviews
0 Al Bawadi Restaurant: 28reviews
0 Al Bawadi Restaurant: 29reviews
0 Al Bawadi Restaurant:

Unnamed: 0,restaurant_name,link,date,rating,review,user,user_id,review_id,order_type
0,Al Bawadi Restaurant,https://www.zomato.com/chicago/al-bawadi-resta...,"Mar 03, 2021",5.0,i love the place and would love to visit it ag...,Yousra Iqbal,155950384,58808318,DINING
1,Al Bawadi Restaurant,https://www.zomato.com/chicago/al-bawadi-resta...,"Sep 08, 2020",1.0,,A Al,172752089,50023530,DINING
2,Al Bawadi Restaurant,https://www.zomato.com/chicago/al-bawadi-resta...,"Oct 01, 2018",4.0,We remained at a lodging nearby to this eatery...,Shyam Pareek,73555477,39371518,DINING
3,Al Bawadi Restaurant,https://www.zomato.com/chicago/al-bawadi-resta...,"Feb 08, 2018",4.0,The space is a little cramped but that is to b...,Simran Pande,37960120,33461103,DINING
4,Al Bawadi Restaurant,https://www.zomato.com/chicago/al-bawadi-resta...,"Mar 06, 2017",4.5,This place serves excellent authentic middle e...,Sayeed,2363210,29678787,DINING
