***
**Python scraper for tripadvisor**
***

### USED LIBRARIES

Import the packages/libraries

In [1]:
import time, random, re
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By

#---llibreries afegides
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
import json

from selenium.webdriver.firefox.options import Options

# Browser options carrying a flag meant to hide the automation marker from the site.
# NOTE(review): `Options` is imported from selenium.webdriver.firefox.options, and this
# object is not passed to the Chrome driver created in the next cell of the visible
# code - confirm whether it is still needed or should be a Chrome options object.
options = Options()
options.add_argument('--disable-blink-features=AutomationControlled')

### CREATING THE WEB DRIVER

Opening a new automated chrome tab with selenium webdriver

In [None]:
# Ask Chrome to record its "performance" log so that the requests the browser makes
# to the server can be read back later through the driver.
# NOTE(review): this writes into the DesiredCapabilities.CHROME dict itself - presumably
# shared by anything else that reads it; confirm that is acceptable.
desired_capabilities = DesiredCapabilities.CHROME
desired_capabilities["goog:loggingPrefs"] = {"performance": "ALL"}
  
# NOTE(review): the chromedriver path is an absolute, machine-specific path, and both the
# positional path and the `desired_capabilities` keyword are presumably rejected by newer
# Selenium 4 releases - confirm the installed Selenium version before re-running.
driver = webdriver.Chrome('/Users/user/Desktop/JupyLab/webdriver/chromedriver110', desired_capabilities=desired_capabilities)

### OBTAINING A LIST OF RESTAURANTS IN AARHUS THROUGH SCRAPING TECHNIQUES

We open the TripAdvisor website. The first thing we find is a pop-up asking us to accept the privacy and cookie settings. We make the robot click the accept button with `find_element` (we inspect the accept element, labelled "Acepto" in this case, and click it via its XPath).

In [3]:
# Open the Aarhus restaurant listing and wait for the page to finish loading.
driver.get('https://www.tripadvisor.com/Restaurants-g189530-zfn20484093-Aarhus_East_Jutland_Jutland.html')
time.sleep(10)

# Dismiss the cookie-consent pop-up if it is shown. This is best-effort: when the button
# is absent (for example, already accepted) we simply carry on. `except Exception` is used
# instead of a bare `except` so that Ctrl-C / kernel interrupts are not swallowed.
try:
    driver.find_element(By.XPATH, '//*[@id="onetrust-accept-btn-handler"]').click()
except Exception:
    pass


Create a list of the restaurants that can be found in Aarhus, storing each restaurant's name together with the variable parts of its page link (href), e.g. https://www.tripadvisor.com/Restaurant_Review-g189530-d4505984-Reviews-Ispirazione_Restaurant_Vinbar_Kaffebar_Butik-Aarhus_East_Jutland_Jutland.html. We need these parts because, when scraping the reviews, we use them to rebuild the link and visit the restaurants one by one.

In [4]:
restaurant_list = []  # Module-level accumulator that get_list() appends to


def get_list(soup):
    """Append the restaurants found on one parsed listing page to ``restaurant_list``.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed HTML of a restaurant listing page.

    Returns
    -------
    list of dict
        The module-level ``restaurant_list`` (the same object, extended in place).
        Each entry holds ``rest_name`` plus the pieces of the restaurant link
        (``geo``, ``locationId``, ``restaurant_name``, ``location``) that are
        needed later to rebuild the review URLs.

    Notes
    -----
    A list item that cannot be parsed (no numbered name, no link, or a link with
    too few '-'-separated parts) is skipped. Previously such an item re-appended
    the values left over from the preceding item, producing duplicate rows.
    """
    # Every restaurant card on the page carries a data-test attribute containing "_list_item".
    for div in soup.select('div[data-test*="_list_item"]'):
        try:
            # The visible name looks like "12. Some Name"; maxsplit=1 keeps names that
            # themselves contain ". " intact. Items without the "N. " prefix raise
            # IndexError and are skipped, as before.
            rest_name = div.find(class_='RfBGI').text.split('. ', 1)[1]

            # The href is split on '-'; the indices below pick the parts that vary
            # between restaurants.
            href_parts = div.find(class_='Lwqic Cj b').get('href').split('-')

            link_info = {
                "rest_name": rest_name,
                # 'num_opinions': div.find(class_ = 'IiChw').text.split(' ')[0],
                # (not collected yet: restaurants without reviews still need handling)
                'geo': href_parts[1],
                'locationId': href_parts[2],
                'restaurant_name': href_parts[4],
                'location': href_parts[5],
            }
        except (AttributeError, IndexError, TypeError):
            # Missing element (None has no .text/.get) or unexpected text/link shape.
            continue

        restaurant_list.append(link_info)

    return restaurant_list

# GET RESTAURANT LIST FOR PAGE 1
# Parse the HTML currently loaded in the browser and collect its restaurants.
soup = BeautifulSoup(driver.page_source, 'html.parser')
restaurant_list = get_list(soup)


# GATHER ALL THE RESTAURANTS FROM PAGE 2 TO PAGE N

# The first click on "next" is done outside the loop because the XPath of the button on
# page 1 differs from the one on the following pages. Two page layouts are tried; if
# neither matches, the error from the second attempt propagates.
try:
    driver.find_element(By.XPATH, '//*[@id="EATERY_LIST_CONTENTS"]/div[3]/div/a').click()
except Exception:
    driver.find_element(By.XPATH, '//*[@id="EATERY_LIST_CONTENTS"]/div[2]/div/a').click()

# Keep going while the most recently parsed page offered an active "next" button.
while soup.find(class_='nav next rndBtn ui_button primary taLnk'):
    # Wait BEFORE parsing so the page reached by the last click has time to load.
    # Parsing first (as was done before) could read the previous page again on the
    # first iteration, because nothing waited after the initial click above.
    time.sleep(4)
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    restaurant_list = get_list(soup)
    try:
        # On the last page the button is not found; that ends the loop instead of
        # raising.
        driver.find_element(By.XPATH, '//*[@id="EATERY_LIST_CONTENTS"]/div[2]/div/a[2]').click()
    except Exception:
        break

# Debugging aids (uncomment when needed):
# print(len(restaurant_list))      # number of restaurants in the list
# for i in restaurant_list:        # restaurant names
#     print(i['rest_name'])
# print(restaurant_list)

# Keep the list as a DataFrame as well.
restaurant_list_df = pd.DataFrame(restaurant_list)


Saving the restaurants list in a CSV

In [None]:
# Write the restaurant list to the file "restaurant_list_df_aarhus" without the row index.
restaurant_list_df.to_csv("restaurant_list_df_aarhus", index = False)

### OBTAINING RESTAURANTS LIST FROM THE CSV FILE

Load the previously saved restaurant list so that the restaurants do not have to be scraped again:

In [None]:
# Reload the restaurant list written by the saving cell.
# The file is written with index=False, so it has no index column. Reading it with
# index_col=0 turned 'rest_name' into the index, which to_dict("records") then dropped.
# It is also written with pandas' default UTF-8 encoding, so it is read back as UTF-8
# ('unicode_escape' garbles non-ASCII characters in restaurant names).
restaurant_list_df = pd.read_csv("restaurant_list_df_aarhus", encoding="utf-8")
restaurant_list = restaurant_list_df.to_dict("records")
#restaurant_list = restaurant_list[0:5]  # uncomment for a quick trial run on 5 restaurants
print(restaurant_list)



### OBTAINING REVIEWS

Create a function that gathers all the reviews (and the relevant information attached to each of them) for the restaurants in the previously generated list:

In [6]:
def _parse_review(div, local_name, page):
    """Build the record (one dict, one future DataFrame row) for a single review ``div``."""
    # Title, date and id of the review.
    title = div.find(class_='noQuotes').text
    review_date = div.find(class_='ratingDate')['title']
    review_id = div.get('id')

    # Member name + member id; both fall back to 'unknown' when either is missing.
    try:
        member_name = div.find(class_='info_text pointer_cursor').text
        member_id_all = div.find(class_="memberOverlayLink clickable")["id"]
        member_id = re.split(r'[-_]', member_id_all)[1]
    except Exception:
        member_name = 'unknown'
        member_id = 'unknown'

    # Member name in case the review comes from the El Tenedor app: there is no member
    # id then, so the name is used for both fields.
    app_member = div.find(class_='info_text ')
    if app_member is not None:
        member_name = app_member.text
        member_id = member_name

    # Day of the visit (text after ': '), or None when it is not shown.
    try:
        visit_day = div.find(class_='prw_rup prw_reviews_stay_date_hsx').text.split(': ')[1]
    except Exception:
        visit_day = None

    # Rating: first character after '_' in the second CSS class of the bubble element.
    rating = div.select_one('[class*="ui_bubble_rating bubble"]')['class'][1].split('_')[1][0]

    # Review text, flattened to one line.
    text_review = div.find(class_='entry').text.replace('Más', '').replace('...', ' ').replace('\n', ' ')

    # True when the mobile-phone icon is present, otherwise None.
    via_mobile = True if div.find('span', {'class': 'ui_icon mobile-phone'}) else None

    return {
        'restaurant': local_name,
        'review_date': review_date,
        'visit_day': visit_day,
        'review_id': review_id,
        'member_id': member_id,
        'member_name': member_name,
        'rating': rating,
        'title_review': title,
        'text_review': text_review,
        'via_mobile': via_mobile,
        'page': page,
    }


def get_data(link, driver, page, df):
    """Scrape one page of reviews and append them to ``df``.

    Parameters
    ----------
    link : str
        URL of the review page to open.
    driver : selenium WebDriver
        Browser session used to load the page.
    page : int or str
        Page counter stored in the ``page`` column. For page 0 the cookie pop-up
        is dismissed if it is shown.
    df : pandas.DataFrame
        Reviews collected so far.

    Returns
    -------
    tuple (pandas.DataFrame, BeautifulSoup)
        ``df`` extended with one row per review found on the page (index renumbered
        from 0), and the parsed page so the caller can inspect the pagination buttons.

    Raises
    ------
    AttributeError, TypeError, KeyError, IndexError
        If a review lacks one of the mandatory elements (title, date, rating, text).
    """
    driver.get(link)
    time.sleep(random.randint(1, 3))

    # A cookie pop-up might appear on the first page. Callers pass `page` as an int, so
    # the former comparison `page == '0'` never matched; comparing the string form
    # accepts both 0 and '0'. The click is best-effort because the pop-up is usually
    # gone once it has been accepted.
    if str(page) == '0':
        try:
            driver.find_element(By.XPATH, '//*[@id="onetrust-accept-btn-handler"]').click()
            time.sleep(1)
        except Exception:
            pass

    # Switch the language filter to "all languages" when the control exists.
    try:
        all_languages = driver.find_element(By.XPATH, '//*[@id="taplc_detail_filters_rr_resp_0"]/div/div[1]/div/div[2]/div[4]/div/div[2]/div[1]/div[1]/label')
        all_languages.click()
        time.sleep(5)
    except Exception:
        pass

    soup = BeautifulSoup(driver.page_source, 'html.parser')

    # The restaurant name is the same for the whole page: it is the part of the tab
    # title before the first comma.
    local_name = soup.title.text.split(',')[0]

    # Collect all rows first and concatenate once, instead of growing the frame
    # review by review.
    records = [
        _parse_review(div, local_name, page)
        for div in soup.find_all('div', attrs={'class': 'reviewSelector'})
    ]
    if records:
        df = pd.concat([df, pd.DataFrame(records)], axis=0)

    return df.reset_index(drop=True), soup

Create a loop that calls the function to gather all the reviews of each restaurant. The loop visits each restaurant by rebuilding its link from the variable parts collected in the restaurant list:

In [None]:
df = pd.DataFrame()  # Accumulates the reviews of every restaurant
for restaurant in restaurant_list:
    geo = restaurant['geo']
    locationId = restaurant['locationId']
    restaurant_name = restaurant['restaurant_name']
    location = restaurant['location']

    print('Scraping ',restaurant_name, 'in ', location) #just to know what the scraper is doing

    # Reviews on the first page
    page = 0
    link = f'https://www.tripadvisor.com/Restaurant_Review-{geo}-{locationId}-Reviews-{restaurant_name}-{location}'
    df, soup = get_data(link, driver, page, df)
    print(link,'done')
    time.sleep(1)

    # No active "next" button: the restaurant has a single page of reviews.
    if soup.find(class_='nav next ui_button primary') is None:
        continue

    # Pages 2 to N: the link changes per page (or10, or20, ...), so each page is opened
    # directly. Scraping continues until the "next" button is shown as disabled, which
    # marks the last page. That page has already been scraped when the loop ends, so no
    # separate "last page" step is needed.
    while soup.find(class_='nav next ui_button primary disabled') is None:
        page += 1
        link = f'https://www.tripadvisor.com/Restaurant_Review-{geo}-{locationId}-Reviews-or{page}0-{restaurant_name}-{location}'
        rows_before = len(df)
        df, soup = get_data(link, driver, page, df)
        time.sleep(random.randint(2,3))
        print(link,' done')

        # Safety net: a page that yields no reviews and shows no disabled "next" button
        # (for example, an error page) would otherwise keep this loop running forever.
        if len(df) == rows_before:
            print('No reviews found on', link, '- stopping pagination for this restaurant')
            break


print('JOB DONE')
df





In [16]:
# Append the rows of df to the file "reviews", writing neither a header row nor the index.
# NOTE(review): because of mode="a", running this cell again adds the same rows a second
# time, and the file never receives column names - confirm this is intended.
df.to_csv("reviews", mode="a", index=False, header=False)

### OBTAINING RESTAURANT INFORMATION (CHARACTERISTICS)

Function to gather all the relevant information of each restaurant in a df

In [12]:
# Labels of the detail rows on a restaurant page (lower-cased) mapped to the output
# column each one fills. Labels are compared case-insensitively.
_DETAIL_LABELS = {
    'price range': 'price',
    'cuisines': 'cuisine_types',
    'special diets': 'special_diets',
    'meals': 'meals',
    'functions': 'advantages',
}


def _extract_details(soup):
    """Return the detail fields (price, cuisines, ...) of a page, "N/A" where missing."""
    details = dict.fromkeys(_DETAIL_LABELS.values(), "N/A")

    # The details are looked up in the "BMlpu" container first, then in "RHcXN".
    for container_class in ("BMlpu", "RHcXN"):
        container = soup.find(class_=container_class)
        if container is None:
            continue

        found_any = False
        for child in container:
            try:
                label = child.find(class_="tbUiL b").text
                value = child.find(class_="SrqKb").text
            except (AttributeError, TypeError):
                # Text node, or an element without a label/value pair: skip only this
                # child instead of abandoning the rest of the container.
                continue
            column = _DETAIL_LABELS.get(label.strip().lower())
            if column is not None:
                details[column] = value
                found_any = True

        if found_any:
            break

    return details


def _extract_location(logs):
    """Return the text following '.png|' in the first matching logged request URL.

    Returns "N/A" when no log entry contains the marker and "loc N/A" when entries
    contain it but none of them can be parsed.
    """
    marker = '.png|'  # the wanted value follows this marker (presumably the coordinates - TODO confirm)
    candidates = [log for log in logs if marker in str(log)]

    location = "N/A"
    for entry in candidates:
        try:
            # The log message is a JSON string; the request URL sits a few keys deep.
            url = json.loads(entry['message'])['message']['params']['request']['url']
            location = url.split(marker)[1].split('&')[0]
        except (KeyError, IndexError, TypeError, ValueError):
            location = "loc N/A"
            continue
        break  # stop at the first entry that could be parsed
    return location


def get_data1(link, driver, page, df):
    """Scrape the characteristics of one restaurant and append them to ``df`` as one row.

    Parameters
    ----------
    link : str
        URL of the restaurant page.
    driver : selenium WebDriver
        Browser session; its "performance" log is read to find the location.
    page : int or str
        For page 0 the cookie pop-up is dismissed if it is shown.
    df : pandas.DataFrame
        Rows collected so far.

    Returns
    -------
    pandas.DataFrame
        ``df`` plus one row with the columns restaurant, nreviews, avgrade, price,
        cuisine_types, special_diets, meals, advantages and location (index
        renumbered from 0).
    """
    driver.get(link)
    time.sleep(random.randint(1, 2))

    # Collect the requests the browser made to the network: the location is taken
    # from one of the request URLs.
    logs = driver.get_log("performance")

    # A cookie pop-up might appear on the first page. Callers pass `page` as an int, so
    # the former comparison `page == '0'` never matched; the click is best-effort.
    if str(page) == '0':
        try:
            driver.find_element(By.XPATH, '//*[@id="onetrust-accept-btn-handler"]').click()
            time.sleep(1)
        except Exception:
            pass

    soup = BeautifulSoup(driver.page_source, 'html.parser')

    # RESTAURANT NAME: part of the tab title before the first comma.
    local_name = soup.title.text.split(',')[0]

    # NUMBER OF REVIEWS ("0" when the element is missing)
    try:
        nreviews = soup.find(class_='AfQtZ').text.split(' ')[0]
    except AttributeError:
        nreviews = "0"

    # AVERAGE GRADE
    try:
        avgrade = soup.find(class_="ZDEqb").text
    except AttributeError:
        avgrade = "N/A"

    details = _extract_details(soup)

    # LOCATION
    location = _extract_location(logs)
    if location not in ("N/A", "loc N/A"):
        print(location)

    row = {
        'restaurant': local_name,
        'nreviews': nreviews,
        'avgrade': avgrade,
        'price': details['price'],
        'cuisine_types': details['cuisine_types'],
        'special_diets': details['special_diets'],
        'meals': details['meals'],
        'advantages': details['advantages'],
        'location': location,
    }

    df = pd.concat([df, pd.DataFrame([row])], axis=0)
    return df.reset_index(drop=True)


Create a loop that calls the function to gather the relevant information of each restaurant. The loop visits each restaurant by rebuilding its link from the variable parts collected in the restaurant list:


In [None]:
# Start from an empty frame; get_data1 hands back the frame with one more row per call.
df = pd.DataFrame()

for restaurant in restaurant_list:
    page = 0

    # Variable parts of the restaurant link, as stored in the restaurant list.
    geo, locationId = restaurant['geo'], restaurant['locationId']
    restaurant_name, location = restaurant['restaurant_name'], restaurant['location']

    # Progress message, to see what the scraper is working on.
    print('Scraping ',restaurant_name, 'in ', location)

    link = f'https://www.tripadvisor.com/Restaurant_Review-{geo}-{locationId}-Reviews-{restaurant_name}-{location}'
    get_data_return = get_data1(link, driver, page, df)
    df = get_data_return
    print(link,'done')
    time.sleep(1)

print('JOB DONE')

# Write the collected rows to the file "restaurant_price_aarhus" (UTF-8, no index).
df.to_csv("restaurant_price_aarhus", index=False, encoding="utf-8")




### SAVE THE DATAFRAME IN A CSV

In [None]:
# Write the current contents of df to the file "restaurant_features_aarhus" (UTF-8, no index).
# NOTE(review): `df` is whatever the previous loop produced, which depends on which
# get_data1 definition was in scope when it ran - confirm before relying on this file.
df.to_csv("restaurant_features_aarhus", index = False, encoding = "utf-8")

Restaurant price (preu rest): function that gathers only the price range of each restaurant

In [9]:
def get_data1(link, driver, page, df):
    """Scrape the name and price range of one restaurant and append them to ``df``.

    NOTE(review): this function has the same name as the feature-scraping
    ``get_data1`` defined earlier in the notebook. Whichever cell runs last replaces
    the other definition, so the loop that calls ``get_data1`` produces different
    columns depending on execution order - consider giving the two distinct names.

    Parameters
    ----------
    link : str
        URL of the restaurant page.
    driver : selenium WebDriver
        Browser session used to load the page.
    page : int or str
        For page 0 the cookie pop-up is dismissed if it is shown.
    df : pandas.DataFrame
        Rows collected so far.

    Returns
    -------
    pandas.DataFrame
        ``df`` plus one row with the columns ``restaurant`` and ``priceRange``
        (index renumbered from 0).
    """
    driver.get(link)
    time.sleep(random.randint(1, 2))

    # The performance log is still requested although its content is not used here.
    # NOTE(review): presumably each call empties the driver's log buffer, which would
    # matter to later readers of the log - confirm before removing this call.
    driver.get_log("performance")

    # A cookie pop-up might appear on the first page. Callers pass `page` as an int, so
    # the former comparison `page == '0'` never matched; the click is best-effort
    # because the pop-up is usually gone once it has been accepted.
    if str(page) == '0':
        try:
            driver.find_element(By.XPATH, '//*[@id="onetrust-accept-btn-handler"]').click()
            time.sleep(1)
        except Exception:
            pass

    soup = BeautifulSoup(driver.page_source, 'html.parser')

    # RESTAURANT NAME: part of the tab title before the first comma.
    local_name = soup.title.text.split(',')[0]

    # PRICE RANGE ("N/A" when the element is missing)
    try:
        priceRange = soup.find(class_="dlMOJ").text
    except AttributeError:
        priceRange = "N/A"

    row = {
        'restaurant': local_name,
        'priceRange': priceRange,
    }

    df = pd.concat([df, pd.DataFrame([row])], axis=0)
    return df.reset_index(drop=True)