In [1]:
import requests
import pandas as pd
import csv
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
from itertools import chain

def get_user_agent():
    user_agent = UserAgent()
    return {'User-Agent': user_agent.random, 'Accept-Language': 'en-US, en;q=0.5'}

def get_page_contents(url):
    page = requests.get(url, headers=get_user_agent())
    return BeautifulSoup(page.text, 'html.parser')


In [2]:
# Giving city variables their respective urls for restaurants

london = 'https://www.tripadvisor.com/Restaurants-g186338-London_England.html'
rome = 'https://www.tripadvisor.com/Restaurants-g187791-Rome_Lazio.html'
barcelona = 'https://www.tripadvisor.com/Restaurants-g187497-Barcelona_Catalonia.html'
interlaken = 'https://www.tripadvisor.com/Restaurants-g188081-Interlaken_Bernese_Oberland_Canton_of_Bern.html'
amsterdam = 'https://www.tripadvisor.com/Restaurants-g188590-Amsterdam_North_Holland_Province.html'
venice = 'https://www.tripadvisor.com/Restaurants-g187870-Venice_Veneto.html'
prague = 'https://www.tripadvisor.com/Restaurants-g274707-Prague_Bohemia.html'
budapest = 'https://www.tripadvisor.com/Restaurants-g274887-Budapest_Central_Hungary.html'
paris = 'https://www.tripadvisor.com/Restaurants-g187147-Paris_Ile_de_France.html'
vienna ='https://www.tripadvisor.com/Restaurants-g190454-Vienna.html'
brussels = 'https://www.tripadvisor.com/Restaurants-g188644-Brussels.html'
lisbon = 'https://www.tripadvisor.com/Restaurants-g189158-Lisbon_Lisbon_District_Central_Portugal.html'
oslo = 'https://www.tripadvisor.com/Restaurants-g190479-Oslo_Eastern_Norway.html'
stockholm = 'https://www.tripadvisor.com/Restaurants-g189852-Stockholm.html'
copenhagen = 'https://www.tripadvisor.com/Restaurants-g189541-Copenhagen_Zealand.html'


In [3]:
# Getting top 5 restaurants links for each city

locations = [london,rome,barcelona,interlaken,amsterdam,venice,prague,budapest,paris,vienna,brussels,lisbon,oslo,stockholm,copenhagen]
resturaunt_links=[]

for location in locations:
    
    each_link = get_page_contents(location)
    each_city_resturaunt_link = each_link.find_all(class_ = 'biGQs _P fiohW alXOW NwcxK GzNcM ytVPx UTQMg RnEEZ ngXxk')[1:6]
        
    for element in each_city_resturaunt_link:
        resturaunt_link = element.find('a')['href']
        resturaunt_links.append(resturaunt_link)
        

resturaunt_full_links = ["https://www.tripadvisor.com" + x for x in resturaunt_links]     

In [5]:
resturaunt_full_links[:5]

['https://www.tripadvisor.com/Restaurant_Review-g186338-d10460592-Reviews-Bonoo_Indian_Tapas_Restaurant-London_England.html',
 'https://www.tripadvisor.com/Restaurant_Review-g186338-d13082910-Reviews-Devine_Restaurant_Coffee_Bar-London_England.html',
 'https://www.tripadvisor.com/Restaurant_Review-g186338-d25044872-Reviews-Kinaara-London_England.html',
 'https://www.tripadvisor.com/Restaurant_Review-g186338-d2244333-Reviews-Sitara-London_England.html',
 'https://www.tripadvisor.com/Restaurant_Review-g186338-d14134252-Reviews-Scarlett_Green-London_England.html']

In [4]:
# Defining my function for scraping the relevant data I require.

def get_one_resturaunt(link):
    try:
        t = get_page_contents(link)
        
        # Extract Name
        name_element = t.find(class_="HjBfq")
        hotel = name_element.text.strip() if name_element else None        
        
        # Extract reviews
        all_reviews = [x.find(class_='prw_rup prw_reviews_text_summary_hsx').text for x in t.select('.review-container')] 
        
        # Extract review dates
        all_dates = [x.get_text() for x in t.find_all(class_='prw_rup prw_reviews_stay_date_hsx')]
        
        # Extract Rating
        rating_element = t.find(class_ ='ZDEqb')
        rating = rating_element.text.strip() if rating_element else None
        
        # Extract Location
        location_element = t.find_all(class_ = 'AYHFM')[1]
        location = location_element.text.strip() if location_element else None        

        # Extract ranking
        ranking_element = t.find_all(class_ = 'AYHFM')[0]    
        ranking = ranking_element.text.strip() if ranking_element else None
        
        # Extract price range
        price_range_element = t.find(class_='SrqKb')
        price_range = price_range_element.text.strip() if price_range_element else None
        
        # Extract review count
        reviews_count_element = t.find(class_='reviews_header_count')
        reviews_count = reviews_count_element.text.strip() if reviews_count_element else None
        
        # Split price range into min and max prices
        min_price_usd, max_price_usd = None, None
        if price_range:
            price_parts = [part.strip() for part in price_range.split(' - ')]
            if len(price_parts) == 2:
                min_price_usd, max_price_usd = price_parts
            
        # Create a DataFrame
        df = pd.DataFrame({'hotel':[hotel]*len(all_reviews),
            'location' : [location] * len(all_reviews),
            'ranking' : [ranking] * len(all_reviews),                   
            'rating':[rating] * len(all_reviews),
            'review_count': [reviews_count] * len(all_reviews),       
            'min_price_usd': [min_price_usd] * len(all_reviews),
            'max_price_usd': [max_price_usd] * len(all_reviews),
            'review': all_reviews,
            'date': all_dates
        })

        return df
    
    except Exception as e:
        print(f"Error processing link: {link}")
        print(f"Error message: {e}")
        return None


In [6]:
# Mapping my scraped data for each restaurant into a dataframe
data_list = list(map(get_one_resturaunt, resturaunt_full_links))
combined_df = pd.concat(data_list).reset_index(drop=True)

# Saving the data so that I can access it using a read_csv.
combined_df.to_csv('trip_advisor_resturaunts.csv', sep =',', index = False, encoding = 'utf-8',quoting=csv.QUOTE_ALL)

Error processing link: https://www.tripadvisor.com/Restaurant_Review-g274707-d15348493-Reviews-Sangam_Indian_Restaurant-Prague_Bohemia.html
Error message: list index out of range


In [7]:
combined_df = pd.concat(data_list).reset_index(drop=True)
combined_df.tail()

Unnamed: 0,hotel,location,ranking,rating,review_count,min_price_usd,max_price_usd,review,date
1014,Maple Casual Dining,"Vesterbrogade 24, Copenhagen 1620 Denmark","#5 of 1,992 Restaurants in Copenhagen",5.0,(759),,,We had a fantastic meal and the atmosphere was...,Date of visit: October 2023
1015,Maple Casual Dining,"Vesterbrogade 24, Copenhagen 1620 Denmark","#5 of 1,992 Restaurants in Copenhagen",5.0,(759),,,What a delightful dinner we had at Maple Casua...,Date of visit: October 2023
1016,Maple Casual Dining,"Vesterbrogade 24, Copenhagen 1620 Denmark","#5 of 1,992 Restaurants in Copenhagen",5.0,(759),,,We visited on our third night in Copenhagen an...,Date of visit: October 2023
1017,Maple Casual Dining,"Vesterbrogade 24, Copenhagen 1620 Denmark","#5 of 1,992 Restaurants in Copenhagen",5.0,(759),,,"This is a great restaurant in every respect, w...",Date of visit: October 2023
1018,Maple Casual Dining,"Vesterbrogade 24, Copenhagen 1620 Denmark","#5 of 1,992 Restaurants in Copenhagen",5.0,(759),,,"Great restaurant, nice food and service. Been ...",Date of visit: October 2023


In [18]:
combined_df.to_csv('trip_advisor_resturaunts.csv', sep =',', index = False, encoding = 'utf-8',quoting=csv.QUOTE_ALL)