# Packages

Updated: 1 December 2023



By Armin Pasalic

https://arminpasalic.github.io

In [3]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
import pandas as pd
from selenium.webdriver.common.by import By
import time
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException

import time

---

# Scraping

In [11]:
# ---------------------------------------------------------------------------
# Configuration: URL of the restaurant's Yelp page to scrape.
#
# Example value:
#   "https://www.yelp.com/biz/restaurant-tight-københavn-k"
#
# Left empty on purpose; fill it in before running the scraping cell.
# ---------------------------------------------------------------------------
base_url = ""

In [7]:
# Scrape all review pages of the restaurant at `base_url` with headless Chrome.
#
# Results are left in five module-level lists:
#   restaurant_name, total_rating_reviews              -> filled once from the page header
#   name_location_data, star_rating_data, comment_data -> one entry per review
#
# The browser is always closed, even if the run fails or is interrupted.

# Fixed pause after every page load. ADJUST TIME!!!
PAGE_LOAD_WAIT_SECONDS = 15

# Fail early, before launching a browser, if the config cell was not filled in.
if not base_url:
    raise ValueError("base_url is empty - set it to the restaurant's Yelp URL first.")

# Setting up the Chrome WebDriver
options = webdriver.ChromeOptions()
options.add_argument("--headless")  # Running in headless mode
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")

# Initialize the WebDriver
driver = webdriver.Chrome(options=options)

# Initialization for multiple pages
offset = 0
offset_increment = 10  # Assuming each page shows 10 items

# Initialize lists for data storage
restaurant_name = []
total_rating_reviews = []
name_location_data = []
star_rating_data = []
comment_data = []

# XPath matching every review <li> on the current page.
base_xpath = '//*[@id="reviews"]/section/div[2]/ul/li'

# Sometimes the pagination parameter must be "&start=" instead of "?start=":
# use "&" when the URL already carries a query string.
start_separator = '&' if '?' in base_url else '?'

try:
    #####################################################################################################
    ######## GET H1 (TITLE OF RESTAURANT) - AND TOTAL AMOUNT OF REVIEWS + AVERAGE RATING FROM ALL REVIEWS:

    # Load the first page to get the H1 text
    driver.get(base_url)
    time.sleep(PAGE_LOAD_WAIT_SECONDS)  # Wait for the page to load
    restaurant_name.append(driver.find_element(By.TAG_NAME, "h1").text)

    # Locate the review-count link by its CSS classes.
    # NOTE(review): these class names look auto-generated and may change - verify.
    reviews_element = driver.find_element(By.CSS_SELECTOR, "span.css-1x9ee72 a.css-19v1rkv")
    total_rating_reviews.append(reviews_element.text)

    #####################################################################################################
    ######## EXTRACT REVIEWS::::
    while True:
        # Construct the URL based on whether it's the first page or subsequent pages
        page_url = base_url if offset == 0 else f"{base_url}{start_separator}start={offset}"
        print(f"Scraping from: {page_url}")

        # The first page is already loaded from the header step above, so only
        # subsequent pages need a new request and wait.
        if offset != 0:
            driver.get(page_url)
            time.sleep(PAGE_LOAD_WAIT_SECONDS)

        # Scrape the data from this page
        list_items = driver.find_elements(By.XPATH, base_xpath)
        print(f"located comments: {len(list_items)}\n")

        for item in list_items:
            # Look up each field relative to the already-located <li> instead of
            # re-evaluating the full list XPath for every field.
            name_location = item.find_element(
                By.XPATH, './div/div[1]/div/div[1]/div/div/div[2]/div[1]'
            ).text
            # The rating is read from the element's aria-label attribute.
            star_rating = item.find_element(
                By.XPATH, './/div[contains(@class, "css-14g69b3")]'
            ).get_attribute('aria-label')
            comment = item.find_element(
                By.XPATH, './/span[contains(@class, "raw__09f24__T4Ezm")]'
            ).text

            # Append only once all three fields were read, so the lists always
            # stay the same length even if a lookup fails or the run is interrupted.
            name_location_data.append(name_location)
            star_rating_data.append(star_rating)
            comment_data.append(comment)

        # A page with fewer items than expected is treated as the last page.
        if len(list_items) < offset_increment:
            break

        # Increment the offset for the next page
        offset += offset_increment
finally:
    # Close the browser even on errors or KeyboardInterrupt to avoid leaking
    # the headless Chrome process.
    driver.quit()

Scraping from: https://www.yelp.com/biz/restaurant-tight-københavn-k
located comments: 10

Scraping from: https://www.yelp.com/biz/restaurant-tight-københavn-k?start=10
located comments: 10

Scraping from: https://www.yelp.com/biz/restaurant-tight-københavn-k?start=20
located comments: 10

Scraping from: https://www.yelp.com/biz/restaurant-tight-københavn-k?start=30
located comments: 10

Scraping from: https://www.yelp.com/biz/restaurant-tight-københavn-k?start=40
located comments: 10

Scraping from: https://www.yelp.com/biz/restaurant-tight-københavn-k?start=50


KeyboardInterrupt: 

---

# Process data

In [8]:
# Build the per-review DataFrame from the lists filled by the scraping cell,
# attach the restaurant-level fields, save it as CSV and display it.

output_csv = 'restaurant-tight-københavn-k_YELP-REVIEWS.csv'  # change for the restaurant you are scraping!

# Create a DataFrame from the lists
df = pd.DataFrame({
    'Name and Location': name_location_data,
    'Star Rating': star_rating_data,
    'Comment': comment_data
})

# Splitting the 'Name and Location' column into two parts.
# n=1 splits on the first newline only and reindex() guarantees exactly two
# columns, so entries with extra newlines (or none at all) no longer make the
# two-column assignment fail with a shape mismatch.
name_parts = (
    df['Name and Location']
    .str.split('\n', n=1, expand=True)
    .reindex(columns=[0, 1])
)
df['Name and Location'] = name_parts[0]
df['Location'] = name_parts[1]
df = df.rename(columns={'Name and Location': 'User'})

# Add 'restaurant_name' and 'total_rating_reviews' to the DataFrame.
# Broadcasting the scalar is equivalent to repeating a one-element list, and
# does not raise when the header scrape produced zero or several entries.
df['Restaurant Name'] = restaurant_name[0] if restaurant_name else None
df['Total Rating & Reviews'] = total_rating_reviews[0] if total_rating_reviews else None

df.to_csv(output_csv, index=False)
# Display the DataFrame
df

Unnamed: 0,User,Star Rating,Comment,Location,Restaurant Name,Total Rating & Reviews
0,Mitch H.,5 star rating,Despite the name--the food and ambiance are pe...,"San Francisco, CA",Restaurant Tight,(163 reviews)
1,Toyin D.Elite 23,4 star rating,I was looking for a place to eat late on a Fri...,"New York, NY",Restaurant Tight,(163 reviews)
2,Bill N.,5 star rating,Decided to seek this out based on the Yelp rec...,"Urbandale, IA",Restaurant Tight,(163 reviews)
3,Art T.Elite 23,4 star rating,Lovely restaurant with nice ambiance. Our wait...,"Albuquerque, NM",Restaurant Tight,(163 reviews)
4,Karissa D.Elite 23,5 star rating,I am so glad Gasoline Burger ran out of meat!\...,"Meriden, CT",Restaurant Tight,(163 reviews)
5,Carlos D.,5 star rating,Good ambience and phenomenal food the Pasta an...,"Carmel, IN",Restaurant Tight,(163 reviews)
6,Cameron B.Elite 23,4 star rating,"Fresh, deep and delicious. Charming, warm and ...","Washington, DC",Restaurant Tight,(163 reviews)
7,Marc N.,5 star rating,The place was great. I think you need a reserv...,"Alhambra, CA",Restaurant Tight,(163 reviews)
8,Nchimunya W.,1 star rating,"Wow, this restaurant was so bad I don't even k...","New York, NY",Restaurant Tight,(163 reviews)
9,Prasiddha H.Elite 23,4 star rating,Went to Tight when we were shopping in copenha...,"San Jose, CA",Restaurant Tight,(163 reviews)


In [10]:
# Show the full text of the first review (row label 0); a single .loc lookup
# replaces the chained df[...][...] indexing.
df.loc[0, 'Comment']

"Despite the name--the food and ambiance are perfect! Warm, comfortable, and delicious. It's no exaggeration to say I had one of the best burgers I've ever had here. The food was amazing, service was friendly, and desserts and drinks were just what I wanted.\n\nIf you're new in Copenhagen and looking for a local place downtown that feels authentic... you won't be disappointed here. I promise."