# WEBSCRAPING

In [None]:
import requests
from bs4 import BeautifulSoup # Part of the bs4 library, used to parse and extract data from HTML pages
import urllib.request # Used to send requests to web pages and retrieve their HTML content
import re # The regular expressions (regex) library, used to find and replace patterns in text
import pandas as pd
import time

#### Checking if website can be scraped

In [None]:
# Browser-like request headers, kept as named pieces so each value is easy to swap.
_BROWSER_USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
_YELP_REFERER = 'https://www.yelp.com'

headers = {'User-Agent': _BROWSER_USER_AGENT, 'Referer': _YELP_REFERER}

#### Scraping 100 Pages at a time

In [None]:
############ Getting URL of all pages that can be scraped for reviews ###############
# Landing page of the business; later cells derive the per-page review URLs from it.
base_url = 'https://www.yelp.com/biz/din-tai-fung-san-diego-3?osq=Din+Tai+Fung'
# Send the notebook's browser-like `headers` and bound the wait so a stalled
# connection cannot hang the notebook indefinitely.
result = requests.get(base_url, headers=headers, timeout=30)
# Fail here on an HTTP error status instead of parsing an error page further down.
result.raise_for_status()
soup = BeautifulSoup(result.text,'html.parser')

In [None]:
# The page title is split on "-" and the pieces are picked by position;
# `det` is kept at notebook level because a later cell reads det[0].
title = soup.title.text
det = title.split("-")

# (label, index into `det`) pairs, listed in the order they are printed.
title_fields = (
    ("Resturant Name", 0),
    ("Resturant Location", 3),
    ("Resturant Style", 4),
    ("Review Details", 2),
    ("Last Updated", 1),
)
for field_label, part_index in title_fields:
    print(f"{field_label}: {det[part_index]}")

Resturant Name: DIN TAI FUNG 
Resturant Location:  4301 La Jolla Village Dr, San Diego, California 
Resturant Style:  Taiwanese 
Review Details:  12164 Photos & 5633 Reviews 
Last Updated:  Updated December 2024 


In [None]:
# Locate the span that carries the review-count text.
# NOTE(review): the class name looks auto-generated and may change with the page markup.
review_count_span = soup.find("span", class_ = "y-css-yrt0i5")
if review_count_span is None:
    raise ValueError("Review-count span not found; the page markup may have changed.")
num_reviews = review_count_span.text

# Strip thousands separators first so a count rendered as "5,633" is not read as 5.
count_match = re.search(r'\d+', num_reviews.replace(',', ''))
if count_match is None:
    raise ValueError(f"No review count found in {num_reviews!r}")
num = int(count_match.group())
print(f"There are {num} reviews for {det[0]}")

There are 5633 reviews for DIN TAI FUNG 


In [None]:
# Container div whose nested text is dumped below.
parent_div = soup.find('div', class_="y-css-maehnx")

# Extract all text from the parent div and its child divs, one fragment per line.
if parent_div:
    all_text = parent_div.get_text(separator='\n', strip=True)
    print(all_text)
else:
    # Keep `all_text` defined so this cell never raises NameError or prints a
    # stale value left over from an earlier run.
    all_text = ""
    print("Parent div not found.")

$8.50
Cucumber Salad
343 Photos
360 Reviews
$20.00
Braised Beef Noodle Soup
203 Photos
316 Reviews
$16.00
Chicken Wonton Soup
93 Photos
142 Reviews
$18.50
Shrimp Fried Rice
142 Photos
168 Reviews
$16.00
String Beans with Garlic
244 Photos
123 Reviews
$15.50
Sweet & Sour Pork Baby Back Ribs
185 Photos
132 Reviews
$17.00
Chicken Dumplings
123 Photos
125 Reviews
$20.00
Pork Chop Fried Rice
118 Photos
107 Reviews
$17.50
Kurobuta Pork Xiao Long Bao
422 Photos
65 Reviews
$13.50
Shrimp & Kurobuta Pork Pot Stickers
112 Photos
85 Reviews
$9.50
Kurobuta Pork Buns
94 Photos
91 Reviews
$17.00
Chicken Fried Rice
52 Photos
78 Reviews
$12.00
Shrimp & Kurobuta Pork Shao Mai
122 Photos
66 Reviews
$10.00
Chocolate Buns
43 Photos
60 Reviews


In [None]:
pagination_div = soup.find('div', class_='pagination__09f24__VRjN4 y-css-1l7sbyz')
if pagination_div:
    # Extract the total number of pages: the last whitespace-separated token of
    # the pagination text is parsed as the page count.
    num_pages = int(pagination_div.text.split()[-1])
    print("Total number of pages:", num_pages)
else:
    print("Pagination information not found. Unable to determine the number of pages.")
    # Keep `num_pages` defined so the URL-building cell does not fail with a
    # NameError; with a single page only the base page is covered.
    num_pages = 1
    print("Falling back to a single page.")

Total number of pages: 562


In [None]:
# One URL per page: the `start` query parameter advances in steps of 10.
url = [f"{base_url}&start={offset}" for offset in range(0, num_pages * 10, 10)]

url[:10]


['https://www.yelp.com/biz/din-tai-fung-san-diego-3?osq=Din+Tai+Fung&start=0',
 'https://www.yelp.com/biz/din-tai-fung-san-diego-3?osq=Din+Tai+Fung&start=10',
 'https://www.yelp.com/biz/din-tai-fung-san-diego-3?osq=Din+Tai+Fung&start=20',
 'https://www.yelp.com/biz/din-tai-fung-san-diego-3?osq=Din+Tai+Fung&start=30',
 'https://www.yelp.com/biz/din-tai-fung-san-diego-3?osq=Din+Tai+Fung&start=40',
 'https://www.yelp.com/biz/din-tai-fung-san-diego-3?osq=Din+Tai+Fung&start=50',
 'https://www.yelp.com/biz/din-tai-fung-san-diego-3?osq=Din+Tai+Fung&start=60',
 'https://www.yelp.com/biz/din-tai-fung-san-diego-3?osq=Din+Tai+Fung&start=70',
 'https://www.yelp.com/biz/din-tai-fung-san-diego-3?osq=Din+Tai+Fung&start=80',
 'https://www.yelp.com/biz/din-tai-fung-san-diego-3?osq=Din+Tai+Fung&start=90']

#### Scraping reviews in pages

In [None]:
def scrape_reviews (url, timeout=30):
  """Download one review page and return its reviews as a DataFrame.

  Parameters
  ----------
  url : str
      Address of the review page to download.
  timeout : float, optional
      Seconds to wait for the server before `requests` gives up (default 30).

  Returns
  -------
  pandas.DataFrame
      Columns 'Date', 'Review' and 'Rating', one row per review. The frame is
      empty when no review paragraphs are found on the page.

  Raises
  ------
  requests.exceptions.RequestException
      If the request fails or times out.
  """
  print(url)
  # `headers` is the notebook-level dict of browser-like request headers.
  result = requests.get(url, headers=headers, timeout=timeout)
  soup = BeautifulSoup(result.text,'html.parser')

  ############# Getting comments ##############
  reviews = soup.find_all('p', class_=["comment__09f24__D0cxf y-css-1wfz87z","comment__09f24__D0cxf truncated__09f24__IiW9r y-css-1wfz87z"])
  rev = [review.text for review in reviews]

  ########### Getting dates for respective review ##############
  date = [dt.text for dt in soup.find_all('div', class_='y-css-scqtta')]

  ########### Getting star ratings for respective review ##############
  # The rating text (e.g. "5 star rating") is read from the aria-label
  # attribute; .get() yields None for a div that has no such attribute.
  rt = [rte.get('aria-label') for rte in soup.find_all('div', class_='y-css-dnttlc')]

  # pandas raises ValueError when the columns differ in length, so every list
  # is cut to the shortest one instead of only trimming the ratings.
  n_rows = min(len(rev), len(date), len(rt))
  return pd.DataFrame({'Date': date[:n_rows], 'Review': rev[:n_rows], 'Rating': rt[:n_rows]})

#### Scraping Page 1

In [None]:
# Scrape the first page (the base URL carries no `start` offset) to seed `df`,
# which the later scraping loop extends.
df = scrape_reviews(base_url)
df.head()

https://www.yelp.com/biz/din-tai-fung-san-diego-3?osq=Din+Tai+Fung


Unnamed: 0,Date,Review,Rating
0,"Oct 24, 2024",Dine in | Dinner | $100+Definitely recomme...,5 star rating
1,"Nov 18, 2024",The service is amazing and the food is so good...,5 star rating
2,"Oct 26, 2024",This is first time for me to try this place. I...,5 star rating
3,"Dec 9, 2024",What is there to say? Amazing Taiwanese food. ...,5 star rating
4,"Dec 6, 2024",As someone who's been to the original Din Tai ...,5 star rating


In [None]:
# Number of reviews collected from the first page.
len(df)

10

#### Scraping 200 reviews

In [None]:
# The loop stops as soon as the follow-up pages have yielded more than this
# many reviews; the page that crosses the limit is not kept.
MAX_EXTRA_REVIEWS = 200
# Seconds to wait before retrying a page that came back empty.
RETRY_DELAY_SECONDS = 5

lst_url = []       # pages that were still empty after one retry
rnd = 0            # running count of reviews returned by the pages in this loop
page_frames = []   # per-page frames, concatenated once after the loop

# url[0] is the first page, which was scraped separately, so numbering starts at 2.
for cnt, link in enumerate(url[1:], start=2):
  print(f"Scraping page {cnt}")
  df_temp = scrape_reviews(link)
  if df_temp.shape[0] == 0 :
    print( "EMPTY")
    time.sleep(RETRY_DELAY_SECONDS)
    df_temp = scrape_reviews(link)
    print(df_temp.shape[0])
    if df_temp.shape[0] == 0 :
      lst_url.append(link)
  rnd = rnd + df_temp.shape[0]
  if rnd > MAX_EXTRA_REVIEWS:
    break
  # Empty frames contribute no rows, so they are left out of the final concat.
  if not df_temp.empty:
    page_frames.append(df_temp)

# A single concat avoids re-copying the growing frame on every iteration.
df = pd.concat([df, *page_frames], ignore_index=True)


Scraping page 2
https://www.yelp.com/biz/din-tai-fung-san-diego-3?osq=Din+Tai+Fung&start=10
Scraping page 3
https://www.yelp.com/biz/din-tai-fung-san-diego-3?osq=Din+Tai+Fung&start=20
Scraping page 4
https://www.yelp.com/biz/din-tai-fung-san-diego-3?osq=Din+Tai+Fung&start=30
Scraping page 5
https://www.yelp.com/biz/din-tai-fung-san-diego-3?osq=Din+Tai+Fung&start=40
Scraping page 6
https://www.yelp.com/biz/din-tai-fung-san-diego-3?osq=Din+Tai+Fung&start=50
Scraping page 7
https://www.yelp.com/biz/din-tai-fung-san-diego-3?osq=Din+Tai+Fung&start=60
Scraping page 8
https://www.yelp.com/biz/din-tai-fung-san-diego-3?osq=Din+Tai+Fung&start=70
EMPTY
https://www.yelp.com/biz/din-tai-fung-san-diego-3?osq=Din+Tai+Fung&start=70
10
Scraping page 9
https://www.yelp.com/biz/din-tai-fung-san-diego-3?osq=Din+Tai+Fung&start=80
Scraping page 10
https://www.yelp.com/biz/din-tai-fung-san-diego-3?osq=Din+Tai+Fung&start=90
EMPTY
https://www.yelp.com/biz/din-tai-fung-san-diego-3?osq=Din+Tai+Fung&start=90
11
S

#### Scraping pages 200-300

In [None]:
# Drop rows that exactly repeat an earlier row, then report (rows, columns).
df = df.drop_duplicates()
df.shape

(207, 3)

In [None]:
# Colab-only step: mount Google Drive at /content/drive/ so the project
# folder used by the following cells is reachable from the kernel.
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [None]:
import sys

# Put the Drive project folder on the module search path.
project_dir = "/content/drive/My Drive/CS 561"
sys.path.append(project_dir)

# Show the resulting search path.
print(sys.path)

['/content', '/env/python', '/usr/lib/python310.zip', '/usr/lib/python3.10', '/usr/lib/python3.10/lib-dynload', '', '/usr/local/lib/python3.10/dist-packages', '/usr/lib/python3/dist-packages', '/usr/local/lib/python3.10/dist-packages/IPython/extensions', '/usr/local/lib/python3.10/dist-packages/setuptools/_vendor', '/root/.ipython', '/content/drive/My Drive/CS 561']


In [None]:
# IPython magic: switch the working directory to the project folder, so the
# relative path used for the Excel export resolves there.
%cd "/content/drive/My Drive/CS 561"

/content/drive/My Drive/CS 561


In [None]:
import openpyxl

# Translation table that deletes the control characters treated as invalid for
# Excel: code points 0-31 except tab (9), newline (10) and carriage return (13).
_EXCEL_INVALID_CHAR_TABLE = {
    code: None for code in range(32) if code not in (9, 10, 13)
}


def replace_invalid_chars(text):
    """Remove control characters that are invalid for Excel from a cell value.

    Parameters
    ----------
    text : object
        Cell value to clean. Only `str` values are modified.

    Returns
    -------
    object
        For a string, a copy with the characters 0x00-0x1F removed, except tab
        (0x09), newline (0x0A) and carriage return (0x0D), which are kept.
        Any non-string value (None, NaN, numbers) is returned unchanged, so the
        function can be applied to object columns that contain missing values.
    """
    if not isinstance(text, str):
        return text
    # One pass over the string instead of one `replace` call per character.
    return text.translate(_EXCEL_INVALID_CHAR_TABLE)

# Clean every object-dtype (string) column element by element.
text_columns = df.select_dtypes(include=['object']).columns
for col in text_columns:
    df[col] = df[col].map(replace_invalid_chars)

# Write the cleaned frame to an Excel workbook in the current working directory.
df.to_excel('reviews_test.xlsx', index=False, engine='openpyxl')