# WEBSCRAPING

In [1]:
import requests
from bs4 import BeautifulSoup # Part of the bs4 library, used to parse and extract data from HTML pages
import urllib.request # Used to send requests to web pages and retrieve their HTML content
import re # The regular expressions (regex) library, used to find and replace patterns in text
import pandas as pd
import time

#### Checking if website can be scraped

In [2]:
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Referer': 'https://www.yelp.com'
}

#### Scraping 100 Pages at a time

In [3]:
############ Getting URL of all pages that can be scraped for reviews ###############
base_url = 'https://www.yelp.com/biz/c-level-san-diego-6'
result = requests.get(base_url)
soup = BeautifulSoup(result.text,'html.parser')

In [4]:
title = soup.title.text
det = title.split("-")
print(f"Resturant Name: {det[0]}")
print(f"Resturant Location: {det[3]}")
print(f"Resturant Style: {det[4]}")
print(f"Review Details: {det[2]}")
print(f"Last Updated: {det[1]}")

Resturant Name: C LEVEL 
Resturant Location:  880 Harbor Island Dr, San Diego, California 
Resturant Style:  New American 
Review Details:  7620 Photos & 5039 Reviews 
Last Updated:  Updated December 2024 


In [5]:
num_reviews = soup.find("span", class_ = "y-css-yrt0i5").text
num = int(re.findall(r'\d+', num_reviews)[0])
print(f"There are {num} reviews for {det[0]}")

There are 5043 reviews for C LEVEL 


In [7]:
parent_div = soup.find('div', class_="y-css-maehnx")  # Replace with the actual class name

# Extract all text from the parent div and its child divs
if parent_div:
    all_text = parent_div.get_text(separator='\n', strip=True)
else:
    print("Parent div not found.")
print("**"+all_text)

**Price varies
Island Prime's Lobster Bisque
195 Photos
661 Reviews
$28.95
Ahi Tuna & Salmon Poke
111 Photos
182 Reviews
$25.95
Blackened Fish Tacos
75 Photos
165 Reviews
$11.00
Chef Deborah's Pepita & Sesame Crusted Brie
68 Photos
147 Reviews
$36.95
Seafood Pasta
115 Photos
115 Reviews
$11.00
Coconut Shrimp
65 Photos
108 Reviews
$28.95
Lobster & Fontina BLT
61 Photos
83 Reviews
$38.95
Everything Crusted Ahi
35 Photos
62 Reviews
$13.95
French Onion Soup
43 Photos
63 Reviews
$15.95
Crispy Artichoke Fritters
36 Photos
63 Reviews
$22.95
Blackened Chicken Sandwich
20 Photos
56 Reviews
$17.95
Caesar Salad
11 Photos
54 Reviews


In [8]:
pagination_div = soup.find('div', class_='pagination__09f24__VRjN4 y-css-1l7sbyz')
if pagination_div:
    # Extract the total number of pages from the pagination information
    num_pages = int(pagination_div.text.split()[-1])
    print("Total number of pages:", num_pages)
else:
    print("Pagination information not found. Unable to determine the number of pages.")

Total number of pages: 504


In [9]:
url = []
cnt=0
for i in range(0,num_pages):
  url.append(base_url + '&start=' + str(cnt))
  cnt += 10

url[:10]


['https://www.yelp.com/biz/c-level-san-diego-6&start=0',
 'https://www.yelp.com/biz/c-level-san-diego-6&start=10',
 'https://www.yelp.com/biz/c-level-san-diego-6&start=20',
 'https://www.yelp.com/biz/c-level-san-diego-6&start=30',
 'https://www.yelp.com/biz/c-level-san-diego-6&start=40',
 'https://www.yelp.com/biz/c-level-san-diego-6&start=50',
 'https://www.yelp.com/biz/c-level-san-diego-6&start=60',
 'https://www.yelp.com/biz/c-level-san-diego-6&start=70',
 'https://www.yelp.com/biz/c-level-san-diego-6&start=80',
 'https://www.yelp.com/biz/c-level-san-diego-6&start=90']

#### Scraping reviews in pages

In [None]:
def scrape_reviews (url):
  print(url)
  result = requests.get(url)
  soup = BeautifulSoup(result.text,'html.parser')
  ############# Getting comments ##############
  reviews = soup.find_all('p', class_=["comment__09f24__D0cxf y-css-1wfz87z","comment__09f24__D0cxf truncated__09f24__IiW9r y-css-1wfz87z"])
  rev = []
  if reviews:
    for review in reviews:
        rev.append(review.text)
  #print(rev)

  ########### Getting dates for respective review ##############
  date_ext = soup.find_all('div', class_='y-css-scqtta')
  #print(len(rating_div))

  date = []
  if date_ext:
    for dt in date_ext:
        date.append(dt.text)
  #print(len(date))

  ########### Getting star ratings for respective review ##############
  rating_div = soup.find_all('div', class_='y-css-dnttlc')
  #print(len(rating_div))

  rt = []
  if rating_div:
    for rte in rating_div:
        #rr = rte.find('div', {"aria-label": re.compile('star rating')})["aria-label"].split()[0]
        rt.append(rte.get('aria-label'))
  #print(len(rt))
  rt=rt[:len(rev)]

  df = pd.DataFrame({'Date': date,'Review': rev, 'Rating': rt})
  #print(df.head(10))
  return df

#### Scraping Page 1

In [None]:
df = scrape_reviews('https://www.yelp.com/biz/c-level-san-diego-6')

https://www.yelp.com/biz/c-level-san-diego-6


In [None]:
df.shape[0]

10

#### Scraping page 2 - 200

In [None]:
cnt = 2
lst_url = []
rnd = 0
for link in url[1:200]:
  print(f"Scraping page {cnt}")
  cnt += 1
  df_temp = scrape_reviews(link)
  if df_temp.shape[0] == 0 :
    print( "EMPTY")
    time.sleep(5)
    df_temp = scrape_reviews(link)
    print(df_temp.shape[0])
    if df_temp.shape[0] == 0 :
      lst_url.append(link)


  rnd = rnd + df_temp.shape[0]
  df = pd.concat([df, df_temp], ignore_index=True)


Scraping page 2
https://www.yelp.com/biz/c-level-san-diego-6?start=10
Scraping page 3
https://www.yelp.com/biz/c-level-san-diego-6?start=20
EMPTY
https://www.yelp.com/biz/c-level-san-diego-6?start=20
10
Scraping page 4
https://www.yelp.com/biz/c-level-san-diego-6?start=30
Scraping page 5
https://www.yelp.com/biz/c-level-san-diego-6?start=40
EMPTY
https://www.yelp.com/biz/c-level-san-diego-6?start=40
12
Scraping page 6
https://www.yelp.com/biz/c-level-san-diego-6?start=50
Scraping page 7
https://www.yelp.com/biz/c-level-san-diego-6?start=60
Scraping page 8
https://www.yelp.com/biz/c-level-san-diego-6?start=70
Scraping page 9
https://www.yelp.com/biz/c-level-san-diego-6?start=80
Scraping page 10
https://www.yelp.com/biz/c-level-san-diego-6?start=90
EMPTY
https://www.yelp.com/biz/c-level-san-diego-6?start=90
12
Scraping page 11
https://www.yelp.com/biz/c-level-san-diego-6?start=100
Scraping page 12
https://www.yelp.com/biz/c-level-san-diego-6?start=110
Scraping page 13
https://www.yelp.co

#### Scrapping pages 200-300

In [None]:
df = pd.DataFrame({'Date': [],'Review': [], 'Rating': []})
cnt = 201
lst_url = []
rnd = 0
for link in url[200:300]:
  print(f"Scraping page {cnt}")
  cnt += 1
  df_temp = scrape_reviews(link)
  if df_temp.shape[0] == 0 :
    print("EMPTY")
    time.sleep(5)
    df_temp = scrape_reviews(link)
    print(df_temp.shape[0])
    if df_temp.shape[0] == 0 :
      lst_url.append(link)


  rnd = rnd + df_temp.shape[0]
  df = pd.concat([df, df_temp], ignore_index=True)


Scraping page 201
https://www.yelp.com/biz/c-level-san-diego-6?start=2000
EMPTY
https://www.yelp.com/biz/c-level-san-diego-6?start=2000
10
Scraping page 202
https://www.yelp.com/biz/c-level-san-diego-6?start=2010
Scraping page 203
https://www.yelp.com/biz/c-level-san-diego-6?start=2020
Scraping page 204
https://www.yelp.com/biz/c-level-san-diego-6?start=2030
Scraping page 205
https://www.yelp.com/biz/c-level-san-diego-6?start=2040
EMPTY
https://www.yelp.com/biz/c-level-san-diego-6?start=2040
0
Scraping page 206
https://www.yelp.com/biz/c-level-san-diego-6?start=2050
Scraping page 207
https://www.yelp.com/biz/c-level-san-diego-6?start=2060
EMPTY
https://www.yelp.com/biz/c-level-san-diego-6?start=2060
15
Scraping page 208
https://www.yelp.com/biz/c-level-san-diego-6?start=2070
EMPTY
https://www.yelp.com/biz/c-level-san-diego-6?start=2070
0
Scraping page 209
https://www.yelp.com/biz/c-level-san-diego-6?start=2080
EMPTY
https://www.yelp.com/biz/c-level-san-diego-6?start=2080
11
Scraping pa

#### Scraping 300 - 400

In [None]:
df = pd.DataFrame({'Date': [],'Review': [], 'Rating': []})
cnt = 301
lst_url = []
rnd = 0
for link in url[300:400]:
  print(f"Scraping page {cnt}")
  cnt += 1
  df_temp = scrape_reviews(link)
  if df_temp.shape[0] == 0 :
    print("EMPTY")
    time.sleep(5)
    df_temp = scrape_reviews(link)
    print(df_temp.shape[0])
    if df_temp.shape[0] == 0 :
      lst_url.append(link)


  rnd = rnd + df_temp.shape[0]
  df = pd.concat([df, df_temp], ignore_index=True)

Scraping page 301
https://www.yelp.com/biz/c-level-san-diego-6?start=3000
Scraping page 302
https://www.yelp.com/biz/c-level-san-diego-6?start=3010
Scraping page 303
https://www.yelp.com/biz/c-level-san-diego-6?start=3020
EMPTY
https://www.yelp.com/biz/c-level-san-diego-6?start=3020
12
Scraping page 304
https://www.yelp.com/biz/c-level-san-diego-6?start=3030
EMPTY
https://www.yelp.com/biz/c-level-san-diego-6?start=3030
0
Scraping page 305
https://www.yelp.com/biz/c-level-san-diego-6?start=3040
Scraping page 306
https://www.yelp.com/biz/c-level-san-diego-6?start=3050
Scraping page 307
https://www.yelp.com/biz/c-level-san-diego-6?start=3060
Scraping page 308
https://www.yelp.com/biz/c-level-san-diego-6?start=3070
EMPTY
https://www.yelp.com/biz/c-level-san-diego-6?start=3070
0
Scraping page 309
https://www.yelp.com/biz/c-level-san-diego-6?start=3080
Scraping page 310
https://www.yelp.com/biz/c-level-san-diego-6?start=3090
Scraping page 311
https://www.yelp.com/biz/c-level-san-diego-6?star

#### Scraping 400:503

In [None]:
df = pd.DataFrame({'Date': [],'Review': [], 'Rating': []})
cnt = 401
lst_url = []
rnd = 0
for link in url[400:503]:
  print(f"Scraping page {cnt}")
  cnt += 1
  df_temp = scrape_reviews(link)
  if df_temp.shape[0] == 0 :
    print("EMPTY")
    time.sleep(5)
    df_temp = scrape_reviews(link)
    print(df_temp.shape[0])
    if df_temp.shape[0] == 0 :
      lst_url.append(link)


  rnd = rnd + df_temp.shape[0]
  df = pd.concat([df, df_temp], ignore_index=True)

Scraping page 401
https://www.yelp.com/biz/c-level-san-diego-6?start=4000
Scraping page 402
https://www.yelp.com/biz/c-level-san-diego-6?start=4010
EMPTY
https://www.yelp.com/biz/c-level-san-diego-6?start=4010
0
Scraping page 403
https://www.yelp.com/biz/c-level-san-diego-6?start=4020
Scraping page 404
https://www.yelp.com/biz/c-level-san-diego-6?start=4030
Scraping page 405
https://www.yelp.com/biz/c-level-san-diego-6?start=4040
Scraping page 406
https://www.yelp.com/biz/c-level-san-diego-6?start=4050
Scraping page 407
https://www.yelp.com/biz/c-level-san-diego-6?start=4060
EMPTY
https://www.yelp.com/biz/c-level-san-diego-6?start=4060
0
Scraping page 408
https://www.yelp.com/biz/c-level-san-diego-6?start=4070
Scraping page 409
https://www.yelp.com/biz/c-level-san-diego-6?start=4080
EMPTY
https://www.yelp.com/biz/c-level-san-diego-6?start=4080
10
Scraping page 410
https://www.yelp.com/biz/c-level-san-diego-6?start=4090
Scraping page 411
https://www.yelp.com/biz/c-level-san-diego-6?star

In [None]:
#df = pd.DataFrame({'Date': [],'Review': [], 'Rating': []})
#cnt = 401
lst_url2 = []
rnd = 0
for link in lst_url:
  #print(f"Scraping page {cnt}")
  #cnt += 1
  df_temp = scrape_reviews(link)
  if df_temp.shape[0] == 0 :
    print("EMPTY")
    time.sleep(5)
    df_temp = scrape_reviews(link)
    print(df_temp.shape[0])
    if df_temp.shape[0] == 0 :
      lst_url2.append(link)


  rnd = rnd + df_temp.shape[0]
  df = pd.concat([df, df_temp], ignore_index=True)

Scraping page 504
https://www.yelp.com/biz/c-level-san-diego-6?start=4010
EMPTY
https://www.yelp.com/biz/c-level-san-diego-6?start=4010
10
Scraping page 504
https://www.yelp.com/biz/c-level-san-diego-6?start=4060
Scraping page 504
https://www.yelp.com/biz/c-level-san-diego-6?start=4370
Scraping page 504
https://www.yelp.com/biz/c-level-san-diego-6?start=4580
Scraping page 504
https://www.yelp.com/biz/c-level-san-diego-6?start=4840
EMPTY
https://www.yelp.com/biz/c-level-san-diego-6?start=4840
11
Scraping page 504
https://www.yelp.com/biz/c-level-san-diego-6?start=4880


In [None]:
df.shape

(1055, 3)

In [None]:
df.tail(10)

Unnamed: 0,Date,Review,Rating
1045,"Jul 2, 2015",If i lived in San Diego id eat here every day ...,5 star rating
1046,"Jun 28, 2015",Fathers' Day 5PM. Long wait. Service couldn'...,2 star rating
1047,"Jun 24, 2015",Lobster Mac and cheese was amazing. Not super...,4 star rating
1048,"Feb 23, 2016","Total experience rated as ""not recommended"". A...",1 star rating
1049,"Jun 1, 2015",This place has a great view and is right on th...,2 star rating
1050,"Dec 13, 2014",Pretty nice restaurant on harbor island. Absol...,5 star rating
1051,"Dec 28, 2009",Rebekah and the MOD were exceptional. We were...,5 star rating
1052,"Mar 18, 2010","The view is beautiful, the food is okay. Some ...",3 star rating
1053,"Jul 11, 2010",Beautiful views of San Diego! The food is not ...,4 star rating
1054,"Dec 20, 2010",My wife and I eat at C Level for a late dinner...,5 star rating


In [None]:
len(lst_url2)

0

In [None]:
rnd

62

In [None]:
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [None]:
import sys

# Add your absolute path of your project folder to system path.
sys.path.append("/content/drive/My Drive/CS 561")

# See the full list of paths in sys.path
print(sys.path)

['/content', '/env/python', '/usr/lib/python310.zip', '/usr/lib/python3.10', '/usr/lib/python3.10/lib-dynload', '', '/usr/local/lib/python3.10/dist-packages', '/usr/lib/python3/dist-packages', '/usr/local/lib/python3.10/dist-packages/IPython/extensions', '/usr/local/lib/python3.10/dist-packages/setuptools/_vendor', '/root/.ipython', '/content/drive/My Drive/CS 561']


In [None]:
%cd "/content/drive/My Drive/CS 561"

/content/drive/My Drive/CS 561


In [None]:
import openpyxl

def replace_invalid_chars(text):
    """Replaces invalid characters for Excel with empty strings."""
    # Control characters (0x00-0x1F) except tab (0x09), newline (0x0A), and carriage return (0x0D)
    # Also replace the null character (0x00)
    invalid_chars = list(range(0, 32)) # Add null char to the list of invalid_chars
    invalid_chars.remove(9)  # Tab
    invalid_chars.remove(10)  # Newline
    invalid_chars.remove(13)  # Carriage return

    for char_code in invalid_chars:
        text = text.replace(chr(char_code), "")

    return text

# Apply the function to all string columns in the DataFrame
for col in df.select_dtypes(include=['object']).columns:
    df[col] = df[col].apply(replace_invalid_chars)

# Now try exporting to Excel
df.to_excel('reviews3.xlsx', index=False, engine='openpyxl')

In [None]:
unscp = pd.DataFrame({'Un-scrapped':lst_url})
unscp.to_csv('un-scrapped3.csv')
unscp.head()

Unnamed: 0,Un-scrapped
0,https://www.yelp.com/biz/c-level-san-diego-6?s...
1,https://www.yelp.com/biz/c-level-san-diego-6?s...
2,https://www.yelp.com/biz/c-level-san-diego-6?s...
3,https://www.yelp.com/biz/c-level-san-diego-6?s...
4,https://www.yelp.com/biz/c-level-san-diego-6?s...
