-
Notifications
You must be signed in to change notification settings - Fork 1
/
yelp_reviews_scrapping.py
85 lines (71 loc) · 4.37 KB
/
yelp_reviews_scrapping.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
# Webscrapping yelp reviews of a restaurant to csv
from bs4 import BeautifulSoup
import requests
import csv
from time import sleep
from random import randint
import re
from datetime import datetime
# storing response of a get request to a variable
response = requests.get('https://www.yelp.com/biz/celine-patisserie-seattle-2')
# creating an instance/object of class bs4.BeautifulSoup i.e reviews and storing the html parsed value of the response in it
soup = BeautifulSoup(response.text, 'html.parser')
# print(reviews)
num_reviews = soup.find(class_='lemon--div__373c0__1mboc arrange-unit__373c0__o3tjT border-color--default__373c0__3-ifU nowrap__373c0__35McF').get_text('p')
# print(num_reviews) 245 reviews
# To get the number of reviews as int
num_reviews = int(re.findall('\d+', num_reviews)[0])
print(num_reviews)
# Storing all review pages to a list; each page has 20 reviews(validated comparing the url of pages)
url_list = []
for i in range(0, num_reviews, 20):
url_list.append('https://www.yelp.com/biz/celine-patisserie-seattle-2?start='+str(i))
print(url_list)
# reviews = soup.find_all('div', attrs={'class' : 'lemon--div__373c0__1mboc review__373c0__13kpL sidebarActionsHoverTarget__373c0__2kfhE arrange__373c0__2C9bH gutter-2__373c0__1DiLQ grid__373c0__1Pz7f layout-stack-small__373c0__27wVp border-color--default__373c0__3-ifU'})
# print(len(reviews))
# Reviewer
# reviews = reviews[0]
# user = reviews.find('a', attrs={'class' : 'lemon--a__373c0__IEZFH link__373c0__1G70M link-color--inherit__373c0__3dzpk link-size--inherit__373c0__1VFlE'}).string
# print(user)
# Location
# location = reviews.find('span', attrs={'class' : 'lemon--span__373c0__3997G text__373c0__2Kxyz text-color--normal__373c0__3xep9 text-align--left__373c0__2XGa- text-weight--bold__373c0__1elNz text-size--small__373c0__3NVWO'}).string
# print(location)
# Rating
# couldn't access the rating with the exact class so went to child-parent method
# rating = reviews.find('img', attrs={'class' : 'lemon--img__373c0__3GQUb offscreen__373c0__1KofL'}).parent.get('aria-label')
# rating = int(re.findall('\d+', rating)[0])
# print(rating)
# Date
# date = reviews.find('span', attrs={'class' : 'lemon--span__373c0__3997G text__373c0__2Kxyz text-color--mid__373c0__jCeOG text-align--left__373c0__2XGa-'}).string
# print(date)
# Review content
# content = reviews.find('span', attrs={'class' : 'lemon--span__373c0__3997G raw__373c0__3rKqk'}).get_text()
# print(content)
def onepage_scrape(reviews, csvwriter):
for review in reviews:
dic = {}
user = review.find('a', attrs={'class' : 'lemon--a__373c0__IEZFH link__373c0__1G70M link-color--inherit__373c0__3dzpk link-size--inherit__373c0__1VFlE'}).string
location = review.find('span', attrs={'class' : 'lemon--span__373c0__3997G text__373c0__2Kxyz text-color--normal__373c0__3xep9 text-align--left__373c0__2XGa- text-weight--bold__373c0__1elNz text-size--small__373c0__3NVWO'}).string
rating = review.find('img', attrs={'class' : 'lemon--img__373c0__3GQUb offscreen__373c0__1KofL'}).parent.get('aria-label')
rating = int(re.findall('\d+', rating)[0])
datestr = review.find('span', attrs={'class' : 'lemon--span__373c0__3997G text__373c0__2Kxyz text-color--mid__373c0__jCeOG text-align--left__373c0__2XGa-'}).string
date = datetime.strptime(datestr, '%m/%d/%Y').date()
content = review.find('span', attrs={'class' : 'lemon--span__373c0__3997G raw__373c0__3rKqk'}).get_text()
dic['user'] = user
dic['location'] = location
dic['rating'] = rating
dic['date'] = date
dic['content'] = content
csvwriter.writerow(dic.values())
with open('yelpreviews.csv', 'w', encoding='utf-8', newline='') as csvfile:
review_writer = csv.writer(csvfile, delimiter='|')
headers = ['User', 'Location', 'Rating', 'Date', 'Content']
review_writer.writerow(headers)
for index, url in enumerate(url_list):
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')
reviews = soup.find_all('div', attrs={'class' : 'lemon--div__373c0__1mboc review__373c0__13kpL sidebarActionsHoverTarget__373c0__2kfhE arrange__373c0__2C9bH gutter-2__373c0__1DiLQ grid__373c0__1Pz7f layout-stack-small__373c0__27wVp border-color--default__373c0__3-ifU'})
# function call
onepage_scrape(reviews, review_writer)
sleep(randint(1,5))
print('Completed page'+str(index+1))