forked from yeo09898/Easy_Crawler
-
Notifications
You must be signed in to change notification settings - Fork 0
/
yelp_scraper.py
122 lines (94 loc) · 3.41 KB
/
yelp_scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Oct 3 11:03:48 2017
@author: jackylee
"""
from bs4 import BeautifulSoup
import re
import time
import requests
import datetime
import csv
def getLink(link):
source = link.get('href')
if '/biz' in source:
source = "https://www.yelp.com" + source
source = source.split('?osq=')[0].strip()
print (source)
else:
source = 'NA'
return source
def getTitle(tag):
span = tag.find('span')
title = span.text
return title
def run_search(url, time_limit):
start_time = datetime.datetime.now()
p = -1
csv_lst = list()
fw = open('restaurant.txt', 'w+')
with fw:
while (True):
p = p + 1
running_time = datetime.datetime.now() - start_time
if running_time.seconds / 60.0 > time_limit:
print("time out")
break
print ('page', p)
html = None
pageLink = url + str(p * 10)
for i in range(5):
try:
running_time = datetime.datetime.now() - start_time
if running_time.seconds / 60.0 > time_limit:
print("time out")
break
response = requests.get(pageLink, headers={
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36', })
html = response.content
break
except Exception as e:
print ('failed attempt', i)
time.sleep(0.1)
if not html: continue
soup = BeautifulSoup(html.decode('ascii', 'ignore'), 'lxml')
restaurant_list = soup.findAll('div', {'class': re.compile('main-attributes')})
print("reading data...\n")
for restaurant in restaurant_list:
running_time = datetime.datetime.now() - start_time
if running_time.seconds / 60.0 > time_limit:
print("time out")
break
link = restaurant.find('a', {'class': re.compile('biz-name js-analytics-click')})
source = getLink(link)
title = getTitle(link)
category = get_category(restaurant)
rating = get_rating(restaurant)
if source == 'NA':
continue
fw.write(title + "\n" + category + "\n" + rating + "\n" + source + "\n")
csv_lst.append((title, category, rating, source))
time.sleep(0.1)
fw.close()
write_csv(csv_lst)
def get_rating(tag):
ratingChunk = tag.find('div', {'class': "i-stars"})
rating = ratingChunk.get('title')
return rating
def get_category(tag):
cate_Chunk = tag.find('span', {'class': "category-str-list"}).findAll('a')
category = ""
for cate in cate_Chunk:
category = cate.text + " "
return category.strip()
def write_csv(all_result):
with open('doc/result.csv', 'w+') as f:
f_csv = csv.writer(f)
f_csv.writerow(("Title", "Category", "Rating", "Link"))
for i in all_result:
f_csv.writerow(i)
f.close()
def run(keyword, time_limit):
url = 'https://www.yelp.com/search?find_desc=' + keyword + '&find_loc=New+York,+NY&start='
run_search(url, time_limit)