-
Notifications
You must be signed in to change notification settings - Fork 0
/
get_info.py
158 lines (123 loc) · 4.61 KB
/
get_info.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
"""
This script is used for scraping the reviews data for all of the restaurants
for which we have metadata in our MongoDB. It is meant to be run as a
multithreaded script called from multi_process.py in order to speed up
scraping time.
notes: make sure mongod running. use `sudo mongod` in terminal
"""
from pymongo import MongoClient
from bs4 import BeautifulSoup
import requests
import sys
import logging
import time
import os
# Specify MongoDB details here
# Module-level connection: MongoClient() with no args connects to the default
# localhost:27017 (hence the module docstring's "make sure mongod running").
client = MongoClient()
db = client.restaurants
coll = db.seattle_only_meta
# Proxy login information to avoid IP blocks
# NOTE(review): os.environ.get returns None when a variable is unset, so a
# missing PROXY_URL raises TypeError on the string concatenation below, and
# missing AUTH_NAME/PASSWORD silently build proxy auth from None values --
# confirm these three env vars are required before running.
AUTH = requests.auth.HTTPProxyAuth(os.environ.get('AUTH_NAME'), os.environ.get('PASSWORD'))
PROXIES = {'http': 'http://' + os.environ.get('PROXY_URL')}
def get_npages(url):
    """
    Return the number of pages of reviews on Yelp for the restaurant whose
    main Yelp page is at `url`.

    Returns 0 when the page could not be fetched; defaults to 1 page when
    the pagination element is absent (single-page listings omit it).
    """
    html = try_requests(url)
    if not html:
        return 0
    soup = BeautifulSoup(html, "html.parser")
    pagination = soup.find('div', class_="page-of-pages")
    if pagination is None:
        return 1
    # The element's text ends with the total page count (e.g. "Page 1 of 23").
    return int(pagination.text.split()[-1])
def get_curr_rev_cnt(busi):
    """
    Return how many reviews for the business `busi` are already stored.

    INPUT: busi - a document in our MongoDB collection (dict-like) holding
                  data for a specific restaurant; may lack a 'reviews' key.
    """
    existing = busi.get('reviews', [])
    return len(existing)
def try_requests(url):
    """
    Attempt to fetch the html of the target webpage at `url`.
    If the attempt is unsuccessful (network error or non-200 status), the
    function tries again three more times, sleeping 5 seconds between
    attempts, before giving up and logging a warning.
    INPUT: url -- url address of target webpage to be loaded.
    OUTPUT: The html text of the target webpage, or "" on failure.
    """
    counter = 0
    try:
        r = requests.get(url, proxies=PROXIES, auth=AUTH)
    # Catch only requests' own errors (DNS, connection, timeout, ...) --
    # a bare except also swallowed KeyboardInterrupt/SystemExit.
    except requests.exceptions.RequestException:
        logging.warning('Could not access link %s' % url)
        return ""
    if r.status_code == 200:
        return r.text
    while counter < 3:
        counter += 1
        time.sleep(5)
        try:
            r = requests.get(url, proxies=PROXIES, auth=AUTH)
        except requests.exceptions.RequestException:
            # The original retry call was unprotected, so one transient
            # network error crashed the worker; treat it like a bad status
            # and keep retrying.
            continue
        if r.status_code == 200:
            return r.text
    logging.warning('Could not access link %s' % url)
    logging.warning("%d" % r.status_code)
    return ""
def get_reviews(busi, collection):
    """
    Check whether review data has already been scraped for a given
    restaurant, `busi`, and, if not, scrape the data from Yelp and add it
    to our `collection`. Only the first 1200 reviews for the business are
    scraped if that many exist (at most 30 pages, 40 reviews per page).
    INPUT: busi - a document in our MongoDB collection containing data for a
                  specific restaurant.
           collection - the MongoDB collection that we will be using.
    """
    logging.warning('getting reviews for ' + busi['name'])
    rev = []
    # Scrape only when Yelp reports reviews and we hold fewer than the
    # 1200-review cap (or fewer than the total, whichever is smaller).
    if (busi.get('review_count', 0) > 0) and (get_curr_rev_cnt(busi) < min(1200, busi.get('review_count', 0))):
        # Reset any partial data before re-scraping from scratch.
        collection.update({'id': busi['id']}, {'$set': {'reviews': []}})
        npages = get_npages(busi['url'])
        pagen = min(30, npages)  # cap: 30 pages x 40 reviews = 1200
        # `range` instead of the Python-2-only `xrange`; equivalent for
        # this small bounded loop and works on both interpreters.
        for page in range(pagen):
            # Yelp paginates via a ?start=<offset> query parameter.
            index = page * 40
            if page == 0:
                url = '%s' % busi['url']
            else:
                url = '%s?start=%d' % (busi['url'], index)
            text = try_requests(url)
            if len(text) > 0:
                soup = BeautifulSoup(text, 'html.parser')
                rev.extend(soup.find_all('div', class_='review'))
        try:
            collection.update({'id': busi['id']}, {'$set': {'reviews':
                [{'html': str(item)} for item in rev]}})
        # Narrowed from a bare except (which silently swallowed everything,
        # including KeyboardInterrupt) and now logs the DB failure instead
        # of hiding it.
        except Exception:
            logging.exception('failed to store reviews for %s' % busi.get('name'))
            # NOTE(review): 'failed.txt' is a fixed name opened with 'w' in a
            # script meant to run multithreaded, so concurrent failures
            # overwrite each other -- confirm whether append mode ('a') or a
            # per-worker filename is intended.
            with open('failed.txt', 'w') as failed:
                for item in rev:
                    failed.write('%s\n' % item)
if __name__ == '__main__':
    # find documents that do not have reviews yet
    # (either an empty 'reviews' array or no 'reviews' field at all),
    # sorted by _id so every worker sees the same stable ordering.
    lst = list(coll.find({"$or": [{"reviews": {"$size": 0}}, {"reviews": {"$exists": 0}}]}).sort('_id'))
    # Slice bounds for this worker, passed on the command line so that
    # multi_process.py can partition the work across processes.
    start = int(sys.argv[1])
    end = min(len(lst), int(sys.argv[2]))
    # Each worker logs to its own file, named after its slice.
    name = "log_%d_%d.txt" % (start, end)
    logging.basicConfig(filename=name, level=logging.WARNING)
    count = 0
    sublst = list(lst[start:end])
    lst = []  # release the full result list; only this worker's slice is kept
    for busi in sublst:
        time.sleep(1)  # throttle between restaurants to avoid IP blocks
        go = time.time()
        logging.warning('restaurant: %d ' % count)
        get_reviews(busi, coll)
        count += 1
        stop = time.time()
        logging.warning("took %d seconds" % (stop-go))
    sublst = []  # drop references once the slice is fully processed
    logging.warning("DONE")