-
Notifications
You must be signed in to change notification settings - Fork 7
/
unlimitedReviews.py
164 lines (127 loc) · 4.68 KB
/
unlimitedReviews.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
from bs4 import BeautifulSoup
import re
import requests
import urllib.request
import json
import validators
def get_soup(url, header):
    """Fetch *url* with the given HTTP headers and parse the body as HTML."""
    request = urllib.request.Request(url, headers=header)
    response = urllib.request.urlopen(request)
    return BeautifulSoup(response, 'html.parser')
def create_base_url():
    """Return the root URL of the Amazon India storefront."""
    return "http://www.amazon.in"
def create_url_from_query(query):
    """Build an Amazon search-results URL for a whitespace-separated query."""
    keywords = "+".join(query.split())
    return (create_base_url()
            + "/s/ref=nb_sb_noss?url=search-alias%3Daps&field-keywords="
            + keywords
            + "&rh=i%3Aaps%2Ck%3A"
            + keywords)
def get_review(data, index):
    """Print one formatted review extracted from a parsed review element.

    Parameters:
        data: a parsed element supporting ``.find(tag, class_=...)``
              (a BeautifulSoup Tag for one review).
        index: 1-based ordinal of the review, shown in the heading.
    """
    def _field(tag, css_class, default):
        # Return the sub-element's text, or *default* when it is absent.
        node = data.find(tag, class_=css_class)
        return node.text if node is not None else default

    rating = _field("i", "review-rating", "N/A")
    date = _field("span", "review-date", "N/A")
    title = _field("a", "review-title", "N/A")
    text = _field("span", "review-text", "-")
    try:
        print("Review - " + str(index) + " " + date + " | (" + rating + ")")
        print("---")
        print(str(title))
        print("---")
        print(text + "\n")
    except (AttributeError, UnicodeEncodeError) as exc:
        # BUG FIX: the original wrapped the whole body in a bare
        # ``except: pass`` that silently hid every failure. Keep the
        # best-effort behaviour (skip a review that cannot be rendered,
        # e.g. console encoding errors) but report what was skipped.
        print("[!] Skipping review %d: %s" % (index, exc))
def get_reviews_pages(data):
    """Return the total number of review pages from a pagination element.

    *data* is an iterable of page buttons (e.g. the children of the
    ``ul.a-pagination`` tag); the page count is the text of the
    third-from-last entry (the trailing entries are navigation controls).
    Returns 1 when that entry cannot be located.
    """
    # BUG FIX: the original iterated the module-level global ``pages``
    # instead of the ``data`` parameter; it only worked by accident
    # because the single caller happened to pass that same global object.
    buttons = list(data)
    target = len(buttons) - 2
    for position, button in enumerate(buttons):
        if position == target:
            return int(button.text)
    return 1
def get_review_page_url(data):
    """Return the absolute URL of the first anchor inside *data*.

    Falls back to the site root (printing the error) when *data* is None
    or has no usable anchor.
    """
    try:
        return create_base_url() + data.find("a").get("href")
    except Exception as error:
        print(error)
        return create_base_url()
def get_review_page_number_url(data, number):
    """Rewrite the ``btm_<n>`` fragment of a review-page URL to page
    *number* and append the matching ``pageNumber`` query parameter."""
    page = str(number)
    rewritten = re.sub("btm_[0-9]+", "btm_" + page, str(data))
    return rewritten + "&pageNumber=" + page
def get_reviews_all(index, url, pages):
    """Fetch *pages* consecutive review pages starting from *url* and print
    every review found, continuing the numbering from *index*.

    NOTE(review): relies on the module-level ``header`` dict for the HTTP
    request headers — confirm it is defined before this is called.
    """
    for page_number in range(1, pages + 1):
        page_url = get_review_page_number_url(url, page_number)
        soup = get_soup(page_url, header)
        index = get_reviews_from_html(soup, index)
def get_reviews_from_html(data, index):
    """Print every review found in the parsed page *data*, continuing the
    numbering from *index*; return the updated review count."""
    for element in data.find_all("div", class_="a-section review"):
        index += 1
        get_review(element, index)
    return index
# ---- Script entry point: search Amazon, list products, dump reviews ----
query = input("Enter Product Name\n")  # you can change the query for the product here
url = create_url_from_query(query)
# Desktop-browser User-Agent so Amazon serves the full HTML page.
header = { 'User-Agent': "Mozilla/6.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.134 Safari/537.36" }
soup = get_soup(url, header)
ActualProducts = []  # (link, name) tuples for each search result
reviews = []
for a in soup.find_all("a", class_="s-access-detail-page"):
    try:
        # Keep only anchors that carry a product title and a valid URL.
        if a.find("h2") is not None and validators.url(a.get("href")):
            name = a.find("h2").string
            link = a.get("href")
            ActualProducts.append((link, name))
    except Exception as e:
        print(e)
for i, (link, name) in enumerate(ActualProducts):
    try:
        print("%d - %s" % (i, name))
    except Exception as e:
        # Product names may contain characters the console cannot encode.
        print(e)
l = len(ActualProducts)
print("\nThere are total ", l, " links")
review = int(input("Enter the product number to get reviews\n"))
try:
    # BUG FIX: the original accepted any number < l, so a negative entry
    # silently indexed from the end of the list; also nothing was printed
    # for an out-of-range index.
    if 0 <= review < l:
        link, name = ActualProducts[review]
        so = get_soup(link, header)
        print("\n" + str(so.title.string))
        product = so.find_all(id="revF")
        if len(product) > 0:
            reviewsUrl = product[0].find("a").get('href')
            reviewsMain = get_soup(reviewsUrl, header)
            # Locate the currently selected page button to derive the
            # canonical review-page URL.
            pageUrlBody = reviewsMain.find("li", class_="a-selected page-button")
            pageUrl = get_review_page_url(pageUrlBody)
            # Count the available review pages from the pagination bar.
            pages = reviewsMain.find("ul", class_="a-pagination")
            if pages is not None:
                pages = get_reviews_pages(pages)
            else:
                pages = 0
            print("\nNumber of review pages: " + str(pages) + "\n")
            numberPagesForFetch = int(input("Enter the number of pages for fetch\n"))
            # Fetch and print the reviews.
            get_reviews_all(0, pageUrl, numberPagesForFetch)
        else:
            print("Invalid Input")
    else:
        print("Product number out of range")
except Exception as e:
    # BUG FIX: the original bare ``except: pass`` silently discarded every
    # error (network failures, parse errors, bad numeric input) — report it.
    print("Error: " + str(e))