Skip to content
This repository has been archived by the owner on Dec 23, 2018. It is now read-only.

Commit

Permalink
Big refactor of code.
Browse files Browse the repository at this point in the history
Rename Reviewer to UserReviews.
Fix a bunch of bad regexp.
Rename author to user (author_id -> user_id, etc).
Remove redundant duplicate code.
Replace use of attr={} with parameters.
Simplify code in areas.
Move some common code into __init__.
Fix pagination possibly breaking due to regexp.
Fix a number of bugs.
  • Loading branch information
adamlwgriffiths committed Sep 30, 2015
1 parent ab18e36 commit 6df73f6
Show file tree
Hide file tree
Showing 13 changed files with 550 additions and 439 deletions.
72 changes: 49 additions & 23 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -152,7 +152,7 @@ Review API
View lists of reviews::

>>> p = amzn.lookup(ItemId='B0051QVF7A')
>>> rs = amzn.reviews(URL=p.reviews_url)
>>> rs = p.reviews()
>>> rs.asin
B0051QVF7A
>>> # print the reviews on this first page
Expand All @@ -161,25 +161,36 @@ View lists of reviews::
>>> rs.url
http://www.amazon.com/product-reviews/B0051QVF7A/ref=cm_cr_pr_top_sort_recent?&sortBy=bySubmissionDateDescending
>>> # by iterating over the reviews object we get access to reviews on ALL pages
>>> for r in rs:
>>> for r in rs.brief_reviews:
>>> print(r.id)
'R3MF0NIRI3BT1E'
'R3N2XPJT4I1XTI'
'RWG7OQ5NMGUMW'
...

View detailed reviews::

>>> rs = amzn.reviews(ItemId='B0051QVF7A')
>>> for r in rs.full_reviews():
>>> print(r.id)
'R3MF0NIRI3BT1E'
'R3N2XPJT4I1XTI'
'RWG7OQ5NMGUMW'
...

Quickly get a list of all reviews on a review page by converting the reviews
object to a list. This uses the brief reviews provided on the review page to
avoid downloading each review separately. As such, some information
may not be accessible::

>>> p = amzn.lookup(ItemId='B0051QVF7A')
>>> rs = amzn.reviews(URL=p.reviews_url)
>>> all_reviews_on_page = rs.all_reviews
>>> rs = p.reviews()
>>> all_reviews_on_page = list(rs)
>>> len(all_reviews_on_page)
10
>>> all_reviews_on_page[0].to_dict()["title"]
>>> r = all_reviews_on_page[0]
>>> r.title
'Fantastic device - pick your Kindle!'
>>> all_reviews_on_page[0].full_review().title
>>> fr = r.full_review()
>>> fr.title
'Fantastic device - pick your Kindle!'

By ASIN/ItemId::
Expand All @@ -193,7 +204,8 @@ By ASIN/ItemId::

For individual reviews use the `review` method::

>>> r = amzn.review(Id=rs.ids[0])
>>> review_id = 'R3MF0NIRI3BT1E'
>>> r = amzn.review(Id=review_id)
>>> r.id
R3MF0NIRI3BT1E
>>> r.asin
Expand All @@ -215,26 +227,40 @@ By URL::
R3MF0NIRI3BT1E


Reviewer API
~~~~~~~~~~~~
This package also supports getting information about specific reviewers and the reviews
they have written over time. It is advisable to first look up a reviewer via another one
of the products they have reviewed though. This situation will be improved in the future
though.
User Reviews API
~~~~~~~~~~~~~~~~~~
This package also supports getting reviews written by a specific user.

Get reviews that a single author has created::

Get reviews that a single reviewer has created::
>>> ur = amzn.user_reviews(Id="A2W0GY64CJSV5D")
>>> ur.brief_reviews
>>> ur.name


r = self.amzn.review(Id="R3MF0NIRI3BT1E")
reviewer = self.amzn.reviewer(r.author_reviews_url)
all_reviews = reviewer.all_reviews
Get reviews for a user, from a review object

Iterate to the author's next review page if they have one::
>>> r = amzn.review(Id="R3MF0NIRI3BT1E")
>>> # we can get the reviews directly, or via the API with a URL or ID
>>> ur = r.user_reviews()
>>> ur = amzn.user_reviews(URL=r.author_reviews_url)
>>> ur = amzn.user_reviews(Id=r.author_id)
>>> ur.brief_reviews
>>> ur.name

r = self.amzn.review(Id="R3MF0NIRI3BT1E")
reviewer = self.amzn.reviewer(r.author_reviews_url)
reviewer = self.amzn.reviewer(reviewer.next_page_url)
second_page_reviews = reviewer.all_reviews

Iterate over the current page's reviews::

>>> ur = amzn.user_reviews(Id="A2W0GY64CJSV5D")
>>> for r in ur.brief_reviews:
>>> print(r.id)


Iterate over all of the author's reviews::

>>> ur = amzn.user_reviews(Id="A2W0GY64CJSV5D")
>>> for r in ur:
>>> print(r.id)


Authors
Expand Down
115 changes: 81 additions & 34 deletions amazon_scraper/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,13 @@
import time
import requests
import warnings
from HTMLParser import HTMLParser
from amazon.api import AmazonAPI
import dateutil.parser
from bs4 import BeautifulSoup
from .version import __version__ # load our version

# stop warnings about unused variable
__version__

log = logging.getLogger(__name__)

Expand All @@ -29,6 +30,13 @@
amazon_base = 'http://www.amazon.com'

_extract_asin_regexp = re.compile(r'/dp/(?P<asin>[^/]+)')
_process_rating_regexp = re.compile(r'([\d\.]+) out of [\d\.]+ stars', flags=re.I)
_extract_reviews_asin_regexp = re.compile(r'/product-reviews/(?P<asin>[^/]+)', flags=re.I)
_extract_review_id_regexp = re.compile(r'/review/(?P<id>[^/]+)', flags=re.I)
_extract_reviewer_id_regexp = re.compile(r'/member-reviews/(?P<id>[^/]+)', flags=re.I)
_price_regexp = re.compile(ur'(?P<price>[$£][\d,\.]+)', flags=re.I)


def extract_asin(url):
try:
match = _extract_asin_regexp.search(url)
Expand All @@ -37,22 +45,31 @@ def extract_asin(url):
warnings.warn('Error matching ASIN in URL {}'.format(url))
raise


def product_url(asin):
    """Return the Amazon product (detail) page URL for the given ASIN."""
    return '{base}/dp/{asin}'.format(base=amazon_base, asin=asin)


def add_affiliate(url, affiliate):
    """Append the affiliate tag to *url* as a ``tag`` query parameter."""
    tagged_url = add_query(url, tag=affiliate)
    return tagged_url


def reviews_url(asin):
    """Return the product-reviews listing URL for *asin*, sorted newest first."""
    template = '{base}/product-reviews/{asin}/ref=cm_cr_pr_top_sort_recent?&sortBy=bySubmissionDateDescending'
    return template.format(base=amazon_base, asin=asin)


def review_url(id):
    """Return the URL of the page for a single review, given its review ID."""
    return '{base}/review/{id}'.format(base=amazon_base, id=id)

_process_rating_regexp = re.compile(r'([\d\.]+) out of [\d\.]+ stars', flags=re.I)

def reviewer_url(id):
    """Return the member-reviews URL for the given reviewer (member) ID."""
    return '{base}/gp/cdp/member-reviews/{id}'.format(base=amazon_base, id=id)


def process_rating(text):
"""The rating normalised to 1.0
"""
Expand All @@ -63,16 +80,16 @@ def process_rating(text):
warnings.warn('Error processing rating for text "{}"'.format(text))
raise

_extract_reviews_id_regexp = re.compile(r'/product-reviews/(?P<id>[^/]+)', flags=re.I)
def extract_reviews_id(url):

def extract_reviews_asin(url):
    """Extract the ASIN from a ``/product-reviews/<asin>`` URL.

    Emits a warning and re-raises on any failure (e.g. ``AttributeError``
    when the pattern does not match and ``search`` returns ``None``).
    """
    # NOTE: this span previously contained interleaved old diff lines
    # (the removed _extract_reviews_id_regexp variant); this is the
    # coherent current implementation.
    try:
        match = _extract_reviews_asin_regexp.search(url)
        return str(match.group('asin'))
    except:
        warnings.warn('Error matching reviews ASIN in URL {}'.format(url))
        raise

_extract_review_id_regexp = re.compile(r'/review/(?P<id>[^/]+)', flags=re.I)

def extract_review_id(url):
try:
match = _extract_review_id_regexp.search(url)
Expand All @@ -81,7 +98,16 @@ def extract_review_id(url):
warnings.warn('Error matching review ID in URL {}'.format(url))
raise

_price_regexp = re.compile(ur'(?P<price>[$£][\d,\.]+)', flags=re.I)

def extract_reviewer_id(url):
    """Extract the reviewer ID from a ``/member-reviews/<id>`` URL.

    Emits a warning and re-raises on any failure (e.g. ``AttributeError``
    when the pattern does not match and ``search`` returns ``None``).
    """
    try:
        match = _extract_reviewer_id_regexp.search(url)
        return str(match.group('id'))
    except:
        # Fixed message: it previously said 'review ID' (copy-paste from
        # extract_review_id), which made the warning misleading.
        warnings.warn('Error matching reviewer ID in URL {}'.format(url))
        raise


def extract_price(text):
try:
match = _price_regexp.search(text)
Expand All @@ -93,6 +119,7 @@ def extract_price(text):
warnings.warn('Error extracting price in text "{}"'.format(text))
raise


def add_query(url, **kwargs):
scheme, netloc, path, query_string, fragment = urlparse.urlsplit(url)
query_params = urlparse.parse_qs(query_string)
Expand All @@ -119,6 +146,7 @@ def strip_html_tags(html):
return text
return None


def retry(retries=5, exceptions=None):
if not exceptions:
exceptions = (BaseException,)
Expand All @@ -141,6 +169,15 @@ def decorator(*args, **kwargs):
return decorator
return outer


def get(url, api):
    """Perform a rate-limited HTTP GET on *url* and return the response.

    Raises ``requests.HTTPError`` for a non-success status code.
    """
    rate_limit(api)
    # verify=False ignores SSL errors
    response = requests.get(
        url,
        headers={'User-Agent': user_agent},
        verify=False,
    )
    response.raise_for_status()
    return response


def is_property(obj, k):
# only accept @property decorated functions
# these can only be detected via the __class__ object
Expand All @@ -149,6 +186,7 @@ def is_property(obj, k):
return True
return False


def dict_acceptable(obj, k, blacklist=None):
# don't store blacklisted variables
if blacklist and k in blacklist:
Expand All @@ -158,29 +196,30 @@ def dict_acceptable(obj, k, blacklist=None):
return False
return is_property(obj, k)


def rate_limit(api):
# apply rate limiting
# this is taken from bottlenose/api.py
# AmazonScraper -> SimpleProductAPI -> BottleNose
api = api.api.api
if api.MaxQPS:
last_query_time = api._last_query_time[0]
bn = api.bottlenose
if bn.MaxQPS:
last_query_time = bn._last_query_time[0]
if last_query_time:
wait_time = 1 / api.MaxQPS - (time.time() - last_query_time)
if wait_time > 0:
log.debug('Waiting %.3fs to call Amazon API' % wait_time)
time.sleep(wait_time)
api._last_query_time[0] = time.time()

#This schema of imports is non-standard and should change. It will require some re-ordering of
#functions inside the package though.
wait_time = 1 / bn.MaxQPS - (time.time() - last_query_time)
if wait_time > 0:
log.debug('Waiting %.3fs to call Amazon API' % wait_time)
time.sleep(wait_time)
bn._last_query_time[0] = time.time()

# This schema of imports is non-standard and should change. It will require some re-ordering of
# functions inside the package though.
from amazon_scraper.product import Product
from amazon_scraper.reviews import Reviews
from amazon_scraper.review import Review
from amazon_scraper.reviewer import Reviewer
from amazon_scraper.user_reviews import UserReviews


class AmazonScraper(object):

def __init__(self, access_key, secret_key, associate_tag, *args, **kwargs):
self.api = AmazonAPI(access_key, secret_key, associate_tag, *args, **kwargs)

Expand All @@ -190,31 +229,39 @@ def reviews(self, ItemId=None, URL=None):
def review(self, Id=None, URL=None):
    """Return a Review for a single review, looked up by review ID or by URL."""
    return Review(self, Id, URL)

def reviewer(self, url):
return Reviewer(url)
def user_reviews(self, Id=None, URL=None):
    """Return UserReviews (reviews written by one user), by user ID or URL."""
    return UserReviews(self, Id, URL)

def lookup(self, URL=None, **kwargs):
if URL:
kwargs['ItemId'] = extract_asin(URL)

result = self.api.lookup(**kwargs)
result = self.amazon_simple_api.lookup(**kwargs)
if isinstance(result, (list, tuple)):
result = [Product(p) for p in result]
result = [Product(self, p) for p in result]
else:
result = Product(result)
result = Product(self, result)
return result

def similarity_lookup(self, **kwargs):
for p in self.api.similarity_lookup(**kwargs):
yield Product(p)
for p in self.amazon_simple_api.similarity_lookup(**kwargs):
yield Product(self, p)

def browse_node_lookup(self, **kwargs):
return self.api.browse_node_lookup(**kwargs)
return self.amazon_simple_api.browse_node_lookup(**kwargs)

def search(self, **kwargs):
for p in self.api.search(**kwargs):
yield Product(p)
for p in self.amazon_simple_api.search(**kwargs):
yield Product(self, p)

def search_n(self, n, **kwargs):
for p in self.api.search_n(n, **kwargs):
yield Product(p)
for p in self.amazon_simple_api.search_n(n, **kwargs):
yield Product(self, p)

@property
def amazon_simple_api(self):
    """The wrapped ``amazon.api.AmazonAPI`` instance created in ``__init__``."""
    return self.api

@property
def bottlenose(self):
    # Per the chain noted in rate_limit: AmazonScraper -> SimpleProductAPI
    # -> BottleNose, so the bottlenose client is two levels down.
    return self.api.api
Loading

0 comments on commit 6df73f6

Please sign in to comment.