# Scrape review data from Amazon

In [1]:
import requests
import time
import random
from bs4 import BeautifulSoup
import re
from datetime import datetime
import pandas as pd

In [5]:
def parse_reviews(page_html):
    """Extract customer reviews from the HTML of an Amazon product-reviews page.

    Every element whose ``id`` matches ``customer_review-*`` is treated as one
    review. A review is skipped (not raised on) when any of its fields --
    reviewer name, title, text, star rating or date -- is missing or cannot
    be parsed, so a single odd review never aborts the whole page.

    Args:
        page_html: Markup of one reviews page (``str`` or ``bytes``).

    Returns:
        A list of dicts with the keys ``name``, ``title``, ``review``,
        ``rating`` (float), ``rating_out_of`` (float) and ``date``
        (``datetime.date``), in page order.
    """
    reviews = []
    # Name the parser explicitly: without it bs4 picks whichever parser
    # happens to be installed (and warns), so results vary between machines.
    soup = BeautifulSoup(page_html, "html.parser")

    # Compile the loop-invariant patterns once per page.
    name_pattern = re.compile(r'.*profile-name.*')
    title_pattern = re.compile(r'.*review-title.*')
    text_pattern = re.compile(r"review-text-content")

    for div in soup.find_all(id=re.compile("customer_review-.*")):
        name_div = div.find(class_=name_pattern)
        title_div = div.find("a", class_=title_pattern)
        # The text lives in a <span> inside the container; check the
        # container first so a review without a body is skipped, not a crash.
        text_container = div.find(class_=text_pattern)
        text_div = None if text_container is None else text_container.find("span")

        if name_div is None or title_div is None or text_div is None:
            continue

        rating_pair = _parse_rating(div)
        review_date = _parse_review_date(div)
        if rating_pair is None or review_date is None:
            continue
        rating, out_of = rating_pair

        reviews.append(dict(
            name=name_div.text.strip(),
            title=title_div.text.strip(),
            review=text_div.text.strip(),
            rating=rating,
            rating_out_of=out_of,
            date=review_date,
        ))
    return reviews


def _parse_rating(div):
    """Return ``(rating, out_of)`` as floats for one review element, or None."""
    rating_tag = div.find(title=re.compile(r"\d+\.?\d? out of \d stars"))
    if rating_tag is None:
        return None
    # Prefer the visible text; fall back to the title attribute, which is the
    # string the search above actually matched.
    for candidate in (rating_tag.text, rating_tag.get("title", "")):
        match = re.search(r"(\d+\.?\d*) out of (\d+\.?\d*)", candidate)
        if match is not None:
            return float(match.group(1)), float(match.group(2))
    return None


# (pattern locating the date inside the text, strptime format) pairs, tried
# in order: day-first ("5 January 2020"), then month-first ("January 5, 2020").
_REVIEW_DATE_FORMATS = (
    (re.compile(r'\d{1,2}.*\d{4}'), '%d %B %Y'),
    (re.compile(r'[A-Za-z]+ \d{1,2}, \d{4}'), '%B %d, %Y'),
)


def _parse_review_date(div):
    """Return the review's date as ``datetime.date``, or None if unparseable."""
    date_tag = div.find(class_=r'review-date')
    if date_tag is None:
        return None
    date_text = date_tag.text.strip()
    for pattern, date_format in _REVIEW_DATE_FORMATS:
        match = pattern.search(date_text)
        if match is None:
            continue
        try:
            return datetime.strptime(match.group(0), date_format).date()
        except ValueError:
            # The text matched loosely but is not in this format; try the next.
            continue
    return None


def send_request(url, max_tries=5, wait_time=10, timeout=30):
    """GET ``url``, retrying on HTTP errors and transient network failures.

    Args:
        url: Address to fetch.
        max_tries: Maximum number of attempts.
        wait_time: Seconds to pause between attempts.
        timeout: Per-request timeout in seconds, so a stalled connection
            cannot hang the scrape indefinitely.

    Returns:
        The first response with a status code below 400. If every attempt
        fails, the most recent error response is returned so the caller can
        inspect its status code. ``None`` is returned when ``max_tries`` is
        not positive (no request is made).

    Raises:
        requests.RequestException: If the final attempt fails at the network
            level (connection error, timeout, ...).
    """
    response = None
    for attempt in range(1, max_tries + 1):
        is_last_attempt = attempt == max_tries
        try:
            response = requests.get(url, timeout=timeout)
        except requests.RequestException:
            # Network-level failures are retried like HTTP errors; only the
            # last one is allowed to propagate.
            if is_last_attempt:
                raise
        else:
            if response.status_code < 400:
                return response
        # Pause only when another attempt follows.
        if not is_last_attempt:
            time.sleep(wait_time)
    return response

def scrape_reviews(url, start_page=1, end_page=2, wait_time=10, max_tries=5):
    """Download and parse a range of review pages for one product.

    Args:
        url: Link to the product's "all reviews" page. ``pageNumber`` and
            ``sortBy=recent`` are appended to its query string.
        start_page: First page number to fetch (inclusive).
        end_page: Last page number to fetch (inclusive).
        wait_time: Upper bound, in seconds, of the random pause before each
            page; also the pause between retries of a failed request.
        max_tries: Maximum number of attempts per page.

    Returns:
        A list with the parsed reviews of every page that could be fetched,
        in page order. Pages that still fail after all retries are reported
        on stdout and skipped.
    """
    reviews = []
    # Append to the existing query string, or start one if the URL has none.
    separator = "&" if "?" in url else "?"
    # randint(1, n) requires an integer n >= 1.
    max_pause = max(1, int(wait_time))

    for page in range(start_page, end_page + 1):
        # Randomised pause so requests are not sent at a fixed rate.
        time.sleep(random.randint(1, max_pause))
        page_url = f"{url}{separator}pageNumber={page}&sortBy=recent"
        # Forward the caller's retry budget instead of a hard-coded value.
        response = send_request(page_url, max_tries=max_tries, wait_time=wait_time)
        if response is None or response.status_code >= 400:
            status = "no response" if response is None else response.status_code
            print(f"Page {page}: Request Failed [{status}]")
            continue
        page_reviews = parse_reviews(response.text)
        print(f"Page {page}: Reviews {len(page_reviews)}")
        reviews.extend(page_reviews)
    return reviews

In [6]:
# Open the product page, click through to "all reviews", and paste that link
# below. Keep exactly one `url` assignment active; the commented-out ones are
# alternative products.
# url = "https://www.amazon.in/SeCro-USB-Audio-Sound-Card/product-reviews/B07WSBKPXX/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&reviewerType=all_reviews"
# url = "https://www.amazon.in/AmazonBasics-Internal-Hardback-Backpack-Raincover/product-reviews/B06Y5NCY5K/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&reviewerType=all_reviews"
url = "https://www.amazon.in/Indigenous-Unprocessed-Unfiltered-Unpasteurized-Disorders/product-reviews/B07H5PVCH7/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&reviewerType=all_reviews"
start_page = 1   # first review page to fetch (inclusive)
end_page = 80    # last review page to fetch (inclusive)
wait_time = 15   # upper bound (seconds) of the pause between requests
max_tries = 8    # attempts per page before it is skipped

reviews = scrape_reviews(url,
                         start_page=start_page,
                         end_page=end_page,
                         wait_time=wait_time,
                         max_tries=max_tries)

Page 1: Reviews 10
Page 2: Reviews 10
Page 3: Reviews 9
Page 4: Reviews 10
Page 5: Reviews 10
Page 6: Reviews 10
Page 7: Reviews 10
Page 8: Reviews 10
Page 9: Reviews 10
Page 10: Reviews 10
Page 11: Reviews 10
Page 12: Reviews 10
Page 13: Reviews 10
Page 14: Reviews 10
Page 15: Reviews 9
Page 16: Request Failed [503]
Page 17: Reviews 10
Page 18: Reviews 10
Page 19: Request Failed [503]
Page 20: Reviews 10
Page 21: Reviews 10
Page 22: Reviews 10
Page 23: Request Failed [503]
Page 24: Reviews 10
Page 25: Reviews 10
Page 26: Reviews 10
Page 27: Reviews 10
Page 28: Reviews 10
Page 29: Reviews 10
Page 30: Reviews 10
Page 31: Reviews 10
Page 32: Reviews 10
Page 33: Reviews 10
Page 34: Reviews 10
Page 35: Reviews 10
Page 36: Reviews 10
Page 37: Reviews 10
Page 38: Reviews 10
Page 39: Reviews 10
Page 40: Reviews 10
Page 41: Reviews 10
Page 42: Reviews 10
Page 43: Reviews 10
Page 44: Reviews 10
Page 45: Reviews 10
Page 46: Reviews 10
Page 47: Reviews 10
Page 48: Reviews 10
Page 49: Reviews 10
P

In [7]:
# Write the scraped reviews to reviews.csv, leaving out the row-index column.
pd.DataFrame(data=reviews).to_csv(path_or_buf="reviews.csv", index=False)

In [None]:
# Download the reviews.csv file written by the previous cell.
# NOTE(review): relies on the google.colab package, so this presumably only
# works when the notebook runs inside Google Colab -- confirm before reuse.
from google.colab import files
files.download('reviews.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>