In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import os
import re
import browser_cookie3
from dateutil import parser
import time


In [2]:
# Cookies for the .amazon.in domain, loaded through browser_cookie3 from the
# local Chrome profile; they are passed to every request below.
cookie = browser_cookie3.chrome(domain_name='.amazon.in')

# HTTP headers sent with every request. The User-Agent literal is wrapped for
# readability only; the concatenated value is a single string.
header = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/80.0.3987.122 Safari/537.36"
    ),
    "Accept-Encoding": "gzip, deflate",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "DNT": "1",
    "Connection": "close",
    "Upgrade-Insecure-Requests": "1",
}

In [3]:
def reviewUrl(asin,cookie,header):
    """Return the absolute URL of the "see all reviews" page of a product.

    Parameters
    ----------
    asin : str
        Product identifier inserted into ``https://www.amazon.in/dp/{asin}``.
    cookie
        Cookies forwarded to ``requests.get`` through ``cookies=``.
    header : dict
        HTTP headers forwarded to ``requests.get`` through ``headers=``.

    Returns
    -------
    str
        ``'https://www.amazon.in'`` followed by the ``href`` of the link whose
        ``data-hook`` is ``see-all-reviews-link-foot``, or ``''`` when the
        product page does not answer with HTTP 200 (a message is printed).

    Raises
    ------
    Exception
        If the request or the parsing fails (the underlying error is chained
        as ``__cause__``), or if the page contains no review link.
    """
    url = f'https://www.amazon.in/dp/{asin}'
    try:
        # A timeout keeps a stalled connection from hanging the notebook.
        page = requests.get(url,cookies=cookie,headers=header,timeout=30)
        if page.status_code != 200:
            print(f"Unable to access the {url} with error code {page.status_code}")
            return ''
        # Naming the parser avoids bs4 guessing one per machine (and its warning).
        soup = BeautifulSoup(page.content,"html.parser")
        link = soup.find("a",{'data-hook':"see-all-reviews-link-foot"})
    except Exception as exc:
        # Same exception type as before, but with a message and the real cause.
        raise Exception(f"Failed to load the product page {url}: {exc}") from exc

    href = None if link is None else link.get('href')
    if not href:
        raise Exception(f"No 'see all reviews' link found on {url}")
    return 'https://www.amazon.in'+href


def _pages_for_review_count(review_count):
    """Return how many 10-review pages hold `review_count` reviews (at least 1)."""
    # Ceiling division without importing math: -(-n // 10) == ceil(n / 10).
    return max(1, -(-review_count // 10))


def getlastPage(review_url):
    """Return the exclusive upper bound for iterating over the review pages.

    The review page at `review_url` is fetched with the notebook-level
    `cookie` and `header` objects. The text of the
    ``div.a-row.a-spacing-base.a-size-base`` element is scanned for purely
    numeric tokens; when exactly two are found, the second one is taken as
    the review count and converted to a page count assuming 10 reviews per
    page.

    Parameters
    ----------
    review_url : str
        URL of the product's review listing.

    Returns
    -------
    int
        Number of pages + 1, so that ``range(1, getlastPage(url))`` visits
        every page. When the review count cannot be determined a hint is
        printed and a single page is assumed (return value 2).
    """
    # A timeout keeps a stalled connection from hanging the notebook.
    page = requests.get(review_url,cookies=cookie,headers=header,timeout=30)
    # Naming the parser avoids bs4 guessing one per machine (and its warning).
    soup = BeautifulSoup(page.content,"html.parser")

    summary = soup.find("div",{"class":"a-row a-spacing-base a-size-base"})
    numeric_tokens = []
    if summary is not None:
        summary_text = summary.text.replace('\n','').replace(',','').strip()
        numeric_tokens = [tok for tok in summary_text.split() if tok.isdigit()]

    if len(numeric_tokens) == 2:
        totalPages = _pages_for_review_count(int(numeric_tokens[1]))
    else:
        print(f'Please check if the product have any reviews {review_url}')
        # Fall back to the first page instead of failing on an unbound name.
        totalPages = 1
    return totalPages+1


In [4]:
def _review_text(rvw, tag, attrs):
    """Return the raw text of the first matching element in `rvw`, or '' if absent."""
    node = rvw.find(tag, attrs)
    return '' if node is None else node.text


def reviewData(rvw):
    """Extract the fields of one review element into a flat dict.

    Parameters
    ----------
    rvw
        Object exposing ``find(tag, attrs)`` whose results carry a ``text``
        attribute (the caller passes the ``div[data-hook=review]`` elements).

    Returns
    -------
    dict
        Keys ``profile_name``, ``review_date``, ``review_country``,
        ``review_title``, ``verified_purchase``, ``review_body``,
        ``review_helpful`` and ``user_rating``. A field whose element is
        missing, or whose text cannot be parsed, is the empty string.
        ``review_date`` and ``review_country`` are filled together or not at
        all.
    """
    # The profile name is intentionally left unstripped.
    profile_name = _review_text(rvw,"span",{"class":"a-profile-name"})

    # The date text is split on ' on ': the part after it is parsed as the
    # date, the last word before it is taken as the country.
    review_date = ''
    review_country = ''
    date_parts = _review_text(rvw,"span",{"data-hook":"review-date"}).split(' on ')
    if len(date_parts) > 1:
        try:
            parsed_date = str(parser.parse(date_parts[1]))
            parsed_country = date_parts[0].split()[-1]
        except (ValueError, OverflowError, IndexError):
            # Unparseable date or nothing before ' on ': leave both blank.
            pass
        else:
            review_date, review_country = parsed_date, parsed_country

    review_title = _review_text(rvw,"a",{"data-hook":"review-title"}).strip()
    verified_purchase = _review_text(rvw,"span",{"data-hook":"avp-badge"}).strip()
    review_body = _review_text(rvw,"span",{"data-hook":"review-body"}).strip()
    review_helpful = _review_text(rvw,"span",{"data-hook":"helpful-vote-statement"}).strip()

    # Only the first whitespace-separated token of the star-rating text is kept.
    rating_tokens = _review_text(rvw,"i",{"data-hook":"review-star-rating"}).split()
    user_rating = rating_tokens[0] if rating_tokens else ''

    return {
        'profile_name':profile_name,
        'review_date':review_date,
        'review_country':review_country,
        'review_title':review_title,
        'verified_purchase':verified_purchase,
        'review_body':review_body,
        'review_helpful':review_helpful,
        'user_rating':user_rating,
    }

In [None]:
# Scrape every review page of one product and save the result as data/<asin>.csv.
asin = 'B09G9D8KRQ'
review_url = reviewUrl(asin,cookie,header)

if len(review_url) > 0:
    totalPages = getlastPage(review_url)  # exclusive upper bound for range()
    review_columns = ['profile_name','review_date','review_country','review_title',
                      'verified_purchase','review_body','review_helpful','user_rating']
    # Collect plain dicts and build the frame once: DataFrame.append is
    # quadratic in a loop and no longer exists in pandas 2.x.
    review_records = []
    soup = None  # last successfully parsed page, used for the summary below
    for i in range(1,totalPages):
        url = f'{review_url}&pageNumber={i}'
        try:
            # A timeout keeps a stalled connection from hanging the notebook.
            page = requests.get(url,cookies=cookie,headers=header,timeout=30)
        except requests.RequestException as exc:
            # Keep the reviews gathered so far and move on to the next page.
            print(f"Unable to access the page {i}: {exc}")
            continue
        if page.status_code==200:
            soup = BeautifulSoup(page.content,"html.parser")
            review_records.extend(
                reviewData(rvw) for rvw in soup.find_all("div",{"data-hook":"review"})
            )
            time.sleep(2)  # pause between successful page fetches
        else:
            print(f"Unable to access the page {i} with error code {page.status_code}")

    review_df = pd.DataFrame(review_records,columns=review_columns)

    # Product title and overall rating come from the last page that loaded;
    # either element may be absent, so each lookup is guarded.
    if soup is not None:
        product_link = soup.find("a",{"data-hook":"product-link"})
        if product_link is not None:
            print(f'{product_link.text}\n')
        rating_text = soup.find("span",{"data-hook":"rating-out-of-text"})
        if rating_text is not None:
            print(f'Rating is : {rating_text.text}')

    os.makedirs('data',exist_ok=True)
    review_df.to_csv(f'data/{asin}.csv',index=False)