In [51]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
from google.colab import files

In [52]:
# Browser-like headers sent with every request. The User-Agent is assembled
# from adjacent string literals so the header value contains single spaces
# only; a backslash continuation inside one literal would embed the source
# indentation of the following lines in the header.
HEADERS = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/90.0.4430.212 Safari/537.36'
    )
}

# Cookies sent with every request; empty by default.
COOKIES = {}

In [53]:
def get_data(url: str, timeout: float = 30):
    """Issue a GET request for ``url`` using the shared HEADERS and COOKIES.

    Args:
        url: Address to request.
        timeout: Seconds to wait for the server before giving up. Passed
            straight to ``requests.get``; without it a stalled connection
            would block the notebook indefinitely.

    Returns:
        The ``requests`` response object. The HTTP status is not checked
        here, so callers receive error pages as well as successful ones.

    Raises:
        requests.exceptions.Timeout: If the server does not answer within
            ``timeout`` seconds.
    """
    return requests.get(url, cookies=COOKIES, headers=HEADERS, timeout=timeout)

In [54]:
def asin_number(soup):
    """Collect the ``data-asin`` attribute of every search-result ``div``.

    Args:
        soup: Parsed page exposing ``find_all``.

    Returns:
        list: The ``data-asin`` value of each ``div`` whose
        ``data-component-type`` is ``s-search-result``, in the order
        ``find_all`` yields them.
    """
    result_divs = soup.find_all("div", {"data-component-type": "s-search-result"})
    return [result_div['data-asin'] for result_div in result_divs]

In [55]:
def fetch_href(soup):
    """Return the ``href`` of the first "see all reviews" footer link.

    Args:
        soup: Parsed product page exposing ``find``.

    Returns:
        The ``href`` attribute of the first ``<a>`` element whose
        ``data-hook`` is ``see-all-reviews-link-foot``.

    Raises:
        IndexError: If the page contains no such link. The message names the
            missing element so the failure is diagnosable from the traceback.
    """
    link = soup.find("a", {'data-hook': "see-all-reviews-link-foot"})
    if link is None:
        raise IndexError(
            "no <a data-hook='see-all-reviews-link-foot'> link found on the page"
        )
    return link['href']

In [56]:
def customer_review(soup):
    """Extract one text entry per review element on a reviews page.

    Each element whose ``data-hook`` is ``review`` contributes exactly one
    string, so the result keeps one slot per review regardless of whether the
    review body contains line breaks.

    Args:
        soup: Parsed reviews page exposing ``find_all``.

    Returns:
        list[str]: For every review element, the body text with surrounding
        whitespace stripped. When the element has no body span, or the body
        is blank, the entry is a single space: a non-empty placeholder, so
        callers that drop empty strings still keep a slot for that review.
        An empty list when the page has no review elements.
    """
    review_tags = soup.find_all(
        lambda tag: "data-hook" in tag.attrs and tag["data-hook"] == "review"
    )
    reviews = []
    for review_tag in review_tags:
        body_spans = review_tag.find_all(
            "span", attrs={'class': 'a-size-base review-text review-text-content'}
        )
        # Only the first matching span is read.
        text = body_spans[0].get_text().strip() if body_spans else ""
        reviews.append(text or " ")
    return reviews

In [57]:
def customer_rating(soup):
    """Read the star rating of every rated review on a reviews page.

    Args:
        soup: Parsed reviews page exposing ``findAll``.

    Returns:
        list[int]: One integer per ``review-star-rating`` element that
        contains an ``a-icon-alt`` span. The integer is the number in front
        of the word "out" in that span's text, truncated toward zero.
        Elements without such a span contribute nothing.
    """
    def _is_star_rating(tag):
        return "data-hook" in tag.attrs and tag["data-hook"] == "review-star-rating"

    ratings = []
    for star_tag in soup.findAll(_is_star_rating):
        alt_spans = star_tag.findAll("span", attrs={'class': 'a-icon-alt'})
        if not alt_spans:
            continue
        # Text looks like "<number> out of ..."; keep only the leading number.
        leading_number = alt_spans[0].get_text().split("out")[0].strip(" ")
        ratings.append(int(float(leading_number)))
    return ratings

In [58]:
def extract_r_and_r(data_asin, max_pages=None):
    """Scrape the reviews and ratings of one product into a DataFrame.

    The product page at ``https://www.amazon.in/dp/<data_asin>`` is fetched
    to find the link to the reviews listing. Listing pages are then requested
    with an increasing ``pageNumber`` until a page yields no reviews or
    ``max_pages`` pages have been requested.

    Args:
        data_asin: Product identifier inserted into the product URL and
            stored in the ``ASIN`` column of the result.
        max_pages: Optional upper bound on the number of review pages to
            request. ``None`` (the default) keeps paging until a page
            without reviews is returned.

    Returns:
        pandas.DataFrame: Columns ``reviews``, ``ratings`` and ``ASIN``, one
        row per review.

    Raises:
        ValueError: If the number of ratings collected differs from the
            number of reviews, so the two columns cannot be paired.
        IndexError: Propagated from ``fetch_href`` when the product page has
            no reviews link.
    """
    all_reviews = []
    rating_data = []

    url = f"https://www.amazon.in/dp/{data_asin}"

    response = get_data(url)
    # NOTE(review): no parser is named here or below, so BeautifulSoup picks
    # one from whatever is installed; results may vary between environments.
    soup = BeautifulSoup(response.content)
    link = fetch_href(soup)

    print(f"Fetching reviews from the product: {data_asin}")
    page_number = 0
    while max_pages is None or page_number < max_pages:
        page_number += 1
        url = f"https://www.amazon.in{link}&pageNumber={page_number}"
        response = get_data(url)
        soup = BeautifulSoup(response.text)
        review_data = [review for review in customer_review(soup) if len(review) > 0]
        if not review_data:
            # A page without reviews marks the end of the listing; its
            # ratings (if any) are not collected so the columns stay paired.
            break
        all_reviews += review_data
        rating_data.extend(customer_rating(soup))

    if len(all_reviews) != len(rating_data):
        raise ValueError(
            "{}: collected {} reviews but {} ratings; cannot pair them".format(
                data_asin, len(all_reviews), len(rating_data)
            )
        )

    reviews_df = pd.DataFrame({'reviews': all_reviews,
                               'ratings': rating_data})
    reviews_df["ASIN"] = data_asin
    print("{} has {} reviews".format(data_asin, reviews_df.shape[0]))
    return reviews_df

In [59]:
# Products to scrape; each entry is passed to extract_r_and_r as its ASIN.
data_asins = [
    'B09QRTXCWD',
    'B09QSBPKTF',
    'B09QRL1M8Y',
    'B09QRQQL51',
]
# Name of the Excel workbook the combined results are written to.
r_and_r_file_name = "Reviews_and_Ratings.xlsx"

In [60]:
# Scrape each product in turn, then stack the per-product frames into one
# table with a fresh consecutive index.
per_product_frames = [extract_r_and_r(asin) for asin in data_asins]
reviews_df = pd.concat(per_product_frames, axis=0, ignore_index=True)

Fetching reviews from the product: B09QRTXCWD
B09QRTXCWD has 54 reviews
Fetching reviews from the product: B09QSBPKTF
B09QSBPKTF has 43 reviews
Fetching reviews from the product: B09QRL1M8Y
B09QRL1M8Y has 5 reviews
Fetching reviews from the product: B09QRQQL51
B09QRQQL51 has 6 reviews


In [61]:
# Save the combined reviews to the Excel file named by r_and_r_file_name;
# index=False leaves the DataFrame index out of the sheet.
reviews_df.to_excel(r_and_r_file_name, index=False)

In [62]:
# Hand the saved workbook to google.colab's files.download so the browser
# downloads it from the Colab runtime.
files.download(r_and_r_file_name) 

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>