In [40]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
from datetime import datetime

In [41]:
headers = {
    'authority': 'www.amazon.com',
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'accept-language': 'en-US,en;q=0.9,bn;q=0.8',
    'sec-ch-ua': '" Not A;Brand";v="99", "Chromium";v="102", "Google Chrome";v="102"',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36'
}

In [42]:
def generate_amazon_review_urls(asin, num_pages=10):
    base_url = f'https://www.amazon.com/product-reviews/{asin}'
    urls = [base_url]
    
    for page in range(2, num_pages + 1):
        url = f'{base_url}/ref=cm_cr_arp_d_paging_btm_next_{page}?pageNumber={page}'
        urls.append(url)
    
    return urls

In [43]:
def reviewsHtml(links):
    
    soups = []
    
    for link in links:
        
        response = requests.get(link, headers=headers)
        
        
        soup = BeautifulSoup(response.text, 'lxml')
        
        soups.append(soup)
        
    return soups

In [44]:
def getReviews(html_data):

    data_dicts = []
    
    boxes = html_data.select('div[data-hook="review"]')
    
    for box in boxes:
        
        try:
            name = box.select_one('[class="a-profile-name"]').text.strip()
        except Exception as e:
            name = 'N/A'

        try:
            stars = box.select_one('[data-hook="review-star-rating"]').text.strip().split(' out')[0]
        except Exception as e:
            stars = 'N/A'   

        try:
            title = box.select_one('[data-hook="review-title"]').text.strip()
        except Exception as e:
            title = 'N/A'

        try:
            datetime_str = box.select_one('[data-hook="review-date"]').text.strip().split(' on ')[-1]
            date = datetime.strptime(datetime_str, '%B %d, %Y').strftime("%d/%m/%Y")
        except Exception as e:
            date = 'N/A'

        try:
            description = box.select_one('[data-hook="review-body"]').text.strip()
        except Exception as e:
            description = 'N/A'

        data_dict = {
            'Name' : name,
            'Stars' : stars,
            'Title' : title,
            'Date' : date,
            'Description' : description
        }

        data_dicts.append(data_dict)
    
    return data_dicts

In [45]:
asin = 'B0CX3NVXV9'
links = generate_amazon_review_urls(asin=asin,num_pages=10)

In [46]:
html_datas = reviewsHtml(links)

In [47]:
# Empty List to Hold all reviews data
reviews = []

In [48]:
# Iterate all Html page 
for html_data in html_datas:
    
    # Grab review data
    review = getReviews(html_data)
    
    # add review data in reviews empty list
    reviews += review

In [49]:
# Create a dataframe with reviews Data
df_reviews = pd.DataFrame(reviews)

In [50]:
df_reviews

Unnamed: 0,Name,Stars,Title,Date,Description
0,buyer,5.0,5.0 out of 5 stars\nTotally capable and worth ...,06/08/2024,I recently bought this machine for two purpose...
1,Michelle,5.0,"5.0 out of 5 stars\nGreat laptop, too small fo...",14/06/2024,"Objectively speaking, the M3 16 gb ram w/ 512 ..."
2,Joe,5.0,5.0 out of 5 stars\nPerfection in every way,28/07/2024,"First off, the size for the 13 inch air Mac is..."
3,Nicholas Baab,5.0,5.0 out of 5 stars\n2024 M3 Air,05/08/2024,As a long term apple user and this being my 3r...
4,Adam M.,5.0,5.0 out of 5 stars\nFirst ever MacBook for School,07/07/2024,My first ever MacBook. Have always been a wind...
...,...,...,...,...,...
95,karen short,5.0,5.0 out of 5 stars\nBuy this!,25/05/2024,Went to Best Buy to see what they had in stock...
96,sharon e.,5.0,5.0 out of 5 stars\nNice!,17/06/2024,Very pleased with this computer.
97,Marisa Gaete,5.0,5.0 out of 5 stars\nAmazing Laptop,11/06/2024,Exactly what I wanted. Great sale on the item ...
98,Shaneo,5.0,5.0 out of 5 stars\nIpad user back to a laptop,05/06/2024,Love the Apple ecosystem. going from device to...


In [51]:
df_reviews.nunique()

Name            98
Stars            3
Title           97
Date            53
Description    100
dtype: int64

In [53]:
df_reviews.to_csv('reviews.csv', index=False)