<a href="https://colab.research.google.com/github/caprolaliac/MLPrac/blob/main/data_scraping/datascraping_amazon.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [27]:
# Import packages
import requests
import pandas as pd
from bs4 import BeautifulSoup
from datetime import datetime

# Request headers sent with every page fetch so the requests resemble those of
# a desktop Chrome browser.
# NOTE(review): 'authority' and 'method' look like HTTP/2 pseudo-headers copied
# from browser dev tools and are sent here as ordinary headers - confirm they
# are actually needed.
headers = {
    'authority': 'www.amazon.in',
    'method': 'GET',
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'accept-language': 'en-US,en;q=0.9,bn;q=0.8',
    'sec-ch-ua': '" Not A;Brand";v="99", "Chromium";v="102", "Google Chrome";v="102"',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36'
}

# URL of the Amazon review page; whitespace picked up while pasting is dropped
# so it does not end up inside the request URL.
url = input("Enter amazon url: ").strip()
reviews_url = url

# Number of review pages to fetch. Reject zero/negative counts here, where the
# message can name the real problem, instead of failing later on an empty table.
pages = int(input("Enter number of pages: "))
if pages < 1:
    raise ValueError("Number of pages must be at least 1, got {}".format(pages))
len_page = pages

### <font color="red">Functions</font>

# Extract data as HTML objects from the Amazon review pages
def reviewsHtml(url, len_page, timeout=30):
    """Download review pages and return each one as a parsed HTML document.

    Args:
        url: Address of the Amazon review page to request.
        len_page: Number of pages to fetch; pages 1..len_page are requested.
        timeout: Seconds to wait for each response before giving up.

    Returns:
        A list of BeautifulSoup documents, one per requested page, in page order.

    Raises:
        requests.exceptions.RequestException: If a request cannot be completed
            (connection failure, timeout, ...).

    Note:
        The HTTP status code is not checked; whatever body the server returns
        is parsed like any other page.
    """
    # Parsed HTML of every fetched page
    soups = []

    # Request pages 1..len_page one after another
    for page_no in range(1, len_page + 1):

        # Query parameters selecting which review page (and filter) to return
        params = {
            'ie': 'UTF8',
            'reviewerType': 'all_reviews',
            'filterByStar': 'critical',
            'pageNumber': page_no,
        }

        # The params must be sent with the request; without them every
        # iteration downloads the very same page.
        response = requests.get(url, headers=headers, params=params, timeout=timeout)

        # Parse the page with BeautifulSoup4 (lxml parser) and keep it
        soups.append(BeautifulSoup(response.text, 'lxml'))

    return soups

# Grab Reviews name, description, date, stars, title from HTML
def getReviews(html_data):
    """Collect the reviews contained in one parsed review page.

    Args:
        html_data: Parsed page exposing ``select``; every element matching
            ``div[data-hook="review"]`` is treated as one review box.

    Returns:
        A list of dicts, one per review box in document order, with the keys
        'Name', 'Stars', 'Title', 'Date' and 'Description'. A field whose
        element is missing from the box is set to 'N/A'; 'Date' is also 'N/A'
        when its text cannot be parsed.
    """
    data_dicts = []

    for box in html_data.select('div[data-hook="review"]'):

        # Keep only the part before " out", e.g. "5.0 out of 5 stars" -> "5.0"
        stars = _review_text(box, '[data-hook="review-star-rating"]', default=None)
        stars = 'N/A' if stars is None else stars.split(' out')[0]

        date_text = _review_text(box, '[data-hook="review-date"]', default=None)
        date = 'N/A' if date_text is None else _review_date(date_text)

        # NOTE(review): in the sample output the title text also carries the
        # rating line ("5.0 out of 5 stars\nNice"). It is kept unchanged
        # because a later cell parses the rating back out of 'Title'.
        data_dicts.append({
            'Name' : _review_text(box, '[class="a-profile-name"]'),
            'Stars' : stars,
            'Title' : _review_text(box, '[data-hook="review-title"]'),
            'Date' : date,
            'Description' : _review_text(box, '[data-hook="review-body"]')
        })

    return data_dicts


def _review_text(box, selector, default='N/A'):
    """Return the stripped text of the first match for selector, or default if there is none."""
    node = box.select_one(selector)
    if node is None:
        return default
    return node.text.strip()


def _review_date(date_text):
    """Convert review-date text such as '... on 15 June 2024' to 'dd/mm/yyyy', or 'N/A' if unparsable."""
    # The date is whatever follows the last " on " in the text
    datetime_str = date_text.split(' on ')[-1]
    try:
        return datetime.strptime(datetime_str, '%d %B %Y').strftime("%d/%m/%Y")
    except ValueError:
        return 'N/A'

### <font color="red">Data Process</font>

# Download and parse every requested review page
html_datas = reviewsHtml(reviews_url, len_page)

# Flat list holding the review dicts of all pages
reviews = []

# Extract the reviews page by page
for html_data in html_datas:
    reviews.extend(getReviews(html_data))

# Build the reviews table. The columns are named explicitly so that an empty
# scrape (e.g. a blocked request) still yields a frame with the expected
# columns instead of raising KeyError in drop_duplicates().
df_reviews = (
    pd.DataFrame(reviews, columns=['Name', 'Stars', 'Title', 'Date', 'Description'])
    # The same review can show up on more than one fetched page
    .drop_duplicates(subset=['Title', 'Description'])
    # 'dd/mm/yyyy' strings -> datetimes; unparsable values such as 'N/A' become NaT
    .assign(Date=lambda frame: pd.to_datetime(frame['Date'], format='%d/%m/%Y', errors='coerce'))
    # Newest reviews first
    .sort_values(by='Date', ascending=False)
    .reset_index(drop=True)
)
print(df_reviews)

Enter amazon url: https://www.amazon.in/Acer-Predator-Processor-Windows-PHN16-71/dp/B0C3HTXBWP/ref=sr_1_6?crid=332IOUA6ILDQ1&dib=eyJ2IjoiMSJ9.XefUpe4XtqunFlz3PSThWCP4_jwgUzH0-nIDTZoj6KNogQZOSgZ_sGq8a8IHzqg48wcfZxaP_pWLgdj5x5uwfs03J5iQW1SeAp3PpjkBtswFOdO9pF7I1j1eOBYZzuNTDZgwM-KI51JcZwUCv_GXvGIt7I1Kskp63_3ffX14gP3712BiHu-hxtNgkC0vhi0LtHLaY-zM1T3WM8n1631_Vq_9zzUDg5FL0b70-7BQdp4.IdOOnK0zCGF0RzT7cJN4MgibweEj8QeeiEBdMDLhuUg&dib_tag=se&keywords=acer+predator+helios+neo+16&qid=1720597228&sprefix=%2Caps%2C296&sr=8-6#customerReviews
Enter number of pages: 2
              Name Stars                                              Title  \
0            Khyat   1.0               1.0 out of 5 stars\nPower IC failure   
1    Nishant Saini   5.0                           5.0 out of 5 stars\nNice   
2  Sushanta Sarkar   5.0                    5.0 out of 5 stars\nGood laptop   
3         Chandril   5.0  5.0 out of 5 stars\nMassive steal of a powerho...   
4            Sunny   5.0  5.0 out of 5 stars\nVERY 

In [28]:
# Save the reviews table to 'reviews.csv' (relative path), leaving out the row index.
df_reviews.to_csv('reviews.csv', index=False)

In [30]:
# Show the reviews table as this cell's output.
df_reviews

Unnamed: 0,Name,Stars,Title,Date,Description
0,Khyat,1.0,1.0 out of 5 stars\nPower IC failure,2024-06-15,Just bought this laptop 6 months back and didn...
1,Nishant Saini,5.0,5.0 out of 5 stars\nNice,2024-05-25,The media could not be loaded.\n ...
2,Sushanta Sarkar,5.0,5.0 out of 5 stars\nGood laptop,2024-03-03,Product is good but wrong info about processor...
3,Chandril,5.0,5.0 out of 5 stars\nMassive steal of a powerho...,2023-10-10,The media could not be loaded.\n ...
4,Sunny,5.0,5.0 out of 5 stars\nVERY GOOD PERFORMANCE. BAT...,2023-07-26,The lighting in the keyboard is a very good go...
5,Amazon Customer,4.0,4.0 out of 5 stars\nPay for Upgrade,2023-07-09,"Pros:Fantastic processor, although graphic car..."
6,Alok Rawat,5.0,5.0 out of 5 stars\nBudget Beast,2023-05-15,I upgraded from a ryzen 3 2200u laptop from 20...
7,Placeholder,5.0,5.0 out of 5 stars\nHave budget ?? Go for it !!,2023-05-11,Review of Acer Predator Helios Neo 16 i7 13Gen...


In [33]:
import re

def extract_star_reviews(df):
    """Extract the star rating embedded in each review title.

    Args:
        df: Table with a 'Title' column whose entries look like
            "5.0 out of 5 stars\\nNice".

    Returns:
        A list of rating strings (e.g. '5.0'), one for every title that
        contains a decimal number, in row order. Titles without such a number
        and non-string titles (e.g. NaN) are skipped, so the list can be
        shorter than ``df`` and is not aligned with its rows.
    """
    # First decimal number in the title, e.g. "5.0"
    rating_pattern = re.compile(r'\d+\.\d+')

    star_reviews = []
    for title in df['Title']:
        # Missing titles may be NaN rather than text; re.search would raise on them
        if not isinstance(title, str):
            continue
        match = rating_pattern.search(title)
        if match:
            star_reviews.append(match.group(0))
    return star_reviews

In [34]:
# Pull the rating strings out of the review titles and print the resulting list.
reviewsss=extract_star_reviews(df_reviews)
print(reviewsss)

['1.0', '5.0', '5.0', '5.0', '5.0', '4.0', '5.0', '5.0']
