In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
import numpy as np

In [3]:
def make_soup(url, timeout=10):
    """Download ``url`` and parse the response body with BeautifulSoup.

    The request is sent with the module-level ``HEADERS``.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely on a host that
            never answers; a timeout surfaces as a ``RequestException`` and is
            handled like any other request failure below.

    Returns:
        A ``BeautifulSoup`` object built from the response content with the
        ``html.parser`` backend, or ``None`` when the request raises a
        ``requests.exceptions.RequestException`` (the error is printed).
        The HTTP status code is not inspected: whatever body the server
        returned is parsed.
    """
    try:
        # Send a GET request to the URL, bounded by the timeout
        webpage = requests.get(url, headers=HEADERS, timeout=timeout)

        # Parse the content of the response and return the resulting object
        return BeautifulSoup(webpage.content, "html.parser")
    except requests.exceptions.RequestException as e:
        # In case of an error during the request, print the error and return None
        print(e)
        return None

def get_title(soup):
    """Return the text of the ``<span id="productTitle">`` element in ``soup``.

    Args:
        soup: Parsed page exposing ``find``, or ``None`` (which is what
            ``make_soup`` returns when the download failed).

    Returns:
        The element's text with leading/trailing whitespace removed, or ``""``
        when ``soup`` is ``None`` or the element is not present.
    """
    # A failed download yields None; report "no title" instead of raising
    # AttributeError, matching how the other extractors treat a missing page
    if soup is None:
        return ""

    title = soup.find("span", attrs={"id": 'productTitle'})
    return title.text.strip() if title is not None else ""

def get_detaildescription(soup):
    """Collect the list-item texts of the product's detail list into one string.

    The first ``<ul>`` whose class attribute is
    ``a-unordered-list a-vertical a-spacing-mini`` is located, and the stripped
    text of each ``<span class="a-list-item">`` inside it is gathered.

    Args:
        soup: Parsed page exposing ``find``.

    Returns:
        The collected texts joined with ``", "``, or ``""`` when an
        ``AttributeError`` occurs while parsing (for example because the list
        is not on the page).
    """
    bullet_texts = []
    try:
        bullet_list = soup.find("ul", attrs={'class': 'a-unordered-list a-vertical a-spacing-mini'})
        # bullet_list is None when the <ul> is absent; the attribute access
        # below then raises AttributeError, which is handled underneath
        for bullet in bullet_list.find_all('span', class_='a-list-item'):
            bullet_texts.append(bullet.get_text(strip=True))
    except AttributeError:
        # Nothing usable on the page
        return ""

    return ', '.join(bullet_texts)

def get_rating(soup):
    """Return the first decimal number found in the product's rating text.

    Two lookups are tried in order and the first one that yields a string wins:
    an ``<i>`` element whose class attribute is
    ``a-icon a-icon-star a-star-4-5``, then a ``<span class="a-icon-alt">``.

    Args:
        soup: Parsed page exposing ``find`` (``None`` is tolerated).

    Returns:
        The first ``digits.digits`` substring of the rating text, or ``""``
        when neither element provides a string, the string is empty, or it
        contains no decimal number.
    """
    # NOTE(review): the first class name looks specific to a single star value;
    # other pages appear to rely on the second lookup — confirm.
    selectors = (
        ("i", {'class': 'a-icon a-icon-star a-star-4-5'}),
        ("span", {'class': 'a-icon-alt'}),
    )

    rating = None
    for tag_name, attrs in selectors:
        try:
            # find() returns None for a missing element and .string is None for
            # an element without a single string child; both raise
            # AttributeError here and move on to the next lookup
            rating = soup.find(tag_name, attrs=attrs).string.strip()
            break
        except AttributeError:
            continue

    if not rating:
        return ""

    # re.search returns None when the text holds no decimal number; calling
    # .group on that would raise, so report "no rating" instead
    match = re.search(r'\d+\.\d+', rating)
    return match.group(0) if match else ""

def get_review_count(soup):
    """Return the text of the ``<span id="acrCustomerReviewText">`` element.

    Args:
        soup: Parsed page exposing ``find``, or ``None`` (which is what
            ``make_soup`` returns when the download failed).

    Returns:
        The element's text with surrounding whitespace removed, or ``""`` when
        ``soup`` is ``None`` or the element is not present.
    """
    # A failed download yields None; report "no count" instead of raising
    if soup is None:
        return ""

    review_count = soup.find("span", attrs={'id': 'acrCustomerReviewText'})
    if review_count is None:
        return ""

    # get_text() rather than .string: .string is None whenever the element has
    # more than one child, which would make .string.strip() raise
    return review_count.get_text(strip=True)

In [21]:
# Headers sent with every GET request: a User-Agent plus the preferred response languages
# NOTE(review): the User-Agent value is a single space — presumably a placeholder
# that should be replaced with a real browser string; confirm.
HEADERS = {
    'User-Agent': ' ',
    'Accept-Language': 'en-US, en;q=0.5',
}
# Root of the website being scraped; the hrefs collected from a page are appended to it
BASE_URL = "https://www.amazon.in"

def main():
    """Scrape the pages linked from the page at ``URL`` into ``web_data.csv``.

    Steps:
        1. Download the page at ``URL`` and collect the ``href`` of every
           anchor whose class attribute is ``a-link-normal s-no-outline``.
        2. Download each linked page (``BASE_URL`` + href) once and extract
           title, detail description, rating and review count from it.
        3. Drop rows whose title is empty and write the remaining rows to
           ``web_data.csv`` (with header, without index).

    Returns:
        None. Any exception raised along the way is caught and printed.
    """
    try:
        # The specific URL to be scraped
        # NOTE(review): empty here — presumably meant to be filled in before
        # running; with an empty URL the request fails and main() returns early.
        URL = ""

        # Generate the soup object for the URL
        soup = make_soup(URL)

        # make_soup returns None when the request failed; nothing to scrape then
        if soup is None:
            return

        # Find all anchor tags with the specified class attribute
        links = soup.find_all("a", attrs={'class': 'a-link-normal s-no-outline'})

        # Anchors without an href yield None, and BASE_URL + None would raise
        # TypeError, so those anchors are skipped
        hrefs = [link.get('href') for link in links]
        product_urls = [BASE_URL + href for href in hrefs if href]

        product_data = []
        for product_url in product_urls:
            # One request per product: the parsed page is shared by all four
            # extractors instead of being downloaded again for every field
            product_soup = make_soup(product_url)
            if product_soup is None:
                # Download failed (make_soup already printed why); skip this
                # product rather than aborting the whole run
                continue

            product_data.append({
                # The title of the product
                "title": get_title(product_soup),
                # The detailed description of the product
                "detail_description": get_detaildescription(product_soup),
                # The rating of the product
                "rating": get_rating(product_soup),
                # The number of reviews for the product
                "reviews": get_review_count(product_soup),
                # The URL of the product
                "URL": product_url,
            })

        # Explicit columns keep the frame well-formed (and the 'title' lookup
        # below valid) even when no product page could be scraped
        df = pd.DataFrame(
            product_data,
            columns=["title", "detail_description", "rating", "reviews", "URL"],
        )
        # Keep only rows with a non-empty title. A boolean mask is used instead
        # of df['title'].replace(..., inplace=True): that chained in-place call
        # operates on a temporary under pandas copy-on-write and would leave
        # the frame unchanged.
        df = df[df['title'] != '']
        # Write the DataFrame to a CSV file
        df.to_csv("web_data.csv", header=True, index=False)

    # If an exception is thrown
    except Exception as e:
        # Print the error message
        print(f"An error occurred: {e}")

# Entry point: call main() only when this module's __name__ is '__main__',
# i.e. the code is being executed directly rather than imported
if __name__ == '__main__':
    # Run the main function
    main()