In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import csv

In [2]:
def scraping_product_list(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')
    product_list = []
    
    products = soup.find_all('div', {'data-component-type': 's-search-result'})
    
    for product in products:
        product_data = {}
        product_url = 'https://www.amazon.in' + product.find('a', {'class': 'a-link-normal s-no-outline'}).get('href')
        product_name = product.find('span', {'class': 'a-size-medium a-color-base a-text-normal'}).text.strip()
        product_price = product.find('span', {'class': 'a-offscreen'}).text.strip()
        rating = product.find('span', {'class': 'a-icon-alt'}).text.strip().split()[0]
        num_reviews = product.find('span', {'class': 'a-size-base'}).text.strip()
        
        product_data['Product URL'] = product_url
        product_data['Product Name'] = product_name
        product_data['Product Price'] = product_price
        product_data['Rating'] = rating
        product_data['Number of Reviews'] = num_reviews
        
        product_list.append(product_data)
    
    return product_list

In [3]:
def scraping_product_details(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')
    product_details = {}
    
    try:
        description = soup.find('div', {'id': 'productDescription'}).text.strip()
    except AttributeError:
        description = ''
    
    try:
        asin = soup.find('th', text='ASIN').find_next('td').text.strip()
    except AttributeError:
        asin = ''
    
    try:
        product_description = soup.find('div', {'id': 'feature-bullets'}).text.strip()
    except AttributeError:
        product_description = ''
    
    try:
        manufacturer = soup.find('a', {'id': 'bylineInfo'}).text.strip()
    except AttributeError:
        manufacturer = ''
    
    product_details['Description'] = description
    product_details['ASIN'] = asin
    product_details['Product Description'] = product_description
    product_details['Manufacturer'] = manufacturer
    
    return product_details

In [4]:
base_url = 'https://www.amazon.in/s?k=bags&crid=2M096C61O4MLT&qid=1653308124&sprefix=ba%2Caps%2C283&ref=sr_pg_'
product_list = []

In [5]:
for page in range(1, 21):
    url = base_url + str(page)
    product_list.extend(scraping_product_list(url))

In [6]:
# Scraping product details
product_details_list = []
counter = 0

In [7]:
for product in product_list[:200]:  # Limiting to 200 products
    counter += 1
    print(f"Scraping ProductURL: {product['Product URL']} ({counter}/{len(product_list[:200])})")
    product_details = scraping_product_details(product['Product URL'])
    product_details_list.append(product_details)

Scraping ProductURL: https://www.amazon.in/Skybags-Brat-Black-Casual-Backpack/dp/B08Z1HHHTD/ref=sr_1_1?crid=2M096C61O4MLT&keywords=bags&qid=1688627200&sprefix=ba%2Caps%2C283&sr=8-1 (1/32)


  asin = soup.find('th', text='ASIN').find_next('td').text.strip()


Scraping ProductURL: https://www.amazon.in/American-Tourister-AMT-SCH-02/dp/B07CJCGM1M/ref=sr_1_2?crid=2M096C61O4MLT&keywords=bags&qid=1688627200&sprefix=ba%2Caps%2C283&sr=8-2 (2/32)
Scraping ProductURL: https://www.amazon.in/Wesley-Milestone-Waterproof-Backpack-Business/dp/B084JGJ8PF/ref=sr_1_3?crid=2M096C61O4MLT&keywords=bags&qid=1688627200&sprefix=ba%2Caps%2C283&sr=8-3 (3/32)
Scraping ProductURL: https://www.amazon.in/Half-Moon-Waterproof-Backpack-Students/dp/B085MHDJ93/ref=sr_1_4?crid=2M096C61O4MLT&keywords=bags&qid=1688627200&sprefix=ba%2Caps%2C283&sr=8-4 (4/32)
Scraping ProductURL: https://www.amazon.in/Gear-Black-Laptop-Backpack-LBPASPIRE0104/dp/B075MK4TXP/ref=sr_1_5?crid=2M096C61O4MLT&keywords=bags&qid=1688627200&sprefix=ba%2Caps%2C283&sr=8-5 (5/32)
Scraping ProductURL: https://www.amazon.in/Number-Backpack-Compartment-Charging-Organizer/dp/B09VTDMRY7/ref=sr_1_6?crid=2M096C61O4MLT&keywords=bags&qid=1688627200&sprefix=ba%2Caps%2C283&sr=8-6 (6/32)
Scraping ProductURL: https://www

In [8]:
# Exporting data to CSV
filename = 'amazon_products.csv'
fieldnames = ['Product URL', 'Product Name', 'Product Price', 'Rating', 'Number of Reviews',
              'Description', 'ASIN', 'Product Description', 'Manufacturer']

In [9]:
with open(filename, 'w', newline='', encoding='utf-8') as file:
    writer = csv.DictWriter(file, fieldnames=fieldnames)
    writer.writeheader()
    
    for product, details in zip(product_list[:200], product_details_list):
        row = {**product, **details}
        writer.writerow(row)

In [10]:
print(filename)

amazon_products.csv
