In [1]:
!pip install requests beautifulsoup4



In [10]:
import csv
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import time

def scrape_product_listing_page(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
    response = requests.get(url, headers=headers)
    if response.status_code == 200:
        soup = BeautifulSoup(response.text, 'html.parser')
        products = []

        # Find all product listings on the page
        listings = soup.select('.s-result-item')
        if not listings:
            return []

        for listing in listings:
            # Get product URL
            product_link = listing.select_one('h2 a')
            if not product_link:
                continue  # Skip this product if URL is not found
            product_url = urljoin(url, product_link.get('href'))

            # Get product name
            product_name = listing.select_one('h2 a span').text.strip()

            # Get product price
            product_price = listing.select_one('.a-price .a-offscreen').text.strip()

            # Get product rating and number of reviews
            rating_element = listing.select_one('.a-icon-star span')
            rating = rating_element.text if rating_element else 'Not available'
            num_reviews_element = listing.select_one('.a-size-base')
            num_reviews = num_reviews_element.text.split()[0] if num_reviews_element else '0'

            products.append({
                'URL': product_url,
                'Name': product_name,
                'Price': product_price,
                'Rating': rating,
                'Number of Reviews': num_reviews
            })

        return products
    else:
      return []

def scrape_product_details(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
    response = requests.get(url, headers=headers)
    if response.status_code == 200:
        soup = BeautifulSoup(response.text, 'html.parser')

        # Get ASIN from the product URL
        asin = url.split('/')[-1].split('?')[0]

        # Get product description
        product_description_element = soup.select_one('#productDescription')
        product_description = product_description_element.text.strip() if product_description_element else 'Not available'

        # Get manufacturer information
        manufacturer_element = soup.select_one('#bylineInfo')
        manufacturer = manufacturer_element.text.strip() if manufacturer_element else 'Not available'

        return {
            'URL': url,
            'ASIN': asin,
            'Product Description': product_description,
            'Manufacturer': manufacturer
        }
    else:
        return None

def main():
    base_url = "https://www.amazon.in/s?k=bags&crid=2M096C61O4MLT&qid=1653308124&sprefix=ba%2Caps%2C283&ref=sr_pg_1"
    num_pages_to_scrape = 50
    product_data = []

    # Scrape product listings from multiple pages
    for page in range(1, num_pages_to_scrape + 1):
        url = base_url + str(page)
        products = scrape_product_listing_page(url)
        product_data.extend(products)

        time.sleep(1)  # Add a small delay to avoid overloading the server

    # Scrape product details for each product URL
    total_products_to_scrape = 200
    product_details_data = []

    for idx, product in enumerate(product_data[:total_products_to_scrape]):
        url = product['URL']
        product_details = scrape_product_details(url)
        if product_details:
            product_details_data.append(product_details)
        print(f"Scraped {idx + 1} out of {total_products_to_scrape} products.")
        time.sleep(1)  # Add a small delay to avoid overloading the server

    # Export the data to a CSV file
    with open('amazon_products_data.csv', 'w', newline='', encoding='utf-8') as csvfile:
        fieldnames = ['URL', 'Name', 'Price', 'Rating', 'Number of Reviews', 'ASIN', 'Product Description', 'Manufacturer']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()

        for product, details in zip(product_data, product_details_data):
            row = {**product, **details}
            writer.writerow(row)

if __name__ == '__main__':
    main()


Scraped 1 out of 200 products.
Scraped 2 out of 200 products.
Scraped 3 out of 200 products.
Scraped 4 out of 200 products.
Scraped 5 out of 200 products.
Scraped 6 out of 200 products.
Scraped 7 out of 200 products.
Scraped 8 out of 200 products.
Scraped 9 out of 200 products.
Scraped 10 out of 200 products.
Scraped 11 out of 200 products.
Scraped 12 out of 200 products.
Scraped 13 out of 200 products.
Scraped 14 out of 200 products.
Scraped 15 out of 200 products.
Scraped 16 out of 200 products.
Scraped 17 out of 200 products.
Scraped 18 out of 200 products.
Scraped 19 out of 200 products.
Scraped 20 out of 200 products.
Scraped 21 out of 200 products.
Scraped 22 out of 200 products.
Scraped 23 out of 200 products.
Scraped 24 out of 200 products.
Scraped 25 out of 200 products.
Scraped 26 out of 200 products.
Scraped 27 out of 200 products.
Scraped 28 out of 200 products.
Scraped 29 out of 200 products.
Scraped 30 out of 200 products.
Scraped 31 out of 200 products.
Scraped 32 out of

In [11]:
import pandas as pd
df=pd.read_csv('/content/amazon_products_data.csv')
df.shape

(68, 8)

In [12]:
df.head()

Unnamed: 0,URL,Name,Price,Rating,Number of Reviews,ASIN,Product Description,Manufacturer
0,https://www.amazon.in/Skybags-Brat-Black-Casua...,Wesley Milestone 2.0 Casual Waterproof Laptop ...,₹598,Not available,11801,ref=sr_1_3,Not available,Not available
1,https://www.amazon.in/MosQuick%C2%AE-Drawstrin...,American Tourister Fizz Large Size 32 Ltrs Cas...,"₹1,199",Not available,54957,ref=sxin_14_trfobq2av2_0_B09SB5DP5Y,Not available,Not available
2,https://www.amazon.in/Backpack-Toddler-Animal-...,Skybags Brat Black 46 Cms Casual Backpack,₹669,Not available,5158,ref=sxin_14_trfobq2av2_4_B08D5RCWV9,Not available,Not available
3,https://www.amazon.in/Blue-Tree-Velvet-Nursery...,MosQuick® Clear large Drawstring Bags Clear Pl...,₹310,Not available,Amazon’s,ref=sxin_14_trfobq2av2_7_B07QX58HGR,Not available,Not available
4,https://www.amazon.in/American-Tourister-BACKP...,MosQuick® Clear large Drawstring Bags Clear Pl...,₹310,Not available,69,ref=sr_1_5,Not available,Not available
