In [4]:
%pip install selenium pandas openpyxl


Note: you may need to restart the kernel to use updated packages.


In [2]:
import time
import random
import pandas as pd
import os
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import (
    TimeoutException,
    NoSuchElementException,
    WebDriverException
)

# Setup for the Chrome webdriver (visible window by default, headless on request)
def setup_driver(chromedriver_path=None, headless=False):
    """Create and return a Chrome WebDriver configured for scraping.

    Args:
        chromedriver_path: Optional path to a chromedriver executable. When
            omitted, the ``CHROMEDRIVER_PATH`` environment variable is tried,
            then the legacy hard-coded location (only if that file exists),
            and finally Selenium's own driver discovery is used.
        headless: When True, start Chrome without a visible window. Defaults
            to False, which matches the previous behaviour (no headless flag
            was ever passed, despite the old comment).

    Returns:
        A ``selenium.webdriver.Chrome`` instance. The caller is responsible
        for calling ``quit()`` on it.
    """
    options = Options()
    if headless:
        # "new" headless mode is the one shipped with current Chrome builds.
        options.add_argument('--headless=new')
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')
    options.add_argument('--window-size=1920,1080')
    options.add_argument('--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36')

    # Kept only as a fallback so the original machine keeps working unchanged.
    legacy_path = 'C:\\Users\\yujit\\OneDrive\\Desktop\\chromedriver-win64\\chromedriver.exe'

    driver_path = chromedriver_path or os.environ.get('CHROMEDRIVER_PATH')
    if not driver_path and os.path.isfile(legacy_path):
        driver_path = legacy_path

    # With no explicit path, Service() lets Selenium locate a matching driver
    # itself instead of failing on a path that does not exist on this machine.
    service = Service(driver_path) if driver_path else Service()
    driver = webdriver.Chrome(service=service, options=options)

    return driver

# Random waiting function to mimic human browsing behavior
def random_wait(min_seconds=2, max_seconds=5):
    """Sleep for a random duration and report it on stdout.

    Args:
        min_seconds: Lower bound of the pause, in seconds (default 2).
        max_seconds: Upper bound of the pause, in seconds (default 5).

    Raises:
        ValueError: If ``min_seconds`` is negative or greater than
            ``max_seconds``.
    """
    # Validate up front so a bad call fails with a clear message instead of
    # reaching time.sleep() with a negative value.
    if min_seconds < 0 or max_seconds < min_seconds:
        raise ValueError(
            f"Invalid wait bounds: min_seconds={min_seconds}, max_seconds={max_seconds}"
        )

    wait_time = random.uniform(min_seconds, max_seconds)
    print(f"Waiting for {wait_time:.2f} seconds...")
    time.sleep(wait_time)

def _extract_field(product, xpath, label, position, transform=None):
    """Read one text field from a search-result element, logging the outcome.

    Args:
        product: The WebElement of a single search result.
        xpath: XPath, relative to ``product``, of the element holding the text.
        label: Human-readable field name used in the log line.
        position: 1-based index of the product on the page (for logging).
        transform: Optional callable applied to the stripped text.

    Returns:
        The (optionally transformed) stripped text, or ``"N/A"`` when no
        element matches ``xpath``.
    """
    try:
        value = product.find_element(By.XPATH, xpath).text.strip()
    except NoSuchElementException:
        print(f"Product {position} {label} not found.")
        return "N/A"

    if transform is not None:
        value = transform(value)
    print(f"Product {position} {label}: {value}")
    return value


def scrape_amazon_page(driver, page_number, query="mobiles"):
    """Load one amazon.in search-results page and extract its products.

    Args:
        driver: A Selenium WebDriver used to load and query the page.
        page_number: The results page to request (sent as the ``page`` URL
            parameter).
        query: Search term sent as the ``k`` URL parameter. Defaults to
            ``"mobiles"``, the term that was previously hard-coded.

    Returns:
        A list of dicts with the keys ``"Product Name"``, ``"Price"`` and
        ``"Delivery Date"``; a field that cannot be located is ``"N/A"``.
        Returns an empty list when the page times out or has no results.
    """
    # Function-scope import keeps this block self-contained.
    from urllib.parse import quote_plus

    url = f"https://www.amazon.in/s?k={quote_plus(query)}&page={page_number}"
    print(f"Opening URL: {url}")
    driver.get(url)

    try:
        print(f"Waiting for page {page_number} to load...")
        WebDriverWait(driver, 30).until(
            EC.presence_of_element_located((By.XPATH, "//div[contains(@class, 's-main-slot')]"))
        )

        print(f"Page {page_number} source loaded. Checking for products...")

        product_elements = driver.find_elements(By.XPATH, "//div[@data-component-type='s-search-result']")
        total = len(product_elements)
        print(f"Found {total} products on page {page_number}.")

        if not product_elements:
            print(f"No products found on page {page_number}. Ending pagination.")
            return []

        data = []
        for position, product in enumerate(product_elements, start=1):
            print(f"Extracting data for product {position} of {total}...")

            try:
                name = _extract_field(
                    product,
                    ".//h2[contains(@class, 'a-size-medium')]/span",
                    "name",
                    position,
                )
                # Thousands separators are removed so the price is plain digits.
                price = _extract_field(
                    product,
                    ".//span[@class='a-price-whole']",
                    "price",
                    position,
                    transform=lambda text: text.replace(',', ''),
                )
                delivery_date = _extract_field(
                    product,
                    ".//span[contains(@class, 'a-color-base a-text-bold')]",
                    "delivery date",
                    position,
                )

                data.append({
                    "Product Name": name,
                    "Price": price,
                    "Delivery Date": delivery_date
                })

            except Exception as e:
                # Best effort: one broken result must not abort the whole page.
                print(f"Error extracting product details for product {position}: {e}")
                continue

        random_wait()  # Mimic human behavior
        return data

    except TimeoutException:
        print(f"Timeout error on page {page_number}. Skipping page.")
        return []

# Write one page of scraped results to its own Excel workbook
def save_to_excel(data, page_number):
    """Persist the records of a single results page as an .xlsx file.

    Args:
        data: Records for the page (a list of dicts). A falsy value means
            nothing was scraped, in which case no file is written.
        page_number: Page the records came from; used in the file name
            ``amazon_page_<page_number>.xlsx`` (a relative path) and in the
            log messages.
    """
    if data:
        filename = f"amazon_page_{page_number}.xlsx"
        pd.DataFrame(data).to_excel(filename, index=False)
        print(f"Data for page {page_number} saved to {filename}")
    else:
        print(f"No data found for page {page_number}, skipping file creation.")

def scrape_multiple_pages(max_pages=None):
    """Scrape consecutive search-result pages, saving each one to Excel.

    Pages are processed from 1 upward until a page yields no data, no
    clickable "Next" pagination link is found, or ``max_pages`` is reached.

    Args:
        max_pages: Optional upper bound on the number of pages to process.
            ``None`` (the default) keeps the previous unbounded behaviour.
    """
    driver = setup_driver()
    try:
        page_number = 1

        while max_pages is None or page_number <= max_pages:
            print(f"Processing Page {page_number}")

            page_data = scrape_amazon_page(driver, page_number)

            if not page_data:
                break  # Stop if no products are found

            save_to_excel(page_data, page_number)

            try:
                # The "Next" link is only used to detect that another page
                # exists. It is deliberately not clicked: scrape_amazon_page()
                # navigates by URL, so clicking first made the browser load
                # every page twice.
                WebDriverWait(driver, 10).until(
                    EC.element_to_be_clickable((By.XPATH, '//a[contains(@class, "s-pagination-next")]'))
                )
            except WebDriverException as e:
                # TimeoutException is a WebDriverException, so a missing
                # "Next" link ends pagination here.
                print(f"Pagination ended: {e}")
                break  # Stop when there's no "Next" button

            page_number += 1
    finally:
        # Always release the browser, even if scraping or saving raised.
        driver.quit()

    print("Scraping completed.")

# Entry point: start the full multi-page scrape only when this code runs as
# the main module, not when it is imported from elsewhere.
if __name__ == "__main__":
    scrape_multiple_pages()


Processing Page 1
Opening URL: https://www.amazon.in/s?k=mobiles&page=1
Waiting for page 1 to load...
Page 1 source loaded. Checking for products...
Found 22 products on page 1.
Extracting data for product 1 of 22...
Product 1 name: Samsung Galaxy S25 Ultra 5G AI Smartphone (Titanium Whitesilver, 12GB RAM, 512GB Storage), 200MP Camera, S Pen Included, Long Battery Life
Product 1 price: 141999
Product 1 delivery date: Fri, 21 Feb
Extracting data for product 2 of 22...
Product 2 name: Samsung Galaxy S25 5G AI Smartphone (Silver Shadow, 12GB RAM, 256GB Storage), 50MP Camera with Galaxy AI
Product 2 price: 80999
Product 2 delivery date: Fri, 21 Feb
Extracting data for product 3 of 22...
Product 3 name: Samsung Galaxy M05 (Mint Green, 4GB RAM, 64 GB Storage) | 50MP Dual Camera | Bigger 6.7" HD+ Display | 5000mAh Battery | 25W Fast Charging | 2 Gen OS Upgrade & 4 Year Security Update | Without Charger
Product 3 price: 6299
Product 3 delivery date: Fri, 21 Feb
Extracting data for product 4 of