In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd
import time
from datetime import datetime
import os
import logging
import random

In [4]:
# Root logger: timestamp - level - message, INFO and above
logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s', level=logging.INFO)

# Chrome command-line switches, applied in the order listed
options = Options()
for _chrome_flag in (
    "--headless=new",  # Headless mode; remove this entry for a visible browser
    "--disable-blink-features=AutomationControlled",
    "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36",
    "--no-sandbox",
    "--disable-dev-shm-usage",
    "--disable-gpu",
    "--window-size=1920,1080",
    "--enable-javascript",
):
    options.add_argument(_chrome_flag)

# Experimental options are stored separately from the switches above,
# so setting them after the loop yields the same Options state.
for _option_name, _option_value in (
    ("excludeSwitches", ["enable-automation"]),
    ("useAutomationExtension", False),
):
    options.add_experimental_option(_option_name, _option_value)

# Start Chrome using the chromedriver binary resolved by webdriver_manager
driver = webdriver.Chrome(
    service=Service(ChromeDriverManager().install()),
    options=options,
)

# Amazon India fashion subcategories: display label -> search-results URL.
# Each URL is the common search prefix followed by the already URL-encoded query.
category_urls = {
    label: "https://www.amazon.in/s?k=" + query
    for label, query in (
        ("Men's Shirts", "men%27s+shirts"),
        ("Men's T-Shirts", "men%27s+t-shirts"),
        ("Men's Jeans", "men%27s+jeans"),
        ("Men's Shoes", "men%27s+shoes"),
        ("Men's Jackets", "men%27s+jackets"),
        ("Men's Watches", "men%27s+watches"),
        ("Men's Shorts", "men%27s+shorts"),
        ("Women's Kurtas", "women%27s+kurtas"),
        ("Women's Sarees", "women%27s+sarees"),
        ("Women's Dresses", "women%27s+dresses"),
        ("Women's Shoes", "women%27s+shoes"),
        ("Women's Tops", "women%27s+tops"),
        ("Women's Handbags", "women%27s+handbags"),
        ("Women's Jewellery", "women%27s+jewellery"),
        ("Women's Leggings", "women%27s+leggings"),
        ("Kids Shirts", "kids+shirts"),
        ("Kids Dresses", "kids+dresses"),
        ("Kids Shoes", "kids+shoes"),
        ("Kids Jackets", "kids+jackets"),
        ("Kids Pants", "kids+pants"),
    )
}

# Date stamp (YYYY-MM-DD) attached to every scraped row and to output file names
scrape_date = format(datetime.now(), "%Y-%m-%d")

# Accumulates one dict per scraped product across all categories
all_products_data = list()

def _find_or_none(card, xpath):
    """Return the first descendant of ``card`` matching ``xpath``, or None if there is none."""
    try:
        return card.find_element(By.XPATH, xpath)
    except NoSuchElementException:
        return None


def _parse_amount(text):
    """Convert a displayed amount such as '₹1,299' or '1,299.' into a float.

    Only digits and the decimal point are kept, so the currency prefix may be
    missing, a single character, or several characters ('Rs. ').

    Returns:
        The amount as a float, or the string "N/A" when ``text`` is None/empty
        or contains no parsable number.
    """
    if not text:
        return "N/A"
    # strip(".") drops a stray dot left over from prefixes like "Rs." or a trailing "1299."
    numeric = "".join(ch for ch in text if ch.isdigit() or ch == ".").strip(".")
    try:
        return float(numeric)
    except ValueError:
        return "N/A"


def _extract_product(card, category_name, scrape_date):
    """Build the output row for one search-result card.

    Missing or unparsable fields are recorded as "N/A" (or the documented
    default) instead of raising, so one bad field no longer discards the card.

    Args:
        card: WebElement for a single ``s-result-item`` container.
        category_name: Label stored in the "Category" column.
        scrape_date: Date string stored in the "Date Scraped" column.

    Returns:
        A dict with the row's columns, or None when the name, price and
        product URL are all "N/A".
    """
    # Product Name (full title)
    name_element = _find_or_none(card, ".//h2//span")
    product_name = name_element.text.strip() if name_element is not None else "N/A"

    # Price (visible text of the whole-rupee part)
    price_element = _find_or_none(card, ".//span[contains(@class, 'a-price-whole')]")
    price = _parse_amount(price_element.text) if price_element is not None else "N/A"

    # Original (struck-through) price. The element is matched by the
    # 'a-offscreen' class and was originally read via innerHTML rather than
    # .text; textContent gives its text without any markup.
    orig_price_element = _find_or_none(
        card,
        ".//span[contains(@class, 'a-text-price')]//span[contains(@class, 'a-offscreen')]",
    )
    original_price = (
        _parse_amount(orig_price_element.get_attribute("textContent"))
        if orig_price_element is not None
        else "N/A"
    )
    if original_price == "N/A":
        # No usable list price: fall back to the selling price (which may itself be "N/A")
        original_price = price

    # Discount % and absolute discount; 0.0 unless both prices are known and list > selling
    if isinstance(price, float) and isinstance(original_price, float) and original_price > price:
        discount_amount = round(original_price - price, 2)
        discount_percent = round(((original_price - price) / original_price) * 100, 2)
    else:
        discount_amount = 0.0
        discount_percent = 0.0

    # Availability
    avail_element = _find_or_none(
        card,
        ".//span[contains(text(), 'In stock') or contains(text(), 'Out of stock')]",
    )
    availability = avail_element.text.strip() if avail_element is not None else "In Stock"  # Default assumption

    # Rating: first whitespace-separated token of the icon's alt text
    rating_element = _find_or_none(card, ".//span[contains(@class, 'a-icon-alt')]")
    rating = "N/A"
    if rating_element is not None:
        rating_tokens = (rating_element.get_attribute("textContent") or "").split()
        if rating_tokens:
            rating = rating_tokens[0]

    # Product URL; get_attribute may return None when the anchor has no href
    url_element = _find_or_none(card, ".//a[contains(@class, 'a-link-normal')]")
    product_url = (url_element.get_attribute("href") if url_element is not None else None) or "N/A"
    if product_url != "N/A" and not product_url.startswith("http"):
        product_url = "https://www.amazon.in" + product_url

    # Image URL
    image_element = _find_or_none(card, ".//img[contains(@class, 's-image')]")
    image_url = (image_element.get_attribute("src") if image_element is not None else None) or "N/A"

    # Is Prime (Prime eligibility)
    is_prime = "Yes" if _find_or_none(card, ".//i[contains(@class, 'a-icon-prime')]") is not None else "No"

    # Keep the row only if at least one key field is populated
    if product_name == "N/A" and price == "N/A" and product_url == "N/A":
        return None

    return {
        "Category": category_name,
        "Product Name": product_name,
        "Price": price,
        "Original Price": original_price,
        "Discount %": discount_percent,
        "Discount Amount": discount_amount,
        "Availability": availability,
        "Product URL": product_url,
        "Rating": rating,
        "Image URL": image_url,
        "Is Prime": is_prime,
        "Date Scraped": scrape_date,
    }


for category_name, category_url in category_urls.items():
    logging.info(f"Scraping category: {category_name} from {category_url}")

    try:
        driver.get(category_url)
        time.sleep(random.uniform(5, 7))  # Wait for JS to render

        # Wait (up to 30 s) for at least one product container
        wait = WebDriverWait(driver, 30)
        wait.until(EC.presence_of_element_located((By.XPATH, "//div[contains(@class, 's-result-item')]")))
        logging.info(f"Product grid loaded for {category_name}")

        # Scroll to load more products (target ~100-150 per category for 2000-2500 total).
        # Stops early once the page height is unchanged, but only after more than 5 scrolls.
        scroll_pause_time = random.uniform(2, 3)
        scroll_count = 0
        max_scrolls = 20
        last_height = driver.execute_script("return document.body.scrollHeight")
        while scroll_count < max_scrolls:
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(scroll_pause_time)
            scroll_count += 1
            new_height = driver.execute_script("return document.body.scrollHeight")
            logging.info(f"Scrolled {scroll_count} times in {category_name}")
            if new_height == last_height and scroll_count > 5:
                logging.info(f"No new content after {scroll_count} scrolls in {category_name}")
                break
            last_height = new_height

        # Extract product cards (result items that contain a title heading)
        product_cards = driver.find_elements(By.XPATH, "//div[contains(@class, 's-result-item') and .//h2]")
        logging.info(f"Found {len(product_cards)} products in {category_name}")

        for card in product_cards:
            try:
                product_row = _extract_product(card, category_name, scrape_date)
            except Exception as e:
                # Best effort: any other per-card failure skips this card only
                logging.warning(f"Error processing a product in {category_name}: {e}")
                continue

            if product_row is not None:
                all_products_data.append(product_row)
                logging.debug(f"Added product: {product_row['Product Name']} from {category_name}")

    except TimeoutException:
        logging.error(f"Timeout while loading {category_name}")
        # The debug dump is optional; a failure here must not abort the remaining categories
        try:
            with open(f"timeout_{category_name}_{scrape_date}.html", "w", encoding="utf-8") as f:
                f.write(driver.page_source)
            logging.info(f"Saved page source for {category_name} to debug")
        except Exception as dump_error:
            logging.warning(f"Could not save page source for {category_name}: {dump_error}")
    except Exception as e:
        logging.error(f"Unexpected error in {category_name}: {e}")

# Build the result table, write it to ~/Downloads, and always close the browser.
try:
    # Create DataFrame (one row per scraped product)
    df = pd.DataFrame(all_products_data)

    # Save to CSV with error handling
    downloads_path = os.path.join(os.path.expanduser("~"), "Downloads")
    filename = os.path.join(downloads_path, f"amazon_fashion_{scrape_date}.csv")
    alt_filename = os.path.join(downloads_path, f"amazon_fashion_{scrape_date}_alt.csv")
    try:
        # The Downloads folder is not guaranteed to exist; create it if needed
        os.makedirs(downloads_path, exist_ok=True)
        try:
            df.to_csv(filename, index=False, encoding='utf-8-sig')
            logging.info(f"Scraping complete. Saved to: {filename}")
        except PermissionError:
            # Retry under an alternate name; if this write fails too,
            # the outer handler logs it instead of raising.
            df.to_csv(alt_filename, index=False, encoding='utf-8-sig')
            logging.info(f"Permission denied for {filename}. Saved to: {alt_filename}")
        logging.info(f"Total products scraped: {len(df)}")
    except Exception as e:
        logging.error(f"Error saving file: {e}")
finally:
    # Cleanup: runs even if building or saving the table fails, so the browser is not left running
    driver.quit()
    logging.info("Browser closed.")

2025-04-07 22:56:15,937 - INFO - Get LATEST chromedriver version for google-chrome
2025-04-07 22:56:16,139 - INFO - Get LATEST chromedriver version for google-chrome
2025-04-07 22:56:16,250 - INFO - Driver [C:\Users\Yuvaraj J\.wdm\drivers\chromedriver\win64\134.0.6998.165\chromedriver-win32/chromedriver.exe] found in cache
2025-04-07 22:56:17,605 - INFO - Scraping category: Men's Shirts from https://www.amazon.in/s?k=men%27s+shirts
2025-04-07 22:56:30,515 - INFO - Product grid loaded for Men's Shirts
2025-04-07 22:56:33,400 - INFO - Scrolled 1 times in Men's Shirts
2025-04-07 22:56:36,214 - INFO - Scrolled 2 times in Men's Shirts
2025-04-07 22:56:39,031 - INFO - Scrolled 3 times in Men's Shirts
2025-04-07 22:56:42,796 - INFO - Scrolled 4 times in Men's Shirts
2025-04-07 22:56:45,615 - INFO - Scrolled 5 times in Men's Shirts
2025-04-07 22:56:48,434 - INFO - Scrolled 6 times in Men's Shirts
2025-04-07 22:56:48,436 - INFO - No new content after 6 scrolls in Men's Shirts
2025-04-07 22:56:4