In [1]:
%pip install selenium pandas openpyxl

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [4]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import time
import random
from urllib.parse import urljoin, urlparse
import logging
from dataclasses import dataclass
from typing import List, Optional, Tuple
import csv

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

@dataclass
class Product:
    title: str
    price: str
    rating: Optional[str]
    reviews: Optional[str]
    delivery: Optional[str]
    url: str
    sponsored: bool = False
    deal: Optional[str] = None

class AmazonScraper:
    def __init__(self, visible_browser=True):
        self.base_url = "https://www.amazon.in"
        self.visible_browser = visible_browser
        self.driver = self._init_driver()
        self.delay_range = (1, 3)  # Random delay between actions in seconds

    def _init_driver(self):
        """Initialize Chrome driver with visible browser"""
        options = Options()
        if not self.visible_browser:
            options.add_argument("--headless")
        options.add_argument("--disable-gpu")
        options.add_argument("--no-sandbox")
        options.add_argument("--window-size=1920,1080")
        options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")
        driver = webdriver.Chrome(options=options)
        return driver

    def _random_delay(self):
        """Add random delay between actions"""
        delay = random.uniform(*self.delay_range)
        time.sleep(delay)

    def _validate_amazon_url(self, url: str) -> bool:
        """Check if URL is valid Amazon URL"""
        parsed = urlparse(url)
        return parsed.netloc.endswith('amazon.in') or parsed.netloc.endswith('amazon.com')

    def _get_page(self, url: str) -> Optional[str]:
        """Load page and return its HTML"""
        try:
            self._random_delay()
            self.driver.get(url)
            
            # Wait for page to load
            WebDriverWait(self.driver, 10).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, "div.s-result-item")))
            
            # Check for CAPTCHA
            if "api-services-support@amazon.com" in self.driver.page_source:
                raise Exception("CAPTCHA detected - Amazon is blocking requests")
                
            return self.driver.page_source
        except Exception as e:
            logger.error(f"Error loading page {url}: {e}")
            return None

    def _extract_product_data(self, item) -> Optional[Product]:
        """Extract product data from BeautifulSoup element"""
        try:
            # Extract product URL
            link = item.select_one('h2 a.a-link-normal, a.a-link-normal.s-line-clamp-2, a.a-link-normal.s-line-clamp-3')
            if not link:
                return None
                
            raw_url = link.get('href', '')
            product_url = urljoin(self.base_url, raw_url.split('?')[0])

            # Extract title
            title = link.get_text(strip=True)

            # Extract price
            price_whole = item.select_one('span.a-price-whole')
            price = price_whole.get_text(strip=True).replace(',', '') if price_whole else None

            # Extract rating
            rating = item.select_one('span.a-icon-alt')
            rating = rating.get_text(strip=True).split()[0] if rating else None

            # Extract reviews
            reviews = item.select_one('span.a-size-base[aria-label], span.a-size-base.s-underline-text')
            reviews = reviews.get_text(strip=True) if reviews else None

            # Extract delivery
            delivery = item.select_one('span.a-color-base.a-text-bold')
            delivery = delivery.get_text(strip=True) if delivery else None

            # Extract deal
            deal = item.select_one('span.a-badge-text')
            deal = deal.get_text(strip=True) if deal else None

            # Check if sponsored
            sponsored = bool(item.select_one('span.a-color-secondary:contains("Sponsored"), '
                           'span:contains("Sponsored Ad")'))

            return Product(
                title=title,
                price=price,
                rating=rating,
                reviews=reviews,
                delivery=delivery,
                url=product_url,
                sponsored=sponsored,
                deal=deal
            )
        except Exception as e:
            logger.error(f"Error extracting product data: {e}")
            return None

    def _get_pagination_links(self, soup) -> List[str]:
        """Extract pagination links from page"""
        pagination_div = soup.select_one('div.s-pagination-strip, div.a-text-center.s-pagination-container')
        if not pagination_div:
            return []

        page_links = []
        for a in pagination_div.select('a.s-pagination-item'):
            if a.text.strip().isdigit():
                page_url = urljoin(self.base_url, a['href'])
                if page_url not in page_links:
                    page_links.append(page_url)
        return page_links

    def scrape_search_page(self, url: str) -> Tuple[List[Product], List[str]]:
        """Scrape a single search page"""
        if not self._validate_amazon_url(url):
            logger.error("Invalid Amazon URL provided")
            return [], []

        logger.info(f"Scraping page: {url}")
        html = self._get_page(url)
        if not html:
            return [], []

        soup = BeautifulSoup(html, 'html.parser')
        products = []
        
        items = soup.select('div.s-result-item[data-component-type="s-search-result"], '
                           'div.a-section.a-spacing-small.puis-padding-left-small, '
                           'div.a-section.a-spacing-small.puis-padding-left-micro')
        logger.info(f"Found {len(items)} products on page")

        for item in items:
            product = self._extract_product_data(item)
            if product:
                products.append(product)

        next_pages = self._get_pagination_links(soup)
        return products, next_pages

    def scrape_search(self, search_url: str, max_pages: Optional[int] = None) -> List[Product]:
        """Scrape multiple pages of search results"""
        if not self._validate_amazon_url(search_url):
            logger.error("Invalid Amazon URL provided")
            return []

        all_products = []
        pages_to_scrape = [search_url]
        pages_scraped = 0
        scraped_urls = set()

        while pages_to_scrape and (max_pages is None or pages_scraped < max_pages):
            current_url = pages_to_scrape.pop(0)
            if current_url in scraped_urls:
                continue
                
            products, new_pages = self.scrape_search_page(current_url)
            
            all_products.extend(products)
            pages_scraped += 1
            scraped_urls.add(current_url)
            
            for page in new_pages:
                if page not in scraped_urls and page not in pages_to_scrape:
                    pages_to_scrape.append(page)

            logger.info(f"Scraped {pages_scraped} pages, {len(all_products)} products collected")
            
            if max_pages and pages_scraped >= max_pages:
                break

        return all_products

    def save_to_csv(self, products: List[Product], filename: str):
        """Save products to CSV"""
        with open(filename, 'w', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=['title', 'price', 'rating', 'reviews', 
                                                 'delivery', 'url', 'sponsored', 'deal'])
            writer.writeheader()
            for product in products:
                writer.writerow(product.__dict__)
        logger.info(f"Saved {len(products)} products to {filename}")

    def close(self):
        """Close the browser"""
        self.driver.quit()

def get_user_input():
    """Get user input for scraping parameters"""
    print("Amazon Product Scraper (Visible Browser Mode)")
    print("--------------------------------------------")
    
    while True:
        url = input("Enter Amazon search URL (e.g., https://www.amazon.in/s?k=mobiles): ").strip()
        if not url:
            print("URL cannot be empty. Please try again.")
            continue
            
        max_pages = input("How many pages to scrape? (Press Enter for all pages): ").strip()
        try:
            max_pages = int(max_pages) if max_pages else None
        except ValueError:
            print("Please enter a valid number or press Enter for all pages.")
            continue
            
        filename = input("Enter output CSV filename (default: amazon_products.csv): ").strip()
        filename = filename or "amazon_products.csv"
        
        return url, max_pages, filename

if __name__ == "__main__":
    # Get user input
    search_url, max_pages, filename = get_user_input()
    
    # Initialize scraper with visible browser
    scraper = AmazonScraper(visible_browser=True)
    
    try:
        # Run scraping
        products = scraper.scrape_search(search_url, max_pages)
        
        # Save results
        scraper.save_to_csv(products, filename)
        
        # Print summary
        print("\nScraping completed successfully!")
        print(f"Scraped {len(products)} products")
        print(f"Results saved to {filename}")
        
        # Print sample
        print("\nSample products:")
        for product in products[:3]:
            print(f"\n- {product.title}")
            print(f"  Price: ₹{product.price} | Rating: {product.rating or 'N/A'} stars")
            print(f"  Reviews: {product.reviews or 'N/A'} | Delivery: {product.delivery or 'N/A'}")
            print(f"  URL: {product.url[:80]}...")
    except Exception as e:
        print(f"\nError occurred during scraping: {e}")
    finally:
        # Close browser
        scraper.close()
        input("\nPress Enter to exit...")  # Keep window open to observe any errors

Amazon Product Scraper (Visible Browser Mode)
--------------------------------------------


2025-04-11 18:34:10,715 - INFO - Scraping page: https://www.amazon.in/s?k=clothes&crid=1VSYJVZO6HQIZ&sprefix=clothes%2Caps%2C266&ref=nb_sb_noss_2
2025-04-11 18:34:35,148 - INFO - Found 130 products on page
2025-04-11 18:34:35,719 - INFO - Scraped 1 pages, 130 products collected
2025-04-11 18:34:35,721 - INFO - Scraping page: https://www.amazon.in/s?k=clothes&page=2&xpid=pyDoFj3naw10q&crid=1VSYJVZO6HQIZ&qid=1744376652&sprefix=clothes%2Caps%2C266&ref=sr_pg_2
2025-04-11 18:35:09,931 - INFO - Found 130 products on page
2025-04-11 18:35:10,473 - INFO - Scraped 2 pages, 260 products collected
2025-04-11 18:35:10,475 - INFO - Scraping page: https://www.amazon.in/s?k=clothes&page=3&xpid=pyDoFj3naw10q&crid=1VSYJVZO6HQIZ&qid=1744376652&sprefix=clothes%2Caps%2C266&ref=sr_pg_3
2025-04-11 18:35:47,570 - INFO - Found 130 products on page
2025-04-11 18:35:48,058 - INFO - Scraped 3 pages, 390 products collected
2025-04-11 18:35:48,059 - INFO - Scraping page: https://www.amazon.in/s?k=clothes&crid=1VSY


Scraping completed successfully!
Scraped 910 products
Results saved to amo.csv

Sample products:

- Men's Wrinkle-Resistant Regular Fit Cotton Formal Shirt
  Price: ₹1899 | Rating: 4.0 stars
  Reviews: 549 | Delivery: Sun, 13 Apr
  URL: https://www.amazon.in/sspa/click...

- Men's Wrinkle-Resistant Regular Fit Cotton Formal Shirt
  Price: ₹1899 | Rating: 4.0 stars
  Reviews: 549 | Delivery: Sun, 13 Apr
  URL: https://www.amazon.in/sspa/click...

- Boldfit for Men Slim Fit Joggers for Men for Running, Gym Sports Lower for Men & Boys Summer Track Pants for Men Multipurpose Mens Lower Activewear Trousers, Night Pants for Men Black M
  Price: ₹699 | Rating: 4.2 stars
  Reviews: 1,118 | Delivery: Sun, 13 Apr
  URL: https://www.amazon.in/sspa/click...


KeyboardInterrupt: Interrupted by user