In [None]:
import requests
from bs4 import BeautifulSoup
import time
import random
import pandas as pd
import re
import os

In [None]:

# User agent to mimic a browser
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    'Accept-Language': 'en-US,en;q=0.9',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.8',
    'Accept-Encoding': 'gzip, deflate, br',
    'Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': '1',
    'Cache-Control': 'max-age=0'
}

def get_amazon_page(url, timeout=30):
    """Fetch a page and return its HTML text, retrying on failure.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait on the server during a single attempt.
            Without a timeout a stalled connection would block forever.

    Returns:
        The response body as text when an attempt returns HTTP 200,
        otherwise None once every attempt has failed.
    """
    max_retries = 3
    for attempt in range(max_retries):
        try:
            response = requests.get(url, headers=headers, timeout=timeout)
            if response.status_code == 200:
                return response.text
            print(f"Error {response.status_code}. Retrying ({attempt+1}/{max_retries})...")
        except Exception as e:
            # Deliberately broad: fetching is best-effort and a failed page
            # must not abort a long scraping run.
            print(f"Exception: {e}. Retrying ({attempt+1}/{max_retries})...")

        # Back off 5-10 seconds, but only when another attempt will follow.
        if attempt < max_retries - 1:
            time.sleep(5 + random.random() * 5)

    print(f"Failed to retrieve page: {url}")
    return None

def extract_product_links(html_content, base_url="https://www.amazon.com"):
    """Collect product-page URLs from a search results page.

    Args:
        html_content: HTML text of a search results page.
        base_url: Scheme and host prepended to site-relative links.

    Returns:
        A list of URLs whose href contains '/dp/', in page order, with each
        distinct URL listed only once.
    """
    soup = BeautifulSoup(html_content, 'html.parser')

    # Preferred selector first; fall back to any normal link pointing at a
    # '/dp/' path when the preferred one matches nothing.
    product_elements = soup.select('a.a-link-normal.s-faceout-link.aok-block.a-text-normal')
    if not product_elements:
        product_elements = soup.select('a.a-link-normal[href*="/dp/"]')

    product_links = []
    seen = set()
    for element in product_elements:
        href = element.get('href', '')
        if not href or '/dp/' not in href:
            continue

        # Site-relative links are made absolute; absolute ones are kept as-is.
        link = f"{base_url}{href}" if href.startswith('/') else href

        # Several anchors can carry the same href; keep the first occurrence
        # so the same URL is not scraped repeatedly later on.
        if link not in seen:
            seen.add(link)
            product_links.append(link)

    return product_links

def get_next_page_url(html_content, base_url="https://www.amazon.com"):
    """Return the URL behind the pagination "Next" link, or None if absent.

    Args:
        html_content: HTML text of the current search results page.
        base_url: Prefix added when the link's href is site-relative.

    Returns:
        The next page's URL, or None when the page has no "Next" anchor or
        the anchor has no href.
    """
    parsed = BeautifulSoup(html_content, 'html.parser')
    next_link = parsed.select_one('a.s-pagination-item.s-pagination-next')

    # Guard clauses: no anchor, or an anchor without a usable href.
    if not next_link:
        return None
    target = next_link.get('href')
    if not target:
        return None

    return f"{base_url}{target}" if target.startswith('/') else target

def save_links_to_csv(df, filename="amazon_laptop_links.csv"):
    """Write the links table to ``filename`` as CSV (without the index) and print the row count."""
    df.to_csv(filename, index=False)
    link_count = len(df)
    print(f"Saved {link_count} links to {filename}")

def scrape_amazon_laptops(start_url, max_pages=100):
    """Walk search result pages, collect product links and save them to CSV.

    Args:
        start_url: URL of the first search results page.
        max_pages: Upper bound on the number of result pages to visit.

    Returns:
        A DataFrame with columns 'product_url', 'page_number' and
        'product_id' (the 10-character ID following '/dp/' in the URL, or
        None when the URL has no such ID).

    Side effects:
        Writes a progress CSV every 5 pages and the final CSV via
        save_links_to_csv; pauses 3-10 seconds between page requests.
    """
    columns = ['product_url', 'page_number']
    product_id_pattern = re.compile(r'/dp/([A-Z0-9]{10})')

    def extract_product_id(url):
        # One search per URL instead of matching twice.
        match = product_id_pattern.search(url)
        return match.group(1) if match else None

    current_url = start_url
    all_product_links = []
    page_count = 0

    while current_url and page_count < max_pages:
        page_count += 1
        print(f"Scraping page {page_count}: {current_url}")

        # Get the page HTML
        html_content = get_amazon_page(current_url)
        if not html_content:
            print(f"Failed to get page {page_count}. Stopping.")
            break

        # Extract product links and remember which page they came from
        product_links = extract_product_links(html_content)
        print(f"Found {len(product_links)} product links on page {page_count}")
        all_product_links.extend((link, page_count) for link in product_links)

        # Save progress every 5 pages
        if page_count % 5 == 0:
            df = pd.DataFrame(all_product_links, columns=columns)
            save_links_to_csv(df, f"amazon_laptop_links_progress_{page_count}.csv")

        # Get the next page URL
        current_url = get_next_page_url(html_content)

        # Random 3-10 second pause to avoid being blocked; skipped when no
        # further page will be requested, since waiting then gains nothing.
        if current_url and page_count < max_pages:
            delay = 3 + random.random() * 7
            print(f"Waiting {delay:.2f} seconds before next request...")
            time.sleep(delay)

    # Create final DataFrame and derive product IDs from the URLs
    df_final = pd.DataFrame(all_product_links, columns=columns)
    df_final['product_id'] = df_final['product_url'].apply(extract_product_id)

    # Save the DataFrame
    save_links_to_csv(df_final)

    print(f"Finished scraping {page_count} pages. Total links collected: {len(df_final)}")
    return df_final

# Script entry point: scrape the laptop search results and print a summary.
if __name__ == "__main__":
    # Use the base URL for Amazon laptop search
    start_url = "https://www.amazon.com/s?k=laptops"

    # Start scraping
    df_products = scrape_amazon_laptops(start_url, max_pages=100)

    # Display first few rows of the data
    print("\nFirst 5 rows of collected data:")
    print(df_products.head())

    # Quick stats
    print("\nData Statistics:")
    print(f"Total products: {len(df_products)}")
    if df_products.empty:
        # An empty page_number column has no meaningful maximum, so the
        # per-page figures are skipped rather than printed as 'nan'.
        print("No products were collected; per-page statistics are unavailable.")
    else:
        last_page = df_products['page_number'].max()
        print(f"Products per page (average): {len(df_products) / last_page:.2f}")
        print(f"Number of pages scraped: {last_page}")

In [None]:
# User agent to mimic a browser
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    'Accept-Language': 'en-US,en;q=0.9',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.8',
    'Accept-Encoding': 'gzip, deflate, br',
    'Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': '1',
    'Cache-Control': 'max-age=0'
}

def get_amazon_page(url, timeout=30):
    """Fetch a page and return its HTML text, retrying on failure.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait on the server during a single attempt.
            Without a timeout a stalled connection would block forever.

    Returns:
        The response body as text when an attempt returns HTTP 200,
        otherwise None once every attempt has failed.
    """
    max_retries = 3
    for attempt in range(max_retries):
        try:
            response = requests.get(url, headers=headers, timeout=timeout)
            if response.status_code == 200:
                return response.text
            print(f"Error {response.status_code}. Retrying ({attempt+1}/{max_retries})...")
        except Exception as e:
            # Deliberately broad: fetching is best-effort and a failed page
            # must not abort a long scraping run.
            print(f"Exception: {e}. Retrying ({attempt+1}/{max_retries})...")

        # Back off 5-10 seconds, but only when another attempt will follow.
        if attempt < max_retries - 1:
            time.sleep(5 + random.random() * 5)

    print(f"Failed to retrieve page: {url}")
    return None

def extract_product_links(html_content, base_url="https://www.amazon.com"):
    """Collect product-page URLs from a search results page.

    Args:
        html_content: HTML text of a search results page.
        base_url: Scheme and host prepended to site-relative links.

    Returns:
        A list of URLs whose href contains '/dp/', in page order, with each
        distinct URL listed only once.
    """
    soup = BeautifulSoup(html_content, 'html.parser')

    # Preferred selector first; fall back to any normal link pointing at a
    # '/dp/' path when the preferred one matches nothing.
    product_elements = soup.select('a.a-link-normal.s-faceout-link.aok-block.a-text-normal')
    if not product_elements:
        product_elements = soup.select('a.a-link-normal[href*="/dp/"]')

    product_links = []
    seen = set()
    for element in product_elements:
        href = element.get('href', '')
        if not href or '/dp/' not in href:
            continue

        # Site-relative links are made absolute; absolute ones are kept as-is.
        link = f"{base_url}{href}" if href.startswith('/') else href

        # Several anchors can carry the same href; keep the first occurrence
        # so the same URL is not scraped repeatedly later on.
        if link not in seen:
            seen.add(link)
            product_links.append(link)

    return product_links

def get_next_page_url(html_content, base_url="https://www.amazon.com"):
    """Return the URL behind the pagination "Next" link, or None if absent.

    Args:
        html_content: HTML text of the current search results page.
        base_url: Prefix added when the link's href is site-relative.

    Returns:
        The next page's URL, or None when the page has no "Next" anchor or
        the anchor has no href.
    """
    parsed = BeautifulSoup(html_content, 'html.parser')
    next_link = parsed.select_one('a.s-pagination-item.s-pagination-next')

    # Guard clauses: no anchor, or an anchor without a usable href.
    if not next_link:
        return None
    target = next_link.get('href')
    if not target:
        return None

    return f"{base_url}{target}" if target.startswith('/') else target

def _parse_spec_table(soup, selector, label):
    """Return {header text: value text} for the th/td rows of the table matched by ``selector``."""
    specs = {}
    try:
        table = soup.select_one(selector)
        if table:
            for row in table.select('tr'):
                header = row.select_one('th')
                value = row.select_one('td')
                if header and value:
                    # Drop invisible direction marks (U+200E / U+200F) before
                    # stripping, so whitespace sitting next to a mark is
                    # removed as well.
                    cleaned = value.text.replace('\u200e', '').replace('\u200f', '')
                    specs[header.text.strip()] = cleaned.strip()
    except Exception as e:
        # Best-effort: a malformed table must not abort the whole product.
        print(f"Error extracting {label}: {e}")
    return specs


def extract_product_details(html_content):
    """Extract title, price and specification tables from a product page.

    Args:
        html_content: HTML text of a product page.

    Returns:
        A dict with keys:
            'title': stripped text of '#productTitle', or None.
            'price': float built from the 'a-price-whole' and
                'a-price-fraction' spans, or None when no price is found.
            'tech_specs': dict parsed from '#productDetails_techSpec_section_1'.
            'additional_details': dict parsed from
                '#productDetails_techSpec_section_2'.
        Every extraction step is best-effort: on error a message is printed
        and that field keeps its empty value.
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    details = {'title': None, 'price': None}

    # Extract title
    try:
        title_element = soup.select_one('#productTitle')
        if title_element:
            details['title'] = title_element.text.strip()
    except Exception as e:
        print(f"Error extracting title: {e}")
        details['title'] = None

    # Extract price
    try:
        price_whole = soup.select_one('span.a-price-whole')
        price_fraction = soup.select_one('span.a-price-fraction')

        if price_whole:
            # Keep only digits from each part and join them with an explicit
            # decimal point, so the result does not depend on the separator
            # being embedded in the whole-part text. A missing fraction span
            # counts as zero cents instead of discarding the price.
            whole_digits = re.sub(r'\D', '', price_whole.text)
            fraction_digits = re.sub(r'\D', '', price_fraction.text) if price_fraction else ''
            if whole_digits:
                details['price'] = float(f"{whole_digits}.{fraction_digits or '0'}")
    except Exception as e:
        print(f"Error extracting price: {e}")
        details['price'] = None

    # Both specification tables share the same th/td row layout.
    details['tech_specs'] = _parse_spec_table(
        soup, '#productDetails_techSpec_section_1', 'tech specs'
    )
    details['additional_details'] = _parse_spec_table(
        soup, '#productDetails_techSpec_section_2', 'additional details'
    )

    return details

def save_to_csv(df, filename="amazon_laptops_data.csv"):
    """Write the product table to ``filename`` as CSV (without the index) and print the row count."""
    df.to_csv(filename, index=False)
    product_count = len(df)
    print(f"Saved {product_count} products to {filename}")

def flatten_dict(d, prefix=''):
    """Collapse nested dictionaries into a single-level dictionary.

    Keys of nested dicts are joined to their parent key with an underscore
    (``{'a': {'b': 1}}`` becomes ``{'a_b': 1}``). Top-level keys are kept
    unchanged, and an empty nested dict contributes no entries.

    Args:
        d: Dictionary to flatten; values may themselves be dictionaries.
        prefix: Key prefix carried down from the enclosing level.

    Returns:
        A new flat dictionary; ``d`` is not modified.
    """
    flat = {}
    for key, value in d.items():
        full_key = key if not prefix else f"{prefix}_{key}"
        if isinstance(value, dict):
            # Recurse and merge the already-flattened child entries.
            flat.update(flatten_dict(value, full_key))
        else:
            flat[full_key] = value
    return flat

def scrape_laptop_links(start_url, max_pages=100):
    """Walk search result pages and collect product links.

    Args:
        start_url: URL of the first search results page.
        max_pages: Upper bound on the number of result pages to visit.

    Returns:
        A DataFrame with columns 'product_url', 'page_number' and
        'product_id' (the 10-character ID following '/dp/' in the URL, or
        None when the URL has no such ID).

    Side effects:
        Writes a progress CSV every 5 pages and pauses 3-10 seconds between
        page requests.
    """
    columns = ['product_url', 'page_number']
    product_id_pattern = re.compile(r'/dp/([A-Z0-9]{10})')

    def extract_product_id(url):
        # One search per URL instead of matching twice.
        match = product_id_pattern.search(url)
        return match.group(1) if match else None

    current_url = start_url
    all_product_links = []
    page_count = 0

    while current_url and page_count < max_pages:
        page_count += 1
        print(f"Scraping search page {page_count}: {current_url}")

        # Get the page HTML
        html_content = get_amazon_page(current_url)
        if not html_content:
            print(f"Failed to get page {page_count}. Stopping.")
            break

        # Extract product links and remember which page they came from
        product_links = extract_product_links(html_content)
        print(f"Found {len(product_links)} product links on page {page_count}")
        all_product_links.extend((link, page_count) for link in product_links)

        # Save progress every 5 pages
        if page_count % 5 == 0:
            df = pd.DataFrame(all_product_links, columns=columns)
            df.to_csv(f"amazon_laptop_links_progress_{page_count}.csv", index=False)
            print(f"Saved progress: {len(df)} links collected so far")

        # Get the next page URL
        current_url = get_next_page_url(html_content)

        # Random 3-10 second pause to avoid being blocked; skipped when no
        # further page will be requested, since waiting then gains nothing.
        if current_url and page_count < max_pages:
            delay = 3 + random.random() * 7
            print(f"Waiting {delay:.2f} seconds before next request...")
            time.sleep(delay)

    # Create final DataFrame and derive product IDs from the URLs
    df_links = pd.DataFrame(all_product_links, columns=columns)
    df_links['product_id'] = df_links['product_url'].apply(extract_product_id)

    print(f"Finished scraping {page_count} pages. Total links collected: {len(df_links)}")
    return df_links

def _flatten_product(product, key_specs):
    """Build one flat output row from a scraped product record."""
    flat_product = {
        'product_id': product['product_id'],
        'product_url': product['product_url'],
        'search_page_number': product['search_page_number'],
        'title': product['title'],
        'price': product['price']
    }

    tech_specs = product.get('tech_specs', {})
    additional_details = product.get('additional_details', {})

    # Key specs become direct columns; tech_specs wins over
    # additional_details when both tables carry the same entry.
    for spec in key_specs:
        column = spec.replace(' ', '_').lower()
        if spec in tech_specs:
            flat_product[column] = tech_specs[spec]
        elif spec in additional_details:
            flat_product[column] = additional_details[spec]

    # A few extra fields taken only from the additional-details table.
    extra_fields = (
        ('Brand', 'brand'),
        ('Series', 'series'),
        ('Color', 'color'),
        ('Item Weight', 'weight'),
    )
    for source_key, column in extra_fields:
        if source_key in additional_details:
            flat_product[column] = additional_details[source_key]

    return flat_product


def scrape_product_details(df_links, max_products=None):
    """Scrape detailed information from each product page.

    Args:
        df_links: DataFrame with 'product_url', 'product_id' and
            'page_number' columns.
        max_products: When truthy, only the first ``max_products`` rows are
            scraped; otherwise every row is.

    Returns:
        A DataFrame with one row per successfully fetched product: the
        product ID, URL, search page number, title and price, plus a column
        for each key specification and extra field found on the page.
        Products whose page could not be fetched are skipped.

    Side effects:
        Writes a progress CSV when a successfully scraped product falls on
        every 10th position and pauses 5-15 seconds between product requests.
    """
    all_product_data = []

    # Limit number of products if specified
    products_to_scrape = df_links.head(max_products) if max_products else df_links
    total = len(products_to_scrape)

    for idx, (_, row) in enumerate(products_to_scrape.iterrows()):
        url = row['product_url']

        print(f"Scraping product {idx+1}/{total}: {url}")

        # Get product page HTML
        html_content = get_amazon_page(url)
        if not html_content:
            print(f"Failed to get product page. Skipping.")
            continue

        # Extract product details and attach the identifying fields
        details = extract_product_details(html_content)
        details['product_id'] = row['product_id']
        details['product_url'] = url
        details['search_page_number'] = row['page_number']
        all_product_data.append(details)

        # Save progress every 10 products
        if (idx + 1) % 10 == 0:
            temp_df = pd.json_normalize(all_product_data)
            temp_df.to_csv(f"amazon_laptop_details_progress_{idx+1}.csv", index=False)
            print(f"Saved progress: {len(temp_df)} products scraped")

        # Random 5-15 second pause to avoid being blocked; skipped after the
        # final product because no further request follows.
        if idx + 1 < total:
            delay = 5 + random.random() * 10
            print(f"Waiting {delay:.2f} seconds before next request...")
            time.sleep(delay)

    print("Processing collected data...")

    # Technical specifications promoted to their own columns
    key_specs = (
        'Standing screen display size', 'Screen Resolution', 'Processor',
        'RAM', 'Hard Drive', 'Graphics Coprocessor', 'Operating System',
        'Average Battery Life (in hours)'
    )

    return pd.DataFrame(
        [_flatten_product(product, key_specs) for product in all_product_data]
    )

def main():
    """Run the two-stage scrape and write both result tables to CSV.

    Stage 1 collects product links from up to 20 search result pages and
    saves them to 'amazon_laptop_links.csv'. Stage 2 visits every collected
    link and saves the details to 'amazon_laptops_complete_data.csv'.
    """
    search_url = "https://www.amazon.com/s?k=laptops"

    # Stage 1: product links from the search results
    print("Step 1: Scraping product links from search results...")
    link_table = scrape_laptop_links(search_url, max_pages=20)
    link_table.to_csv("amazon_laptop_links.csv", index=False)

    # Stage 2: details for each collected product
    print("\nStep 2: Scraping detailed information for each product...")
    detail_table = scrape_product_details(link_table)
    detail_table.to_csv("amazon_laptops_complete_data.csv", index=False)

    # Summary
    product_total = len(detail_table)
    print("\nScraping completed!")
    print(f"Total products with complete details: {product_total}")
    print(f"Data saved to 'amazon_laptops_complete_data.csv'")

# Run the scraper only when this code is executed as the main program
# (__name__ is "__main__"), not when it is imported as a module.
if __name__ == "__main__":
    main()