In [None]:

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.proxy import Proxy, ProxyType
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from fake_useragent import UserAgent
from bs4 import BeautifulSoup
import pandas as pd
import time
import os
import random


# Function to scrape HTML files from Amazon
def scrape_amazon_pages(query="laptops", pages=3):
    """Download Amazon search-result pages and save their HTML under ``data/``.

    A headless Chrome session is started with a random user-agent string.
    For each page number from 1 to ``pages`` the search URL is loaded, the
    function waits for at least one search-result container, scrolls to the
    bottom of the page, and writes ``driver.page_source`` to
    ``data/page_<n>.html``. A failure on one page is reported and the loop
    moves on to the next page.

    Args:
        query: Search terms. They are URL-encoded, so spaces and characters
            such as ``&`` or ``+`` are safe to pass.
        pages: Number of result pages to fetch, starting at page 1.
    """
    # Local import so this function does not depend on edits to the
    # file-level import block.
    from urllib.parse import quote_plus

    # Configure a random user-agent for stealth
    ua = UserAgent()
    user_agent = ua.random

    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument(f"user-agent={user_agent}")
    chrome_options.add_argument("--headless")  # Remove this line to watch the browser window
    chrome_options.add_argument("--disable-blink-features=AutomationControlled")
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")

    # Create the output directory before launching the browser so that a
    # filesystem error cannot leave a Chrome process behind.
    os.makedirs("data", exist_ok=True)

    encoded_query = quote_plus(query)
    driver = webdriver.Chrome(options=chrome_options)
    try:
        for page in range(1, pages + 1):
            try:
                print(f"Scraping page {page}...")
                url = f"https://www.amazon.com/s?k={encoded_query}&page={page}"
                driver.get(url)

                # Wait for product containers to load
                WebDriverWait(driver, 15).until(
                    EC.presence_of_all_elements_located(
                        (By.XPATH, "//div[@data-component-type='s-search-result']")
                    )
                )

                # Scroll to the bottom of the page to load all dynamic content
                driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                time.sleep(random.randint(5, 10))  # Random delay to mimic human interaction

                # Save the HTML content to a file
                with open(f"data/page_{page}.html", "w", encoding="utf-8") as f:
                    f.write(driver.page_source)
                print(f"Saved page {page} to 'data/page_{page}.html'")

            except Exception as e:
                print(f"Error on page {page}: {e}")
                # Reading the page source can fail too (for example when the
                # browser session is gone); treat that as "no block page seen".
                try:
                    page_source = driver.page_source
                except Exception:
                    page_source = ""
                if "sorry" in page_source.lower():  # Check for block
                    print("Blocked by Amazon. Adding delay...")
                    # Back off for a minute; the failed page is not retried,
                    # the loop simply continues with the next page.
                    time.sleep(60)
            finally:
                time.sleep(random.randint(5, 15))  # Random delay between requests
    finally:
        # Always shut the browser down, even if an unexpected error
        # (or Ctrl-C) escapes the loop.
        driver.quit()

    print("Scraping completed!")


# Function to parse HTML files and extract data
def parse_saved_pages():
    """Parse the saved search-result pages in ``data/`` into a CSV file.

    Every ``.html`` file in the ``data`` directory is parsed with
    BeautifulSoup. One row is extracted for each ``div`` whose
    ``data-component-type`` is ``s-search-result``, and all rows are written
    to ``parsed_amazon_laptops.csv`` in the current working directory.

    Files are processed in sorted name order so the output is deterministic;
    entries that do not end in ``.html`` are ignored. A file that cannot be
    read or parsed is reported and skipped.

    Raises:
        FileNotFoundError: If the ``data`` directory does not exist.
    """
    columns = (
        "Title",
        "Price",
        "Rating",
        "Number_of_Reviews",
        "Availability",
        "Brand",
        "Discount_Price",
        "Link",
        "RAM_Storage",
        "Color",
    )
    data = {column: [] for column in columns}

    html_files = sorted(
        name for name in os.listdir("data") if name.lower().endswith(".html")
    )

    for file in html_files:
        try:
            # Open and parse the HTML file
            with open(os.path.join("data", file), "r", encoding="utf-8") as f:
                soup = BeautifulSoup(f.read(), "html.parser")

            # Extract product containers
            items = soup.find_all("div", {"data-component-type": "s-search-result"})

            for item in items:
                row = _parse_item(item)
                # Append all columns of a row together so the column lists
                # always stay the same length.
                for column in columns:
                    data[column].append(row[column])

        except Exception as e:
            # Per-file boundary: report the problem and keep going.
            print(f"Error processing file {file}: {e}")

    # Save the extracted data to a CSV file
    df = pd.DataFrame(data)
    df.to_csv("parsed_amazon_laptops.csv", index=False)
    print("Parsed data saved to 'parsed_amazon_laptops.csv'")


def _span_text(item, css_class):
    """Return the stripped text of the first matching ``span``, or None."""
    tag = item.find("span", class_=css_class)
    return None if tag is None else tag.get_text(strip=True)


def _extract_price(item):
    """Return the listing price as a float, or None when it cannot be read."""
    whole = _span_text(item, "a-price-whole")
    fraction = _span_text(item, "a-price-fraction")
    if whole is None or fraction is None:
        return None
    # The whole part may carry thousands separators and/or a trailing decimal
    # point (e.g. "1,299."); normalise it before joining it with the fraction,
    # otherwise float() rejects the string and the price is lost.
    whole = whole.replace(",", "").rstrip(".")
    try:
        return float(f"{whole}.{fraction}")
    except ValueError:
        return None


def _extract_link(item):
    """Return the product URL as an absolute link, or "N/A" if none is found."""
    from urllib.parse import urljoin

    link_tag = item.find("a", class_="a-link-normal", href=True)
    if link_tag is None:
        return "N/A"
    # urljoin resolves relative hrefs against the site root and leaves hrefs
    # that are already absolute untouched.
    return urljoin("https://www.amazon.com", link_tag["href"])


def _parse_item(item):
    """Build one output row (a dict keyed by column name) from a result container."""
    # Title
    title = _span_text(item, "a-size-medium a-color-base a-text-normal")
    if title is None:
        title = "N/A"

    # Price
    price = _extract_price(item)

    # Rating: first whitespace-separated token of the icon's alt text
    rating_text = _span_text(item, "a-icon-alt")
    rating = rating_text.split()[0] if rating_text else None

    # Number of reviews
    # NOTE(review): this reads the FIRST span with class "a-size-base";
    # confirm that this is always the review count in the saved pages.
    number_of_reviews = None
    reviews_text = _span_text(item, "a-size-base")
    if reviews_text is not None:
        reviews_text = reviews_text.replace(",", "")
        # isdecimal() accepts exactly the digit strings that int() can parse.
        if reviews_text.isdecimal():
            number_of_reviews = int(reviews_text)

    # Availability: presence of a "success"-coloured span is taken as in stock
    in_stock = item.find("span", class_="a-color-success") is not None
    availability = "In Stock" if in_stock else "Out of Stock"

    # Brand: first word of the title
    title_words = title.split() if title != "N/A" else []
    brand = title_words[0] if title_words else "Unknown"

    # Discount price (10% off the listed price)
    discount_price = round(price * 0.9, 2) if price else None

    # RAM/storage and colour
    # NOTE(review): both columns are read from the same "selection" span and
    # therefore always hold the same value; confirm the intended selectors.
    selection = _span_text(item, "selection")
    if selection is None:
        selection = "N/A"

    return {
        "Title": title,
        "Price": price,
        "Rating": rating,
        "Number_of_Reviews": number_of_reviews,
        "Availability": availability,
        "Brand": brand,
        "Discount_Price": discount_price,
        "Link": _extract_link(item),
        "RAM_Storage": selection,
        "Color": selection,
    }


# Main Function
if __name__ == "__main__":
    # Two-stage pipeline: first download the search-result pages to disk,
    # then turn whatever was saved into a CSV file.
    scrape_amazon_pages(pages=3, query="laptops")
    parse_saved_pages()


Scraping page 1...
Saved page 1 to 'data/page_1.html'
Scraping page 2...
Saved page 2 to 'data/page_2.html'
Scraping page 3...
Saved page 3 to 'data/page_3.html'


In [None]:
# Read a saved page back and print its raw HTML. The scraper writes its files
# as UTF-8, so decode with the same encoding instead of the platform default.
# NOTE(review): the scraper saves to 'data/page_1.html'; this path is relative
# to the current working directory -- confirm the file really lives here.
with open('page_1.html', 'r', encoding='utf-8') as file:
    data = file.read()
# Print after the file has been closed; the handle is no longer needed.
print(data)