In [12]:
import time
import re
import csv
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup

# ---------------- CONFIG ----------------
# Listing page that the browser opens before scraping starts.
URL = "https://in.puma.com/in/en/womens/womens-shoes"
# Output file; write_csv() opens it in "w" mode, so each call replaces it.
CSV_FILE = "puma_womens_shoes.csv"
# The scrape loop ends once this many unique products have been collected.
MAX_PRODUCTS = 100
# Seconds to sleep after each scroll step before re-reading the page.
SCROLL_PAUSE = 2

# ---------------- HELPERS ----------------
def clean_price(price):
    """Convert displayed price text to an integer.

    Every non-digit character (currency symbol, separators, whitespace) is
    removed and the remaining digits are parsed as a single integer. Note
    that a decimal point is removed as well, so "4,999.50" becomes 499950.

    Args:
        price: Price text such as "₹4,999", or None.

    Returns:
        The digits of ``price`` as an int, or None when ``price`` is None,
        empty, or contains no digits at all.
    """
    if not price:
        return None
    digits = re.sub(r"\D", "", price)
    # int("") raises ValueError, so digit-free text (e.g. "N/A") maps to None.
    return int(digits) if digits else None

def parse_product(card):
    """Extract one product's fields from a product-card element.

    Args:
        card: An element exposing ``select_one(css_selector)``; the caller
            passes the BeautifulSoup tag of one product list item.

    Returns:
        A dict with the keys ``product_name``, ``sale_price``,
        ``original_price``, ``discount_percentage``, ``discount_value``,
        ``image_url``, ``total_colours`` and ``absolute_url``. A field whose
        element (or attribute) is missing is None, except the discount
        fields, which fall back to "0%" and 0.
    """
    # Function-scope import keeps this block independent of the file header.
    from urllib.parse import urljoin

    name = card.select_one("h3")
    sale = card.select_one("[data-test-id='sale-price']")
    original = card.select_one("[data-test-id='price']")
    discount = card.select_one("[data-test-id='product-badge-sale']")
    img = card.select_one("img")
    link = card.select_one("a[href]")
    colors = card.select_one("button")

    sale_price = clean_price(sale.text) if sale is not None else None
    original_price = clean_price(original.text) if original is not None else None

    # ---- DISCOUNT HANDLING ----
    # A discount is reported only when both prices exist and the sale price
    # is strictly lower than the original one.
    if sale_price and original_price and original_price > sale_price:
        discount_value = original_price - sale_price
        if discount is not None:
            discount_percentage = discount.text.strip()
        else:
            # No badge on the card: derive the percentage from the prices
            # instead of reporting "0%" next to a non-zero discount_value.
            # NOTE(review): the badge's own text format is not visible here;
            # confirm the computed "NN%" form matches it.
            discount_percentage = f"{round(discount_value * 100 / original_price)}%"
    else:
        discount_value = 0
        discount_percentage = "0%"

    # First run of digits in the button text, kept as a string.
    colour_match = re.search(r"\d+", colors.text) if colors is not None else None

    return {
        "product_name": name.text.strip() if name is not None else None,
        "sale_price": sale_price,
        "original_price": original_price,
        "discount_percentage": discount_percentage,
        "discount_value": discount_value,
        # .get() avoids a KeyError for an <img> that has no src attribute.
        "image_url": img.get("src") if img is not None else None,
        "total_colours": colour_match.group() if colour_match else None,
        # urljoin resolves relative hrefs against the site root and leaves
        # already-absolute hrefs unchanged (no doubled host).
        "absolute_url": (
            urljoin("https://in.puma.com", link["href"])
            if link is not None else None
        ),
    }

def write_csv(data, path=None):
    """Write product rows to a CSV file, replacing its previous contents.

    Args:
        data: Sequence of dicts; the keys of the first row become the
            header, and every row is written with ``csv.DictWriter``.
        path: Destination file. Defaults to the module-level ``CSV_FILE``.

    Returns:
        None. When ``data`` is empty nothing is written and any existing
        file is left untouched, because there is no row to take the header
        from.
    """
    if not data:
        return
    target = CSV_FILE if path is None else path
    # newline="" lets the csv module control line endings itself.
    with open(target, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=list(data[0].keys()))
        writer.writeheader()
        writer.writerows(data)

# ---------------- DRIVER SETUP ----------------
# Start Chrome without a visible window, at a fixed desktop-sized viewport.
options = Options()
for chrome_flag in (
    "--headless",
    "--window-size=1920,1080",
    # NOTE(review): presumably meant to make the automated browser less
    # detectable by the site; confirm it is still needed.
    "--disable-blink-features=AutomationControlled",
):
    options.add_argument(chrome_flag)

# webdriver_manager downloads (or reuses) a chromedriver binary and returns
# its path for the Service.
driver = webdriver.Chrome(
    service=Service(ChromeDriverManager().install()),
    options=options
)

try:
    driver.get(URL)
    time.sleep(6)  # initial load
except BaseException:
    # Do not leave an orphaned browser behind if navigation fails or the
    # wait is interrupted; the error itself is re-raised unchanged.
    driver.quit()
    raise

# ---------------- SCRAPE LOOP ----------------
# Repeatedly parse the rendered page, then scroll down to trigger loading of
# more product cards, until MAX_PRODUCTS unique products are collected or
# several consecutive passes add nothing new.
products = {}  # absolute_url -> product dict; the URL key de-duplicates cards
previous_count = 0
idle_scrolls = 0
MAX_IDLE_SCROLLS = 3  # consecutive passes without new products before stopping

try:
    while len(products) < MAX_PRODUCTS:
        soup = BeautifulSoup(driver.page_source, "html.parser")
        cards = soup.select("li[data-test-id='product-list-item']")

        for card in cards:
            product = parse_product(card)
            # Cards without a link cannot be keyed, so they are skipped.
            if product["absolute_url"]:
                products[product["absolute_url"]] = product
            if len(products) >= MAX_PRODUCTS:
                break

        current_count = len(products)
        print(f"Scraped so far: {current_count}")

        # Save after each scrape so partial results survive a later failure.
        # Skipped while nothing has been collected: there is no row to take
        # the CSV header from.
        if products:
            write_csv(list(products.values()))

        # Stop if no new products
        if current_count == previous_count:
            idle_scrolls += 1
            if idle_scrolls >= MAX_IDLE_SCROLLS:
                print("No new products loading. Stopping.")
                break
        else:
            idle_scrolls = 0

        previous_count = current_count

        # Scroll
        driver.execute_script("window.scrollBy(0, 1200);")
        time.sleep(SCROLL_PAUSE)
finally:
    # Always close the browser, including when parsing or scrolling raises.
    driver.quit()

# ---------------- FINAL OUTPUT ----------------
# Print a three-line summary of the run, preceded by a blank line.
summary_lines = (
    "\n========== SCRAPING COMPLETE ==========",
    f"Total products scraped: {len(products)}",
    f"CSV saved as: {CSV_FILE}",
)
print("\n".join(summary_lines))


Scraped so far: 24
Scraped so far: 24
Scraped so far: 48
Scraped so far: 48
Scraped so far: 72
Scraped so far: 72
Scraped so far: 72
Scraped so far: 96
Scraped so far: 96
Scraped so far: 96
Scraped so far: 100

Total products scraped: 100
CSV saved as: puma_womens_shoes.csv
