<a href="https://colab.research.google.com/github/adienlopez/EAN13-Barcode-Generator/blob/main/Webuy_Complete_DVD_Listing_Scraper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Final Webuy DVD scraper with direct image scraping (real folder names)

!pip install selenium gspread google-auth oauth2client --quiet

import time
import re
import random
import gspread
from google.colab import auth
from google.auth import default
from urllib.parse import urlparse, parse_qs, quote
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# === CONFIGURATION ===
# Key of the Google Sheets document that receives the scraped rows, and the
# name of the worksheet (tab) inside it.
SPREADSHEET_ID = "1rRa8QKtkE0NmN3DVF2SeUJfMlvkW51YribIsTtscb68"
SHEET_NAME = "Sheet1"
# Prompted interactively; int() raises ValueError if the reply is not an integer.
start_year = int(input("🔢 Enter the start year (e.g. 2005): "))
# range() excludes its end value, so the last year scraped is 2025.
YEARS = list(range(start_year, 2026))
# Values substituted into the "Age+Rating+(BBFC)" search query parameter.
AGE_RATINGS = ["U", "PG", "12", "15", "18", "E", "tc"]
# Number of additional attempts made for a product page after its first failure.
MAX_RETRIES = 2

# === GOOGLE SHEETS SETUP ===
# Authenticate the Colab user, then load the resulting default credentials
# (the second value returned by default() is not needed here).
auth.authenticate_user()
creds, _ = default()
gc = gspread.authorize(creds)
sheet = gc.open_by_key(SPREADSHEET_ID).worksheet(SHEET_NAME)
# Column order must match the row list returned by scrape_product_data().
headers = ["Title", "Product ID", "Category", "Super Category", "Price", "Image URL"]
# NOTE: every run starts by wiping the worksheet before writing the header
# row, so results from a previous run are not kept.
sheet.clear()
sheet.append_row(headers)

# === SELENIUM SETUP ===
def create_driver():
    """Create a headless Chrome WebDriver configured for scraping.

    The browser is started with a fixed 1920x1080 window, a custom user-agent
    string and the ``AutomationControlled`` Blink feature disabled.  A script
    that makes ``navigator.webdriver`` read as ``undefined`` is registered so
    that it runs at the start of every document the driver loads.

    Returns:
        A started ``webdriver.Chrome`` instance.  The caller is responsible
        for calling ``quit()`` on it.
    """
    options = Options()
    for flag in (
        "--headless",
        "--no-sandbox",
        "--disable-dev-shm-usage",
        "--disable-gpu",
        "--window-size=1920,1080",
        "--disable-blink-features=AutomationControlled",
        "--user-agent=Mozilla/5.0",
    ):
        options.add_argument(flag)

    driver = webdriver.Chrome(options=options)

    # execute_script() would only patch the document that is loaded at call
    # time, so the override would vanish on the first driver.get().
    # Registering the script through the DevTools protocol makes Chrome
    # evaluate it before any page script on every new document.
    driver.execute_cdp_cmd(
        "Page.addScriptToEvaluateOnNewDocument",
        {"source": "Object.defineProperty(navigator, 'webdriver', {get: () => undefined})"},
    )
    return driver

# === FALLBACK URL BUILDER ===
def construct_image_url(product_id, category_name):
    """Build the fallback large-image URL for a product.

    Args:
        product_id: Product identifier; it is formatted into the file name
            as ``<product_id>_l.jpg``.
        category_name: Category name used as the image folder.  It is
            percent-encoded with no characters treated as safe, so ``/`` is
            escaped as well.

    Returns:
        The image URL as a string.
    """
    base = "https://uk.static.webuy.com/product_images/DVD"
    folder = quote(category_name, safe="")
    file_name = f"{product_id}_l.jpg"
    return "/".join((base, folder, file_name))

# === PRODUCT LINKS ===
def get_product_links(driver):
    """Collect the product-detail links on the page the driver has loaded.

    Waits up to 15 seconds for at least one anchor whose ``href`` contains
    ``product-detail`` to be present, then gathers the ``href`` of every such
    anchor.

    Args:
        driver: Selenium WebDriver already navigated to a results page.

    Returns:
        A list of unique, non-empty href strings in the order they appear on
        the page.  An empty list is returned when the wait times out or any
        other error occurs while reading the page.
    """
    selector = "a[href*='product-detail']"
    try:
        WebDriverWait(driver, 15).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, selector))
        )
        hrefs = (
            anchor.get_attribute("href")
            for anchor in driver.find_elements(By.CSS_SELECTOR, selector)
        )
        # dict.fromkeys() de-duplicates while keeping first-seen order, so
        # products are visited in a reproducible page order.
        return list(dict.fromkeys(href for href in hrefs if href))
    except Exception:
        # Best effort: "no links" tells the caller to stop paginating.
        # Unlike a bare ``except:``, this lets KeyboardInterrupt and
        # SystemExit propagate so a long run can still be stopped.
        return []

# === SCRAPER LOGIC ===
def extract_product_id(url):
    """Return the first ``id`` query-string value of *url*.

    Returns ``None`` when the URL has no ``id`` parameter or the parameter
    has a blank value.
    """
    params = parse_qs(urlparse(url).query)
    id_values = params.get("id")
    if not id_values:
        return None
    return id_values[0]

# Matches a pound-sterling amount such as "£3" or "£3.50".  The previous
# pattern doubled the backslashes inside a raw string, so it looked for a
# literal backslash and never matched a real price.
_PRICE_PATTERN = re.compile(r'£\d+(\.\d{2})?')


def _scrape_product_once(driver, url):
    """Load *url* a single time and return its row; errors propagate to the caller."""
    driver.get(url)
    WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.TAG_NAME, "main")))
    # Randomised pause after the page's <main> element is present.
    time.sleep(random.uniform(1.5, 3.0))

    product_id = extract_product_id(url)

    # Try the title selectors in order and keep the first non-empty text.
    # An element with empty text no longer overwrites the "N/A" default.
    title = "N/A"
    for sel in [".product-title", "h1", ".title"]:
        try:
            text = driver.find_element(By.CSS_SELECTOR, sel).text.strip()
        except Exception:
            continue
        if text:
            title = text
            break

    # The price is the first "£<amount>" found in any element whose own text
    # contains a pound sign.
    price = "N/A"
    for elem in driver.find_elements(By.XPATH, "//*[contains(text(), '£')]"):
        match = _PRICE_PATTERN.search(elem.text.strip())
        if match:
            price = match.group()
            break

    # Category information is taken from the product URL's query string.
    query = parse_qs(urlparse(url).query)
    cat = query.get("categoryName", ["Unknown"])[0]
    supercat = query.get("superCatName", ["Unknown"])[0]

    # Prefer the image actually shown on the page; fall back to a URL built
    # from the product id and category when no such image can be read.
    try:
        image_element = driver.find_element(By.CSS_SELECTOR, "img[src*='product_images']")
        image_url = image_element.get_attribute("src")
    except Exception:
        image_url = construct_image_url(product_id, cat)

    return [title, product_id, cat, supercat, price, image_url]


def scrape_product_data(driver, url, retries=0):
    """Scrape one product page into a spreadsheet row, retrying on failure.

    Args:
        driver: Selenium WebDriver used to load the page.
        url: Product-detail URL.  The product id, category and super category
            are read from its query string.
        retries: Number of attempts already used up; callers normally leave
            this at 0.

    Returns:
        ``[title, product_id, category, super_category, price, image_url]``,
        where title and price are ``"N/A"`` when they cannot be found, or
        ``None`` if the page still fails after ``MAX_RETRIES`` further
        attempts.  A 10 second pause separates attempts.
    """
    attempt = retries
    while True:
        try:
            return _scrape_product_once(driver, url)
        except Exception:
            if attempt >= MAX_RETRIES:
                return None
            attempt += 1
            time.sleep(10)

# === MAIN LOOP ===
# Walk every (year, rating) combination, page through the search results,
# scrape each product found and append the rows to the sheet in batches.
driver = create_driver()
row_buffer = []
scraped_count = 0

try:
    for year in YEARS:
        for rating in AGE_RATINGS:
            print(f"🟢 Scraping DVDs for Year: {year}, Rating: {rating}")
            base_url = f"https://uk.webuy.com/search?stext=dvd&Year={year}&Age+Rating+(BBFC)={rating}"
            for page in range(1, 61):
                page_url = base_url + f"&page={page}"
                try:
                    driver.get(page_url)
                    time.sleep(random.uniform(1.5, 3.5))
                    product_links = get_product_links(driver)
                    print(f"   🔍 Page {page} - Found {len(product_links)} product links")
                    # A page with no links ends pagination for this combination.
                    if not product_links:
                        break
                    for link in product_links:
                        data = scrape_product_data(driver, link)
                        if data:
                            row_buffer.append(data)
                            scraped_count += 1
                    # Write in batches of at least 50 rows to limit Sheets API calls.
                    if len(row_buffer) >= 50:
                        sheet.append_rows(row_buffer, value_input_option="USER_ENTERED")
                        print(f"  ✅ Wrote {len(row_buffer)} rows | Total scraped: {scraped_count}")
                        row_buffer.clear()
                    # Longer pause after every tenth results page.
                    if page % 10 == 0:
                        time.sleep(random.uniform(8, 15))
                except Exception as e:
                    # Skip this results page after a cool-down; rows already
                    # buffered stay in row_buffer for the next flush.
                    print(f"  ⚠️ Error on page {page}: {e}")
                    time.sleep(30)

            if row_buffer:
                sheet.append_rows(row_buffer, value_input_option="USER_ENTERED")
                print(f"  ✅ Wrote remaining {len(row_buffer)} rows for Year {year}, Rating {rating}")
                row_buffer.clear()
finally:
    # Runs on normal completion, on an uncaught error and on a manual
    # interrupt.  The buffer is empty after a normal run, so the flush only
    # fires when the loop was cut short; the inner finally guarantees the
    # Chrome process is shut down even if that last write fails.
    try:
        if row_buffer:
            sheet.append_rows(row_buffer, value_input_option="USER_ENTERED")
            print(f"  ✅ Wrote {len(row_buffer)} buffered rows before shutdown")
            row_buffer.clear()
    finally:
        driver.quit()

print(f"🎉 DONE: {scraped_count} DVD items scraped and saved to Google Sheets.")


🔢 Enter the start year (e.g. 2005): 2024
🟢 Scraping DVDs for Year: 2024, Rating: U
   🔍 Page 1 - Found 14 product links
