In [None]:
import time
import random
import logging
import csv  
import json
import re
from urllib.parse import urljoin, urlparse, parse_qs, urlencode
from typing import List, Dict, Optional, Set
import requests
from bs4 import BeautifulSoup

# vinted_scraper.py
# Requires: requests, bs4
# Optional: selenium (if dynamic rendering required)


# Reuse notebook variables if present
BASE_URL = 'https://www.vinted.fr'
START_URL = f'{BASE_URL}/vetements?search_text=jean'
USER_AGENT = 'Mozilla/5.0 (compatible; VintedScraper/1.0; +https://example.com/bot)'
session = requests.Session()
session.headers.update({'User-Agent': USER_AGENT})
rp = None
REQUEST_DELAY = (1.0, 3.0)
MAX_PAGES = 50
LOG_LEVEL = 20
OUTPUT_CSV = 'vinted_products.csv'
OUTPUT_JSONL = 'vinted_products.jsonl'

logging.basicConfig(level=LOG_LEVEL, format='%(asctime)s %(levelname)s %(message)s')
logger = logging.getLogger(__name__)
session.headers.update({'User-Agent': USER_AGENT})

In [4]:
def polite_get(url: str, timeout: int = 15) -> Optional[requests.Response]:
    """Fetch URL while respecting robots.txt (if rp provided) and random delay."""
    if rp is not None:
        try:
            if not rp.can_fetch(USER_AGENT, url):
                logger.warning("Blocked by robots.txt: %s", url)
                return None
        except Exception:
            # If robotparser misconfigured, continue but log
            logger.debug("robots parser error for %s", url)
    try:
        resp = session.get(url, timeout=timeout)
        if resp.status_code == 200:
            # polite delay
            time.sleep(random.uniform(*REQUEST_DELAY))
            return resp
        logger.warning("GET %s -> status %s", url, resp.status_code)
    except Exception as e:
        logger.exception("Request failed: %s", e)
    return None


def make_page_url(base: str, page: int) -> str:
    """Add or replace page param in a URL's query string."""
    parsed = urlparse(base)
    qs = parse_qs(parsed.query)
    qs['page'] = [str(page)]
    new_query = urlencode({k: v[0] for k, v in qs.items()})
    return parsed._replace(query=new_query).geturl()


PRODUCT_URL_PATTERNS = [r'/items/', r'/item/', r'/v/', r'/listing/']


def looks_like_product(href: str) -> bool:
    if not href:
        return False
    parsed = urlparse(href)
    path = parsed.path
    for p in PRODUCT_URL_PATTERNS:
        if re.search(p, path):
            return True
    return False


def extract_links_from_listing(html: str) -> Set[str]:
    soup = BeautifulSoup(html, 'html.parser')
    links = set()
    for a in soup.find_all('a', href=True):
        href = a['href']
        # normalize relative URLs
        full = urljoin(BASE_URL, href)
        if looks_like_product(full):
            # strip fragment and query params that aren't needed
            u = full.split('#')[0]
            links.add(u)
    return links


def parse_product_page(html: str, url: str) -> Dict:
    soup = BeautifulSoup(html, 'html.parser')
    data = {'url': url}

    # Try JSON-LD first
    try:
        for script in soup.find_all('script', type='application/ld+json'):
            try:
                j = json.loads(script.string or '')
            except Exception:
                continue
            # j might be a list
            items = j if isinstance(j, list) else [j]
            for item in items:
                if isinstance(item, dict) and item.get('@type', '').lower() == 'product':
                    data['title'] = item.get('name') or data.get('title')
                    data['description'] = item.get('description') or data.get('description')
                    images = item.get('image')
                    if isinstance(images, list):
                        data['images'] = images
                    elif isinstance(images, str):
                        data['images'] = [images]
                    offers = item.get('offers') or {}
                    if isinstance(offers, dict):
                        data['price'] = offers.get('price')
                        data['currency'] = offers.get('priceCurrency')
                    # found JSON-LD product; good enough
                    break
            if 'title' in data:
                break
    except Exception:
        logger.debug("JSON-LD parse error", exc_info=True)

    # Fallback to meta tags
    if 'title' not in data:
        og_title = soup.find('meta', property='og:title')
        if og_title and og_title.get('content'):
            data['title'] = og_title['content']
    if 'description' not in data:
        og_desc = soup.find('meta', property='og:description')
        if og_desc and og_desc.get('content'):
            data['description'] = og_desc['content']
    if 'images' not in data:
        og_image = soup.find('meta', property='og:image')
        if og_image and og_image.get('content'):
            data['images'] = [og_image['content']]

    # price meta
    if 'price' not in data:
        price_meta = soup.find('meta', attrs={'property': 'product:price:amount'})
        if price_meta and price_meta.get('content'):
            data['price'] = price_meta['content']
    if 'currency' not in data:
        cur_meta = soup.find('meta', attrs={'property': 'product:price:currency'})
        if cur_meta and cur_meta.get('content'):
            data['currency'] = cur_meta['content']

    # Extract textual details heuristically
    try:
        # item id from url digits
        m = re.search(r'(\d{4,})', url)
        if m:
            data['item_id'] = m.group(1)
    except Exception:
        pass

    # images from gallery
    imgs = []
    for img in soup.select('img'):
        src = img.get('src') or img.get('data-src') or img.get('data-lazy-src')
        if src and src.startswith('http'):
            imgs.append(src)
    if imgs:
        data.setdefault('images', imgs)

    # Seller info heuristics
    seller = {}
    seller_a = soup.find('a', href=re.compile(r'/membre/|/member/'))
    if seller_a:
        seller['name'] = seller_a.get_text(strip=True)
        seller['url'] = urljoin(BASE_URL, seller_a['href'])
    if seller:
        data['seller'] = seller

    # Additional fields by selectors (best-effort)
    # price text
    if 'price' not in data:
        price_el = soup.select_one('[data-testid="price"], .price, .ProductPrice, .item-price')
        if price_el:
            data['price'] = price_el.get_text(strip=True)
    # size, brand, condition - try to find label/value pairs
    attrs = {}
    for li in soup.select('.item-attributes li, .details-list li, .CharacteristicList li'):
        txt = li.get_text(separator=' ', strip=True)
        if ':' in txt:
            k, v = txt.split(':', 1)
            attrs[k.strip().lower()] = v.strip()
    if attrs:
        data.setdefault('attributes', attrs)
    # Extract brand and condition from attributes
    for key in ['marque', 'brand']:
        if key in attrs:
            data['brand'] = attrs[key]
            break
    for key in ['Ã©tat', 'etat', 'condition']:
        if key in attrs:
            data['condition'] = attrs[key]
            break

    data['scraped_at'] = time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime())
    return data


def save_results(items: List[Dict], csv_path: str = OUTPUT_CSV, jsonl_path: str = OUTPUT_JSONL):
    # JSONL
    with open(jsonl_path, 'w', encoding='utf-8') as f:
        for it in items:
            f.write(json.dumps(it, ensure_ascii=False) + '\n')
    # CSV (flatten some fields)
    fieldnames = [
        'item_id', 'title', 'price', 'currency', 'url', 'seller', 'images', 'description', 'brand', 'condition', 'scraped_at'
    ]
    with open(csv_path, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        for it in items:
            row = {
                'item_id': it.get('item_id'),
                'title': it.get('title'),
                'price': it.get('price'),
                'currency': it.get('currency'),
                'url': it.get('url'),
                'seller': it.get('seller', {}).get('name') if isinstance(it.get('seller'), dict) else '',
                'images': ';'.join(it.get('images', [])) if it.get('images') else '',
                'description': it.get('description', '')[:1000],
                'brand': it.get('brand'),
                'condition': it.get('condition'),
                'scraped_at': it.get('scraped_at'),
            }
            writer.writerow(row)


def scrape_vinted(start_url: str = START_URL, max_pages: int = MAX_PAGES) -> List[Dict]:
    all_products = []
    seen_urls = set()
    for page in range(1, max_pages + 1):
        page_url = make_page_url(start_url, page)
        logger.info("Fetching listing page %d: %s", page, page_url)
        resp = polite_get(page_url)
        if not resp:
            logger.info("Stopping: failed to fetch page %d", page)
            break
        links = extract_links_from_listing(resp.text)
        links = [l for l in links if l not in seen_urls]
        logger.info("Found %d new product links on page %d", len(links), page)
        if not links:
            # nothing new, stop
            break
        for prod_url in links:
            seen_urls.add(prod_url)
            logger.info("Fetching product: %s", prod_url)
            presp = polite_get(prod_url)
            if not presp:
                logger.warning("Failed product fetch: %s", prod_url)
                continue
            pdata = parse_product_page(presp.text, prod_url)
            all_products.append(pdata)
        # quick stop if reached many products (optional)
    logger.info("Scraped %d products", len(all_products))
    return all_products

In [5]:
items = scrape_vinted()
save_results(items)
logger.info("Saved %d items to %s and %s", len(items), OUTPUT_CSV, OUTPUT_JSONL)

2025-12-19 10:44:20,971 INFO Fetching listing page 1: https://www.vinted.fr/vetements?search_text=jean&page=1
2025-12-19 10:44:25,968 INFO Found 96 new product links on page 1
2025-12-19 10:44:25,969 INFO Fetching product: https://www.vinted.fr/items/7786583829-jean-ultra-baggy-noir-patchwork-motif-leopard-streetwear-2000-style-japan-m?referrer=catalog
2025-12-19 10:44:29,801 INFO Fetching product: https://www.vinted.fr/items/7788446935-jean-patte-deph-36?referrer=catalog
2025-12-19 10:44:32,657 INFO Fetching product: https://www.vinted.fr/items/7787065698-jean-femme-pimkie-taille-34?referrer=catalog
2025-12-19 10:44:34,718 INFO Fetching product: https://www.vinted.fr/items/7787033480-jeans?referrer=catalog
2025-12-19 10:44:36,725 INFO Fetching product: https://www.vinted.fr/items/7787776012-jean-flare-noir-stradivarius?referrer=catalog
2025-12-19 10:44:40,353 INFO Fetching product: https://www.vinted.fr/items/7787443203-jean-bleu-patte-delephant-zara-40?referrer=catalog
2025-12-19 10: