# Bulk re-parse URLs into To Be Communicated

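Paste a list of Idealista URLs below, one per line. This notebook re-scrapes each listing concurrently and upserts it into the DB with stage `to_be_communicated`: new URLs are inserted at the end of that stage, and existing rows are updated with fresh data and, if they are currently in another stage, moved to the end of `to_be_communicated`. URLs that fail to scrape are reported and skipped.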

In [None]:
# Input: one URL per line, each as a quoted string followed by a comma.
# Surrounding whitespace, blank entries, and repeated URLs are cleaned up
# by the next cell before scraping.
urls = [
    # "https://www.idealista.com/inmueble/12345678/",
]

# Optional: cap concurrency (number of worker threads used while scraping).
max_workers = 5


In [None]:
import asyncio
from concurrent.futures import ThreadPoolExecutor, as_completed
from sqlalchemy import func
from webapp.database.database import SessionLocal
from webapp.database.models import Listing
from webapp.services.scraper_service import parse_idealista_url

# Stage name written to every listing handled by this notebook.
STAGE_TO_BE_COMMUNICATED = "to_be_communicated"

# Trim each entry, discard empty ones, and drop repeats while keeping the
# first-seen order (dict keys preserve insertion order).
unique_urls = list(
    dict.fromkeys(
        stripped
        for stripped in (raw.strip() for raw in urls if raw)
        if stripped
    )
)
if not unique_urls:
    raise ValueError("No URLs provided")

def _parse_sync(url: str) -> dict:
    """Scrape a single listing URL and block until the result is ready.

    ``parse_idealista_url`` is a coroutine function (it reportedly uses a
    synchronous client internally), so it is driven to completion with
    ``asyncio.run``. Calling this from a worker thread keeps the main loop
    responsive.
    """
    pending_parse = parse_idealista_url(url)
    return asyncio.run(pending_parse)

def scrape_all(url_list, workers=5, parse_fn=None):
    """Scrape every URL concurrently and report one outcome per URL.

    Args:
        url_list: Iterable of listing URLs to scrape.
        workers: Maximum number of threads scraping at the same time.
        parse_fn: Optional callable that takes one URL and returns the parsed
            data. Defaults to ``_parse_sync``.

    Returns:
        A list of ``(url, data, error)`` tuples in the same order as
        ``url_list``. ``error`` is ``None`` when parsing succeeded; ``data``
        is ``None`` when parsing raised, in which case ``error`` holds the
        exception.
    """
    parser = _parse_sync if parse_fn is None else parse_fn
    results = []
    with ThreadPoolExecutor(max_workers=workers) as executor:
        # Submit everything up front so the work still runs in parallel.
        submitted = [(url, executor.submit(parser, url)) for url in url_list]
        # Collect in submission order rather than completion order, so the
        # output (and anything derived from its order, such as positions
        # assigned later) follows the caller's list instead of thread timing.
        for url, future in submitted:
            try:
                results.append((url, future.result(), None))
            except Exception as exc:
                # A failure for one URL must not abort the rest of the batch.
                results.append((url, None, exc))
    return results

# Announce the batch size, then scrape every de-duplicated URL; `scraped`
# receives one (url, data, error) tuple per URL.
print(
    f"Scraping {len(unique_urls)} unique URLs with {max_workers} workers..."
)
scraped = scrape_all(
    unique_urls,
    workers=max_workers,
)

# Upsert every successfully scraped listing into the to_be_communicated stage.
db = SessionLocal()
try:
    # The value stored in `idealista_url` is the one reported by the scraper
    # (falling back to the pasted URL), so it can differ from what was pasted.
    # Look rows up by both spellings so that a re-run updates the stored row
    # instead of inserting a duplicate.
    reported_urls = [
        data["idealista_url"]
        for _url, data, _err in scraped
        if data and data.get("idealista_url")
    ]
    lookup_urls = list(dict.fromkeys([*unique_urls, *reported_urls]))

    existing = db.query(Listing).filter(Listing.idealista_url.in_(lookup_urls)).all()
    existing_map = {row.idealista_url: row for row in existing}

    # New arrivals in the stage are appended after its current last position.
    max_pos = db.query(func.max(Listing.position)).filter(
        Listing.stage == STAGE_TO_BE_COMMUNICATED
    ).scalar() or 0

    created = 0
    updated = 0
    failed = 0

    for url, data, err in scraped:
        if err or not data:
            failed += 1
            # A scrape can also "fail" by returning nothing without raising.
            reason = err if err is not None else "no data returned"
            print(f"FAIL {url}: {reason}")
            continue

        stored_url = data.get("idealista_url") or url
        row = existing_map.get(url) or existing_map.get(stored_url)
        if row:
            # NOTE(review): unlike the create path below, this writes every
            # key the scraper returns -- confirm they are all Listing columns.
            for key, value in data.items():
                if key == "idealista_url" and not value:
                    # Never blank out the stored URL; the create path applies
                    # the same fallback to the pasted URL.
                    continue
                setattr(row, key, value)
            if row.stage != STAGE_TO_BE_COMMUNICATED:
                # Rows coming from another stage go to the end of this one;
                # rows already in the stage keep their position.
                max_pos += 1
                row.stage = STAGE_TO_BE_COMMUNICATED
                row.position = max_pos
            row.source = "bulk_import"
            updated += 1
        else:
            max_pos += 1
            row = Listing(
                title=data.get("title"),
                price=data.get("price"),
                price_value=data.get("price_value"),
                rooms=data.get("rooms"),
                size=data.get("size"),
                floor=data.get("floor"),
                description=data.get("description"),
                thumbnail=data.get("thumbnail"),
                idealista_url=stored_url,
                stage=STAGE_TO_BE_COMMUNICATED,
                position=max_pos,
                source="bulk_import",
            )
            db.add(row)
            created += 1

        # Remember the row under both spellings so a later entry in this batch
        # that resolves to the same listing updates it instead of duplicating.
        existing_map.setdefault(url, row)
        existing_map.setdefault(stored_url, row)

    db.commit()
    print(f"Done. Created: {created}, Updated: {updated}, Failed: {failed}")
except Exception:
    # Discard the partial batch explicitly before propagating the error.
    db.rollback()
    raise
finally:
    db.close()
