# Idealista Scraper Debug - Scrapfly

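This notebook uses Scrapfly to fetch Idealista search pages past DataDome's anti-bot protection, then parses the listings, tracks which listing URLs have already been seen, and checks pagination.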

In [None]:
# Install Scrapfly SDK
!pip install scrapfly-sdk beautifulsoup4 -q

In [1]:
import json
import os
from urllib.parse import urljoin

from bs4 import BeautifulSoup
from scrapfly import ScrapflyClient, ScrapeConfig, ScrapeApiResponse

## 1. Initialize Scrapfly Client

In [2]:
# Read the API key from the environment so the secret never lands in the
# notebook file or its saved outputs. Set it before starting Jupyter, e.g.
#   export SCRAPFLY_API_KEY="scp-live-..."
# NOTE(review): a live key used to be hardcoded in this cell; treat that key as
# leaked and rotate it in the Scrapfly dashboard.
SCRAPFLY_API_KEY = os.environ.get("SCRAPFLY_API_KEY", "").strip()
if not SCRAPFLY_API_KEY:
    # Fail here with a clear message rather than later with an opaque auth error.
    raise RuntimeError(
        "SCRAPFLY_API_KEY is not set; export it in the environment before running this notebook"
    )

scrapfly = ScrapflyClient(key=SCRAPFLY_API_KEY)
print("Scrapfly client initialized!")

Scrapfly client initialized!


## 2. Build Search URL

In [3]:
# Search-results page on idealista.com (Spain) for Barcelona rentals.
# The last path segment carries the price filters
# (`precio-hasta_2400`, `precio-desde_1000`).
SEARCH_URL = (
    "https://www.idealista.com/alquiler-viviendas/barcelona-barcelona/"
    "con-precio-hasta_2400,precio-desde_1000/"
)

print("Target URL:", SEARCH_URL, sep="\n")

Target URL:
https://www.idealista.com/alquiler-viviendas/barcelona-barcelona/con-precio-hasta_2400,precio-desde_1000/


## 3. Scrape with Anti-Bot Bypass

In [None]:
# Fetch the search page through Scrapfly with ASP (Anti Scraping Protection)
# enabled, country set to "ES" and JavaScript rendering switched on.
search_config = ScrapeConfig(
    url=SEARCH_URL,
    asp=True,
    country="ES",
    render_js=True,
)
result = scrapfly.scrape(search_config)

# Quick sanity summary of the response before parsing it.
print(f"Status: {result.upstream_status_code}")
print(f"Success: {result.success}")
print(f"Content length: {len(result.content)} chars")

CRITICAL:root:<-- 200 | ERR::SCRAPE::BAD_UPSTREAM_RESPONSE - The website you target respond with an unexpected status code (>400) - The scrapped url: https://www.idealista.com/alquiler-viviendas/barcelona-barcelona/con-precio-hasta_2400,precio-desde_1000/ respond with 403 - Forbidden: . Checkout the related doc: https://scrapfly.io/docs/scrape-api/error/ERR::SCRAPE::BAD_UPSTREAM_RESPONSE


UpstreamHttpClientError: Target website responded with 403 - Forbidden

In [5]:
# Parse the fetched HTML and print its <title>; a real results page (rather
# than a block/challenge page) is recognisable from the title text.
soup = BeautifulSoup(result.content, "html.parser")
title = soup.find("title")
page_title = title.get_text() if title else "N/A"
print(f"Page Title: {page_title}")

Page Title: Casas y pisos en alquiler en Barcelona — idealista


## 4. Find Listing Elements

In [6]:
# Collect the listing cards: <article> elements carrying the "item" class.
articles = soup.find_all("article", class_="item")
print(f"Found {len(articles)} listings with class 'item'")

if len(articles) == 0:
    # Nothing matched: fall back to every <article> and show the classes of the
    # first few so the right selector can be worked out by eye.
    articles = soup.find_all("article")
    print(f"Total articles found: {len(articles)}")

    for idx in range(min(5, len(articles))):
        print(f"  Article {idx}: class={articles[idx].get('class', [])}")

Found 30 listings with class 'item'


In [7]:
# Dump the beginning of the first listing's pretty-printed markup (capped at
# 2000 characters) to inspect which tags/classes hold the fields we need.
if articles:
    first_article_html = articles[0].prettify()
    print("First listing HTML:", "-" * 60, first_article_html[:2000], sep="\n")

First listing HTML:
------------------------------------------------------------
<article class="item extended-item item-multimedia-container" data-element-id="110429075" data-online-booking="false">
 <picture class="item-multimedia">
  <div class="item-multimedia-pictures">
   <div class="item-multimedia-pictures__container">
    <div class="item-multimedia-shortcuts --desktop">
     <button aria-label="Abrir mapa" class="multimedia-shortcut icon-location-outline" data-button-type="MAP">
     </button>
    </div>
    <span class="item-multimedia-pictures__ref-tag d-none">
    </span>
    <div class="item-multimedia-pictures__counter">
     <span>
      1/
     </span>
     <span>
      19
     </span>
    </div>
   </div>
  </div>
  <div class="item-ribbon-container">
  </div>
  <div class="item-gallery gallery-height-core-vitals neutral-orientation">
   <div class="mask-wrapper is-clickable">
    <div class="mask" style="touch-action: pan-y; user-select: none; -webkit-user-drag: none

## 5. Parse Listings

In [8]:
def parse_listing(article, base_url: str = "https://www.idealista.com") -> dict:
    """Extract the fields of one search-result ``<article>`` into a plain dict.

    Args:
        article: Element exposing BeautifulSoup's ``find`` / ``find_all`` API
            (one listing card from the results page).
        base_url: Origin used to turn relative hrefs into absolute URLs.
            Defaults to the Spanish site.

    Returns:
        dict with these keys:
          * ``price``, ``rooms``, ``size``, ``floor`` - always present, ``"N/A"``
            when the element is missing.
          * ``description`` - always present, ``""`` when missing.
          * ``title`` - only when an ``a.item-link`` element exists.
          * ``url`` - only when that link has a non-empty ``href``; callers use
            its absence to drop unusable cards.
          * ``thumbnail`` - only when the card contains an ``<img>``.
    """
    data = {}

    # Link and title
    link_elem = article.find("a", class_="item-link")
    if link_elem:
        href = (link_elem.get("href") or "").strip()
        if href:
            # urljoin copes with absolute, root-relative ("/inmueble/1/") and
            # protocol-relative ("//host/...") hrefs; plain concatenation
            # produced malformed URLs for anything but the root-relative form.
            data["url"] = urljoin(base_url, href)
        data["title"] = link_elem.get_text(strip=True)

    # Price
    price_elem = article.find("span", class_="item-price")
    data["price"] = price_elem.get_text(strip=True) if price_elem else "N/A"

    # Details are read positionally: 1st = rooms, 2nd = size, 3rd = floor.
    # NOTE(review): a card that omits one of these spans would shift the
    # remaining values into the wrong keys - confirm against real markup.
    details = [span.get_text(strip=True) for span in article.find_all("span", class_="item-detail")]
    for position, key in enumerate(("rooms", "size", "floor")):
        data[key] = details[position] if position < len(details) else "N/A"

    # Description
    desc_elem = article.find("div", class_="item-description")
    data["description"] = desc_elem.get_text(strip=True) if desc_elem else ""

    # Thumbnail: prefer the eager `src`, fall back to the lazy-load `data-src`.
    img_elem = article.find("img")
    if img_elem:
        data["thumbnail"] = img_elem.get("src") or img_elem.get("data-src") or ""

    return data

# Run the parser over every card, keeping only results that carry a URL.
# A failure on one card is reported and skipped so it cannot abort the batch.
listings = []
for node in articles:
    try:
        parsed = parse_listing(node)
    except Exception as exc:
        print(f"Error parsing: {exc}")
    else:
        if parsed.get("url"):
            listings.append(parsed)

print(f"Successfully parsed {len(listings)} listings")

Successfully parsed 30 listings


In [9]:
# Pretty-print the first five parsed listings, one labelled field per line.
divider = "=" * 60
field_labels = (
    ("Title", "title"),
    ("Price", "price"),
    ("Rooms", "rooms"),
    ("Size", "size"),
    ("Floor", "floor"),
    ("URL", "url"),
)

for number, entry in enumerate(listings[:5], start=1):
    print(f"\n{divider}")
    print(f"Listing {number}")
    print(divider)
    for label, key in field_labels:
        print(f"{label}: {entry.get(key, 'N/A')}")


Listing 1
Title: Piso en Calle de la Independència, El Camp de l'Arpa del Clot, Barcelona
Price: 2.000€/mes
Rooms: 3 hab.
Size: 93 m²
Floor: Planta 1ª exterior con ascensor
URL: https://www.idealista.com/inmueble/110429075/

Listing 2
Title: Piso en Calle de Neptú, Vila de Gràcia, Barcelona
Price: 1.350€/mes
Rooms: 2 hab.
Size: 70 m²
Floor: Planta 2ª interior sin ascensor
URL: https://www.idealista.com/inmueble/39923316/

Listing 3
Title: Piso en Calle del Peu de la Creu, El Raval, Barcelona
Price: 1.295€/mes
Rooms: 1 hab.
Size: 91 m²
Floor: Bajo exterior con ascensor
URL: https://www.idealista.com/inmueble/110428966/

Listing 4
Title: Piso en Calle de Pere IV, 440, Provençals del Poblenou, Barcelona
Price: 1.900€/mes
Rooms: 1 hab.
Size: 80 m²
Floor: Bajo exterior con ascensor
URL: https://www.idealista.com/inmueble/110378291/

Listing 5
Title: Ático en Calle de Berenguer Mallol, 99, La Barceloneta, Barcelona
Price: 1.300€/mes
Rooms: 3 hab.
Size: 60 m²
Floor: Planta 6ª exterior sin as

## 6. Load/Save Seen Links

In [None]:
# Default location of the text file that stores one already-seen listing URL per line.
SEEN_LINKS_FILE = "data/seen_links.txt"

def load_seen_links(path=None) -> set:
    """Load previously seen listing URLs from a text file.

    Args:
        path: File to read, one URL per line. When omitted, ``SEEN_LINKS_FILE``
            is used.

    Returns:
        Set of the non-blank, whitespace-stripped lines. An empty set when the
        file does not exist yet (treated as a first run, not as an error).
    """
    target = SEEN_LINKS_FILE if path is None else path
    try:
        with open(target, "r", encoding="utf-8") as f:
            stripped = (line.strip() for line in f)
            seen = {url for url in stripped if url}
    except FileNotFoundError:
        print("No seen links file yet, starting fresh")
        return set()
    print(f"Loaded {len(seen)} seen URLs")
    return seen

def save_seen_links(seen: set, path=None) -> None:
    """Write the seen URLs to a text file, one per line, replacing its contents.

    Args:
        seen: URLs to persist.
        path: Destination file. When omitted, ``SEEN_LINKS_FILE`` is used.

    The parent directory is derived from the destination and created if
    missing. URLs are written in sorted order so the file content is
    deterministic from run to run.
    """
    target = SEEN_LINKS_FILE if path is None else path
    parent = os.path.dirname(target)
    if parent:  # a bare filename has no directory component to create
        os.makedirs(parent, exist_ok=True)
    with open(target, "w", encoding="utf-8") as f:
        f.writelines(url + "\n" for url in sorted(seen))
    print(f"Saved {len(seen)} URLs to {target}")

# Read the persisted URLs into memory; an empty set on the very first run.
seen_links = load_seen_links()
print("Currently tracking", len(seen_links), "seen links")

In [None]:
# Split the parsed listings into "new" vs "already seen" and persist the result.
#
# The loaded `seen_links` set is deliberately left untouched: the merged set is
# built in a separate variable so that re-running this cell reports the same
# new listings instead of "0 new" (and an empty debug_listings.json) on the
# second run.
seen_links_updated = set(seen_links)
new_listings = []
for listing in listings:
    url = listing.get("url", "")
    if url and url not in seen_links_updated:
        new_listings.append(listing)
        seen_links_updated.add(url)  # also de-duplicates repeats within this batch

print(f"New listings (not seen before): {len(new_listings)}")
print(f"Already seen: {len(listings) - len(new_listings)}")

# Save updated seen links
save_seen_links(seen_links_updated)

# Save new listings as JSON for inspection (ensure_ascii=False keeps accented
# characters readable in the file).
with open("debug_listings.json", "w", encoding="utf-8") as f:
    json.dump(new_listings, f, indent=2, ensure_ascii=False)
print(f"Saved {len(new_listings)} new listings to debug_listings.json")

## 7. Test Pagination

In [10]:
# Inspect the pagination block of the results page (first 500 characters only).
pagination = soup.find("div", class_="pagination")
if not pagination:
    print("No pagination element found")
else:
    print("Pagination found:")
    print(pagination.prettify()[:500])

# The "next page" anchor is located by its arrow-icon class; its href is
# printed exactly as it appears in the markup.
next_link = soup.find("a", class_="icon-arrow-right-after")
if next_link:
    next_url = next_link.get("href")
    print(f"\nNext page: {next_url}")

Pagination found:
<div class="pagination">
 <ul>
  <li class="moreresults">
   <span>
    Ver más resultados:
   </span>
  </li>
  <li class="selected">
   <span>
    1
   </span>
  </li>
  <li>
   <a class="" href="/alquiler-viviendas/barcelona-barcelona/con-precio-hasta_2400,precio-desde_1000/pagina-2.htm" rel="nofollow">
    2
   </a>
  </li>
  <li>
   <a class="" href="/alquiler-viviendas/barcelona-barcelona/con-precio-hasta_2400,precio-desde_1000/pagina-3.htm" rel="nofollow">
    3
   </a>
  </li>
  <li>
   

Next page: /alquiler-viviendas/barcelona-barcelona/con-precio-hasta_2400,precio-desde_1000/pagina-2.htm


## 8. Scrape Multiple Pages

In [11]:
def scrape_page(url: str) -> list:
    """Fetch one search-results page through Scrapfly and parse its listings.

    Args:
        url: Absolute URL of the results page to fetch.

    Returns:
        List of dicts produced by ``parse_listing`` for every ``article.item``
        that yielded a ``url``. An empty list when Scrapfly reports the scrape
        as unsuccessful.

    NOTE(review): the SDK can also raise on upstream errors (the recorded 403
    output in section 3 is an ``UpstreamHttpClientError``); such exceptions are
    not caught here and propagate to the caller.
    """
    result = scrapfly.scrape(
        ScrapeConfig(
            url=url,
            asp=True,
            country="ES",  # Spain
            render_js=True,
        )
    )

    if not result.success:
        print(f"Failed: {result.upstream_status_code}")
        return []

    soup = BeautifulSoup(result.content, "html.parser")

    listings = []
    for article in soup.find_all("article", class_="item"):
        try:
            listing = parse_listing(article)
        except Exception as e:
            # Report and skip the broken card. A bare `except: pass` would hide
            # selector breakage and also swallow KeyboardInterrupt.
            print(f"Error parsing: {e}")
            continue
        if listing.get("url"):
            listings.append(listing)

    return listings

# Try the scraper on page 2. Paginated URLs append "pagina-N.htm" to the
# search path: https://www.idealista.com/.../filters/pagina-2.htm
page2_url = f"{SEARCH_URL.rstrip('/')}/pagina-2.htm"
print("Fetching:", page2_url)

page2_listings = scrape_page(page2_url)
print(f"Found {len(page2_listings)} listings on page 2")

Fetching: https://www.idealista.com/alquiler-viviendas/barcelona-barcelona/con-precio-hasta_2400,precio-desde_1000/pagina-2.htm
Found 30 listings on page 2


## 9. API Usage Check

In [12]:
# Check your Scrapfly account usage
def _lookup(mapping, *path, default="N/A"):
    """Follow `path` through nested dicts; return `default` if any key is missing."""
    current = mapping
    for key in path:
        if not isinstance(current, dict) or key not in current:
            return default
        current = current[key]
    return current


def summarize_account(account: dict) -> dict:
    """Pull the plan name and remaining scrape credits out of an account payload.

    The previous lookup (``subscription.plan`` / ``subscription.remaining_credits``)
    printed "N/A" for both fields in the recorded run, i.e. those keys were not
    in the response. The documented field names are tried first and the old
    ones kept as a fallback.

    NOTE(review): ``subscription.plan_name`` and
    ``subscription.usage.scrape.remaining`` follow the Scrapfly Account API
    response format - confirm against a live response.

    Returns:
        ``{"plan": ..., "remaining_credits": ...}`` with ``"N/A"`` for anything
        that cannot be found.
    """
    plan = _lookup(account, "subscription", "plan_name", default=None)
    if plan is None:
        plan = _lookup(account, "subscription", "plan")

    remaining = _lookup(account, "subscription", "usage", "scrape", "remaining", default=None)
    if remaining is None:  # explicit None check: 0 remaining credits is a valid value
        remaining = _lookup(account, "subscription", "remaining_credits")

    return {"plan": plan, "remaining_credits": remaining}


try:
    account_summary = summarize_account(scrapfly.account())
    print("Scrapfly Account Info:")
    print(f"  Plan: {account_summary['plan']}")
    print(f"  Credits remaining: {account_summary['remaining_credits']}")
except Exception as e:
    print(f"Could not fetch account info: {e}")

Scrapfly Account Info:
  Plan: N/A
  Credits remaining: N/A


## 10. Cleanup

In [None]:
import os

# Delete the debug artefacts this notebook may have left in the working
# directory; files that are not there are silently skipped.
for leftover in ("debug_response.html", "debug_listings.json"):
    if not os.path.exists(leftover):
        continue
    os.remove(leftover)
    print(f"Removed {leftover}")
        print(f"Removed {f}")