This Python script is designed for web scraping real estate listings from a popular Polish website. It fetches multiple pages of real estate offers using requests and BeautifulSoup, extracts key listing details from embedded JSON data, and structures them into a DataFrame. The extracted data is then cleaned, anonymized, and saved as CSV files in Google Drive. Additionally, the script includes functionality to retrieve detailed information for each listing, ensuring comprehensive property data collection.

# Imports

In [None]:
import requests
from bs4 import BeautifulSoup
import json
from time import sleep
import pandas as pd

# Scraping

In [None]:
def get_offers(page_number):
  """
  Fetches offers from a specific page on real estate website for apartments for sale.
  Args:
      page_number (int): The page number to fetch the offers from.
  Returns:
      list: A list of offer objects containing data about real estate listings.
  Raises:
      requests.RequestException: If the request fails, times out, or the server
          answers with an error status code (requests.HTTPError).
      ValueError: If the page has no "__NEXT_DATA__" script tag or its content is not valid JSON.
      KeyError: If the embedded JSON does not contain the expected path to the offers.
  """
  website = ... # Url to one of the most popular websites in Poland
  # URL for the specific page number in search results
  url = f"{website}{page_number}"

  # HTTP headers to simulate a request from a standard web browser
  headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}

  # The timeout keeps a stalled connection from blocking the whole scraping run
  response = requests.get(url, headers=headers, timeout=30)

  # Introduce a 1-second delay to avoid overwhelming the server
  # (done before the status check so the pause also applies to error responses)
  sleep(1)

  # Fail early with a clear HTTP error instead of trying to parse an error page
  response.raise_for_status()

  # The response contains JSON embedded within the HTML
  soup = BeautifulSoup(response.text, 'html.parser')

  # Locate the script tag containing the JSON data (with the ID "__NEXT_DATA__")
  script_tag = soup.find('script', {'id': "__NEXT_DATA__"})
  if script_tag is None:
    raise ValueError(f"No __NEXT_DATA__ script tag found at {url}")

  # Parse the JSON content into a Python dictionary
  data_json = json.loads(script_tag.get_text())

  # Navigate to the specific key where offer data is stored and return the list of offers
  return data_json['props']['pageProps']['data']['searchAds']['items']

In [None]:
def parse_offer(offer):
  """
  Parses a single offer from the search results into a clean, structured dictionary.
  Args:
      offer (dict): The raw offer data. It must contain the keys "slug",
          "dateCreated" and "dateCreatedFirst"; all other fields are optional.
  Returns:
      dict: A flat dictionary with the directly copied fields, the offer and
          agency URLs, the two dates and the values read from nested paths.
          Optional fields that are missing are set to "N/A".
  Raises:
      KeyError: If "slug", "dateCreated" or "dateCreatedFirst" is missing.
  """
  website_prefix = ...
  website_prefix_dev = ...

  # Construct the main offer URL using the "slug" field
  url = website_prefix + offer["slug"]

  # Build the agency URL; offers with no "agency" entry (KeyError) or a
  # null/non-dict one (TypeError) fall back to "N/A". Only these lookup
  # failures are caught so that e.g. KeyboardInterrupt is not swallowed.
  try:
    developer_url = website_prefix_dev + offer["agency"]["slug"]
  except (KeyError, TypeError):
    developer_url = "N/A"

  # Keep only the part of each timestamp before the first space
  # (presumably the date portion - TODO confirm the source format)
  dateCreated = offer["dateCreated"].split(" ")[0]
  dateCreatedFirst = offer["dateCreatedFirst"].split(" ")[0]

  # Define keys for direct extraction and nested key paths for deeper extraction
  keys = ["id", "title", "estate", "developmentId", "transaction", "areaInSquareMeters", "roomsNumber"]
  nested_keys = {
    "location": ["location", "reverseGeocoding", "locations", 3, "fullName"],
    "agencyId": ["agency", "id"],
    "agencyName": ["agency", "name"],
    "price": ["totalPrice", "value"]
  }

  # Extract simple keys from the offer data
  parsed_offer = {key: offer.get(key, "N/A") for key in keys}

  # Add basic information and URLs to the parsed offer
  parsed_offer["url"] = url
  parsed_offer["developerUrl"] = developer_url
  # NOTE(review): "dateCreated" is stored as the modification date and
  # "dateCreatedFirst" as the creation date - confirm this mapping is intended
  parsed_offer["dateModified"] = dateCreated
  parsed_offer["dateOfCreation"] = dateCreatedFirst

  # Extract nested keys from the offer data using the safe_acces function
  parsed_data_custom = {
    key: safe_acces(offer, path)
    for key, path in nested_keys.items()
  }
  parsed_offer.update(parsed_data_custom)

  return parsed_offer

In [None]:
def safe_acces(data, path, default="N/A"):
  """
  Safely traverses a nested dictionary or list to extract a value.
  Args:
      data (dict or list): The initial data structure to traverse.
      path (list): A list of keys (for dictionaries) or indices (for lists) representing the path to the desired value.
      default: The value to return if any part of the path is invalid or the value at the end of the path is missing (None).
  Returns:
      The value at the end of the path if it exists, or the default value if any part of the path is invalid.
      Falsy but present values such as 0, False or "" are returned unchanged.
  """
  for key in path:
    try:
      # If the current level is a dictionary, use .get() for safe access
      if isinstance(data, dict):
        data = data.get(key)
      # If the current level is a list, access by index if valid
      elif isinstance(data, list):
        # Stop right away on an out-of-range (or negative) index instead of
        # continuing the traversal inside the default value
        if not 0 <= key < len(data):
          return default
        data = data[key]
      else:
        # If the data is neither a dict nor a list, return default
        return default
    except (TypeError, IndexError):
      # Catch errors caused by invalid key/index access or wrong types
      return default

  # Only a missing value (None) falls back to the default; a truthiness test
  # here would wrongly discard legitimate values such as 0 or False
  return default if data is None else data

In [None]:
def getter():
  """
  Downloads the search result pages one by one and parses every offer found on them.
  Returns:
      list: One structured dictionary per parsed offer, in the order the offers were downloaded.
  """
  pages_number = ...
  collected = []

  # Pages are numbered from 0 up to pages_number - 1
  for page_number in range(0, pages_number):
    # Raw offers of the current page
    page_offers = get_offers(page_number)
    print(f"Downloading {page_number} of {pages_number-1}")

    # Parse the page's offers and add them to the overall result
    collected.extend(parse_offer(raw_offer) for raw_offer in page_offers)

  return collected

In [None]:
# Build the offers table from all parsed offers, indexed by the offer "id"
df = pd.DataFrame(
  getter()
).set_index("id")

# When an "id" occurs more than once, keep only the row that appeared first
df = df.loc[~df.index.duplicated(keep='first')]

# Target location of the raw CSV export
file_path = '../data/raw_before_anonymization/file_df.csv'

# Write the table to disk; the "id" index is written as part of the file
df.to_csv(path_or_buf=file_path, index=True)

print(f"File saved successfully to {file_path}")

In [None]:
def get_offer_details(url):
  """
  Retrieves detailed information about a specific offer from its URL.
  Args:
      url (str): The URL of the offer to fetch details for.
  Returns:
      dict: A dictionary containing detailed data about the offer.
  Raises:
      requests.RequestException: If the request fails, times out, or the server
          answers with an error status code (requests.HTTPError).
      ValueError: If the page has no "__NEXT_DATA__" script tag or its content is not valid JSON.
      KeyError: If the embedded JSON does not contain the expected path to the offer.
  """
  # HTTP headers to simulate a request from a standard web browser
  headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}

  # The timeout keeps a stalled connection from blocking the whole scraping run
  response = requests.get(url, headers=headers, timeout=30)

  # Introduce a 1-second delay to avoid overwhelming the server
  # (done before the status check so the pause also applies to error responses)
  sleep(1)

  # Fail early with a clear HTTP error instead of trying to parse an error page
  response.raise_for_status()

  # Parse the HTML content using BeautifulSoup
  soup = BeautifulSoup(response.text, 'html.parser')

  # Locate the script tag containing the JSON data (with the ID "__NEXT_DATA__")
  script_tag = soup.find('script', {'id': "__NEXT_DATA__"})
  if script_tag is None:
    raise ValueError(f"No __NEXT_DATA__ script tag found at {url}")

  # Parse the JSON content into a Python dictionary
  data_json = json.loads(script_tag.get_text())

  # Navigate to the key where the offer details are stored and return them
  return data_json["props"]["pageProps"]["ad"]

In [None]:
def parse_offer_details(offer):
  """
  Flattens the detailed data of a single offer into a structured dictionary.
  Args:
      offer (dict): The raw detailed offer data.
  Returns:
      dict: The selected top-level fields, the values read from nested paths and
          three positional fields ("outdoor", "heating", "freeFrom"). Anything
          that cannot be read is set to "N/A".
  """
  def after_separator(value):
    # Keep the part that follows the first "::" separator
    return value.split("::")[1]

  def unchanged(value):
    # The value is used exactly as found
    return value

  # Fields copied straight from the top level of the offer
  direct_fields = ("id", "market", "advertiserType", "advertType")

  # Output field name -> path inside the offer
  nested_paths = {
    "buildYear": ["target", "Build_year"],
    "buildingFloorsNum": ["target", "Building_floors_num"],
    "cityId": ["target", "City_id"],
    "buildingOwnership": ["target", "Building_ownership"],
    "buildingType": ["target", "Building_type"],
    "constructionStatus": ["target", "Construction_status"],
    "extrasTypes": ["target", "Extras_types"],
    "latitude": ["location", "coordinates", "latitude"],
    "longitude": ["location", "coordinates", "longitude"]
  }

  # Fields read from fixed positions of list-valued sections, each with a post-processing step
  positional_fields = (
    ("outdoor", ["topInformation", 5, "values", 0], after_separator),
    ("heating", ["topInformation", 9, "values", 0], after_separator),
    ("freeFrom", ["additionalInformation", 2, "values", 0], unchanged),
  )

  result = {}

  # Top-level fields first ...
  for name in direct_fields:
    result[name] = offer.get(name, "N/A")

  # ... then the nested ones, read through safe_acces
  for name, path in nested_paths.items():
    result[name] = safe_acces(offer, path)

  # Finally the positional fields; any failure while reading or converting yields "N/A"
  for name, path, convert in positional_fields:
    try:
      found = safe_acces(offer, path, default="N/A")
      if found != "N/A":
        result[name] = convert(found)
      else:
        result[name] = "N/A"
    except (IndexError, KeyError, AttributeError):
      result[name] = "N/A"

  return result

In [None]:
def getter_details(urlstr, qend, qstart, total=None):
  """
  Retrieves and parses detailed information for multiple offers from their URLs.
  URLs that fail to download or parse are reported on stdout and skipped.
  Args:
      urlstr (iterable of str): The URLs of the offers to be processed.
      qend (int): End position of this batch, shown in the progress message.
      qstart (int): Zero-based position of the first URL of this batch; the
          progress counter starts at qstart + 1.
      total (int, optional): Total number of offers across all batches, shown
          in the progress message. When omitted, a module-level `total` is used
          if one is defined; otherwise the " of ..." part is left out.
  Returns:
      list: A list of parsed offer details, each represented as a structured dictionary.
  """
  # The progress message used to read an undefined global `total` (NameError).
  # Keep honouring such a global when it exists, but never depend on it.
  if total is None:
    total = globals().get("total")
  total_suffix = "" if total is None else f" of {total}"

  offer_details = []

  # enumerate provides the running position without mutating the qstart argument
  for position, url in enumerate(urlstr, start=qstart + 1):
    print(f"Downloading {position} / {qend}{total_suffix}")
    try:
      # Fetch offer details for the given URL
      offers = get_offer_details(url)
      # Ensure `offers` is a list; wrap in a list if it's a single dictionary
      if isinstance(offers, dict):
        offers = [offers]
      # Parse each offer and append the details to the result list
      offer_details.extend(parse_offer_details(offer) for offer in offers)
    except Exception as e:
      # One broken offer must not abort the whole batch: report it and continue
      print(f"Error processing URL {url}: {e}")

  return offer_details

In [None]:
# Split the offer URLs into four consecutive batches so that the details
# can be downloaded (and saved) in four separate runs
total = len(df)  # total number of offers
q = round(total/4)  # approximate size of one batch

# .iloc makes it explicit that the batch boundaries are row positions, not "id" labels
q1start = 0
q1end = q
q1 = df["url"].iloc[q1start:q1end]

q2start = q
q2end = q*2
q2 = df["url"].iloc[q2start:q2end]

q3start = q*2
q3end = q*3
q3 = df["url"].iloc[q3start:q3end]

# The last batch takes every remaining row, so it ends at the total row count
q4start = q*3
q4end = total
q4 = df["url"].iloc[q4start:q4end]

In [None]:
# Download the details for the first batch of URLs and index the resulting table by "id"
df_details1 = pd.DataFrame(
  getter_details(q1, q1end, q1start)
).set_index("id")

# Target location of the CSV export for this batch
file_path = '../data/raw_before_anonymization/file_dfdetails1.csv'

# Write the batch to disk; the "id" index is written as part of the file
df_details1.to_csv(path_or_buf=file_path, index=True)

print(f"File saved successfully to {file_path}")

In [None]:
# Download the details for the second batch of URLs and index the resulting table by "id"
df_details2 = pd.DataFrame(
  getter_details(q2, q2end, q2start)
).set_index("id")

# Target location of the CSV export for this batch
file_path = '../data/raw_before_anonymization/file_dfdetails2.csv'

# Write the batch to disk; the "id" index is written as part of the file
df_details2.to_csv(path_or_buf=file_path, index=True)

print(f"File saved successfully to {file_path}")

In [None]:
# Download the details for the third batch of URLs and index the resulting table by "id"
df_details3 = pd.DataFrame(
  getter_details(q3, q3end, q3start)
).set_index("id")

# Target location of the CSV export for this batch
file_path = '../data/raw_before_anonymization/file_dfdetails3.csv'

# Write the batch to disk; the "id" index is written as part of the file
df_details3.to_csv(path_or_buf=file_path, index=True)

print(f"File saved successfully to {file_path}")

In [None]:
# Download the details for the fourth batch of URLs and index the resulting table by "id"
df_details4 = pd.DataFrame(
  getter_details(q4, q4end, q4start)
).set_index("id")

# Target location of the CSV export for this batch
file_path = '../data/raw_before_anonymization/file_dfdetails4.csv'

# Write the batch to disk; the "id" index is written as part of the file
df_details4.to_csv(path_or_buf=file_path, index=True)

print(f"File saved successfully to {file_path}")

In [None]:
# concat() makes a full copy of the data, and calling it repeatedly creates
# unnecessary copies. The four partial tables are therefore collected in a
# list first and concatenated in a single call.
frames = [
  df_details1,
  df_details2,
  df_details3,
  df_details4,
]

# One table holding the details of all offers
df_details_combined = pd.concat(frames)

In [None]:
# Join the offers table with the details table on their indexes;
# the inner join keeps only the ids that are present in both tables
df_combined = df.merge(
  df_details_combined,
  how='inner',
  left_index=True,
  right_index=True,
)

In [None]:
# Target location of the CSV export of the combined table
file_path = '../data/raw_before_anonymization/file_dfcombined.csv'

# Write the combined table to disk; its index is written as part of the file
df_combined.to_csv(path_or_buf=file_path, index=True)

print(f"File saved successfully to {file_path}")

# Anonymization

In [None]:
# Turn the index back into an ordinary column so that "id" can be dropped together with the other columns
df_combined = df_combined.reset_index()

# Remove the identifier, title, URL and agency columns from the combined table
df_anonymized = df_combined.drop(
  labels=["id", "title", "developmentId", "url", "developerUrl", "agencyId", "agencyName"],
  axis="columns",
)

# Target location of the anonymized CSV export
file_path = '../data/interim/file_dfanonymized.csv'

# Write the anonymized table to disk, including its index
df_anonymized.to_csv(path_or_buf=file_path, index=True)

print(f"File saved successfully to {file_path}")