In [1]:
# CURL query: http://web.archive.org/cdx/search/cdx?url=https://www.zillow.com/berkeley-ca/apartments/&output=json
# This script is used to scrape the archived webpages of Zillow's Berkeley apartment listings

In [2]:
import json
from datetime import datetime
import pandas as pd
import os
import time
import requests
from bs4 import BeautifulSoup

In [3]:
def convert_timestamp(timestamp):
    """Turn a 'YYYYMMDDHHMMSS' timestamp into a 'YYYY-MM-DD' date string.

    Args:
        timestamp (str): Timestamp such as "20240811231741".

    Returns:
        str: The date part of the timestamp formatted as "YYYY-MM-DD".
        When parsing or formatting raises ValueError, an explanatory message
        string is returned instead of a date, so callers that need to tell
        the two apart must check the returned text.
    """
    input_layout, output_layout = "%Y%m%d%H%M%S", "%Y-%m-%d"
    try:
        return datetime.strptime(timestamp, input_layout).strftime(output_layout)
    except ValueError:
        return "Invalid timestamp format. Please use 'YYYYMMDDHHMMSS'."

# Quick sanity check: convert one sample timestamp and print the result.
timestamp = "20240811231741"
readable_timestamp = convert_timestamp(timestamp)
print(f"Readable Timestamp: {readable_timestamp}")

Readable Timestamp: 2024-08-11


In [4]:
with open("apartment_snapshots.json", "r") as snapshot_file:
    data = json.load(snapshot_file)

# data[0] is the header row, so only data[1:] is used. For every remaining
# entry, entry[1] is the raw 'YYYYMMDDHHMMSS' timestamp and entry[2] is the
# address appended to the archive URL.
# Each entry is converted once; the string comparison against "2020" keeps
# dates from 2020 onwards, and both lists are derived from the same filtered
# entries so they stay index-aligned.
# NOTE(review): convert_timestamp returns a message string for malformed
# timestamps, and that message also compares greater than "2020" -- confirm the
# snapshot file never contains malformed timestamps.
dated_entries = [(convert_timestamp(entry[1]), entry) for entry in data[1:]]
recent_entries = [(day, entry) for day, entry in dated_entries if day > "2020"]

timestamps = [day for day, _ in recent_entries]

urls = [
    f"http://web.archive.org/web/{entry[1]}/{entry[2]}"
    for _, entry in recent_entries
]

In [5]:
# Show how many archive URLs and how many dates were collected.
print(f"{len(urls)} {len(timestamps)}")

100 100


In [6]:
# Create the directory the scraped tables are saved into.
# exist_ok=True makes the cell safe to re-run and removes the gap between a
# separate os.path.exists() check and the directory creation.
os.makedirs("scraped_tables/zillow_apartments", exist_ok=True)

# Rate limit settings
RATE_LIMIT = 5  # Time in seconds between requests

In [38]:
def get_property_data_pre_2022(soup, date):
    """
    Extracts property data from a Zillow page that uses the 'list-card' markup.

    Every 'list-card-heading' div is treated as one listing. Listings without
    a price text are skipped. Addresses are collected separately from the page
    ('list-card-addr' elements) and paired with listings by position.

    Args:
        soup (BeautifulSoup): The BeautifulSoup object representing the webpage.
        date (str): The date of the data; copied unchanged into every record.

    Returns:
        list: A list of dictionaries with the keys "price", "type",
        "bathrooms", "size", "address" and "date". "type", "bathrooms" and
        "size" hold the text of the first, second and third <li> of the
        listing's 'list-card-details' list (None when that item is missing).
        "address" is None when the page has no address at the listing's
        position.
    """

    # Find all the listings on the page
    listings = soup.find_all('div', class_='list-card-heading')
    addresses = soup.find_all(attrs={'class': 'list-card-addr'})
    if len(listings) != len(addresses):
        print("potential mismatch of listings and addresses:", date, len(listings), len(addresses))

    # Loop through each listing div and extract details
    property_data = []
    for n, listing in enumerate(listings):
        # Extract price; a listing without one carries no usable data
        price = listing.find('div', class_='list-card-price')
        price_text = price.get_text(strip=True) if price is not None else None
        if not price_text:
            continue

        # Extract up to three detail items and pad the rest with None
        details = listing.find('ul', class_='list-card-details')
        details_items = details.find_all('li') if details is not None else []
        detail_texts = [item.get_text(strip=True) for item in details_items[:3]]
        detail_texts.extend([None] * (3 - len(detail_texts)))
        property_type, bathrooms, size = detail_texts

        # The mismatch warning above means there can be fewer addresses than
        # listings; fall back to None instead of raising IndexError.
        address = addresses[n].get_text(strip=True) if n < len(addresses) else None

        # Create a dictionary for the current listing
        property_data.append({
            "price": price_text,
            "type": property_type,
            "bathrooms": bathrooms,
            "size": size,
            "address": address,
            "date": date
        })
    return property_data

def _room_price_records(containers, child_tag, address, date):
    """Build one record per direct `child_tag` child of `containers` that holds a <b> price."""
    records = []
    for container in containers:
        for room in container.find_all(child_tag, recursive=False):
            bold_text = room.find('b')
            if bold_text is None:
                continue
            # The <b> tag holds the price; what is left of the text is the room type
            price = bold_text.get_text(strip=True)
            room_type = room.get_text(strip=True).replace(price, '').strip()
            records.append({
                "price": price,
                "type": room_type,
                "bathrooms": None,
                "size": None,
                "address": address,
                "date": date
            })
    return records


def get_property_data_post_2022(soup, date):
    """
    Extracts property data from a Zillow page that uses the 'property-card' markup.

    Every 'property-card-data' div is treated as one listing. A listing
    produces one record from its headline price (when present) plus one record
    for every additional room entry that contains a <b> price.

    Args:
        soup (BeautifulSoup): The BeautifulSoup object representing the webpage.
        date (str): The date of the data; copied unchanged into every record.

    Returns:
        list: A list of dictionaries with the keys "price", "type",
        "bathrooms", "size", "address" and "date". "bathrooms" and "size" are
        always None. "address" is None when the listing has no address tag.
        For the headline record, "price" is the text before the first space
        and "type" is the text after it (None when there is no space).
    """

    property_data = []
    listing_groups = soup.find_all('div', class_='property-card-data')

    for listing_soup in listing_groups:
        # A card without an address tag keeps its prices, with address None
        address_tag = listing_soup.find('address', attrs={'data-test': 'property-card-addr'})
        address = address_tag.get_text(strip=True) if address_tag is not None else None

        # Headline data; skipped when the card has no headline price tag
        price_tag = listing_soup.find('span', attrs={'data-test': 'property-card-price'})
        if price_tag is not None:
            header_price_data = price_tag.get_text(strip=True)
            headline_price, separator, remainder = header_price_data.partition(' ')
            property_data.append({
                "price": headline_price,
                "type": remainder if separator else None,
                "bathrooms": None,
                "size": None,
                "address": address,
                "date": date
            })

        # Other room data appears in two layouts: <span> children of
        # 'jlVIIO' spans and <li> children of 'dmDolk' lists.
        # NOTE(review): these look like build-generated CSS class names and may
        # differ between snapshots -- verify against the archived pages.
        property_data.extend(_room_price_records(
            listing_soup.find_all('span', class_='jlVIIO'), 'span', address, date))
        property_data.extend(_room_price_records(
            listing_soup.find_all('ul', class_='dmDolk'), 'li', address, date))

    return property_data
    
    

In [35]:
def scrape_info_from_site(url, date, timeout=30):
    """
    Scrapes information from a Zillow URL and saves it to a CSV file.

    The extracted records are written to
    "scraped_tables/zillow_apartments/<date>.csv".

    Args:
        url (str): The URL of the website to scrape. Surrounding whitespace
            is removed before the request is made.
        date (str): The date in the format "YYYY-MM-DD". It selects which
            page layout parser is used and names the output file.
        timeout (float): Seconds to wait for the server before the request
            is abandoned. Defaults to 30.
    Returns:
        None. Nothing is written when the date is the transition date
        "2022-05-14", when the request fails, or when the response status is
        not 200; the last two cases print a "Failed to fetch URL" message.
    """

    # Zillow changed frontend UI starting after url[89]: 2022-05-14
    # Skip url[89] since it's a transition date -- soup response is bugged.
    # Checked before the request so no download is wasted on it.
    if date == "2022-05-14":
        return

    # str.strip() returns a new string, so the result has to be assigned
    url = url.strip()
    try:
        # Without a timeout a stalled connection would block the whole run
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as error:
        print(f"Failed to fetch URL: {url} ({error})")
        return
    if response.status_code != 200:
        print(f"Failed to fetch URL: {url}")
        return

    # Parse HTML with an explicitly named parser so the result does not depend
    # on which optional parser libraries happen to be installed
    soup = BeautifulSoup(response.content, 'html.parser')

    # Find detailed property information using the layout that matches the date
    if date < "2022-05-14":
        property_data = get_property_data_pre_2022(soup, date)
    else:
        property_data = get_property_data_post_2022(soup, date)

    # Save the data to a CSV file
    filename = f"scraped_tables/zillow_apartments/{date}.csv"
    os.makedirs(os.path.dirname(filename), exist_ok=True)
    df = pd.DataFrame(property_data)
    df.to_csv(filename, index=False)
    print(f"Saved table to {filename}")


In [39]:
# In case of rate limiting interruptions, dates up to and including this one are skipped:
last_scraped_date = "2024-07-08"  # CHANGE THIS DATE TO LAST SUCCESSFUL SCRAPE!
manually_scraped_dates = ("2023-03-14", "2024-07-08")  # these specific dates were manually scraped

for position, url in enumerate(urls):
    capture_date = timestamps[position]
    already_scraped = capture_date <= last_scraped_date
    if already_scraped or capture_date in manually_scraped_dates:
        continue
    scrape_info_from_site(url, capture_date)
    time.sleep(RATE_LIMIT)  # Rate limit to avoid overwhelming the server

Failed to fetch URL: http://web.archive.org/web/20240716230438/https://www.zillow.com/berkeley-ca/apartments/
