In [0]:
import os
import requests
import gzip
import shutil
from datetime import datetime, date, timedelta

# =============================================
# Configuration defaults
# =============================================

# --- Explicit range mode -----------------------------------------------
# These two dates apply only while the last-N-days setting below is 0.
DEFAULT_START_DATE = "2025-03-01"  # format: yyyy-MM-dd
DEFAULT_END_DATE = "2025-10-31"  # format: yyyy-MM-dd

# --- Last-N-days mode ----------------------------------------------------
# A positive value takes precedence over the explicit start/end dates; the
# window attempted is then:
#   [today - LAST_N_DAYS + 1  ...  today]
DEFAULT_LAST_N_DAYS = 0  # 0 switches this mode off

# --- Source location -----------------------------------------------------
# Path segments interpolated into the InsideAirbnb download URL.
CITY = "toronto"
STATE = "on"
COUNTRY = "canada"

# --- Destination (Unity Catalog Volume) ----------------------------------
# Every snapshot gets its own sub-folder, giving the layout:
#   /Volumes/workspace/default/course_data/airbnb_toronto/<snapshot_date>/*.csv
BASE_ROOT = "/Volumes/workspace/default/course_data/airbnb_toronto"

# File names that must all be present for a snapshot folder to count as complete.
REQUIRED_FILES = [
    "listings.csv",
    "calendar.csv",
    "reviews.csv",
    "neighbourhoods.csv",
]

# ---------------------------------------------
# Resolve parameters from widgets (if available)
# ---------------------------------------------
# In a Databricks notebook you *can* add:
# dbutils.widgets.text("start_date", DEFAULT_START_DATE, "Start date (yyyy-MM-dd)")
# dbutils.widgets.text("end_date",   DEFAULT_END_DATE,   "End date (yyyy-MM-dd)")
# dbutils.widgets.text("last_n_days", str(DEFAULT_LAST_N_DAYS), "Last N days (0 = off)")


def _read_widget(name: str) -> str:
    """Return the stripped value of widget *name*, or "" if it cannot be read.

    Each widget is read on its own so that one missing widget does not
    discard the values of the others. Any failure (no ``dbutils`` in scope,
    widget not defined, unexpected value type) is treated as "not set".
    """
    try:
        return (dbutils.widgets.get(name) or "").strip()  # type: ignore[name-defined]
    except Exception:
        # Deliberately best-effort: outside Databricks `dbutils` does not
        # exist, and the caller falls back to the configured default.
        return ""


# Strip *before* the emptiness check so a whitespace-only widget value keeps
# the default instead of replacing it with an empty string.
start_date_str = _read_widget("start_date") or DEFAULT_START_DATE
end_date_str = _read_widget("end_date") or DEFAULT_END_DATE
last_n_days = DEFAULT_LAST_N_DAYS

ln = _read_widget("last_n_days")
if ln:
    try:
        last_n_days = int(ln)
    except ValueError:
        print(f"Invalid last_n_days '{ln}', falling back to {DEFAULT_LAST_N_DAYS}")
        last_n_days = DEFAULT_LAST_N_DAYS

# ---------------------------------------------
# Work out which snapshot dates to attempt
# ---------------------------------------------
today = date.today()

if not (last_n_days and last_n_days > 0):
    # Explicit range mode: both endpoints come from the configured strings.
    print(f"Using explicit range mode: {start_date_str} -> {end_date_str}")
    start = datetime.strptime(start_date_str, "%Y-%m-%d").date()
    end = datetime.strptime(end_date_str, "%Y-%m-%d").date()

    if start > end:
        raise ValueError("END_DATE must be >= START_DATE")
else:
    # Last-N-days mode: a window of N calendar days ending today.
    print(f"Using last_n_days mode: last {last_n_days} days (including today)")

    end = today
    start = end - timedelta(days=last_n_days - 1)

# One "yyyy-MM-dd" string per calendar day, both endpoints included.
snapshot_dates = [
    (start + timedelta(days=offset)).strftime("%Y-%m-%d")
    for offset in range((end - start).days + 1)
]

if len(snapshot_dates) == 0:
    raise ValueError("No snapshot dates resolved from inputs.")

print(f"Snapshot dates to attempt: {snapshot_dates}")

# ---------------------------------------------
# Helper functions
# ---------------------------------------------
def download(url: str, path: str, timeout: float = 60.0) -> None:
    """Fetch *url* and write the response body to *path*.

    The body is streamed to disk in chunks rather than held in memory as a
    whole, and the request carries a timeout so a stalled server cannot
    block the run indefinitely.

    Args:
        url: Address to fetch.
        path: Destination file; overwritten if it already exists.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``.

    Raises:
        FileNotFoundError: The server answered 403 or 404, which this script
            interprets as "no snapshot for this date".
        requests.HTTPError: Any other unsuccessful HTTP status.
        requests.RequestException: Connection problems or timeouts.
    """
    print(f"  Downloading {url} -> {path}")

    # The context manager returns the streamed connection to the pool even
    # when an exception is raised below.
    with requests.get(url, stream=True, timeout=timeout) as resp:
        # Treat 403 and 404 as "no snapshot for this date"
        if resp.status_code in (403, 404):
            raise FileNotFoundError(f"URL not accessible ({resp.status_code}): {url}")

        resp.raise_for_status()

        try:
            with open(path, "wb") as f:
                for chunk in resp.iter_content(chunk_size=1024 * 1024):
                    if chunk:  # skip keep-alive chunks
                        f.write(chunk)
        except BaseException:
            # Streaming can fail midway; do not leave a truncated file that
            # could later be mistaken for a finished download.
            if os.path.exists(path):
                os.remove(path)
            raise

def download_and_unzip_gz(url: str, out_csv_path: str) -> None:
    """Download a gzip file from *url* and decompress it to *out_csv_path*.

    The archive is stored temporarily next to the target as
    ``<out_csv_path>.gz`` and is removed afterwards whether or not the
    operation succeeded. If decompression fails, the partially written CSV
    is removed as well.

    Args:
        url: Address of the ``.gz`` file.
        out_csv_path: Destination path of the decompressed file.

    Raises:
        Whatever ``download`` raises, plus ``OSError`` / ``EOFError`` for a
        corrupt or truncated archive.
    """
    gz_path = out_csv_path + ".gz"
    try:
        download(url, gz_path)
        print(f"  Unzipping {gz_path} -> {out_csv_path}")
        try:
            with gzip.open(gz_path, "rb") as f_in, open(out_csv_path, "wb") as f_out:
                shutil.copyfileobj(f_in, f_out)
        except BaseException:
            # A half-written CSV would still satisfy an existence check, so
            # remove it before propagating the error.
            if os.path.exists(out_csv_path):
                os.remove(out_csv_path)
            raise
    finally:
        # The archive is only an intermediate artifact; it may not exist at
        # all if the download itself failed.
        if os.path.exists(gz_path):
            os.remove(gz_path)

def folder_has_all_files(folder: str) -> bool:
    """Report whether every name in REQUIRED_FILES exists inside *folder*."""
    expected_paths = (os.path.join(folder, name) for name in REQUIRED_FILES)
    # all() stops at the first missing path, like an early return would.
    return all(os.path.exists(candidate) for candidate in expected_paths)

# ---------------------------------------------
# Ensure base root exists
# ---------------------------------------------
os.makedirs(BASE_ROOT, exist_ok=True)

# ---------------------------------------------
# Main loop: download for each snapshot date
# ---------------------------------------------
# Each date is handled independently: a failure for one date is reported and
# its folder removed, then the loop moves on to the next date.
for snapshot_date in snapshot_dates:
    print(f"\n=== Processing snapshot date: {snapshot_date} ===")

    target_dir = os.path.join(BASE_ROOT, snapshot_date)

    # If this snapshot already fully exists, skip downloading
    if os.path.isdir(target_dir) and folder_has_all_files(target_dir):
        print(f"  Snapshot {snapshot_date} already fully downloaded. Skipping.")
        continue

    base_url = f"https://data.insideairbnb.com/{COUNTRY}/{STATE}/{CITY}/{snapshot_date}"

    urls = {
        "listings":       f"{base_url}/data/listings.csv.gz",
        "calendar":       f"{base_url}/data/calendar.csv.gz",
        "reviews":        f"{base_url}/data/reviews.csv.gz",
        "neighbourhoods": f"{base_url}/visualisations/neighbourhoods.csv",  # not gzipped
    }

    # (Re)create target_dir if needed (we'll wipe it on failure)
    os.makedirs(target_dir, exist_ok=True)
    print(f"  Saving files under: {target_dir}")

    success = False  # track if this date actually completed downloads

    try:
        # Listings, calendar, reviews (gzipped)
        download_and_unzip_gz(urls["listings"],  os.path.join(target_dir, "listings.csv"))
        download_and_unzip_gz(urls["calendar"],  os.path.join(target_dir, "calendar.csv"))
        download_and_unzip_gz(urls["reviews"],   os.path.join(target_dir, "reviews.csv"))

        # Neighbourhoods (already CSV)
        download(urls["neighbourhoods"], os.path.join(target_dir, "neighbourhoods.csv"))

        # Final sanity check
        if not folder_has_all_files(target_dir):
            raise RuntimeError(f"Missing one or more files after download for {snapshot_date}")

        success = True
        print(f"✓ Finished snapshot {snapshot_date}")
    except FileNotFoundError as e:
        print(f"! Snapshot {snapshot_date} appears not to exist / be accessible on InsideAirbnb: {e}")
    except Exception as e:
        # Best-effort by design: report the problem and continue with the
        # remaining dates instead of aborting the whole run.
        print(f"! Error while processing {snapshot_date}: {e}")
    finally:
        # If we didn't successfully download everything, delete the folder
        if not success:
            print(f"  Deleting folder with incomplete or no data: {target_dir}")
            try:
                # ignore_errors is intentionally NOT set: with it, rmtree
                # never raises and the warning below could never be printed.
                shutil.rmtree(target_dir)
            except Exception as cleanup_err:
                print(f"  Warning: failed to remove {target_dir}: {cleanup_err}")

print("\nAll requested snapshots processed.")
print("Base folder pattern for snapshots with data:")
# Derived from BASE_ROOT so the message stays correct if the root changes.
print(f"  {BASE_ROOT}/<snapshot_date>/")
