In [6]:
from pathlib import Path
import requests
import pandas as pd

# ========== 1) Config ==========
DATA_DIR = Path("data/inside_airbnb_cdmx_2025-09-27")
DATA_DIR.mkdir(parents=True, exist_ok=True)

BASE = "https://data.insideairbnb.com/mexico/df/mexico-city/2025-09-27"

URLS = {
    "listings.csv.gz": f"{BASE}/data/listings.csv.gz",
    "calendar.csv.gz": f"{BASE}/data/calendar.csv.gz",
    "reviews.csv.gz":  f"{BASE}/data/reviews.csv.gz",
    # opcionales (barrios)
    "neighbourhoods.csv":     f"{BASE}/visualisations/neighbourhoods.csv",
    "neighbourhoods.geojson": f"{BASE}/visualisations/neighbourhoods.geojson",
}

# ========== 2) Descarga robusta ==========
def download(url: str, outpath: Path, timeout=60):
    if outpath.exists() and outpath.stat().st_size > 0:
        print(f"✔ Ya existe: {outpath.name} ({outpath.stat().st_size:,} bytes)")
        return

    print(f"⬇ Descargando {outpath.name} ...")
    with requests.get(url, stream=True, timeout=timeout) as r:
        r.raise_for_status()
        # Detectar si te devolvió HTML (a veces pasa con bloqueos/redirecciones)
        ctype = (r.headers.get("Content-Type") or "").lower()
        if "text/html" in ctype:
            raise RuntimeError(
                f"Descarga devolvió HTML en vez de archivo para {outpath.name}. "
                "Revisa red/proxy o intenta desde otra red."
            )

        with open(outpath, "wb") as f:
            for chunk in r.iter_content(chunk_size=1024 * 1024):
                if chunk:
                    f.write(chunk)

    print(f"✅ Listo: {outpath.name} ({outpath.stat().st_size:,} bytes)")

# Descarga los principales (y opcionales)
for fname, url in URLS.items():
    download(url, DATA_DIR / fname)

# ========== 3) Lectura en pandas ==========
listings = pd.read_csv(DATA_DIR / "listings.csv.gz", compression="gzip", low_memory=False)
calendar = pd.read_csv(DATA_DIR / "calendar.csv.gz", compression="gzip", low_memory=False)
reviews  = pd.read_csv(DATA_DIR / "reviews.csv.gz",  compression="gzip", low_memory=False)

print("listings:", listings.shape)
print("calendar:", calendar.shape)
print("reviews:", reviews.shape)

# Primer vistazo
print(listings.columns())
print(calendar.columns())

⬇ Descargando listings.csv.gz ...
✅ Listo: listings.csv.gz (14,513,553 bytes)
⬇ Descargando calendar.csv.gz ...
✅ Listo: calendar.csv.gz (23,037,908 bytes)
⬇ Descargando reviews.csv.gz ...
✅ Listo: reviews.csv.gz (141,542,027 bytes)
⬇ Descargando neighbourhoods.csv ...
✅ Listo: neighbourhoods.csv (275 bytes)
⬇ Descargando neighbourhoods.geojson ...
✅ Listo: neighbourhoods.geojson (342,744 bytes)
listings: (27051, 79)
calendar: (9873624, 7)
reviews: (1454740, 6)


TypeError: 'Index' object is not callable

In [None]:
import pandas as pd
import numpy as np

DATA_DIR = "data/inside_airbnb_cdmx"  # ajusta a tu ruta

listings_path = f"{DATA_DIR}/listings.csv.gz"
calendar_path = f"{DATA_DIR}/calendar.csv.gz"

listings = pd.read_csv(listings_path, compression="gzip", low_memory=False)
calendar = pd.read_csv(calendar_path, compression="gzip", low_memory=False)

print("listings:", listings.shape)
print("calendar:", calendar.shape)

# columnas típicas que nos interesan (si existen)
cols_interest = [
    "id","name","neighbourhood_cleansed","latitude","longitude","property_type",
    "room_type","accommodates","bathrooms_text","bedrooms","beds",
    "amenities","price","minimum_nights","maximum_nights",
    "number_of_reviews","review_scores_rating","reviews_per_month",
    "host_is_superhost","instant_bookable"
]
existing = [c for c in cols_interest if c in listings.columns]
print("Columnas presentes:", existing)

# Missingness rápido
miss = (listings.isna().mean().sort_values(ascending=False) * 100).round(2)
print(miss)

In [None]:
def clean_price(s: pd.Series) -> pd.Series:
    # convierte a string, quita símbolos y separadores, a float
    out = (s.astype(str)
             .str.replace(r"[\$,]", "", regex=True)
             .str.strip())
    # valores que eran NaN quedan como "nan"
    out = out.replace({"nan": np.nan, "": np.nan, "None": np.nan})
    return pd.to_numeric(out, errors="coerce")

if "price" in listings.columns:
    listings["price_num"] = clean_price(listings["price"])
    print("price_num missing %:", listings["price_num"].isna().mean().round(4))
    print(listings["price_num"].describe(percentiles=[.05,.25,.5,.75,.95]))
else:
    print("No existe columna price en listings.")


In [None]:
# Opción A: filtrar
df_price = listings.dropna(subset=["price_num"]).copy()

# Opción B: imputación por grupo (si quieres más dificultad)
group_cols = [c for c in ["neighbourhood_cleansed","room_type"] if c in listings.columns]
if group_cols:
    medians = listings.groupby(group_cols)["price_num"].transform("median")
    listings["price_imputed"] = listings["price_num"].fillna(medians)
    # fallback global
    listings["price_imputed"] = listings["price_imputed"].fillna(listings["price_num"].median())
else:
    listings["price_imputed"] = listings["price_num"].fillna(listings["price_num"].median())


In [None]:
# columnas típicas: listing_id, date, available, price, adjusted_price, minimum_nights, maximum_nights
print(calendar.columns)

# parse date
if "date" in calendar.columns:
    calendar["date"] = pd.to_datetime(calendar["date"], errors="coerce")

# available suele venir como "t"/"f"
if "available" in calendar.columns:
    calendar["available_bin"] = calendar["available"].astype(str).str.lower().map({"t":1, "f":0})


In [None]:
# nos quedamos con un horizonte (por ejemplo 90 días desde el mínimo de fecha)
start = calendar["date"].min()
end = start + pd.Timedelta(days=90)

cal90 = calendar[(calendar["date"] >= start) & (calendar["date"] < end)].copy()

agg = (cal90.groupby("listing_id", as_index=False)
            .agg(avail_rate_90d=("available_bin","mean"),
                 n_days=("available_bin","size")))

agg["occ_rate_90d"] = 1 - agg["avail_rate_90d"]

# target clasificación (ajusta el umbral si quieres)
THRESH = 0.60
agg["high_demand"] = (agg["occ_rate_90d"] >= THRESH).astype(int)

print(agg.describe())
print(agg["high_demand"].value_counts(dropna=False))


In [None]:
# listings id suele ser "id"
list_id_col = "id" if "id" in listings.columns else None
if list_id_col is None:
    raise ValueError("No encuentro columna id en listings")

model_df = listings.merge(agg, left_on=list_id_col, right_on="listing_id", how="inner")

print("model_df:", model_df.shape)
print(model_df[["price_num","occ_rate_90d","high_demand"]].isna().mean().round(3))
