In [14]:
from pathlib import Path
import requests
import pandas as pd

# ========== 1) Config ==========
DATA_DIR = Path("data")
DATA_DIR.mkdir(parents=True, exist_ok=True)

BASE = "https://data.insideairbnb.com/mexico/df/mexico-city/2025-09-27"

URLS = {
    "listings.csv.gz": f"{BASE}/data/listings.csv.gz",
    "calendar.csv.gz": f"{BASE}/data/calendar.csv.gz",
    "reviews.csv.gz":  f"{BASE}/data/reviews.csv.gz",
    # opcionales (barrios)
    "neighbourhoods.csv":     f"{BASE}/visualisations/neighbourhoods.csv",
    "neighbourhoods.geojson": f"{BASE}/visualisations/neighbourhoods.geojson",
}

# ========== 2) Descarga robusta ==========
def download(url: str, outpath: Path, timeout=60):
    if outpath.exists() and outpath.stat().st_size > 0:
        print(f"✔ Ya existe: {outpath.name} ({outpath.stat().st_size:,} bytes)")
        return

    print(f"⬇ Descargando {outpath.name} ...")
    with requests.get(url, stream=True, timeout=timeout) as r:
        r.raise_for_status()
        # Detectar si te devolvió HTML (a veces pasa con bloqueos/redirecciones)
        ctype = (r.headers.get("Content-Type") or "").lower()
        if "text/html" in ctype:
            raise RuntimeError(
                f"Descarga devolvió HTML en vez de archivo para {outpath.name}. "
                "Revisa red/proxy o intenta desde otra red."
            )

        with open(outpath, "wb") as f:
            for chunk in r.iter_content(chunk_size=1024 * 1024):
                if chunk:
                    f.write(chunk)

    print(f"✅ Listo: {outpath.name} ({outpath.stat().st_size:,} bytes)")

# Descarga los principales (y opcionales)
for fname, url in URLS.items():
    download(url, DATA_DIR / fname)

# ========== 3) Lectura en pandas ==========
listings = pd.read_csv(DATA_DIR / "listings.csv.gz", compression="gzip", low_memory=False)
calendar = pd.read_csv(DATA_DIR / "calendar.csv.gz", compression="gzip", low_memory=False)
reviews  = pd.read_csv(DATA_DIR / "reviews.csv.gz",  compression="gzip", low_memory=False)

print("listings:", listings.shape)
print("calendar:", calendar.shape)
print("reviews:", reviews.shape)

✔ Ya existe: listings.csv.gz (14,513,553 bytes)
✔ Ya existe: calendar.csv.gz (23,037,908 bytes)
✔ Ya existe: reviews.csv.gz (141,542,027 bytes)
✔ Ya existe: neighbourhoods.csv (275 bytes)
✔ Ya existe: neighbourhoods.geojson (342,744 bytes)
listings: (27051, 79)
calendar: (9873624, 7)
reviews: (1454740, 6)


In [15]:
import pandas as pd
import numpy as np

DATA_DIR = "data" 

listings_path = f"{DATA_DIR}/listings.csv.gz"
calendar_path = f"{DATA_DIR}/calendar.csv.gz"

listings = pd.read_csv(listings_path, compression="gzip", low_memory=False)
calendar = pd.read_csv(calendar_path, compression="gzip", low_memory=False)

print("listings:", listings.shape)
print("calendar:", calendar.shape)

# columnas típicas que nos interesan (si existen)
cols_interest = [
    "id","name","neighbourhood_cleansed","latitude","longitude","property_type",
    "room_type","accommodates","bathrooms_text","bedrooms","beds",
    "amenities","price","minimum_nights","maximum_nights",
    "number_of_reviews","review_scores_rating","reviews_per_month",
    "host_is_superhost","instant_bookable"
]
existing = [c for c in cols_interest if c in listings.columns]
print("Columnas presentes:", existing)

# Missingness rápido
miss = (listings.isna().mean().sort_values(ascending=False) * 100).round(2)

listings: (27051, 79)
calendar: (9873624, 7)
Columnas presentes: ['id', 'name', 'neighbourhood_cleansed', 'latitude', 'longitude', 'property_type', 'room_type', 'accommodates', 'bathrooms_text', 'bedrooms', 'beds', 'amenities', 'price', 'minimum_nights', 'maximum_nights', 'number_of_reviews', 'review_scores_rating', 'reviews_per_month', 'host_is_superhost', 'instant_bookable']


In [16]:
listings.isna().mean().sort_values(ascending=False)

neighbourhood_group_cleansed    1.000000
calendar_updated                1.000000
license                         1.000000
neighborhood_overview           0.492218
neighbourhood                   0.492218
                                  ...   
minimum_nights_avg_ntm          0.000000
maximum_nights_avg_ntm          0.000000
availability_30                 0.000000
availability_60                 0.000000
amenities                       0.000000
Length: 79, dtype: float64

In [17]:
def clean_price(s: pd.Series) -> pd.Series:
    # convierte a string, quita símbolos y separadores, a float
    out = (s.astype(str)
             .str.replace(r"[\$,]", "", regex=True)
             .str.strip())
    # valores que eran NaN quedan como "nan"
    out = out.replace({"nan": np.nan, "": np.nan, "None": np.nan})
    return pd.to_numeric(out, errors="coerce")

if "price" in listings.columns:
    listings["price_num"] = clean_price(listings["price"])
    print("price_num missing %:", listings["price_num"].isna().mean().round(4))
    print(listings["price_num"].describe(percentiles=[.05,.25,.5,.75,.95]))
else:
    print("No existe columna price en listings.")


price_num missing %: 0.1288
count     23567.000000
mean       1792.540841
std       13230.940558
min          61.000000
5%          333.000000
25%         643.000000
50%        1039.000000
75%        1611.000000
95%        3831.000000
max      900000.000000
Name: price_num, dtype: float64


In [18]:
# Opción A: filtrar
df_price = listings.dropna(subset=["price_num"]).copy()

# Opción B: imputación por grupo (si quieres más dificultad)
group_cols = [c for c in ["neighbourhood_cleansed","room_type"] if c in listings.columns]
if group_cols:
    medians = listings.groupby(group_cols)["price_num"].transform("median")
    listings["price_imputed"] = listings["price_num"].fillna(medians)
    # fallback global
    listings["price_imputed"] = listings["price_imputed"].fillna(listings["price_num"].median())
else:
    listings["price_imputed"] = listings["price_num"].fillna(listings["price_num"].median())


In [23]:
df_price

Unnamed: 0,id,listing_url,scrape_id,last_scraped,source,name,description,neighborhood_overview,picture_url,host_id,...,review_scores_location,review_scores_value,license,instant_bookable,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month,price_num
0,35797,https://www.airbnb.com/rooms/35797,20250927041820,2025-09-27,city scrape,Villa Dante,"Dentro de Villa un estudio de arte con futon, ...","Santa Fe Shopping Mall, Interlomas Park and th...",https://a0.muscache.com/pictures/f395ab78-1185...,153786,...,,,,f,1,1,0,0,,3673.0
1,44616,https://www.airbnb.com/rooms/44616,20250927041820,2025-09-28,city scrape,Condesa Haus,A new concept of hosting in mexico through a b...,,https://a0.muscache.com/pictures/251410/ec75fe...,196253,...,4.98,4.47,,f,9,4,2,0,0.38,18000.0
2,56074,https://www.airbnb.com/rooms/56074,20250927041820,2025-09-28,city scrape,Great space in historical San Rafael,This great apartment is located in one of the ...,Very traditional neighborhood with all service...,https://a0.muscache.com/pictures/3005118/60dac...,265650,...,4.76,4.79,,f,1,1,0,0,0.48,591.0
6,165772,https://www.airbnb.com/rooms/165772,20250927041820,2025-09-28,city scrape,BEST 5 Bedroom HOUSE IN S. Miguel Chapultepec,Welcome to Your Home in Mexico City<br />We ha...,San Miguel Chapultepec is the best kept secret...,https://a0.muscache.com/pictures/miso/Hosting-...,790208,...,4.75,4.90,,f,5,5,0,0,2.24,3673.0
7,171109,https://www.airbnb.com/rooms/171109,20250927041820,2025-09-28,city scrape,Cool room near WTC and Metrobus,"Stay in a private room 2 blocks away from WTC,...","Great location, feel comfortable and secure at...",https://a0.muscache.com/pictures/16040866/8b0a...,816295,...,4.97,4.83,,f,2,0,2,0,0.88,321.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27046,1518356968266486778,https://www.airbnb.com/rooms/1518356968266486778,20250927041820,2025-09-27,city scrape,Suite Frida,Suite Frida is the most spacious room in Casa ...,,https://a0.muscache.com/pictures/hosting/Hosti...,674644167,...,,,,t,5,3,2,0,,1096.0
27047,1518388105556721163,https://www.airbnb.com/rooms/1518388105556721163,20250927041820,2025-09-27,city scrape,Habitación a un costado de Ciudad Universitaria,Feel at home in your private corner next to Ci...,,https://a0.muscache.com/pictures/hosting/Hosti...,222854336,...,,,,f,1,0,1,0,,260.0
27048,1518433354720281854,https://www.airbnb.com/rooms/1518433354720281854,20250927041820,2025-09-28,city scrape,Espacioso departamento en Anzures,"Beautiful apartment a few blocks from Polanco,...",,https://a0.muscache.com/pictures/hosting/Hosti...,470800231,...,,,,t,20,15,5,0,,996.0
27049,1518462568893975567,https://www.airbnb.com/rooms/1518462568893975567,20250927041820,2025-09-28,city scrape,Casa Roma Baja 05,It is a mini apartment with an independent ent...,,https://a0.muscache.com/pictures/hosting/Hosti...,710857718,...,,,,f,7,2,5,0,,458.0


In [19]:
# columnas típicas: listing_id, date, available, price, adjusted_price, minimum_nights, maximum_nights
print(calendar.columns)

# parse date
if "date" in calendar.columns:
    calendar["date"] = pd.to_datetime(calendar["date"], errors="coerce")

# available suele venir como "t"/"f"
if "available" in calendar.columns:
    calendar["available_bin"] = calendar["available"].astype(str).str.lower().map({"t":1, "f":0})


Index(['listing_id', 'date', 'available', 'price', 'adjusted_price',
       'minimum_nights', 'maximum_nights'],
      dtype='str')


In [33]:
ids = [35797]
filtro = calendar[calendar.listing_id.isin(ids)].sort_values(by='date',ascending=True)
filtro.to_csv(f'f{ids}.csv')


In [20]:
# nos quedamos con un horizonte (por ejemplo 90 días desde el mínimo de fecha)
start = calendar["date"].min()
end = start + pd.Timedelta(days=90)

cal90 = calendar[(calendar["date"] >= start) & (calendar["date"] < end)].copy()

agg = (cal90.groupby("listing_id", as_index=False)
            .agg(avail_rate_90d=("available_bin","mean"),
                 n_days=("available_bin","size")))

agg["occ_rate_90d"] = 1 - agg["avail_rate_90d"]

# target clasificación (ajusta el umbral si quieres)
THRESH = 0.60
agg["high_demand"] = (agg["occ_rate_90d"] >= THRESH).astype(int)

print(agg.describe())
print(agg["high_demand"].value_counts(dropna=False))


         listing_id  avail_rate_90d        n_days  occ_rate_90d   high_demand
count  2.705100e+04    27051.000000  27051.000000  27051.000000  27051.000000
mean   7.003556e+17        0.594052     89.268604      0.405948      0.289121
std    5.702714e+17        0.349195      0.443242      0.349195      0.453362
min    3.579700e+04        0.000000     89.000000      0.000000      0.000000
25%    4.410472e+07        0.314607     89.000000      0.089888      0.000000
50%    8.307368e+17        0.674157     89.000000      0.325843      0.000000
75%    1.217382e+18        0.910112     90.000000      0.685393      1.000000
max    1.518561e+18        1.000000     90.000000      1.000000      1.000000
high_demand
0    19230
1     7821
Name: count, dtype: int64


In [25]:
agg

Unnamed: 0,listing_id,avail_rate_90d,n_days,occ_rate_90d,high_demand
0,35797,0.977778,90,0.022222,0
1,44616,0.943820,89,0.056180,0
2,56074,0.640449,89,0.359551,0
3,67703,0.101124,89,0.898876,1
4,70644,0.539326,89,0.460674,0
...,...,...,...,...,...
27046,1518356968266486778,1.000000,90,0.000000,0
27047,1518388105556721163,1.000000,90,0.000000,0
27048,1518433354720281854,1.000000,89,0.000000,0
27049,1518462568893975567,0.977528,89,0.022472,0


In [21]:
# listings id suele ser "id"
list_id_col = "id" if "id" in listings.columns else None
if list_id_col is None:
    raise ValueError("No encuentro columna id en listings")

model_df = listings.merge(agg, left_on=list_id_col, right_on="listing_id", how="inner")

print("model_df:", model_df.shape)
print(model_df[["price_num","occ_rate_90d","high_demand"]].isna().mean().round(3))


model_df: (27051, 86)
price_num       0.129
occ_rate_90d    0.000
high_demand     0.000
dtype: float64
