In [None]:
import pandas as pd

# Merge the Airbnb dataset with the ID listing
df_original = pd.read_csv("data/raw_data.csv").drop_duplicates(subset="Nombre", keep="first")
df_ids = pd.read_csv("data/id_listing.csv", dtype=str).iloc[:, :-1].drop_duplicates(subset="Nombre", keep="first")
df_merged = pd.merge(df_original, df_ids, on="Nombre", how="left")

# Reorder columns: insert new columns after column index 16
for i, col in enumerate([col for col in df_ids.columns if col != "Nombre"]):
    cols = df_merged.columns.tolist()
    cols.insert(16 + 1 + i, cols.pop(cols.index(col)))
    df_merged = df_merged[cols]
    
#df_merged.to_csv("data/airbnb_dataset.csv", index=False)
df_merged

Unnamed: 0,Nombre,#,User,Puntaje,Profile,Link,Limpieza,Veracidad,Check-in,Comunicación,...,11/1/2026,12/1/2026,13/1/2026,14/1/2026,15/1/2026,16/1/2026,17/1/2026,18/1/2026,19/1/2026,20/1/2026
0,Luxury |Boho-Chic |Jacuzzi| Dominique,,Alejandra,5.00,,,5.0,5.0,5.0,5.0,...,,,,,,,,,,
1,14th Floor/Jacuzzi balcony/SeaViews/Parking/King,14XX,Alejandra,5.00,,,4.9,5.0,5.0,5.0,...,,,,,,,,,,
2,2 BR Modern APT jacuzzi / Rooftop near to beach,,Alberto,5.00,,,5.0,5.0,5.0,5.0,...,,,,,,,,,,
3,Piso 12/ Jacuzzi/ Vistas/ 1BR/Parqueadero/Pisc...,,Alejandra,5.00,,,5.0,5.0,5.0,5.0,...,,,,,,,,,,
4,Private jacuzzi |King bed |Parking| Wi-Fi,,Alejandra,5.00,,,5.0,5.0,5.0,5.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
90,Luxury Cartagena dominique 1209,1209,Zenya Host,4.94,,,4.9,4.9,4.9,5.0,...,,,,,,,,,,
91,Luxury cartagena dominique 705,705,Zenya Host,4.94,,,5.0,5.0,4.9,4.9,...,,,,,,,,,,
92,Luxury Cartagena Dominique 1004,,Zenya Host,4.93,,,4.8,5.0,5.0,4.9,...,,,,,,,,,,
93,luxury Cartagena Dominique 807,807,Zenya Host,4.93,,,4.9,4.9,4.9,5.0,...,,,,,,,,,,


In [61]:
len(df_original), len(df_ids), len(df_merged)

(95, 95, 95)

In [None]:
# Show nan entries in the ID column
nan_entries = df_merged[df_merged["ID"].isna()]
if not nan_entries.empty:
    print("Entries with NaN in ID column:")
    print(nan_entries[["Nombre", "ID"]])

Entries with NaN in ID column:
                                               Nombre   ID
0               Luxury |Boho-Chic |Jacuzzi| Dominique  NaN
4           Private jacuzzi |King bed |Parking| Wi-Fi  NaN
23  Private jacuzzi balcony With sea View | 14th f...  NaN
24                           Balcony Jacuzzi, Indulge  NaN
52   Brand New Luxury Apartment Private Jacuzzi DQ901  NaN
53             Apartamento lujoso con Jacuzzi privado  NaN
67    Luxury apartment with Private Jacuzzi Dominique  NaN
81  Luxury Family Retreat w Priv Jacuzzi Dominique...  NaN


In [67]:
import json
import os
from apify_client import ApifyClient
from dotenv import load_dotenv

# Initialize the Apify client with your API token
load_dotenv()
APIFY_TOKEN = os.getenv("APIFY_TOKEN")
client = ApifyClient(APIFY_TOKEN)

"""_summary_
Reduces each dataset item to only the relevant fields.
"""
def slim_down_listing(raw: dict) -> dict:
    return {
        "id":             raw.get("id"),
        "coordinates":    raw.get("coordinates"),
        "propertyType":   raw.get("propertyType"),
        "roomType":       raw.get("roomType"),
        "homeTier":       raw.get("homeTier"),
        "personCapacity": raw.get("personCapacity"),
        "rating":         raw.get("rating"),
        "title":          raw.get("title"),
        "subDescription": raw.get("subDescription"),
        "amenities":      raw.get("amenities"),
    }

"""_summary_
Reads Airbnb room IDs, builds URLs, runs the Apify Actor, and extracts selected fields.
"""
def main():
    # Read the Airbnb listing IDs from 'ids.txt' and convert to Airbnb URL
    #with open("ids.txt", "r", encoding="utf-8") as f:
        #ids = [line.strip() for line in f if line.strip()]
    ids = df_merged["ID"].tolist()
    room_urls = [f"https://www.airbnb.mx/rooms/{room_id}" for room_id in ids]

    # Prepare the input for the actor
    run_input = {"startUrls": [{"url": url} for url in room_urls]}
    run = client.actor("tri_angle~airbnb-rooms-urls-scraper").call(run_input=run_input)

    # Once completed, access the dataset to retrieve the scraped data
    dataset_id = run["defaultDatasetId"]
    all_listings = []
    returned_ids = []
    for item in client.dataset(dataset_id).iterate_items():
        slim = slim_down_listing(item)
        all_listings.append(slim)
        if slim["id"]:
            returned_ids.append(str(slim["id"]))

    # Detectar errores
    expected_ids = list(map(str, ids))
    missing_ids = [eid for eid in expected_ids if eid not in returned_ids]  # Find missing IDs

    if missing_ids:
        print(f"\n{len(missing_ids)} IDs were not successfully scraped:")
        for mid in missing_ids:
            print(f"- https://www.airbnb.mx/rooms/{mid}")
    else:
        print("\nAll IDs successfully scraped.")

    # Save all reduced listings into a JSON file
    with open("data/all_listings.json", "w", encoding="utf-8") as out_f:
        json.dump(all_listings, out_f, ensure_ascii=False, indent=2)
    print(f"\nProcess complete. Saved {len(all_listings)} records into all_listings.json.")

if __name__ == "__main__":
    main()


8 IDs were not successfully scraped:
- https://www.airbnb.mx/rooms/nan
- https://www.airbnb.mx/rooms/nan
- https://www.airbnb.mx/rooms/nan
- https://www.airbnb.mx/rooms/nan
- https://www.airbnb.mx/rooms/nan
- https://www.airbnb.mx/rooms/nan
- https://www.airbnb.mx/rooms/nan
- https://www.airbnb.mx/rooms/nan

Process complete. Saved 87 records into all_listings.json.


In [None]:
import re

def parse_listing(listing: dict) -> dict:
    # Safe getters
    def get_nested(d, *keys, default=None):
        for key in keys:
            d = d.get(key, {})
        return d if d else default

    # Extract counts from items like "2 bedrooms", "4 guests", etc.
    def extract_from_items(items, key):
        pattern = re.compile(rf"(\d+)\s+{key}", re.IGNORECASE)
        for item in items:
            match = pattern.search(item)
            if match:
                return int(match.group(1))
        return None
    
    # Normalize target amenities
    def matches_target(target_amenity: str) -> bool:
        norm_target = target_amenity.lower()
        return any(
            norm_target in a or a in norm_target
            for a in available_amenities
        )

    # Amenity list of interest
    target_amenities = [
        "City skyline view", "Lake view", "Beach view", "Sea view",
        "Private hot tub", "Shared pool", "Shared gym",
        "Outdoor furniture", "Sun loungers",
        "Free washer – In unit", "Free dryer – In unit", "Room-darkening shades",
        "AC - split type ductless system", "Heating",
        "Fast wifi", "Dedicated workspace",
        "Crib", "Window guards", "Babysitter recommendations", "Outdoor playground",
        "Kitchen", "Microwave", "Dishes and silverware", "Dining table",
        "Fire extinguisher", "First aid kit", "Security cameras",
        "Luggage dropoff allowed", "Self check-in", "Keypad"
    ]
    
    # Rename amenities for consistency
    amenity_rename = {
        "Free washer – In unit": "Washer",
        "Free dryer – In unit": "Dryer",
        "AC - split type ductless system": "AC",
        "Luggage dropoff allowed": "Luggage dropoff"
    }

    # Normalize available amenities from listing
    available_amenities = {
        value["title"].strip().lower()
        for group in listing.get("amenities", [])
        for value in group.get("values", [])
        if value.get("available") is True and isinstance(value.get("title"), str)
    }

    # Extract fields
    items = listing.get("subDescription", {}).get("items", [])
    result = {
        "ID": listing.get("id"),
        "latitude": get_nested(listing, "coordinates", "latitude"),
        "longitude": get_nested(listing, "coordinates", "longitude"),
        "propertyType": listing.get("propertyType"),
        "roomType": listing.get("roomType"),
        "homeTier": listing.get("homeTier"),
        "personCapacity": listing.get("personCapacity"),
        "accuracy": get_nested(listing, "rating", "accuracy"),
        "checking": get_nested(listing, "rating", "checking"),
        "cleanliness": get_nested(listing, "rating", "cleanliness"),
        "communication": get_nested(listing, "rating", "communication"),
        "location": get_nested(listing, "rating", "location"),
        "value": get_nested(listing, "rating", "value"),
        "guestSatisfaction": get_nested(listing, "rating", "guestSatisfaction"),
        "reviewsCount": get_nested(listing, "rating", "reviewsCount"),
        "guests": extract_from_items(items, "guests"),
        "bedrooms": extract_from_items(items, "bedroom"),
        "beds": extract_from_items(items, "bed"),
        "baths": extract_from_items(items, "bath"),
    }

    # Add amenities as boolean flags        
    for amenity in target_amenities:
        col_name = amenity_rename.get(amenity, amenity)
        result[col_name] = matches_target(amenity)

    return result

with open("data/all_listings.json", "r", encoding="utf-8") as f:
    listings = json.load(f)
    
parsed_listings = [parse_listing(lst) for lst in listings]
parsed_listings

[{'ID': '890771974735264612',
  'latitude': 10.45615,
  'longitude': -75.50486,
  'propertyType': 'Entire rental unit',
  'roomType': 'Entire home/apt',
  'homeTier': 1,
  'personCapacity': 4,
  'accuracy': 4.78,
  'checking': 4.86,
  'cleanliness': 4.88,
  'communication': 4.86,
  'location': 4.75,
  'value': 4.78,
  'guestSatisfaction': 4.75,
  'reviewsCount': 77,
  'guests': 4,
  'bedrooms': 1,
  'beds': 1,
  'baths': 2,
  'City skyline view': False,
  'Lake view': False,
  'Beach view': False,
  'Sea view': False,
  'Private hot tub': True,
  'Shared pool': True,
  'Shared gym': True,
  'Outdoor furniture': False,
  'Sun loungers': False,
  'Washer': True,
  'Dryer': True,
  'Room-darkening shades': True,
  'AC': False,
  'Heating': False,
  'Fast wifi': True,
  'Dedicated workspace': True,
  'Crib': False,
  'Window guards': False,
  'Babysitter recommendations': False,
  'Outdoor playground': False,
  'Kitchen': True,
  'Microwave': True,
  'Dishes and silverware': True,
  'Dinin

In [93]:
# Insert parsed listings into the merged DataFrame
#df_merged = pd.read_csv("data/airbnb_dataset.csv", dtype=str)
with open("data/all_listings.json", "r", encoding="utf-8") as f:
    listings = json.load(f)
parsed_listings = [parse_listing(listing) for listing in listings]
df = df_merged.merge(pd.DataFrame(parsed_listings),on="ID", how="left")

# Reorder columns: insert new columns after column index 16
cols = df.columns.tolist()
for i, col in enumerate([col for col in parsed_listings[0].keys() if col != "ID"]):
    cols.insert(18 + 1 + i, cols.pop(cols.index(col)))
df = df[cols]

#df.to_csv("data/airbnb_dataset_enriched.csv", index=False)
df.to_csv("data/airbnb_dataset.csv", index=False)
df

Unnamed: 0,Nombre,#,User,Puntaje,Profile,Link,Limpieza,Veracidad,Check-in,Comunicación,...,11/1/2026,12/1/2026,13/1/2026,14/1/2026,15/1/2026,16/1/2026,17/1/2026,18/1/2026,19/1/2026,20/1/2026
0,Luxury |Boho-Chic |Jacuzzi| Dominique,,Alejandra,5.0,,,5.0,5.0,5.0,5.0,...,,,,,,,,,,
1,14th Floor/Jacuzzi balcony/SeaViews/Parking/King,14XX,Alejandra,5.0,,,4.9,5.0,5.0,5.0,...,,,,,,,,,,
2,2 BR Modern APT jacuzzi / Rooftop near to beach,,Alberto,5.0,,,5.0,5.0,5.0,5.0,...,,,,,,,,,,
3,Piso 12/ Jacuzzi/ Vistas/ 1BR/Parqueadero/Pisc...,,Alejandra,5.0,,,5.0,5.0,5.0,5.0,...,,,,,,,,,,
4,Private jacuzzi |King bed |Parking| Wi-Fi,,Alejandra,5.0,,,5.0,5.0,5.0,5.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
90,Luxury Cartagena dominique 1209,1209,Zenya Host,4.94,,,4.9,4.9,4.9,5.0,...,,,,,,,,,,
91,Luxury cartagena dominique 705,705,Zenya Host,4.94,,,5.0,5.0,4.9,4.9,...,,,,,,,,,,
92,Luxury Cartagena Dominique 1004,,Zenya Host,4.93,,,4.8,5.0,5.0,4.9,...,,,,,,,,,,
93,luxury Cartagena Dominique 807,807,Zenya Host,4.93,,,4.9,4.9,4.9,5.0,...,,,,,,,,,,


In [94]:
df[['ID','Jacuzzi', 'Private hot tub']]

Unnamed: 0,ID,Jacuzzi,Private hot tub
0,,SI,
1,1282890977555839180,SI,True
2,1255908021842197930,SI,True
3,1364376551860961934,SI,True
4,,SI,
...,...,...,...
90,982475113924685879,SI,True
91,1061083288618779943,SI,True
92,1131261936147332933,SI,True
93,1061118933427624758,SI,True


In [95]:
df['Jacuzzi'].value_counts(), df['Private hot tub'].value_counts()

(Jacuzzi
 SI    82
 NO    13
 Name: count, dtype: int64,
 Private hot tub
 True     71
 False    16
 Name: count, dtype: int64)