# Web Scrapping on Airbnb API

In [134]:
import pandas as pd

# Merge the Airbnb dataset with the ID listing
df_original = pd.read_csv("data/raw_data.csv").drop_duplicates(subset="Nombre", keep="first")
df_ids = pd.read_csv("data/id_listing.csv", dtype=str).iloc[:, :-1].drop_duplicates(subset="Nombre", keep="first")
df_merged = pd.merge(df_original, df_ids, on="Nombre", how="left")

# Reorder columns: insert new columns after column index 16
for i, col in enumerate([col for col in df_ids.columns if col != "Nombre"]):
    cols = df_merged.columns.tolist()
    cols.insert(16 + 1 + i, cols.pop(cols.index(col)))
    df_merged = df_merged[cols]
    
#df_merged.to_csv("data/airbnb_dataset.csv", index=False)
df_merged

Unnamed: 0,Nombre,#,User,Puntaje,Profile,Link,Limpieza,Veracidad,Check-in,Comunicación,...,11/1/2026,12/1/2026,13/1/2026,14/1/2026,15/1/2026,16/1/2026,17/1/2026,18/1/2026,19/1/2026,20/1/2026
0,Luxury |Boho-Chic |Jacuzzi| Dominique,,Alejandra,5.00,,,5.0,5.0,5.0,5.0,...,,,,,,,,,,
1,14th Floor/Jacuzzi balcony/SeaViews/Parking/King,14XX,Alejandra,5.00,,,4.9,5.0,5.0,5.0,...,,,,,,,,,,
2,2 BR Modern APT jacuzzi / Rooftop near to beach,,Alberto,5.00,,,5.0,5.0,5.0,5.0,...,,,,,,,,,,
3,Piso 12/ Jacuzzi/ Vistas/ 1BR/Parqueadero/Pisc...,,Alejandra,5.00,,,5.0,5.0,5.0,5.0,...,,,,,,,,,,
4,Private jacuzzi |King bed |Parking| Wi-Fi,,Alejandra,5.00,,,5.0,5.0,5.0,5.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
90,Luxury Cartagena dominique 1209,1209,Zenya Host,4.94,,,4.9,4.9,4.9,5.0,...,,,,,,,,,,
91,Luxury cartagena dominique 705,705,Zenya Host,4.94,,,5.0,5.0,4.9,4.9,...,,,,,,,,,,
92,Luxury Cartagena Dominique 1004,,Zenya Host,4.93,,,4.8,5.0,5.0,4.9,...,,,,,,,,,,
93,luxury Cartagena Dominique 807,807,Zenya Host,4.93,,,4.9,4.9,4.9,5.0,...,,,,,,,,,,


In [129]:
len(df_original), len(df_ids), len(df_merged)

(95, 95, 95)

In [None]:
# Show nan entries in the ID column
nan_entries = df_merged[df_merged["ID"].isna()]
if not nan_entries.empty:
    print("Entries with NaN in ID column:")
    print(nan_entries[["Nombre", "ID"]])

Entries with NaN in ID column:
                                               Nombre   ID
0               Luxury |Boho-Chic |Jacuzzi| Dominique  NaN
4           Private jacuzzi |King bed |Parking| Wi-Fi  NaN
23  Private jacuzzi balcony With sea View | 14th f...  NaN
24                           Balcony Jacuzzi, Indulge  NaN
52   Brand New Luxury Apartment Private Jacuzzi DQ901  NaN
53             Apartamento lujoso con Jacuzzi privado  NaN
67    Luxury apartment with Private Jacuzzi Dominique  NaN
81  Luxury Family Retreat w Priv Jacuzzi Dominique...  NaN


In [67]:
import json
import os
from apify_client import ApifyClient
from dotenv import load_dotenv

# Initialize the Apify client with your API token
load_dotenv()
APIFY_TOKEN = os.getenv("APIFY_TOKEN")
client = ApifyClient(APIFY_TOKEN)

"""_summary_
Reduces each dataset item to only the relevant fields.
"""
def slim_down_listing(raw: dict) -> dict:
    return {
        "id":             raw.get("id"),
        "coordinates":    raw.get("coordinates"),
        "propertyType":   raw.get("propertyType"),
        "roomType":       raw.get("roomType"),
        "homeTier":       raw.get("homeTier"),
        "personCapacity": raw.get("personCapacity"),
        "rating":         raw.get("rating"),
        "title":          raw.get("title"),
        "subDescription": raw.get("subDescription"),
        "amenities":      raw.get("amenities"),
    }

"""_summary_
Reads Airbnb room IDs, builds URLs, runs the Apify Actor, and extracts selected fields.
"""
def main():
    # Read the Airbnb listing IDs from 'ids.txt' and convert to Airbnb URL
    #with open("ids.txt", "r", encoding="utf-8") as f:
        #ids = [line.strip() for line in f if line.strip()]
    ids = df_merged["ID"].tolist()
    room_urls = [f"https://www.airbnb.mx/rooms/{room_id}" for room_id in ids]

    # Prepare the input for the actor
    run_input = {"startUrls": [{"url": url} for url in room_urls]}
    run = client.actor("tri_angle~airbnb-rooms-urls-scraper").call(run_input=run_input)

    # Once completed, access the dataset to retrieve the scraped data
    dataset_id = run["defaultDatasetId"]
    all_listings = []
    returned_ids = []
    for item in client.dataset(dataset_id).iterate_items():
        slim = slim_down_listing(item)
        all_listings.append(slim)
        if slim["id"]:
            returned_ids.append(str(slim["id"]))

    # Detectar errores
    expected_ids = list(map(str, ids))
    missing_ids = [eid for eid in expected_ids if eid not in returned_ids]  # Find missing IDs

    if missing_ids:
        print(f"\n{len(missing_ids)} IDs were not successfully scraped:")
        for mid in missing_ids:
            print(f"- https://www.airbnb.mx/rooms/{mid}")
    else:
        print("\nAll IDs successfully scraped.")

    # Save all reduced listings into a JSON file
    with open("data/all_listings.json", "w", encoding="utf-8") as out_f:
        json.dump(all_listings, out_f, ensure_ascii=False, indent=2)
    print(f"\nProcess complete. Saved {len(all_listings)} records into all_listings.json.")

if __name__ == "__main__":
    main()


8 IDs were not successfully scraped:
- https://www.airbnb.mx/rooms/nan
- https://www.airbnb.mx/rooms/nan
- https://www.airbnb.mx/rooms/nan
- https://www.airbnb.mx/rooms/nan
- https://www.airbnb.mx/rooms/nan
- https://www.airbnb.mx/rooms/nan
- https://www.airbnb.mx/rooms/nan
- https://www.airbnb.mx/rooms/nan

Process complete. Saved 87 records into all_listings.json.


In [152]:
import re

def parse_listing(listing: dict) -> dict:
    # Get nested dictionary safely
    def get_nested(d, *keys, default=None):
        for key in keys:
            d = d.get(key, {})
        return d if d else default

    # Extract counts from items like "2 bedrooms", "4 guests", etc.
    def extract_from_items(items, key):
        pattern = re.compile(rf"(\d+)\s+{key}", re.IGNORECASE)
        for item in items:
            match = pattern.search(item)
            if match:
                return int(match.group(1))
        return None

    # Define icons of interest with their corresponding descriptions
    target_icons = [
        ("SYSTEM_VIEW_CITY", "City skyline view"),  
        ("SYSTEM_BEACH", "Beach view"),
        ("SYSTEM_VIEW_OCEAN", "Sea/Lake view"),  
        
        # Amenities related to pools, spaces and outdoor areas
        ("SYSTEM_HOT_WATER", "Hot water"),
        ("SYSTEM_JACUZZI", "Jacuzzi"),
        ("SYSTEM_POOL", "Shared pool"),
        ("SYSTEM_GYM", "Shared gym"),
        ("SYSTEM_PATIO_BALCONY", "Patio or balcony"),
        ("SYSTEM_ROOFTOP_DECK", "Outdoor furniture"),
        ("SYSTEM_PLAY_SLIDE", "Outdoor playground"),
        ("SYSTEM_ELEVATOR", "Elevator"),
        ("SYSTEM_MAPS_CAR_RENTAL", "Carport"),
        ("SYSTEM_WORKSPACE", "Dedicated workspace"),
        
        # Amenities related to comfort and convenience
        ("SYSTEM_SNOWFLAKE", "AC"),
        ("SYSTEM_THERMOMETER", "Heating"),
        ("SYSTEM_TV", "TV"),
        ("SYSTEM_CABLE", "Cable TV"),
        ("SYSTEM_WI_FI", "Wifi"),
        ("SYSTEM_LAUNDRY_SERVICE", "Laundry service"),
        ("SYSTEM_COOKING_BASICS", "Kitchen"),
        ("SYSTEM_DINING_TABLE", "Dining table"),
        ("SYSTEM_MICROWAVE", "Microwave"),
        ("SYSTEM_DISHES_AND_SILVERWARE", "Dishes and silverware"),
        ("SYSTEM_REFRIGERATOR", "Refrigerator"),
        ("SYSTEM_STOVE", "Stove"),
        ("SYSTEM_COFFEE_MAKER", "Coffee maker"),
        ("SYSTEM_WASHER", "Washer"),
        ("SYSTEM_DRYER", "Dryer"),
        ("SYSTEM_SUN_DECK", "Sun loungers"),
        ("SYSTEM_PETS", "Pets allowed"),
        ("SYSTEM_CHILD", "Babysitter recommendations"),
        ("SYSTEM_CRIB", "Crib"),
        
        # Amenities related to safety and security
        ("SYSTEM_BLACKOUT_SHADES", "Room-darkening shades"),
        ("SYSTEM_WINDOW_GUARDS", "Window guards"),
        ("SYSTEM_SURVEILLANCE", "Security cameras"),
        ("SYSTEM_LOCK_ON_DOOR", "Lock on door"),
        ("SYSTEM_KEYPAD", "Keypad"),
        ("SYSTEM_FIRE_EXTINGUISHER", "Fire extinguisher"),
        ("SYSTEM_FIRST_AID_KIT", "First aid kit"),
        ("SYSTEM_DETECTOR_SMOKE", "Smoke detector"),
        
        # Amenities related to check-in and luggage
        ("SYSTEM_LUGGAGE_DROP", "Luggage dropoff"),
        ("SYSTEM_KEY", "Self check-in")
    ]
    
    # Extract available amenity icons
    available_icons = {
        value["icon"]
        for group in listing.get("amenities", [])
        for value in group.get("values", [])
        if value.get("available") is True and isinstance(value.get("icon"), str)
    }

    # Extract other listing fields
    items = listing.get("subDescription", {}).get("items", [])
    result = {
        "ID": listing.get("id"),
        "latitude": get_nested(listing, "coordinates", "latitude"),
        "longitude": get_nested(listing, "coordinates", "longitude"),
        "Property type": listing.get("propertyType"),
        "Room type": listing.get("roomType"),
        "Home tier": listing.get("homeTier"),
        "Person capacity": listing.get("personCapacity"),
        "accuracy_rating": get_nested(listing, "rating", "accuracy"),
        "checking_rating": get_nested(listing, "rating", "checking"),
        "cleanliness_rating": get_nested(listing, "rating", "cleanliness"),
        "communication_rating": get_nested(listing, "rating", "communication"),
        "location_rating": get_nested(listing, "rating", "location"),
        "value_rating": get_nested(listing, "rating", "value"),
        "satisfaction_rating": get_nested(listing, "rating", "guestSatisfaction"),
        "Reviews": get_nested(listing, "rating", "reviewsCount"),
        "Guests": extract_from_items(items, "guests"),
        "Bedrooms": extract_from_items(items, "bedroom"),
        "Beds": extract_from_items(items, "bed"),
        "Baths": extract_from_items(items, "bath"),
    }

    # Add amenities as boolean flags        
    for icon_code, readable_name in target_icons:
            result[readable_name] = icon_code in available_icons

    return result

with open("data/all_listings.json", "r", encoding="utf-8") as f:
    listings = json.load(f)
parse_listing(listings[0]) 

{'ID': '890771974735264612',
 'latitude': 10.45615,
 'longitude': -75.50486,
 'Property type': 'Entire rental unit',
 'Room type': 'Entire home/apt',
 'Home tier': 1,
 'Person capacity': 4,
 'accuracy_rating': 4.78,
 'checking_rating': 4.86,
 'cleanliness_rating': 4.88,
 'communication_rating': 4.86,
 'location_rating': 4.75,
 'value_rating': 4.78,
 'satisfaction_rating': 4.75,
 'Reviews': 77,
 'Guests': 4,
 'Bedrooms': 1,
 'Beds': 1,
 'Baths': 2,
 'City skyline view': False,
 'Beach view': False,
 'Sea/Lake view': False,
 'Hot water': True,
 'Jacuzzi': True,
 'Shared pool': True,
 'Shared gym': True,
 'Patio or balcony': False,
 'Outdoor furniture': False,
 'Outdoor playground': False,
 'Elevator': True,
 'Carport': True,
 'Dedicated workspace': True,
 'AC': True,
 'Heating': False,
 'TV': True,
 'Cable TV': False,
 'Wifi': True,
 'Laundry service': False,
 'Kitchen': True,
 'Dining table': False,
 'Microwave': True,
 'Dishes and silverware': True,
 'Refrigerator': True,
 'Stove': Tru

In [172]:
# Insert parsed listings into the merged DataFrame
with open("data/all_listings.json", "r", encoding="utf-8") as f:
    listings = json.load(f)
parsed_listings = [parse_listing(listing) for listing in listings]
df_merged = df_merged.drop(columns=["Jacuzzi", 'Reviews'], errors="ignore")
df = df_merged.merge(pd.DataFrame(parsed_listings),on="ID", how="left").copy()

# Reorder columns: insert new columns after column index 16
cols = df.columns.tolist()
for i, col in enumerate([col for col in parsed_listings[0].keys() if col != "ID"]):
    cols.insert(16 + 1 + i, cols.pop(cols.index(col)))
df = df[cols]

df.to_csv("data/airbnb_dataset.csv", index=False)
df

Unnamed: 0,Nombre,#,User,Puntaje,Profile,Link,Limpieza,Veracidad,Check-in,Comunicación,...,11/1/2026,12/1/2026,13/1/2026,14/1/2026,15/1/2026,16/1/2026,17/1/2026,18/1/2026,19/1/2026,20/1/2026
0,Luxury |Boho-Chic |Jacuzzi| Dominique,,Alejandra,5.00,,,5.0,5.0,5.0,5.0,...,,,,,,,,,,
1,14th Floor/Jacuzzi balcony/SeaViews/Parking/King,14XX,Alejandra,5.00,,,4.9,5.0,5.0,5.0,...,,,,,,,,,,
2,2 BR Modern APT jacuzzi / Rooftop near to beach,,Alberto,5.00,,,5.0,5.0,5.0,5.0,...,,,,,,,,,,
3,Piso 12/ Jacuzzi/ Vistas/ 1BR/Parqueadero/Pisc...,,Alejandra,5.00,,,5.0,5.0,5.0,5.0,...,,,,,,,,,,
4,Private jacuzzi |King bed |Parking| Wi-Fi,,Alejandra,5.00,,,5.0,5.0,5.0,5.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
90,Luxury Cartagena dominique 1209,1209,Zenya Host,4.94,,,4.9,4.9,4.9,5.0,...,,,,,,,,,,
91,Luxury cartagena dominique 705,705,Zenya Host,4.94,,,5.0,5.0,4.9,4.9,...,,,,,,,,,,
92,Luxury Cartagena Dominique 1004,,Zenya Host,4.93,,,4.8,5.0,5.0,4.9,...,,,,,,,,,,
93,luxury Cartagena Dominique 807,807,Zenya Host,4.93,,,4.9,4.9,4.9,5.0,...,,,,,,,,,,


# Data Cleaning

In [348]:
# Remove unwanted columns
df = pd.read_csv("data/airbnb_dataset.csv", dtype={"ID": str})
df = df.drop(columns=['Profile', 'Link', 'Puntaje', 'Limpieza', 'Veracidad', 'Check-in', 'Comunicación', 'Ubicación', 'Calidad'], errors='ignore')

# Rename columns for consistency
df = df.rename(columns={
    "Nombre": "Name",
    'User': 'Host',
    'Tarifa': 'Base fee',
    'Cleaning': 'Cleaning fee',
    '#': 'Room number'
})
df

Unnamed: 0,Name,Room number,Host,Base fee,Cleaning fee,Habs,URL,ID,latitude,longitude,...,11/1/2026,12/1/2026,13/1/2026,14/1/2026,15/1/2026,16/1/2026,17/1/2026,18/1/2026,19/1/2026,20/1/2026
0,Luxury |Boho-Chic |Jacuzzi| Dominique,,Alejandra,$79.00,$36.00,2,,,,,...,,,,,,,,,,
1,14th Floor/Jacuzzi balcony/SeaViews/Parking/King,14XX,Alejandra,$95.00,$38.00,2,https://www.airbnb.ca/rooms/128289097755583918...,1282890977555839180,10.456400,-75.505410,...,,,,,,,,,,
2,2 BR Modern APT jacuzzi / Rooftop near to beach,,Alberto,$100.00,$35.00,2,https://www.airbnb.com/rooms/12559080218421979...,1255908021842197930,10.456549,-75.505265,...,,,,,,,,,,
3,Piso 12/ Jacuzzi/ Vistas/ 1BR/Parqueadero/Pisc...,,Alejandra,$67.00,$36.00,1,https://www.airbnb.mx/rooms/136437655186096193...,1364376551860961934,10.456523,-75.505361,...,,,,,,,,,,
4,Private jacuzzi |King bed |Parking| Wi-Fi,,Alejandra,$81.00,$36.00,2,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
90,Luxury Cartagena dominique 1209,1209,Zenya Host,$96.00,$36.41,1,https://www.airbnb.mx/rooms/982475113924685879...,982475113924685879,10.456388,-75.505411,...,,,,,,,,,,
91,Luxury cartagena dominique 705,705,Zenya Host,$89.32,$36.32,1,https://www.airbnb.mx/rooms/106108328861877994...,1061083288618779943,10.456420,-75.505390,...,,,,,,,,,,
92,Luxury Cartagena Dominique 1004,,Zenya Host,$116.42,$36.35,2,https://www.airbnb.mx/rooms/113126193614733293...,1131261936147332933,10.456388,-75.505411,...,,,,,,,,,,
93,luxury Cartagena Dominique 807,807,Zenya Host,$97.00,$36.00,1,https://www.airbnb.mx/rooms/106111893342762475...,1061118933427624758,10.456388,-75.505411,...,,,,,,,,,,


In [349]:
print(f"Number of 'Habs': {df['Habs'].sum()} \t\t\tNumber of 'Bedrooms': {df['Bedrooms'].sum()}")
print(f"Number of 'Person capacity': {df['Person capacity'].sum()} \tNumber of 'Guests': {df['Guests'].sum()}")
print("\n", df['Room type'].value_counts())
print("\n", df['Home tier'].value_counts())

Number of 'Habs': 145 			Number of 'Bedrooms': 137.0
Number of 'Person capacity': 420.0 	Number of 'Guests': 420.0

 Room type
Entire home/apt    87
Name: count, dtype: int64

 Home tier
1.0    87
Name: count, dtype: int64


In [350]:
(df['Habs'] - df['Bedrooms']).sort_values(ascending=True).head(5)

40   -2.0
16   -2.0
18   -1.0
1     0.0
69    0.0
dtype: float64

In [351]:
# Drop some columns that are not needed or does not add value
df = df.drop(columns=['Habs', 'Guests', 'Room type', 'Home tier', 'Room number'], errors='ignore')
df = df.dropna(subset=['ID']) # Drop nan IDs

# Extract time series columns
date_cols = [col for col in df.columns if re.fullmatch(r"\d{1,2}/\d{1,2}/\d{4}", col)]
base_ts_cols = ['Name', 'ID', 'Base fee', 'Cleaning fee', 'latitude', 'longitude', 'Property type']
time_series_cols = ['Noches 05', 'Noches 06', 'Noches 07'] + date_cols

# Create DataFrames 
df_timeseries = df[base_ts_cols + time_series_cols].copy()
df_attributes = df.drop(columns=time_series_cols)
df_attributes.to_csv("clean_data/airbnb_attributes.csv", index=False)
df_timeseries.shape, df_attributes.shape

((87, 250), (87, 63))

In [352]:
# Search for non-empty date columns
non_empty_dates = [
    col for col in date_cols 
    if not df_timeseries[col].replace("", pd.NA).isna().all()
]
print("First valid date:", non_empty_dates[0], "\nLast valid date:", non_empty_dates[-1])

# Trim the DataFrame
base_ts_cols = [col for col in df_timeseries.columns if col not in date_cols]
df_timeseries = df_timeseries[base_ts_cols + non_empty_dates]
df_timeseries.to_csv("clean_data/airbnb_time_series.csv", index=False)
df_timeseries.shape

First valid date: 26/5/2025 
Last valid date: 31/7/2025


(87, 77)

In [353]:
import numpy as np
from itertools import groupby
df_timeseries = pd.read_csv("clean_data/airbnb_time_series.csv", dtype={"ID": str})

# Replace -1 and -0.5 with NaN in the date columns
df_timeseries[non_empty_dates] = df_timeseries[non_empty_dates].replace([-1, -0.5], np.nan)

# Calculate the maximum gap of NaNs in each row for the non-empty date columns
def max_nan_gap(series):
    return max((len(list(g)) for k, g in groupby(series.isna()) if k), default=0)
df_timeseries["max_nan_gap"] = df_timeseries[non_empty_dates].apply(max_nan_gap, axis=1)
df_timeseries.sort_values("max_nan_gap", ascending=False)[["Name", "ID", "max_nan_gap"]].head(20)

Unnamed: 0,Name,ID,max_nan_gap
75,Luxury Condo & private jacuzzi Dominique DQ810,865497675793371668,28
43,New Luxury + Private Jacuzzi + Nesspresso + Views,1305119491130976920,24
77,DQ505 Nuevo hermoso jacuzzi priv,856430871775844191,22
76,Luxury apartment private jacuzzi Dominique DQ1305,852769563272239362,20
4,Luxury /Jacuzzi/ Decoration/ Caribbean/ Pools/,1259780256818112667,20
64,Luxury Condo Private Jacuzzi Dominique DQ805,852989152866665235,17
45,Tropical Luxury Condo Private Jacuzzi WIFI DQ1201,852725916509464739,16
17,Dominique 1008,1315898353576844455,16
3,Piso 14/ Jacuzzi/ Vistas/ 2BR/Parqueadero/Pisc...,1279218191310817247,14
11,Luxury Apartment with a Private Jacuzzi -DMQ507,994392265836559354,14


In [354]:
from sklearn.impute import KNNImputer
import warnings

# Define a mask for rows with small gaps (<= 14 consecutive NaNs) and prepare for KNN imputation
small_gap_mask = df_timeseries[non_empty_dates].apply(max_nan_gap, axis=1) <= 14
imputer = KNNImputer(n_neighbors=3)
to_impute = df_timeseries.loc[small_gap_mask, non_empty_dates]
to_impute = to_impute.replace([np.inf, -np.inf], np.nan)

# Suppress warnings just during these computations
with warnings.catch_warnings():
    warnings.simplefilter("ignore", category=RuntimeWarning)
    imputed = imputer.fit_transform(to_impute)

# Apply the imputed values back to the DataFrame
df_interpolated = df_timeseries.copy()
df_interpolated.loc[small_gap_mask, non_empty_dates] = np.round(imputed, 2)

# Extrapolate up to 7-day gaps at the beginning/end of rows
df_interpolated[non_empty_dates] = df_interpolated[non_empty_dates].ffill(axis=1, limit=7).bfill(axis=1, limit=7)

df_interpolated.to_csv("clean_data/airbnb_time_series_interpolated.csv", index=False)
df_interpolated

Unnamed: 0,Name,ID,Base fee,Cleaning fee,latitude,longitude,Property type,Noches 05,Noches 06,Noches 07,...,23/7/2025,24/7/2025,25/7/2025,26/7/2025,27/7/2025,28/7/2025,29/7/2025,30/7/2025,31/7/2025,max_nan_gap
0,14th Floor/Jacuzzi balcony/SeaViews/Parking/King,1282890977555839180,$95.00,$38.00,10.456400,-75.505410,Entire serviced apartment,3,19,18,...,146.5,146.50,142.0,146.33,150.67,150.67,155.5,155.5,152.50,11
1,2 BR Modern APT jacuzzi / Rooftop near to beach,1255908021842197930,$100.00,$35.00,10.456549,-75.505265,Entire rental unit,10,31,30,...,117.0,117.00,117.0,116.17,117.17,121.00,112.5,117.0,117.00,4
2,Piso 12/ Jacuzzi/ Vistas/ 1BR/Parqueadero/Pisc...,1364376551860961934,$67.00,$36.00,10.456523,-75.505361,Entire serviced apartment,20,24,31,...,108.0,108.00,110.5,111.00,111.00,112.00,111.5,107.0,105.00,5
3,Piso 14/ Jacuzzi/ Vistas/ 2BR/Parqueadero/Pisc...,1279218191310817247,$89.00,$36.00,10.456398,-75.505406,Entire serviced apartment,4,21,31,...,117.5,117.50,117.5,117.50,117.50,117.50,117.5,117.5,121.50,14
4,Luxury /Jacuzzi/ Decoration/ Caribbean/ Pools/,1259780256818112667,$81.00,$36.00,10.456398,-75.505406,Entire serviced apartment,16,14,20,...,,,,,,,,,,20
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
82,Luxury Cartagena dominique 1209,982475113924685879,$96.00,$36.41,10.456388,-75.505411,Entire rental unit,13,26,31,...,168.5,168.50,163.5,163.50,169.50,169.50,178.0,178.0,161.50,0
83,Luxury cartagena dominique 705,1061083288618779943,$89.32,$36.32,10.456420,-75.505390,Entire rental unit,14,30,31,...,167.0,168.67,167.0,168.00,170.00,197.17,200.0,200.0,159.17,10
84,Luxury Cartagena Dominique 1004,1131261936147332933,$116.42,$36.35,10.456388,-75.505411,Entire rental unit,10,30,31,...,232.0,219.50,219.5,208.50,208.50,227.00,227.0,219.5,219.50,6
85,luxury Cartagena Dominique 807,1061118933427624758,$97.00,$36.00,10.456388,-75.505411,Entire rental unit,14,30,31,...,178.0,175.00,175.0,172.00,172.00,186.00,186.0,178.5,178.50,5


# Export Databases to SQL server

In [4]:
import pandas as pd
import sqlite3

# Load your cleaned datasets
df_attributes = pd.read_csv("clean_data/airbnb_attributes.csv")
df_time_raw = pd.read_csv("clean_data/airbnb_time_series.csv")
df_time_interp = pd.read_csv("clean_data/airbnb_time_series_interpolated.csv")

# Connect to SQLite database (create if doesn't exist)
conn = sqlite3.connect("airbnb_cartagena.sqlite")

# Export to SQL tables
df_attributes.to_sql("Attributes", conn, if_exists="replace", index=False)
df_time_raw.to_sql("TimeSeriesRaw", conn, if_exists="replace", index=False)
df_time_interp.to_sql("TimeSeriesInterpolated", conn, if_exists="replace", index=False)

# Close connection
conn.close()

In [6]:
# Check the tables in the SQLite database
conn = sqlite3.connect("airbnb_cartagena.sqlite")
cursor = conn.cursor()
cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")

for table in cursor.fetchall():
    print(table[0])

conn.close()

Attributes
TimeSeriesRaw
TimeSeriesInterpolated
