In [1]:
%pip install duckdb pandas numpy pyspainmobility requests

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.2.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


<h1 align="center"><b>Building a 3-Tier Data Lakehouse for Mobility Analysis in Spain</b></h1>
<h3 align="center"><b style="color:gray">Bronze Layer</b></h3>
<h4 align="right">Joan Fernández Navarro & Borja Albert Gramaje</h4>
<h3><b>Table of Contents</b></h3>
<ul style = "list-style-type: none; line-height: 0.5em;">
    <li><a href="#mitma"><h5>1. Spanish Ministry of Transport, Mobility and Urban Agenda
(MITMA) Open Data</h5></a></li>
    <ul style = "list-style-type: none; line-height: 1em;">
        <li><a href="#od"><h5>1.1. Origin-destination (OD) trip matrices</h5></a></li>
        <li><a href="#people"><h5>1.2. People by day</h5></a></li>
        <li><a href="#overnight"><h5>1.3. Overnight stays</h5></a></li>
        <li><a href="#zones"><h5>1.4. Zones</h5></a></li>
    </ul>
    <li><a href="#ine"><h5>2. Spanish National Statistics Institute (INE)</h5></a></li>
    <ul style = "list-style-type: none; line-height: 1em;">
        <li><a href="#population"><h5>2.1. Population by municipio (Padrón)</h5></a></li>
        <li><a href="#income"><h5>2.2. Income by distrito</h5></a></li>
        <li><a href="#business"><h5>2.3. Business by municipio</h5></a></li>
    </ul>
    </ul>
</ul>

In [2]:
import os
import duckdb
import requests
import pandas as pd
from pyspainmobility import Mobility, Zones

BASE_PATH = f"{os.getcwd()}/../../raw"
LAKE_LAYER = "bronze"

con = duckdb.connect("./../../mobility.db")

def SQL(q):
    """Run SQL (printed for clarity) and return a DataFrame."""
    return con.execute(q).fetchdf()

print("DuckDB version:", con.sql("SELECT version();").fetchone()[0])

DuckDB version: v1.4.2


<h2 id="mitma"><b>1. Spanish Ministry of Transport, Mobility and Urban Agenda (MITMA) Open Data</b></h2>

<h2 id="od"><b>1.1. Origin-destination (OD) trip matrices</b></h2>

```sql
-- Trip Matrices - distristos
CREATE TABLE bronze_mitma_od_districts (
    fecha TEXT,
    periodo TEXT,
    origen TEXT,
    destino TEXT,
    distancia TEXT,
    actividad_origen TEXT,
    actividad_destino TEXT,
    residencia TEXT,
    renta TEXT,
    edad TEXT,
    sexo TEXT,
    viajes TEXT,
    viajes_km TEXT,
    -- Columnas extras añadidas para auditoria. 
    loaded_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    source_file TEXT
);

-- Trip Matrices - municipalities
CREATE TABLE bronze_mitma_od_municipalities (
    fecha TEXT,
    periodo TEXT,
    origen TEXT,
    destino TEXT,
    distancia TEXT,
    actividad_origen TEXT,
    actividad_destino TEXT,
    residencia TEXT,
    renta TEXT,
    edad TEXT,
    sexo TEXT,
    viajes TEXT,
    viajes_km TEXT,
    -- Columnas extras añadidas para auditoria. 
    loaded_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    source_file TEXT
);

-- Trip Matrices - GAU
CREATE TABLE bronze_mitma_od_gau (
    fecha TEXT,
    periodo TEXT,
    origen TEXT,
    destino TEXT,
    distancia TEXT,
    actividad_origen TEXT,
    actividad_destino TEXT,
    residencia TEXT,
    renta TEXT,
    edad TEXT,
    sexo TEXT,
    viajes TEXT,
    viajes_km TEXT,
    -- Columnas extras añadidas para auditoria. 
    loaded_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    source_file TEXT
);
```

![Descripción de la imagen](./schemas/bronze_od.png)

In [3]:
def load_od_matrices(zone_type="districts", start_date="2022-03-01", end_date="2022-03-03"):
    """
    Downloads MITMA OD matrices (only if not already present) and loads them into DuckDB.

    Parameters
    ----------
    zone_type : str
        Zone level ("districts", "municipalities", etc.).
    start_date : str
        Start date (YYYY-MM-DD).
    end_date : str
        End date (YYYY-MM-DD).
    """

    dataset = "od"
    dataset_path = f"{BASE_PATH}/MITMA/{dataset}_{zone_type}"
    table_name = f"{LAKE_LAYER}_mitma_{dataset}_{zone_type}"

    # -------------------------------------------------------
    # 1. Ensure directory exists
    # -------------------------------------------------------
    os.makedirs(dataset_path, exist_ok=True)

    # -------------------------------------------------------
    # 2. Download data only if the directory is empty
    # -------------------------------------------------------
    print(f"Downloading MITMA OD matrices for zone type: {zone_type}...")
    mobility = Mobility(
        version=2,
        zones=zone_type,
        start_date=start_date,
        end_date=end_date,
        output_directory=str(dataset_path),
    )
    mobility.get_od_data(keep_activity=True)

    # -------------------------------------------------------
    # 3. Create the target DuckDB table
    # -------------------------------------------------------
    SQL(f"""
        CREATE TABLE IF NOT EXISTS {table_name}(
            fecha TEXT,
            periodo TEXT,
            origen TEXT,
            destino TEXT,
            distancia TEXT,
            actividad_origen TEXT,
            actividad_destino TEXT,
            residencia TEXT,
            renta TEXT,
            edad TEXT,
            sexo TEXT,
            viajes TEXT,
            viajes_km TEXT,
            estudio_destino_posible TEXT,
            estudio_origen_posible TEXT,
            loaded_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
            source_file TEXT
        );
    """)

    # -------------------------------------------------------
    # 4. Load the CSV files into the table
    # -------------------------------------------------------
    SQL(f"""
        INSERT INTO {table_name}
        SELECT
            fecha,
            periodo,
            origen,
            destino,
            distancia,
            actividad_origen,
            actividad_destino,
            residencia,
            renta,
            edad,
            sexo,
            viajes,
            viajes_km,
            estudio_destino_posible,
            estudio_origen_posible,
            CURRENT_TIMESTAMP AS loaded_at,
            filename AS source_file
        FROM read_csv(
            '{dataset_path}/*.csv.gz',
            filename = true,
            all_varchar = true
        );
    """)

    print(f"Data successfully loaded into table: {table_name}")

load_od_matrices(zone_type="distritos", start_date="2023-03-01", end_date="2023-03-15")
load_od_matrices(zone_type="municipios", start_date="2023-03-01", end_date="2023-03-15")
load_od_matrices(zone_type="gau", start_date="2023-03-01", end_date="2023-03-15")

Downloading MITMA OD matrices for zone type: distritos...
Downloading file from https://movilidad-opendata.mitma.es/estudios_basicos/por-distritos/viajes/ficheros-diarios/2023-03/20230301_Viajes_distritos.csv.gz
Downloading: https://movilidad-opendata.mitma.es/estudios_basicos/por-distritos/viajes/ficheros-diarios/2023-03/20230301_Viajes_distritos.csv.gz
Saved 183411650 bytes to C:\Users\Joan\OneDrive\Documentos\Estudios\MUCEIM\Big Data Engineering and Technologies\Project\project\notebooks\layers/../../raw/MITMA/od_distritos\20230301_Viajes_distritos_v2.csv.gz
Downloading file from https://movilidad-opendata.mitma.es/estudios_basicos/por-distritos/viajes/ficheros-diarios/2023-03/20230302_Viajes_distritos.csv.gz
Downloading: https://movilidad-opendata.mitma.es/estudios_basicos/por-distritos/viajes/ficheros-diarios/2023-03/20230302_Viajes_distritos.csv.gz
Saved 185935541 bytes to C:\Users\Joan\OneDrive\Documentos\Estudios\MUCEIM\Big Data Engineering and Technologies\Project\project\note

  0%|                                                                                           | 0/15 [00:00<?, ?it/s]

Processing file: C:\Users\Joan\OneDrive\Documentos\Estudios\MUCEIM\Big Data Engineering and Technologies\Project\project\notebooks\layers/../../raw/MITMA/od_distritos\20230301_Viajes_distritos_v2.csv.gz
Reading gzipped file...


  7%|█████▌                                                                             | 1/15 [01:34<22:07, 94.85s/it]

Processing file: C:\Users\Joan\OneDrive\Documentos\Estudios\MUCEIM\Big Data Engineering and Technologies\Project\project\notebooks\layers/../../raw/MITMA/od_distritos\20230302_Viajes_distritos_v2.csv.gz
Reading gzipped file...


 13%|███████████                                                                        | 2/15 [03:08<20:24, 94.16s/it]

Processing file: C:\Users\Joan\OneDrive\Documentos\Estudios\MUCEIM\Big Data Engineering and Technologies\Project\project\notebooks\layers/../../raw/MITMA/od_distritos\20230303_Viajes_distritos_v2.csv.gz
Reading gzipped file...


 20%|████████████████▌                                                                  | 3/15 [04:43<18:54, 94.51s/it]

Processing file: C:\Users\Joan\OneDrive\Documentos\Estudios\MUCEIM\Big Data Engineering and Technologies\Project\project\notebooks\layers/../../raw/MITMA/od_distritos\20230304_Viajes_distritos_v2.csv.gz
Reading gzipped file...


 27%|██████████████████████▏                                                            | 4/15 [05:56<15:45, 85.94s/it]

Processing file: C:\Users\Joan\OneDrive\Documentos\Estudios\MUCEIM\Big Data Engineering and Technologies\Project\project\notebooks\layers/../../raw/MITMA/od_distritos\20230305_Viajes_distritos_v2.csv.gz
Reading gzipped file...


 33%|███████████████████████████▋                                                       | 5/15 [06:51<12:27, 74.78s/it]

Processing file: C:\Users\Joan\OneDrive\Documentos\Estudios\MUCEIM\Big Data Engineering and Technologies\Project\project\notebooks\layers/../../raw/MITMA/od_distritos\20230306_Viajes_distritos_v2.csv.gz
Reading gzipped file...


 40%|█████████████████████████████████▏                                                 | 6/15 [08:06<11:15, 75.06s/it]

Processing file: C:\Users\Joan\OneDrive\Documentos\Estudios\MUCEIM\Big Data Engineering and Technologies\Project\project\notebooks\layers/../../raw/MITMA/od_distritos\20230307_Viajes_distritos_v2.csv.gz
Reading gzipped file...


 47%|██████████████████████████████████████▋                                            | 7/15 [09:22<10:01, 75.23s/it]

Processing file: C:\Users\Joan\OneDrive\Documentos\Estudios\MUCEIM\Big Data Engineering and Technologies\Project\project\notebooks\layers/../../raw/MITMA/od_distritos\20230308_Viajes_distritos_v2.csv.gz
Reading gzipped file...


 53%|████████████████████████████████████████████▎                                      | 8/15 [10:39<08:51, 75.90s/it]

Processing file: C:\Users\Joan\OneDrive\Documentos\Estudios\MUCEIM\Big Data Engineering and Technologies\Project\project\notebooks\layers/../../raw/MITMA/od_distritos\20230309_Viajes_distritos_v2.csv.gz
Reading gzipped file...


 60%|█████████████████████████████████████████████████▊                                 | 9/15 [12:00<07:43, 77.33s/it]

Processing file: C:\Users\Joan\OneDrive\Documentos\Estudios\MUCEIM\Big Data Engineering and Technologies\Project\project\notebooks\layers/../../raw/MITMA/od_distritos\20230310_Viajes_distritos_v2.csv.gz
Reading gzipped file...


 67%|██████████████████████████████████████████████████████▋                           | 10/15 [13:20<06:31, 78.28s/it]

Processing file: C:\Users\Joan\OneDrive\Documentos\Estudios\MUCEIM\Big Data Engineering and Technologies\Project\project\notebooks\layers/../../raw/MITMA/od_distritos\20230311_Viajes_distritos_v2.csv.gz
Reading gzipped file...


 73%|████████████████████████████████████████████████████████████▏                     | 11/15 [14:27<04:58, 74.70s/it]

Processing file: C:\Users\Joan\OneDrive\Documentos\Estudios\MUCEIM\Big Data Engineering and Technologies\Project\project\notebooks\layers/../../raw/MITMA/od_distritos\20230312_Viajes_distritos_v2.csv.gz
Reading gzipped file...


 80%|█████████████████████████████████████████████████████████████████▌                | 12/15 [15:26<03:29, 69.92s/it]

Processing file: C:\Users\Joan\OneDrive\Documentos\Estudios\MUCEIM\Big Data Engineering and Technologies\Project\project\notebooks\layers/../../raw/MITMA/od_distritos\20230313_Viajes_distritos_v2.csv.gz
Reading gzipped file...


 87%|███████████████████████████████████████████████████████████████████████           | 13/15 [16:43<02:24, 72.13s/it]

Processing file: C:\Users\Joan\OneDrive\Documentos\Estudios\MUCEIM\Big Data Engineering and Technologies\Project\project\notebooks\layers/../../raw/MITMA/od_distritos\20230314_Viajes_distritos_v2.csv.gz
Reading gzipped file...


 93%|████████████████████████████████████████████████████████████████████████████▌     | 14/15 [17:57<01:12, 72.64s/it]

Processing file: C:\Users\Joan\OneDrive\Documentos\Estudios\MUCEIM\Big Data Engineering and Technologies\Project\project\notebooks\layers/../../raw/MITMA/od_distritos\20230315_Viajes_distritos_v2.csv.gz
Reading gzipped file...


100%|██████████████████████████████████████████████████████████████████████████████████| 15/15 [19:14<00:00, 76.95s/it]


Concatenating all the dataframes....
Writing the parquet file....


ArrowMemoryError: realloc of size 1073741824 failed

In [None]:
SQL(f"""
    (SELECT '{LAKE_LAYER}_mitma_od_distritos' as name, count(*) 
    FROM {LAKE_LAYER}_mitma_od_distritos)
        UNION
    (SELECT '{LAKE_LAYER}_mitma_od_municipios' as name, count(*) 
    FROM {LAKE_LAYER}_mitma_od_municipios)
        UNION
    (SELECT '{LAKE_LAYER}_mitma_od_gau' as name, count(*) 
    FROM {LAKE_LAYER}_mitma_od_gau);
""")


In [None]:
SQL(f"""
    SELECT * 
    FROM {LAKE_LAYER}_mitma_od_distritos 
    LIMIT 5;
""")

<h2 id="people"><b>1.2. People by day</b></h2>

```sql
-- Distritos
CREATE TABLE IF NOT EXISTS bronze_mitma_people_day_districts (
  fecha TEXT,
  zona_pernoctacion TEXT,
  edad TEXT,
  sexo TEXT,
  numero_viajes TEXT,   -- 0,1,2,2+ (mantener TEXT)
  personas TEXT,
  -- Columnas extras añadidas para auditoria. 
  loaded_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
  source_file TEXT
);

-- Municipios
CREATE TABLE IF NOT EXISTS bronze_mitma_people_day_municipalities (
  fecha TEXT,
  zona_pernoctacion TEXT,
  edad TEXT,
  sexo TEXT,
  numero_viajes TEXT,
  personas TEXT,
  -- Columnas extras añadidas para auditoria. 
  loaded_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
  source_file TEXT
);

-- GAU
CREATE TABLE IF NOT EXISTS bronze_mitma_people_day_gau (
  fecha TEXT,
  zona_pernoctacion TEXT,
  edad TEXT,
  sexo TEXT,
  numero_viajes TEXT,
  personas TEXT,
  -- Columnas extras añadidas para auditoria. 
  loaded_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
  source_file TEXT
);
```

![Descripción de la imagen](./schemas/bronze_people_day.png)

In [None]:
def load_people_day(zone_type="districts", start_date="2022-03-01", end_date="2022-03-03"):
    """
    Downloads MITMA 'people_day' data (only if not already present)
    and loads it into DuckDB.

    Parameters
    ----------
    zone_type : str
        Zone level (“districts”, “municipalities”, etc.).
    start_date : str
        Start date (YYYY-MM-DD).
    end_date : str
        End date (YYYY-MM-DD).
    """

    dataset = "people_day"
    dataset_path = f"{BASE_PATH}/MITMA/{dataset}_{zone_type}"
    table_name = f"{LAKE_LAYER}_mitma_{dataset}_{zone_type}"

    # -------------------------------------------------------
    # 1. Ensure directory exists
    # -------------------------------------------------------
    os.makedirs(dataset_path, exist_ok=True)

    # -------------------------------------------------------
    # 2. Download data only if directory is empty
    # -------------------------------------------------------
    print(f"Downloading MITMA 'people_day' dataset for: {zone_type}...")
    mobility = Mobility(
        version=2,
        zones=zone_type,
        start_date=start_date,
        end_date=end_date,
        output_directory=str(dataset_path),
    )
    mobility.get_number_of_trips_data()

    # -------------------------------------------------------
    # 3. Create DuckDB table
    # -------------------------------------------------------
    SQL(f"""
        CREATE TABLE {table_name}(
            fecha TEXT,
            zona_pernoctacion TEXT,
            edad TEXT,
            sexo TEXT,
            numero_viajes TEXT,
            personas TEXT,
            loaded_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
            source_file TEXT
        );
    """)

    # -------------------------------------------------------
    # 4. Load data from CSV into DuckDB
    # -------------------------------------------------------
    SQL(f"""
        INSERT INTO {table_name}
        SELECT
            fecha,
            zona_pernoctacion,
            edad,
            sexo,
            numero_viajes,
            personas,
            CURRENT_TIMESTAMP AS loaded_at,
            filename AS source_file
        FROM read_csv(
            '{dataset_path}/*.csv.gz',
            filename = true,
            all_varchar = true
        );
    """)

    print(f"Data successfully loaded into table: {table_name}")


load_people_day(zone_type="distritos", start_date="2023-03-01", end_date="2023-03-15")
load_people_day(zone_type="municipios", start_date="2023-03-01", end_date="2023-03-15")
load_people_day(zone_type="gau", start_date="2023-03-01", end_date="2023-03-15")

In [None]:
SQL(f"""
    (SELECT '{LAKE_LAYER}_mitma_people_day_distritos' as name, count(*) 
    FROM {LAKE_LAYER}_mitma_people_day_distritos)
        UNION
    (SELECT '{LAKE_LAYER}_mitma_people_day_municipios' as name, count(*) 
    FROM {LAKE_LAYER}_mitma_people_day_municipios)
        UNION
    (SELECT '{LAKE_LAYER}_mitma_people_day_gau' as name, count(*) 
    FROM {LAKE_LAYER}_mitma_people_day_gau);
""")

In [None]:
SQL(f"""
    SELECT * 
    FROM {LAKE_LAYER}_mitma_people_day_distritos 
    LIMIT 5;
""")

<h2 id="overnight"><b>1.3. Overnight stays</b></h2>

```sql
-- Distritos
CREATE TABLE IF NOT EXISTS bronze_mitma_overnight_stay_districts (
  fecha TEXT,
  zona_residencia TEXT,
  zona_pernoctacion TEXT,
  personas TEXT,
  -- Columnas extras añadidas para auditoria. 
  loaded_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
  source_file TEXT
);

-- Municipios
CREATE TABLE IF NOT EXISTS bronze_mitma_overnight_stay_municipalities (
  fecha TEXT,
  zona_residencia TEXT,
  zona_pernoctacion TEXT,
  personas TEXT,
  -- Columnas extras añadidas para auditoria. 
  loaded_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
  source_file TEXT
);

-- GAU
CREATE TABLE IF NOT EXISTS bronze_mitma_overnight_stay_gau (
  fecha TEXT,
  zona_residencia TEXT,
  zona_pernoctacion TEXT,
  personas TEXT,
  -- Columnas extras añadidas para auditoria. 
  loaded_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
  source_file TEXT
);
```

![Descripción de la imagen](./schemas/bronze_overnight.png)

In [None]:
def load_overnight_stay(zone_type="districts", start_date="2022-03-01", end_date="2022-03-03"):
    """
    Downloads MITMA overnight-stay data (only if missing) and loads it into DuckDB.

    Parameters
    ----------
    zone_type : str
        Zone level (“districts”, “municipalities”, etc.).
    start_date : str
        Start date (YYYY-MM-DD).
    end_date : str
        End date (YYYY-MM-DD).
    """

    dataset = "overnight_stay"
    dataset_path = f"{BASE_PATH}/MITMA/{dataset}_{zone_type}"
    table_name = f"{LAKE_LAYER}_mitma_{dataset}_{zone_type}"

    # -------------------------------------------------------
    # 1. Ensure directory exists
    # -------------------------------------------------------
    os.makedirs(dataset_path, exist_ok=True)

    # -------------------------------------------------------
    # 2. Download data only if directory is empty
    # -------------------------------------------------------
    print(f"Downloading MITMA overnight-stay data for zone type: {zone_type}...")
    mobility = Mobility(
        version=2,
        zones=zone_type,
        start_date=start_date,
        end_date=end_date,
        output_directory=str(dataset_path),
    )
    mobility.get_overnight_stays_data()

    # -------------------------------------------------------
    # 3. Create table in DuckDB
    # -------------------------------------------------------
    SQL(f"""
        CREATE TABLE {table_name}(
            fecha TEXT,
            zona_residencia TEXT,
            zona_pernoctacion TEXT,
            personas TEXT,
            loaded_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
            source_file TEXT
        );
    """)

    # -------------------------------------------------------
    # 4. Insert CSV data
    # -------------------------------------------------------
    SQL(f"""
        INSERT INTO {table_name}
        SELECT
            fecha,
            zona_residencia,
            zona_pernoctacion,
            personas,
            CURRENT_TIMESTAMP AS loaded_at,
            filename AS source_file
        FROM read_csv(
            '{dataset_path}/*.csv.gz',
            filename = true,
            all_varchar = true
        );
    """)

    print(f"Overnight-stay data successfully loaded into table: {table_name}")


load_overnight_stay(zone_type="distritos", start_date="2023-03-01", end_date="2023-03-15")
load_overnight_stay(zone_type="municipios", start_date="2023-03-01", end_date="2023-03-15")
load_overnight_stay(zone_type="gau", start_date="2023-03-01", end_date="2023-03-15")

In [None]:
SQL(f"""
    (SELECT '{LAKE_LAYER}_mitma_overnight_stay_distritos' as name, count(*) 
    FROM {LAKE_LAYER}_mitma_overnight_stay_distritos)
        UNION
    (SELECT '{LAKE_LAYER}_mitma_overnight_stay_municipios' as name, count(*) 
    FROM {LAKE_LAYER}_mitma_overnight_stay_municipios)
        UNION
    (SELECT '{LAKE_LAYER}_mitma_overnight_stay_gau' as name, count(*) 
    FROM {LAKE_LAYER}_mitma_overnight_stay_gau);
""")

In [None]:
SQL(f"""
    SELECT * 
    FROM {LAKE_LAYER}_mitma_overnight_stay_distritos 
    LIMIT 5;
""")

<h2 id="zones"><b>1.4. Zones</b></h2>

```sql
-- Distritos
CREATE TABLE IF NOT EXISTS bronze_mitma_districts (
  id TEXT,
  name TEXT,
  population TEXT,
  geometry TEXT,
  -- Columnas extras añadidas para auditoria. 
  loaded_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
  source_file TEXT
);

-- Municipios
CREATE TABLE IF NOT EXISTS bronze_mitma_municipalities (
  id TEXT,
  name TEXT,
  population TEXT,
  geometry TEXT,
  -- Columnas extras añadidas para auditoria. 
  loaded_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
  source_file TEXT
);

-- GAU
CREATE TABLE IF NOT EXISTS bronze_mitma_gau (
  id TEXT,
  name TEXT,
  population TEXT,
  geometry TEXT,
  -- Columnas extras añadidas para auditoria. 
  loaded_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
  source_file TEXT
);

```

![Descripción de la imagen](./schemas/bronze_zones.png)

In [None]:
def load_zones(zone_type="districts"):
    """
    Downloads MITMA zone definitions (only if missing), stores them as compressed CSV,
    and loads them into DuckDB.

    Parameters
    ----------
    zone_type : str
        Zone level (“districts”, “municipalities”, “gau”, etc.).
    """

    dataset = "zones"
    dataset_path = f"{BASE_PATH}/MITMA/{zone_type}"
    table_name = f"{LAKE_LAYER}_mitma_{zone_type}"

    # -------------------------------------------------------
    # 1. Ensure directory exists
    # -------------------------------------------------------
    os.makedirs(dataset_path, exist_ok=True)

    csv_path = dataset_path + "/zones.csv.gz"

    # -------------------------------------------------------
    # 2. Download and save zones CSV if not present
    # -------------------------------------------------------
    if not os.path.isfile(dataset_path):
        print(f"Downloading MITMA zone definitions for zone type: {zone_type}...")

        zones = Zones(
            version=2,
            zones=zone_type,
            output_directory=str(dataset_path),
        )

        df = zones.get_zone_geodataframe()

        if df is None:
            raise ValueError("Zones.get_zone_geodataframe() returned None")

        # Save geodataframe as compressed CSV
        df.to_csv(csv_path, index=True, compression="gzip")
        print("Zones saved:", csv_path)
    
    else:
        print("Zone definition file already exists. Skipping download.")

    # -------------------------------------------------------
    # 3. Create DuckDB table
    # -------------------------------------------------------
    SQL(f"""
        CREATE TABLE {table_name}(
            id TEXT,
            name TEXT,
            population TEXT,
            geometry TEXT,   -- stored as plain text in BRONZE layer
            loaded_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
            source_file TEXT
        );
    """)

    # -------------------------------------------------------
    # 4. Load CSV into DuckDB
    # -------------------------------------------------------
    SQL(f"""
        INSERT INTO {table_name}
        SELECT
            id,
            name,
            population,
            geometry,
            CURRENT_TIMESTAMP AS loaded_at,
            filename AS source_file
        FROM read_csv(
            '{dataset_path}/*.csv.gz',
            filename = true,
            all_varchar = true
        );
    """)

    print(f"Zone data successfully loaded into table: {table_name}")



load_zones(zone_type="distritos")
load_zones(zone_type="municipios")
load_zones(zone_type="gau")

In [None]:
SQL(f"""
    (SELECT '{LAKE_LAYER}_mitma_distritos' as name, count(*) 
    FROM {LAKE_LAYER}_mitma_distritos)
        UNION
    (SELECT '{LAKE_LAYER}_mitma_municipios' as name, count(*) 
    FROM {LAKE_LAYER}_mitma_municipios)
        UNION
    (SELECT '{LAKE_LAYER}_mitma_gau' as name, count(*)
    FROM {LAKE_LAYER}_mitma_gau);
""")

In [None]:
SQL(f"""
    SELECT * 
    FROM {LAKE_LAYER}_mitma_od_distritos 
    LIMIT 5;
""")

<h2 id="ine"><b>2. Spanish National Statistics Institute (INE)</b></h2>

<h2 id="population"><b>2.1. Population by Municipio (Padrón)</b></h2>

```sql
-- Distritos
CREATE TABLE IF NOT EXISTS bronze_ine_padron_municipios (
  cod        VARCHAR,
  nombre     VARCHAR,
  fk_unidad  INTEGER,
  fk_escala  INTEGER,
  data_txt   TEXT,
  data       JSON,
  -- Columnas extras añadidas para auditoria. 
  loaded_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
  -- loaded_by TEXT DEFAULT CURRENT_USER,
  source_file TEXT
);
```

![Descripción de la imagen](./schemas/bronze_ine.png)

In [None]:
def get_padron_by_municipio(year: int):
    """
    Fetches Spanish municipal population (Padrón) data from INE's WS Tempus API.

    Parameters
    ----------
    year : int
        Year for which the Padrón data is requested.

    Returns
    -------
    pandas.DataFrame
        Normalized JSON response as DataFrame.
    """

    url = f"https://servicios.ine.es/wstempus/js/ES/DATOS_TABLA/29005?date={year}0101:{year}1231"

    response = requests.get(url, timeout=120)
    response.raise_for_status()

    data = response.json()
    df = pd.json_normalize(data)
    return df


def load_padron(year=2023):
    """
    Downloads INE municipal Padrón data (only if missing) and loads it into DuckDB.

    Parameters
    ----------
    year : int
        Year of the Padrón dataset to retrieve.
    """

    dataset = "padron_municipios"
    dataset_path = f"{BASE_PATH}/INE/{dataset}"
    table_name = f"{LAKE_LAYER}_ine_{dataset}"

    filename = f"padron_municipios_{year}.csv.gz"
    file_path = dataset_path + "/" + filename

    # -------------------------------------------------------
    # 1. Ensure directory exists
    # -------------------------------------------------------
    os.makedirs(dataset_path, exist_ok=True)

    # -------------------------------------------------------
    # 2. Download and save zones CSV if not present
    # -------------------------------------------------------
    if not os.path.isfile(dataset_path):
        print(f"Downloading INE Padrón municipal data for year {year}...")

        df = get_padron_by_municipio(year)

        if df is not None and not df.empty:
            df.to_csv(file_path, index=False, compression="gzip")
        else:
            raise ValueError("get_padron_by_municipio() returned an empty or null dataframe")
    else:
        print("File already exists. Skipping download.")

    # -------------------------------------------------------
    # 3. Create DuckDB table
    # -------------------------------------------------------
    SQL(f"""
        CREATE TABLE {table_name}(
            cod        VARCHAR,
            nombre     VARCHAR,
            fk_unidad  INTEGER,
            fk_escala  INTEGER,
            data_txt   TEXT,
            data       JSON,
            loaded_at  TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
            source_file TEXT
        );
    """)

    # -------------------------------------------------------
    # 4. Insert CSV data into DuckDB
    # -------------------------------------------------------
    SQL(f"""
        INSERT INTO {table_name}
        SELECT
            COD        AS cod,
            Nombre     AS nombre,
            FK_Unidad::INTEGER AS fk_unidad,
            FK_Escala::INTEGER AS fk_escala,
            REGEXP_REPLACE(
                REGEXP_REPLACE(
                    REGEXP_REPLACE(
                        REPLACE(data, '''', '"'),
                        '\\bTrue\\b', 'true'
                    ),
                    '\\bFalse\\b', 'false'
                ),
                '\\bNone\\b', 'null'
            ) AS data_txt,
            CAST(data_txt AS JSON) AS data,
            CURRENT_TIMESTAMP AS loaded_at,
            filename AS source_file
        FROM read_csv(
            '{file_path}',
            filename = true,
            all_varchar = true
        );
    """)

    print(f"INE Padrón data for year {year} successfully loaded into table: {table_name}")


load_padron(2023)

In [None]:
SQL("""
    SELECT * 
    FROM bronze_ine_padron_municipios 
    LIMIT 5;
""")

<h2 id="income"><b>2.2. Income by District</b></h2>

```sql
-- Distritos
CREATE TABLE IF NOT EXISTS bronze_ine_renta_distritos (
  municipio      VARCHAR,
  distrito       VARCHAR,
  seccion        VARCHAR,
  indicador      VARCHAR,
  anyo           INTEGER,
  total          VARCHAR,
  -- Columnas extras añadidas para auditoria. 
  loaded_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
  source_file TEXT
);
```

![Descripción de la imagen](./schemas/bronze_income.png)

In [None]:
def load_income():
    """
    Loads the income data into DuckDB.
    """

    dataset = "renta_distritos"
    dataset_path = f"{BASE_PATH}/INE/{dataset}.csv"
    table_name = f"{LAKE_LAYER}_ine_{dataset}"

 # -------------------------------------------------------
    # 1. Ensure CSV is downloaded
    # -------------------------------------------------------
    if not os.path.isfile(dataset_path):
        print("'renta_distritos' data is not available.")
    else:
        print("Loading data...")

    # -------------------------------------------------------
    # 2. Create DuckDB table
    # -------------------------------------------------------
    SQL(f"""
        CREATE TABLE IF NOT EXISTS {table_name} (
            municipio      VARCHAR,
            distrito       VARCHAR,
            seccion        VARCHAR,
            indicador      VARCHAR,
            anyo           INTEGER,
            total          VARCHAR,
            loaded_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
            source_file TEXT
        );
    """)

    # -------------------------------------------------------
    # 3. Insert CSV data into DuckDB
    # -------------------------------------------------------
    SQL(f"""
        INSERT INTO {table_name}
        SELECT
            Municipios                      AS municipio,
            Distritos                       AS distrito,
            Secciones                       AS seccion,
            Indicador                       AS indicador,
            Periodo::INTEGER                AS anyo,
            Total                           AS total,
            CURRENT_TIMESTAMP AS loaded_at,
            filename AS source_file
        FROM read_csv(
            '{dataset_path}',
            filename = true,
            all_varchar = true
        );
    """)

    print(f"INE INCOME data successfully loaded into table: {table_name}")


load_income()

In [None]:
SQL("""
    SELECT 'bronze_ine_renta_distritos' as name, COUNT(*)
    FROM bronze_ine_renta_distritos;
""")

In [None]:
SQL("""
    SELECT * 
    FROM bronze_ine_renta_distritos 
    LIMIT 5;
""")

<h2 id="business"><b>2.3. Business by municipio</b></h2>

```sql
-- Municipios
CREATE TABLE IF NOT EXISTS bronze_ine_empresas_municipios (
  municipio      VARCHAR,
  anyo           INTEGER,
  total          VARCHAR,
  -- Columnas extras añadidas para auditoria. 
  loaded_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
  source_file TEXT
);
```

![Descripción de la imagen](./schemas/bronze_business.png)

In [None]:
def load_business():
    """
    Loads the number of business data into DuckDB.
    """

    dataset = "empresas_municipios"
    dataset_path = f"{BASE_PATH}/INE/{dataset}.csv"
    table_name = f"{LAKE_LAYER}_ine_{dataset}"

 # -------------------------------------------------------
    # 1. Ensure CSV is downloaded
    # -------------------------------------------------------
    if not os.path.isfile(dataset_path):
        print("'empresas_municipios' data is not available.")
    else:
        print("Loading data...")

    # -------------------------------------------------------
    # 2. Create DuckDB table
    # -------------------------------------------------------
    SQL(f"""
        CREATE TABLE IF NOT EXISTS {table_name} (
            municipio      VARCHAR,
            anyo           INTEGER,
            total          VARCHAR,
            loaded_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
            source_file TEXT
        );
    """)

    # -------------------------------------------------------
    # 3. Insert CSV data into DuckDB
    # -------------------------------------------------------
    SQL(f"""
        INSERT INTO {table_name}
        SELECT
            Municipios                      AS municipio,
            Periodo::INTEGER                AS anyo,
            Total                           AS total,
            CURRENT_TIMESTAMP AS loaded_at,
            filename AS source_file
        FROM read_csv(
            '{dataset_path}',
            delim=';',
            header=True,
            encoding='latin-1',
            filename=true,
            all_varchar=true
        );
    """)

    print(f"INE INCOME data successfully loaded into table: {table_name}")


load_business()

In [None]:
SQL("""
    SELECT 'bronze_ine_empresas_municipios' as name, COUNT(*)
    FROM bronze_ine_empresas_municipios;
""")

In [None]:
SQL("""
    SELECT * 
    FROM bronze_ine_empresas_municipios 
    LIMIT 5;
""")