In [None]:
%pip install duckdb pandas numpy pyspainmobility requests

<h1 align="center"><b>Building a 3-Tier Data Lakehouse for Mobility Analysis in Spain</b></h1>
<h3 align="center"><b style="color:gray">Silver Layer</b></h3>
<h4 align="right">Joan Fernández Navarro & Borja Albert Gramaje</h4>
<h3><b>Table of Contents</b></h3>
<ul style = "list-style-type: none; line-height: 0.5em;">
    <li><a href="#holidays"><h5>1. Spanish Holidays (Open Holidyas API)</h5></a></li>
    <li><a href="#mitma"><h5>2. Spanish Ministry of Transport, Mobility and Urban Agenda (MITMA) Open Data</h5></a></li>
    <ul style = "list-style-type: none; line-height: 1em;">
        <li><a href="#od"><h5>1.1. Origin-destination (OD) trip matrices</h5></a></li>
        <li><a href="#people"><h5>1.2. People by day</h5></a></li>
        <li><a href="#overnight"><h5>1.3. Overnight stays</h5></a></li>
        <li><a href="#zones"><h5>1.4. Zones</h5></a></li>
        <li><a href="#zones"><h5>1.5. Centroids</h5></a></li>
    </ul>
    <li><a href="#ine"><h5>3. Spanish National Statistics Institute (INE)</h5></a></li>
    <ul style = "list-style-type: none; line-height: 1em;">
        <li><a href="#population"><h5>3.1. Population by municipio (Padrón)</h5></a></li>
        <li><a href="#income"><h5>3.2. Income by distrito</h5></a></li>
        <li><a href="#business"><h5>3.3. Business by municipio</h5></a></li>
    </ul>
    <li><a href="#extra"><h5>4. Extra tables</h5></a></li>
    </ul>
</ul>

In [1]:
import os
import duckdb
import requests
import pandas as pd
from pyspainmobility import Mobility, Zones

LAKE_LAYER = "silver"

con = duckdb.connect("./../../mobility.db")

def SQL(q):
    """Run SQL (printed for clarity) and return a DataFrame."""
    return con.execute(q).fetchdf()

print("DuckDB version:", con.sql("SELECT version();").fetchone()[0])

DuckDB version: v1.4.2


<h2 id="holidays"><b>1. Spanish Holidays (Open Holidyas API)</b></h2>

In [2]:
SQL("""
    INSTALL httpfs;
    LOAD httpfs;
""")

SQL("""
    -- La tabla temporal es visible solo para la sesión que la creó.
    CREATE OR REPLACE TEMP TABLE spanish_holidays AS
    WITH parsed_holidays AS (
        SELECT 
            json_extract(holiday, '$.startDate') AS date_str,
            CAST(json_extract(holiday, '$.nationwide') AS BOOLEAN) AS nationwide
        FROM read_json(
            'https://openholidaysapi.org/PublicHolidays?countryIsoCode=ES&languageIsoCode=ES&validFrom=2023-01-01&validTo=2023-12-31',
            format='array'
        ) AS t(holiday)
    )
    SELECT DISTINCT
        CAST(date_str AS DATE) AS date
    FROM parsed_holidays
    WHERE nationwide = TRUE;
""")

Unnamed: 0,Count
0,0


<h2 id="mitma"><b>1. Spanish Ministry of Transport, Mobility and Urban Agenda (MITMA) Open Data</b></h2>

<h2 id="od"><b>1.1. Origin-destination (OD) trip matrices</b></h2>

```mermaid
flowchart TD

    %% --------------------------
    %% Bronze Sources
    %% --------------------------
    B1[bronze_mitma_od_distritos]:::bronze
    B2[bronze_mitma_od_municipios]:::bronze
    B3[bronze_mitma_od_gau]:::bronze
    H[spanish_holidays]:::bronze

    %% --------------------------
    %% Transform blocks
    %% --------------------------
    subgraph T1[Transformations]
        direction TB
        C1[Parse fecha]
        C2[Convert periodo → hora]
        C3[Numerical Casting]
        C4[Boolean normalization]
        C5[Holidays Flags]
        C6[Add zone_level]
    end

    %% --------------------------
    %% Individual transform outputs
    %% --------------------------
    S1[silver_od_distritos]:::silver
    S2[silver_od_municipios]:::silver
    S3[silver_od_gau]:::silver

    %% --------------------------
    %% Final unified table
    %% --------------------------
    ALL[silver_od_all]:::target

    %% --------------------------
    %% Flows bronze → transform
    %% --------------------------
    B1 --> T1 --> S1
    B2 --> T1 --> S2
    B3 --> T1 --> S3
    H --> C5

    %% --------------------------
    %% UNION ALL final
    %% --------------------------
    S1 --> ALL
    S2 --> ALL
    S3 --> ALL

    %% --------------------------
    %% Styles
    %% --------------------------
    classDef bronze fill:#f2d7d5,stroke:#a93226,color:#000;
    classDef silver fill:#d6eaf8,stroke:#2e86c1,color:#000;
    classDef target fill:#d5f5e3,stroke:#1d8348,color:#000,font-weight:bold;

```

![Descripción de la imagen](./schemas/silver_od.png)

In [21]:
"""
Generates a unified silver_od_all table directly from all Bronze MITMA OD tables.
Adds:
  - Type casting
  - Weekend / holiday flags
  - NULL filtering for required fields
  - zone_level field
"""

SQL("""
CREATE OR REPLACE TABLE silver_od_all AS
WITH base AS (

    -------------------------------------------------------------------
    -- DISTRITOS
    -------------------------------------------------------------------
    SELECT
        'distritos' AS zone_level,
        strptime(CAST(fecha AS VARCHAR), '%Y%m%d')::DATE AS fecha,
        CAST(periodo AS SMALLINT) AS hora,

        origen AS origen_zone_id,
        destino AS destino_zone_id,

        CAST(REPLACE(viajes, '.', '') AS DOUBLE)    AS viajes,
        CAST(REPLACE(viajes_km, '.', '') AS DOUBLE) AS viajes_km,

        distancia,
        actividad_origen,
        actividad_destino,
        residencia,
        renta,
        edad,
        sexo,

        CASE WHEN estudio_destino_posible ILIKE 'si' THEN TRUE
             WHEN estudio_destino_posible ILIKE 'no' THEN FALSE END
             AS estudio_destino_posible,

        CASE WHEN estudio_origen_posible ILIKE 'si' THEN TRUE
             WHEN estudio_origen_posible ILIKE 'no' THEN FALSE END
             AS estudio_origen_posible

    FROM bronze_mitma_od_distritos


    UNION ALL

    -------------------------------------------------------------------
    -- MUNICIPIOS
    -------------------------------------------------------------------
    SELECT
        'municipios' AS zone_level,
        strptime(CAST(fecha AS VARCHAR), '%Y%m%d')::DATE AS fecha,
        CAST(periodo AS SMALLINT) AS hora,

        origen AS origen_zone_id,
        destino AS destino_zone_id,

        CAST(REPLACE(viajes, '.', '') AS DOUBLE),
        CAST(REPLACE(viajes_km, '.', '') AS DOUBLE),

        distancia,
        actividad_origen,
        actividad_destino,
        residencia,
        renta,
        edad,
        sexo,

        CASE WHEN estudio_destino_posible ILIKE 'si' THEN TRUE
             WHEN estudio_destino_posible ILIKE 'no' THEN FALSE END,
        CASE WHEN estudio_origen_posible ILIKE 'si' THEN TRUE
             WHEN estudio_origen_posible ILIKE 'no' THEN FALSE END

    FROM bronze_mitma_od_municipios


    UNION ALL

    -------------------------------------------------------------------
    -- GAU
    -------------------------------------------------------------------
    SELECT
        'gau' AS zone_level,
        strptime(CAST(fecha AS VARCHAR), '%Y%m%d')::DATE AS fecha,
        CAST(periodo AS SMALLINT) AS hora,

        origen AS origen_zone_id,
        destino AS destino_zone_id,

        CAST(REPLACE(viajes, '.', '') AS DOUBLE),
        CAST(REPLACE(viajes_km, '.', '') AS DOUBLE),

        distancia,
        actividad_origen,
        actividad_destino,
        residencia,
        renta,
        edad,
        sexo,

        CASE WHEN estudio_destino_posible ILIKE 'si' THEN TRUE
             WHEN estudio_destino_posible ILIKE 'no' THEN FALSE END,
        CASE WHEN estudio_origen_posible ILIKE 'si' THEN TRUE
             WHEN estudio_origen_posible ILIKE 'no' THEN FALSE END

    FROM bronze_mitma_od_gau
),

enriched AS (
    SELECT
        *,
        CASE WHEN dayofweek(fecha) IN (6,7) THEN TRUE ELSE FALSE END AS is_weekend,
        CASE WHEN fecha IN (SELECT date FROM spanish_holidays) THEN TRUE ELSE FALSE END AS is_holiday
    FROM base
),

filtered AS (
    SELECT *
    FROM enriched
    WHERE 
        -- Campos críticos: descartar registros incompletos
        fecha IS NOT NULL
        AND hora IS NOT NULL
        AND origen_zone_id IS NOT NULL
        AND destino_zone_id IS NOT NULL
        AND viajes IS NOT NULL
        AND viajes_km IS NOT NULL
        AND distancia IS NOT NULL
)

SELECT * FROM filtered;
""")

print("Created unified Silver table: silver_od_all")

Created unified Silver table: silver_od_all


In [22]:
SQL(f"""
    SELECT zone_level, COUNT(*)
    FROM silver_od_all
    GROUP BY zone_level;
""")

Unnamed: 0,zone_level,count_star()
0,municipios,34683529
1,gau,20826976
2,distritos,55862966


In [23]:
SQL(f"""
    SELECT *
    FROM silver_od_all 
    LIMIT 10;
""")

Unnamed: 0,zone_level,fecha,hora,origen_zone_id,destino_zone_id,viajes,viajes_km,distancia,actividad_origen,actividad_destino,residencia,renta,edad,sexo,estudio_destino_posible,estudio_origen_posible,is_weekend,is_holiday
0,distritos,2022-03-01,13,303102,303103,3251.0,10441.0,2-10,frecuente,frecuente,3,10-15,45-65,mujer,False,False,False,False
1,distritos,2022-03-01,13,303102,303103,7426.0,18648.0,2-10,frecuente,frecuente,3,10-15,65-100,mujer,False,False,False,False
2,distritos,2022-03-01,13,303102,303103,277.0,9342.0,2-10,frecuente,frecuente,3,10-15,,,False,False,False,False
3,distritos,2022-03-01,13,303102,303103,2832.0,7524.0,2-10,frecuente,frecuente,3,<10,45-65,hombre,False,False,False,False
4,distritos,2022-03-01,13,303102,303103,6704.0,1469.0,2-10,frecuente,frecuente,3,<10,,,False,False,False,False
5,distritos,2022-03-01,13,303102,303103,587.0,18789.0,2-10,frecuente,trabajo_estudio,3,10-15,25-45,,False,False,False,False
6,distritos,2022-03-01,13,303102,303103,405.0,9324.0,2-10,frecuente,trabajo_estudio,3,10-15,25-45,hombre,False,False,False,False
7,distritos,2022-03-01,13,303102,303103,8335.0,18487.0,2-10,frecuente,trabajo_estudio,3,10-15,,,False,False,False,False
8,distritos,2022-03-01,13,303102,303103,3024.0,70.0,2-10,frecuente,trabajo_estudio,3,<10,65-100,hombre,False,False,False,False
9,distritos,2022-03-01,13,303102,303103,459.0,11361.0,2-10,frecuente,trabajo_estudio,3,<10,,,False,False,False,False


In [24]:
SQL(f"""
    SELECT is_weekend, count(*) as total
    FROM silver_od_all 
    GROUP BY is_weekend;
""")

Unnamed: 0,is_weekend,total
0,False,111373471


<h2 id="people"><b>1.2. People by day</b></h2>

```mermaid
flowchart TD

    %% --------------------------
    %% Bronze Sources
    %% --------------------------
    B1[bronze_mitma_peple_day_distritos]:::bronze
    B2[bronze_mitma_peple_day_municipios]:::bronze
    B3[bronze_mitma_peple_day_gau]:::bronze

    %% --------------------------
    %% Transform blocks
    %% --------------------------
    subgraph T1[Transformations]
        direction TB
        C1[Parse fecha]
        C3[Numerical Casting]
        C6[Add zone_level]
    end

    %% --------------------------
    %% Individual transform outputs
    %% --------------------------
    S1[silver_peple_day_distritos]:::silver
    S2[silver_peple_day_municipios]:::silver
    S3[silver_peple_day_gau]:::silver

    %% --------------------------
    %% Final unified table
    %% --------------------------
    ALL[silver_peple_day_all]:::target

    %% --------------------------
    %% Flows bronze → transform
    %% --------------------------
    B1 --> T1 --> S1
    B2 --> T1 --> S2
    B3 --> T1 --> S3

    %% --------------------------
    %% UNION ALL final
    %% --------------------------
    S1 --> ALL
    S2 --> ALL
    S3 --> ALL

    %% --------------------------
    %% Styles
    %% --------------------------
    classDef bronze fill:#f2d7d5,stroke:#a93226,color:#000;
    classDef silver fill:#d6eaf8,stroke:#2e86c1,color:#000;
    classDef target fill:#d5f5e3,stroke:#1d8348,color:#000,font-weight:bold;

```

![Descripción de la imagen](./schemas/silver_people_day.png)

In [25]:
"""
Generates a unified silver_people_day_all table directly from all Bronze MITMA People Day tables.
Adds:
  - Type casting
  - zone_level field
  - Filtering of incomplete/null rows
"""

SQL("""
CREATE OR REPLACE TABLE silver_people_day_all AS
WITH base AS (

    -------------------------------------------------------------------
    -- DISTRITOS
    -------------------------------------------------------------------
    SELECT
        'distritos' AS zone_level,
        strptime(CAST(fecha AS VARCHAR), '%Y%m%d')::DATE AS fecha,
        zona_pernoctacion,
        edad,
        sexo,
        numero_viajes,
        CAST(REPLACE(personas, '.', '') AS DOUBLE) AS personas
    FROM bronze_mitma_people_day_distritos

    UNION ALL

    -------------------------------------------------------------------
    -- MUNICIPIOS
    -------------------------------------------------------------------
    SELECT
        'municipios' AS zone_level,
        strptime(CAST(fecha AS VARCHAR), '%Y%m%d')::DATE AS fecha,
        zona_pernoctacion,
        edad,
        sexo,
        numero_viajes,
        CAST(REPLACE(personas, '.', '') AS DOUBLE) AS personas
    FROM bronze_mitma_people_day_municipios

    UNION ALL

    -------------------------------------------------------------------
    -- GAU
    -------------------------------------------------------------------
    SELECT
        'gau' AS zone_level,
        strptime(CAST(fecha AS VARCHAR), '%Y%m%d')::DATE AS fecha,
        zona_pernoctacion,
        edad,
        sexo,
        numero_viajes,
        CAST(REPLACE(personas, '.', '') AS DOUBLE) AS personas
    FROM bronze_mitma_people_day_gau
),

filtered AS (
    SELECT *
    FROM base
    WHERE
        -- Required fields: avoid null or garbage rows
        fecha IS NOT NULL
        AND zona_pernoctacion IS NOT NULL
        AND edad IS NOT NULL
        AND sexo IS NOT NULL
        AND numero_viajes IS NOT NULL
        AND personas IS NOT NULL
)

SELECT * FROM filtered;
""")

print("Created unified Silver table: silver_people_day_all")


Created unified Silver table: silver_people_day_all


In [26]:
SQL(f"""
    SELECT zone_level, COUNT(*)
    FROM silver_people_day_all
    GROUP BY zone_level;
""")

Unnamed: 0,zone_level,count_star()
0,distritos,340448
1,municipios,231688
2,gau,182214


In [27]:
SQL(f"""
    SELECT *
    FROM silver_people_day_all 
    LIMIT 10;
""")

Unnamed: 0,zone_level,fecha,zona_pernoctacion,edad,sexo,numero_viajes,personas
0,distritos,2022-03-01,1001,0-25,hombre,0,125296.0
1,distritos,2022-03-01,1001,0-25,hombre,2,115378.0
2,distritos,2022-03-01,1001,0-25,hombre,2+,176630.0
3,distritos,2022-03-01,1001,0-25,mujer,0,125069.0
4,distritos,2022-03-01,1001,0-25,mujer,2,117712.0
5,distritos,2022-03-01,1001,0-25,mujer,2+,117712.0
6,distritos,2022-03-01,1001,25-45,hombre,0,111041.0
7,distritos,2022-03-01,1001,25-45,hombre,1,13820.0
8,distritos,2022-03-01,1001,25-45,hombre,2,94913.0
9,distritos,2022-03-01,1001,25-45,hombre,2+,147164.0


<h2 id="overnight"><b>1.3. Overnight stays</b></h2>

```mermaid
flowchart TD

    %% --------------------------
    %% Bronze Sources
    %% --------------------------
    B1[bronze_mitma_overnight_stay_distritos]:::bronze
    B2[bronze_mitma_overnight_stay_municipios]:::bronze
    B3[bronze_mitma_overnight_stay_gau]:::bronze

    %% --------------------------
    %% Transform blocks
    %% --------------------------
    subgraph T1[Transformations]
        direction TB
        C1[Parse fecha]
        C3[Numerical Casting]
        C6[Add zone_level]
    end

    %% --------------------------
    %% Individual transform outputs
    %% --------------------------
    S1[silver_overnight_stay_distritos]:::silver
    S2[silver_overnight_stay_municipios]:::silver
    S3[silver_overnight_stay_gau]:::silver

    %% --------------------------
    %% Final unified table
    %% --------------------------
    ALL[silver_overnight_stay_all]:::target

    %% --------------------------
    %% Flows bronze → transform
    %% --------------------------
    B1 --> T1 --> S1
    B2 --> T1 --> S2
    B3 --> T1 --> S3

    %% --------------------------
    %% UNION ALL final
    %% --------------------------
    S1 --> ALL
    S2 --> ALL
    S3 --> ALL

    %% --------------------------
    %% Styles
    %% --------------------------
    classDef bronze fill:#f2d7d5,stroke:#a93226,color:#000;
    classDef silver fill:#d6eaf8,stroke:#2e86c1,color:#000;
    classDef target fill:#d5f5e3,stroke:#1d8348,color:#000,font-weight:bold;

```

![Descripción de la imagen](./schemas/silver_overnight.png)

In [28]:
"""
Generates a unified silver_people_day_all table directly from all Bronze MITMA People Day tables.
Adds:
  - Type casting
  - zone_level field
  - Filtering of incomplete/null rows
"""

SQL("""
CREATE OR REPLACE TABLE silver_overnight_stay_all AS
WITH base AS (

    -------------------------------------------------------------------
    -- DISTRITOS
    -------------------------------------------------------------------
    SELECT
        'distritos' AS zone_level,
        strptime(CAST(fecha AS VARCHAR), '%Y%m%d')::DATE AS fecha,
        zona_pernoctacion,
        zona_residencia,
        CAST(REPLACE(personas, '.', '') AS DOUBLE) AS personas
    FROM bronze_mitma_overnight_stay_distritos

    UNION ALL

    -------------------------------------------------------------------
    -- MUNICIPIOS
    -------------------------------------------------------------------
    SELECT
        'municipios' AS zone_level,
        strptime(CAST(fecha AS VARCHAR), '%Y%m%d')::DATE AS fecha,
        zona_pernoctacion,
        zona_residencia,
        CAST(REPLACE(personas, '.', '') AS DOUBLE) AS personas
    FROM bronze_mitma_overnight_stay_municipios

    UNION ALL

    -------------------------------------------------------------------
    -- GAU
    -------------------------------------------------------------------
    SELECT
        'gau' AS zone_level,
        strptime(CAST(fecha AS VARCHAR), '%Y%m%d')::DATE AS fecha,
        zona_pernoctacion,
        zona_residencia,
        CAST(REPLACE(personas, '.', '') AS DOUBLE) AS personas
    FROM bronze_mitma_overnight_stay_gau
),

filtered AS (
    SELECT *
    FROM base
    WHERE
        -- Required fields: avoid null or garbage rows
        fecha IS NOT NULL
        AND zona_pernoctacion IS NOT NULL
        AND zona_residencia IS NOT NULL
        AND personas IS NOT NULL
)

SELECT * FROM filtered;
""")

print("Created unified Silver table: silver_overnight_stay_all")


Created unified Silver table: silver_overnight_stay_all


In [29]:
SQL(f"""
    SELECT zone_level, COUNT(*)
    FROM silver_overnight_stay_all
    GROUP BY zone_level;
""")

Unnamed: 0,zone_level,count_star()
0,distritos,947839
1,municipios,741266
2,gau,574703


In [32]:
SQL(f"""
    SELECT *
    FROM silver_overnight_stay_all 
    LIMIT 10;
""")

Unnamed: 0,zone_level,fecha,zona_pernoctacion,zona_residencia,personas
0,distritos,2022-03-01,01001,1001,2733784.0
1,distritos,2022-03-01,01004_AM,1001,2514.0
2,distritos,2022-03-01,01009_AM,1001,18431.0
3,distritos,2022-03-01,01017_AM,1001,2922.0
4,distritos,2022-03-01,01051,1001,7831.0
5,distritos,2022-03-01,01058_AM,1001,10600.0
6,distritos,2022-03-01,0105901,1001,10301.0
7,distritos,2022-03-01,0105902,1001,3243.0
8,distritos,2022-03-01,0105903,1001,2514.0
9,distritos,2022-03-01,0105904,1001,13588.0


<h2 id="zones"><b>1.4. Zones</b></h2>

```mermaid
flowchart TD

    %% --------------------------
    %% Bronze Sources
    %% --------------------------
    B1[bronze_mitma_zones_distritos]:::bronze
    B2[bronze_mitma_zones_municipios]:::bronze
    B3[bronze_mitma_zones_gau]:::bronze

    %% --------------------------
    %% Transform blocks
    %% --------------------------
    subgraph T1[Transformations]
        direction TB
        C1[Parse Geometry]
        C3[Numerical Casting]
        C6[Add zone_level]
        C5[Centroid calculation]
        C4[Filtering by zone type]
        C2[Link distrito-municipio]
    end

    %% --------------------------
    %% Individual transform outputs
    %% --------------------------
    S1[silver_zones_distritos]:::silver
    S2[silver_zones_municipios]:::silver
    S3[silver_zones_gau]:::silver

    %% --------------------------
    %% Final unified table
    %% --------------------------
    ALL[silver_zones_all]:::target

    %% --------------------------
    %% Flows bronze → transform
    %% --------------------------
    B1 --> T1 --Length is 7--> S1
    B2 --> T1 --Length is 5--> S2
    B3 --> T1 --Length is 8--> S3

    %% --------------------------
    %% UNION ALL final
    %% --------------------------
    S1 --> ALL
    S2 --> ALL
    S3 --> ALL

    %% --------------------------
    %% Styles
    %% --------------------------
    classDef bronze fill:#f2d7d5,stroke:#a93226,color:#000;
    classDef silver fill:#d6eaf8,stroke:#2e86c1,color:#000;
    classDef target fill:#d5f5e3,stroke:#1d8348,color:#000,font-weight:bold;

```

![Descripción de la imagen](./schemas/silver_zones.png)

In [36]:
SQL("""
    INSTALL spatial;
    LOAD spatial;
""") 

Unnamed: 0,Success


In [47]:
SQL("""
CREATE OR REPLACE TABLE silver_zones_all AS
WITH base AS (

    -------------------------------------------------------------------
    -- DISTRITOS
    -------------------------------------------------------------------
    SELECT
        id,
        name,
        'distritos' AS zone_level,
        CAST(REPLACE(population, '.', '') AS DOUBLE) AS population,
        ST_Multi(ST_GeomFromText(geometry)) AS geometry_obj,
        ST_Centroid(geometry_obj) AS centroid,
        ST_X(ST_Centroid(geometry_obj)) AS centroid_longitude,
        ST_Y(ST_Centroid(geometry_obj)) AS centroid_latitude,
        substring(id, 1, 5) AS municipio_id
    FROM bronze_mitma_distritos
    WHERE length(id) = 7

    UNION ALL

    -------------------------------------------------------------------
    -- MUNICIPIOS
    -------------------------------------------------------------------
    SELECT
        id,
        name,
        'municipios' AS zone_level,
        CAST(REPLACE(population, '.', '') AS DOUBLE) AS population,
        ST_Multi(ST_GeomFromText(geometry)) AS geometry_obj,
        ST_Centroid(geometry_obj) AS centroid,
        ST_X(ST_Centroid(geometry_obj)) AS centroid_longitude,
        ST_Y(ST_Centroid(geometry_obj)) AS centroid_latitude,
        id AS municipio_id
    FROM bronze_mitma_municipios
    WHERE length(id) = 5

    UNION ALL

    -------------------------------------------------------------------
    -- GAU
    -------------------------------------------------------------------
    SELECT
        id,
        name,
        'gau' AS zone_level,
        CAST(REPLACE(population, '.', '') AS DOUBLE) AS population,
        ST_Multi(ST_GeomFromText(geometry)) AS geometry_obj,
        ST_Centroid(geometry_obj) AS centroid,
        ST_X(ST_Centroid(geometry_obj)) AS centroid_longitude,
        ST_Y(ST_Centroid(geometry_obj)) AS centroid_latitude,
        NULL AS municipio_id
    FROM bronze_mitma_gau
    WHERE length(id) = 8
),

filtered AS (
    SELECT *
    FROM base
    WHERE
        -- Required fields: avoid null or garbage rows
        id IS NOT NULL
        AND name IS NOT NULL
        AND population IS NOT NULL
        AND geometry_obj IS NOT NULL
)

SELECT * FROM filtered;
""")

print("Created unified Silver table: silver_zones_all")


Created unified Silver table: silver_zones_all


In [48]:
SQL(f"""
    SELECT zone_level, COUNT(*)
    FROM silver_zones_all
    GROUP BY zone_level;
""")

Unnamed: 0,zone_level,count_star()
0,distritos,1565
1,municipios,1645
2,gau,911


In [50]:
SQL(f"""
    SELECT 
        * EXCLUDE (geometry_obj, centroid), 
        ST_AsGeoJSON(centroid) AS centroid,
        ST_AsGeoJSON(geometry_obj) AS geojson 
    FROM silver_zones_all 
    LIMIT 10;
""")

Unnamed: 0,id,name,zone_level,population,centroid_longitude,centroid_latitude,municipio_id,centroid,geojson
0,105901,Vitoria-Gasteiz distrito 01,distritos,386000.0,-2.674455,42.850278,1059,"{""type"":""Point"",""coordinates"":[-2.674454571789...","{""type"":""MultiPolygon"",""coordinates"":[[[[-2.66..."
1,105902,Vitoria-Gasteiz distrito 02,distritos,698070.0,-2.693003,42.841624,1059,"{""type"":""Point"",""coordinates"":[-2.693003453533...","{""type"":""MultiPolygon"",""coordinates"":[[[[-2.67..."
2,105903,Vitoria-Gasteiz distrito 03,distritos,541140.0,-2.655504,42.850168,1059,"{""type"":""Point"",""coordinates"":[-2.655504253524...","{""type"":""MultiPolygon"",""coordinates"":[[[[-2.66..."
3,105904,Vitoria-Gasteiz distrito 04,distritos,330420.0,-2.663306,42.835145,1059,"{""type"":""Point"",""coordinates"":[-2.663306112096...","{""type"":""MultiPolygon"",""coordinates"":[[[[-2.66..."
4,105905,Vitoria-Gasteiz distrito 05,distritos,524320.0,-2.689069,42.864926,1059,"{""type"":""Point"",""coordinates"":[-2.689068958198...","{""type"":""MultiPolygon"",""coordinates"":[[[[-2.69..."
5,105906,Vitoria-Gasteiz distrito 06,distritos,50980.0,-2.688233,42.850417,1059,"{""type"":""Point"",""coordinates"":[-2.688232690725...","{""type"":""MultiPolygon"",""coordinates"":[[[[-2.64..."
6,200301,Albacete distrito 01,distritos,259280.0,-1.844706,39.004829,2003,"{""type"":""Point"",""coordinates"":[-1.844706041907...","{""type"":""MultiPolygon"",""coordinates"":[[[[-1.84..."
7,200302,Albacete distrito 02,distritos,394870.0,-1.877033,39.003382,2003,"{""type"":""Point"",""coordinates"":[-1.877032752111...","{""type"":""MultiPolygon"",""coordinates"":[[[[-1.85..."
8,200303,Albacete distrito 03,distritos,201790.0,-1.871509,38.99521,2003,"{""type"":""Point"",""coordinates"":[-1.871509074076...","{""type"":""MultiPolygon"",""coordinates"":[[[[-1.87..."
9,200304,Albacete distrito 04,distritos,197070.0,-1.882018,38.982831,2003,"{""type"":""Point"",""coordinates"":[-1.882017883777...","{""type"":""MultiPolygon"",""coordinates"":[[[[-1.86..."


<h2 id="ine"><b>2. Spanish National Statistics Institute (INE)</b></h2>

In [121]:
SQL("""
    
    WITH unnested_population_data AS (
        SELECT
            nombre,
            UNNEST(
                CAST(json_extract(data, '$') AS JSON[])
            ) AS data_element
        FROM bronze_ine_padron_municipios
    ),
    parsed_population AS (
        SELECT
            split_part(nombre, '. ', 1) AS municipio_nombre,
            split_part(nombre, '. ', 2) AS indicador,
            CAST(data_element.Anyo AS INTEGER) AS periodo,
            CAST(REPLACE(data_element.Valor, '.', '') AS DOUBLE)/10 AS valor,
            CAST(data_element.Secreto AS BOOLEAN) AS secreto
        FROM unnested_population_data
        WHERE CAST(data_element.Secreto AS BOOLEAN) = FALSE
    ),
    population_data AS (
        PIVOT parsed_population
        ON indicador
        USING MAX(valor)
        GROUP BY municipio_nombre, periodo
        ORDER BY municipio_nombre ASC
    ),
    avg_capita_income AS (
        SELECT
            split_part(distrito, ' ', 1) AS distrito_id,
            substring(distrito from position(' ' in distrito) + 1) AS distrito_nombre,
            substring(distrito_id, 1, 5) AS municipio_id,
            anyo,
            AVG(CAST(REPLACE(total, '.', '') AS DOUBLE)) AS renta_capita
        FROM bronze_ine_renta_distritos
        WHERE indicador = 'Renta neta media por persona'
            AND distrito IS NOT NULL AND total != '.'
        GROUP BY distrito, anyo
    ),
    avg_household_income AS (
        SELECT
            split_part(distrito, ' ', 1) AS distrito_id,
            substring(distrito from position(' ' in distrito) + 1) AS distrito_nombre,
             substring(distrito_id, 1, 5) AS municipio_id,
            anyo,
            AVG(CAST(REPLACE(total, '.', '') AS DOUBLE)) AS renta_hogar
        FROM bronze_ine_renta_distritos
        WHERE indicador = 'Renta neta media por hogar'
            AND distrito IS NOT NULL AND total != '.'
        GROUP BY distrito, anyo
    ),
    business_municipio AS (
        SELECT
            split_part(municipio, ' ', 1) AS municipio_id,
            substring(municipio from position(' ' in municipio) + 1) AS municipio_nombre,
            anyo,
            AVG(CAST(REPLACE(total, '.', '') AS DOUBLE)) AS empresas
        FROM bronze_ine_empresas_municipios
        WHERE municipio IS NOT NULL AND total != '.'
        GROUP BY municipio, anyo
    )
    
    SELECT * FROM business_municipio WHERE anyo = 2023;
""")

Unnamed: 0,municipio_id,municipio_nombre,anyo,empresas
0,01013,Barrundia,2023,66.0
1,01017,Campezo/Kanpezu,2023,56.0
2,01022,Elciego,2023,66.0
3,01041,Navaridas,2023,19.0
4,01063,Zuia,2023,145.0
...,...,...,...,...
6870,50225,Ricla,2023,122.0
6871,50242,Sediles,2023,5.0
6872,50244,Sierra de Luna,2023,13.0
6873,50261,Torrellas,2023,16.0


<h2 id="extra"><b>4. Extra tables</b></h2>

<h2 id="extra"><b>4.1. Distance Matrix between municipalities</b></h2>

In [111]:
SQL("""
    CREATE OR REPLACE TABLE silver_distances AS
    SELECT
        o.id AS origen,
        d.id AS destino,
        ST_Distance_Sphere(o.centroid, d.centroid) / 1000.0 AS distancia_km
    FROM silver_zones_all AS o
    CROSS JOIN silver_zones_all AS d
    WHERE o.id != d.id
        AND o.zone_level = 'distritos' AND d.zone_level = 'distritos';
""")

Unnamed: 0,Count
0,2704380


In [112]:
SQL(f"""
    SELECT * 
    FROM silver_distances
    LIMIT 10;
""")

Unnamed: 0,origen,destino,distancia_km
0,1002,1001,55.649314
1,1010,1001,68.72756
2,1036,1001,62.181485
3,1043,1001,32.785814
4,1051,1001,13.814324
5,1059,1001,19.741849
6,1901,1001,34.229879
7,2003,1001,436.047625
8,2009,1001,470.523952
9,2025,1001,487.462737
