In [None]:
%pip install duckdb pandas numpy pyspainmobility requests

In [1]:
import duckdb
import pandas

BASE_PATH = '../../raw'
LAKE_LAYER = 'silver'

con = duckdb.connect(
    database='../../mobility.db',
    config={'allow_unsigned_extensions': 'true'}
)


def SQL(q):
    """Run SQL (printed for clarity) and return a DataFrame."""
    return con.execute(q).fetchdf()


print("DuckDB version:", con.sql(
    "SELECT version();").fetchone()[0])  # type: ignore

DuckDB version: v1.4.2


```sql
-- viajes distristos
-- se castea en vez de VARCHAR de 'si/no' a un booleano de true o false
CREATE TABLE silver_mitma_od_districts (
    fecha DATE, -- de TEXT a DATE
    hora SMALLINT,
    origen TEXT,
    destino TEXT,
    distancia TEXT,
    actividad_origen TEXT,
    actividad_destino TEXT,
    residencia TEXT,
    renta TEXT,
    edad TEXT,
    sexo TEXT,
    viajes DOUBLE,
    viajes_km DOUBLE,
    estudio_destino_posible BOOLEAN,  -- de VARCHAR a BOOLEAN
    estudio_origen_posible BOOLEAN,   -- de VARCHAR a BOOLEAN
);

-- Viajes municipios
-- silver_mitma_od_municipalities / bronze_mitma_viajes_municipios
CREATE TABLE silver_mitma_od_municipalities (
    fecha DATE,  -- de TEXT a DATE
    hora SMALLINT,
    origen TEXT,
    destino TEXT,
    distancia TEXT,
    actividad_origen TEXT,
    actividad_destino TEXT,
    residencia TEXT,
    renta TEXT,
    edad TEXT,
    sexo TEXT,
    viajes DOUBLE,
    viajes_km DOUBLE,
    estudio_destino_posible BOOLEAN, -- de VARCHAR a BOOLEAN
    estudio_origen_posible BOOLEAN,  -- de VARCHAR a BOOLEAN
);

-- Viajes GAU
-- bronze_mitma_od_gau / bronze_mitma_viajes_gau
CREATE TABLE silver_mitma_od_gau (
    fecha DATE,  -- de TEXT a DATE
    hora SMALLINT,
    origen TEXT,
    destino TEXT,
    distancia TEXT,
    actividad_origen TEXT,
    actividad_destino TEXT,
    residencia TEXT,
    renta TEXT,
    edad TEXT,
    sexo TEXT,
    viajes DOUBLE,
    viajes_km DOUBLE,
    estudio_destino_posible BOOLEAN, -- de VARCHAR a BOOLEAN
    estudio_origen_posible BOOLEAN,  -- de VARCHAR a BOOLEAN
);
```

In [None]:
# Cargar holidays una sola vez al inicio
SQL("""
    INSTALL httpfs;
    LOAD httpfs;
""")

SQL("""
    -- La tabla temporal es visible solo para la sesión que la creó.
    CREATE OR REPLACE TEMP TABLE spanish_holidays AS
    WITH parsed_holidays AS (
        SELECT 
            json_extract(holiday, '$.startDate') AS date_str,
            CAST(json_extract(holiday, '$.nationwide') AS BOOLEAN) AS nationwide
        FROM read_json(
            'https://openholidaysapi.org/PublicHolidays?countryIsoCode=ES&languageIsoCode=ES&validFrom=2023-01-01&validTo=2023-12-31',
            format='array'
        ) AS t(holiday)
    )
    SELECT DISTINCT
        CAST(date_str AS DATE) AS date
    FROM parsed_holidays
    WHERE nationwide = TRUE;
""")


def load_od_matrices(type="districts"):
    """Load MITMA OD matrices from bronze to silver layer, transforming data types as needed."""

    dataset = 'od'
    table_name = f"{LAKE_LAYER}_mitma_{dataset}_{type}"

    SQL(f"""
        CREATE OR REPLACE TABLE {table_name} AS
        WITH parsed_data AS (
            SELECT
                strptime(CAST(fecha AS VARCHAR), '%Y%m%d')::DATE AS fecha,
                CAST(periodo AS SMALLINT) AS hora,
                origen,
                destino,
                distancia,
                actividad_origen,
                actividad_destino,
                residencia,
                renta,
                edad,
                sexo,
                CAST(viajes AS DOUBLE) AS viajes,
                CAST(viajes_km AS DOUBLE) AS viajes_km,
                CASE 
                    WHEN estudio_destino_posible IN ('si', 'SI', 'sí') THEN TRUE 
                    WHEN estudio_destino_posible IN ('no', 'NO') THEN FALSE 
                    ELSE NULL
                END AS estudio_destino_posible,
                CASE 
                    WHEN estudio_origen_posible IN ('si', 'SI', 'sí') THEN TRUE 
                    WHEN estudio_origen_posible IN ('no', 'NO') THEN FALSE 
                    ELSE NULL
                END AS estudio_origen_posible
            FROM bronze_mitma_{dataset}_{type}
        ),
        enriched_data AS (
            SELECT
                *,
                CASE 
                    WHEN dayofweek(fecha) IN (6, 7) THEN TRUE
                    ELSE FALSE
                END AS is_weekend,
                CASE
                    WHEN fecha IN (SELECT date FROM spanish_holidays) THEN TRUE
                    ELSE FALSE
                END AS is_holiday
            FROM parsed_data
        )
        SELECT * FROM enriched_data;
    """)

In [8]:
load_od_matrices(type="distritos")
load_od_matrices(type="municipios")
load_od_matrices(type="gau")

SQL(f""" DROP TABLE IF EXISTS spanish_holidays; """)

Unnamed: 0,Success


In [5]:
SQL(f"""
    SELECT *
    FROM {LAKE_LAYER}_mitma_od_distritos 
    LIMIT 10;
""")

Unnamed: 0,fecha,hora,origen,destino,distancia,actividad_origen,actividad_destino,residencia,renta,edad,sexo,viajes,viajes_km,estudio_destino_posible,estudio_origen_posible,is_weekend,is_holiday
0,2022-03-01,13,303102,303103,2-10,frecuente,frecuente,3,10-15,45-65,mujer,3.251,10.441,False,False,False,False
1,2022-03-01,13,303102,303103,2-10,frecuente,frecuente,3,10-15,65-100,mujer,7.426,18.648,False,False,False,False
2,2022-03-01,13,303102,303103,2-10,frecuente,frecuente,3,10-15,,,2.77,9.342,False,False,False,False
3,2022-03-01,13,303102,303103,2-10,frecuente,frecuente,3,<10,45-65,hombre,2.832,7.524,False,False,False,False
4,2022-03-01,13,303102,303103,2-10,frecuente,frecuente,3,<10,,,6.704,14.69,False,False,False,False
5,2022-03-01,13,303102,303103,2-10,frecuente,trabajo_estudio,3,10-15,25-45,,5.87,18.789,False,False,False,False
6,2022-03-01,13,303102,303103,2-10,frecuente,trabajo_estudio,3,10-15,25-45,hombre,4.05,9.324,False,False,False,False
7,2022-03-01,13,303102,303103,2-10,frecuente,trabajo_estudio,3,10-15,,,8.335,18.487,False,False,False,False
8,2022-03-01,13,303102,303103,2-10,frecuente,trabajo_estudio,3,<10,65-100,hombre,3.024,7.0,False,False,False,False
9,2022-03-01,13,303102,303103,2-10,frecuente,trabajo_estudio,3,<10,,,4.59,11.361,False,False,False,False


In [6]:
SQL(f"""
    SELECT is_weekend, count(*) as total
    FROM {LAKE_LAYER}_mitma_od_distritos 
    GROUP BY is_weekend;
    """)

Unnamed: 0,is_weekend,total
0,False,55862966


In [12]:
SQL(f"""
    SELECT '{LAKE_LAYER}_mitma_od_distritos' as name, count(*) FROM {LAKE_LAYER}_mitma_od_distritos
    UNION
    SELECT '{LAKE_LAYER}_mitma_od_municipios' as name, count(*) FROM {LAKE_LAYER}_mitma_od_municipios
    UNION
    SELECT '{LAKE_LAYER}_mitma_od_gau' as name, count(*) FROM {LAKE_LAYER}_mitma_od_gau
    ORDER BY count(*) DESC;
""")

Unnamed: 0,name,count_star()
0,silver_mitma_od_distritos,55862966
1,silver_mitma_od_municipios,34684127
2,silver_mitma_od_gau,20827473


```sql
-- Distritos
CREATE TABLE IF NOT EXISTS bronze_mitma_people_day_districts (
  fecha DATE,
  zona_pernoctacion TEXT,
  edad TEXT,
  sexo TEXT,
  numero_viajes TEXT,   -- 0,1,2,2+ (mantener TEXT)
  personas DOUBLE,
  loaded_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
  source_file TEXT
);
```

```sql
-- Municipios
CREATE TABLE IF NOT EXISTS bronze_mitma_people_day_municipalities (
  fecha DATE,
  zona_pernoctacion TEXT,
  edad TEXT,
  sexo TEXT,
  numero_viajes TEXT,
  personas DOUBLE,
  loaded_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
  source_file TEXT
);
```

```sql
-- GAU
CREATE TABLE IF NOT EXISTS bronze_mitma_people_day_gau (
  fecha DATE,
  zona_pernoctacion TEXT,
  edad TEXT,
  sexo TEXT,
  numero_viajes TEXT,
  personas DOUBLE,
  loaded_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
  source_file TEXT
);
```

In [6]:
def load_people_day(type="districts", start_date='2022-03-01', end_date='2022-03-03'):
    """ Load MITMA People Day data from bronze to silver layer, transforming data types as needed. """
    dataset = 'people_day'
    table_name = f'{LAKE_LAYER}_mitma_{dataset}_{type}'

    SQL(f"DROP TABLE IF EXISTS {table_name};")
    SQL(f"""
        CREATE TABLE IF NOT EXISTS {table_name}(
            fecha DATE,
            zona_pernoctacion TEXT,
            edad TEXT,
            sexo TEXT,
            numero_viajes TEXT,
            personas DOUBLE,
        );
    """)

    SQL(f"""
        INSERT INTO {table_name}
        SELECT
            strptime(CAST(fecha AS VARCHAR), '%Y%m%d')::DATE AS fecha,
            zona_pernoctacion,
            edad,
            sexo,
            numero_viajes,
            CAST(personas AS DOUBLE) AS personas,
        FROM bronze_mitma_{dataset}_{type};
    """)

In [7]:
load_people_day(type="distritos")
load_people_day(type="municipios")
load_people_day(type="gau")

In [8]:
SQL(f"SELECT * FROM {LAKE_LAYER}_mitma_people_day_distritos LIMIT 5;")

Unnamed: 0,fecha,zona_pernoctacion,edad,sexo,numero_viajes,personas
0,2022-03-01,1001,0-25,hombre,0,125.296
1,2022-03-01,1001,0-25,hombre,2,115.378
2,2022-03-01,1001,0-25,hombre,2+,176.63
3,2022-03-01,1001,0-25,mujer,0,125.069
4,2022-03-01,1001,0-25,mujer,2,117.712


In [9]:
SQL(f"""
    SELECT '{LAKE_LAYER}_mitma_people_day_distritos' as name, count(*) FROM {LAKE_LAYER}_mitma_people_day_distritos
    UNION
    SELECT '{LAKE_LAYER}_mitma_people_day_municipios' as name, count(*) FROM {LAKE_LAYER}_mitma_people_day_municipios
    UNION
    SELECT '{LAKE_LAYER}_mitma_people_day_gau' as name, count(*) FROM {LAKE_LAYER}_mitma_people_day_gau;
""")

Unnamed: 0,name,count_star()
0,silver_mitma_people_day_gau,182214
1,silver_mitma_people_day_distritos,340448
2,silver_mitma_people_day_municipios,231688


```sql
-- Distritos
CREATE TABLE IF NOT EXISTS bronze_mitma_overnight_stay_districts (
  fecha DATE,
  zona_residencia TEXT,
  zona_pernoctacion TEXT,
  personas DOUBLE,
  loaded_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
  source_file TEXT
);
```

```sql
-- Municipios
CREATE TABLE IF NOT EXISTS bronze_mitma_overnight_stay_municipalities (
  fecha DATE,
  zona_residencia TEXT,
  zona_pernoctacion TEXT,
  personas DOUBLE,
  loaded_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
  source_file TEXT
);
```

```sql
-- GAU
CREATE TABLE IF NOT EXISTS bronze_mitma_overnight_stay_gau (
  fecha DATE,
  zona_residencia TEXT,
  zona_pernoctacion TEXT,
  personas DOUBLE,
  loaded_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
  source_file TEXT
);
```

In [10]:

def load_overnight_stay(type="districts"):
    dataset = 'overnight_stay'
    table_name = f'{LAKE_LAYER}_mitma_{dataset}_{type}'

    SQL(f"DROP TABLE IF EXISTS {table_name};")
    SQL(f"""
        CREATE TABLE IF NOT EXISTS {table_name}(
            fecha DATE,
            zona_residencia TEXT,
            zona_pernoctacion TEXT,
            personas DOUBLE,
        );
    """)

    SQL(f"""
        INSERT INTO {table_name}
        SELECT
            strptime(CAST(fecha AS VARCHAR), '%Y%m%d')::DATE AS fecha,
            zona_residencia,
            zona_pernoctacion,
            CAST(personas AS DOUBLE) AS personas,
        FROM bronze_mitma_{dataset}_{type};
""")

In [11]:
load_overnight_stay(type="distritos")
load_overnight_stay(type="municipios")
load_overnight_stay(type="gau")

In [12]:
SQL(f"SELECT * FROM {LAKE_LAYER}_mitma_overnight_stay_distritos LIMIT 5;")

Unnamed: 0,fecha,zona_residencia,zona_pernoctacion,personas
0,2022-03-01,1001,01001,2733.784
1,2022-03-01,1001,01004_AM,2.514
2,2022-03-01,1001,01009_AM,18.431
3,2022-03-01,1001,01017_AM,2.922
4,2022-03-01,1001,01051,7.831


In [14]:
SQL(f"""
    SELECT '{LAKE_LAYER}_mitma_overnight_stay_distritos' as name, count(*) FROM {LAKE_LAYER}_mitma_overnight_stay_distritos
    UNION
    SELECT '{LAKE_LAYER}_mitma_overnight_stay_municipios' as name, count(*) FROM {LAKE_LAYER}_mitma_overnight_stay_municipios
    UNION
    SELECT '{LAKE_LAYER}_mitma_overnight_stay_gau' as name, count(*) FROM {LAKE_LAYER}_mitma_overnight_stay_gau
    ORDER BY count(*) DESC;
""")

Unnamed: 0,name,count_star()
0,silver_mitma_overnight_stay_distritos,947839
1,silver_mitma_overnight_stay_municipios,741266
2,silver_mitma_overnight_stay_gau,574703


```sql
-- Distritos
CREATE TABLE IF NOT EXISTS bronze_mitma_districts (
  id TEXT,
  name TEXT,
  population DOUBLE,
  geometry GEOMETRY,
  municipio_id TEXT,
);
```

```sql
-- Municipios
CREATE TABLE IF NOT EXISTS bronze_mitma_municipalities (
  id TEXT,
  name TEXT,
  population DOUBLE,
  geometry GEOMETRY,
);
```

```sql
-- GAU
CREATE TABLE IF NOT EXISTS bronze_mitma_gau (
  id TEXT,
  name TEXT,
  population DOUBLE,
  geometry GEOMETRY,
);
```

In [None]:

def load_zones(type="districts", id_length=None):
    table_name = f'{LAKE_LAYER}_mitma_{type}'

    SQL("""
        INSTALL spatial;
        LOAD spatial;
    """)

    SQL(f"DROP TABLE IF EXISTS {table_name};")
    SQL(f"""
        CREATE TABLE IF NOT EXISTS {table_name}(
            id TEXT,
            name TEXT,
            population DOUBLE,
            geometry GEOMETRY,
        );
    """)

    where_clause = f"WHERE length(id) = {id_length}" if id_length is not None else ""

    SQL(f"""
        INSERT INTO {table_name}
        SELECT
            id,
            name,
            CAST(population AS DOUBLE) AS population,
            ST_Multi(ST_GeomFromText(geometry)) AS geometry
        FROM bronze_mitma_{type}
        {where_clause}
        ORDER BY id ASC;
    """)

In [16]:
load_zones(type="distritos", id_length=7)
load_zones(type="municipios", id_length=5)
load_zones(type="gau", id_length=8)

In [None]:
# metemos la columna municipio_id en la tabla de distritos.
# pues el id de un distrito es el de su municipio más un sufijo de dos dígitos.
# 28079 (Municipio de Madrid) -> 2807901 (Distrito de Centro)
type = "distritos"

SQL(f"""
    ALTER TABLE {LAKE_LAYER}_mitma_{type} ADD COLUMN municipio_id TEXT;
""")

SQL(f"""
    UPDATE {LAKE_LAYER}_mitma_{type}
    SET municipio_id = CASE
        WHEN length(id) = 7 THEN substring(id, 1, 5)
        WHEN length(id) = 5 THEN id
        ELSE NULL
    END;
""")

In [62]:
SQL(f"""
    SELECT 
        * EXCLUDE (geometry), 
        ST_AsGeoJSON(geometry) AS geojson 
    FROM {LAKE_LAYER}_mitma_distritos
    WHERE municipio_id = '01059'
    LIMIT 10;
""")

Unnamed: 0,id,name,population,municipio_id,geojson
0,105901,Vitoria-Gasteiz distrito 01,38600.0,1059,"{""type"":""MultiPolygon"",""coordinates"":[[[[-2.66..."
1,105902,Vitoria-Gasteiz distrito 02,69807.0,1059,"{""type"":""MultiPolygon"",""coordinates"":[[[[-2.67..."
2,105903,Vitoria-Gasteiz distrito 03,54114.0,1059,"{""type"":""MultiPolygon"",""coordinates"":[[[[-2.66..."
3,105904,Vitoria-Gasteiz distrito 04,33042.0,1059,"{""type"":""MultiPolygon"",""coordinates"":[[[[-2.66..."
4,105905,Vitoria-Gasteiz distrito 05,52432.0,1059,"{""type"":""MultiPolygon"",""coordinates"":[[[[-2.69..."
5,105906,Vitoria-Gasteiz distrito 06,5098.0,1059,"{""type"":""MultiPolygon"",""coordinates"":[[[[-2.64..."


In [60]:
SQL(f"""
    SELECT 
        * EXCLUDE (geometry), 
        ST_AsGeoJSON(geometry) AS geojson 
    FROM {LAKE_LAYER}_mitma_municipios
    WHERE id = '01059'
    LIMIT 5;
""")

Unnamed: 0,id,name,population,geojson
0,1059,Vitoria-Gasteiz,253093.0,"{""type"":""MultiPolygon"",""coordinates"":[[[[-2.64..."


In [57]:
SQL(f"""
    SELECT '{LAKE_LAYER}_mitma_distritos' as name, count(*) FROM {LAKE_LAYER}_mitma_distritos
    UNION
    SELECT '{LAKE_LAYER}_mitma_municipios' as name, count(*) FROM {LAKE_LAYER}_mitma_municipios
    UNION
    SELECT '{LAKE_LAYER}_mitma_gau' as name, count(*) FROM {LAKE_LAYER}_mitma_gau
    ORDER BY count(*) DESC;
""")

Unnamed: 0,name,count_star()
0,silver_mitma_municipios,1645
1,silver_mitma_distritos,1565
2,silver_mitma_gau,911


```sql
-- Distritos
CREATE TABLE IF NOT EXISTS bronze_ine_padron_municipios (
  cod        VARCHAR,
  nombre     VARCHAR,
  fk_unidad  INTEGER,
  fk_escala  INTEGER,
  data_txt   TEXT,
  data       JSON,
  -- Columnas extras añadidas para auditoria. 
  loaded_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
  -- loaded_by TEXT DEFAULT CURRENT_USER,
  source_file TEXT
);

In [63]:
con.close()