In [4]:
# Install the notebook's dependencies into the kernel's environment.
# NOTE(review): versions are unpinned; consider pinning them for reproducible runs.
%pip install duckdb pandas numpy pyspainmobility

Collecting pyspainmobility
  Downloading pyspainmobility-1.0.4-py3-none-any.whl.metadata (4.9 kB)
Collecting geopandas~=1.0.1 (from pyspainmobility)
  Downloading geopandas-1.0.1-py3-none-any.whl.metadata (2.2 kB)
Collecting tqdm>=4.0.0 (from pyspainmobility)
  Downloading tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Collecting matplotlib>=3.0.0 (from pyspainmobility)
  Downloading matplotlib-3.10.7-cp313-cp313-macosx_11_0_arm64.whl.metadata (11 kB)
Collecting pyarrow>=8.0.0 (from pyspainmobility)
  Downloading pyarrow-22.0.0-cp313-cp313-macosx_12_0_arm64.whl.metadata (3.2 kB)
Collecting dask>=2024.0 (from dask[dataframe]>=2024.0->pyspainmobility)
  Downloading dask-2025.11.0-py3-none-any.whl.metadata (3.8 kB)
Collecting pyogrio>=0.7.2 (from geopandas~=1.0.1->pyspainmobility)
  Downloading pyogrio-0.11.1-cp313-cp313-macosx_12_0_arm64.whl.metadata (5.3 kB)
Collecting pyproj>=3.3.0 (from geopandas~=1.0.1->pyspainmobility)
  Downloading pyproj-3.7.2-cp313-cp313-macosx_14_0_arm64.whl.meta

In [4]:
import duckdb
import pandas

# Single DuckDB connection shared by every cell below; the database file lives
# one directory above the notebook's working directory.
con = duckdb.connect('../bronze.db')

def SQL(q, params=None):
    """Execute a SQL statement on the shared connection and return a DataFrame.

    The statement is not echoed; only its result is returned.

    Args:
        q: SQL text to execute.
        params: Optional values bound to the statement's placeholders.
            Leave as None for statements without placeholders.

    Returns:
        The statement's result set as a pandas DataFrame.
    """
    # Only forward parameters when the caller supplied them, so plain
    # statements keep going through the exact same call as before.
    cursor = con.execute(q) if params is None else con.execute(q, params)
    return cursor.fetchdf()

print("DuckDB version:", con.sql("SELECT version();").fetchone()[0]) # type: ignore

DuckDB version: v1.4.2


```sql
-- viajes distritos
-- las columnas 'si'/'no' se castean de VARCHAR a BOOLEAN (true/false)
-- bronze_mitma_od_districts / bronze_mitma_viajes_distritos
CREATE TABLE bronze_mitma_od_districts (
    fecha DATE, -- de TEXT a DATE
    hora SMALLINT,
    origen TEXT,
    destino TEXT,
    distancia TEXT,
    actividad_origen TEXT,
    actividad_destino TEXT,
    residencia TEXT,
    renta TEXT,
    edad TEXT,
    sexo TEXT,
    viajes DOUBLE,
    viajes_km DOUBLE,
    estudio_destino_posible BOOLEAN,  -- de VARCHAR a BOOLEAN
    estudio_origen_posible BOOLEAN,   -- de VARCHAR a BOOLEAN
    -- Columnas extras añadidas para auditoria. 
    loaded_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    -- loaded_by TEXT DEFAULT CURRENT_USER,
    source_file TEXT
);

-- Viajes municipios
-- bronze_mitma_od_municipalities / bronze_mitma_viajes_municipios
CREATE TABLE bronze_mitma_od_municipalities (
    fecha DATE,  -- de TEXT a DATE
    hora SMALLINT,
    origen TEXT,
    destino TEXT,
    distancia TEXT,
    actividad_origen TEXT,
    actividad_destino TEXT,
    residencia TEXT,
    renta TEXT,
    edad TEXT,
    sexo TEXT,
    viajes DOUBLE,
    viajes_km DOUBLE,
    estudio_destino_posible BOOLEAN, -- de VARCHAR a BOOLEAN
    estudio_origen_posible BOOLEAN,  -- de VARCHAR a BOOLEAN
    -- auditoria
    loaded_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    -- loaded_by TEXT DEFAULT CURRENT_USER,
    source_file TEXT
);

-- Viajes GAU
-- bronze_mitma_od_gau / bronze_mitma_viajes_gau
CREATE TABLE bronze_mitma_od_gau (
    fecha DATE,  -- de TEXT a DATE
    hora SMALLINT,
    origen TEXT,
    destino TEXT,
    distancia TEXT,
    actividad_origen TEXT,
    actividad_destino TEXT,
    residencia TEXT,
    renta TEXT,
    edad TEXT,
    sexo TEXT,
    viajes DOUBLE,
    viajes_km DOUBLE,
    estudio_destino_posible BOOLEAN, -- de VARCHAR a BOOLEAN
    estudio_origen_posible BOOLEAN,  -- de VARCHAR a BOOLEAN
    -- auditoria
    loaded_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    -- loaded_by TEXT DEFAULT CURRENT_USER,
    source_file TEXT
);
```

In [None]:
from pyspainmobility import Mobility
import os

def load_od_matrices(type = "districts", start_date='2022-03-01', end_date='2022-03-03'):
    """Download (when missing) the OD trip matrices and rebuild their bronze table.

    The raw files are cached in ``../raw/viajes_{type}``. If that directory
    already contains anything, nothing is downloaded and ``start_date`` /
    ``end_date`` are ignored. The table ``bronze_mitma_od_{type}`` is always
    dropped and reloaded from every ``*.csv.gz`` file in the cache directory.

    Args:
        type: Zoning level; used for the ``zones`` argument of ``Mobility``,
            the cache directory name and the table name suffix.
        start_date: First day to download, as ``YYYY-MM-DD``.
        end_date: Last day to download, as ``YYYY-MM-DD``.
    """
    dataset = 'od'
    dataset_path = f'../raw/viajes_{type}'
    table_name = f'bronze_mitma_{dataset}_{type}'

    os.makedirs(dataset_path, exist_ok=True)

    if not os.listdir(dataset_path): # only download if the directory is empty
        data = Mobility(
            version=2,
            zones=type,
            start_date=start_date,
            end_date=end_date,
            # Download into the very directory that is checked above and read
            # below, instead of a machine-specific hard-coded location. The
            # path is kept absolute, as in the original call.
            # NOTE(review): the library's handling of relative paths was not verified.
            output_directory=os.path.abspath(dataset_path),
        )

        data.get_od_data(keep_activity=True)

    # Full refresh: the table is recreated from scratch on every call.
    SQL(f"DROP TABLE IF EXISTS {table_name};")
    SQL(f"""
        CREATE TABLE IF NOT EXISTS {table_name}(
            fecha DATE, -- de TEXT a DATE
            hora SMALLINT, -- de TEXT a SMALLINT, de periodo a hora
            origen TEXT,
            destino TEXT,
            distancia TEXT,
            actividad_origen TEXT,
            actividad_destino TEXT,
            residencia TEXT,
            renta TEXT,
            edad TEXT,
            sexo TEXT,
            viajes DOUBLE,
            viajes_km DOUBLE,
            estudio_destino_posible BOOLEAN,  -- de VARCHAR a BOOLEAN
            estudio_origen_posible BOOLEAN,   -- de VARCHAR a BOOLEAN
            -- Columnas extras añadidas para auditoria. 
            loaded_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
            -- loaded_by TEXT DEFAULT CURRENT_USER,
            source_file TEXT
        );
    """)

    # Everything is read as text (all_varchar) and cast explicitly; the
    # `filename` option exposes the originating file for the audit column.
    SQL(f"""
        INSERT INTO {table_name}
        SELECT
            strptime(CAST(fecha AS VARCHAR), '%Y%m%d')::DATE AS fecha,
            CAST(periodo AS SMALLINT) AS hora,
            origen,
            destino,
            distancia,
            actividad_origen,
            actividad_destino,
            residencia,
            renta,
            edad,
            sexo,
            CAST(viajes AS DOUBLE) AS viajes,
            CAST(viajes_km AS DOUBLE) AS viajes_km,
            CASE 
                WHEN estudio_destino_posible IN ('si', 'SI', 'sí') THEN TRUE 
                WHEN estudio_destino_posible IN ('no', 'NO') THEN FALSE 
                ELSE NULL
            END AS estudio_destino_posible,
            CASE 
                WHEN estudio_origen_posible IN ('si', 'SI', 'sí') THEN TRUE 
                WHEN estudio_origen_posible IN ('no', 'NO') THEN FALSE 
                ELSE NULL
            END AS estudio_origen_posible,
            CURRENT_TIMESTAMP AS loaded_at,
            -- CURRENT_USER AS loaded_by,
            filename AS source_file
        FROM read_csv(
            '{dataset_path}/*.csv.gz',
            filename = true,
            all_varchar = true
        );
    """)

In [31]:
# Rebuild the OD bronze table for each zoning level, in the same order as before.
for zone_type in ("distritos", "municipios", "gau"):
    load_od_matrices(type=zone_type)

In [33]:
# Preview up to five rows of the municipality-level OD table.
SQL("SELECT * FROM bronze_mitma_od_municipios LIMIT 5;")

Unnamed: 0,fecha,hora,origen,destino,distancia,actividad_origen,actividad_destino,residencia,renta,edad,sexo,viajes,viajes_km,estudio_destino_posible,estudio_origen_posible,loaded_at,source_file
0,2022-03-01,0,1001,01009_AM,2-10,casa,frecuente,1,10-15,,,10.279,67.832,False,False,2025-11-15 22:51:17.350963,../raw/viajes_municipios/20220301_Viajes_munic...
1,2022-03-01,0,1001,01009_AM,2-10,frecuente,casa,1,>15,,,4.591,42.419,False,False,2025-11-15 22:51:17.350963,../raw/viajes_municipios/20220301_Viajes_munic...
2,2022-03-01,0,1001,01058_AM,10-50,frecuente,no_frecuente,1,>15,,,4.773,72.081,False,False,2025-11-15 22:51:17.350963,../raw/viajes_municipios/20220301_Viajes_munic...
3,2022-03-01,0,1001,01058_AM,2-10,frecuente,casa,1,>15,,,1.739,15.513,False,False,2025-11-15 22:51:17.350963,../raw/viajes_municipios/20220301_Viajes_munic...
4,2022-03-01,0,1001,01059,10-50,frecuente,casa,1,>15,0-25,mujer,12.318,155.587,False,False,2025-11-15 22:51:17.350963,../raw/viajes_municipios/20220301_Viajes_munic...


In [35]:
# Preview up to five rows of the GAU-level OD table.
SQL("SELECT * FROM bronze_mitma_od_gau LIMIT 5;")

Unnamed: 0,fecha,hora,origen,destino,distancia,actividad_origen,actividad_destino,residencia,renta,edad,sexo,viajes,viajes_km,estudio_destino_posible,estudio_origen_posible,loaded_at,source_file
0,2022-03-01,0,1001,01009_AM,2-10,casa,frecuente,1,10-15,,,10.279,67.832,False,False,2025-11-15 22:51:57.938569,../raw/viajes_gau/20220301_Viajes_GAU_v2.csv.gz
1,2022-03-01,0,1001,01009_AM,2-10,frecuente,casa,1,>15,,,4.591,42.419,False,False,2025-11-15 22:51:57.938569,../raw/viajes_gau/20220301_Viajes_GAU_v2.csv.gz
2,2022-03-01,0,1001,01058_AM,10-50,frecuente,no_frecuente,1,>15,,,4.773,72.081,False,False,2025-11-15 22:51:57.938569,../raw/viajes_gau/20220301_Viajes_GAU_v2.csv.gz
3,2022-03-01,0,1001,01058_AM,2-10,frecuente,casa,1,>15,,,1.739,15.513,False,False,2025-11-15 22:51:57.938569,../raw/viajes_gau/20220301_Viajes_GAU_v2.csv.gz
4,2022-03-01,0,1001,GAU Vitoria/Gasteiz,10-50,frecuente,casa,1,>15,0-25,mujer,12.318,155.587,False,False,2025-11-15 22:51:57.938569,../raw/viajes_gau/20220301_Viajes_GAU_v2.csv.gz


In [40]:
# TODO: alter the table to rename `periodo` to `hora`; the column was created as
# `periodo` instead of `hora`.
# NOTE(review): the preview below already shows a `hora` column, so this TODO
# may be stale — confirm and remove.
SQL("SELECT * FROM bronze_mitma_od_distritos LIMIT 5;")

Unnamed: 0,fecha,hora,origen,destino,distancia,actividad_origen,actividad_destino,residencia,renta,edad,sexo,viajes,viajes_km,estudio_destino_posible,estudio_origen_posible,loaded_at,source_file
0,2022-03-01,8,1001,01002,10-50,casa,frecuente,1,10-15,,,2.764,125.486,False,False,2025-11-15 22:50:14.404164,../raw/viajes_distritos/20220301_Viajes_distri...
1,2022-03-01,17,1001,01002,10-50,no_frecuente,casa,1,10-15,,,6.526,303.751,False,False,2025-11-15 22:50:14.404164,../raw/viajes_distritos/20220301_Viajes_distri...
2,2022-03-01,0,1001,01009_AM,2-10,casa,frecuente,1,10-15,,,10.279,67.832,False,False,2025-11-15 22:50:14.404164,../raw/viajes_distritos/20220301_Viajes_distri...
3,2022-03-01,0,1001,01009_AM,2-10,frecuente,casa,1,>15,,,4.591,42.419,False,False,2025-11-15 22:50:14.404164,../raw/viajes_distritos/20220301_Viajes_distri...
4,2022-03-01,2,1001,01009_AM,2-10,casa,frecuente,1,10-15,,,2.539,13.819,False,False,2025-11-15 22:50:14.404164,../raw/viajes_distritos/20220301_Viajes_distri...


```sql
-- Distritos
CREATE TABLE IF NOT EXISTS bronze_mitma_people_day_districts (
  fecha DATE,
  zona_pernoctacion TEXT,
  edad TEXT,
  sexo TEXT,
  numero_viajes TEXT,   -- 0,1,2,2+ (mantener TEXT)
  personas DOUBLE,
  loaded_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
  source_file TEXT
);

-- Municipios
CREATE TABLE IF NOT EXISTS bronze_mitma_people_day_municipalities (
  fecha DATE,
  zona_pernoctacion TEXT,
  edad TEXT,
  sexo TEXT,
  numero_viajes TEXT,
  personas DOUBLE,
  loaded_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
  source_file TEXT
);

-- GAU
CREATE TABLE IF NOT EXISTS bronze_mitma_people_day_gau (
  fecha DATE,
  zona_pernoctacion TEXT,
  edad TEXT,
  sexo TEXT,
  numero_viajes TEXT,
  personas DOUBLE,
  loaded_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
  source_file TEXT
);
```

In [42]:
from pyspainmobility import Mobility
import os

def load_people_day(type = "districts", start_date='2022-03-01', end_date='2022-03-03'):
    """Download (when missing) the people-per-day data and rebuild its bronze table.

    The raw files are cached in ``../raw/people_day_{type}``. If that
    directory already contains anything, nothing is downloaded and
    ``start_date`` / ``end_date`` are ignored. The table
    ``bronze_mitma_people_day_{type}`` is always dropped and reloaded from
    every ``*.csv.gz`` file in the cache directory.

    Args:
        type: Zoning level; used for the ``zones`` argument of ``Mobility``,
            the cache directory name and the table name suffix.
        start_date: First day to download, as ``YYYY-MM-DD``.
        end_date: Last day to download, as ``YYYY-MM-DD``.
    """
    dataset = 'people_day'
    dataset_path = f'../raw/{dataset}_{type}'
    table_name = f'bronze_mitma_{dataset}_{type}'

    os.makedirs(dataset_path, exist_ok=True)

    if not os.listdir(dataset_path): # only download if the directory is empty
        data = Mobility(
            version=2,
            zones=type,
            start_date=start_date,
            end_date=end_date,
            # Download into the very directory that is checked above and read
            # below, instead of a machine-specific hard-coded location. The
            # path is kept absolute, as in the original call.
            # NOTE(review): the library's handling of relative paths was not verified.
            output_directory=os.path.abspath(dataset_path),
        )

        data.get_number_of_trips_data()

    # Full refresh: the table is recreated from scratch on every call.
    SQL(f"DROP TABLE IF EXISTS {table_name};")
    SQL(f"""
        CREATE TABLE IF NOT EXISTS {table_name}(
            fecha DATE,
            zona_pernoctacion TEXT,
            edad TEXT,
            sexo TEXT,
            numero_viajes TEXT,
            personas DOUBLE,
            -- Columnas extras añadidas para auditoria. 
            loaded_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
            -- loaded_by TEXT DEFAULT CURRENT_USER,
            source_file TEXT
        );
    """)

    # Everything is read as text (all_varchar) and cast explicitly; the
    # `filename` option exposes the originating file for the audit column.
    SQL(f"""
        INSERT INTO {table_name}
        SELECT
            strptime(CAST(fecha AS VARCHAR), '%Y%m%d')::DATE AS fecha,
            zona_pernoctacion,
            edad,
            sexo,
            numero_viajes,
            CAST(personas AS DOUBLE) AS personas,
            CURRENT_TIMESTAMP AS loaded_at,
            -- CURRENT_USER AS loaded_by,
            filename AS source_file
        FROM read_csv(
            '{dataset_path}/*.csv.gz',
            filename = true,
            all_varchar = true
        );
    """)

In [None]:
# Rebuild the people-per-day bronze table for each zoning level, in the same order as before.
for zone_type in ("distritos", "municipios", "gau"):
    load_people_day(type=zone_type)

In [None]:
# Preview up to five rows of the district-level people-per-day table.
SQL("SELECT * FROM bronze_mitma_people_day_distritos LIMIT 5;")

Unnamed: 0,fecha,zona_pernoctacion,edad,sexo,numero_viajes,personas,loaded_at,source_file
0,2022-03-01,1001,0-25,hombre,0,125.296,2025-11-16 09:18:07.482225,../raw/people_day_distritos/20220301_Personas_...
1,2022-03-01,1001,0-25,hombre,2,115.378,2025-11-16 09:18:07.482225,../raw/people_day_distritos/20220301_Personas_...
2,2022-03-01,1001,0-25,hombre,2+,176.63,2025-11-16 09:18:07.482225,../raw/people_day_distritos/20220301_Personas_...
3,2022-03-01,1001,0-25,mujer,0,125.069,2025-11-16 09:18:07.482225,../raw/people_day_distritos/20220301_Personas_...
4,2022-03-01,1001,0-25,mujer,2,117.712,2025-11-16 09:18:07.482225,../raw/people_day_distritos/20220301_Personas_...


In [44]:
# Preview up to five rows of the municipality-level people-per-day table.
SQL("SELECT * FROM bronze_mitma_people_day_municipios LIMIT 5;")

Unnamed: 0,fecha,zona_pernoctacion,edad,sexo,numero_viajes,personas,loaded_at,source_file
0,2022-03-01,1001,0-25,hombre,0,125.296,2025-11-16 09:18:11.377297,../raw/people_day_municipios/20220301_Personas...
1,2022-03-01,1001,0-25,hombre,2,115.378,2025-11-16 09:18:11.377297,../raw/people_day_municipios/20220301_Personas...
2,2022-03-01,1001,0-25,hombre,2+,176.63,2025-11-16 09:18:11.377297,../raw/people_day_municipios/20220301_Personas...
3,2022-03-01,1001,0-25,mujer,0,125.069,2025-11-16 09:18:11.377297,../raw/people_day_municipios/20220301_Personas...
4,2022-03-01,1001,0-25,mujer,2,117.712,2025-11-16 09:18:11.377297,../raw/people_day_municipios/20220301_Personas...


In [2]:
# Preview up to five rows of the GAU-level people-per-day table.
SQL("SELECT * FROM bronze_mitma_people_day_gau LIMIT 5;")

Unnamed: 0,fecha,zona_pernoctacion,edad,sexo,numero_viajes,personas,loaded_at,source_file
0,2022-03-01,1001,0-25,hombre,0,125.296,2025-11-16 09:18:14.943879,../raw/people_day_gau/20220301_Personas_GAU_v2...
1,2022-03-01,1001,0-25,hombre,2,115.378,2025-11-16 09:18:14.943879,../raw/people_day_gau/20220301_Personas_GAU_v2...
2,2022-03-01,1001,0-25,hombre,2+,176.63,2025-11-16 09:18:14.943879,../raw/people_day_gau/20220301_Personas_GAU_v2...
3,2022-03-01,1001,0-25,mujer,0,125.069,2025-11-16 09:18:14.943879,../raw/people_day_gau/20220301_Personas_GAU_v2...
4,2022-03-01,1001,0-25,mujer,2,117.712,2025-11-16 09:18:14.943879,../raw/people_day_gau/20220301_Personas_GAU_v2...


```sql
-- Distritos
CREATE TABLE IF NOT EXISTS bronze_mitma_overnight_stay_districts (
  fecha DATE,
  zona_residencia TEXT,
  zona_pernoctacion TEXT,
  personas DOUBLE,
  loaded_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
  source_file TEXT
);

-- Municipios
CREATE TABLE IF NOT EXISTS bronze_mitma_overnight_stay_municipalities (
  fecha DATE,
  zona_residencia TEXT,
  zona_pernoctacion TEXT,
  personas DOUBLE,
  loaded_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
  source_file TEXT
);

-- GAU
CREATE TABLE IF NOT EXISTS bronze_mitma_overnight_stay_gau (
  fecha DATE,
  zona_residencia TEXT,
  zona_pernoctacion TEXT,
  personas DOUBLE,
  loaded_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
  source_file TEXT
);
```

In [47]:
from pyspainmobility import Mobility
import os

def load_overnight_stay(type = "districts", start_date='2022-03-01', end_date='2022-03-03'):
    """Download (when missing) the overnight-stay data and rebuild its bronze table.

    The raw files are cached in ``../raw/overnight_stay_{type}``. If that
    directory already contains anything, nothing is downloaded and
    ``start_date`` / ``end_date`` are ignored. The table
    ``bronze_mitma_overnight_stay_{type}`` is always dropped and reloaded
    from every ``*.csv.gz`` file in the cache directory.

    Args:
        type: Zoning level; used for the ``zones`` argument of ``Mobility``,
            the cache directory name and the table name suffix.
        start_date: First day to download, as ``YYYY-MM-DD``.
        end_date: Last day to download, as ``YYYY-MM-DD``.
    """
    dataset = 'overnight_stay'
    dataset_path = f'../raw/{dataset}_{type}'
    table_name = f'bronze_mitma_{dataset}_{type}'

    os.makedirs(dataset_path, exist_ok=True)

    if not os.listdir(dataset_path): # only download if the directory is empty
        data = Mobility(
            version=2,
            zones=type,
            start_date=start_date,
            end_date=end_date,
            # Download into the very directory that is checked above and read
            # below, instead of a machine-specific hard-coded location. The
            # path is kept absolute, as in the original call.
            # NOTE(review): the library's handling of relative paths was not verified.
            output_directory=os.path.abspath(dataset_path),
        )

        data.get_overnight_stays_data()

    # Full refresh: the table is recreated from scratch on every call.
    SQL(f"DROP TABLE IF EXISTS {table_name};")
    SQL(f"""
        CREATE TABLE IF NOT EXISTS {table_name}(
            fecha DATE,
            zona_residencia TEXT,
            zona_pernoctacion TEXT,
            personas DOUBLE,
            -- Columnas extras añadidas para auditoria. 
            loaded_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
            -- loaded_by TEXT DEFAULT CURRENT_USER,
            source_file TEXT
        );
    """)

    # Everything is read as text (all_varchar) and cast explicitly; the
    # `filename` option exposes the originating file for the audit column.
    SQL(f"""
        INSERT INTO {table_name}
        SELECT
            strptime(CAST(fecha AS VARCHAR), '%Y%m%d')::DATE AS fecha,
            zona_residencia,
            zona_pernoctacion,
            CAST(personas AS DOUBLE) AS personas,
            CURRENT_TIMESTAMP AS loaded_at,
            -- CURRENT_USER AS loaded_by,
            filename AS source_file
        FROM read_csv(
            '{dataset_path}/*.csv.gz',
            filename = true,
            all_varchar = true
        );
    """)

In [None]:
# Rebuild the overnight-stay bronze table for each zoning level, in the same order as before.
for zone_type in ("distritos", "municipios", "gau"):
    load_overnight_stay(type=zone_type)

In [49]:
# Preview up to five rows of the district-level overnight-stay table.
SQL("SELECT * FROM bronze_mitma_overnight_stay_distritos LIMIT 5;")

Unnamed: 0,fecha,zona_residencia,zona_pernoctacion,personas,loaded_at,source_file
0,2022-03-01,1001,01001,2733.784,2025-11-16 09:24:52.111616,../raw/overnight_stay_distritos/20220301_Perno...
1,2022-03-01,1001,01004_AM,2.514,2025-11-16 09:24:52.111616,../raw/overnight_stay_distritos/20220301_Perno...
2,2022-03-01,1001,01009_AM,18.431,2025-11-16 09:24:52.111616,../raw/overnight_stay_distritos/20220301_Perno...
3,2022-03-01,1001,01017_AM,2.922,2025-11-16 09:24:52.111616,../raw/overnight_stay_distritos/20220301_Perno...
4,2022-03-01,1001,01051,7.831,2025-11-16 09:24:52.111616,../raw/overnight_stay_distritos/20220301_Perno...


In [50]:
# Preview up to five rows of the municipality-level overnight-stay table.
SQL("SELECT * FROM bronze_mitma_overnight_stay_municipios LIMIT 5;")

Unnamed: 0,fecha,zona_residencia,zona_pernoctacion,personas,loaded_at,source_file
0,2022-03-01,1001,01001,2733.784,2025-11-16 09:24:56.575760,../raw/overnight_stay_municipios/20220301_Pern...
1,2022-03-01,1001,01004_AM,2.514,2025-11-16 09:24:56.575760,../raw/overnight_stay_municipios/20220301_Pern...
2,2022-03-01,1001,01009_AM,18.431,2025-11-16 09:24:56.575760,../raw/overnight_stay_municipios/20220301_Pern...
3,2022-03-01,1001,01017_AM,2.922,2025-11-16 09:24:56.575760,../raw/overnight_stay_municipios/20220301_Pern...
4,2022-03-01,1001,01051,7.831,2025-11-16 09:24:56.575760,../raw/overnight_stay_municipios/20220301_Pern...


In [51]:
# Preview up to five rows of the GAU-level overnight-stay table.
SQL("SELECT * FROM bronze_mitma_overnight_stay_gau LIMIT 5;")

Unnamed: 0,fecha,zona_residencia,zona_pernoctacion,personas,loaded_at,source_file
0,2022-03-01,1001,01001,2733.784,2025-11-16 09:25:00.656147,../raw/overnight_stay_gau/20220301_Pernoctacio...
1,2022-03-01,1001,01004_AM,2.514,2025-11-16 09:25:00.656147,../raw/overnight_stay_gau/20220301_Pernoctacio...
2,2022-03-01,1001,01009_AM,18.431,2025-11-16 09:25:00.656147,../raw/overnight_stay_gau/20220301_Pernoctacio...
3,2022-03-01,1001,01017_AM,2.922,2025-11-16 09:25:00.656147,../raw/overnight_stay_gau/20220301_Pernoctacio...
4,2022-03-01,1001,01051,7.831,2025-11-16 09:25:00.656147,../raw/overnight_stay_gau/20220301_Pernoctacio...


```sql
-- Distritos
CREATE TABLE IF NOT EXISTS bronze_mitma_districts (
  id TEXT,
  name TEXT,
  population DOUBLE,
  loaded_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
  source_file TEXT
);

-- Municipios
CREATE TABLE IF NOT EXISTS bronze_mitma_municipalities (
  id TEXT,
  name TEXT,
  population DOUBLE,
  loaded_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
  source_file TEXT
);

-- GAU
CREATE TABLE IF NOT EXISTS bronze_mitma_gau (
  id TEXT,
  name TEXT,
  population DOUBLE,
  loaded_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
  source_file TEXT
);
```

In [2]:
# Load IPython's autoreload extension (its mode is set in the next cell).
%load_ext autoreload

In [13]:
# Mode 2: reload imported modules before each cell execution so edits to them are picked up.
%autoreload 2

from pyspainmobility import Zones
import os
import pandas as pd

def load_zones(type = "districts"):
    """Fetch (when missing) the zone definitions and rebuild their bronze table.

    The zones are cached as ``../raw/{type}/zones.csv.gz``. When that file is
    absent, the GeoDataFrame returned by ``Zones.get_zone_geodataframe()`` is
    written to it as gzip-compressed CSV. The table ``bronze_mitma_{type}``
    is always dropped and reloaded from the cached file; ``geometry`` is
    stored as plain text.

    Args:
        type: Zoning level; used for the ``zones`` argument of ``Zones``,
            the cache directory name and the table name suffix.

    Raises:
        ValueError: If ``Zones.get_zone_geodataframe()`` returns None.
    """
    dataset_path = f'../raw/{type}'
    table_name = f'bronze_mitma_{type}'
    zones_file = f'{dataset_path}/zones.csv.gz'

    os.makedirs(dataset_path, exist_ok=True)

    if not os.path.isfile(zones_file):
        data = Zones(
            version=2,
            zones=type,
            # Download into the same directory the cache file lives in,
            # instead of a machine-specific hard-coded location. The path is
            # kept absolute, as in the original call.
            # NOTE(review): the library's handling of relative paths was not verified.
            output_directory=os.path.abspath(dataset_path),
        )

        dfdata = data.get_zone_geodataframe()
        if dfdata is None:
            raise ValueError("Zones.get_zone_geodataframe() returned None")
        dfdata.to_csv(zones_file, index=False, compression='gzip')

    # Full refresh: the table is recreated from scratch on every call.
    SQL(f"DROP TABLE IF EXISTS {table_name};")
    SQL(f"""
        CREATE TABLE IF NOT EXISTS {table_name}(
            id TEXT,
            name TEXT,
            population DOUBLE,
            geometry TEXT, -- no geometry porque estamos en BRONZE LAYER
            -- Columnas extras añadidas para auditoria. 
            loaded_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
            -- loaded_by TEXT DEFAULT CURRENT_USER,
            source_file TEXT
        );
    """)

    # Read exactly the file the cache check above refers to, so any other
    # *.csv.gz that ends up in the directory cannot leak into the table.
    SQL(f"""
        INSERT INTO {table_name}
        SELECT
            id,
            name,
            CAST(population AS DOUBLE) AS population,
            geometry,
            CURRENT_TIMESTAMP AS loaded_at,
            -- CURRENT_USER AS loaded_by,
            filename AS source_file
        FROM read_csv(
            '{zones_file}',
            filename = true,
            all_varchar = true
        );
    """)

In [15]:
# Rebuild the zones bronze table for each zoning level, in the same order as before.
for zone_type in ("distritos", "municipios", "gau"):
    load_zones(type=zone_type)

Zones already downloaded. Reading the files....
File /Users/bgramaje/workspace/MUCEIM/bigdata/muceim-bigdata_project/raw/municipios/municipios_2.geojson already exists. Loading it...
Zones already downloaded. Reading the files....
File /Users/bgramaje/workspace/MUCEIM/bigdata/muceim-bigdata_project/raw/gau/gaus_2.geojson already exists. Loading it...


In [16]:
# Preview up to five rows of the district zones table.
SQL("SELECT * FROM bronze_mitma_distritos LIMIT 5;")

Unnamed: 0,id,name,population,geometry,loaded_at,source_file
0,01001,Alegría-Dulantzi,2925.0,MULTIPOLYGON (((-2.534282365031893 42.78795118...,2025-11-16 10:22:38.514626,../raw/distritos/zones.csv.gz
1,01002,Amurrio,10307.0,MULTIPOLYGON (((-2.975672535119074 42.93785360...,2025-11-16 10:22:38.514626,../raw/distritos/zones.csv.gz
2,01004_AM,Artziniega agregacion de municipios,3005.0,MULTIPOLYGON (((-3.140059301490131 43.16165242...,2025-11-16 10:22:38.514626,../raw/distritos/zones.csv.gz
3,01009_AM,Asparrena agregacion de municipios,4599.0,POLYGON ((-2.331385516435793 42.81775192629513...,2025-11-16 10:22:38.514626,../raw/distritos/zones.csv.gz
4,01010,Ayala/Aiara,2951.0,POLYGON ((-3.000756145899012 43.06799670583464...,2025-11-16 10:22:38.514626,../raw/distritos/zones.csv.gz


In [17]:
# Preview up to five rows of the municipality zones table.
SQL("SELECT * FROM bronze_mitma_municipios LIMIT 5;")

Unnamed: 0,id,name,population,geometry,loaded_at,source_file
0,01001,Alegría-Dulantzi,2925.0,MULTIPOLYGON (((-2.537148430799974 42.78635536...,2025-11-16 10:22:48.482946,../raw/municipios/zones.csv.gz
1,01002,Amurrio,10307.0,MULTIPOLYGON (((-2.976160717902642 42.93734039...,2025-11-16 10:22:48.482946,../raw/municipios/zones.csv.gz
2,01004_AM,Artziniega agregacion de municipios,3005.0,MULTIPOLYGON (((-3.137768037343801 43.16251541...,2025-11-16 10:22:48.482946,../raw/municipios/zones.csv.gz
3,01009_AM,Asparrena agregacion de municipios,4599.0,POLYGON ((-2.331385516435793 42.81775192629513...,2025-11-16 10:22:48.482946,../raw/municipios/zones.csv.gz
4,01010,Ayala/Aiara,2951.0,POLYGON ((-3.000756145899012 43.06799670583464...,2025-11-16 10:22:48.482946,../raw/municipios/zones.csv.gz


In [18]:
# Preview up to five rows of the GAU zones table.
SQL("SELECT * FROM bronze_mitma_gau LIMIT 5;")

Unnamed: 0,id,name,population,geometry,loaded_at,source_file
0,01001,Alegría-Dulantzi,2925.0,MULTIPOLYGON (((-2.541390555095102 42.78538643...,2025-11-16 10:22:57.754422,../raw/gau/zones.csv.gz
1,01002,Amurrio,10307.0,MULTIPOLYGON (((-2.976906572483943 42.93701187...,2025-11-16 10:22:57.754422,../raw/gau/zones.csv.gz
2,01004_AM,Artziniega agregacion de municipios,3005.0,MULTIPOLYGON (((-3.137673560842534 43.16203825...,2025-11-16 10:22:57.754422,../raw/gau/zones.csv.gz
3,01009_AM,Asparrena agregacion de municipios,4599.0,POLYGON ((-2.331385516435793 42.81775192629513...,2025-11-16 10:22:57.754422,../raw/gau/zones.csv.gz
4,01010,Ayala/Aiara,2951.0,POLYGON ((-3.000756145899012 43.06799670583464...,2025-11-16 10:22:57.754422,../raw/gau/zones.csv.gz


In [38]:
# Close the shared DuckDB connection; SQL() cannot be used after this cell runs.
con.close()