In [1]:
import pandas as pd
import os
import xarray as xr

## Data Reading

In [2]:
df_mapping = pd.read_csv("../../data/super_processed/4_no2_to_traffic_sensor_mapping.csv")
df_air = pd.read_csv("../../data/super_processed/6_df_air_data_and_locations_reduced.csv")

Read traffic data

In [3]:
# Define the root directory for processed traffic data
root_dir = '../../data/super_processed/5_traffic'

# List to hold DataFrames
dataframes = []

# Iterate through all folders in the root directory
for folder_name in os.listdir(root_dir):
    folder_path = os.path.join(root_dir, folder_name)
    
    # Check if the folder exists and is a directory
    if os.path.isdir(folder_path):
        print(f"Processing folder: {folder_path}")
        
        # Iterate through all files in the folder
        for file_name in os.listdir(folder_path):
            if file_name.endswith('.parquet'):  # Ensure it's a Parquet file
                file_path = os.path.join(folder_path, file_name)
                
                # Read the Parquet file
                try:
                    df = pd.read_parquet(file_path)
                    print(f"Successfully read: {file_path} with {len(df)} rows.")
                    
                    # Append the DataFrame to the list
                    dataframes.append(df)
                    
                except Exception as e:
                    print(f"Error reading file {file_name}: {e}")

# Concatenate all DataFrames into one
if dataframes:  # Check if the list is not empty
    df_traffic = pd.concat(dataframes, ignore_index=True)
    print(f"Combined DataFrame created with {len(df_traffic)} rows.")

Processing folder: ../../data/super_processed/5_traffic/2022
Successfully read: ../../data/super_processed/5_traffic/2022/12-2022_processed.parquet with 48259 rows.
Successfully read: ../../data/super_processed/5_traffic/2022/11-2022_processed.parquet with 46690 rows.
Successfully read: ../../data/super_processed/5_traffic/2022/10-2022_processed.parquet with 47556 rows.
Successfully read: ../../data/super_processed/5_traffic/2022/01-2022_processed.parquet with 44145 rows.
Successfully read: ../../data/super_processed/5_traffic/2022/06-2022_processed.parquet with 42196 rows.
Successfully read: ../../data/super_processed/5_traffic/2022/05-2022_processed.parquet with 42735 rows.
Successfully read: ../../data/super_processed/5_traffic/2022/08-2022_processed.parquet with 45087 rows.
Successfully read: ../../data/super_processed/5_traffic/2022/02-2022_processed.parquet with 40278 rows.
Successfully read: ../../data/super_processed/5_traffic/2022/03-2022_processed.parquet with 44372 rows.
Suc

In [4]:
# Define the root directory for processed traffic data
root_dir = '../../data/raw/meteo'

# List to hold DataFrames
dataframes = []

# Iterate through all folders in the root directory
for folder_name in os.listdir(root_dir):
    folder_path = os.path.join(root_dir, folder_name)

    if folder_name == '2013' or folder_name == '2014' or folder_name == '2015' or folder_name == '2016' or folder_name == '2017':
        continue
    
    # Check if the folder exists and is a directory
    if os.path.isdir(folder_path):
        print(f"Processing folder: {folder_path}")
        
        # Iterate through all files in the folder
        for file_name in os.listdir(folder_path):
            if file_name.endswith('.grib'):  # Ensure it's a GRIB file
                file_path = os.path.join(folder_path, file_name)
                
                # Read the file
                try:                    
                    df = xr.open_dataset(file_path, engine='cfgrib',  backend_kwargs={'indexpath': None})
                    print(f"Successfully read: {file_path} with {len(df)} rows.")
                    
                    df = df.to_dataframe().reset_index()  # Reset index if needed
                    df = df[df['d2m'].notna()]
                    
                    # Append the DataFrame to the list
                    dataframes.append(df)
                    
                except Exception as e:
                    print(f"Error reading file {file_name}: {e}")

# Concatenate all DataFrames into one
if dataframes:  # Check if the list is not empty
    df_meteo = pd.concat(dataframes, ignore_index=True)
    print(f"Combined DataFrame created with {len(df_meteo)} rows.")

Processing folder: ../../data/raw/meteo/2022
Successfully read: ../../data/raw/meteo/2022/4e143763b6ddd90830b1b2e53ae3d7a5.grib with 8 rows.
Successfully read: ../../data/raw/meteo/2022/84bbb1de389f549dc6756798501cdca5.grib with 8 rows.
Successfully read: ../../data/raw/meteo/2022/93003d3f27f9fd461aad3ee6e8f7bce0.grib with 8 rows.
Successfully read: ../../data/raw/meteo/2022/e3aa8ebebab5e5eca9010c191fca712.grib with 8 rows.
Successfully read: ../../data/raw/meteo/2022/41df651812e856531f6592e080a66fce.grib with 8 rows.
Successfully read: ../../data/raw/meteo/2022/d9331ddced57882f0d634e4db7abc227.grib with 8 rows.
Successfully read: ../../data/raw/meteo/2022/b3fcc9fe6bc5bd11659b6ba730d19df4.grib with 8 rows.
Successfully read: ../../data/raw/meteo/2022/773f78fa31108dc1bc9242648ce689bd.grib with 8 rows.
Successfully read: ../../data/raw/meteo/2022/b81359a8caf89dcca96e37ff9c93a920.grib with 8 rows.
Successfully read: ../../data/raw/meteo/2022/17d2a65169a31294a6845abb2e3382f6.grib with 8 ro

## Data Cleaning

In [5]:
df_air['id_no2'] = df_air['id_no2'].astype(str)
df_mapping['id_trafico'] = df_mapping['id_trafico'].astype(str)
df_mapping['id_no2'] = df_mapping['id_no2'].astype(str)

In [6]:
df_traffic = df_traffic.rename(columns = {'hora': 'fecha'})

save traffic_data

In [7]:
df_traffic.to_parquet("7_0_all_traffic.parquet", index = False)

In [8]:
df_meteo = df_meteo.rename(columns = {'valid_time':'fecha'})
df_meteo = df_meteo.drop(columns = ['time','step','surface','number'])

# Unit conversion

In [10]:
def convert_units(df: pd.DataFrame) -> pd.DataFrame:
    """Convierte las variables ERA-5 a unidades legibles y *sin* duplicar factores."""
    df = df.copy()

    # ── Temperaturas (K → °C) ──────────────────────────────
    for col in ('t2m', 'd2m'):
        if col in df:
            df[col] -= 273.15

    # ── Radiación (J m⁻² → kWh m⁻²) ───────────────────────
    # (1 kWh = 3 600 000 J)  ➜ rango diario ≈ 0-24
    for col in ('ssr', 'ssrd'):
        if col in df:
            df[col] /= 3_600_000

    # ── Viento (m s⁻¹ → km h⁻¹) ───────────────────────────
    if {'u10', 'v10'}.issubset(df.columns):
        df['u10'] *= 3.6
        df['v10'] *= 3.6

    # ── Presión (Pa → hPa) ─────────────────────────────────
    if 'sp' in df:
        df['sp'] /= 100

    # ── Precipitación (m → mm) ─────────────────────────────
    if 'tp' in df:
        df['tp'] *= 1_000

    return df

In [11]:
df_meteo = convert_units(df_meteo)

## Feature engineering for metheo

In [12]:
import numpy as np

df_meteo['u10'] = pd.to_numeric(df_meteo['u10'], errors='coerce')
df_meteo['v10'] = pd.to_numeric(df_meteo['v10'], errors='coerce')

# 1. Velocidad del viento
df_meteo['wind_speed'] = np.sqrt(df_meteo['u10']**2 + df_meteo['v10']**2)

# 2. Dirección del viento en grados meteorológicos (0° = norte, 90° = este, etc.)
df_meteo['wind_direction_deg'] = (270 - np.degrees(np.arctan2(df_meteo['v10'], df_meteo['u10']))) % 360

# 3. Representación cíclica de la dirección (para modelos como GAM, XGBoost o redes neuronales)
df_meteo['wind_dir_sin'] = np.sin(2 * np.pi * df_meteo['wind_direction_deg'] / 360)
df_meteo['wind_dir_cos'] = np.cos(2 * np.pi * df_meteo['wind_direction_deg'] / 360)

save meteo data

In [13]:
df_meteo.to_parquet("7_1_all_meteo.parquet", index = False)

vamos a quedarnos solo con las estaciones que nos interesan (de momento):

estacion 1

- latitud: 40.5
- longitud -3.7

TODO: posteriormente podemos sacar datos mas precisos.

In [14]:
df_meteo = df_meteo[(df_meteo['latitude'] == 40.5) & (df_meteo['longitude'] == -3.7)]

get the unique values of the longitude and latitude of the air quality data


In [15]:
df_air_locations = df_air[['longitud','latitud']].drop_duplicates().reset_index(drop = True)
df_meteo_locations = df_meteo[['longitude','latitude']].drop_duplicates().reset_index(drop = True)

In [16]:
# import folium

# # Crear un mapa centrado en la ubicación promedio
# center_lat = (df_air_locations['latitud'].mean() + df_meteo_locations['latitude'].mean()) / 2
# center_lon = (df_air_locations['longitud'].mean() + df_meteo_locations['longitude'].mean()) / 2

# # Crear el mapa base
# m = folium.Map(location=[center_lat, center_lon], zoom_start=11, 
#                tiles='CartoDB positron')

# # Añadir marcadores para todas las estaciones de calidad del aire (azul)
# for idx, row in df_air_locations.iterrows():
#     folium.Marker(
#         location=[row['latitud'], row['longitud']],
#         popup=f"""
#         <b>Estación de aire #{idx}</b><br>
#         Latitud: {row['latitud']:.6f}<br>
#         Longitud: {row['longitud']:.6f}
#         """,
#         icon=folium.Icon(color='blue', icon='info-sign'),
#         tooltip="Estación de calidad del aire"
#     ).add_to(m)

# # Añadir marcadores para todas las estaciones meteorológicas (rojo)
# for idx, row in df_meteo_locations.iterrows():
#     folium.Marker(
#         location=[row['latitude'], row['longitude']],
#         popup=f"""
#         <b>Estación meteorológica #{idx}</b><br>
#         Latitud: {row['latitude']:.6f}<br>
#         Longitud: {row['longitude']:.6f}
#         """,
#         icon=folium.Icon(color='red', icon='cloud'),
#         tooltip="Estación meteorológica"
#     ).add_to(m)

# # Añadir leyenda al mapa
# legend_html = '''
# <div style="position: fixed; 
#      bottom: 50px; right: 50px; width: 220px; height: 90px; 
#      border:2px solid grey; z-index:9999; font-size:14px;
#      background-color:white; padding: 10px;
#      border-radius: 5px;">
#      &nbsp; <b>Leyenda</b> <br>
#      &nbsp; <i class="fa fa-info-sign fa-2x" style="color:blue"></i>&nbsp; Estaciones de aire<br>
#      &nbsp; <i class="fa fa-cloud fa-2x" style="color:red"></i>&nbsp; Estaciones meteorológicas
# </div>
# '''
# m.get_root().html.add_child(folium.Element(legend_html))

# # Mostrar el mapa
# #m

In [17]:
df_meteo = df_meteo.rename(columns = {'longitude':'longitud_meteo','latitude':'latitud_meteo'})

In [18]:
df_meteo

Unnamed: 0,latitud_meteo,longitud_meteo,fecha,d2m,t2m,ssr,ssrd,u10,v10,sp,tp,wind_speed,wind_direction_deg,wind_dir_sin,wind_dir_cos
4,40.5,-3.7,2022-09-01 00:00:00,7.392242,19.716827,5.676253,6.935088,4.514721,0.124805,933.912476,0.000441,4.516446,268.416534,-0.999618,-0.027633
14,40.5,-3.7,2022-09-01 01:00:00,7.449982,19.385712,0.000000,0.000000,3.251459,-1.922498,934.611877,0.000000,3.777298,300.594635,-0.860789,0.508961
24,40.5,-3.7,2022-09-01 02:00:00,7.561432,18.419281,0.000000,0.000000,1.821753,-2.906378,934.409973,0.000000,3.430134,327.920013,-0.531102,0.847308
34,40.5,-3.7,2022-09-01 03:00:00,7.584076,17.629364,0.000000,0.000000,1.583459,-2.935657,934.240601,0.000000,3.335479,331.658081,-0.474732,0.880130
44,40.5,-3.7,2022-09-01 04:00:00,7.665924,16.715912,0.000000,0.000000,1.658386,-3.484424,934.223755,0.000000,3.858944,334.548218,-0.429751,0.902947
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
601874,40.5,-3.7,2018-03-30 19:00:00,-1.196320,5.444977,4.370864,5.249484,12.035083,13.605469,916.927490,2.062368,18.164581,221.495239,-0.662558,-0.749011
601884,40.5,-3.7,2018-03-30 20:00:00,0.749298,3.738922,4.370864,5.249484,13.084826,13.618103,917.066895,3.013182,18.885588,223.855927,-0.692847,-0.721084
601894,40.5,-3.7,2018-03-30 21:00:00,1.829132,2.836578,4.370864,5.249484,12.721289,12.346436,917.453125,3.689831,17.727539,225.856720,-0.717600,-0.696455
601904,40.5,-3.7,2018-03-30 22:00:00,2.279999,3.150299,4.370864,5.249484,12.715027,9.665387,917.693115,3.814471,15.971588,232.759552,-0.796103,-0.605161


In [19]:
df_meteo.to_parquet("7_2_meteo_data_one_station.parquet", index = False)

Joining the data `air_quality`, `traffic` and `meteo`

In [20]:
df_air.rename(columns = {'longitud':'longitud_no2', 'latitud':'latitud_no2'}, inplace = True)
df_air = df_air.drop(columns = ['latitud_no2','longitud_no2'])

In [21]:
df = pd.merge(df_air, df_mapping, how = 'left', left_on='id_no2', right_on='id_no2')

In [22]:
df['fecha'] = pd.to_datetime(df['fecha'])
df_traffic['fecha'] = pd.to_datetime(df_traffic['fecha'])
df_meteo['fecha'] = pd.to_datetime(df_meteo['fecha'])

In [23]:
df = pd.merge(df, df_traffic, how = 'inner', left_on=['id_trafico','fecha'], right_on=['id_trafico','fecha'])

In [24]:
df = pd.merge(df, df_meteo, how = 'inner', left_on=['fecha'], right_on=['fecha'])

In [25]:
df

Unnamed: 0,id_no2,fecha,no2_value,latitud_no2,longitud_no2,id_trafico,distance_m,latitud_trafico,longitud_trafico,intensidad,...,ssr,ssrd,u10,v10,sp,tp,wind_speed,wind_direction_deg,wind_dir_sin,wind_dir_cos
0,28079004,2018-01-01 01:00:00,15.0,40.423882,-3.712257,4284,56.2,40.424183,-3.711724,480.75,...,0.000000,0.000000,7.573919,1.335168,941.770020,0.0,7.690704,260.002319,-0.984815,-0.173608
1,28079004,2018-01-01 01:00:00,15.0,40.423882,-3.712257,3732,68.5,40.423533,-3.712923,65.75,...,0.000000,0.000000,7.573919,1.335168,941.770020,0.0,7.690704,260.002319,-0.984815,-0.173608
2,28079004,2018-01-01 01:00:00,15.0,40.423882,-3.712257,4286,73.1,40.424044,-3.711419,846.25,...,0.000000,0.000000,7.573919,1.335168,941.770020,0.0,7.690704,260.002319,-0.984815,-0.173608
3,28079004,2018-01-01 01:00:00,15.0,40.423882,-3.712257,4285,104.9,40.423156,-3.711466,858.25,...,0.000000,0.000000,7.573919,1.335168,941.770020,0.0,7.690704,260.002319,-0.984815,-0.173608
4,28079004,2018-01-01 01:00:00,15.0,40.423882,-3.712257,4353,155.4,40.425278,-3.712352,862.25,...,0.000000,0.000000,7.573919,1.335168,941.770020,0.0,7.690704,260.002319,-0.984815,-0.173608
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3228477,28079056,2024-12-01 00:00:00,48.0,40.385034,-3.718768,10982,118.3,40.384263,-3.719731,167.50,...,2.180835,2.560146,-1.041010,-5.588470,944.451233,0.0,5.684602,10.552002,0.183128,0.983089
3228478,28079056,2024-12-01 00:00:00,48.0,40.385034,-3.718768,10615,119.2,40.385814,-3.719732,36.50,...,2.180835,2.560146,-1.041010,-5.588470,944.451233,0.0,5.684602,10.552002,0.183128,0.983089
3228479,28079056,2024-12-01 00:00:00,48.0,40.385034,-3.718768,11006,163.0,40.385931,-3.717247,186.00,...,2.180835,2.560146,-1.041010,-5.588470,944.451233,0.0,5.684602,10.552002,0.183128,0.983089
3228480,28079056,2024-12-01 00:00:00,48.0,40.385034,-3.718768,10613,164.0,40.386412,-3.719458,188.00,...,2.180835,2.560146,-1.041010,-5.588470,944.451233,0.0,5.684602,10.552002,0.183128,0.983089


Feature engineering for date columns

In [26]:
# add the feature engineering for the date columns
df['day_of_week'] = df['fecha'].dt.dayofweek
df['day_of_year'] = df['fecha'].dt.dayofyear
df['month'] = df['fecha'].dt.month
df['year'] = df['fecha'].dt.year
df['weekend'] = df['day_of_week'].isin([5,6]).astype(int)
df['season'] = df['month'].apply(lambda x: 0 if x in [12,1,2] else 1 if x in [3,4,5] else 2 if x in [6,7,8] else 3)
df['hour'] = df['fecha'].dt.hour
df['day'] = df['fecha'].dt.day

## Temporal FE (Lags / MA / SUM / ECW)

In [27]:

from __future__ import annotations
import argparse
import pandas as pd
import numpy as np
from pathlib import Path


# ------------------------------------------------------------------ #
# Configuración global
# ------------------------------------------------------------------ #
ID_COL   = "id_no2"    # estación de referencia
TIME_COL = "fecha"     # marca temporal

# Ventanas y variables a procesar
SMA_SPEC = {
    "wind_speed": [3, 6, 24],
    "t2m":        [6, 24],
    "d2m":        [6, 24],
    "sp":         [6, 24, 72],
    "u10":        [6, 24],
    "v10":        [6, 24],
}

EWM_SPEC = {
    "wind_speed": 3,
    "t2m":        6,
    "d2m":        6,
    "sp":         12,
    "u10":        6,
    "v10":        6,
}



SUM_SPEC = {
    "ssr":  [24],       # radiación neta
    "ssrd": [24],       # radiación directa
    "tp":   [6, 24],    # precipitación
}

DIR_SPEC = {
    "wind_dir_sin": [6, 24],
    "wind_dir_cos": [6, 24],
}

# ------------------------------------------------------------------ #
# Funciones de generación
# ------------------------------------------------------------------ #
def add_sma(df: pd.DataFrame, col: str, window: int) -> None:
    """Media móvil simple que incluye el instante t."""
    new_name = f"{col}_ma{window}"
    df[new_name] = (
        df.groupby(ID_COL, sort=False)[col]
          .rolling(window=window, min_periods=1)         # incluye t
          .mean()
          .reset_index(level=0, drop=True)
    )

def add_ewm(df: pd.DataFrame, col: str, halflife: int) -> None:
    """Media móvil exponencial (EWM) con half-life en horas, incluye t."""
    new_name = f"{col}_ewm{halflife}"
    df[new_name] = (
        df.groupby(ID_COL, sort=False)[col]
          .apply(lambda s: s.ewm(halflife=halflife, adjust=False).mean())
          .reset_index(level=0, drop=True)
    )

def add_sum(df: pd.DataFrame, col: str, window: int) -> None:
    """Acumulado (suma) de las últimas `window` horas incluyendo t."""
    new_name = f"{col}_sum{window}"
    df[new_name] = (
        df.groupby(ID_COL, sort=False)[col]
          .rolling(window=window, min_periods=1)
          .sum()
          .reset_index(level=0, drop=True)
    )

# ------------------------------------------------------------------ #
# Pipeline completo
# ------------------------------------------------------------------ #
def build_features(df: pd.DataFrame) -> pd.DataFrame:
    df = df.sort_values([ID_COL, TIME_COL]).copy()

    # SMA
    for col, windows in SMA_SPEC.items():
        for w in windows:
            add_sma(df, col, w)

    # EWM
    for col, hl in EWM_SPEC.items():
        add_ewm(df, col, hl)

    # SUM
    for col, windows in SUM_SPEC.items():
        for w in windows:
            add_sum(df, col, w)

    # Dirección de viento suavizada
    for col, windows in DIR_SPEC.items():
        for w in windows:
            add_sma(df, col, w)    # genera *_ma{w}

    # Reconstrucción del ángulo de viento para cada ventana
    for w in DIR_SPEC["wind_dir_sin"]:
        sin_col = f"wind_dir_sin_ma{w}"
        cos_col = f"wind_dir_cos_ma{w}"
        deg_col = f"wind_dir_deg_ma{w}"
        df[deg_col] = np.degrees(np.arctan2(df[sin_col], df[cos_col])) % 360

    return df


df = build_features(df)

In [28]:
# Ordenar por sensor y por fecha
df = df.sort_values(['id_no2', 'fecha'])

# Generar lags por sensor
for var in ['intensidad', 'ocupacion', 'carga']:
    for lag in [1, 2, 3, 4, 6, 8]:
        df[f'{var}_lag{lag}'] = df.groupby('id_no2')[var].shift(lag)

In [29]:
df = df.dropna()

In [30]:
df.to_parquet("../../data/super_processed/7_3_no2_with_traffic_and_meteo.parquet", index=False)

## Gestionar outliers y valores faltantes

In [31]:
# Ver cuántos id_trafico están asignados a cada id_no2
resumen = df.groupby('id_no2')['id_trafico'].nunique().reset_index()
resumen.columns = ['id_no2', 'num_sensores_trafico']
print("Número de sensores de tráfico asignados a cada sensor NO2:")
print(resumen.sort_values(by='num_sensores_trafico', ascending=False))

# Identificar sensores NO2 sin sensores de tráfico asignados
sin_trafico = resumen[resumen['num_sensores_trafico'] == 0]['id_no2'].tolist()
if sin_trafico:
    print(f"\nSensores NO2 sin sensores de tráfico asignados: {sin_trafico}")

Número de sensores de tráfico asignados a cada sensor NO2:
      id_no2  num_sensores_trafico
12  28079056                    14
0   28079004                    10
2   28079011                     8
7   28079039                     7
5   28079036                     6
1   28079008                     5
4   28079035                     4
10  28079048                     4
6   28079038                     3
3   28079016                     2
8   28079040                     2
9   28079047                     2
11  28079050                     2


In [32]:
# Para cada id_no2, quedarse solo con el id_trafico que tenga mas datos y añadir una nueva columna con el numero de datos, el inicio y el fin de los dato

# Paso 1: Calcular métricas para cada combinación id_no2 y id_trafico
stats_trafico = df.groupby(['id_no2', 'id_trafico']).agg(
    num_registros=('fecha', 'count'),
    fecha_inicio=('fecha', 'min'),
    fecha_fin=('fecha', 'max')
).reset_index()

# Paso 2: Para cada id_no2, encontrar el id_trafico con más datos
mejor_trafico = stats_trafico.sort_values(['id_no2', 'num_registros'], ascending=[True, False])
mejor_trafico = mejor_trafico.groupby('id_no2').first().reset_index()

# Paso 3: Añadir columnas informativas
mejor_trafico['periodo_dias'] = (mejor_trafico['fecha_fin'] - mejor_trafico['fecha_inicio']).dt.days
mejor_trafico['densidad_datos'] = mejor_trafico['num_registros'] / mejor_trafico['periodo_dias'].clip(lower=1)

# Paso 4: Unir con el DataFrame original para mantener solo las filas con mejores sensores
df_filtrado = pd.merge(
    df,
    mejor_trafico[['id_no2', 'id_trafico']],
    on=['id_no2', 'id_trafico'],
    how='inner'
)

# Paso 5: Añadir las métricas calculadas al DataFrame filtrado
df_final = pd.merge(
    df_filtrado,
    mejor_trafico[['id_no2', 'id_trafico', 'num_registros', 'fecha_inicio', 'fecha_fin', 'periodo_dias', 'densidad_datos']],
    on=['id_no2', 'id_trafico'],
    how='left'
)

# Mostrar resumen del resultado
print(f"DataFrame original: {len(df)} filas, {df['id_no2'].nunique()} sensores NO2, {df['id_trafico'].nunique()} sensores tráfico")
print(f"DataFrame filtrado: {len(df_final)} filas, {df_final['id_no2'].nunique()} sensores NO2, {df_final['id_trafico'].nunique()} sensores tráfico")

# Mostrar información sobre los mejores sensores de tráfico seleccionados
print("\nResumen de mejores sensores de tráfico seleccionados:")
resumen = mejor_trafico[['id_no2', 'id_trafico', 'num_registros', 'periodo_dias']].sort_values('num_registros', ascending=False)
print(resumen.head(20))  # Muestra los 10 primeros

# df_final.to_csv('datos_filtrados_mejor_trafico.csv', index=False)

DataFrame original: 3226553 filas, 13 sensores NO2, 69 sensores tráfico
DataFrame filtrado: 736070 filas, 13 sensores NO2, 13 sensores tráfico

Resumen de mejores sensores de tráfico seleccionados:
      id_no2 id_trafico  num_registros  periodo_dias
10  28079048       4461          59054          2525
11  28079050       5465          58948          2525
8   28079040       5783          58918          2525
9   28079047       4129          58478          2525
6   28079038       4472          58165          2525
7   28079039       5422          58162          2525
12  28079056       5084          58107          2525
1   28079008       4022          57941          2525
2   28079011       3911          57577          2525
5   28079036       6116          56791          2525
3   28079016       3791          55980          2525
4   28079035       3731          50516          2525
0   28079004       4284          47433          2525


In [33]:
# filtrar el df original para quedarnos con estos id_trafico y los id_no2 que tengan asignados
df_filtered = df[df['id_trafico'].isin(mejor_trafico['id_trafico'])]
df_filtered = df_filtered[df_filtered['id_no2'].isin(mejor_trafico['id_no2'])]

In [34]:
#df_filtered.to_parquet("no2_with_traffic_and_meteo_one_station_filtered_with_best_trafic_id.parquet", index=False)

In [35]:
df_filtered.to_parquet("7_4_no2_with_traffic_and_1meteo_and_1trafic_id.parquet", index=False)

In [36]:
# Drop all columns from df_meteo except 'fecha'
cols_to_drop = [col for col in df_meteo.columns if col != 'fecha']

In [37]:
df_filtered = df_filtered.drop(columns=cols_to_drop)
df_filtered.to_parquet("7_5_no2_with_1traffic_id.parquet", index=False)

In [38]:
#test = pd.read_parquet("../../data/super_processed/7_5_no2_with_1traffic_id.parquet")

In [None]:
# import pandas as pd
# import matplotlib.pyplot as plt
# import seaborn as sns
# import numpy as np
# from matplotlib.colors import LinearSegmentedColormap

# # Filtrar para el ID específico
# df_sensor = df[df['id_trafico'] == '5465'].copy()

# # Asegurarse que las fechas estén en formato datetime
# fecha_col = 'fecha'  # Ajusta al nombre real de tu columna de fecha
# if not pd.api.types.is_datetime64_any_dtype(df_sensor[fecha_col]):
#     df_sensor[fecha_col] = pd.to_datetime(df_sensor[fecha_col])

# # Crear un índice de todas las horas que deberían existir
# fecha_min = df_sensor[fecha_col].min()
# fecha_max = df_sensor[fecha_col].max()
# todas_horas = pd.date_range(start=fecha_min, end=fecha_max, freq='H')

# # Crear DataFrame con todas las horas
# df_completo = pd.DataFrame(index=todas_horas)
# df_completo.index.name = 'hora'

# # Marcar las horas que existen en los datos originales
# df_sensor_hora = df_sensor.set_index(fecha_col)
# df_completo['tiene_datos'] = df_completo.index.isin(df_sensor_hora.index).astype(int)

# # Calcular estadísticas de completitud
# total_horas = len(todas_horas)
# horas_con_datos = df_completo['tiene_datos'].sum()
# porcentaje_completitud = (horas_con_datos / total_horas) * 100

# print(f"Periodo: {fecha_min} a {fecha_max}")
# print(f"Total de horas en el periodo: {total_horas}")
# print(f"Horas con datos: {horas_con_datos} ({porcentaje_completitud:.2f}%)")
# print(f"Horas sin datos: {total_horas - horas_con_datos} ({100-porcentaje_completitud:.2f}%)")

# # Visualizar huecos temporales por día y hora
# df_completo['fecha'] = df_completo.index.date
# df_completo['hora_dia'] = df_completo.index.hour


# import calplot
# import matplotlib.pyplot as plt

# # Preparar datos para el calendario
# df_completo['fecha'] = pd.to_datetime(df_completo['fecha'])
# datos_diarios = df_completo.groupby('fecha')['tiene_datos'].sum()
# datos_diarios = datos_diarios / 24 * 100  # Convertir a porcentaje de completitud

# # Crear visualización de calendario
# plt.figure(figsize=(16, 10))
# calplot.calplot(datos_diarios, cmap='YlGn', 
#                fillcolor='whitesmoke',
#                vmin=0, vmax=100, 
#                suptitle=f'Disponibilidad diaria de datos (%) - Sensor {10885}')
# plt.tight_layout()
# plt.show()
