In [37]:
import pandas as pd
from tqdm import tqdm
import os

In [38]:
df_mapping_no2_to_traffico = pd.read_csv("../../data/processed/mapping/no2_to_traffic_sensor_mapping.csv")

In [39]:
import pandas as pd
from tqdm import tqdm

def process_traffic_data(df_traffic):
    """
    Función para procesar datos de tráfico a nivel horario a partir de datos cada 15 minutos.
    
    Se agrupan los datos por sensor y hora, calculando:
    - **Intensidad**: Promedio de la intensidad (vehículos/hora).
    - **Carga**: Promedio ponderado de la carga, usando la intensidad como peso.
    - **Ocupación**: Promedio ponderado de la ocupación, usando la intensidad como peso.
    - **Velocidad media (vmed)**: Promedio ponderado de la velocidad media, usando la intensidad como peso.

    Parámetros:
        df_traffic (pd.DataFrame): DataFrame con datos de tráfico a nivel de 15 minutos.

    Retorna:
        pd.DataFrame: DataFrame con los datos agregados a nivel horario.
    """

    # Convertir la columna 'fecha' a tipo datetime para manipulación de fechas
    df_traffic['fecha'] = pd.to_datetime(df_traffic['fecha'])

    # Redondear la fecha a la hora más cercana (para agrupar los datos por hora)
    df_traffic['hora'] = df_traffic['fecha'].dt.floor('H')

    # Rellenar valores NaN con -1 para evitar problemas en cálculos posteriores
    df_traffic = df_traffic.fillna(-1)

    # Función para calcular un promedio ponderado
    def weighted_average(group, value_column, weight_column):
        """
        Calcula la media ponderada de una columna usando otra como peso.

        Parámetros:
            group (pd.DataFrame): Grupo de datos a procesar.
            value_column (str): Nombre de la columna cuyo promedio ponderado se calcula.
            weight_column (str): Nombre de la columna que se usa como peso.

        Retorna:
            float: Media ponderada.
        """
        values = group[value_column]
        weights = group[weight_column]
        
        # Si la suma de los pesos es mayor a 0, se calcula la media ponderada, si no, se devuelve 0
        weighted_avg = (values * weights).sum() / weights.sum() if weights.sum() > 0 else 0
        return weighted_avg

    # Agrupar los datos por sensor y hora
    grouped = df_traffic.groupby(['id_trafico', 'hora'])

    # Lista para almacenar los resultados procesados
    result = []

    # Procesar cada grupo de sensor y hora
    for name, group in tqdm(grouped, desc="Processing groups"):
        """
        Para cada grupo (es decir, cada sensor en cada hora):
        - Se calcula la **intensidad** como promedio de la intensidad en los 4 intervalos de 15 minutos.
        - Se calcula la **carga** como un promedio ponderado de los valores de carga usando intensidad como peso.
        - Se calcula la **ocupación** como un promedio ponderado usando la intensidad como peso.
        - Se calcula la **velocidad media (vmed)** como un promedio ponderado usando la intensidad como peso.
        """

        # Intensidad: Como ya está en vehículos/hora, tomamos el promedio, no la suma
        intensidad_mean = group['intensidad'].mean()

        # Carga: Media ponderada con intensidad como peso
        carga_mean = weighted_average(group, 'carga', 'intensidad')

        # Ocupación: Media ponderada con intensidad como peso (no se puede sumar porque es un porcentaje)
        ocupacion_mean = weighted_average(group, 'ocupacion', 'intensidad')

        # Velocidad media: Media ponderada con intensidad como peso
        vmed_weighted = weighted_average(group, 'vmed', 'intensidad')

        # Agregar los resultados procesados a la lista
        result.append({
            'id_trafico': name[0],   # ID del sensor de tráfico
            'hora': name[1],         # Hora agregada
            'intensidad': intensidad_mean,  # Promedio de intensidad (veh/h)
            'carga': carga_mean,           # Promedio ponderado de carga
            'ocupacion': ocupacion_mean,   # Promedio ponderado de ocupación
            'vmed': vmed_weighted          # Promedio ponderado de velocidad media
        })

    # Convertir la lista de resultados en un DataFrame
    df_traffic = pd.DataFrame(result)

    return df_traffic  # Retornar el DataFrame procesado


In [40]:
def corregir_errores(df):
    # Asegurarse de que 'fecha' sea de tipo datetime
    
    # Iterar por las filas donde 'error' es 'S'
    for index, row in df[df['error'] == 'S'].iterrows():
        # Inicializar el contador de minutos
        minutos_atras = 15
        
        try:
            while True:
                # Buscar la fila anterior con el mismo 'id_trafico' y 'error' diferente de 'S'
                fila_anterior = df[
                    (df['id_trafico'] == row['id_trafico']) & 
                    (df['fecha'] == row['fecha'] - pd.Timedelta(minutes=minutos_atras))
                ]
                
                if not fila_anterior.empty and fila_anterior.iloc[0]['error'] != 'S':
                    # Copiar los valores de la fila anterior en la fila con error
                    for col in ['intensidad', 'ocupacion', 'carga', 'vmed', 'periodo_integracion']:
                        df.at[index, col] = fila_anterior.iloc[0][col]
                    
                    # Ajustar el error a 'N' después de la corrección
                    df.at[index, 'error'] = 'N'
                    break  # Salir del bucle si se ha encontrado una fila válida
                
                # Incrementar el contador de minutos para buscar más atrás
                minutos_atras += 15
                
                # Si se han buscado más de un número razonable de filas, salir para evitar bucles infinitos
                if minutos_atras > 180:  # Por ejemplo, buscar hasta 3 horas atrás
                    break
        except:
            df.at[index, col] = -1
        
    return df

In [41]:
df_mapping_no2_to_traffico['id_trafico'] = df_mapping_no2_to_traffico['id_trafico'].astype(int).astype(str)
filtered_traffic_sensors = list(df_mapping_no2_to_traffico.id_trafico.unique())

In [45]:
len(filtered_traffic_sensors)

69

In [46]:
# Define the root directory
root_dir = '../../data/raw/traffic'

# Iterate through all year folders
for year in range(2019, 2025):
    year_folder = os.path.join(root_dir, str(year))
    
    # Check if the year folder exists
    if os.path.isdir(year_folder):
        print(f"Processing folder: {year_folder}")
        
        # Iterate through files in the year folder
        for file_name in os.listdir(year_folder):
            if file_name.endswith('.csv'):  # Ensure it's a CSV file
                
                
                file_path = os.path.join(year_folder, file_name)
                
                try:
                    df = pd.read_csv(file_path, delimiter= ";")
                    
                    df = df.rename(columns = {'hora': 'fecha'})
                    df['fecha'] = pd.to_datetime(df['fecha'])
                        
                    # algunos se llaman id y otros identif
                    # Rename columns if they exist
                    if 'identif' in df.columns:
                        df = df.rename(columns={'identif': 'id_trafico'})

                    if 'id' in df.columns:
                        df = df.rename(columns={'id': 'id_trafico'})
                        
                    df['id_trafico'] = df['id_trafico'].apply(lambda x: str(int(x)) if str(x).isdigit() else str(x))
                    df['id_trafico'] = df['id_trafico'].astype(str)

                    print("len df before filtering ", len(df))
                    print("Unique traffic sensors before filtering:" , df.id_trafico.nunique())

                    df = df[df['id_trafico'].isin(filtered_traffic_sensors)]
                    
                    print("len df after filtering ", len(df))
                    print("Unique traffic sensors after filtering:" , df.id_trafico.nunique())
                    
                    if 'S' in df.error.unique():
                        print(f"There are errors in file: {file_name}")
                        print("There are a total of ", len(df[df['error'] == 'S']) , " errors")
                        
                        df = corregir_errores(df)
                        
                    df = process_traffic_data(df)
                        
                    # Create the new file name
                    new_file_name = f"{file_name[:-4]}_processed.parquet"
                    new_file_path = os.path.join(year_folder, new_file_name)
                    new_file_path = new_file_path.replace("raw", "processed")
                    
                    # Create the processed directory if it does not exist
                    processed_dir = os.path.dirname(new_file_path)
                    os.makedirs(processed_dir, exist_ok=True)

                    # Save the processed DataFrame
                    df.to_parquet(new_file_path, index=False)
                    
                    print(f"Processed and saved: {new_file_path}")
                except Exception as e:
                    print(f"Error processing file {file_name}: {e}")


Processing folder: ../../data/raw/traffic/2019
len df before filtering  11203627
Unique traffic sensors before filtering: 3937
len df after filtering  154550
Unique traffic sensors after filtering: 53


  df_traffic['hora'] = df_traffic['fecha'].dt.floor('H')
Processing groups: 100%|██████████| 38852/38852 [00:05<00:00, 7385.96it/s]


Processed and saved: ../../data/processed/traffic/2019/01-2019_processed.parquet
len df before filtering  11252731
Unique traffic sensors before filtering: 3998
len df after filtering  149099
Unique traffic sensors after filtering: 53


  df_traffic['hora'] = df_traffic['fecha'].dt.floor('H')
Processing groups: 100%|██████████| 37438/37438 [00:05<00:00, 7185.90it/s]


Processed and saved: ../../data/processed/traffic/2019/07-2019_processed.parquet
len df before filtering  10794005
Unique traffic sensors before filtering: 3982
len df after filtering  143596
Unique traffic sensors after filtering: 53


  df_traffic['hora'] = df_traffic['fecha'].dt.floor('H')
Processing groups: 100%|██████████| 36074/36074 [00:05<00:00, 7158.63it/s]


Processed and saved: ../../data/processed/traffic/2019/06-2019_processed.parquet
len df before filtering  11478841
Unique traffic sensors before filtering: 4033
len df after filtering  153008
Unique traffic sensors after filtering: 53


  df_traffic['hora'] = df_traffic['fecha'].dt.floor('H')
Processing groups: 100%|██████████| 38431/38431 [00:05<00:00, 7155.35it/s]


Processed and saved: ../../data/processed/traffic/2019/10-2019_processed.parquet
len df before filtering  11186837
Unique traffic sensors before filtering: 4039
len df after filtering  148975
Unique traffic sensors after filtering: 53


  df_traffic['hora'] = df_traffic['fecha'].dt.floor('H')
Processing groups: 100%|██████████| 37417/37417 [00:05<00:00, 7175.37it/s]


Processed and saved: ../../data/processed/traffic/2019/11-2019_processed.parquet
len df before filtering  11096650
Unique traffic sensors before filtering: 3963
len df after filtering  143357
Unique traffic sensors after filtering: 53


  df_traffic['hora'] = df_traffic['fecha'].dt.floor('H')
Processing groups: 100%|██████████| 36189/36189 [00:04<00:00, 7253.51it/s]


Processed and saved: ../../data/processed/traffic/2019/03-2019_processed.parquet
len df before filtering  10060856
Unique traffic sensors before filtering: 3941
len df after filtering  135482
Unique traffic sensors after filtering: 53


  df_traffic['hora'] = df_traffic['fecha'].dt.floor('H')
Processing groups: 100%|██████████| 34131/34131 [00:04<00:00, 7061.39it/s]


Processed and saved: ../../data/processed/traffic/2019/02-2019_processed.parquet
len df before filtering  11023586
Unique traffic sensors before filtering: 4019
len df after filtering  143316
Unique traffic sensors after filtering: 53


  df_traffic['hora'] = df_traffic['fecha'].dt.floor('H')
Processing groups: 100%|██████████| 35981/35981 [00:05<00:00, 7143.86it/s]


Processed and saved: ../../data/processed/traffic/2019/09-2019_processed.parquet
len df before filtering  11283290
Unique traffic sensors before filtering: 3998
len df after filtering  147430
Unique traffic sensors after filtering: 53


  df_traffic['hora'] = df_traffic['fecha'].dt.floor('H')
Processing groups: 100%|██████████| 37173/37173 [00:05<00:00, 7225.96it/s]


Processed and saved: ../../data/processed/traffic/2019/08-2019_processed.parquet
len df before filtering  11518048
Unique traffic sensors before filtering: 4056
len df after filtering  150672
Unique traffic sensors after filtering: 53


  df_traffic['hora'] = df_traffic['fecha'].dt.floor('H')
Processing groups: 100%|██████████| 37895/37895 [00:05<00:00, 7141.77it/s]


Processed and saved: ../../data/processed/traffic/2019/12-2019_processed.parquet
len df before filtering  10542041
Unique traffic sensors before filtering: 3950
len df after filtering  133826
Unique traffic sensors after filtering: 53


  df_traffic['hora'] = df_traffic['fecha'].dt.floor('H')
Processing groups: 100%|██████████| 33716/33716 [00:04<00:00, 7152.88it/s]


Processed and saved: ../../data/processed/traffic/2019/04-2019_processed.parquet
len df before filtering  11060255
Unique traffic sensors before filtering: 3968


  df_traffic['hora'] = df_traffic['fecha'].dt.floor('H')


len df after filtering  141308
Unique traffic sensors after filtering: 53


Processing groups: 100%|██████████| 35541/35541 [00:04<00:00, 7128.19it/s]


Processed and saved: ../../data/processed/traffic/2019/05-2019_processed.parquet
Processing folder: ../../data/raw/traffic/2020
len df before filtering  11577408
Unique traffic sensors before filtering: 4058
len df after filtering  146849
Unique traffic sensors after filtering: 53


  df_traffic['hora'] = df_traffic['fecha'].dt.floor('H')
Processing groups: 100%|██████████| 37193/37193 [00:05<00:00, 7130.94it/s]


Processed and saved: ../../data/processed/traffic/2020/01-2020_processed.parquet
len df before filtering  11401995
Unique traffic sensors before filtering: 4168
len df after filtering  141972
Unique traffic sensors after filtering: 54


  df_traffic['hora'] = df_traffic['fecha'].dt.floor('H')
Processing groups: 100%|██████████| 36420/36420 [00:04<00:00, 7309.86it/s]


Processed and saved: ../../data/processed/traffic/2020/10-2020_processed.parquet
len df before filtering  11304137
Unique traffic sensors before filtering: 4160
len df after filtering  141456
Unique traffic sensors after filtering: 52


  df_traffic['hora'] = df_traffic['fecha'].dt.floor('H')
Processing groups: 100%|██████████| 36123/36123 [00:05<00:00, 7204.74it/s]


Processed and saved: ../../data/processed/traffic/2020/11-2020_processed.parquet
len df before filtering  10915562
Unique traffic sensors before filtering: 3966
len df after filtering  144106
Unique traffic sensors after filtering: 53


  df_traffic['hora'] = df_traffic['fecha'].dt.floor('H')
Processing groups: 100%|██████████| 36581/36581 [00:04<00:00, 7437.70it/s]


Processed and saved: ../../data/processed/traffic/2020/07-2020_processed.parquet
len df before filtering  10552512
Unique traffic sensors before filtering: 3970
len df after filtering  137696
Unique traffic sensors after filtering: 53


  df_traffic['hora'] = df_traffic['fecha'].dt.floor('H')
Processing groups: 100%|██████████| 34902/34902 [00:04<00:00, 7505.64it/s]


Processed and saved: ../../data/processed/traffic/2020/06-2020_processed.parquet
len df before filtering  11050575
Unique traffic sensors before filtering: 4150


  df_traffic['hora'] = df_traffic['fecha'].dt.floor('H')


len df after filtering  136014
Unique traffic sensors after filtering: 53


Processing groups: 100%|██████████| 34660/34660 [00:04<00:00, 7457.91it/s]


Processed and saved: ../../data/processed/traffic/2020/09-2020_processed.parquet
len df before filtering  10776844
Unique traffic sensors before filtering: 3965
len df after filtering  140806
Unique traffic sensors after filtering: 53


  df_traffic['hora'] = df_traffic['fecha'].dt.floor('H')
Processing groups: 100%|██████████| 36045/36045 [00:04<00:00, 7667.71it/s]


Processed and saved: ../../data/processed/traffic/2020/08-2020_processed.parquet
len df before filtering  11240571
Unique traffic sensors before filtering: 4068
len df after filtering  143232
Unique traffic sensors after filtering: 51


  df_traffic['hora'] = df_traffic['fecha'].dt.floor('H')
Processing groups: 100%|██████████| 36539/36539 [00:05<00:00, 7194.72it/s]


Processed and saved: ../../data/processed/traffic/2020/03-2020_processed.parquet
len df before filtering  10926382
Unique traffic sensors before filtering: 4071
len df after filtering  138173
Unique traffic sensors after filtering: 53


  df_traffic['hora'] = df_traffic['fecha'].dt.floor('H')
Processing groups: 100%|██████████| 35010/35010 [00:04<00:00, 7246.07it/s]


Processed and saved: ../../data/processed/traffic/2020/02-2020_processed.parquet
len df before filtering  10603087
Unique traffic sensors before filtering: 4064
len df after filtering  133937
Unique traffic sensors after filtering: 52


  df_traffic['hora'] = df_traffic['fecha'].dt.floor('H')
Processing groups: 100%|██████████| 34627/34627 [00:04<00:00, 7127.44it/s]


Processed and saved: ../../data/processed/traffic/2020/04-2020_processed.parquet
len df before filtering  10905113
Unique traffic sensors before filtering: 4065


  df_traffic['hora'] = df_traffic['fecha'].dt.floor('H')


len df after filtering  136748
Unique traffic sensors after filtering: 52


Processing groups: 100%|██████████| 34990/34990 [00:04<00:00, 7209.77it/s]


Processed and saved: ../../data/processed/traffic/2020/05-2020_processed.parquet
len df before filtering  11101234
Unique traffic sensors before filtering: 4173
len df after filtering  138899
Unique traffic sensors after filtering: 53


  df_traffic['hora'] = df_traffic['fecha'].dt.floor('H')
Processing groups: 100%|██████████| 36058/36058 [00:05<00:00, 7170.78it/s]


Processed and saved: ../../data/processed/traffic/2020/12-2020_processed.parquet
Processing folder: ../../data/raw/traffic/2021
len df before filtering  10422264
Unique traffic sensors before filtering: 4173
len df after filtering  127515
Unique traffic sensors after filtering: 52


  df_traffic['hora'] = df_traffic['fecha'].dt.floor('H')
Processing groups: 100%|██████████| 33720/33720 [00:04<00:00, 7233.82it/s]


Processed and saved: ../../data/processed/traffic/2021/01-2021_processed.parquet
len df before filtering  11753247
Unique traffic sensors before filtering: 4351
len df after filtering  157909
Unique traffic sensors after filtering: 60


  df_traffic['hora'] = df_traffic['fecha'].dt.floor('H')
Processing groups: 100%|██████████| 40029/40029 [00:05<00:00, 7217.07it/s]


Processed and saved: ../../data/processed/traffic/2021/11-2021_processed.parquet
len df before filtering  12036913
Unique traffic sensors before filtering: 4314
len df after filtering  160691
Unique traffic sensors after filtering: 56


  df_traffic['hora'] = df_traffic['fecha'].dt.floor('H')
Processing groups: 100%|██████████| 40681/40681 [00:05<00:00, 7167.76it/s]


Processed and saved: ../../data/processed/traffic/2021/10-2021_processed.parquet
len df before filtering  11577338
Unique traffic sensors before filtering: 4273


  df_traffic['hora'] = df_traffic['fecha'].dt.floor('H')


len df after filtering  136804
Unique traffic sensors after filtering: 55


Processing groups: 100%|██████████| 34725/34725 [00:04<00:00, 7180.46it/s]


Processed and saved: ../../data/processed/traffic/2021/06-2021_processed.parquet
len df before filtering  12132019
Unique traffic sensors before filtering: 4304
len df after filtering  150906
Unique traffic sensors after filtering: 53


  df_traffic['hora'] = df_traffic['fecha'].dt.floor('H')
Processing groups: 100%|██████████| 38288/38288 [00:05<00:00, 7103.27it/s]


Processed and saved: ../../data/processed/traffic/2021/07-2021_processed.parquet
len df before filtering  11992900
Unique traffic sensors before filtering: 4301
len df after filtering  151452
Unique traffic sensors after filtering: 53


  df_traffic['hora'] = df_traffic['fecha'].dt.floor('H')
Processing groups: 100%|██████████| 38612/38612 [00:05<00:00, 7226.18it/s]


Processed and saved: ../../data/processed/traffic/2021/08-2021_processed.parquet
len df before filtering  11697249
Unique traffic sensors before filtering: 4318
len df after filtering  148204
Unique traffic sensors after filtering: 57


  df_traffic['hora'] = df_traffic['fecha'].dt.floor('H')
Processing groups: 100%|██████████| 37577/37577 [00:05<00:00, 7136.49it/s]


Processed and saved: ../../data/processed/traffic/2021/09-2021_processed.parquet
len df before filtering  10477478
Unique traffic sensors before filtering: 4181


  df_traffic['hora'] = df_traffic['fecha'].dt.floor('H')


len df after filtering  128966
Unique traffic sensors after filtering: 53


Processing groups: 100%|██████████| 32817/32817 [00:04<00:00, 7075.52it/s]


Processed and saved: ../../data/processed/traffic/2021/02-2021_processed.parquet
len df before filtering  11891658
Unique traffic sensors before filtering: 4255


  df_traffic['hora'] = df_traffic['fecha'].dt.floor('H')


len df after filtering  141084
Unique traffic sensors after filtering: 51


Processing groups: 100%|██████████| 35932/35932 [00:05<00:00, 7167.87it/s]


Processed and saved: ../../data/processed/traffic/2021/03-2021_processed.parquet
len df before filtering  11937669
Unique traffic sensors before filtering: 4259
len df after filtering  146058
Unique traffic sensors after filtering: 54


  df_traffic['hora'] = df_traffic['fecha'].dt.floor('H')
Processing groups: 100%|██████████| 37130/37130 [00:05<00:00, 7139.60it/s]


Processed and saved: ../../data/processed/traffic/2021/05-2021_processed.parquet
len df before filtering  11495366
Unique traffic sensors before filtering: 4238
len df after filtering  135922
Unique traffic sensors after filtering: 50


  df_traffic['hora'] = df_traffic['fecha'].dt.floor('H')
Processing groups: 100%|██████████| 34608/34608 [00:04<00:00, 7161.19it/s]


Processed and saved: ../../data/processed/traffic/2021/04-2021_processed.parquet
len df before filtering  12309478
Unique traffic sensors before filtering: 4372
len df after filtering  175074
Unique traffic sensors after filtering: 60


  df_traffic['hora'] = df_traffic['fecha'].dt.floor('H')
Processing groups: 100%|██████████| 44260/44260 [00:06<00:00, 7291.54it/s]


Processed and saved: ../../data/processed/traffic/2021/12-2021_processed.parquet
Processing folder: ../../data/raw/traffic/2022
len df before filtering  12429813
Unique traffic sensors before filtering: 4374
len df after filtering  174980
Unique traffic sensors after filtering: 61


  df_traffic['hora'] = df_traffic['fecha'].dt.floor('H')
Processing groups: 100%|██████████| 44303/44303 [00:06<00:00, 7180.82it/s]


Processed and saved: ../../data/processed/traffic/2022/01-2022_processed.parquet
len df before filtering  12511377
Unique traffic sensors before filtering: 4471
len df after filtering  183165
Unique traffic sensors after filtering: 63


  df_traffic['hora'] = df_traffic['fecha'].dt.floor('H')
Processing groups: 100%|██████████| 46111/46111 [00:06<00:00, 7181.19it/s]


Processed and saved: ../../data/processed/traffic/2022/10-2022_processed.parquet
len df before filtering  12096282
Unique traffic sensors before filtering: 4488
len df after filtering  180201
Unique traffic sensors after filtering: 63


  df_traffic['hora'] = df_traffic['fecha'].dt.floor('H')
Processing groups: 100%|██████████| 45285/45285 [00:06<00:00, 7097.17it/s]


Processed and saved: ../../data/processed/traffic/2022/11-2022_processed.parquet
len df before filtering  11796277
Unique traffic sensors before filtering: 4132
len df after filtering  167934
Unique traffic sensors after filtering: 59


  df_traffic['hora'] = df_traffic['fecha'].dt.floor('H')
Processing groups: 100%|██████████| 42536/42536 [00:05<00:00, 7217.63it/s]


Processed and saved: ../../data/processed/traffic/2022/07-2022_processed.parquet
len df before filtering  11399709
Unique traffic sensors before filtering: 4129
len df after filtering  163110
Unique traffic sensors after filtering: 58


  df_traffic['hora'] = df_traffic['fecha'].dt.floor('H')
Processing groups: 100%|██████████| 41106/41106 [00:05<00:00, 7016.13it/s]


Processed and saved: ../../data/processed/traffic/2022/06-2022_processed.parquet
len df before filtering  11796900
Unique traffic sensors before filtering: 4114
len df after filtering  172400
Unique traffic sensors after filtering: 59


  df_traffic['hora'] = df_traffic['fecha'].dt.floor('H')
Processing groups: 100%|██████████| 43630/43630 [00:06<00:00, 7176.09it/s]


Processed and saved: ../../data/processed/traffic/2022/03-2022_processed.parquet
len df before filtering  10794781
Unique traffic sensors before filtering: 4394
len df after filtering  157322
Unique traffic sensors after filtering: 61


  df_traffic['hora'] = df_traffic['fecha'].dt.floor('H')
Processing groups: 100%|██████████| 39788/39788 [00:05<00:00, 7203.48it/s]


Processed and saved: ../../data/processed/traffic/2022/02-2022_processed.parquet
len df before filtering  12168291
Unique traffic sensors before filtering: 4480
len df after filtering  177779
Unique traffic sensors after filtering: 63


  df_traffic['hora'] = df_traffic['fecha'].dt.floor('H')
Processing groups: 100%|██████████| 44771/44771 [00:06<00:00, 7175.96it/s]


Processed and saved: ../../data/processed/traffic/2022/09-2022_processed.parquet
len df before filtering  11693047
Unique traffic sensors before filtering: 4148
len df after filtering  172267
Unique traffic sensors after filtering: 61


  df_traffic['hora'] = df_traffic['fecha'].dt.floor('H')
Processing groups: 100%|██████████| 43604/43604 [00:06<00:00, 7124.94it/s]


Processed and saved: ../../data/processed/traffic/2022/08-2022_processed.parquet
len df before filtering  11323906
Unique traffic sensors before filtering: 4119
len df after filtering  165587
Unique traffic sensors after filtering: 59


  df_traffic['hora'] = df_traffic['fecha'].dt.floor('H')
Processing groups: 100%|██████████| 41898/41898 [00:05<00:00, 7200.01it/s]


Processed and saved: ../../data/processed/traffic/2022/04-2022_processed.parquet
len df before filtering  11732565
Unique traffic sensors before filtering: 4127
len df after filtering  166427
Unique traffic sensors after filtering: 59


  df_traffic['hora'] = df_traffic['fecha'].dt.floor('H')
Processing groups: 100%|██████████| 41991/41991 [00:05<00:00, 7126.26it/s]


Processed and saved: ../../data/processed/traffic/2022/05-2022_processed.parquet
len df before filtering  12512133
Unique traffic sensors before filtering: 4487
len df after filtering  186549
Unique traffic sensors after filtering: 63


  df_traffic['hora'] = df_traffic['fecha'].dt.floor('H')
Processing groups: 100%|██████████| 46843/46843 [00:06<00:00, 7142.97it/s]


Processed and saved: ../../data/processed/traffic/2022/12-2022_processed.parquet
Processing folder: ../../data/raw/traffic/2023
len df before filtering  12644350
Unique traffic sensors before filtering: 4506
len df after filtering  186179
Unique traffic sensors after filtering: 63


  df_traffic['hora'] = df_traffic['fecha'].dt.floor('H')
Processing groups: 100%|██████████| 46786/46786 [00:06<00:00, 6840.61it/s]


Processed and saved: ../../data/processed/traffic/2023/01-2023_processed.parquet
len df before filtering  12774666
Unique traffic sensors before filtering: 4663
len df after filtering  174835
Unique traffic sensors after filtering: 63


  df_traffic['hora'] = df_traffic['fecha'].dt.floor('H')
Processing groups: 100%|██████████| 44085/44085 [00:06<00:00, 6966.50it/s]


Processed and saved: ../../data/processed/traffic/2023/11-2023_processed.parquet
len df before filtering  12946685
Unique traffic sensors before filtering: 4603
len df after filtering  173620
Unique traffic sensors after filtering: 63


  df_traffic['hora'] = df_traffic['fecha'].dt.floor('H')
Processing groups: 100%|██████████| 43861/43861 [00:06<00:00, 7138.13it/s]


Processed and saved: ../../data/processed/traffic/2023/10-2023_processed.parquet
len df before filtering  12485567
Unique traffic sensors before filtering: 4551
len df after filtering  172789
Unique traffic sensors after filtering: 63


  df_traffic['hora'] = df_traffic['fecha'].dt.floor('H')
Processing groups: 100%|██████████| 43649/43649 [00:06<00:00, 7215.18it/s]


Processed and saved: ../../data/processed/traffic/2023/06-2023_processed.parquet
len df before filtering  12945236
Unique traffic sensors before filtering: 4582
len df after filtering  174097
Unique traffic sensors after filtering: 63


  df_traffic['hora'] = df_traffic['fecha'].dt.floor('H')
Processing groups: 100%|██████████| 44116/44116 [00:06<00:00, 7167.32it/s]


Processed and saved: ../../data/processed/traffic/2023/07-2023_processed.parquet
len df before filtering  11388974
Unique traffic sensors before filtering: 4503
len df after filtering  166254
Unique traffic sensors after filtering: 63


  df_traffic['hora'] = df_traffic['fecha'].dt.floor('H')
Processing groups: 100%|██████████| 42079/42079 [00:06<00:00, 6959.73it/s]


Processed and saved: ../../data/processed/traffic/2023/02-2023_processed.parquet
len df before filtering  12732652
Unique traffic sensors before filtering: 4512
len df after filtering  184992
Unique traffic sensors after filtering: 63


  df_traffic['hora'] = df_traffic['fecha'].dt.floor('H')
Processing groups: 100%|██████████| 46628/46628 [00:06<00:00, 7161.01it/s]


Processed and saved: ../../data/processed/traffic/2023/03-2023_processed.parquet
len df before filtering  12744711
Unique traffic sensors before filtering: 4588
len df after filtering  172628
Unique traffic sensors after filtering: 63


  df_traffic['hora'] = df_traffic['fecha'].dt.floor('H')
Processing groups: 100%|██████████| 43956/43956 [00:06<00:00, 7114.46it/s]


Processed and saved: ../../data/processed/traffic/2023/08-2023_processed.parquet
len df before filtering  12518185
Unique traffic sensors before filtering: 4563
len df after filtering  171061
Unique traffic sensors after filtering: 63


  df_traffic['hora'] = df_traffic['fecha'].dt.floor('H')
Processing groups: 100%|██████████| 43022/43022 [00:05<00:00, 7181.49it/s]


Processed and saved: ../../data/processed/traffic/2023/09-2023_processed.parquet
len df before filtering  12926315
Unique traffic sensors before filtering: 4547
len df after filtering  184044
Unique traffic sensors after filtering: 63


  df_traffic['hora'] = df_traffic['fecha'].dt.floor('H')
Processing groups: 100%|██████████| 46245/46245 [00:06<00:00, 7106.95it/s]


Processed and saved: ../../data/processed/traffic/2023/05-2023_processed.parquet
len df before filtering  12438772
Unique traffic sensors before filtering: 4541
len df after filtering  177990
Unique traffic sensors after filtering: 63


  df_traffic['hora'] = df_traffic['fecha'].dt.floor('H')
Processing groups: 100%|██████████| 44762/44762 [00:06<00:00, 7136.93it/s]


Processed and saved: ../../data/processed/traffic/2023/04-2023_processed.parquet
len df before filtering  13224732
Unique traffic sensors before filtering: 4659
len df after filtering  184660
Unique traffic sensors after filtering: 63


  df_traffic['hora'] = df_traffic['fecha'].dt.floor('H')
Processing groups: 100%|██████████| 46389/46389 [00:06<00:00, 7173.39it/s]


Processed and saved: ../../data/processed/traffic/2023/12-2023_processed.parquet
Processing folder: ../../data/raw/traffic/2024
len df before filtering  12779019
Unique traffic sensors before filtering: 4696
len df after filtering  169677
Unique traffic sensors after filtering: 62


  df_traffic['hora'] = df_traffic['fecha'].dt.floor('H')
Processing groups: 100%|██████████| 42694/42694 [00:05<00:00, 7165.20it/s]


Processed and saved: ../../data/processed/traffic/2024/09_2024_processed.parquet
len df before filtering  13343452
Unique traffic sensors before filtering: 4699
len df after filtering  185251
Unique traffic sensors after filtering: 63


  df_traffic['hora'] = df_traffic['fecha'].dt.floor('H')
Processing groups: 100%|██████████| 46552/46552 [00:06<00:00, 7173.18it/s]


Processed and saved: ../../data/processed/traffic/2024/01-2024_processed.parquet
len df before filtering  12918338
Unique traffic sensors before filtering: 4703
len df after filtering  172121
Unique traffic sensors after filtering: 62


  df_traffic['hora'] = df_traffic['fecha'].dt.floor('H')
Processing groups: 100%|██████████| 43654/43654 [00:06<00:00, 7160.31it/s]


Processed and saved: ../../data/processed/traffic/2024/06-2024_processed.parquet
len df before filtering  13351339
Unique traffic sensors before filtering: 4682
len df after filtering  175967
Unique traffic sensors after filtering: 62


  df_traffic['hora'] = df_traffic['fecha'].dt.floor('H')
Processing groups: 100%|██████████| 44657/44657 [00:06<00:00, 7120.54it/s]


Processed and saved: ../../data/processed/traffic/2024/07-2024_processed.parquet
len df before filtering  12513169
Unique traffic sensors before filtering: 4677
len df after filtering  167586
Unique traffic sensors after filtering: 62


  df_traffic['hora'] = df_traffic['fecha'].dt.floor('H')
Processing groups: 100%|██████████| 42114/42114 [00:05<00:00, 7263.65it/s]


Processed and saved: ../../data/processed/traffic/2024/11-2024_processed.parquet
len df before filtering  12918900
Unique traffic sensors before filtering: 4690
len df after filtering  174960
Unique traffic sensors after filtering: 62


  df_traffic['hora'] = df_traffic['fecha'].dt.floor('H')
Processing groups: 100%|██████████| 43989/43989 [00:06<00:00, 7146.06it/s]


Processed and saved: ../../data/processed/traffic/2024/10-2024_processed.parquet
len df before filtering  12532935
Unique traffic sensors before filtering: 4689
len df after filtering  171777
Unique traffic sensors after filtering: 63


  df_traffic['hora'] = df_traffic['fecha'].dt.floor('H')
Processing groups: 100%|██████████| 43287/43287 [00:06<00:00, 7152.12it/s]


Processed and saved: ../../data/processed/traffic/2024/02-2024_processed.parquet
len df before filtering  13451928
Unique traffic sensors before filtering: 4695
len df after filtering  183999
Unique traffic sensors after filtering: 63


  df_traffic['hora'] = df_traffic['fecha'].dt.floor('H')
Processing groups: 100%|██████████| 46300/46300 [00:06<00:00, 7194.27it/s]


Processed and saved: ../../data/processed/traffic/2024/03-2024_processed.parquet
len df before filtering  13214710
Unique traffic sensors before filtering: 4678
len df after filtering  179025
Unique traffic sensors after filtering: 62


  df_traffic['hora'] = df_traffic['fecha'].dt.floor('H')
Processing groups: 100%|██████████| 45486/45486 [00:06<00:00, 7191.88it/s]


Processed and saved: ../../data/processed/traffic/2024/08-2024_processed.parquet
len df before filtering  13296461
Unique traffic sensors before filtering: 4679
len df after filtering  172847
Unique traffic sensors after filtering: 60


  df_traffic['hora'] = df_traffic['fecha'].dt.floor('H')
Processing groups: 100%|██████████| 43381/43381 [00:06<00:00, 7213.89it/s]


Processed and saved: ../../data/processed/traffic/2024/12-2024_processed.parquet
len df before filtering  13357035
Unique traffic sensors before filtering: 4706
len df after filtering  178987
Unique traffic sensors after filtering: 62


  df_traffic['hora'] = df_traffic['fecha'].dt.floor('H')
Processing groups: 100%|██████████| 45406/45406 [00:06<00:00, 7210.62it/s]


Processed and saved: ../../data/processed/traffic/2024/05-2024_processed.parquet
len df before filtering  13010780
Unique traffic sensors before filtering: 4695
len df after filtering  177288
Unique traffic sensors after filtering: 63


  df_traffic['hora'] = df_traffic['fecha'].dt.floor('H')
Processing groups: 100%|██████████| 44749/44749 [00:06<00:00, 7143.69it/s]


Processed and saved: ../../data/processed/traffic/2024/04-2024_processed.parquet


In [18]:
# df = pd.read_csv('../../data/raw/traffic/2020/01-2020.csv', delimiter= ";")
                    
# df = df.rename(columns = {'hora': 'fecha'})
# df['fecha'] = pd.to_datetime(df['fecha'])

#     # algunos se llaman id y otros identif
# # Rename columns if they exist
# if 'identif' in df.columns:
#     df = df.rename(columns={'identif': 'id_trafico'})

# if 'id' in df.columns:
#     df = df.rename(columns={'id': 'id_trafico'})
    
# df['id_trafico'] = df['id_trafico'].apply(lambda x: str(int(x)) if str(x).isdigit() else str(x))
# df['id_trafico'] = df['id_trafico'].astype(str)

# print(len(df))
# df = df[df['id_trafico'].isin(filtered_traffic_sensors)]
# print(len(df))


# if 'S' in df.error.unique():
#     print(f"There are errors in file: {file_name}")
#     print("There are a total of ", len(df[df['error'] == 'S']) , " errors")
    
#     df = corregir_errores(df)
    
# df = process_traffic_data(df)
    
# # Create the new file name
# new_file_name = f"{file_name[:-4]}_processed.parquet"
# new_file_path = os.path.join(year_folder, new_file_name)
# new_file_path = new_file_path.replace("raw", "processed")

# # Create the processed directory if it does not exist
# processed_dir = os.path.dirname(new_file_path)
# os.makedirs(processed_dir, exist_ok=True)

# # Save the processed DataFrame
# df.to_parquet(new_file_path, index=False)

# print(f"Processed and saved: {new_file_path}")

11577408
147118


  df_traffic['hora'] = df_traffic['fecha'].dt.floor('H')
Processing groups: 100%|██████████| 37202/37202 [00:04<00:00, 9020.19it/s]


NameError: name 'file_name' is not defined

In [36]:
from pyproj import Proj, Transformer

def latlon_to_utm(lat, lon):
    """
    Convierte coordenadas de latitud y longitud a coordenadas UTM (X, Y).
    
    Parámetros:
    - lat: Latitud en grados decimales.
    - lon: Longitud en grados decimales.
    
    Retorna:
    - utm_x: Coordenada X en UTM.
    - utm_y: Coordenada Y en UTM.
    - zona_utm: Número de la zona UTM.
    """
    # Determinar la zona UTM según la longitud
    zona_utm = int((lon + 180) / 6) + 1
    
    # Definir el sistema de referencia UTM correspondiente
    utm_proj = Proj(proj="utm", zone=zona_utm, ellps="WGS84", datum="WGS84", south=False)
    
    # Transformar coordenadas geográficas (lat, lon) a UTM (X, Y)
    utm_x, utm_y = utm_proj(lon, lat)
    
    return round(utm_x,3), round(utm_y,3), zona_utm

# Ejemplo de uso
	

lat, lon = -40.4305187, 3.687254201	
utm_x, utm_y, zona = latlon_to_utm(lat, lon)

print(f"UTM X: {round(utm_x,4)}, UTM Y: {round(utm_y,4)}, Zona: {zona}")


UTM X: 558293.961, UTM Y: -4475769.104, Zona: 31


In [34]:
round(-40.4305187, 4), round(3.68725420,4)

(-40.4305, 3.6873)

In [33]:
round(lat,4), round(lon,4)

(40.4305, -3.6873)

In [31]:
from pyproj import Proj, Transformer

def utm_to_latlon(utm_x, utm_y, zona_utm):
    """
    Convierte coordenadas UTM (X, Y) a latitud y longitud en España (Hemisferio Norte).
    
    Parámetros:
    - utm_x: Coordenada X en UTM.
    - utm_y: Coordenada Y en UTM.
    - zona_utm: Número de la zona UTM (España está en 29, 30 o 31).
    
    Retorna:
    - lat: Latitud en grados decimales.
    - lon: Longitud en grados decimales.
    """
    # Definir el sistema de referencia UTM correspondiente (siempre en el hemisferio norte)
    utm_proj = Proj(proj="utm", zone=zona_utm, ellps="WGS84", datum="WGS84", south=False)
    
    # Invertir la transformación de UTM a lat/lon
    lon, lat = utm_proj(utm_x, utm_y, inverse=True)
    
    return lat, lon

# Ejemplo de uso

utm_x, utm_y, zona = 441705.882339595, 4475769.68733175, 30
lat, lon = utm_to_latlon(utm_x, utm_y, zona)

print(f"Latitud: {lat}, Longitud: {lon}")


Latitud: 40.43052393961246, Longitud: -3.687256103050413


(40.4305, -3.6873)