In [1]:
import pandas as pd
from tqdm import tqdm
import polars as pl

In [2]:
#df_traffic = pd.read_csv("../../data/traffic/2024/01-2024.csv", delimiter= ";")

In [3]:
#df_traffic = pd.read_csv("../../data/traffic/2014/01-2014.csv", delimiter= ";")

In [4]:
def process_traffic_data(df_traffic):
    # Convert 'fecha' to datetime and floor to the nearest hour
    df_traffic['fecha'] = pd.to_datetime(df_traffic['fecha'])
    df_traffic['hora'] = df_traffic['fecha'].dt.floor('H')

    # Filter out rows with negative values in specified columns
    columnas_filtrar = ['intensidad', 'ocupacion', 'carga', 'vmed']
    df_traffic_processed = df_traffic[df_traffic[columnas_filtrar].ge(0).all(axis=1)]

    # Define a custom weighted average function
    def weighted_average(group, value_column, weight_column):
        values = group[value_column]
        weights = group[weight_column]
        weighted_avg = (values * weights).sum() / weights.sum() if weights.sum() > 0 else 0
        return weighted_avg

    # Initialize tqdm progress bar
    grouped = df_traffic_processed.groupby(['id', 'hora'])
    result = []

    for name, group in tqdm(grouped, desc="Processing groups"):
        intensidad_sum = group['intensidad'].sum()
        carga_mean = group['carga'].mean()
        ocupacion_weighted = weighted_average(group, 'ocupacion', 'intensidad')
        vmed_weighted = weighted_average(group, 'vmed', 'intensidad')
        
        result.append({
            'id': name[0],
            'hora': name[1],
            'intensidad': intensidad_sum,
            'carga': carga_mean,
            'ocupacion': ocupacion_weighted,
            'vmed': vmed_weighted
        })

    # Convert results back to DataFrame
    df_traffic_processed = pd.DataFrame(result)
    return df_traffic_processed

In [5]:
# # Read the CSV file

# file_path = '../../data/traffic/2014/11-2014.csv'
# file_name = '01-2015.csv'
# try:
#     df = pd.read_csv(file_path, delimiter= ";")
    
#     if 'S' in df.error.unique():
#         print(f"There are errors in file: {file_name}")
#     else:
        
#         df = df.rename(columns = {'identif':'id'})
#         df = process_traffic_data(df)
            
# except Exception as e:
#     print(f"Error processing file {file_name}: {e}")

In [6]:
def corregir_errores(df):
    # Asegurarse de que 'fecha' sea de tipo datetime
    df['fecha'] = pd.to_datetime(df['fecha'])
    
    # Iterar por las filas donde 'error' es 'S'
    for index, row in df[df['error'] == 'S'].iterrows():
        # Buscar la fila 15 minutos antes con el mismo 'idelem'
        fila_anterior = df[
            (df['idelem'] == row['idelem']) & 
            (df['fecha'] == row['fecha'] - pd.Timedelta(minutes=15))
        ]
        
        if not fila_anterior.empty:
            # Copiar los valores de la fila anterior en la fila con error
            for col in ['intensidad', 'ocupacion', 'carga', 'vmed','periodo_integracion']:
                df.at[index, col] = fila_anterior.iloc[0][col]
            
            # Ajustar el error a 'N' después de la corrección
            df.at[index, 'error'] = 'N'
    
    return df

In [None]:
import os
import pandas as pd

# Define the root directory
root_dir = '../../data/raw/traffic'

# Iterate through all year folders
for year in range(2020, 2024):
    year_folder = os.path.join(root_dir, str(year))
    
    # Check if the year folder exists
    if os.path.isdir(year_folder):
        print(f"Processing folder: {year_folder}")
        
        # Iterate through files in the year folder
        for file_name in os.listdir(year_folder):
            if file_name.endswith('.csv'):  # Ensure it's a CSV file
                
                
                file_path = os.path.join(year_folder, file_name)
                
                # Read the CSV file
                try:
                    df = pd.read_csv(file_path, delimiter= ";")
                    
                    if 'S' in df.error.unique():
                        print(f"There are errors in file: {file_name}")
                        print("There are a total of ", len(df[df['error'] == 'S']) , " errors")
                        
                        df = corregir_errores(df)
                        
                    df = df.rename(columns = {'identif':'id'})
                    df = process_traffic_data(df)
                        
                    # Create the new file name
                    new_file_name = f"{file_name[:-4]}_processed.csv"
                    new_file_path = os.path.join(year_folder, new_file_name)
                    
                    new_file_path = new_file_path.replace("raw", "processed")

                    # Save the processed DataFrame
                    df.to_csv(new_file_path, index=False)
                    
                    print(f"Processed and saved: {new_file_path}")
                except Exception as e:
                    print(f"Error processing file {file_name}: {e}")


Processing folder: ../../data/raw/traffic/2020


  df_traffic['hora'] = df_traffic['fecha'].dt.floor('H')
Processing groups: 100%|██████████| 2934619/2934619 [05:17<00:00, 9252.35it/s] 


Processed and saved: ../../data/processed/traffic/2020/01-2020_processed.csv


  df_traffic['hora'] = df_traffic['fecha'].dt.floor('H')
Processing groups: 100%|██████████| 2915148/2915148 [05:15<00:00, 9225.44it/s] 


Processed and saved: ../../data/processed/traffic/2020/10-2020_processed.csv


  df_traffic['hora'] = df_traffic['fecha'].dt.floor('H')
Processing groups: 100%|██████████| 2889274/2889274 [05:03<00:00, 9520.55it/s] 


Processed and saved: ../../data/processed/traffic/2020/11-2020_processed.csv


  df_traffic['hora'] = df_traffic['fecha'].dt.floor('H')
Processing groups: 100%|██████████| 2771414/2771414 [04:47<00:00, 9647.52it/s] 


Processed and saved: ../../data/processed/traffic/2020/07-2020_processed.csv


  df_traffic['hora'] = df_traffic['fecha'].dt.floor('H')
Processing groups: 100%|██████████| 2688488/2688488 [04:42<00:00, 9524.32it/s] 


Processed and saved: ../../data/processed/traffic/2020/06-2020_processed.csv


  df_traffic['hora'] = df_traffic['fecha'].dt.floor('H')
Processing groups: 100%|██████████| 2812572/2812572 [05:09<00:00, 9084.72it/s] 


Processed and saved: ../../data/processed/traffic/2020/09-2020_processed.csv


  df_traffic['hora'] = df_traffic['fecha'].dt.floor('H')
Processing groups: 100%|██████████| 2758068/2758068 [05:02<00:00, 9124.68it/s] 


Processed and saved: ../../data/processed/traffic/2020/08-2020_processed.csv


  df_traffic['hora'] = df_traffic['fecha'].dt.floor('H')
Processing groups: 100%|██████████| 2905393/2905393 [05:24<00:00, 8965.70it/s] 


Processed and saved: ../../data/processed/traffic/2020/03-2020_processed.csv


  df_traffic['hora'] = df_traffic['fecha'].dt.floor('H')
Processing groups: 100%|██████████| 2771056/2771056 [05:09<00:00, 8962.28it/s] 


Processed and saved: ../../data/processed/traffic/2020/02-2020_processed.csv


  df_traffic['hora'] = df_traffic['fecha'].dt.floor('H')
Processing groups: 100%|██████████| 2781249/2781249 [05:08<00:00, 9010.96it/s] 


Processed and saved: ../../data/processed/traffic/2020/04-2020_processed.csv


  df_traffic['hora'] = df_traffic['fecha'].dt.floor('H')
Processing groups: 100%|██████████| 2820425/2820425 [05:14<00:00, 8966.44it/s] 


Processed and saved: ../../data/processed/traffic/2020/05-2020_processed.csv


  df_traffic['hora'] = df_traffic['fecha'].dt.floor('H')
Processing groups: 100%|██████████| 2901171/2901171 [05:26<00:00, 8882.81it/s] 


Processed and saved: ../../data/processed/traffic/2020/12-2020_processed.csv
Processing folder: ../../data/raw/traffic/2021


  df_traffic['hora'] = df_traffic['fecha'].dt.floor('H')
Processing groups: 100%|██████████| 2739298/2739298 [05:06<00:00, 8938.74it/s] 


Processed and saved: ../../data/processed/traffic/2021/01-2021_processed.csv


  df_traffic['hora'] = df_traffic['fecha'].dt.floor('H')
Processing groups: 100%|██████████| 2982970/2982970 [05:34<00:00, 8922.16it/s] 


Processed and saved: ../../data/processed/traffic/2021/11-2021_processed.csv


  df_traffic['hora'] = df_traffic['fecha'].dt.floor('H')
Processing groups: 100%|██████████| 3053260/3053260 [05:45<00:00, 8825.41it/s] 


Processed and saved: ../../data/processed/traffic/2021/10-2021_processed.csv


  df_traffic['hora'] = df_traffic['fecha'].dt.floor('H')
Processing groups: 100%|██████████| 2940629/2940629 [05:29<00:00, 8913.40it/s] 


Processed and saved: ../../data/processed/traffic/2021/06-2021_processed.csv


  df_traffic['hora'] = df_traffic['fecha'].dt.floor('H')
Processing groups: 100%|██████████| 3076848/3076848 [05:47<00:00, 8857.33it/s] 


Processed and saved: ../../data/processed/traffic/2021/07-2021_processed.csv


  df_traffic['hora'] = df_traffic['fecha'].dt.floor('H')
Processing groups: 100%|██████████| 3057160/3057160 [05:44<00:00, 8873.83it/s] 


Processed and saved: ../../data/processed/traffic/2021/08-2021_processed.csv


  df_traffic['hora'] = df_traffic['fecha'].dt.floor('H')
Processing groups: 100%|██████████| 2969261/2969261 [05:36<00:00, 8820.15it/s] 


Processed and saved: ../../data/processed/traffic/2021/09-2021_processed.csv


  df_traffic['hora'] = df_traffic['fecha'].dt.floor('H')
Processing groups: 100%|██████████| 2688636/2688636 [05:03<00:00, 8863.86it/s] 


Processed and saved: ../../data/processed/traffic/2021/02-2021_processed.csv


  df_traffic['hora'] = df_traffic['fecha'].dt.floor('H')
Processing groups: 100%|██████████| 3048199/3048199 [05:48<00:00, 8756.28it/s] 


Processed and saved: ../../data/processed/traffic/2021/03-2021_processed.csv


  df_traffic['hora'] = df_traffic['fecha'].dt.floor('H')
Processing groups: 100%|██████████| 3044741/3044741 [05:45<00:00, 8813.13it/s] 


Processed and saved: ../../data/processed/traffic/2021/05-2021_processed.csv


  df_traffic['hora'] = df_traffic['fecha'].dt.floor('H')
Processing groups: 100%|██████████| 2944647/2944647 [05:35<00:00, 8779.81it/s] 


Processed and saved: ../../data/processed/traffic/2021/04-2021_processed.csv


  df_traffic['hora'] = df_traffic['fecha'].dt.floor('H')
Processing groups:   0%|          | 0/3125231 [00:00<?, ?it/s]