In [1]:
import pandas as pd
import os
import xarray as xr

## Data Reading

In [2]:
df_mapping = pd.read_csv("../../data/processed/mapping/no2_to_traffic_sensor_mapping.csv")

df_air = pd.read_parquet("../../data/processed/air/df_air_quality_and_locations_from_2013.parquet")

Read traffic data

In [3]:
# Define the root directory for processed traffic data
root_dir = '../../data/processed/traffic'

# List to hold DataFrames
dataframes = []

# Iterate through all folders in the root directory
for folder_name in os.listdir(root_dir):
    folder_path = os.path.join(root_dir, folder_name)
    
    # Check if the folder exists and is a directory
    if os.path.isdir(folder_path):
        print(f"Processing folder: {folder_path}")
        
        # Iterate through all files in the folder
        for file_name in os.listdir(folder_path):
            if file_name.endswith('.parquet'):  # Ensure it's a Parquet file
                file_path = os.path.join(folder_path, file_name)
                
                # Read the Parquet file
                try:
                    df = pd.read_parquet(file_path)
                    print(f"Successfully read: {file_path} with {len(df)} rows.")
                    
                    # Append the DataFrame to the list
                    dataframes.append(df)
                    
                except Exception as e:
                    print(f"Error reading file {file_name}: {e}")

# Concatenate all DataFrames into one
if dataframes:  # Check if the list is not empty
    df_traffic = pd.concat(dataframes, ignore_index=True)
    print(f"Combined DataFrame created with {len(df_traffic)} rows.")

Processing folder: ../../data/processed/traffic/2022
Successfully read: ../../data/processed/traffic/2022/12-2022_processed.parquet with 48331 rows.
Successfully read: ../../data/processed/traffic/2022/11-2022_processed.parquet with 46722 rows.
Successfully read: ../../data/processed/traffic/2022/10-2022_processed.parquet with 47576 rows.
Successfully read: ../../data/processed/traffic/2022/01-2022_processed.parquet with 45047 rows.
Successfully read: ../../data/processed/traffic/2022/06-2022_processed.parquet with 42196 rows.
Successfully read: ../../data/processed/traffic/2022/05-2022_processed.parquet with 42735 rows.
Successfully read: ../../data/processed/traffic/2022/08-2022_processed.parquet with 45087 rows.
Successfully read: ../../data/processed/traffic/2022/02-2022_processed.parquet with 40460 rows.
Successfully read: ../../data/processed/traffic/2022/03-2022_processed.parquet with 44372 rows.
Successfully read: ../../data/processed/traffic/2022/09-2022_processed.parquet with

In [4]:
# Define the root directory for processed traffic data
root_dir = '../../data/raw/meteo'

# List to hold DataFrames
dataframes = []

# Iterate through all folders in the root directory
for folder_name in ['2018']:
    folder_path = os.path.join(root_dir, folder_name)
    
    # Check if the folder exists and is a directory
    if os.path.isdir(folder_path):
        print(f"Processing folder: {folder_path}")
        
        # Iterate through all files in the folder
        for file_name in os.listdir(folder_path):
            if file_name.endswith('.grib'):  # Ensure it's a Parquet file
                file_path = os.path.join(folder_path, file_name)
                
                # Read the Parquet file
                try:                    
                    df = xr.open_dataset(file_path, engine='cfgrib',  backend_kwargs={'indexpath': None})
                    print(f"Successfully read: {file_path} with {len(df)} rows.")
                    
                    df = df.to_dataframe().reset_index()  # Reset index if needed
                    df = df[df['d2m'].notna()]
                    
                    # Append the DataFrame to the list
                    dataframes.append(df)
                    
                except Exception as e:
                    print(f"Error reading file {file_name}: {e}")

# Concatenate all DataFrames into one
if dataframes:  # Check if the list is not empty
    df_meteo = pd.concat(dataframes, ignore_index=True)
    print(f"Combined DataFrame created with {len(df_meteo)} rows.")

Processing folder: ../../data/raw/meteo/2018
Successfully read: ../../data/raw/meteo/2018/e99341ff3b7b94cae8e3ecd140fdd177.grib with 8 rows.
Successfully read: ../../data/raw/meteo/2018/5e57992aac2ac56b7162ae01d706101c.grib with 8 rows.
Successfully read: ../../data/raw/meteo/2018/979398cb4826898b688f868f8f6df5b.grib with 8 rows.
Successfully read: ../../data/raw/meteo/2018/6d41aa349ea12967aa10013f34ffc380.grib with 8 rows.
Successfully read: ../../data/raw/meteo/2018/3626a5f630ed0e9abd392f0052295ec8.grib with 8 rows.
Successfully read: ../../data/raw/meteo/2018/53007aea49134b417d72c70d96447ae5.grib with 8 rows.
Successfully read: ../../data/raw/meteo/2018/495908a0213127247b17bf04990180f4.grib with 8 rows.
Successfully read: ../../data/raw/meteo/2018/884cc40e96d15c88c1e35ae19ab85f77.grib with 8 rows.
Successfully read: ../../data/raw/meteo/2018/64d6bf7ab4452798270cf18dc39bc54b.grib with 8 rows.
Successfully read: ../../data/raw/meteo/2018/646c1c43004891a3c55ed0ed2661f750.grib with 8 ro

## Data Cleaning

In [5]:
unnecesary_cols = ['PROVINCIA','MUNICIPIO','ESTACION','MAGNITUD','PUNTO_MUESTREO','CODIGO_CORTO','LOCALIZACION','DIRECCION','COD_VIA','COD_TIPO']

df_air = df_air.drop(columns = unnecesary_cols)
df_air.columns = df_air.columns.str.lower()
df_air = df_air.rename(columns = {'codigo':'id_no2'})

In [6]:
df_air['id_no2'] = df_air['id_no2'].astype(str)
df_mapping['id_trafico'] = df_mapping['id_trafico'].astype(str)
df_mapping['id_no2'] = df_mapping['id_no2'].astype(str)

In [7]:
df_traffic = df_traffic.rename(columns = {'hora': 'fecha'})

In [8]:
df_meteo = df_meteo.rename(columns = {'valid_time':'fecha'})
df_meteo = df_meteo.drop(columns = ['time','step','surface'])

GEt the unique coordinates for meteo

In [12]:
unique_coordinates = df_meteo[['latitude', 'longitude']].drop_duplicates()

In [13]:
unique_coordinates

Unnamed: 0,latitude,longitude
0,40.7,-3.7
1,40.7,-3.6
2,40.6,-3.7
3,40.6,-3.6
4,40.5,-3.7
5,40.5,-3.6
6,40.4,-3.7
7,40.4,-3.6
8,40.3,-3.7
9,40.3,-3.6


In [10]:
df_air

Unnamed: 0,id_no2,year,month,day,hour,no2_value,validacion,fecha,altitud,nom_tipo,longitud,latitud
0,28079004,2013,1,1,1,35.0,V,2013-01-01 01:00:00,637,Urbana tráfico,-3.712257,40.423882
1,28079004,2013,1,1,2,52.0,V,2013-01-01 02:00:00,637,Urbana tráfico,-3.712257,40.423882
2,28079004,2013,1,1,3,35.0,V,2013-01-01 03:00:00,637,Urbana tráfico,-3.712257,40.423882
3,28079004,2013,1,1,4,19.0,V,2013-01-01 04:00:00,637,Urbana tráfico,-3.712257,40.423882
4,28079004,2013,1,1,5,14.0,V,2013-01-01 05:00:00,637,Urbana tráfico,-3.712257,40.423882
...,...,...,...,...,...,...,...,...,...,...,...,...
2490931,28079060,2024,11,30,20,58.0,V,2024-11-30 20:00:00,709,Urbana fondo,-3.689731,40.500548
2490932,28079060,2024,11,30,21,61.0,V,2024-11-30 21:00:00,709,Urbana fondo,-3.689731,40.500548
2490933,28079060,2024,11,30,22,66.0,V,2024-11-30 22:00:00,709,Urbana fondo,-3.689731,40.500548
2490934,28079060,2024,11,30,23,77.0,V,2024-11-30 23:00:00,709,Urbana fondo,-3.689731,40.500548


keep only X coordinate (improve this in the future) Creating ethe mapping... TODO

In [None]:
df_meteo = df_meteo[(df_meteo['latitude'] == 40.7) & (df_meteo['longitude'] == -3.7)]

Joining the data

In [None]:
df = pd.merge(df_air, df_mapping, how = 'left', left_on='id_no2', right_on='id_no2')
df = pd.merge(df, df_traffic, how = 'inner', left_on=['id_trafico','fecha'], right_on=['id_trafico','fecha'])

In [None]:
df = pd.merge(df, df_meteo, how = 'inner', left_on=['fecha'], right_on=['fecha'])