In [10]:
import pandas as pd
import numpy as np
import os
import shapely
from shapely.geometry import Point, Polygon
import geopandas as gpd


### Merge traffic data with sensor locations
Sensor locations change from month to month, so each month's traffic data must first be merged with the locations file for that same month.

In [11]:
# Read the sensor-location workbooks, one per month (April and May 2024).
traffic_locations_april = pd.read_excel(
    io="D:\\Data projects\\Air pollution Madrid\\data\\locations\\traffic_locations_0424.xlsx"
)
traffic_locations_may = pd.read_excel(
    io="D:\\Data projects\\Air pollution Madrid\\data\\locations\\traffic_locations_0524.xlsx"
)

In [12]:
# Read the raw monthly traffic measurements (semicolon-separated CSV files).
traffic_may = pd.read_csv(
    filepath_or_buffer='D:\\Data projects\\Air pollution Madrid\\data\\traffic data\\05-2024.csv',
    sep=";",
)
traffic_april = pd.read_csv(
    filepath_or_buffer='D:\\Data projects\\Air pollution Madrid\\data\\traffic data\\04-2024.csv',
    sep=";",
)



In [17]:
# Attach each month's sensor coordinates to that month's traffic readings.

# Both sides of the join must carry 'id' as int so the keys compare equal.
traffic_may['id'] = traffic_may['id'].astype(int)
traffic_april['id'] = traffic_april['id'].astype(int)
traffic_locations_may['id'] = traffic_locations_may['id'].astype(int)
traffic_locations_april['id'] = traffic_locations_april['id'].astype(int)

# Select the location columns by name instead of by position: a monthly
# workbook with reordered or additional columns would otherwise silently
# feed the wrong fields into the join.
location_columns = ['id', 'longitud', 'latitud']

# Left join: every traffic reading is kept, even when its sensor is missing
# from that month's location workbook (coordinates are then NaN).
merged_df_april = pd.merge(traffic_april, traffic_locations_april[location_columns], on='id', how='left')
merged_df_may = pd.merge(traffic_may, traffic_locations_may[location_columns], on='id', how='left')

### Manipulation and aggregation
All our data are at hourly frequency, so we need to aggregate the 15-minute traffic readings to hourly values by taking their averages.
NaNs are not an issue in this case.

In [14]:
# Inspect the April traffic readings joined with their sensor coordinates.
merged_df_april

Unnamed: 0,id,fecha,tipo_elem,intensidad,ocupacion,carga,vmed,error,periodo_integracion,longitud,latitud
0,1001,2024-04-01 00:00:00,C30,660,1.0,0,58.0,N,5,-3.740786,40.409729
1,1001,2024-04-01 00:15:00,C30,480,1.0,0,61.0,N,5,-3.740786,40.409729
2,1001,2024-04-01 00:30:00,C30,480,1.0,0,61.0,N,5,-3.740786,40.409729
3,1001,2024-04-01 00:45:00,C30,480,1.0,0,61.0,N,5,-3.740786,40.409729
4,1001,2024-04-01 01:00:00,C30,348,1.0,0,53.0,N,5,-3.740786,40.409729
...,...,...,...,...,...,...,...,...,...,...,...
13010775,11312,2024-04-30 22:45:00,URB,132,0.0,4,0.0,N,15,-3.611591,40.425411
13010776,11312,2024-04-30 23:00:00,URB,161,0.0,2,0.0,N,15,-3.611591,40.425411
13010777,11312,2024-04-30 23:15:00,URB,214,1.0,4,0.0,N,15,-3.611591,40.425411
13010778,11312,2024-04-30 23:30:00,URB,228,1.0,6,0.0,N,15,-3.611591,40.425411


In [18]:

# Monthly merged frames (April and May) to be aggregated to hourly frequency
files = [merged_df_april, merged_df_may]

def process_file(df):
    """Aggregate per-sensor traffic readings to hourly values.

    Parameters
    ----------
    df : pandas.DataFrame
        Readings with at least the columns 'id', 'fecha', 'intensidad',
        'ocupacion', 'carga', 'vmed', 'longitud' and 'latitud'. 'fecha' may
        hold strings or datetimes. The frame is left untouched.

    Returns
    -------
    pandas.DataFrame
        One row per sensor 'id' and hour with the hourly mean of
        'intensidad', 'ocupacion', 'carga' and 'vmed', the first 'longitud'
        and 'latitud' seen in that hour, and the calendar columns 'MES',
        'DIA' and 'HORA' (1-24).
    """
    # Work on a derived frame instead of mutating the caller's one: the
    # in-place version consumed the 'fecha' column, so calling the function
    # (or re-running the cell) a second time raised KeyError.
    indexed = df.assign(fecha=pd.to_datetime(df['fecha'])).set_index('fecha')

    # Resample each sensor to hourly frequency and aggregate.
    # Lowercase 'h' is the hourly alias; uppercase 'H' is deprecated since pandas 2.2.
    df_resampled = indexed.groupby('id').resample('h').agg({
        'intensidad': 'mean',
        'ocupacion': 'mean',
        'carga': 'mean',
        'vmed': 'mean',
        'longitud': 'first',
        'latitud': 'first'
    }).reset_index()

    # Add month, day, and hour columns
    df_resampled['MES'] = df_resampled['fecha'].dt.month
    df_resampled['DIA'] = df_resampled['fecha'].dt.day
    df_resampled['HORA'] = df_resampled['fecha'].dt.hour + 1  # Adjust hour to be 1-indexed

    # Reorder columns as specified
    df_resampled = df_resampled[['id', 'fecha', 'MES', 'DIA', 'HORA', 'intensidad', 'ocupacion', 'carga', 'vmed', 'longitud', 'latitud']]

    return df_resampled

# Aggregate every monthly frame to hourly values, then stack them into one table
processed_dfs = list(map(process_file, files))
combined_df = pd.concat(processed_dfs, ignore_index=True)

print("The resulting dataset:", combined_df, sep="\n")

The resulting dataset:
            id               fecha  MES  DIA  HORA  intensidad  ocupacion  \
0         1001 2024-04-01 00:00:00    4    1     1      525.00   1.000000   
1         1001 2024-04-01 01:00:00    4    1     2      267.00   0.750000   
2         1001 2024-04-01 02:00:00    4    1     3      120.00   0.000000   
3         1001 2024-04-01 03:00:00    4    1     4      114.00   0.333333   
4         1001 2024-04-01 04:00:00    4    1     5      198.00   0.250000   
...        ...                 ...  ...  ...   ...         ...        ...   
6815220  11319 2024-05-31 19:00:00    5   31    20      383.00   1.500000   
6815221  11319 2024-05-31 20:00:00    5   31    21      370.25   1.250000   
6815222  11319 2024-05-31 21:00:00    5   31    22      309.25   0.750000   
6815223  11319 2024-05-31 22:00:00    5   31    23      227.00   0.750000   
6815224  11319 2024-05-31 23:00:00    5   31    24      159.00   0.000000   

         carga   vmed  longitud    latitud  
0      

In [16]:
# Display the combined hourly table.
# NOTE(review): the saved output of this cell shows different values than the
# printed result of the processing cell for the same rows (e.g. id 1001 at
# 2024-04-01 00:00) - it looks stale; re-run to refresh.
combined_df

Unnamed: 0,id,fecha,MES,DIA,HORA,intensidad,ocupacion,carga,vmed,longitud,latitud
0,1001,2024-04-01 00:00:00,4,1,1,163.763564,1.853580,7.616172,5.942081,-3.740786,40.409729
1,1001,2024-04-01 01:00:00,4,1,2,84.953758,1.186527,4.355507,5.356144,-3.740786,40.409729
2,1001,2024-04-01 02:00:00,4,1,3,57.523454,0.949455,3.119977,4.803194,-3.740786,40.409729
3,1001,2024-04-01 03:00:00,4,1,4,46.752446,0.805262,2.533251,4.530636,-3.740786,40.409729
4,1001,2024-04-01 04:00:00,4,1,5,45.382768,0.815506,2.421062,4.557522,-3.740786,40.409729
...,...,...,...,...,...,...,...,...,...,...,...
1459,1001,2024-05-31 19:00:00,5,31,20,580.977401,8.959684,27.796007,5.560323,-3.740786,40.409729
1460,1001,2024-05-31 20:00:00,5,31,21,562.596748,8.528901,26.941325,5.597263,-3.740786,40.409729
1461,1001,2024-05-31 21:00:00,5,31,22,513.203120,7.277930,24.272822,5.754752,-3.740786,40.409729
1462,1001,2024-05-31 22:00:00,5,31,23,396.551872,5.327529,18.839993,5.888932,-3.740786,40.409729


### Merge with sensor location coordinates

In [None]:
# Join the hourly traffic table with the coordinates stored in the consolidated locations CSV.
# NOTE(review): combined_df already carries 'longitud'/'latitud' from the monthly
# merges - confirm this second join is still needed.
file_path = "D:\\Data projects\\Air pollution Madrid\\data\\locations\\traffic_locations_modified.csv"
locations_df = pd.read_csv(file_path, sep=",")

# Cast both join keys ('id' on the traffic side, 'ID' on the locations side)
# to int so they compare equal.
combined_df['id'] = combined_df['id'].astype(int)
locations_df['ID'] = locations_df['ID'].astype(int)

# Left join: keep every traffic row, matched or not
merged_df = combined_df.merge(locations_df, how='left', left_on='id', right_on='ID')


This is the resulting dataset


Unnamed: 0,id,fecha,intensidad,ocupacion,carga,vmed,MES,DIA,HORA,ID,LATITUD,LONGITUD
0,1001,2024-05-01 00:00:00,627.0,1.75,0.00,60.5,5,1,1,1001.0,404.097.291.910.074,-374.078.577.959.832
1,1001,2024-05-01 01:00:00,378.0,0.75,0.00,61.0,5,1,2,1001.0,404.097.291.910.074,-374.078.577.959.832
2,1001,2024-05-01 02:00:00,231.0,0.75,0.00,53.5,5,1,3,1001.0,404.097.291.910.074,-374.078.577.959.832
3,1001,2024-05-01 03:00:00,207.0,0.25,0.00,55.5,5,1,4,1001.0,404.097.291.910.074,-374.078.577.959.832
4,1001,2024-05-01 04:00:00,204.0,0.00,0.00,53.5,5,1,5,1001.0,404.097.291.910.074,-374.078.577.959.832
...,...,...,...,...,...,...,...,...,...,...,...,...
6697404,11312,2024-04-30 19:00:00,592.5,3.25,14.75,0.0,4,30,20,11312.0,404.254.109.622.568,-361.159.116.685.915
6697405,11312,2024-04-30 20:00:00,557.0,3.75,14.75,0.0,4,30,21,11312.0,404.254.109.622.568,-361.159.116.685.915
6697406,11312,2024-04-30 21:00:00,405.0,2.25,10.75,0.0,4,30,22,11312.0,404.254.109.622.568,-361.159.116.685.915
6697407,11312,2024-04-30 22:00:00,239.5,1.25,6.00,0.0,4,30,23,11312.0,404.254.109.622.568,-361.159.116.685.915


In [None]:
# Remove the right-hand join key 'ID', which duplicates 'id' after the merge.
# errors="ignore" keeps the cell re-runnable once 'ID' has already been dropped.
merged_df = merged_df.drop(columns="ID", errors="ignore")

# Directory and filename for the geolocated traffic table.
# Plain (non-raw) string on purpose: with an r-prefix the escaped backslashes
# would stay doubled in the resulting path.
output_directory = "D:\\Data projects\\Air pollution Madrid\\data\\traffic data"
output_filename = 'traffic_geolocated.csv'
output_file = os.path.join(output_directory, output_filename)

# Save the merged DataFrame to a new CSV file, without the positional index
merged_df.to_csv(output_file, index=False)

print("This is the resulting dataset")
merged_df

This is the resulting dataset


Unnamed: 0,id,fecha,intensidad,ocupacion,carga,vmed,MES,DIA,HORA,LATITUD,LONGITUD
0,1001,2024-05-01 00:00:00,627.0,1.75,0.00,60.5,5,1,1,404.097.291.910.074,-374.078.577.959.832
1,1001,2024-05-01 01:00:00,378.0,0.75,0.00,61.0,5,1,2,404.097.291.910.074,-374.078.577.959.832
2,1001,2024-05-01 02:00:00,231.0,0.75,0.00,53.5,5,1,3,404.097.291.910.074,-374.078.577.959.832
3,1001,2024-05-01 03:00:00,207.0,0.25,0.00,55.5,5,1,4,404.097.291.910.074,-374.078.577.959.832
4,1001,2024-05-01 04:00:00,204.0,0.00,0.00,53.5,5,1,5,404.097.291.910.074,-374.078.577.959.832
...,...,...,...,...,...,...,...,...,...,...,...
6697404,11312,2024-04-30 19:00:00,592.5,3.25,14.75,0.0,4,30,20,404.254.109.622.568,-361.159.116.685.915
6697405,11312,2024-04-30 20:00:00,557.0,3.75,14.75,0.0,4,30,21,404.254.109.622.568,-361.159.116.685.915
6697406,11312,2024-04-30 21:00:00,405.0,2.25,10.75,0.0,4,30,22,404.254.109.622.568,-361.159.116.685.915
6697407,11312,2024-04-30 22:00:00,239.5,1.25,6.00,0.0,4,30,23,404.254.109.622.568,-361.159.116.685.915
