In [1]:
import pandas as pd
import numpy as np
import os
import shapely
from shapely.geometry import Point, Polygon
import geopandas as gpd


In [2]:
# Load the April 2024 traffic readings (semicolon-separated CSV) so they can be inspected below.
# NOTE(review): absolute local Windows path - this cell only runs on a machine with the same folder layout.
df = pd.read_csv( 'D:\\Data projects\\Air pollution Madrid\\data\\traffic data\\04-2024.csv', sep=";")


### Check for anomalies within the dataset
As we need to perform aggregations, let's check whether the dataset contains NaNs, anomalies or other problems. 

In [22]:
def inspect_missing_values(df):
    """Print and return a per-column summary of missing values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        Indexed by the column names of ``df`` with two columns:
        'Missing Values' (number of nulls in that column) and
        'Percentage' (that number as a percentage of the row count).
    """
    null_counts = df.isna().sum()
    summary = pd.DataFrame({
        'Missing Values': null_counts,
        'Percentage': null_counts / len(df) * 100,
    })

    print("Missing Values Summary:")
    print(summary)
    return summary

def inspect_anomalies(df):
    """Print a quick sanity report for ``df``; returns nothing.

    The report has three parts, printed in this order:
    1. ``df.describe(include='all')``.
    2. The number of fully duplicated rows.
    3. For every numeric column that contains negative values, how many
       negatives it has (columns without negatives are not mentioned).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.
    """
    print("\nBasic Statistics:")
    print(df.describe(include='all'))

    duplicate_rows = df.duplicated().sum()
    print(f"\nNumber of duplicate rows: {duplicate_rows}")

    print("\nAnomalies in numerical columns:")
    for col in df.select_dtypes(include=['number']).columns:
        # Count once; a column is only reported when the count is non-zero.
        num_negatives = (df[col] < 0).sum()
        if num_negatives > 0:
            print(f"Column '{col}' has {num_negatives} negative values")
            
# Run both checks on the raw April data; each one prints its report to the cell output.
inspect_missing_values(df)
inspect_anomalies(df)

Missing Values Summary:
                     Missing Values  Percentage
id                                0    0.000000
fecha                             0    0.000000
tipo_elem                         0    0.000000
intensidad                        0    0.000000
ocupacion                     16108    0.123805
carga                             0    0.000000
vmed                           5648    0.043410
error                         25632    0.197006
periodo_integracion               0    0.000000

Basic Statistics:
                  id                fecha tipo_elem    intensidad  \
count   1.301078e+07             13010780  13010780  1.301078e+07   
unique           NaN                 2880         3           NaN   
top              NaN  2024-04-03 12:45:00       URB           NaN   
freq             NaN                 4614  11877375           NaN   
mean    6.645174e+03                  NaN       NaN  3.837453e+02   
std     2.599109e+03                  NaN       NaN  6.108228e+

### Proceed to manipulation and aggregations
The rest of our data are at hourly frequency, so we need to aggregate the 15-minute traffic readings into hourly averages.
NaNs are not an issue in this case: they affect well under 1% of the rows and are skipped when the averages are computed.

In [23]:
# Monthly traffic files to process, in the order they will be concatenated.
# These are raw strings, so each backslash is written once: doubling them
# inside an r'' literal puts two separators into the actual path.
# NOTE(review): absolute local Windows paths - adjust for your machine.
files = [
    r'D:\Data projects\Air pollution Madrid\data\traffic data\05-2024.csv',
    r'D:\Data projects\Air pollution Madrid\data\traffic data\04-2024.csv'
]

def process_file(file):
    """Aggregate one monthly traffic CSV to hourly means per sensor.

    Parameters
    ----------
    file : str or path-like
        Semicolon-separated CSV containing at least the columns 'id',
        'fecha', 'intensidad', 'ocupacion', 'carga' and 'vmed'. Any other
        columns are ignored.

    Returns
    -------
    pandas.DataFrame
        One row per (id, hour) that has at least one reading, sorted by id
        and then hour, with columns:
        id, fecha (start of the hour), intensidad, ocupacion, carga, vmed
        (means of that hour's readings; missing values are skipped),
        MES (month), DIA (day of month) and HORA (hour of day, 1-24).
    """
    value_columns = ['intensidad', 'ocupacion', 'carga', 'vmed']

    # Read only the columns that are actually aggregated, selected by name
    # rather than by position so a change in column order cannot drop the
    # wrong ones.
    df = pd.read_csv(file, sep=";", usecols=['id', 'fecha'] + value_columns)
    df['fecha'] = pd.to_datetime(df['fecha'])

    # Hourly mean per sensor. An explicit Hour offset is used instead of the
    # 'H' string alias, which newer pandas versions deprecate.
    df_grouped = (
        df.groupby(['id', pd.Grouper(key='fecha', freq=pd.offsets.Hour())])[value_columns]
        .mean()
        .reset_index()
    )

    # Calendar parts of the hour each row belongs to.
    df_grouped['MES'] = df_grouped['fecha'].dt.month
    df_grouped['DIA'] = df_grouped['fecha'].dt.day
    df_grouped['HORA'] = df_grouped['fecha'].dt.hour + 1  # 1-indexed: 00:00 -> 1, 23:00 -> 24

    return df_grouped

# Process each file and concatenate the results.
# Rows keep the order of `files`; ignore_index=True renumbers them from 0.
processed_dfs = [process_file(file) for file in files]
combined_df = pd.concat(processed_dfs, ignore_index=True)

print("The resulting dataset:")
combined_df

The resulting dataset:


Unnamed: 0,id,fecha,intensidad,ocupacion,carga,vmed,MES,DIA,HORA
0,1001,2024-05-01 00:00:00,627.0,1.75,0.00,60.5,5,1,1
1,1001,2024-05-01 01:00:00,378.0,0.75,0.00,61.0,5,1,2
2,1001,2024-05-01 02:00:00,231.0,0.75,0.00,53.5,5,1,3
3,1001,2024-05-01 03:00:00,207.0,0.25,0.00,55.5,5,1,4
4,1001,2024-05-01 04:00:00,204.0,0.00,0.00,53.5,5,1,5
...,...,...,...,...,...,...,...,...,...
6697404,11312,2024-04-30 19:00:00,592.5,3.25,14.75,0.0,4,30,20
6697405,11312,2024-04-30 20:00:00,557.0,3.75,14.75,0.0,4,30,21
6697406,11312,2024-04-30 21:00:00,405.0,2.25,10.75,0.0,4,30,22
6697407,11312,2024-04-30 22:00:00,239.5,1.25,6.00,0.0,4,30,23


In [24]:
# Re-run the same checks on the hourly aggregate of both months.
inspect_missing_values(combined_df)
inspect_anomalies(combined_df)

Missing Values Summary:
            Missing Values  Percentage
id                       0    0.000000
fecha                    0    0.000000
intensidad               0    0.000000
ocupacion             1836    0.027414
carga                    0    0.000000
vmed                  2094    0.031266
MES                      0    0.000000
DIA                      0    0.000000
HORA                     0    0.000000

Basic Statistics:
                 id                          fecha    intensidad  \
count  6.697409e+06                        6697409  6.697409e+06   
mean   6.657395e+03  2024-05-01 10:50:31.347078144  3.734534e+02   
min    1.001000e+03            2024-04-01 00:00:00  0.000000e+00   
25%    4.656000e+03            2024-04-16 04:00:00  5.725000e+01   
50%    5.919000e+03            2024-05-01 10:00:00  1.717500e+02   
75%    9.974000e+03            2024-05-16 17:00:00  4.315000e+02   
max    1.131900e+04            2024-05-31 23:00:00  1.650275e+04   
std    2.605936e+03    

### Merge with sensors' locations coordinates

In [25]:
# Attach each traffic sensor's coordinates to the hourly traffic data.
# NOTE(review): absolute local Windows path - adjust for your machine.
file_path = "D:\\Data projects\\Air pollution Madrid\\data\\locations\\traffic_locations_modified.csv"
locations_df = pd.read_csv(file_path, sep=",")


# Make sure both join keys are plain integers so they compare equal.
combined_df['id'] = combined_df['id'].astype(int)
locations_df['ID'] = locations_df['ID'].astype(int)

# Left join keeps every traffic row. validate='many_to_one' makes pandas raise
# a MergeError if a sensor ID appears more than once in locations_df, instead
# of silently duplicating traffic rows.
merged_df = pd.merge(
    combined_df,
    locations_df,
    left_on='id',
    right_on='ID',
    how='left',
    validate='many_to_one',
)

# Traffic rows whose sensor is missing from locations_df come back with a NaN
# 'ID' (and NaN in every other location column). Report them rather than
# letting them pass unnoticed.
unmatched_rows = merged_df['ID'].isna()
if unmatched_rows.any():
    unmatched_sensors = merged_df.loc[unmatched_rows, 'id'].nunique()
    print(f"Warning: {unmatched_rows.sum()} rows from {unmatched_sensors} sensors have no matching location.")

# NOTE(review): in the saved output LATITUD/LONGITUD appear as strings such as
# '404.097.291.910.074', which do not look like decimal degrees - confirm how
# the locations file encodes coordinates before using them.


This is the resulting dataset


Unnamed: 0,id,fecha,intensidad,ocupacion,carga,vmed,MES,DIA,HORA,ID,LATITUD,LONGITUD
0,1001,2024-05-01 00:00:00,627.0,1.75,0.00,60.5,5,1,1,1001.0,404.097.291.910.074,-374.078.577.959.832
1,1001,2024-05-01 01:00:00,378.0,0.75,0.00,61.0,5,1,2,1001.0,404.097.291.910.074,-374.078.577.959.832
2,1001,2024-05-01 02:00:00,231.0,0.75,0.00,53.5,5,1,3,1001.0,404.097.291.910.074,-374.078.577.959.832
3,1001,2024-05-01 03:00:00,207.0,0.25,0.00,55.5,5,1,4,1001.0,404.097.291.910.074,-374.078.577.959.832
4,1001,2024-05-01 04:00:00,204.0,0.00,0.00,53.5,5,1,5,1001.0,404.097.291.910.074,-374.078.577.959.832
...,...,...,...,...,...,...,...,...,...,...,...,...
6697404,11312,2024-04-30 19:00:00,592.5,3.25,14.75,0.0,4,30,20,11312.0,404.254.109.622.568,-361.159.116.685.915
6697405,11312,2024-04-30 20:00:00,557.0,3.75,14.75,0.0,4,30,21,11312.0,404.254.109.622.568,-361.159.116.685.915
6697406,11312,2024-04-30 21:00:00,405.0,2.25,10.75,0.0,4,30,22,11312.0,404.254.109.622.568,-361.159.116.685.915
6697407,11312,2024-04-30 22:00:00,239.5,1.25,6.00,0.0,4,30,23,11312.0,404.254.109.622.568,-361.159.116.685.915


In [26]:
# Drop the join key that duplicates 'id'. errors="ignore" keeps this cell
# re-runnable: on a second run 'ID' is already gone and a plain drop would
# raise KeyError.
merged_df = merged_df.drop(columns="ID", errors="ignore")

# Directory and filename for the geolocated output. The directory is a raw
# string, so each backslash is written once.
# NOTE(review): absolute local Windows path - adjust for your machine.
output_directory = r"D:\Data projects\Air pollution Madrid\data\traffic data"
output_filename = 'traffic_geolocated.csv'
output_file = os.path.join(output_directory, output_filename)

# Save the merged DataFrame to a new CSV file, without the row index.
merged_df.to_csv(output_file, index=False)

print("This is the resulting dataset")
merged_df

This is the resulting dataset


Unnamed: 0,id,fecha,intensidad,ocupacion,carga,vmed,MES,DIA,HORA,LATITUD,LONGITUD
0,1001,2024-05-01 00:00:00,627.0,1.75,0.00,60.5,5,1,1,404.097.291.910.074,-374.078.577.959.832
1,1001,2024-05-01 01:00:00,378.0,0.75,0.00,61.0,5,1,2,404.097.291.910.074,-374.078.577.959.832
2,1001,2024-05-01 02:00:00,231.0,0.75,0.00,53.5,5,1,3,404.097.291.910.074,-374.078.577.959.832
3,1001,2024-05-01 03:00:00,207.0,0.25,0.00,55.5,5,1,4,404.097.291.910.074,-374.078.577.959.832
4,1001,2024-05-01 04:00:00,204.0,0.00,0.00,53.5,5,1,5,404.097.291.910.074,-374.078.577.959.832
...,...,...,...,...,...,...,...,...,...,...,...
6697404,11312,2024-04-30 19:00:00,592.5,3.25,14.75,0.0,4,30,20,404.254.109.622.568,-361.159.116.685.915
6697405,11312,2024-04-30 20:00:00,557.0,3.75,14.75,0.0,4,30,21,404.254.109.622.568,-361.159.116.685.915
6697406,11312,2024-04-30 21:00:00,405.0,2.25,10.75,0.0,4,30,22,404.254.109.622.568,-361.159.116.685.915
6697407,11312,2024-04-30 22:00:00,239.5,1.25,6.00,0.0,4,30,23,404.254.109.622.568,-361.159.116.685.915
