In [1]:
import sys
sys.path.append("../../..")

import math
from datetime import date, timedelta
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from copy import deepcopy

# import custom modules
from modules import processing_module as processing
from modules import interpolation_module as interp

2025-03-02 15:28:41.726598: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-03-02 15:28:41.727798: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2025-03-02 15:28:41.749746: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2025-03-02 15:28:41.750209: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
# map land cover values to reduced land cover classes
def map_land_cat(value):
    """Reduce a detailed land-cover code to its two-character class.

    The value is truncated to an integer, the first two characters of its
    decimal representation are kept as the class, and class 13 is folded
    into class 12.

    Args:
        value: Land-cover code; anything accepted by ``int()``.

    Returns:
        int: The reduced land-cover class.
    """
    reduced_class = int(str(int(value))[:2])
    # Class 13 is merged into class 12.
    return 12 if reduced_class == 13 else reduced_class

In [4]:
# Time window shared by every time-series dataset.
start_date = date(2010, 1, 1)
end_date = date(2023, 12, 31)

# Date formats declared in the per-dataset options.
_DAY_FORMAT = "%Y-%m-%d"
_TIMESTAMP_FORMAT = "%Y-%m-%d %H:%M:%S"


def _timeseries_options(date_format, aggregation="mean", interpolation="NN",
                        remove_outliers=False, has_value_column=True):
    """Build the option dict for one time-series dataset aggregated per day.

    Args:
        date_format: strftime-style format stored under "date_format".
        aggregation: Value stored under "aggregation_function".
        interpolation: Value stored under "interpolation".
        remove_outliers: When True, "remove_outliers": True is added.
        has_value_column: When False, the 'value' column is omitted from
            both "columns" and "value_columns".

    Returns:
        dict: A freshly built option dict (no lists are shared between calls).
    """
    columns = ['lat', 'lng', 'date']
    value_columns = []
    if has_value_column:
        columns.append('value')
        value_columns.append("value")

    dataset_options = {
        "columns": columns,
        "type": "timeseries",
        "date_format": date_format,
        "value_columns": value_columns,
        "start_date": start_date,
        "end_date": end_date,
        "frequency": "day",
        "aggregation_function": aggregation,
    }
    # The key is present only when outlier removal is enabled;
    # "remove_outliers_window" (e.g. 30) is left unset for every dataset.
    if remove_outliers:
        dataset_options["remove_outliers"] = True
    dataset_options["interpolation"] = interpolation
    return dataset_options


def _odc_options(encoding_mapping=None):
    """Build the option dict for one ODC dataset.

    Args:
        encoding_mapping: Optional dict or callable stored under
            "encoding_mapping"; "encode" is True exactly when it is given.

    Returns:
        dict: A freshly built option dict.
    """
    dataset_options = {
        "type": "odc",
        "dataset_epsg": "epsg:32632",
        "encode": encoding_mapping is not None,
    }
    if encoding_mapping is not None:
        dataset_options["encoding_mapping"] = encoding_mapping
    dataset_options["interpolation"] = "NN"
    return dataset_options


options = {
    # Pollutants
    "pm10": _timeseries_options(_DAY_FORMAT),
    "pm25": _timeseries_options(_DAY_FORMAT),
    "no2": _timeseries_options(_TIMESTAMP_FORMAT),
    "so2": _timeseries_options(_TIMESTAMP_FORMAT),
    "o3": _timeseries_options(_TIMESTAMP_FORMAT),

    # Meteorological variables
    "humidity": _timeseries_options(_DAY_FORMAT, remove_outliers=True),
    "global_radiation": _timeseries_options(_DAY_FORMAT, remove_outliers=True),
    # The winds dataset declares no 'value' column.
    "winds": _timeseries_options(_DAY_FORMAT, has_value_column=False),
    "temperature": _timeseries_options(_TIMESTAMP_FORMAT, remove_outliers=True),
    # Precipitation is the only series summed per day and interpolated with IDW.
    "precipitation": _timeseries_options(
        _TIMESTAMP_FORMAT,
        aggregation="sum",
        interpolation="IDW",
        remove_outliers=True,
    ),

    # Categorical ODC layers, re-encoded into reduced classes
    "geologia": _odc_options({
        101: 0,
        201: 0,
        301: 0,
        205: 1,
        206: 2,
        76: 3,
        207: 4,
        8: 5,
        81: 6,
    }),
    'dusaf15': _odc_options(map_land_cat),
}

# ODC layers used without any re-encoding.
odc_datasets = [
    'dtm_milan', 'aspect', 'hillshade', 'ndvi_2019',
    'plan_curvature', 'profile_curvature', 'water_distance', 'slope',
    'spi', 'tri', 'twi', 'densita_popolazione', 'main_road_distance', 'building_height',
]
for d_name in odc_datasets:
    options[d_name] = _odc_options()

In [6]:
# Notebook-wide processor instance: the following cells register every dataset
# on it and then use it to merge and save the per-pollutant datasets.
processor = processing.HarmoniaProcessor()

In [7]:
# Raw CSV locations and the date formats passed to import_df.
meteo_data_dir = '../data/milano_meteo_data'
pollutant_data_dir = "../data/mcm_pollutant_data"
daily_date_format = "%Y-%m-%d"
hourly_date_format = "%Y-%m-%dT%H:%M:%S"

# Meteo data (imported without an explicit date format).
temperature = processor.import_df(f'{meteo_data_dir}/temperature.csv')
precipitation = processor.import_df(f'{meteo_data_dir}/precipitation.csv')
humidity = processor.import_df(f'{meteo_data_dir}/humidity.csv')
wind_velocity = processor.import_df(f'{meteo_data_dir}/wind_velocity.csv')
wind_direction = processor.import_df(f'{meteo_data_dir}/wind_direction.csv')
global_radiation = processor.import_df(f'{meteo_data_dir}/radiation.csv')
# NOTE(review): hydrometric_level is loaded but not referenced by any cell
# visible in this notebook excerpt -- confirm whether it is still needed.
hydrometric_level = processor.import_df(f'{meteo_data_dir}/hydrometric_level.csv')

# Pollutants: PM files are daily, gas files are hourly with "T"-separated timestamps.
pm10_timeseries = processor.import_df(
    f"{pollutant_data_dir}/pm10_daily.csv", date_format=daily_date_format
)
no2_timeseries = processor.import_df(
    f"{pollutant_data_dir}/no2_hourly.csv", date_format=hourly_date_format
)
so2_timeseries = processor.import_df(
    f"{pollutant_data_dir}/so2_hourly.csv", date_format=hourly_date_format
)
o3_timeseries = processor.import_df(
    f"{pollutant_data_dir}/o3_hourly.csv", date_format=hourly_date_format
)
pm25_timeseries = processor.import_df(
    f"{pollutant_data_dir}/pm25_daily.csv", date_format=daily_date_format
)


In [8]:
# Pollutants: each frame is registered under the key it has in `options`.
pollutant_frames = {
    "pm10": pm10_timeseries,
    "pm25": pm25_timeseries,
    "o3": o3_timeseries,
    "no2": no2_timeseries,
    "so2": so2_timeseries,
}
for dataset_name, dataset_frame in pollutant_frames.items():
    processor.add_dataset(dataset_name, dataset_frame, options[dataset_name])

# ODC datasets: the two encoded layers first, then the plain layers.
# Example of datacube config file:
# datacube_config_path = "/home/user/datacube.conf"
# NOTE: machine-specific absolute path; adjust it to the local datacube setup.
datacube_config_path = "/home/user/ODC_harmonia/datacube.conf"
for d_name in ["geologia", "dusaf15", *odc_datasets]:
    processor.add_odc_dataset(d_name, options[d_name], config=datacube_config_path)

# Meteo: winds is built from the velocity and direction frames, both read
# from their "value" column.
for dataset_name, dataset_frame in (
    ("humidity", humidity),
    ("global_radiation", global_radiation),
):
    processor.add_dataset(dataset_name, dataset_frame, options[dataset_name])

processor.add_winds_dataset(
    "winds", wind_velocity, wind_direction, "value", "value", options['winds']
)

for dataset_name, dataset_frame in (
    ("temperature", temperature),
    ("precipitation", precipitation),
):
    processor.add_dataset(dataset_name, dataset_frame, options[dataset_name])


pm10
Aggregated daily.
Dataset pm10 added to processor
pm25
Aggregated daily.
Dataset pm25 added to processor
o3
Aggregated daily.
Dataset o3 added to processor
no2
Aggregated daily.
Dataset no2 added to processor
so2
Aggregated daily.
Dataset so2 added to processor
adding geologia
Added geologia
adding dusaf15


  from pkg_resources import iter_entry_points


Added dusaf15
adding dtm_milan
Added dtm_milan
adding aspect
Added aspect
adding hillshade
Added hillshade
adding ndvi_2019
Added ndvi_2019
adding plan_curvature
Added plan_curvature
adding profile_curvature
Added profile_curvature
adding water_distance
Added water_distance
adding slope
Added slope
adding spi
Added spi
adding tri
Added tri
adding twi
Added twi
adding densita_popolazione
Added densita_popolazione
adding main_road_distance
Added main_road_distance
adding building_height
Added building_height
humidity
Aggregated daily.
Dataset humidity added to processor
global_radiation
Aggregated daily.
Dataset global_radiation added to processor
Building wind sectors
Adding winds dataset
winds
Aggregated daily.
Dataset winds added to processor
temperature
Aggregated daily.
Dataset temperature added to processor
precipitation
Aggregated daily.
Dataset precipitation added to processor


In [9]:
# Build the location grid from the shapefile.
milano_shapefile = '../data/milano_final_shapefile/milano_metro.shp'
# Grid spacing along x and y plus the shapefile EPSG code.
# NOTE(review): the spacing is presumably in metres (EPSG:32632) -- confirm.
grid_settings = {"xdelta": 1000, "ydelta": 1000, "shapefile_epsg": 32632}
grid = interp.create_grid_from_shapefile(milano_shapefile, **grid_settings)

# Dataset names grouped by role; they are concatenated when merging.
meteo_subset = [
    "humidity",
    "global_radiation",
    "temperature",
    "precipitation",
    "winds",
]

# Every ODC layer, including the two encoded ones (dusaf15, geologia).
odc = [
    'dtm_milan',
    'aspect',
    'hillshade',
    'ndvi_2019',
    'dusaf15',
    'plan_curvature',
    'profile_curvature',
    'water_distance',
    'slope',
    'spi',
    'tri',
    'twi',
    'densita_popolazione',
    'main_road_distance',
    'building_height',
    'geologia',
]

# One merged dataset is produced per pollutant.
pollutant_subset = ['pm10', 'pm25', 'no2', 'so2', 'o3']


The grid of the shapefile


In [10]:
for pollutant in pollutant_subset:
    # Each merge combines one pollutant with all meteo and ODC datasets.
    merge_subset = [pollutant, *meteo_subset, *odc]
    output_dir = f'../harmonia_processor/{pollutant}'

    print(f"--------------- {pollutant} --------------------")

    # Grid dataset (read later to generate the prediction data): merged at
    # the grid locations with interpolate=False. A copy of the grid is passed
    # so the shared `grid` object is not handed to the processor directly.
    print("gridded dataset")
    processor.merge_datasets(subset=merge_subset, interpolate=False, locations=grid.copy())
    processor.save_merged_dataset(f'{output_dir}/grid_dataset.csv')

    # Stations dataset (read later for training/validation and prediction):
    # merged without explicit locations and with interpolate=True.
    processor.merge_datasets(subset=merge_subset, interpolate=True)
    processor.save_merged_dataset(f'{output_dir}/stations_dataset.csv')


--------------- pm10 --------------------
gridded dataset
merging humidity
merging global_radiation
merging temperature
merging precipitation
merging winds
processing for specified locations
Interpolating for 5113 dates
zero-size array to reduction operation maximum which has no identity
Error for date 2010-01-08T00:00:00.000000000 for precipitation
float division by zero
Error for date 2010-04-04T00:00:00.000000000 for precipitation
Date #100
Date #200
index 0 is out of bounds for axis 0 with size 0
Error for date 2010-07-24T00:00:00.000000000 for N
index 0 is out of bounds for axis 0 with size 0
Error for date 2010-07-24T00:00:00.000000000 for NE
index 0 is out of bounds for axis 0 with size 0
Error for date 2010-07-24T00:00:00.000000000 for E
index 0 is out of bounds for axis 0 with size 0
Error for date 2010-07-24T00:00:00.000000000 for SE
index 0 is out of bounds for axis 0 with size 0
Error for date 2010-07-24T00:00:00.000000000 for S
index 0 is out of bounds for axis 0 with size

  in_crs_string = _prepare_from_proj_string(in_crs_string)


Sampling dtm_milan
Sampling aspect
Sampling hillshade
Sampling ndvi_2019
Sampling dusaf15
Sampling plan_curvature
Sampling profile_curvature
Sampling water_distance
Sampling slope
Sampling spi
Sampling tri
Sampling twi
Sampling densita_popolazione
Sampling main_road_distance
Sampling building_height
Sampling geologia
merging ODC datasets
datasets merged!
merged dataset saved to ../harmonia_processor/pm10/grid_dataset.csv as CSV
merging humidity
merging global_radiation
merging temperature
merging precipitation
merging winds
Sampling dtm_milan


  in_crs_string = _prepare_from_proj_string(in_crs_string)


Sampling aspect
Sampling hillshade
Sampling ndvi_2019
Sampling dusaf15
Sampling plan_curvature
Sampling profile_curvature
Sampling water_distance
Sampling slope
Sampling spi
Sampling tri
Sampling twi
Sampling densita_popolazione
Sampling main_road_distance
Sampling building_height
Sampling geologia
merging ODC datasets
datasets merged!
Starting interpolation of missing values
------------------------- Interpolating pm10 -------------------------
2010-01-01 00:00:00 2023-12-31 00:00:00
------------------------- Interpolating humidity -------------------------
2010-01-01 00:00:00 2023-12-31 00:00:00
------------------------- Interpolating global_radiation -------------------------
2010-01-01 00:00:00 2023-12-31 00:00:00
------------------------- Interpolating temperature -------------------------
2010-01-01 00:00:00 2023-12-31 00:00:00
------------------------- Interpolating precipitation -------------------------
2010-01-01 00:00:00 2023-12-31 00:00:00
error in 2010-01-08 00:00:00
error

  in_crs_string = _prepare_from_proj_string(in_crs_string)


Sampling dtm_milan
Sampling aspect
Sampling hillshade
Sampling ndvi_2019
Sampling dusaf15
Sampling plan_curvature
Sampling profile_curvature
Sampling water_distance
Sampling slope
Sampling spi
Sampling tri
Sampling twi
Sampling densita_popolazione
Sampling main_road_distance
Sampling building_height
Sampling geologia
merging ODC datasets
datasets merged!
merged dataset saved to ../harmonia_processor/pm25/grid_dataset.csv as CSV
merging humidity
merging global_radiation
merging temperature
merging precipitation
merging winds
Sampling dtm_milan


  in_crs_string = _prepare_from_proj_string(in_crs_string)


Sampling aspect
Sampling hillshade
Sampling ndvi_2019
Sampling dusaf15
Sampling plan_curvature
Sampling profile_curvature
Sampling water_distance
Sampling slope
Sampling spi
Sampling tri
Sampling twi
Sampling densita_popolazione
Sampling main_road_distance
Sampling building_height
Sampling geologia
merging ODC datasets
datasets merged!
Starting interpolation of missing values
------------------------- Interpolating pm25 -------------------------
2010-01-01 00:00:00 2023-12-31 00:00:00
error in 2010-01-22 00:00:00
error in 2010-02-19 00:00:00
error in 2010-02-20 00:00:00
error in 2010-02-21 00:00:00
error in 2010-02-22 00:00:00
error in 2010-02-23 00:00:00
error in 2010-07-01 00:00:00
error in 2010-07-02 00:00:00
error in 2010-07-18 00:00:00
error in 2010-07-19 00:00:00
error in 2010-07-20 00:00:00
error in 2010-11-11 00:00:00
error in 2010-11-12 00:00:00
error in 2010-12-16 00:00:00
error in 2010-12-17 00:00:00
error in 2010-12-18 00:00:00
error in 2010-12-19 00:00:00
error in 2010-12-

  in_crs_string = _prepare_from_proj_string(in_crs_string)


Sampling dtm_milan
Sampling aspect
Sampling hillshade
Sampling ndvi_2019
Sampling dusaf15
Sampling plan_curvature
Sampling profile_curvature
Sampling water_distance
Sampling slope
Sampling spi
Sampling tri
Sampling twi
Sampling densita_popolazione
Sampling main_road_distance
Sampling building_height
Sampling geologia
merging ODC datasets
datasets merged!
merged dataset saved to ../harmonia_processor/no2/grid_dataset.csv as CSV
merging humidity
merging global_radiation
merging temperature
merging precipitation
merging winds
Sampling dtm_milan


  in_crs_string = _prepare_from_proj_string(in_crs_string)


Sampling aspect
Sampling hillshade
Sampling ndvi_2019
Sampling dusaf15
Sampling plan_curvature
Sampling profile_curvature
Sampling water_distance
Sampling slope
Sampling spi
Sampling tri
Sampling twi
Sampling densita_popolazione
Sampling main_road_distance
Sampling building_height
Sampling geologia
merging ODC datasets
datasets merged!
Starting interpolation of missing values
------------------------- Interpolating no2 -------------------------
2010-01-01 00:00:00 2023-12-31 00:00:00
------------------------- Interpolating humidity -------------------------
2010-01-01 00:00:00 2023-12-31 00:00:00
------------------------- Interpolating global_radiation -------------------------
2010-01-01 00:00:00 2023-12-31 00:00:00
------------------------- Interpolating temperature -------------------------
2010-01-01 00:00:00 2023-12-31 00:00:00
------------------------- Interpolating precipitation -------------------------
2010-01-01 00:00:00 2023-12-31 00:00:00
error in 2010-01-08 00:00:00
error 

  in_crs_string = _prepare_from_proj_string(in_crs_string)


Sampling dtm_milan
Sampling aspect
Sampling hillshade
Sampling ndvi_2019
Sampling dusaf15
Sampling plan_curvature
Sampling profile_curvature
Sampling water_distance
Sampling slope
Sampling spi
Sampling tri
Sampling twi
Sampling densita_popolazione
Sampling main_road_distance
Sampling building_height
Sampling geologia
merging ODC datasets
datasets merged!
merged dataset saved to ../harmonia_processor/so2/grid_dataset.csv as CSV
merging humidity
merging global_radiation
merging temperature
merging precipitation
merging winds
Sampling dtm_milan


  in_crs_string = _prepare_from_proj_string(in_crs_string)


Sampling aspect
Sampling hillshade
Sampling ndvi_2019
Sampling dusaf15
Sampling plan_curvature
Sampling profile_curvature
Sampling water_distance
Sampling slope
Sampling spi
Sampling tri
Sampling twi
Sampling densita_popolazione
Sampling main_road_distance
Sampling building_height
Sampling geologia
merging ODC datasets
datasets merged!
Starting interpolation of missing values
------------------------- Interpolating so2 -------------------------
2010-01-01 00:00:00 2023-12-31 00:00:00
------------------------- Interpolating humidity -------------------------
2010-01-01 00:00:00 2023-12-31 00:00:00
------------------------- Interpolating global_radiation -------------------------
2010-01-01 00:00:00 2023-12-31 00:00:00
------------------------- Interpolating temperature -------------------------
2010-01-01 00:00:00 2023-12-31 00:00:00
------------------------- Interpolating precipitation -------------------------
2010-01-01 00:00:00 2023-12-31 00:00:00
error in 2010-01-08 00:00:00
error 

  in_crs_string = _prepare_from_proj_string(in_crs_string)


Sampling dtm_milan
Sampling aspect
Sampling hillshade
Sampling ndvi_2019
Sampling dusaf15
Sampling plan_curvature
Sampling profile_curvature
Sampling water_distance
Sampling slope
Sampling spi
Sampling tri
Sampling twi
Sampling densita_popolazione
Sampling main_road_distance
Sampling building_height
Sampling geologia
merging ODC datasets
datasets merged!
merged dataset saved to ../harmonia_processor/o3/grid_dataset.csv as CSV
merging humidity
merging global_radiation
merging temperature
merging precipitation
merging winds
Sampling dtm_milan


  in_crs_string = _prepare_from_proj_string(in_crs_string)


Sampling aspect
Sampling hillshade
Sampling ndvi_2019
Sampling dusaf15
Sampling plan_curvature
Sampling profile_curvature
Sampling water_distance
Sampling slope
Sampling spi
Sampling tri
Sampling twi
Sampling densita_popolazione
Sampling main_road_distance
Sampling building_height
Sampling geologia
merging ODC datasets
datasets merged!
Starting interpolation of missing values
------------------------- Interpolating o3 -------------------------
2010-01-01 00:00:00 2023-12-31 00:00:00
------------------------- Interpolating humidity -------------------------
2010-01-01 00:00:00 2023-12-31 00:00:00
------------------------- Interpolating global_radiation -------------------------
2010-01-01 00:00:00 2023-12-31 00:00:00
------------------------- Interpolating temperature -------------------------
2010-01-01 00:00:00 2023-12-31 00:00:00
------------------------- Interpolating precipitation -------------------------
2010-01-01 00:00:00 2023-12-31 00:00:00
error in 2010-01-08 00:00:00
error i

In [11]:
# Fix the precipitation data due to IDW interpolation error: negative
# precipitation values are reset to 0 in both saved CSVs of every pollutant.
for pollutant in pollutant_subset:
    grid_path = f'../harmonia_processor/{pollutant}/grid_dataset.csv'
    stations_path = f'../harmonia_processor/{pollutant}/stations_dataset.csv'

    # The grid file is rewritten first, then the stations file.
    for csv_path in (grid_path, stations_path):
        df_to_fix = processor.import_df(csv_path, date_format="%Y-%m-%d")
        negative_rows = df_to_fix['precipitation'] < 0
        df_to_fix.loc[negative_rows, 'precipitation'] = 0
        # NOTE(review): to_csv also writes the DataFrame index by default;
        # confirm against import_df that re-reading does not pick up an
        # extra index column.
        df_to_fix.to_csv(csv_path)
    

In [12]:
# Threshold per pollutant, with a note on how each value was chosen.
_daily_thresholds = (
    ('pm10', 50),    # daily limit defined in regulation: correct
    ('pm25', 25),    # no daily limit in regulation: estimated from the EU yearly limit + AQI "Poor" limit
    ('no2', 120),    # no daily limit in regulation: estimated from the EU yearly and hourly limits + AQI "Poor" limit
    ('so2', 125),    # daily limit defined in regulation: correct
    ('o3', 130),     # regulation defines an 8-hour mean: NOT correct as a daily limit
)
pollutant_data = {
    name: {"threshold": limit} for name, limit in _daily_thresholds
}

# Columns kept next to the pollutant column, listed group by group.
_meteo_columns = ['humidity', 'global_radiation', 'temperature', 'precipitation']
_wind_sector_columns = ['N', 'NE', 'E', 'SE', 'S', 'SW', 'W', 'NW']
_land_cover_columns = [f'dusaf15_{code}' for code in (11, 12, 14, 21, 22, 51)]
_geology_columns = [f'geologia_{code}' for code in (0, 1, 4)]

general_value_columns = [
    *_meteo_columns,
    *_wind_sector_columns,
    'dtm_milan', 'aspect', 'hillshade', 'ndvi_2019',
    *_land_cover_columns,
    'plan_curvature', 'profile_curvature', 'water_distance', 'slope', 'spi',
    'tri', 'twi', 'densita_popolazione', 'main_road_distance',
    'building_height',
    *_geology_columns,
]

# (random_partition, balanced, csv_name) for the four exported splits.
partition_schemes = [
    (False, False, "NOrand_NObalance"),
    (False, True, "NOrand_balance"),
    (True, False, "rand_NObalance"),
    (True, True, "rand_balance"),
]

for pollutant in pollutant_subset:
    # NOTE: this rebinds the notebook-level `processor` to a fresh instance
    # that only holds the merged stations dataset of the current pollutant.
    processor = processing.HarmoniaProcessor()
    stations_path = f'../harmonia_processor/{pollutant}/stations_dataset.csv'
    df_training = processor.import_df(stations_path, date_format="%Y-%m-%d")
    value_columns = [pollutant, *general_value_columns]
    processor.add_merged_dataset(df_training, value_columns)

    train_dir = f'../harmonia_processor/{pollutant}/train'

    for random_partition, balanced, csv_name in partition_schemes:
        # 80/20 train/validation split over 2010-2022; no test share is requested.
        training, testing, validation = processor.generate_training_data(
            pollutant,
            pollutant_data[pollutant]['threshold'],
            predictor_classes=[1, 0],
            date_range=[date(2010, 1, 1), date(2022, 12, 31)],
            train_percentage=0.8,
            test_percentage=0,
            validation_percentage=0.20,
            random_partition=random_partition,
            balanced=balanced,
        )

        # Only the training and validation partitions are written to disk.
        training.to_csv(f'{train_dir}/training_{csv_name}.csv')
        validation.to_csv(f'{train_dir}/validation_{csv_name}.csv')

In [13]:
# Variables requested when generating the prediction data, group by group.
gen_variables = [
    # meteo
    'humidity', 'global_radiation', 'temperature', 'precipitation',
    # wind sectors
    'N', 'NE', 'E', 'SE', 'S', 'SW', 'W', 'NW',
    # terrain / vegetation
    'dtm_milan', 'aspect', 'hillshade', 'ndvi_2019',
    # encoded land-cover classes
    'dusaf15_11', 'dusaf15_12', 'dusaf15_14', 'dusaf15_21', 'dusaf15_22',
    'dusaf15_31', 'dusaf15_51',
    # remaining ODC layers
    'plan_curvature', 'profile_curvature', 'water_distance', 'slope', 'spi',
    'tri', 'twi', 'densita_popolazione', 'main_road_distance',
    'building_height',
    # encoded geology classes
    'geologia_0', 'geologia_1', 'geologia_2', 'geologia_4',
]

# Every variable is aggregated with "mean"; keys follow gen_variables order.
gen_aggregation = dict.fromkeys(gen_variables, "mean")

# Prediction period: the whole of 2023, one output per calendar month.
gen_date_range = [date(2023, 1, 1), date(2023, 12, 31)]
gen_year_range = [2023, 2023]
months = list(range(1, 13))

pollutant_subset = ['pm10', 'pm25', 'no2', 'so2', 'o3']
for pollutant in pollutant_subset:
    value_columns = [pollutant, *gen_variables]
    dataset_dir = f'../harmonia_processor/{pollutant}'
    predict_dir = f'{dataset_dir}/predict'

    # One dedicated processor per saved merged dataset (stations, then grid).
    processor_stations = processing.HarmoniaProcessor()
    stations_path = f'{dataset_dir}/stations_dataset.csv'
    df_gen_stations = processor_stations.import_df(stations_path, date_format="%Y-%m-%d")
    processor_stations.add_merged_dataset(df_gen_stations, value_columns)

    processor_grid = processing.HarmoniaProcessor()
    grid_path = f'{dataset_dir}/grid_dataset.csv'
    df_gen_grid = processor_grid.import_df(grid_path, date_format="%Y-%m-%d")
    processor_grid.add_merged_dataset(df_gen_grid, value_columns)

    for month in months:
        generate_options = {
            "variables": gen_variables,
            "sampling": "location",
            "frequency": "month",
            "aggregation": gen_aggregation,
            "date_range": gen_date_range,
            # "day": <date> would be required only when frequency is day
            "month": month,  # conditional only when frequency is month
            "year_range": gen_year_range,  # conditional only when frequency is year or month
        }

        # Each call receives its own shallow copy of the options.
        predictor_grid = processor_grid.generate_prediction_data(dict(generate_options))
        predictor_grid.to_csv(f'{predict_dir}/grid_month_{month}.csv')
        print(f"saved {pollutant} grid month {month}")

        predictor_stations = processor_stations.generate_prediction_data(dict(generate_options))
        predictor_stations.to_csv(f'{predict_dir}/stations_month_{month}.csv')
        print(f"saved {pollutant} stations month {month}")
        

saved pm10 grid month 1
saved pm10 stations month 1
saved pm10 grid month 2
saved pm10 stations month 2
saved pm10 grid month 3
saved pm10 stations month 3
saved pm10 grid month 4
saved pm10 stations month 4
saved pm10 grid month 5
saved pm10 stations month 5
saved pm10 grid month 6
saved pm10 stations month 6
saved pm10 grid month 7
saved pm10 stations month 7
saved pm10 grid month 8
saved pm10 stations month 8
saved pm10 grid month 9
saved pm10 stations month 9


saved pm10 grid month 10
saved pm10 stations month 10
saved pm10 grid month 11
saved pm10 stations month 11
saved pm10 grid month 12
saved pm10 stations month 12
saved pm25 grid month 1
saved pm25 stations month 1
saved pm25 grid month 2
saved pm25 stations month 2
saved pm25 grid month 3
saved pm25 stations month 3
saved pm25 grid month 4
saved pm25 stations month 4
saved pm25 grid month 5
saved pm25 stations month 5
saved pm25 grid month 6
saved pm25 stations month 6
saved pm25 grid month 7
saved pm25 stations month 7
saved pm25 grid month 8
saved pm25 stations month 8
saved pm25 grid month 9
saved pm25 stations month 9
saved pm25 grid month 10
saved pm25 stations month 10
saved pm25 grid month 11
saved pm25 stations month 11
saved pm25 grid month 12
saved pm25 stations month 12
saved no2 grid month 1
saved no2 stations month 1
saved no2 grid month 2
saved no2 stations month 2
saved no2 grid month 3
saved no2 stations month 3
saved no2 grid month 4
saved no2 stations month 4
saved no