In [None]:
import sys
sys.path.append("../../..")

from datetime import date
import pandas as pd
import plotly.graph_objects as go
import numpy as np

from modules import processing_module as processing
from modules import interpolation_module as interp
from modules import ai_module as ai

In [None]:
# Area of interest: shapefile path and the EPSG code handed to the interpolation helpers.
# EPSG:32632 is WGS 84 / UTM zone 32N.
milano_shapefile = '../data/milano_final_shapefile/milano_metro.shp'
milano_epsg = 32632

# Regular grid built from the shapefile extent with a step of 1000 in x and y
# (presumably metres, given the projected UTM CRS — TODO confirm).
grid = interp.create_grid_from_shapefile(
    milano_shapefile,
    xdelta=1000,
    ydelta=1000,
    shapefile_epsg=milano_epsg,
)

# Values iterated over by the cells that follow.
pollutants = ['pm10', 'pm25', 'so2', 'o3']
models = ['rf', 'lstm']

# Train/test sampling schemes; these strings are also embedded in the prediction file names.
# '[NO]rand'    -> random vs. sequential sampling
# '[NO]balance' -> balanced vs. non-balanced classes
train_modes = ['rand_balance', 'NOrand_balance', 'rand_NObalance', 'NOrand_NObalance']

## Checking results for January with pollutant PM10 using gridded samples

The following sections show the predictions for the pollutant "PM10" in the month of January for each of the trained models. Every dataset is split into 80% training data and 20% testing data.

The models are:
- Random Forest (rf)
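- Long short-term memory (lstm)
- Support Vector Machine (svm) — note that `svm` is not in the `models` list defined above, so the cells in this notebook only loop over `rf` and `lstm`.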

For each model, four grids are shown, one for each way of building the training and testing samples:
- Random sampling with balanced classes.
- Sequential sampling with balanced classes.
- Random sampling with no balanced classes.
- Sequential sampling with no balanced classes.


Random sampling means that the entire dataset is randomly shuffled before partitioning (as opposed to the sequential sampling schemes listed above).
Balanced classes refers to whether the 80/20 partition is applied to each class separately or to the entire dataset. The training/testing split is balanced when every class is partitioned in the same proportion: for a balanced dataset, 80% of all records with class 1 are in the training data and 20% are in the testing data. For a non-balanced dataset, this is not enforced.


The first set of grids corresponds to the probability of belonging to class 1 (pollutant exceeded), predicted at the meteo/pollutant stations of the pollutant.
The second set of grids also corresponds to the predictions at the stations, but each prediction is assigned to class 1 when its probability is at least 50%, and to class 0 otherwise. Both the first and second sets are interpolated using NN to fill the grid over the entire area of interest.
The third and fourth sets of grids show the same two quantities, but predicted on each grid element separately.


The accuracies of each model are reported below, sorted by pollutant and accuracy.

In [None]:
# Read the model score table from CSV and order it for display.
importer = processing.HarmoniaProcessor()
scores_path = '../harmonia_processor/model_scores.csv'
model_scores = (
    importer
    .import_df(scores_path, date_format=None)
    .reset_index(drop=True)
    # Both keys are sorted in descending order, so within each pollutant
    # the highest score comes first.
    .sort_values(by=["pollutant", "score"], ascending=False)
)
# Bare last expression: rendered as the cell output.
model_scores

In [None]:
# Plot the model accuracies: for each model, collect one score table per
# pollutant and hand the tables to importer.show_plot.
sort_by = 'train_mode'  # column used to order the rows and passed as the x column

# Loop-invariant: order all scores once instead of once per (model, pollutant) pair.
scores_by_rank = model_scores.sort_values(by='score', ascending=False)

for model in models:
    plot_dfs = []
    plot_names = []
    print("--------------------------------------------------")
    print(f"scores for model {model}")
    for pollutant in pollutants:
        # The mask is built on model_scores; .loc aligns it with the re-ordered
        # frame through the index labels.
        is_selected = (model_scores['pollutant'] == pollutant) & (model_scores['type'] == model)
        temp_plot_df = (
            scores_by_rank
            .loc[is_selected]
            .reset_index(drop=True)
            .sort_values(by=sort_by)
        )
        plot_dfs.append(temp_plot_df)
        plot_names.append(pollutant)

    # One x/y column name per collected table. Sizing the lists from plot_dfs
    # keeps the call valid when `pollutants` does not have exactly four entries.
    importer.show_plot(
        plot_dfs,
        [sort_by] * len(plot_dfs),
        ['score'] * len(plot_dfs),
        plot_names
    )

It is important to point out that for ozone (o3) and SO2 the accuracies are particularly high because of the low variability of the dataset: daily exceedances of these pollutants are rare across the entire time frame.

In [None]:
# Station predictions for PM10, interpolated without thresholding 'exc'.
importer = processing.HarmoniaProcessor()
m = 1  # month number embedded in the prediction file names
pollutant = 'pm10'

# Compare the same model across the different training/testing samples.
prediction_base_path = f'../harmonia_processor/{pollutant}/predictions'
for model in models:
    print('------------------------------------------------------------------------')
    print(f'Showing for model {model}')
    for train_mode in train_modes:
        print(f'Showing for training and testing samples {train_mode}')
        prediction_path = (
            f'{prediction_base_path}/'
            f'{model}_prediction_month_{m}_stations_{train_mode}.csv'
        )
        prediction_m = importer.import_df(prediction_path, date_format=None)

        # 'exc' is used exactly as read from the file. plot_min/plot_max are not
        # passed, so interpolate() falls back to its own defaults for them.
        interpolated_to_grid, b = interp.interpolate(
            'exc', 'NN', milano_shapefile, prediction_m.copy(),
            visual_output=True,
            epsg_utm=milano_epsg,
        )
    

In [None]:
# Station predictions for PM10, with 'exc' converted to a 0/1 class before interpolation.
importer = processing.HarmoniaProcessor()
m = 1  # month number embedded in the prediction file names
pollutant = 'pm10'

# Compare the same model across the different training/testing samples.
prediction_base_path = f'../harmonia_processor/{pollutant}/predictions'
for model in models:
    print(f'Showing for {model}')
    for train_mode in train_modes:
        print(f'Showing for training and testing samples {train_mode}')
        prediction_path = (
            f'{prediction_base_path}/'
            f'{model}_prediction_month_{m}_stations_{train_mode}.csv'
        )
        prediction_m = importer.import_df(prediction_path, date_format=None)

        # Binarise: 1 where exc >= 50, otherwise 0.
        # NOTE(review): this assumes 'exc' is stored on a 0-100 scale — confirm
        # against the prediction files.
        prediction_m['exc'] = np.where(prediction_m['exc'] >= 50, 1, 0)

        # The plot range is pinned to [0, 1] to match the two class values.
        interpolated_to_grid, b = interp.interpolate(
            'exc', 'NN', milano_shapefile, prediction_m.copy(),
            visual_output=True,
            epsg_utm=milano_epsg,
            plot_min=0,
            plot_max=1,
        )
    

In [None]:
# Grid-element predictions for PM10, interpolated without thresholding 'exc'.
importer = processing.HarmoniaProcessor()
m = 1  # month number embedded in the prediction file names
pollutant = 'pm10'

# Compare the same model across the different training/testing samples.
prediction_base_path = f'../harmonia_processor/{pollutant}/predictions'
for model in models:
    print(f'Showing for {model}')
    for train_mode in train_modes:
        print(f'Showing for training and testing samples {train_mode}')
        prediction_path = (
            f'{prediction_base_path}/'
            f'{model}_prediction_month_{m}_grid_{train_mode}.csv'
        )
        prediction_m = importer.import_df(prediction_path, date_format=None)

        # 'exc' is used exactly as read from the file. plot_min/plot_max are not
        # passed, so interpolate() falls back to its own defaults for them.
        interpolated_to_grid, b = interp.interpolate(
            'exc', 'NN', milano_shapefile, prediction_m.copy(),
            visual_output=True,
            epsg_utm=milano_epsg,
        )
    

In [None]:
# Grid-element predictions for PM10, with 'exc' converted to a 0/1 class before interpolation.
importer = processing.HarmoniaProcessor()
m = 1  # month number embedded in the prediction file names
pollutant = 'pm10'

# Compare the same model across the different training/testing samples.
prediction_base_path = f'../harmonia_processor/{pollutant}/predictions'
for model in models:
    print(f'Showing for {model}')
    for train_mode in train_modes:
        print(f'Showing for training and testing samples {train_mode}')
        prediction_path = (
            f'{prediction_base_path}/'
            f'{model}_prediction_month_{m}_grid_{train_mode}.csv'
        )
        prediction_m = importer.import_df(prediction_path, date_format=None)

        # Binarise: 1 where exc >= 50, otherwise 0.
        # NOTE(review): this assumes 'exc' is stored on a 0-100 scale — confirm
        # against the prediction files.
        prediction_m['exc'] = np.where(prediction_m['exc'] >= 50, 1, 0)

        # The plot range is pinned to [0, 1] to match the two class values.
        interpolated_to_grid, b = interp.interpolate(
            'exc', 'NN', milano_shapefile, prediction_m.copy(),
            visual_output=True,
            epsg_utm=milano_epsg,
            plot_min=0,
            plot_max=1,
        )
    