In [None]:
import sys
sys.path.append("../../..")

from datetime import date
import pandas as pd
import plotly.graph_objects as go
import numpy as np
import rasterio as rio
import rioxarray as rxr


from modules import processing_module as processing
from modules import interpolation_module as interp
from modules import ai_module as ai

In [None]:
def load_model(pollutant, training_mode, prefix, model_type=None):
    """Load a previously saved model for one pollutant and training mode.

    The training and validation CSVs are read again because ``ai.MLProcessor``
    is constructed from both datasets before the stored model is loaded.

    Parameters
    ----------
    pollutant : str
        Pollutant folder name under ``../harmonia_processor``.
    training_mode : str
        Suffix of the ``training_*.csv`` / ``validation_*.csv`` files and of
        the saved model file.
    prefix : str
        File-name prefix of the saved model (the ``prefix`` entry of
        ``model_options``).
    model_type : str, optional
        Value stored in ``ai_model.model_type``. When omitted, the notebook
        global ``model_to_use`` is used, which keeps existing three-argument
        calls working. Passing it explicitly avoids that hidden dependency.

    Returns
    -------
    The ``ai.MLProcessor`` instance with the stored model loaded.

    Raises
    ------
    NameError
        If ``model_type`` is omitted and no global ``model_to_use`` exists.
    """
    base_path = f'../harmonia_processor/{pollutant}'

    def _read_dataset(kind):
        # Rows with missing values are dropped. The 'date' column is removed
        # only when present, so a dataset without it no longer raises.
        dataset = importer.import_df(
            f'{base_path}/train/{kind}_{training_mode}.csv',
            date_format='%Y-%m-%d'
        )
        return dataset.dropna().drop(columns=['date'], errors='ignore')

    training_dataset = _read_dataset('training')
    testing_dataset = _read_dataset('validation')

    model_path = f'{base_path}/model/{prefix}_model_{training_mode}.csv'
    ai_model = ai.MLProcessor(training_dataset, testing_dataset)
    ai_model.load_model(model_path)
    ai_model.model_type = model_to_use if model_type is None else model_type
    return ai_model

In [None]:
# Per-model configuration, keyed by model type.
#   prefix             -> file-name prefix used for the saved model files
#   training_options   -> passed to MLProcessor.train_model(model_options=...)
#   prediction_options -> passed to the predict calls (predict_options=...)
model_options = {
    'rf': dict(
        prefix='rf',
        training_options=dict(
            normalized=False,
            n_estimators=500,
            n_jobs=-1,
            max_depth=30,
            random_state=None,
        ),
        prediction_options=dict(normalized=False),
    ),
    'svm': dict(
        prefix='svm',
        training_options=dict(
            normalized=True,
            kernel='rbf',
            probability=True,
            verbose=False,
            max_iter=10,
            random_state=None,
            cache_size=1024,
            n_jobs=5,
        ),
        prediction_options=dict(normalized=True),
    ),
    'lstm': dict(
        prefix='lstm',
        training_options=dict(
            normalized=True,
            activation='sigmoid',
            metrics=['accuracy'],
            optimizer='adam',
            loss='binary_crossentropy',
        ),
        prediction_options=dict(normalized=True),
    ),
}


In [None]:
# Shared processor instance; the notebook uses it to read CSV files
# (import_df) and to draw the score plots (show_plot).
importer = processing.HarmoniaProcessor()

# Pollutants to process. no2 is left out because it has no exceedances.
pollutants = ['pm10', 'pm25', 'so2', 'o3']

# Model types to train, score and predict with; each one is a key of model_options.
models = ['rf', 'lstm']

# Training-set variants; each name is the suffix of the matching
# training_<mode>.csv / validation_<mode>.csv files.
train_modes = ['rand_balance', 'NOrand_balance', 'rand_NObalance', 'NOrand_NObalance']


In [None]:
#Train the models
# Each (model type, pollutant, training mode) combination is trained and saved
# on its own. A failure in one combination is reported with its cause and
# recorded in failed_trainings, then the loop continues with the next one.
failed_trainings = []

for model_to_use in models:
    prefix = model_options[model_to_use]['prefix']
    training_options = model_options[model_to_use]['training_options']
    print(f"FOR {model_to_use}")

    for pollutant in pollutants:
        print(f"FOR {pollutant}")

        for training_mode in train_modes:
            try:
                train_path = f'../harmonia_processor/{pollutant}/train/training_{training_mode}.csv'
                training_dataset = importer.import_df(train_path, date_format='%Y-%m-%d')
                # Drop incomplete rows; remove 'date' only when it is present.
                training_dataset = training_dataset.dropna().drop(columns=['date'], errors='ignore')

                test_path = f'../harmonia_processor/{pollutant}/train/validation_{training_mode}.csv'
                testing_dataset = importer.import_df(test_path, date_format='%Y-%m-%d')
                testing_dataset = testing_dataset.dropna().drop(columns=['date'], errors='ignore')

                ai_model = ai.MLProcessor(training_dataset.copy(), testing_dataset.copy())

                # 'exc' is the target column.
                ai_model.train_model(
                    model_to_use,
                    'exc',
                    model_options=training_options
                )

                model_path = f'../harmonia_processor/{pollutant}/model/{prefix}_model_{training_mode}.csv'
                ai_model.save_model(model_path)
            except Exception as ex:
                # Best effort: keep training the remaining combinations, but
                # say which one failed and why instead of hiding the error.
                failed_trainings.append((model_to_use, pollutant, training_mode, repr(ex)))
                print(f"could not train {model_to_use} for {pollutant} ({training_mode}): {ex!r}")

if failed_trainings:
    print(f"{len(failed_trainings)} training run(s) failed:")
    for failed_training in failed_trainings:
        print(failed_training)
    
    

In [None]:
#Score the models
# One row per (model type, training mode, pollutant). Rows are collected in a
# list and turned into a DataFrame once, instead of concatenating onto an
# initially empty frame on every iteration.
scores_df_columns = ["type", "train_mode", "pollutant", "prefix", "score"]
score_rows = []

for model_to_use in models:
    prefix = model_options[model_to_use]['prefix']
    print(f"FOR {model_to_use}")

    for pollutant in pollutants:
        print(f"FOR {pollutant}")

        for training_mode in train_modes:
            # load_model rebuilds the MLProcessor from the training and
            # validation CSVs, loads the saved model and sets model_type from
            # the current value of the loop variable model_to_use.
            ai_model = load_model(pollutant, training_mode, prefix)

            score = ai_model.score_model()
            score_rows.append([model_to_use, training_mode, pollutant, prefix, score])

scores_df = pd.DataFrame(score_rows, columns=scores_df_columns)

scores_path = '../harmonia_processor/model_scores.csv'
scores_df.to_csv(scores_path)
scores_df

In [None]:
#plot for model accuracies
# One plot per model type, with one series per pollutant: score against
# training mode.
sort_by = 'train_mode'
x_column = 'train_mode'
y_column = 'score'

for model in models:
    plot_dfs = []
    plot_names = []
    print("--------------------------------------------------")
    print(f"scores for model {model}")
    for pollutant in pollutants:
        is_selected = (scores_df['pollutant'] == pollutant) & (scores_df['type'] == model)
        temp_plot_df = scores_df.loc[is_selected].reset_index(drop=True).sort_values(by=sort_by)
        plot_dfs.append(temp_plot_df.copy())
        plot_names.append(pollutant)

    # The column lists are sized from the number of series, so changing
    # `pollutants` no longer requires editing this call.
    importer.show_plot(
        plot_dfs,
        [x_column] * len(plot_dfs),
        [y_column] * len(plot_dfs),
        plot_names
    )

In [None]:
#calculate best models
# The scores are read back from disk, so this cell can run without re-scoring
# as long as model_scores.csv exists.
scores_path = '../harmonia_processor/model_scores.csv'
scores_df = importer.import_df(scores_path, date_format=None)
scores_df = scores_df.reset_index(drop=True)

# For each pollutant, keep the description of its highest-scoring model.
best_model_data = {}
for pollutant in pollutants:
    pollutant_scores = scores_df.loc[scores_df['pollutant'] == pollutant]
    if pollutant_scores.empty:
        raise ValueError(f"no scores found for pollutant '{pollutant}' in {scores_path}")

    # A stable sort makes ties deterministic: the first row in file order wins.
    best = pollutant_scores.sort_values(
        by='score', ascending=False, kind='mergesort'
    ).iloc[0]

    # Use item access rather than attribute access, so a column name can
    # never be confused with a pandas Series attribute.
    best_prefix = best['prefix']
    best_train_mode = best['train_mode']
    best_type = best['type']
    best_path = f'../harmonia_processor/{pollutant}/model/{best_prefix}_model_{best_train_mode}.csv'
    best_model_data[pollutant] = {
        "model_path": best_path,
        "prefix": best_prefix,
        "train_mode": best_train_mode,
        "type": best_type,
        "score": best['score']
    }
best_model_data

In [None]:
#predict the stations and grid samples for each month and each pollutant, using the best-scoring model
best_base_path_predictions = '../best_model/predictions'
for pollutant in pollutants:
    print(f"FOR {pollutant}")
    model_data = best_model_data[pollutant]
    train_mode = model_data['train_mode']
    prefix = model_data['prefix']
    model_to_use = model_data['type']
    predict_options = model_options[model_to_use]['prediction_options']

    # The MLProcessor is built from the same training and validation sets the
    # model was trained with. Incomplete rows are dropped; 'date' is removed
    # only when present.
    train_path = f'../harmonia_processor/{pollutant}/train/training_{train_mode}.csv'
    training_dataset = importer.import_df(train_path, date_format='%Y-%m-%d')
    training_dataset = training_dataset.dropna().drop(columns=['date'], errors='ignore')

    test_path = f'../harmonia_processor/{pollutant}/train/validation_{train_mode}.csv'
    testing_dataset = importer.import_df(test_path, date_format='%Y-%m-%d')
    testing_dataset = testing_dataset.dropna().drop(columns=['date'], errors='ignore')

    #save the training columns (everything except the 'exc' target) for selecting them in the prediction dataset
    training_columns = [column for column in training_dataset.columns if column != 'exc']

    model_path = f'../harmonia_processor/{pollutant}/model/{prefix}_model_{train_mode}.csv'
    ai_model = ai.MLProcessor(training_dataset.copy(), testing_dataset.copy())
    ai_model.load_model(model_path)
    ai_model.model_type = model_to_use

    # Results keyed by month: predicts* hold the raw input files,
    # predictions* hold the predicted values with their locations.
    predictions = {}
    predictions_grid = {}
    predicts = {}
    predicts_grid = {}
    stores_by_sample_kind = {
        'stations': (predicts, predictions),
        'grid': (predicts_grid, predictions_grid),
    }

    for m in range(1, 13):
        print(f'FOR {pollutant} MONTH {m}')

        # The stations and grid datasets go through exactly the same steps;
        # only the input and output file names differ.
        for sample_kind, (raw_by_month, predicted_by_month) in stores_by_sample_kind.items():
            print(f"Predicting in {sample_kind} datasets")
            predict_path = f'../harmonia_processor/{pollutant}/predict/{sample_kind}_month_{m}.csv'
            prediction_df = importer.import_df(predict_path, date_format=None)
            raw_by_month[m] = prediction_df.copy()

            # training_columns never contains 'date' or 'exc', so no further
            # column clean-up is needed after this selection.
            prediction_dataset = prediction_df[training_columns].reset_index(drop=True).dropna()

            if model_to_use == 'lstm':
                predicted_probabilities = ai_model.predict(
                    prediction_dataset.copy(),
                    predict_options=predict_options
                )
            else:
                predicted_probabilities = ai_model.predict_probabilities(
                    prediction_dataset.copy(),
                    predict_options=predict_options
                )
                # Keep the second probability column only.
                predicted_probabilities = predicted_probabilities[:, 1]

            #Create a DF from the predicted values
            predicted_df = pd.DataFrame(predicted_probabilities)
            predicted_df.columns = ['exc']

            #Concat DF with the sample coordinates
            # NOTE(review): the columns are named lat/lng, while the original
            # comment called them UTM coordinates - confirm which they hold.
            #reset both indexes so the rows line up by position
            predicted_df = predicted_df.reset_index(drop=True)
            prediction_locations = prediction_dataset[['lat', 'lng']].reset_index(drop=True)
            predicted_df = pd.concat([predicted_df, prediction_locations], axis=1)
            predicted_df['exc'] = predicted_df['exc'] * 100

            predicted_by_month[m] = predicted_df.copy()

            prediction_path = f'{best_base_path_predictions}/best_{pollutant}_{sample_kind}_month_{m}.csv'
            predicted_by_month[m].to_csv(prediction_path)


In [None]:
# Milano shapefile and the EPSG code used for it (32632 = WGS 84 / UTM zone 32N).
milano_epsg = 32632
milano_shapefile = '../data/milano_final_shapefile/milano_metro.shp'

# Grid built from the shapefile with xdelta = ydelta = 1000.
grid = interp.create_grid_from_shapefile(
    milano_shapefile,
    shapefile_epsg=milano_epsg,
    xdelta=1000,
    ydelta=1000
)

# Same pollutant list as the configuration cell earlier in the notebook.
pollutants = ['pm10', 'pm25', 'so2', 'o3']

In [None]:
# Display the grid created from the Milano shapefile for a visual check.
grid

In [None]:
#Generate rasters with stations data
# For every pollutant and month, the station predictions are interpolated
# ('NN' method) and written as a single-band GeoTIFF.
best_raster_base_path = '../best_model/rasters'
raster_cell_size = 1000.0
raster_nodata = -9999.0

for pollutant in pollutants:
    for m in range(1, 13):
        prediction_path = f'{best_base_path_predictions}/best_{pollutant}_stations_month_{m}.csv'
        prediction_m = importer.import_df(prediction_path, date_format=None)
        print(f"------ {pollutant} --- month {m} ------")
        # Only the first value returned by interpolate is used here.
        interpolated_to_grid, _second_output = interp.interpolate(
            'exc',
            'NN',
            milano_shapefile,
            prediction_m.copy(),
            visual_output=True,
            epsg_utm=milano_epsg
        )

        # Reshape the interpolated values into a (y, x) array.
        new_interp = pd.DataFrame()
        new_interp['y'] = interpolated_to_grid.original_centroids.y
        new_interp['x'] = interpolated_to_grid.original_centroids.x
        new_interp['value'] = interpolated_to_grid.NN
        interp_xar = new_interp.set_index(['y', 'x']).to_xarray()

        raster_path = f'{best_raster_base_path}/stations_{pollutant}_month_{m}.tiff'

        # Flip the rows only, which is what the previous double flip amounted
        # to. This assumes y is in ascending order after to_xarray(), as the
        # use of y[-1] for the top edge in the transform below implies.
        # The explicit cast matches the float32 band type declared below.
        array_to_write = np.flip(interp_xar.value.to_numpy(), axis=0).astype('float32')
        # Cells with no value are NaN after to_xarray(); store them as the
        # nodata value declared in the profile.
        array_to_write[np.isnan(array_to_write)] = raster_nodata

        # NOTE(review): x/y come from `original_centroids`. If they are cell
        # centres, the origin passed to Affine (expected to be the outer
        # corner of the top-left pixel) is off by half a cell - confirm.
        profile = {
            'driver': 'GTiff',
            'dtype': 'float32',
            'nodata': raster_nodata,
            'width': int(len(interp_xar.x)),
            'height': int(len(interp_xar.y)),
            'count': 1,
            'crs': rio.CRS.from_epsg(milano_epsg),
            'transform': rio.Affine(
                raster_cell_size, 0.0, int(interp_xar.x[0]),
                0.0, -raster_cell_size, int(interp_xar.y[-1])
            ),
            'tiled': False,
            'interleave': 'band'
        }
        with rio.open(raster_path, 'w', **profile) as dest:
            dest.write(array_to_write, 1)

        print(f"saved raster to {raster_path}")
    

In [None]:
#Generate rasters with grid data
# For every pollutant and month, the grid predictions are interpolated
# ('NN' method) and written as a single-band GeoTIFF.
best_raster_base_path = '../best_model/rasters'
raster_cell_size = 1000.0
raster_nodata = -9999.0

for pollutant in pollutants:
    for m in range(1, 13):
        prediction_path = f'{best_base_path_predictions}/best_{pollutant}_grid_month_{m}.csv'
        prediction_m = importer.import_df(prediction_path, date_format=None)
        print(f"------ {pollutant} --- month {m} ------")

        # Only the first value returned by interpolate is used here.
        interpolated_to_grid, _second_output = interp.interpolate(
            'exc',
            'NN',
            milano_shapefile,
            prediction_m,
            visual_output=True,
            epsg_utm=milano_epsg
        )

        # Reshape the interpolated values into a (y, x) array.
        new_interp = pd.DataFrame()
        new_interp['y'] = interpolated_to_grid.original_centroids.y
        new_interp['x'] = interpolated_to_grid.original_centroids.x
        new_interp['value'] = interpolated_to_grid.NN

        interp_xar = new_interp.set_index(['y', 'x']).to_xarray()

        raster_path = f'{best_raster_base_path}/grid_{pollutant}_month_{m}.tiff'

        # Flip the rows only, which is what the previous double flip amounted
        # to. This assumes y is in ascending order after to_xarray(), as the
        # use of y[-1] for the top edge in the transform below implies.
        # The explicit cast matches the float32 band type declared below.
        array_to_write = np.flip(interp_xar.value.to_numpy(), axis=0).astype('float32')
        # Cells with no value are NaN after to_xarray(); store them as the
        # nodata value declared in the profile.
        array_to_write[np.isnan(array_to_write)] = raster_nodata

        # NOTE(review): x/y come from `original_centroids`. If they are cell
        # centres, the origin passed to Affine (expected to be the outer
        # corner of the top-left pixel) is off by half a cell - confirm.
        profile = {
            'driver': 'GTiff',
            'dtype': 'float32',
            'nodata': raster_nodata,
            'width': int(len(interp_xar.x)),
            'height': int(len(interp_xar.y)),
            'count': 1,
            'crs': rio.CRS.from_epsg(milano_epsg),
            'transform': rio.Affine(
                raster_cell_size, 0.0, int(interp_xar.x[0]),
                0.0, -raster_cell_size, int(interp_xar.y[-1])
            ),
            'tiled': False,
            'interleave': 'band'
        }
        with rio.open(raster_path, 'w', **profile) as dest:
            dest.write(array_to_write, 1)

        print(f"saved raster to {raster_path}")
    

In [None]:
#PREDICT ALL!
# Predict stations and grid samples for every pollutant, model type, training
# mode and month. The base path has its own variable so that
# best_base_path_predictions, which the raster cells read, is not overwritten.
all_models_base_path = '../harmonia_processor'
for pollutant in pollutants:
    print(f"FOR {pollutant}")
    for model_to_use in models:
        print(f"FOR {model_to_use}")
        prefix = model_options[model_to_use]['prefix']
        predict_options = model_options[model_to_use]['prediction_options']

        for train_mode in train_modes:
            print(f"FOR {train_mode}")

            # The MLProcessor is built from the same training and validation
            # sets the model was trained with. Incomplete rows are dropped;
            # 'date' is removed only when present.
            train_path = f'{all_models_base_path}/{pollutant}/train/training_{train_mode}.csv'
            training_dataset = importer.import_df(train_path, date_format='%Y-%m-%d')
            training_dataset = training_dataset.dropna().drop(columns=['date'], errors='ignore')

            test_path = f'{all_models_base_path}/{pollutant}/train/validation_{train_mode}.csv'
            testing_dataset = importer.import_df(test_path, date_format='%Y-%m-%d')
            testing_dataset = testing_dataset.dropna().drop(columns=['date'], errors='ignore')

            #save the training columns (everything except the 'exc' target) for selecting them in the prediction dataset
            training_columns = [column for column in training_dataset.columns if column != 'exc']

            model_path = f'{all_models_base_path}/{pollutant}/model/{prefix}_model_{train_mode}.csv'
            ai_model = ai.MLProcessor(training_dataset.copy(), testing_dataset.copy())
            ai_model.load_model(model_path)
            ai_model.model_type = model_to_use

            # Results keyed by month: predicts* hold the raw input files,
            # predictions* hold the predicted values with their locations.
            predictions = {}
            predictions_grid = {}
            predicts = {}
            predicts_grid = {}
            stores_by_sample_kind = {
                'stations': (predicts, predictions),
                'grid': (predicts_grid, predictions_grid),
            }

            for m in range(1, 13):
                print(f'FOR {pollutant} MONTH {m}')

                # The stations and grid datasets go through exactly the same
                # steps; only the input and output file names differ.
                for sample_kind, (raw_by_month, predicted_by_month) in stores_by_sample_kind.items():
                    print(f"Predicting in {sample_kind} datasets")
                    predict_path = f'{all_models_base_path}/{pollutant}/predict/{sample_kind}_month_{m}.csv'
                    prediction_df = importer.import_df(predict_path, date_format=None)
                    raw_by_month[m] = prediction_df.copy()

                    # training_columns never contains 'date' or 'exc', so no
                    # further column clean-up is needed after this selection.
                    prediction_dataset = prediction_df[training_columns].reset_index(drop=True).dropna()

                    if model_to_use == 'lstm':
                        predicted_probabilities = ai_model.predict(
                            prediction_dataset.copy(),
                            predict_options=predict_options
                        )
                    else:
                        predicted_probabilities = ai_model.predict_probabilities(
                            prediction_dataset.copy(),
                            predict_options=predict_options
                        )
                        # Keep the second probability column only.
                        predicted_probabilities = predicted_probabilities[:, 1]

                    #Create a DF from the predicted values
                    predicted_df = pd.DataFrame(predicted_probabilities)
                    predicted_df.columns = ['exc']

                    #Concat DF with the sample coordinates
                    # NOTE(review): the columns are named lat/lng, while the
                    # original comment called them UTM coordinates - confirm.
                    #reset both indexes so the rows line up by position
                    predicted_df = predicted_df.reset_index(drop=True)
                    prediction_locations = prediction_dataset[['lat', 'lng']].reset_index(drop=True)
                    predicted_df = pd.concat([predicted_df, prediction_locations], axis=1)
                    predicted_df['exc'] = predicted_df['exc'] * 100

                    predicted_by_month[m] = predicted_df.copy()

                    prediction_path = f'{all_models_base_path}/{pollutant}/predictions/{prefix}_prediction_month_{m}_{sample_kind}_{train_mode}.csv'
                    predicted_by_month[m].to_csv(prediction_path)
