# Data_download_MILANO_hourly_pollutants

In [1]:
#----------------Utils--------------------------
import pandas as pd
import plotly
import plotly.graph_objects as go
import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np
import requests

#----------------Interpolation--------------------------
from shapely.geometry import box
from scipy.interpolate import griddata, interpn

from scipy.interpolate import Rbf
from scipy.interpolate import RBFInterpolator
import pykrige

#----------------Machine Learning--------------------------
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Dense
from sklearn.preprocessing import MinMaxScaler
from math import sqrt
from sklearn.metrics import mean_squared_error, mean_absolute_error
pd.set_option('display.max_columns', None)


#------------------Export raster-----------------------------
import rasterio
from rasterio.features import rasterize
from rasterio.transform import from_origin

2023-11-06 14:44:38.345182: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-11-06 14:44:38.346336: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-11-06 14:44:38.368465: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-11-06 14:44:38.368944: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
#Remove Outliers function
#This removes a point if it lies more than 3 standard deviations away from the
# rolling mean computed over a centered window of "window" observations
# at column "value_column"
def filter_outliers_by_sensor(input_df, window, sensor_list, value_column='value'):
    """Drop rolling 3-sigma outliers from ``input_df``, one sensor at a time.

    For every sensor in ``sensor_list`` the rows whose ``sensor_id`` equals that
    sensor are selected, and a centered rolling mean and standard deviation of
    ``value_column`` are computed over them (``min_periods=1``). A row is kept
    only when its value lies within ``mean +/- 3 * std`` of its own window.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Must contain a ``sensor_id`` column and ``value_column``. It is not
        modified. The rolling window runs over rows in their current order,
        so sort the frame beforehand if chronological windows are wanted.
    window : int or str
        Window size forwarded to ``Series.rolling``. An integer counts
        observations (rows), not a time span.
    sensor_list : iterable
        Sensor ids to process. Sensors not listed are left out of the result.
    value_column : str, default 'value'
        Name of the column holding the measurements.

    Returns
    -------
    pandas.DataFrame
        Kept rows with the same columns as ``input_df``, grouped by sensor in
        the order of ``sensor_list`` (not re-sorted). An empty frame with the
        input columns is returned when ``sensor_list`` is empty.

    Notes
    -----
    When a window holds a single observation the standard deviation is NaN,
    both comparisons evaluate to False and the row is dropped. A sensor with
    only one reading is therefore removed entirely.
    """
    kept_frames = []
    for sensor in sensor_list:
        # Boolean selection already yields a new object, so the input frame
        # does not have to be copied for every sensor.
        sensor_df = input_df.loc[input_df['sensor_id'] == sensor]
        values = sensor_df[value_column]

        rolling = values.rolling(window, center=True, min_periods=1)
        rolling_mean = rolling.mean()
        rolling_std = rolling.std()

        # Keep the statistics in standalone Series instead of temporary
        # 'mean'/'std' columns, so same-named input columns are never clobbered.
        within_band = (
            (values <= rolling_mean + 3 * rolling_std)
            & (values >= rolling_mean - 3 * rolling_std)
        )
        kept_frames.append(sensor_df[within_band])

    if not kept_frames:
        # Nothing to concatenate: return an empty frame that still carries
        # the input schema instead of failing.
        return input_df.iloc[0:0].copy()

    # Concatenate once at the end; growing a frame inside the loop is quadratic.
    return pd.concat(kept_frames)

In [4]:
#global variables
# Per-pollutant settings, keyed by the short pollutant name:
#   value_column - dataframe column that holds the measurement
#   csv_name     - suffix used in the exported CSV file name
#   sensor_type  - percent-encoded sensor type name used to query the stations endpoint
pollution_datasets = {
    name: {"value_column": "value", "csv_name": name, "sensor_type": sensor_type}
    for name, sensor_type in {
        "pm10": "PM10%20(SM2005)",
        "pm25": "Particelle%20sospese%20PM2.5",
        "o3": "Ozono",
        "no2": "Biossido%20di%20Azoto",
        "so2": "Biossido%20di%20Zolfo",
    }.items()
}

# Pollutant keys in processing order (same order as the mapping above).
polution_variables = list(pollution_datasets)

# Timestamp layout used when parsing the 'date' field of the downloaded records.
date_format = "%Y-%m-%dT%H:%M:%S"

In [11]:
#DONT RUN IF NOT NECESSARY. IT TAKES TIME TO GET THE ENTIRE DATASET
#MILANO
# For every pollutant, ask the stations endpoint for the sensors of that type in
# province MI and store, in pollution_datasets[variable]:
#   'sensors' - the list of sensor ids ('idsensore' of each returned record)
#   'paths'   - one data-request URL per sensor id
headers = {'Accept': 'application/json'}
milano_data_basepath = "http://api.harmonia.info.uvt.ro/airquality_data_view?"
milano_sensor_base_path = "http://api.harmonia.info.uvt.ro/airquality_stations?"

for variable in polution_variables:
    pollution_dataset = pollution_datasets[variable]
    sensor_type = pollution_dataset['sensor_type']
    # sensor_type is already percent-encoded, so it is placed in the URL as-is.
    sensor_path = f'{milano_sensor_base_path}nometiposensore=eq.{sensor_type}&provincia=eq.MI'
    # Alternative query without the province filter:
    #sensor_path = f'{milano_sensor_base_path}nometiposensore=eq.{sensor_type}'
    print(sensor_path)

    sensor_req = requests.get(sensor_path, headers=headers)
    # Stop on HTTP errors instead of treating an error payload as a station list.
    sensor_req.raise_for_status()
    sensors_list = sensor_req.json()

    variable_stations = [sensor['idsensore'] for sensor in sensors_list]
    variable_paths = [
        f"{milano_data_basepath}sensor_id=eq.{station}"
        for station in variable_stations
    ]

    # Both lists are created fresh on every iteration, so they can be stored directly.
    pollution_dataset['paths'] = variable_paths
    pollution_dataset['sensors'] = variable_stations


http://api.harmonia.info.uvt.ro/airquality_stations?nometiposensore=eq.PM10%20(SM2005)&provincia=eq.MI
http://api.harmonia.info.uvt.ro/airquality_stations?nometiposensore=eq.Particelle%20sospese%20PM2.5&provincia=eq.MI
http://api.harmonia.info.uvt.ro/airquality_stations?nometiposensore=eq.Ozono&provincia=eq.MI
http://api.harmonia.info.uvt.ro/airquality_stations?nometiposensore=eq.Biossido%20di%20Azoto&provincia=eq.MI
http://api.harmonia.info.uvt.ro/airquality_stations?nometiposensore=eq.Biossido%20di%20Zolfo&provincia=eq.MI


In [12]:
#request data from the HARMONIA API
# For every pollutant: download the records of each sensor URL, build a single
# dataframe, clean it, and store the result in pollution_datasets[variable]:
#   'raw' - dataframe of the records exactly as downloaded
#   'df'  - cleaned dataframe (coordinates dropped, missing values and rolling
#           outliers removed, sorted by date)
headers = {'Accept': 'application/json'}

# Rolling window handed to filter_outliers_by_sensor. It reaches pandas' rolling()
# as an integer, so it counts observations per sensor, NOT days.
# NOTE(review): if the series is hourly this is ~30 hours; confirm the intended span.
window = 30

for variable in polution_variables:
    pollution_dataset = pollution_datasets[variable]
    value_column = pollution_dataset['value_column']

    print(f"fetching data for {variable}")
    variable_data = []

    for variable_path in pollution_dataset['paths']:
        print(f'fetching {variable_path}')
        r = requests.get(variable_path, headers=headers)
        # Fail fast on HTTP errors: a JSON error object would otherwise be merged
        # into the record list (extending a list with a dict adds its keys).
        r.raise_for_status()
        req_data = r.json()
        variable_data.extend(req_data)

    print(f"END fetching data for {variable}")
    print(f"Building dataframe for {variable}")
    raw_df = pd.DataFrame(variable_data)
    # Every step below returns a new frame, so the raw frame can be stored as-is.
    pollution_dataset['raw'] = raw_df

    df = raw_df.drop(["utm_nord", "utm_est"], axis=1)
    # -999.0 is turned into NaN (presumably the API's missing-value marker), then
    # rows without a measurement are discarded.
    df = df.replace(-999.0, np.nan).dropna(subset=[value_column])
    df['date'] = pd.to_datetime(df['date'],  format=date_format)
    # Sort first so that each sensor's rolling window runs in chronological order.
    df = df.sort_values(by='date')
    df = filter_outliers_by_sensor(
        df,
        window,
        pollution_dataset['sensors'],
        value_column
    )
    # The filter returns rows grouped by sensor; restore the global date order.
    df = df.sort_values(by='date')
    pollution_dataset['df'] = df
    print(f"END Building dataframe for {variable}")
    
    

fetching data for pm10
fetching http://api.harmonia.info.uvt.ro/airquality_data_view?sensor_id=eq.10273
fetching http://api.harmonia.info.uvt.ro/airquality_data_view?sensor_id=eq.10320
fetching http://api.harmonia.info.uvt.ro/airquality_data_view?sensor_id=eq.10352
fetching http://api.harmonia.info.uvt.ro/airquality_data_view?sensor_id=eq.10354
fetching http://api.harmonia.info.uvt.ro/airquality_data_view?sensor_id=eq.20428
fetching http://api.harmonia.info.uvt.ro/airquality_data_view?sensor_id=eq.6905
fetching http://api.harmonia.info.uvt.ro/airquality_data_view?sensor_id=eq.6907
fetching http://api.harmonia.info.uvt.ro/airquality_data_view?sensor_id=eq.6908
fetching http://api.harmonia.info.uvt.ro/airquality_data_view?sensor_id=eq.6909
fetching http://api.harmonia.info.uvt.ro/airquality_data_view?sensor_id=eq.6912
fetching http://api.harmonia.info.uvt.ro/airquality_data_view?sensor_id=eq.6956
fetching http://api.harmonia.info.uvt.ro/airquality_data_view?sensor_id=eq.9963
fetching htt

fetching http://api.harmonia.info.uvt.ro/airquality_data_view?sensor_id=eq.5643
fetching http://api.harmonia.info.uvt.ro/airquality_data_view?sensor_id=eq.5644
fetching http://api.harmonia.info.uvt.ro/airquality_data_view?sensor_id=eq.5646
fetching http://api.harmonia.info.uvt.ro/airquality_data_view?sensor_id=eq.5647
fetching http://api.harmonia.info.uvt.ro/airquality_data_view?sensor_id=eq.5650
fetching http://api.harmonia.info.uvt.ro/airquality_data_view?sensor_id=eq.5651
fetching http://api.harmonia.info.uvt.ro/airquality_data_view?sensor_id=eq.5694
fetching http://api.harmonia.info.uvt.ro/airquality_data_view?sensor_id=eq.5696
END fetching data for so2
Building dataframe for so2
END Building dataframe for so2


In [13]:
# Output folder for the cleaned per-pollutant CSV files.
# The alternative folder below is kept commented out so that it is not overwritten by mistake.
#csv_base_path = f'../../data/lombardy_pollutants_data'
csv_base_path = '../data/milano_pollutant_data'

# Write one file per pollutant, named MI_<csv_name>.csv, from the cleaned dataframe.
for variable in polution_variables:
    dataset = pollution_datasets[variable]
    csv_path = f'{csv_base_path}/MI_{dataset["csv_name"]}.csv'
    dataset['df'].to_csv(csv_path)
