# Data_download_MILANO_hourly_meteo

In [2]:
#----------------Utils--------------------------
import pandas as pd
import plotly.graph_objects as go
import numpy as np
import requests


In [3]:
def show_plot(df, x_col, y_col, name="Unnamed Plot", additional_traces=[]):
    if type(df) != list:
        df = [df]
        x_col = [x_col]
        y_col = [y_col]
        name = [name]
    
    fig = go.Figure()
    for i, single_df in enumerate(df):
        x = single_df[x_col[i]]
        y = single_df[y_col[i]]
        fig_name = name[i]
        fig.add_trace(go.Scatter(x=x, y=y, mode='lines+markers',name=fig_name))
    
    if len(additional_traces) > 0:
        for trace in additional_traces:
            fig.add_trace(trace)
    fig.show()
    
def import_df(path, date_format="%Y-%m-%dT%H:%M:%S"):
    df = pd.read_csv(path, index_col = 0)
    if date_format is not None:
        df['date'] = pd.to_datetime(df['date'],  format=date_format)
    return df


#Remove Outliers function
#This remove a point if it exceeds +-3 std deviations in a window of "window" observations 
# at column "value_column"
def filter_outliers(input_df, sensor_list, value_column='value'):
    filtered_df = pd.DataFrame()
    for sensor in sensor_list:
        df = input_df.copy()
        df = df.loc[df['sensor_id'] == sensor]
        #iterate all the df
        df['mean']= df[value_column].mean()
        df['std'] = df[value_column].std()
        df
        #filter setup
        df = df[(df[value_column] <= df['mean']+3*df['std']) & (df[value_column] >= df['mean']-3*df['std'])]
        
        filtered_df = pd.concat([filtered_df, df])
    
    filtered_df = filtered_df.drop(["mean", "std"], axis=1)
    return filtered_df

In [23]:
#GLOBAL VARIABLES
headers = {'Accept': 'application/json'}

milano_data_basepath = "http://localhost:8000/api/collections/"
milano_sensor_base_path = "http://localhost:8000/api/collections/meteosensor/items?"

download = True

date_format = '%Y-%m-%dT%H:%M:%S'

meteo_variables = ["Temperature", "Wind Velocity", "Wind Direction", "Humidity", "Global Radiation", "Precipitation"]

#build the meteo datasets object with the value column of each dataset and the csv name
meteo_datasets = {
    "Temperature": {"collection_name": "temperature", "csv_name": "temperature"},
    "Precipitation": {"collection_name": "precipitation", "csv_name": "precipitation"},
    "Wind Direction": {"collection_name": "winddirection", "csv_name": "wind_direction"},
    "Wind Velocity": {"collection_name": "windvelocity", "csv_name": "wind_velocity"},
    "Global Radiation": {"collection_name": "solarradiation", "csv_name": "radiation"},
    "Humidity": {"collection_name": "humidity", "csv_name": "humidity"},
}

stations_data = []
#fetch the paths to be called to the HARMONIA API
for variable in meteo_variables:
    variable_collection = meteo_datasets[variable]["collection_name"]
    print(f"fetching paths for {variable}")
    sensor_req = requests.get(
        f'{milano_sensor_base_path}sensor_type={variable}&province=MI', 
        headers=headers
    )
    print(f'{milano_sensor_base_path}sensor_type={variable}&province=MI')
    sensors_list = sensor_req.json()
    print(sensors_list)
    variable_stations = list(map(lambda a: a['sensor_id'], sensors_list))
    variable_paths = []
    for station in variable_stations:
        variable_paths.append(f"{milano_data_basepath}{variable_collection}/items?sensor_id={station}&datetime=2022-01-01/2023-12-31&limit=1000000&f=geojson&skipGeometry=true")

    meteo_datasets[variable]['paths'] = variable_paths.copy()
    meteo_datasets[variable]['sensors'] = variable_stations.copy()
    stations_data = stations_data + sensors_list.copy()
    print(f"END fetching paths for {variable}")
    print(f"----------------------------------------")

#extract information of the meteo stations from the sensors list.
# this data is then merged with the meteo observations to complement the list.
#drop_cols = ['nomestazione', 'datastart', 'storico', 'cgb_nord', 'cgb_est', 'location', ':@computed_region_6hky_swhk', ':@computed_region_ttgh_9sm5', 'datastop', 'wkb_geometry']
stations_data_df = pd.DataFrame(stations_data)
#stations_data_df = stations_data_df.drop(columns=drop_cols)


fetching paths for Temperature
http://localhost:8000/api/collections/meteosensor/items?sensor_type=Temperature&province=MI
[{'sensor_type': 'Temperature', 'measurement_unit': '°C', 'station_id': 100, 'station_name': 'Milano Lambrate', 'altitude': 120, 'province': 'MI', 'is_historical': False, 'date_start': '2001-06-29', 'date_stop': None, 'utm_north': '5038171.0000', 'utm_east': '520120.0000', 'latitude': '45.4968', 'longitude': '9.2575', 'id': 2001, 'sensor_id': 2001}, {'sensor_type': 'Temperature', 'measurement_unit': '°C', 'station_id': 102, 'station_name': 'San Colombano al Lambro', 'altitude': 140, 'province': 'MI', 'is_historical': False, 'date_start': '2001-06-27', 'date_stop': None, 'utm_north': '5002165.0000', 'utm_east': '538425.0000', 'latitude': '45.1719', 'longitude': '9.4890', 'id': 2039, 'sensor_id': 2039}, {'sensor_type': 'Temperature', 'measurement_unit': '°C', 'station_id': 104, 'station_name': 'Segrate Milano Due', 'altitude': 120, 'province': 'MI', 'is_historical': 

In [22]:
float(stations_data_df.loc[stations_data_df["sensor_id"] == 2001]["latitude"].values[0])

45.4968

In [29]:
#request data from the HARMONIA API
headers = {'Accept': 'application/json'}

for variable in meteo_variables:
    print(f"fetching data for {variable}")
    variable_data = []

    for variable_path in meteo_datasets[variable]['paths']:
        print(f'fetching {variable_path}')
        r = requests.get(variable_path, headers=headers)
        req_data = r.json()
        req_features = req_data["features"]

        if len(req_features) > 0:
            sensor_id = req_features[0]["properties"]["sensor_id"]
            lat = float(stations_data_df.loc[stations_data_df["sensor_id"] == sensor_id]["latitude"].values[0])
            lng = float(stations_data_df.loc[stations_data_df["sensor_id"] == sensor_id]["longitude"].values[0])

            req_list = list(map(lambda element: {
                #"id": element["id"],
                "date": element["properties"]["date"],
                "sensor_id": element["properties"]["sensor_id"],
                "value": float(element["properties"]["value"]),
                "lat": lat,
                "lng": lng
            }, req_features))
            variable_data += req_list
            
    print(f"END fetching data for {variable}")

    print(f'Building DF for {variable}')
    df = pd.DataFrame(variable_data)
    meteo_datasets[variable]['raw'] = df.copy()
    df = df.replace(-9999, np.nan).dropna(subset=['value'])
    df['date'] = pd.to_datetime(df['date'],  format=date_format)
    df = filter_outliers(df, meteo_datasets[variable]['sensors'], 'value')
    df = df.sort_values(by='date')
    meteo_datasets[variable]['df'] = df.copy()
    print(f'END Building DF for {variable}')
    print(f"----------------------------------------")


fetching data for Temperature
fetching http://localhost:8000/api/collections/temperature/items?sensor_id=2001&datetime=2022-01-01/2023-12-31&limit=1000000&f=geojson&skipGeometry=true
fetching http://localhost:8000/api/collections/temperature/items?sensor_id=2039&datetime=2022-01-01/2023-12-31&limit=1000000&f=geojson&skipGeometry=true
fetching http://localhost:8000/api/collections/temperature/items?sensor_id=2063&datetime=2022-01-01/2023-12-31&limit=1000000&f=geojson&skipGeometry=true
fetching http://localhost:8000/api/collections/temperature/items?sensor_id=4001&datetime=2022-01-01/2023-12-31&limit=1000000&f=geojson&skipGeometry=true
fetching http://localhost:8000/api/collections/temperature/items?sensor_id=4058&datetime=2022-01-01/2023-12-31&limit=1000000&f=geojson&skipGeometry=true
fetching http://localhost:8000/api/collections/temperature/items?sensor_id=4066&datetime=2022-01-01/2023-12-31&limit=1000000&f=geojson&skipGeometry=true
fetching http://localhost:8000/api/collections/tempe

In [35]:
meteo_datasets['Precipitation']['df'].rename(columns={'sensor_id': 'sensorID'})

Unnamed: 0,date,sensorID,value,lat,lng
725850,2022-01-01,5918,0.0,45.6137,9.5081
743270,2022-01-01,9341,0.0,45.4761,9.1418
625778,2022-01-01,5916,0.0,45.4726,9.3535
1368668,2022-01-01,30536,0.0,45.6078,8.9529
520864,2022-01-01,5908,0.0,45.4732,9.2223
...,...,...,...,...,...
952671,2023-12-31,14121,0.0,45.4901,9.1944
416806,2023-12-31,4112,0.0,45.5485,8.8473
847781,2023-12-31,9341,0.0,45.4761,9.1418
1263755,2023-12-31,19373,0.0,45.4717,9.1891


In [44]:
import_df('../data/milano_meteo_data/precipitation.csv',1998,2023)

Unnamed: 0,sensorID,lat,lng,date,value
0,5908,45.473226,9.222315,1998-01-01 00:00:00,0.4
1,5908,45.473226,9.222315,1998-01-01 02:00:00,0.2
2,5908,45.473226,9.222315,1998-01-01 03:00:00,0.0
3,5908,45.473226,9.222315,1998-01-01 04:00:00,0.2
4,5908,45.473226,9.222315,1998-01-01 05:00:00,0.0
...,...,...,...,...,...
3826265,14121,45.490100,9.194400,2023-12-31 00:00:00,0.0
3826266,4112,45.548500,8.847300,2023-12-31 00:00:00,0.0
3826267,9341,45.476100,9.141800,2023-12-31 00:00:00,0.0
3826268,19373,45.471700,9.189100,2023-12-31 00:00:00,0.0


In [8]:
stations_data_df

Unnamed: 0,idsensore,tipologia,unit_dimisura,idstazione,quota,provincia,lng,lat
0,12757,Temperatura,°C,1511,215,MI,8.952897,45.607845
1,12759,Temperatura,°C,1512,160,MI,9.005200,45.535577
2,14742,Temperatura,°C,1546,143,MI,9.087923,45.517811
3,17488,Temperatura,°C,1874,137,MI,8.854409,45.458065
4,2001,Temperatura,°C,100,120,MI,9.257515,45.496780
...,...,...,...,...,...,...,...,...
93,6180,Umidità Relativa,%,535,97,MI,9.134517,45.324517
94,6183,Umidità Relativa,%,614,112,MI,9.353497,45.472580
95,6184,Umidità Relativa,%,513,199,MI,9.508122,45.613692
96,6185,Umidità Relativa,%,503,122,MI,9.141786,45.476063


In [32]:
def import_df(path, start_year, end_year, date_format="%Y-%m-%d", date_column="date"):
    df = pd.read_csv(path, index_col = 0)
    if date_format is not None and date_column is not None:
        df[date_column] = pd.to_datetime(df[date_column],  format=date_format)
    df = df.loc[
        (df['date'].dt.year >= start_year) &
        (df['date'].dt.year <= end_year)
    ]
    return df

In [39]:
csv_base_path = '../data/milano_meteo_data'
for variable in meteo_variables:
    name = meteo_datasets[variable]['csv_name']
    legacy = import_df(f'{csv_base_path}/{name}.csv',1998,2021).drop(columns=['type','unit','province','altitude','stationID'])
    new = pd.concat([legacy, meteo_datasets[variable]['df'].rename(columns={'sensor_id': 'sensorID'})])
    new = new.reset_index(drop=True)
    new.to_csv(f'{csv_base_path}/{name}.csv')

In [9]:
# Building the process-ready datasets
#datasets -> meteo_datasets[variable]['df']
#stations -> stations_data_df
#meteo variables -> meteo_variables

for variable in meteo_variables:
    print(f'Building data-ready dataset for {variable}')
    var_station_data = stations_data_df.loc[stations_data_df['tipologia'] == variable]
    var_df = meteo_datasets[variable]['df'].copy()

    #common columns to drop
    columns_to_drop = ['cgb_nord', 'cgb_est', 'massimo', 'valore_medio_giornaliero', 'minimo_valore_medio_orario', 'massimo_valore_medio_orario']
    var_df = var_df.drop(columns=columns_to_drop)
    var_df = var_df.rename(columns={"sensor_id": "idsensore"})

    #Specific column to drop for precitipation and rename the data column to Valore
    if variable == "Precipitazione":
        var_df = var_df.drop(columns=['medio'])
        var_df = var_df.rename(columns={"valore_cumulato": "valore"})
    else:
        var_df = var_df.drop(columns=['valore_cumulato'])
        var_df = var_df.rename(columns={"medio": "valore"})

    #merge the data with the station information
    var_station_data_merge = var_station_data[['idsensore', 'tipologia', 'unit_dimisura', 'idstazione', 'quota', 'provincia', 'lng', 'lat']]
    var_df = var_df.merge(var_station_data_merge, on=['idsensore', 'lat', 'lng'])

    #rename columns
    column_rename = {
        "idsensore": "sensorID",
        "date": "date",
        "valore": "value",
        "tipologia": "type",
        "unit_dimisura": "unit",
        "idstazione": "stationID",
        "quota": "altitude",
        "provincia": "province"
    }
    var_df = var_df.rename(columns=column_rename)

    meteo_datasets[variable]['df'] = var_df.copy()
    print(f"saving to file {meteo_datasets[variable]['csv_name']}.csv")
    meteo_datasets[variable]['df'].to_csv(f'../data/milano_meteo_data/{meteo_datasets[variable]["csv_name"]}.csv')


Building data-ready dataset for Temperatura
saving to file AAA_READY_temperature
Building data-ready dataset for Precipitazione
saving to file AAA_READY_precipitation
Building data-ready dataset for Direzione Vento
saving to file AAA_READY_wind_direction
Building data-ready dataset for Velocità Vento
saving to file AAA_READY_wind_velocity
Building data-ready dataset for Livello Idrometrico
saving to file AAA_READY_hydrometric_level
Building data-ready dataset for Radiazione Globale
saving to file AAA_READY_radiation
Building data-ready dataset for Umidità Relativa
saving to file AAA_READY_humidity


In [7]:
a = import_df(f'../data/milano_meteo_data/temperature.csv')