In [None]:
# load packages
!pip install mapie
!pip install shap
import json
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import r2_score, mean_absolute_percentage_error as mape, mean_squared_error as mse
from pyarrow import feather as pq
import geopandas as gpd
import folium
from folium import Marker
from shapely import geometry
from tqdm import tqdm
pd.set_option('display.max_columns', None)
from ipywidgets import interact
import scipy
from tqdm import tqdm
from sklearn.model_selection import GroupKFold, GridSearchCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, RidgeCV
import mapie
from mapie import regression
from mapie.metrics import regression_coverage_score, regression_mean_width_score
from mapie.regression import MapieRegressor
from mapie.quantile_regression import MapieQuantileRegressor
from sklearn.ensemble import GradientBoostingRegressor
import shap
# Feature Importance
from sklearn.inspection import PartialDependenceDisplay
from sklearn.inspection import permutation_importance
from sklearn.model_selection import TimeSeriesSplit
# scaler for Ridge
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RandomizedSearchCV

In [None]:
# Mount Google Drive so files under /content/gdrive are reachable.
# This import only exists inside a Google Colab runtime.
from google.colab import drive
drive.mount("/content/gdrive")
# Read the converted data set (used for the monthly performance analysis)
# into the global `data` frame that every later cell works on.
data = pd.read_csv("/content/gdrive/MyDrive/Aurora_Thesis/data_converted.csv")

In [None]:
# Parse timestamps and move the current index into a regular column.
data["time"] = pd.to_datetime(data["time"])
data = data.reset_index()

# Two stations have their station_id assigned by hand.
manual_station_ids = {
    "Bologna (BO)": "ID1999",
    "San Pietro Capofiume (SPC)": "ID1998",
}
for station_label, manual_id in manual_station_ids.items():
    data.loc[data["station"] == station_label, "station_id"] = manual_id

# For London, replace total OA by the sum of its three PMF components.
data["OAtot_2"] = data["HOA_PMF"] + data["BBOA_PMF"] + data["OOAtot_PMF"]
is_london = data["station"] == "London"
data.loc[is_london, "OAtot_PMF"] = data.loc[is_london, "OAtot_2"]

# Drop the Zurich observations of 2017; every other row is kept.
is_zurich_2017 = (data["station"] == "Zurich") & (data["year"] == 2017)
data = data.loc[~is_zurich_2017, :]

In [None]:
# CLEAN DATA
# Keep only observations with total OA of at least 0.1.
data = data[data["OAtot_PMF"] >= 0.1]

# Keep only stations with more than 30 remaining observations.
# `select` holds one row per station_id with a boolean flag in column "OAtot_PMF".
obs_per_station = data.groupby("station_id")["OAtot_PMF"].size()
select = (obs_per_station > 30).reset_index()

# Attach the flag to every row (it arrives as "OAtot_PMFkeep"), filter on it,
# and turn station_id back into a regular column.
data = (
    data.set_index("station_id")
    .join(select.set_index("station_id"), rsuffix="keep")
    .loc[lambda joined: joined["OAtot_PMFkeep"] == True, :]
    .reset_index()
)

In [None]:
# Day of the week derived from the timestamp (pandas convention: Monday = 0).
data["day_week"] = data["time"].dt.dayofweek

In [None]:
# feature engineering
# Road-class variables: difference between the 1000 and the 100 variant.
for road_class in (1, 2, 3):
    data[f"rc_{road_class}_1000-rc_{road_class}_100"] = (
        data[f"road_class_{road_class}_1000"] - data[f"road_class_{road_class}_100"]
    )

# Share of each CAMx OA component in the CAMx total.
for component in ("HOA", "BBOA", "OOAtot"):
    data[f"p_{component}"] = data[f"{component}_CAMX"] / data["OAtot_CAMX"]

# Decorrelate the land-use variables: keep the 500 variant as is and add the
# increment between the 1000 and the 500 variant as a separate feature.
land_use_classes = [
    "agriculture", "airports", "barren", "industrial", "industrial_transport",
    "natural_green", "ports", "roads_rails", "snow_ice", "transport",
    "urban_fabric", "urban_green", "water", "wetlands",
]
for land_use in land_use_classes:
    data[f"diff_{land_use}"] = data[f"{land_use}1000"] - data[f"{land_use}500"]

# Same treatment for population and IMD (note the different column spelling).
data["diff_population"] = data["population_1000"] - data["population_500"]
data["diff_imd"] = data["imd1000"] - data["imd500"]

In [None]:
# Keep rows with an observed OA value and order them chronologically;
# the per-station train/test split below relies on this ordering.
data = data[data["OAtot_PMF"].notnull()].sort_values(by="time")

# Target frame, indexed by time.
target_columns = ["time", "station_id", "station", "OAtot_PMF"]
Y = data[target_columns].set_index("time")

# Design frame, indexed by time: station identifiers, the three CAMx OA
# components, calendar fields and the CAMx meteorology.
design_columns = [
    "time", "station", "station_id",
    "HOA_CAMX", "BBOA_CAMX", "OOAtot_CAMX",
    "year", "month", "day_week",
    "temp_CAMX", "rh_CAMX", "press_CAMX", "ws_CAMX", "wd_CAMX",
    "pblh_CAMX", "wind_x_CAMX", "wind_y_CAMX",
]
X = data[design_columns].set_index("time")

In [None]:
# down-scale at station
def down_scale(station_name:str):
    """Fit per-station down-scaling models, print their test MSEs and plot predictions.

    Rows of the global ``X`` / ``Y`` frames whose ``station`` equals
    ``station_name`` are split in their existing order: the first 75% are used
    for training, the rest for testing. A random forest, a gradient boosting
    model and a ridge regression are tuned with a 5-fold ``TimeSeriesSplit`` and
    evaluated twice: once with the three CAMx OA components among the features
    and once without them. The raw CAMx total (HOA + BBOA + OOAtot) is the
    baseline in both cases.

    Parameters
    ----------
    station_name : str
        Value of the ``station`` column selecting the rows to model.

    Returns
    -------
    None
        MSEs are printed and two matplotlib figures are drawn.
    """
    # get station
    X_station = X.loc[X.station == station_name, :]
    Y_station = Y.loc[Y.station == station_name, :]
    # train and test split (as in training data proportion)
    split = 0.75
    index_train = int(np.floor(len(X_station) * split))
    Y_train_station = Y_station.iloc[:index_train, :]
    Y_test_station = Y_station.iloc[index_train:, :]
    X_train_station = X_station.iloc[:index_train, :]
    X_test_station = X_station.iloc[index_train:, :]
    y_train = Y_train_station.OAtot_PMF
    y_test = Y_test_station.OAtot_PMF

    # grid search for time series
    tscv = TimeSeriesSplit(n_splits = 5)
    param_rf = {"max_features":["sqrt","log2"]}
    param_boost = {"max_depth":[3,5,10],  "learning_rate":[0.001, 0.01,0.1]} # also 0.001 for monthly

    def fit_and_predict(covars):
        """Tune RF, boosting and ridge on `covars`; return their test predictions."""
        features_train = X_train_station.loc[:, covars]
        features_test = X_test_station.loc[:, covars]
        # RF
        clf = GridSearchCV(RandomForestRegressor(random_state=99),
                           param_grid = param_rf, cv = tscv)
        clf.fit(features_train, y_train)
        # BOOSTING
        clf2 = GridSearchCV(GradientBoostingRegressor(random_state=0),
                            param_grid = param_boost, cv = tscv)
        clf2.fit(features_train, y_train)
        # RIDGE: the penalty is scale-sensitive, so standardise with statistics
        # from the training rows and apply the same transform to the test rows.
        scaler = StandardScaler()
        scaler.fit(features_train)
        lm = RidgeCV(cv = tscv)
        lm.fit(pd.DataFrame(scaler.transform(features_train)), y_train)
        return (
            clf.predict(features_test),
            clf2.predict(features_test),
            lm.predict(pd.DataFrame(scaler.transform(features_test))),
        )

    # baseline: CAMx total OA as the sum of its three components
    camx_pred = X_test_station.HOA_CAMX + X_test_station.BBOA_CAMX +  X_test_station.OOAtot_CAMX
    mse_camx = mse(y_test, camx_pred)

    # ---- models with CAMx's OA components as input ----
    covars = ["HOA_CAMX", "BBOA_CAMX", "OOAtot_CAMX", "year", "month","day_week",
    "temp_CAMX", "rh_CAMX", "press_CAMX", "ws_CAMX", "wd_CAMX", "pblh_CAMX", "wind_x_CAMX", "wind_y_CAMX"]
    rf_pred, boost_pred, lm_pred = fit_and_predict(covars)

    # plot resulting predictions against time
    test_times = Y_test_station.reset_index().time
    plt.figure(figsize=(16,9))
    plt.plot(y_test, label = "Truth")
    plt.plot(test_times, rf_pred, label = "rf")
    plt.plot(test_times, camx_pred,label = "CAMx")
    plt.title(station_name)
    plt.legend(loc='upper center', bbox_to_anchor=(0.5, -0.1),
            fancybox=True, shadow=True, ncol=5)

    # print mse
    print("mse rf", mse(y_test, rf_pred))
    print("mse boost", mse(y_test, boost_pred))
    print("mse lm", mse(y_test, lm_pred))
    print("mse CAMx", mse_camx)

    # ---- same but without CAMx's OA components ----
    covars = ["year", "month","day_week",
    "temp_CAMX", "rh_CAMX", "press_CAMX", "ws_CAMX", "wd_CAMX", "pblh_CAMX", "wind_x_CAMX", "wind_y_CAMX"]
    rf_pred, boost_pred, lm_pred = fit_and_predict(covars)

    # print mse
    print("mse rf", mse(y_test, rf_pred))
    print("mse boost", mse(y_test, boost_pred))
    print("mse lm", mse(y_test, lm_pred))
    print("mse CAMx", mse_camx)

    # plot resulting predictions against the position within the test period
    test_positions = Y_test_station.reset_index(drop=True).index
    plt.figure(figsize=(16,9))
    sns.set_style("whitegrid")
    plt.plot(test_positions, y_test, label = "OA",color = "blue")
    plt.plot(test_positions, rf_pred, label = "Predicted OA", color = "green")
    plt.plot(test_positions, lm_pred, label ="linear")
    plt.plot(test_positions, camx_pred,label = "CAMx", color = "orange")
    plt.title(station_name, fontsize = 24)
    plt.ylabel("OA", fontsize = 24)
    plt.yticks(fontsize = 20)
    plt.xticks(fontsize = 20)
    plt.legend(loc='upper center', bbox_to_anchor=(0.5, -0.1),
            fancybox=True, shadow=True, ncol=5, fontsize = 20)



In [None]:
# Interactive dropdown over all station names; each selection calls the
# plotting version of down_scale defined above with that name.
interact(down_scale, station_name = X.station.unique())

In [None]:
# target metric of choice
def my_mape(Y_true, Y_pred):
    """Mean absolute error relative to ``Y_true + 1``.

    The denominator is shifted by one, so this is not the textbook MAPE:
    observations close to zero do not inflate the score.
    """
    shifted_relative_error = np.abs((Y_true - Y_pred) / (Y_true + 1))
    return shifted_relative_error.mean()

In [None]:
# down-scale at station (function to get results--> NO PLOTS)
def down_scale(station_name:str):
    """Fit per-station down-scaling models and return their test errors (no plots).

    Rows of the global ``X`` / ``Y`` frames whose ``station_id`` equals
    ``station_name`` are split in their existing order: the first 75% are used
    for training, the rest for testing. A random forest, a gradient boosting
    model and a ridge regression are tuned with a 5-fold ``TimeSeriesSplit``,
    once with the three CAMx OA components among the features ("full") and once
    without them ("part"). The raw CAMx total (HOA + BBOA + OOAtot) is the
    baseline.

    Parameters
    ----------
    station_name : str
        Despite the name, this is matched against the ``station_id`` column.

    Returns
    -------
    tuple
        ``(mse_full_rf, mape_full_rf, mse_full_boost, mape_full_boost,
        mse_full_lm, mape_full_lm, mse_full_camx, mape_full_camx,
        mse_part_rf, mape_part_rf, mse_part_boost, mape_part_boost,
        mse_part_lm, mape_part_lm, n_test)`` where "mape" is ``my_mape`` and
        ``n_test`` is the number of test rows.
    """
    # get station
    X_station = X.loc[X.station_id == station_name, :]
    Y_station = Y.loc[Y.station_id == station_name, :]
    # train and test split (as in training data proportion)
    split = 0.75
    index_train = int(np.floor(len(X_station) * split))
    Y_train_station = Y_station.iloc[:index_train, :]
    Y_test_station = Y_station.iloc[index_train:, :]
    X_train_station = X_station.iloc[:index_train, :]
    X_test_station = X_station.iloc[index_train:, :]
    y_train = Y_train_station.OAtot_PMF
    y_test = Y_test_station.OAtot_PMF

    # Hyper-parameter search settings. The same time-ordered CV is used for both
    # feature sets so that tuning never trains on folds that lie after the
    # validation fold and the full/part results are comparable.
    tscv = TimeSeriesSplit(n_splits = 5)
    param_rf = {"max_features":["sqrt","log2"]}
    param_boost = {"max_depth":[3,5,10],  "learning_rate":[0.001, 0.01,0.1]}

    def score(pred):
        """Return (MSE, my_mape) of `pred` on the station's test rows."""
        return mse(y_test, pred), my_mape(y_test, pred)

    def fit_and_score(covars):
        """Tune RF, boosting and ridge on `covars`; return their (MSE, my_mape) pairs."""
        features_train = X_train_station.loc[:, covars]
        features_test = X_test_station.loc[:, covars]
        # RF
        clf = GridSearchCV(RandomForestRegressor(random_state=99, n_estimators = 500),
                           param_grid= param_rf, cv = tscv)
        clf.fit(features_train, y_train)
        # BOOSTING
        clf2 = GridSearchCV(GradientBoostingRegressor(random_state=0),
                            param_grid= param_boost, cv = tscv)
        clf2.fit(features_train, y_train)
        # RIDGE on standardised features (scaler statistics from training rows only)
        scaler = StandardScaler()
        scaler.fit(features_train)
        lm = RidgeCV(cv = tscv)
        lm.fit(pd.DataFrame(scaler.transform(features_train)), y_train)
        lm_pred = lm.predict(pd.DataFrame(scaler.transform(features_test)))
        return score(clf.predict(features_test)), score(clf2.predict(features_test)), score(lm_pred)

    # models with CAMx's OA components as input
    covars = ["HOA_CAMX", "BBOA_CAMX", "OOAtot_CAMX", "year", "month","day_week",
    "temp_CAMX", "rh_CAMX", "press_CAMX", "ws_CAMX", "wd_CAMX", "pblh_CAMX", "wind_x_CAMX", "wind_y_CAMX"]
    (mse_full_rf, mape_full_rf), (mse_full_boost, mape_full_boost), (mse_full_lm, mape_full_lm) = fit_and_score(covars)

    # baseline: CAMx total OA as the sum of its three components
    camx_pred = X_test_station.HOA_CAMX + X_test_station.BBOA_CAMX +  X_test_station.OOAtot_CAMX
    mse_full_camx, mape_full_camx = score(camx_pred)

    # same but without CAMx's OA components
    covars = ["year", "month","day_week",
    "temp_CAMX", "rh_CAMX", "press_CAMX", "ws_CAMX", "wd_CAMX", "pblh_CAMX", "wind_x_CAMX", "wind_y_CAMX"]
    (mse_part_rf, mape_part_rf), (mse_part_boost, mape_part_boost), (mse_part_lm, mape_part_lm) = fit_and_score(covars)

    n_test = len(Y_test_station)

    return mse_full_rf, mape_full_rf, mse_full_boost, mape_full_boost, mse_full_lm, mape_full_lm, mse_full_camx, mape_full_camx, mse_part_rf, mape_part_rf,mse_part_boost,mape_part_boost, mse_part_lm,mape_part_lm, n_test

In [None]:
# One dict per model/metric, keyed by station_id.
full_rf_ind_mse, full_rf_ind_mape = {}, {}
full_boost_ind_mse, full_boost_ind_mape = {}, {}
full_ridge_ind_mse, full_ridge_ind_mape = {}, {}
camx_mse, camx_mape = {}, {}
part_rf_ind_mse, part_rf_ind_mape = {}, {}
part_boost_ind_mse, part_boost_ind_mape = {}, {}
part_ridge_ind_mse, part_ridge_ind_mape = {}, {}

# Same order as the first 14 values returned by down_scale.
result_stores = [
    full_rf_ind_mse, full_rf_ind_mape,
    full_boost_ind_mse, full_boost_ind_mape,
    full_ridge_ind_mse, full_ridge_ind_mape,
    camx_mse, camx_mape,
    part_rf_ind_mse, part_rf_ind_mape,
    part_boost_ind_mse, part_boost_ind_mape,
    part_ridge_ind_mse, part_ridge_ind_mape,
]

# Fit and evaluate every station; the last returned value is the test-set size.
for stat in tqdm(data.station_id.unique()):
    *station_scores, n_test = down_scale(stat)
    for store, station_score in zip(result_stores, station_scores):
        store[stat] = station_score

100%|██████████| 102/102 [1:06:19<00:00, 39.01s/it]


In [None]:
# Save every per-station result dictionary as "<name>.json" in the working directory.
individual_results = {
    "full_rf_ind_mse": full_rf_ind_mse,
    "full_rf_ind_mape": full_rf_ind_mape,
    "full_boost_ind_mse": full_boost_ind_mse,
    "full_boost_ind_mape": full_boost_ind_mape,
    "full_ridge_ind_mse": full_ridge_ind_mse,
    "full_ridge_ind_mape": full_ridge_ind_mape,
    "camx_mse": camx_mse,
    "camx_mape": camx_mape,
    "part_rf_ind_mse": part_rf_ind_mse,
    "part_rf_ind_mape": part_rf_ind_mape,
    "part_boost_ind_mse": part_boost_ind_mse,
    "part_boost_ind_mape": part_boost_ind_mape,
    "part_ridge_ind_mse": part_ridge_ind_mse,
    "part_ridge_ind_mape": part_ridge_ind_mape,
}
dictionaries = list(individual_results.values())
names = list(individual_results.keys())
for di_name, di in individual_results.items():
    with open(di_name + ".json", "w") as json_file:
        json.dump(di, json_file)

In [None]:
# Unweighted summary: mean and standard deviation over stations, each station
# counting once regardless of how many test observations it has.
summary_rows = [
    ("mse rf full", full_rf_ind_mse),
    ("mse boost full", full_boost_ind_mse),
    ("mse ridge full", full_ridge_ind_mse),
    ("mse camx", camx_mse),
    ("mse rf part", part_rf_ind_mse),
    ("mse boost part", part_boost_ind_mse),
    ("mse ridge part", part_ridge_ind_mse),
    ("mape rf full", full_rf_ind_mape),
    ("mape boost full", full_boost_ind_mape),
    ("mape ridge full", full_ridge_ind_mape),
    ("mape rf part", part_rf_ind_mape),
    ("mape boost part", part_boost_ind_mape),
    ("mape ridge part", part_ridge_ind_mape),
    ("mape camx", camx_mape),
]
for row_label, per_station in summary_rows:
    station_values = list(per_station.values())
    print(row_label, np.mean(station_values), "std.dev.: ", np.std(station_values))

In [None]:
# Box plot of the per-station MSE of every individual model.
# A trailing "*" marks the variant fitted without the CAMx OA components.
sns.set_style("whitegrid")
plt.figure(figsize = (16,9))
labels = ["CAMx","RF","RF*","Boost","Boost*","Ridge","Ridge*"]
positions = [0,3,4,7,8,11,12]
mse_by_model = [
    camx_mse,
    full_rf_ind_mse, part_rf_ind_mse,
    full_boost_ind_mse, part_boost_ind_mse,
    full_ridge_ind_mse, part_ridge_ind_mse,
]
plt.boxplot(
    [list(per_station.values()) for per_station in mse_by_model],
    patch_artist = True,
    positions = positions,
    labels = labels,
)
plt.yscale("log")
plt.ylabel("log(MSE)", fontsize = 20)
plt.tick_params(axis='x', labelsize=20)
plt.tick_params(axis='y', labelsize=24)
plt.title("MSE for down-scaling: Individual Models", fontsize = 30)

In [None]:
# Box plot of the per-station my_mape score of every individual model.
# A trailing "*" marks the variant fitted without the CAMx OA components.
sns.set_style("whitegrid")
plt.figure(figsize = (16,9))
labels = ["CAMx","RF","RF*","Boost","Boost*","Ridge","Ridge*"]
positions = [0,3,4,7,8,11,12]
mape_by_model = [
    camx_mape,
    full_rf_ind_mape, part_rf_ind_mape,
    full_boost_ind_mape, part_boost_ind_mape,
    full_ridge_ind_mape, part_ridge_ind_mape,
]
plt.boxplot(
    [list(per_station.values()) for per_station in mape_by_model],
    patch_artist = True,
    positions = positions,
    labels = labels,
)
plt.yscale("log")
plt.ylabel("log(MAPE)", fontsize = 20)
plt.tick_params(axis='x', labelsize=24)
plt.tick_params(axis='y', labelsize=24)
plt.title("MAPE for down-scaling: Individual Models", fontsize = 30)

# Global model, i.e. train on all stations at the same time, accounting for the station once with dummy variables and once with a random effect

In [None]:
# Keep rows with an observed OA value and order them chronologically;
# the per-station train/test split relies on this ordering.
data = data[data["OAtot_PMF"].notnull()].sort_values(by="time")

# Target frame, indexed by time.
Y = data[["time", "station_id", "station", "OAtot_PMF"]].set_index("time")

# Design frame for the global models, indexed by time: identifiers, target,
# CAMx OA components, calendar fields, CAMx meteorology, land-use features
# (500 variant plus 1000-500 increment) and site descriptors.
covars = [
    "time", "station", "station_id", "OAtot_PMF",
    "HOA_CAMX", "BBOA_CAMX", "OOAtot_CAMX",
    "year", "month", "day_week",
    "temp_CAMX", "rh_CAMX", "press_CAMX", "ws_CAMX", "wd_CAMX", "pblh_CAMX", "wind_x_CAMX", "wind_y_CAMX",
    "diff_agriculture", "diff_airports", "diff_barren", "diff_industrial", "diff_industrial_transport",
    "diff_natural_green", "diff_ports", "diff_roads_rails", "diff_snow_ice", "diff_transport",
    "diff_urban_fabric", "diff_urban_green", "diff_water", "diff_wetlands",
    "agriculture500", "airports500", "barren500", "industrial500", "industrial_transport500",
    "natural_green500", "ports500", "roads_rails500", "snow_ice500",
    "transport500", "urban_fabric500", "urban_green500", "water500", "wetlands500",
    "diff_imd", "imd500", "diff_population", "population_500",
    "elevation", "Lat", "Lon", "area_grid",
    "distance_border", "distance_mt",
]
X = data[covars].set_index("time")

In [None]:
# Generate the global train and test sets: every station is split
# chronologically (first 75% of its rows -> train, remainder -> test) and the
# per-station pieces are stacked in the order of data.station_id.unique().
split = 0.75
X_train_parts, X_test_parts = [], []
Y_train_parts, Y_test_parts = [], []

for station_id in tqdm(data.station_id.unique()):
    # get station
    X_station = X.loc[X.station_id == station_id, :]
    Y_station = Y.loc[Y.station_id == station_id, :]
    # train and test split (as in training data proportion)
    index_train = int(np.floor(len(X_station) * split))
    Y_train_station = Y_station.iloc[:index_train, :]
    Y_test_station = Y_station.iloc[index_train:, :]
    X_train_station = X_station.iloc[:index_train, :]
    X_test_station = X_station.iloc[index_train:, :]

    X_train_parts.append(X_train_station)
    X_test_parts.append(X_test_station)
    Y_train_parts.append(Y_train_station)
    Y_test_parts.append(Y_test_station)

# Concatenate once instead of growing the frames inside the loop. No
# placeholder row or positional column slice is needed, so the frames contain
# exactly the columns of X / Y.
# Missing feature values are replaced by 0, as before.
# NOTE(review): zero-imputation treats a missing value as a real 0 - confirm
# this is intended for every feature.
X_train = pd.concat(X_train_parts, axis = 0).fillna(0)
X_test = pd.concat(X_test_parts, axis = 0).fillna(0)
Y_train = pd.concat(Y_train_parts, axis = 0)
Y_test = pd.concat(Y_test_parts, axis = 0)

100%|██████████| 102/102 [00:01<00:00, 54.96it/s]


In [None]:
# Global ridge regression on all stations, CAMx OA components included,
# with one dummy column per station.
covars = [
    "HOA_CAMX", "BBOA_CAMX", "OOAtot_CAMX",
    "year", "month", "day_week",
    "temp_CAMX", "rh_CAMX", "press_CAMX", "ws_CAMX", "wd_CAMX", "pblh_CAMX", "wind_x_CAMX", "wind_y_CAMX",
    "diff_agriculture", "diff_airports", "diff_barren", "diff_industrial", "diff_industrial_transport",
    "diff_natural_green", "diff_ports", "diff_roads_rails", "diff_snow_ice", "diff_transport",
    "diff_urban_fabric", "diff_urban_green", "diff_water", "diff_wetlands",
    "agriculture500", "airports500", "barren500", "industrial500", "industrial_transport500",
    "natural_green500", "ports500", "roads_rails500", "snow_ice500",
    "transport500", "urban_fabric500", "urban_green500", "water500", "wetlands500",
    "diff_imd", "imd500", "diff_population", "population_500",
    "elevation", "Lat", "Lon", "area_grid",
    "distance_border", "distance_mt",
]
station_dummies_train = pd.get_dummies(X_train["station_id"])
station_dummies_test = pd.get_dummies(X_test["station_id"])
X_fit_train = pd.concat((X_train[covars], station_dummies_train), axis = 1)
# X_fit_test stays unscaled; it is transformed at prediction time.
X_fit_test = pd.concat((X_test[covars], station_dummies_test), axis = 1)

# Standardise with training statistics, then fit ridge with built-in CV.
ridge = RidgeCV()
scaler = StandardScaler()
scaler.fit(X_fit_train)
X_fit_train = pd.DataFrame(scaler.transform(X_fit_train))
ridge.fit(X_fit_train, Y_train["OAtot_PMF"])

# predict (the name y_rf is historical; these are ridge predictions)
X_fit_test_scaled = pd.DataFrame(scaler.transform(X_fit_test))
y_rf = ridge.predict(X_fit_test_scaled)
# evaluate on the pooled test set
print("mse ridge: ", mse(Y_test["OAtot_PMF"], y_rf))

mse ridge:  11.52213625435028


In [None]:
# Per-station test errors of the global ridge model (each station counts once).
full_ridge_glo_mse, full_ridge_glo_mape = {}, {}
for stat_id in data.station_id.unique():
    station_rows = X_test.station_id == stat_id
    # X_fit_test holds unscaled features, so apply the fitted scaler here.
    x_test = pd.DataFrame(scaler.transform(X_fit_test.loc[station_rows, :]))
    y_stat = Y_test.loc[station_rows, "OAtot_PMF"]
    station_pred = ridge.predict(x_test)
    full_ridge_glo_mse[stat_id] = mse(y_stat, station_pred)
    full_ridge_glo_mape[stat_id] = my_mape(y_stat, station_pred)

In [None]:
# Mean and standard deviation over stations, then persist both dicts as JSON.
for row_label, per_station in (
    ("full ridge mse", full_ridge_glo_mse),
    ("full ridge mape", full_ridge_glo_mape),
):
    station_values = list(per_station.values())
    print(row_label, np.mean(station_values), "std.dev.: ", np.std(station_values))

# save dict
names = ["full_ridge_glo_mse", "full_ridge_glo_mape"]
dictionaries = [full_ridge_glo_mse, full_ridge_glo_mape]
for di, di_name in zip(dictionaries, names):
    target_file = di_name + ".json"
    with open(target_file, "w") as json_file:
        json.dump(di, json_file)

In [None]:
# Global ridge regression on all stations WITHOUT the CAMx OA components,
# with one dummy column per station.
covars = [
    "year", "month", "day_week",
    "temp_CAMX", "rh_CAMX", "press_CAMX", "ws_CAMX", "wd_CAMX", "pblh_CAMX", "wind_x_CAMX", "wind_y_CAMX",
    "diff_agriculture", "diff_airports", "diff_barren", "diff_industrial", "diff_industrial_transport",
    "diff_natural_green", "diff_ports", "diff_roads_rails", "diff_snow_ice", "diff_transport",
    "diff_urban_fabric", "diff_urban_green", "diff_water", "diff_wetlands",
    "agriculture500", "airports500", "barren500", "industrial500", "industrial_transport500",
    "natural_green500", "ports500", "roads_rails500", "snow_ice500",
    "transport500", "urban_fabric500", "urban_green500", "water500", "wetlands500",
    "diff_imd", "imd500", "diff_population", "population_500",
    "elevation", "Lat", "Lon", "area_grid",
    "distance_border", "distance_mt",
]
station_dummies_train = pd.get_dummies(X_train["station_id"])
station_dummies_test = pd.get_dummies(X_test["station_id"])
X_fit_train = pd.concat((X_train[covars], station_dummies_train), axis = 1)
# X_fit_test stays unscaled; it is transformed at prediction time.
X_fit_test = pd.concat((X_test[covars], station_dummies_test), axis = 1)

# Standardise with training statistics, then fit ridge with built-in CV.
ridge = RidgeCV()
scaler = StandardScaler()
scaler.fit(X_fit_train)
X_fit_train = pd.DataFrame(scaler.transform(X_fit_train))
ridge.fit(X_fit_train, Y_train["OAtot_PMF"])

# predict (the name y_rf is historical; these are ridge predictions)
X_fit_test_scaled = pd.DataFrame(scaler.transform(X_fit_test))
y_rf = ridge.predict(X_fit_test_scaled)
# evaluate on the pooled test set
print("mse ridge: ", mse(Y_test["OAtot_PMF"], y_rf))

In [None]:
# Per-station test errors of the global ridge model fitted without CAMx OA.
part_ridge_glo_mse, part_ridge_glo_mape = {}, {}
for stat_id in data.station_id.unique():
    station_rows = X_test.station_id == stat_id
    # X_fit_test holds unscaled features, so apply the fitted scaler here.
    x_test = pd.DataFrame(scaler.transform(X_fit_test.loc[station_rows, :]))
    y_stat = Y_test.loc[station_rows, "OAtot_PMF"]
    station_pred = ridge.predict(x_test)
    part_ridge_glo_mse[stat_id] = mse(y_stat, station_pred)
    part_ridge_glo_mape[stat_id] = my_mape(y_stat, station_pred)

In [None]:
# Mean and standard deviation over stations, then persist both dicts as JSON.
for row_label, per_station in (
    ("part ridge mse", part_ridge_glo_mse),
    ("part ridge mape", part_ridge_glo_mape),
):
    station_values = list(per_station.values())
    print(row_label, np.mean(station_values), "std.dev.: ", np.std(station_values))

# save dict
names = ["part_ridge_glo_mse", "part_ridge_glo_mape"]
dictionaries = [part_ridge_glo_mse, part_ridge_glo_mape]
for di, di_name in zip(dictionaries, names):
    target_file = di_name + ".json"
    with open(target_file, "w") as json_file:
        json.dump(di, json_file)


In [None]:
# Global random forest and gradient boosting on all stations, CAMx OA
# components included, with one dummy column per station.
covars = [ "HOA_CAMX", "BBOA_CAMX", "OOAtot_CAMX", "year", "month","day_week",
"temp_CAMX", "rh_CAMX", "press_CAMX", "ws_CAMX", "wd_CAMX", "pblh_CAMX", "wind_x_CAMX", "wind_y_CAMX","diff_agriculture", "diff_airports", "diff_barren", "diff_industrial", "diff_industrial_transport",
    "diff_natural_green", "diff_ports", "diff_roads_rails", "diff_snow_ice", "diff_transport", "diff_urban_fabric", "diff_urban_green", "diff_water", "diff_wetlands",
    "agriculture500","airports500", "barren500", "industrial500", "industrial_transport500", "natural_green500", "ports500", "roads_rails500", "snow_ice500",
    "transport500", "urban_fabric500","urban_green500", "water500", "wetlands500","diff_imd","imd500", "diff_population", "population_500","elevation","Lat","Lon", "area_grid",
    "distance_border", "distance_mt"]
X_fit_train = pd.concat(( X_train[covars],  pd.get_dummies(X_train.station_id)), axis = 1)
X_fit_test = pd.concat(( X_test[covars],  pd.get_dummies(X_test.station_id)), axis = 1)
rf  = RandomForestRegressor(random_state = 5, n_estimators = 500)
boost = GradientBoostingRegressor(random_state = 5)
# Both grids (2 and 9 combinations) are smaller than RandomizedSearchCV's
# default n_iter=10, so a randomized search would evaluate every combination
# anyway, only in an unseeded random order. Use the exhaustive, deterministic
# GridSearchCV, as the no-CAMx counterpart of this cell does.
# RF tuning
param_rf = {"max_features":["sqrt","log2"]}
clf_rf = GridSearchCV(rf, param_rf)
# Boosting tuning
param_boost = {"max_depth":[3, 5, 7], "learning_rate":[0.1,0.01,0.001]}
clf_boost = GridSearchCV(boost, param_boost)
# train
clf_rf.fit(X_fit_train, Y_train.OAtot_PMF)
clf_boost.fit(X_fit_train, Y_train.OAtot_PMF)
# predict
y_rf = clf_rf.predict(X_fit_test)
y_boost = clf_boost.predict(X_fit_test)
# evaluate on the pooled test set
# NOTE(review): 10.507 is a hard-coded reference value, presumably copied from
# an earlier run of the individual models - confirm it is still current.
print("mse of best individual model: ", 10.507)
print("mse rf: ", mse(Y_test.OAtot_PMF, y_rf))
print("mse boost: ", mse(Y_test.OAtot_PMF, y_boost))

In [None]:
# Per-station test errors of the global RF / boosting models (CAMx OA included).
full_rf_glo_mse, full_boost_glo_mse = {}, {}
full_rf_glo_mape, full_boost_glo_mape = {}, {}
for stat_id in data.station_id.unique():
    station_rows = X_test.station_id == stat_id
    x_test = X_fit_test.loc[station_rows, :]
    y_stat = Y_test.loc[station_rows, "OAtot_PMF"]
    rf_station_pred = clf_rf.predict(x_test)
    boost_station_pred = clf_boost.predict(x_test)
    full_rf_glo_mse[stat_id] = mse(y_stat, rf_station_pred)
    full_boost_glo_mse[stat_id] = mse(y_stat, boost_station_pred)
    full_rf_glo_mape[stat_id] = my_mape(y_stat, rf_station_pred)
    full_boost_glo_mape[stat_id] = my_mape(y_stat, boost_station_pred)

## print results: mean and standard deviation over stations
names = ["full_rf_glo_mse", "full_boost_glo_mse", "full_rf_glo_mape", "full_boost_glo_mape"]
dictionaries = [full_rf_glo_mse, full_boost_glo_mse, full_rf_glo_mape, full_boost_glo_mape]
row_labels = ["full rf mse:", "full boost mse:", "full rf mape:", "full boost mape:"]
for row_label, per_station in zip(row_labels, dictionaries):
    station_values = list(per_station.values())
    print(row_label, np.mean(station_values), "std. dev: ", np.std(station_values))

# save dict
for di, di_name in zip(dictionaries, names):
    with open(di_name + ".json", "w") as json_file:
        json.dump(di, json_file)

In [None]:
# Global random forest and gradient boosting on all stations WITHOUT the CAMx
# OA components, with one dummy column per station.
covars = [
    "year", "month", "day_week",
    "temp_CAMX", "rh_CAMX", "press_CAMX", "ws_CAMX", "wd_CAMX", "pblh_CAMX", "wind_x_CAMX", "wind_y_CAMX",
    "diff_agriculture", "diff_airports", "diff_barren", "diff_industrial", "diff_industrial_transport",
    "diff_natural_green", "diff_ports", "diff_roads_rails", "diff_snow_ice", "diff_transport",
    "diff_urban_fabric", "diff_urban_green", "diff_water", "diff_wetlands",
    "agriculture500", "airports500", "barren500", "industrial500", "industrial_transport500",
    "natural_green500", "ports500", "roads_rails500", "snow_ice500",
    "transport500", "urban_fabric500", "urban_green500", "water500", "wetlands500",
    "diff_imd", "imd500", "diff_population", "population_500",
    "elevation", "Lat", "Lon", "area_grid",
    "distance_border", "distance_mt",
]
station_dummies_train = pd.get_dummies(X_train["station_id"])
station_dummies_test = pd.get_dummies(X_test["station_id"])
X_fit_train = pd.concat((X_train[covars], station_dummies_train), axis = 1)
X_fit_test = pd.concat((X_test[covars], station_dummies_test), axis = 1)

# Estimators and their exhaustive hyper-parameter grids.
rf = RandomForestRegressor(random_state = 5, n_estimators = 500)
boost = GradientBoostingRegressor(random_state = 5)
param_rf = {"max_features": ["sqrt", "log2"]}
param_boost = {"max_depth": [3, 5, 7], "learning_rate": [0.1, 0.01, 0.001]}
clf_rf = GridSearchCV(rf, param_rf)
clf_boost = GridSearchCV(boost, param_boost)

# train
y_fit_train = Y_train["OAtot_PMF"]
clf_rf.fit(X_fit_train, y_fit_train)
clf_boost.fit(X_fit_train, y_fit_train)

# predict
y_rf = clf_rf.predict(X_fit_test)
y_boost = clf_boost.predict(X_fit_test)

# evaluate on the pooled test set (the first number is a hard-coded reference)
print("mse of best individual model: ", 10.507)
print("mse rf: ", mse(Y_test["OAtot_PMF"], y_rf))
print("mse boost: ", mse(Y_test["OAtot_PMF"], y_boost))

In [None]:
# Per-station test errors of the global RF / boosting models (no CAMx OA).
part_rf_glo_mse, part_boost_glo_mse = {}, {}
part_rf_glo_mape, part_boost_glo_mape = {}, {}
for stat_id in data.station_id.unique():
    station_rows = X_test.station_id == stat_id
    x_test = X_fit_test.loc[station_rows, :]
    y_stat = Y_test.loc[station_rows, "OAtot_PMF"]
    rf_station_pred = clf_rf.predict(x_test)
    boost_station_pred = clf_boost.predict(x_test)
    part_rf_glo_mse[stat_id] = mse(y_stat, rf_station_pred)
    part_boost_glo_mse[stat_id] = mse(y_stat, boost_station_pred)
    part_rf_glo_mape[stat_id] = my_mape(y_stat, rf_station_pred)
    part_boost_glo_mape[stat_id] = my_mape(y_stat, boost_station_pred)

## print results: mean and standard deviation over stations
names = ["part_rf_glo_mse", "part_boost_glo_mse", "part_rf_glo_mape", "part_boost_glo_mape"]
dictionaries = [part_rf_glo_mse, part_boost_glo_mse, part_rf_glo_mape, part_boost_glo_mape]
row_labels = ["part rf mse:", "part boost mse:", "part rf mape:", "part boost mape:"]
for row_label, per_station in zip(row_labels, dictionaries):
    station_values = list(per_station.values())
    print(row_label, np.mean(station_values), "std. dev: ", np.std(station_values))

# save dict
for di, di_name in zip(dictionaries, names):
    with open(di_name + ".json", "w") as json_file:
        json.dump(di, json_file)


# GP-BOOST

In [None]:
!pip install gpboost
import gpboost as gpb

In [None]:
# Fit GP-BOOST with a random effect per station.
# station_id is passed to GPModel as the grouping variable of the random effect;
# the boosted trees are fitted on the `covars` columns.
covars = ["HOA_CAMX", "BBOA_CAMX", "OOAtot_CAMX", "year", "month","day_week",
"temp_CAMX", "rh_CAMX", "press_CAMX", "ws_CAMX", "wd_CAMX", "pblh_CAMX", "wind_x_CAMX", "wind_y_CAMX","diff_agriculture", "diff_airports", "diff_barren", "diff_industrial", "diff_industrial_transport",
    "diff_natural_green", "diff_ports", "diff_roads_rails", "diff_snow_ice", "diff_transport", "diff_urban_fabric", "diff_urban_green", "diff_water", "diff_wetlands",
    "agriculture500","airports500", "barren500", "industrial500", "industrial_transport500", "natural_green500", "ports500", "roads_rails500", "snow_ice500",
    "transport500", "urban_fabric500","urban_green500", "water500", "wetlands500","diff_imd","imd500", "diff_population", "population_500","elevation","Lat","Lon", "area_grid",
    "distance_border", "distance_mt"]
data_train = gpb.Dataset(data=X_train.loc[:, covars], label=Y_train.OAtot_PMF)
likelihood = "gaussian"
groups = X_train.station_id
gp_model = gpb.GPModel(group_data=groups, likelihood=likelihood)
num_boost_round = 1000

# Hyper-parameter tuning: num_try_random=None, so the whole grid is evaluated.
param_grid = {'learning_rate': [1,0.1,0.01, 0.001],
              'min_data_in_leaf': [10,100,1000],
              'max_depth': [1,2,3,5,10],
              'lambda_l2': [0,1,10]}
other_params = {'num_leaves': 2**10, 'verbose': 0}
opt_params = gpb.grid_search_tune_parameters(param_grid=param_grid, params=other_params,
                                             num_try_random=None, seed=1000,
                                             train_set=data_train, gp_model=gp_model,
                                             use_gp_model_for_validation=True, verbose_eval=1,
                                             num_boost_round=num_boost_round, early_stopping_rounds=10, metric = "mse")

# Cross-validate the best parameter set to choose the number of boosting rounds
# (early stopping after 50 rounds here; the tuning step above uses 10).
cvbst = gpb.cv(params=opt_params["best_params"], train_set=data_train,
               gp_model=gp_model, use_gp_model_for_validation=True,
               num_boost_round=num_boost_round, early_stopping_rounds=50,
               nfold=4, verbose_eval=True, show_stdv=False, seed=1)
metric_name = list(cvbst.keys())[0]
# np.argmin gives the 0-based position of the best score in the per-iteration
# CV curve; the matching number of boosting rounds is that position + 1.
# Without the +1 the final model is trained one round short (zero rounds if the
# first iteration happens to be the best one).
best_num_iter = int(np.argmin(cvbst[metric_name])) + 1
print("Best number of iterations: " + str(best_num_iter))

# Train the final model on the full training set
bst = gpb.train(params= opt_params['best_params'], train_set=data_train,  gp_model=gp_model,
                num_boost_round=best_num_iter)
#gp_model.summary() # Estimated random effects model
gp_pred = bst.predict(data=X_test.loc[:, covars], group_data_pred=X_test.station_id, predict_var=True, pred_latent=False)
# print results
print("mse gp-boost: ", mse(Y_test.OAtot_PMF, gp_pred["response_mean"]))

In [None]:
# unweighted results: score the GP-BOOST model separately on every station
full_gp_mse = {}
full_gp_mape = {}

for stat_id in data.station_id.unique():
  # boolean selector for the test rows of the current station
  is_station = X_test.station_id == stat_id
  x_test = X_test.loc[is_station, covars]
  y_stat = Y_test.loc[is_station, "OAtot_PMF"]
  group_data = X_test.loc[is_station, "station_id"]
  gp_pred = bst.predict(data=x_test, group_data_pred=group_data, predict_var=True, pred_latent=False)
  # the same predicted mean feeds both metrics
  station_mean = gp_pred["response_mean"]
  full_gp_mse[stat_id] = mse(y_stat, station_mean)
  full_gp_mape[stat_id] = my_mape(y_stat, station_mean)

In [None]:
# Report mean and std. dev. across stations of the `full_gp_*` scores, then save them.
# json is imported here so the cell does not rely on the `import json` that
# only appears in a later cell (json.dump below would raise NameError otherwise).
import json

# print (labels say "full" to match the full_gp_* dictionaries being reported;
# the later cell prints the part_gp_* results under the "part gp" label)
full_gp_mse_values = list(full_gp_mse.values())
full_gp_mape_values = list(full_gp_mape.values())
print("full gp mse:", np.mean(full_gp_mse_values), "std. dev: ", np.std(full_gp_mse_values))
print("full gp mape:", np.mean(full_gp_mape_values), "std. dev: ", np.std(full_gp_mape_values))
# save dict: one "<name>.json" file per dictionary, written to the working directory
dictionaries = [full_gp_mse, full_gp_mape]
names = ["full_gp_mse", "full_gp_mape"]
for di, di_name in zip(dictionaries, names):
  with open(di_name +".json", "w") as json_file:
    json.dump(di, json_file)

part gp mse: 14.929502083458956 std. dev:  44.898420344350896
part gp mape: 0.3807225745793061 std. dev:  0.2019996476458781


# PLOT RANDOM EFFECTS ON MAP OF EUROPE

In [None]:
# Background map for the Europe plot: read the NUTS shapefile from Drive
path2 = "/content/gdrive/MyDrive/Aurora_Thesis/NUTS_RG_20M_2021_3035.shp"
europe = gpd.read_file(path2)
# Show the Coordinate Reference System (CRS) the shapefile is stored in
print(europe.crs)
# Keep only rows with LEVL_CODE == 1 and drop the remote French overseas regions
is_level_1 = europe.LEVL_CODE == 1
is_french_overseas = europe.NAME_LATN == "RUP FR — Régions Ultrapériphériques Françaises"
europe = europe.loc[is_level_1 & ~is_french_overseas, :]

EPSG:3035


In [None]:
# Build a GeoDataFrame with one row per test observation holding the predicted
# random effect, the observation's Lat/Lon, the observed OAtot_PMF and the
# predicted fixed effect.
# This prediction is made with pred_latent=True; its "random_effect_mean" and
# "fixed_effect" entries are the ones used below.
gp_pred = bst.predict(data=X_test.loc[:, covars], group_data_pred=X_test.station_id, predict_var=True, pred_latent=True)
random_effect = pd.DataFrame(gp_pred["random_effect_mean"]).rename(columns={0: "random_effect"})
# indexes are reset so the columns line up by position with the prediction array
test_columns = [X_test.Lat, X_test.Lon, Y_test.OAtot_PMF]
res = pd.concat([random_effect] + [col.reset_index(drop=True) for col in test_columns], axis=1)
# point geometry from Lon/Lat, declared as EPSG:4326
station_points = gpd.points_from_xy(x=res.Lon, y=res.Lat)
res_gpd = gpd.GeoDataFrame(data=res, geometry=station_points, crs=4326)
res_gpd["abs_re"] = res_gpd.random_effect.abs()
res_gpd["fixed_effect"] = gp_pred['fixed_effect']

In [None]:
sns.set_style("whitegrid")
# base layer: the filtered Europe regions
ax = europe.plot(figsize=(16,9), rasterized = True)
# overlay: the points reprojected to EPSG:3035 (the CRS printed when the
# shapefile was loaded), coloured by the predicted random effect
colorbar_options = {"label": "Random Effect", "orientation": "vertical"}
res_projected = res_gpd.to_crs("epsg:3035")
res_projected.plot(ax = ax,
                   column = res_gpd["random_effect"],
                   marker = "o",
                   cmap='plasma',
                   legend=True,
                   rasterized = True,
                   legend_kwds=colorbar_options)
# restrict the visible window (coordinates in the projected CRS)
xmin, xmax = 2*10**6 ,6.5*10**6
ymin, ymax = 1*10**6, 6*10**6
plt.xlim([xmin, xmax])
plt.ylim([ymin, ymax])
plt.show()

In [None]:
# Without CAMx
# Fit GP-BOOST with a random effect per station, this time without the three
# CAMx OA component predictors (HOA_CAMX, BBOA_CAMX, OOAtot_CAMX); the other
# *_CAMX columns (temp_CAMX ... wind_y_CAMX) are kept.
# NOTE: this cell rebinds `covars`, `gp_model` and `bst`; cells that read them
# see this model's values once it has run.
covars = ["year", "month","day_week",
"temp_CAMX", "rh_CAMX", "press_CAMX", "ws_CAMX", "wd_CAMX", "pblh_CAMX", "wind_x_CAMX", "wind_y_CAMX","diff_agriculture", "diff_airports", "diff_barren", "diff_industrial", "diff_industrial_transport",
    "diff_natural_green", "diff_ports", "diff_roads_rails", "diff_snow_ice", "diff_transport", "diff_urban_fabric", "diff_urban_green", "diff_water", "diff_wetlands",
    "agriculture500","airports500", "barren500", "industrial500", "industrial_transport500", "natural_green500", "ports500", "roads_rails500", "snow_ice500",
    "transport500", "urban_fabric500","urban_green500", "water500", "wetlands500","diff_imd","imd500", "diff_population", "population_500","elevation","Lat","Lon", "area_grid",
    "distance_border", "distance_mt"]
data_train = gpb.Dataset(data=X_train.loc[:, covars], label=Y_train.OAtot_PMF)
likelihood = "gaussian"
groups = X_train.station_id
gp_model = gpb.GPModel(group_data=groups, likelihood=likelihood)
num_boost_round = 1000

# Hyper-parameter tuning: num_try_random=None, so the whole grid is evaluated.
param_grid = {'learning_rate': [1,0.1,0.01, 0.001],
              'min_data_in_leaf': [10,100,1000],
              'max_depth': [1,2,3,5,10],
              'lambda_l2': [0,1,10]}
other_params = {'num_leaves': 2**10, 'verbose': 0}
opt_params = gpb.grid_search_tune_parameters(param_grid=param_grid, params=other_params,
                                             num_try_random=None, seed=1000,
                                             train_set=data_train, gp_model=gp_model,
                                             use_gp_model_for_validation=True, verbose_eval=1,
                                             num_boost_round=num_boost_round, early_stopping_rounds=10, metric = "mse")

# Cross-validate the best parameter set to choose the number of boosting rounds
# (early stopping after 50 rounds here; the tuning step above uses 10).
cvbst = gpb.cv(params=opt_params["best_params"], train_set=data_train,
               gp_model=gp_model, use_gp_model_for_validation=True,
               num_boost_round=num_boost_round, early_stopping_rounds=50,
               nfold=4, verbose_eval=True, show_stdv=False, seed=1)
metric_name = list(cvbst.keys())[0]
# np.argmin gives the 0-based position of the best score in the per-iteration
# CV curve; the matching number of boosting rounds is that position + 1.
# Without the +1 the final model is trained one round short (zero rounds if the
# first iteration happens to be the best one).
best_num_iter = int(np.argmin(cvbst[metric_name])) + 1
print("Best number of iterations: " + str(best_num_iter))

# Train the final model on the full training set
bst = gpb.train(params= opt_params['best_params'], train_set=data_train,  gp_model=gp_model,
                num_boost_round=best_num_iter)
#gp_model.summary() # Estimated random effects model
gp_pred = bst.predict(data=X_test.loc[:, covars], group_data_pred=X_test.station_id, predict_var=True, pred_latent=True)

In [None]:
# unweighted results: score the GP-BOOST model separately on every station
part_gp_mse = {}
part_gp_mape = {}

for stat_id in data.station_id.unique():
  # boolean selector for the test rows of the current station
  is_station = X_test.station_id == stat_id
  x_test = X_test.loc[is_station, covars]
  y_stat = Y_test.loc[is_station, "OAtot_PMF"]
  group_data = X_test.loc[is_station, "station_id"]
  gp_pred = bst.predict(data=x_test, group_data_pred=group_data, predict_var=True, pred_latent=False)
  # the same predicted mean feeds both metrics
  station_mean = gp_pred["response_mean"]
  part_gp_mse[stat_id] = mse(y_stat, station_mean)
  part_gp_mape[stat_id] = my_mape(y_stat, station_mean)

In [None]:
import json
# Report mean and std. dev. across stations of the part_gp_* scores
part_gp_mse_values = list(part_gp_mse.values())
part_gp_mape_values = list(part_gp_mape.values())
print("part gp mse:", np.mean(part_gp_mse_values), "std. dev: ", np.std(part_gp_mse_values))
print("part gp mape:", np.mean(part_gp_mape_values), "std. dev: ", np.std(part_gp_mape_values))
# Save each dictionary as "<name>.json" in the working directory
dictionaries = [part_gp_mse, part_gp_mape]
names = ["part_gp_mse", "part_gp_mape"]
for di, di_name in zip(dictionaries, names):
  out_path = di_name + ".json"
  with open(out_path, "w") as json_file:
    json.dump(di, json_file)

part gp mse: 14.962123518215241 std. dev:  45.06108470770233
part gp mape: 0.4018699344461716 std. dev:  0.21995060863960753
