In [5]:
# load packages
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import r2_score, mean_absolute_percentage_error as mape, mean_squared_error as mse
from pyarrow import feather as pq
import geopandas as gpd
import folium
from folium import Marker
from shapely import geometry
from tqdm import tqdm
pd.set_option('display.max_columns', None)
from ipywidgets import interact
import scipy
from tqdm import tqdm
from sklearn.model_selection import GroupKFold, GridSearchCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, RidgeCV
import mapie
from mapie import regression
from mapie.metrics import regression_coverage_score
from mapie.regression import MapieRegressor
from mapie.quantile_regression import MapieQuantileRegressor
from sklearn.ensemble import GradientBoostingRegressor
import shap
# Feature Importance
from sklearn.inspection import PartialDependenceDisplay
from sklearn.inspection import permutation_importance
from sklearn.model_selection import TimeSeriesSplit

In [6]:
# load data
# Read the observation table and parse its timestamp column.
data = pd.read_csv("data_converted.csv")
data["time"] = pd.to_datetime(data["time"])
# reset_index keeps the previous row index as an extra "index" column
data = data.reset_index()

# need to manually add 2 station Id
manual_station_ids = {
    "Bologna (BO)": "ID1999",
    "San Pietro Capofiume (SPC)": "ID1998",
}
for station_name, station_code in manual_station_ids.items():
    data.loc[data["station"] == station_name, "station_id"] = station_code

# switch London data: its OAtot_PMF is replaced by the sum of the three PMF components
data["OAtot_2"] = data["HOA_PMF"] + data["BBOA_PMF"] + data["OOAtot_PMF"]
is_london = data["station"] == "London"
data.loc[is_london, "OAtot_PMF"] = data.loc[is_london, "OAtot_2"]

# Remove Zurich 2017
is_zurich_2017 = (data["station"] == "Zurich") & (data["year"] == 2017)
data = data.loc[~is_zurich_2017, :]

In [7]:
# CLEAN DATA
# keep only the observations whose OA is at least 0.1
data = data[data["OAtot_PMF"] >= 0.1]
# flag each station: True when it has more than 30 remaining observations
select = data.groupby("station_id")["OAtot_PMF"].size().gt(30).reset_index()
keep_flag = select.set_index("station_id")["OAtot_PMF"].rename("OAtot_PMFkeep")
# attach the flag to every row through the station_id index, then filter on it
data = data.set_index("station_id").join(keep_flag)
data = data[data["OAtot_PMFkeep"].eq(True)]
# station_id goes back to being a regular column
data = data.reset_index()

In [8]:
# add day of week (pandas encodes Monday as 0 and Sunday as 6)
data["day_week"] = data["time"].dt.dayofweek

In [None]:
# Preview the first rows of the frame
data.head()

In [10]:
# feature engineering
# road classes: difference between the "_1000" and the "_100" column of each class
for road_class in (1, 2, 3):
    data[f"rc_{road_class}_1000-rc_{road_class}_100"] = (
        data[f"road_class_{road_class}_1000"] - data[f"road_class_{road_class}_100"]
    )

# CAMX proportions of components of OA
for component in ("HOA", "BBOA", "OOAtot"):
    data[f"p_{component}"] = data[f"{component}_CAMX"] / data["OAtot_CAMX"]

# need to decorrelate some land-use variables: replace each "1000" column
# by its difference with the matching "500" column
land_use_classes = [
    "agriculture",
    "airports",
    "barren",
    "industrial",
    "industrial_transport",
    "natural_green",
    "ports",
    "roads_rails",
    "snow_ice",
    "transport",
    "urban_fabric",
    "urban_green",
    "water",
    "wetlands",
]
for land_use in land_use_classes:
    data[f"diff_{land_use}"] = data[f"{land_use}1000"] - data[f"{land_use}500"]

# also for Population and IMD (note the underscore in the population columns)
data["diff_population"] = data["population_1000"] - data["population_500"]
data["diff_imd"] = data["imd1000"] - data["imd500"]

In [None]:
# Preview the first rows of the frame, including the engineered columns
data.head()

# DOWN-SCALE AT UNSEEN LOCATIONS

In [39]:
# Predict at unseen stations
# Nested cross-validation grouped by station_id:
#   * outer loop: GroupKFold keeps every station entirely in train or in test,
#     so each prediction is made for a station the model never saw;
#   * inner loop: GridSearchCV tunes the hyper-parameters with a second
#     GroupKFold restricted to the training stations.
# Predictions are stored per station in res_rf / res_boost, in the row order
# of `data` for that station.
n_sp = 10
cv = GroupKFold(n_splits = n_sp)
res_rf = {}
res_boost = {}
covars = ["HOA_CAMX", "BBOA_CAMX", "OOAtot_CAMX", "year", "month","day_week",
"temp_CAMX", "rh_CAMX", "press_CAMX", "ws_CAMX", "wd_CAMX", "pblh_CAMX", "wind_x_CAMX", "wind_y_CAMX","diff_agriculture", "diff_airports", "diff_barren", "diff_industrial", "diff_industrial_transport",
    "diff_natural_green", "diff_ports", "diff_roads_rails", "diff_snow_ice", "diff_transport", "diff_urban_fabric", "diff_urban_green", "diff_water", "diff_wetlands",
    "agriculture500","airports500", "barren500", "industrial500", "industrial_transport500", "natural_green500", "ports500", "roads_rails500", "snow_ice500",
    "transport500", "urban_fabric500","urban_green500", "water500", "wetlands500","diff_imd","imd500", "diff_population", "population_500","elevation","Lat","Lon", "area_grid",
    "distance_border", "distance_mt"]
Y = data.loc[:, ["time","station","OAtot_PMF"]]
Y = Y.set_index("time")
X = data.loc[:, covars]
# missing covariates are replaced by 0 once, here; every fold slices this filled frame
X = X.fillna(0)
# Grouped Cross-Validation (total= lets tqdm draw a real progress bar)
for train_idx, test_idx in tqdm(cv.split(X, Y, data.station_id), total=n_sp):
    # get train and test folds
    X_train = X.iloc[train_idx, :]
    Y_train = Y.iloc[train_idx, :]["OAtot_PMF"]
    X_test = X.iloc[test_idx, :]
    Y_test = Y.iloc[test_idx, :]["OAtot_PMF"]
    # Inner CV for parameter tuning: the splitter receives the training
    # stations through `groups=` at fit time
    inner_station_id = data.station_id.iloc[train_idx]
    cv_inner = GroupKFold(n_splits = n_sp)
    # random forest
    rf = RandomForestRegressor(random_state=10)
    param_rf = {"max_features": ["sqrt","log2"]}
    clf = GridSearchCV(rf, param_rf, cv=cv_inner)
    clf.fit(X_train, Y_train, groups=inner_station_id)
    # boosting (seeded like the random forest so that reruns are reproducible)
    boost = GradientBoostingRegressor(random_state=10)
    param_boost = {"learning_rate": [0.1, 0.01, 0.05], "max_depth":[1,3,5]}
    clf2 = GridSearchCV(boost, param_boost, cv=cv_inner)
    clf2.fit(X_train, Y_train, groups=inner_station_id)
    # Predict at unseen station
    test_stations = data.station_id.iloc[test_idx].unique()
    for test_id in test_stations:
        # X shares its index with data, so the station mask selects the same rows
        x_test = X.loc[data.station_id == test_id]
        res_rf[test_id] = clf.predict(x_test)
        res_boost[test_id] = clf2.predict(x_test)

10it [2:46:26, 998.65s/it] 


In [27]:
def plot_res(station_name):
    """Plot random-forest predictions, observations and CAMx for one station.

    One figure is drawn for each station_id found in ``data`` under
    ``station_name``; the MSE and MAPE of the random forest and of CAMx
    against the observations are printed for each of them.

    Parameters
    ----------
    station_name : value of the ``station`` column of ``data``.
    """
    station_ids = data.loc[data.station == station_name, "station_id"].unique()
    for sid in station_ids:
        plt.figure(figsize=(16,9))
        predicted = res_rf[sid]
        station_rows = data.station_id == sid
        observed = data.loc[station_rows, "OAtot_PMF"]
        camx = data.loc[station_rows, "OAtot_CAMX"]

        # the x axis is the row index of the station's observations
        x_axis = observed.index
        plt.plot(x_axis, predicted, label ="Prediction")
        plt.plot(x_axis, observed, label ="Observation")
        plt.plot(x_axis, camx, label ="CAMx")
        plt.title(station_name + " " + sid)
        plt.legend(loc='upper center', bbox_to_anchor=(0.5, -0.15),
                fancybox=True, shadow=True, ncol=5)

        report = (
            ("mse rf", mse, predicted),
            ("mse camx", mse, camx),
            ("mape rf", mape, predicted),
            ("mape camx", mape, camx),
        )
        for label, metric, estimate in report:
            print(label, metric(observed, estimate))

In [None]:
# Interactive selector: calls plot_res with the station name picked among the
# distinct values of data.station
interact(plot_res, station_name = data.station.unique())

In [30]:
def my_mape(Y_true, Y_pred):
    """Mean absolute relative error with the denominator shifted by one.

    Computes ``mean(|Y_true - Y_pred| / |Y_true + 1|)``. The ``+ 1`` in the
    denominator keeps the ratio finite when ``Y_true`` is 0.

    Parameters
    ----------
    Y_true : array-like of observed values.
    Y_pred : array-like of predicted values, same length as ``Y_true``.

    Returns
    -------
    The mean of the element-wise ratios.
    """
    relative_error = (Y_true - Y_pred) / (Y_true + 1)
    return np.abs(relative_error).mean()

In [52]:
# get full results
# Per-station errors, keyed by station_id
mse_rf, mse_boost, mse_camx = {}, {}, {}
mape_rf, mape_boost, mape_camx = {}, {}, {}
# counters initialised here; this cell does not update them
imp_rf_mse = imp_boost_mse = imp_rf_mape = imp_boost_mape = 0

for stat in data.station_id.unique():
    y_pred_rf = res_rf[stat]
    y_pred_boost = res_boost[stat]
    station_rows = data.station_id == stat
    Y_true = data.loc[station_rows, "OAtot_PMF"]
    Y_camx = data.loc[station_rows, "OAtot_CAMX"]

    # mean squared error against the observations
    mse_rf[stat] = mse(Y_true, y_pred_rf)
    mse_camx[stat] = mse(Y_true, Y_camx)
    mse_boost[stat] = mse(Y_true, y_pred_boost)

    # relative error (my_mape); CAMx is converted to an array like the predictions
    mape_rf[stat] = my_mape(Y_true, y_pred_rf)
    mape_boost[stat] = my_mape(Y_true, y_pred_boost)
    mape_camx[stat] = my_mape(Y_true, np.array(Y_camx))

# mean and standard deviation of every metric across stations
summary = (
    ("mse_rf", mse_rf),
    ("mse_boost", mse_boost),
    ("mse_camx", mse_camx),
    ("mape_rf", mape_rf),
    ("mape_boost", mape_boost),
    ("mape_camx", mape_camx),
)
for label, per_station in summary:
    scores = list(per_station.values())
    print(label, np.mean(scores), "std.dev: ", np.std(scores))

mse_rf 30.279068471404102 std.dev:  115.32025850657475
mse_boost 33.91150181876793 std.dev:  141.0740659634627
mse_camx 41.368535130917905 std.dev:  150.1319050053784
mape_rf 0.49977102754108427 std.dev:  0.30944158669665206
mape_boost 0.46966484737643727 std.dev:  0.2831330396878055
mape_camx 0.4489430055277247 std.dev:  0.16146792264629894


In [53]:
# percentage of improvements of RF wrt to CAMx
# For each metric, count the stations where the model error is strictly lower
# than the CAMx error. Stations are paired by their station_id key, so the
# comparison does not depend on the insertion order of the dictionaries.
def _count_improved(model_err, camx_err):
    """Return how many keys of model_err have an error strictly below camx_err."""
    return sum(int(model_err[key] < camx_err[key]) for key in model_err)

n = len(mse_rf)
imp_mse_rf = _count_improved(mse_rf, mse_camx)
imp_mse_boost = _count_improved(mse_boost, mse_camx)
imp_mape_rf = _count_improved(mape_rf, mape_camx)
imp_mape_boost = _count_improved(mape_boost, mape_camx)
# the printed values are fractions between 0 and 1
print("MSE rf percentage of improved stations", imp_mse_rf/n)
print("MSE boost percentage of improved stations", imp_mse_boost/n)
print("MAPE rf percentage of improved stations", imp_mape_rf/n)
print("MAPE boost percentage of improved stations", imp_mape_boost/n)

MSE rf percentage of improved stations 0.8137254901960784
MSE boost percentage of improved stations 0.8431372549019608
MAPE rf percentage of improved stations 0.5294117647058824
MAPE boost percentage of improved stations 0.6274509803921569


In [None]:
# boxplot
# Distribution of the per-station MSE for CAMx, random forest and boosting.
sns.set_style("whitegrid")
labels = ['CAMx', 'Random Forest', 'Boosting']
plt.figure(figsize=(16,9))
# patch_artist is a boolean switch (draw filled boxes), not a colour
plt.boxplot(
    [list(mse_camx.values()), list(mse_rf.values()), list(mse_boost.values())],
    labels=labels,
    patch_artist=True,
)
# the y axis is log-scaled; the tick values stay raw MSE
plt.yscale("log")
plt.ylabel("log(MSE)", fontsize = 28)
plt.tick_params(axis='x', labelsize=28)
plt.tick_params(axis='y', labelsize=24)
plt.title("MSE", fontsize = 30)

In [None]:
# Distribution of the per-station relative error (my_mape) for CAMx,
# random forest and boosting.
sns.set_style("whitegrid")
labels = ['CAMx', 'Random Forest', 'Boosting']
plt.figure(figsize=(16,9))
# patch_artist is a boolean switch (draw filled boxes), not a colour
plt.boxplot(
    [list(mape_camx.values()), list(mape_rf.values()), list(mape_boost.values())],
    labels=labels,
    patch_artist=True,
)
# the y axis is log-scaled; the tick values stay raw MAPE
plt.yscale("log")
plt.ylabel("log(MAPE)", fontsize = 28)
plt.tick_params(axis='x', labelsize=28)
plt.tick_params(axis='y', labelsize=24)
plt.title("MAPE", fontsize = 30)


In [None]:
# Permutation feature importance of the tuned random forest, 30 shuffles per
# feature. sklearn.inspection exposes `permutation_importance` (singular),
# which the import cell at the top of the notebook already brings into scope.
# NOTE(review): `clf` is the search fitted in the last outer CV fold, and X / Y
# also contain the rows that fold was trained on -- confirm this is the
# intended evaluation set.
result = permutation_importance(clf, X, Y.OAtot_PMF, n_repeats=30, random_state=42)

In [None]:
# Boxplot of the ten features with the largest mean permutation importance.
perm_sorted_idx = result.importances_mean.argsort()
# argsort is ascending, so the last ten positions are the top ten features
top10_idx = perm_sorted_idx[-10:]

fig, ax1 = plt.subplots(1, 1, figsize=(16, 9))
sns.set_style("whitegrid")
# one horizontal box per feature, built from its repeated shuffles
ax1.boxplot(
    result.importances[top10_idx].T,
    vert=False,
    labels=X.columns[top10_idx],
    patch_artist=True,
)
ax1.set_title("Permutation Feature Importance: Random Forest", fontsize=30)
ax1.set_xlabel("Average increase in loss", fontsize=28)
for axis_name in ("x", "y"):
    ax1.tick_params(axis=axis_name, labelsize=24)