# Local Bias Analysis
### Predicting absolute relative residuals at seen locations

In [1]:
# load packages
!pip install shap
import shap
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import r2_score, mean_absolute_percentage_error as mape, mean_squared_error as mse
from pyarrow import feather as pq
import geopandas as gpd
import folium
from folium import Marker
from shapely import geometry
from tqdm import tqdm
pd.set_option('display.max_columns', None)
from ipywidgets import interact
import scipy
from tqdm import tqdm
from sklearn.model_selection import GroupKFold, GridSearchCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
import mapie
from mapie import regression
from mapie.metrics import regression_coverage_score
from mapie.regression import MapieRegressor
from mapie.quantile_regression import MapieQuantileRegressor
from sklearn.ensemble import GradientBoostingRegressor
import shap
# Feature Importance
from sklearn.inspection import PartialDependenceDisplay
from sklearn.inspection import permutation_importance

In [175]:
# Load the converted dataset, parse the timestamp column and keep the
# original row labels as an extra "index" column.
data = (
    pd.read_csv("data_converted.csv")
    .assign(time=lambda frame: pd.to_datetime(frame["time"]))
    .reset_index()
)

In [176]:
# CLEAN DATA
# keep observations whose measured OA is at least 0.1
data = data[data["OAtot_PMF"] >= 0.1]
# flag the stations that still have more than 30 observations, attach the
# flag to every row (column "OAtot_PMFkeep") and keep the flagged rows only
select = data.groupby("station_id")["OAtot_PMF"].size().gt(30).reset_index()
data = data.set_index("station_id").join(select.set_index("station_id"), rsuffix="keep")
data = data[data["OAtot_PMFkeep"] == True].reset_index()
# London: replace total OA by the sum of the HOA, BBOA and OOA columns
data["OAtot_2"] = data["HOA_PMF"] + data["BBOA_PMF"] + data["OOAtot_PMF"]
is_london = data["station"] == "London"
data.loc[is_london, "OAtot_PMF"] = data.loc[is_london, "OAtot_2"]
# drop the Zurich rows of 2017 (every other station/year combination is kept)
data = data[~((data["station"] == "Zurich") & (data["year"] == 2017))]

In [177]:
# Calendar feature derived from the timestamp
data["day_of_week"] = data["time"].dt.day_of_week
# Keep rows with an observed OA value and order them chronologically
data = data[data["OAtot_PMF"].notna()].sort_values(by="time")
# Targets: observed (PMF) and modelled (CAMx) OA per station, indexed by time
Y = data[["time", "station", "OAtot_PMF", "OAtot_CAMX"]].set_index("time")
# Design matrix: calendar and CAMx weather variables per station, indexed by time
X = data[
    [
        "time", "station", "month", "week", "day_of_week",
        "temp_CAMX", "rh_CAMX", "press_CAMX", "ws_CAMX", "wd_CAMX",
        "pblh_CAMX", "wind_x_CAMX", "wind_y_CAMX",
    ]
].set_index("time")

In [178]:
def shap_residuals(station_name, abs: bool, rel: bool):
    """Model one station's CAMx residual with a random forest and plot diagnostics.

    Reads the notebook-level frames ``X`` (features) and ``Y`` (targets),
    keeps the rows whose ``station`` column equals ``station_name`` and:

    1. adds one ``delta_*`` column per CAMx weather variable (difference to
       the previous row of the same station, 0 for the first row);
    2. builds the target ``OAtot_PMF - OAtot_CAMX``, optionally divided by
       ``OAtot_PMF + 1`` and/or taken in absolute value;
    3. fits a random forest on the first 75% of the rows and predicts the
       remaining 25%;
    4. draws the series, the train/test split, predictions vs. observations,
       permutation importances (on the test rows) and a SHAP summary plot
       (on the training rows).

    Parameters
    ----------
    station_name
        Value looked up in the ``station`` column of ``X`` and ``Y``.
    abs : bool
        If True the target is the absolute value of the residual.
        (The name shadows the builtin, so ``np.abs`` is used inside.)
    rel : bool
        If True the residual is divided by ``OAtot_PMF + 1``.

    Returns
    -------
    None
        The function only produces matplotlib / SHAP figures.
    """
    weather_vars = ["rh", "temp", "press", "ws", "wd", "pblh", "wind_x", "wind_y"]

    # Explicit copy: new columns must land in an independent frame, not in a
    # slice of the global X.
    X_stat = X.loc[X.station == station_name, :].copy()
    Y_OA = Y.loc[Y.station == station_name, "OAtot_PMF"]
    Y_CAMx = Y.loc[Y.station == station_name, "OAtot_CAMX"]

    # Row-to-row change of each weather variable. A single column assignment
    # is used instead of `X_stat[col].iloc[1:] = ...`: that chained form
    # writes to a temporary Series under pandas copy-on-write, which would
    # silently leave every delta at 0.
    # NOTE(review): delta_wd is a plain difference; if wd_CAMX is an angle,
    # wrap-around is not handled - confirm whether that matters.
    for var in weather_vars:
        X_stat["delta_" + var] = X_stat[var + "_CAMX"].diff().fillna(0)
    X_stat = X_stat.fillna(0)

    # Target: residual, optionally relative and/or absolute
    residual = Y_OA - Y_CAMx
    if rel:
        residual = residual / (Y_OA + 1)
    Y_stat = np.abs(residual) if abs else residual

    # Set the style before the first figure is created so that it also
    # applies to that figure.
    sns.set_style("whitegrid")

    # Plot residual together with observed and modelled OA
    fig, ax = plt.subplots(2, 1, figsize=(16, 9))
    ax[0].plot(Y_stat, label="residual")
    ax[0].plot(Y_OA, label="OA")
    ax[0].plot(Y_CAMx, label="CAMx")
    ax[0].legend()
    ax[0].set_title(station_name)

    # Split by position: first 75% of the rows for training, the rest for testing
    end_train = int(np.floor(len(X_stat) * 0.75))
    features = X_stat.drop("station", axis=1)  # station is an identifier, not a feature
    X_train = features.iloc[:end_train, :]
    X_test = features.iloc[end_train:, :]
    Y_train = Y_stat.iloc[:end_train]
    Y_test = Y_stat.iloc[end_train:]

    # Show which part of the target series is used for training / testing
    ax[1].plot(Y_train, label="train")
    ax[1].plot(Y_test, label="test", color="pink")
    ax[1].set_title(station_name)
    ax[1].set_xlabel("Time")
    ax[1].set_ylabel("Residual")
    ax[1].legend()

    # Random forest; seeded so that repeated calls give the same figures
    rf = RandomForestRegressor(n_estimators=500, random_state=42)
    rf.fit(X_train.reset_index(drop=True), Y_train)
    y_pred = rf.predict(X_test.reset_index(drop=True))
    loss = mse(Y_test, y_pred)

    # Observed vs. predicted target on the test rows
    fig, ax = plt.subplots(1, 2, figsize=(16, 9))
    ax[0].plot(Y_test, label="Observed residuals", color="pink")
    ax[0].plot(Y_test.index, y_pred, label="Predicted residuals", color="black")
    ax[0].set_title(str(station_name) + " MSE: " + str(loss))
    ax[0].legend()

    sns.regplot(x=Y_test, y=y_pred, ax=ax[1])
    ax[1].axline(xy1=(0, 0), slope=1, color="red", label="identity")
    ax[1].set_xlabel("Y_test")
    ax[1].set_ylabel("Y_pred")
    ax[1].legend()

    # Permutation importance on the test rows, least important feature first
    result = permutation_importance(rf, X_test, Y_test, n_repeats=30, random_state=42)
    perm_sorted_idx = result.importances_mean.argsort()
    plt.figure(figsize=(20, 9))
    plt.boxplot(
        result.importances[perm_sorted_idx].T,
        vert=False,
        labels=X_test.columns[perm_sorted_idx],
        patch_artist=True,
    )
    plt.xticks(fontsize=20)
    plt.yticks(fontsize=20)

    # SHAP values of the fitted forest, computed on the training rows
    plt.figure(figsize=(16, 9))
    shap_values = shap.TreeExplainer(rf).shap_values(X_train)
    shap.summary_plot(shap_values, X_train, max_display=10)
    

In [179]:
# Interactive explorer: one dropdown per argument of shap_residuals
interact(
    shap_residuals,
    station_name=data.station.unique(),
    abs=[True, False],
    rel=[False, True],
)

interactive(children=(Dropdown(description='station_name', options=('Virolahti II', 'Iskrba', 'Aspvreten', 'Is…

<function __main__.shap_residuals(station_name, abs: bool, rel: bool)>

## Compute ranks and perform rank aggregation

In [None]:
# Preview the first five rows of the cleaned data
data.head(n=5)

In [94]:
# NOTE: this rebinds the notebook-level X and Y, now keyed by `station_id`
# instead of `station`. shap_residuals filters on the `station` column, so it
# has to be used before this cell is run.
# Targets: observed (PMF) and modelled (CAMx) OA per station id, indexed by time
Y = data[["time", "station_id", "OAtot_PMF", "OAtot_CAMX"]].set_index("time")
# Design matrix: calendar and CAMx weather variables per station id, indexed by time
X = data[
    [
        "time", "station_id", "month", "week", "day_of_week",
        "temp_CAMX", "rh_CAMX", "press_CAMX", "ws_CAMX", "wd_CAMX",
        "pblh_CAMX", "wind_x_CAMX", "wind_y_CAMX",
    ]
].set_index("time")

In [95]:
# df to store results
res = pd.DataFrame( np.zeros( (data.station_id.nunique() , len(X.columns) + 8))) 
res.columns = [*X.columns, "delta_rh", "delta_temp","delta_press", "delta_ws",
               "delta_wd", "delta_pblh", "delta_wind_x", "delta_wind_y" ]

In [96]:
def importance_residuals(station_id):
    """Permutation importance of the features for one station's residual model.

    Reads the notebook-level frames ``X`` (features) and ``Y`` (targets),
    keeps the rows whose ``station_id`` column equals ``station_id``, adds
    one ``delta_*`` column per CAMx weather variable (difference to the
    previous row, 0 for the first row) and fits a random forest on the first
    75% of the rows. The target is the absolute relative residual
    ``|OAtot_PMF - OAtot_CAMX| / (OAtot_PMF + 1)``.

    Parameters
    ----------
    station_id
        Value looked up in the ``station_id`` column of ``X`` and ``Y``.

    Returns
    -------
    The object returned by ``sklearn.inspection.permutation_importance``,
    evaluated on the last 25% of the rows. The feature order is the column
    order of ``X`` without ``station_id``, followed by the ``delta_*``
    columns in the order rh, temp, press, ws, wd, pblh, wind_x, wind_y.
    """
    weather_vars = ["rh", "temp", "press", "ws", "wd", "pblh", "wind_x", "wind_y"]

    # Explicit copy: new columns must land in an independent frame, not in a
    # slice of the global X.
    X_stat = X.loc[X.station_id == station_id, :].copy()
    Y_OA = Y.loc[Y.station_id == station_id, "OAtot_PMF"]
    Y_CAMx = Y.loc[Y.station_id == station_id, "OAtot_CAMX"]

    # Row-to-row change of each weather variable. A single column assignment
    # is used instead of `X_stat[col].iloc[1:] = ...`: that chained form
    # writes to a temporary Series under pandas copy-on-write, which would
    # silently leave every delta at 0.
    for var in weather_vars:
        X_stat["delta_" + var] = X_stat[var + "_CAMX"].diff().fillna(0)
    X_stat = X_stat.fillna(0)

    # Target: absolute relative residual
    Y_stat = np.abs((Y_OA - Y_CAMx) / (Y_OA + 1))

    # Split by position: first 75% of the rows for training, the rest for testing
    end_train = int(np.floor(len(X_stat) * 0.75))
    features = X_stat.drop("station_id", axis=1)  # identifier, not a feature
    X_train = features.iloc[:end_train, :]
    X_test = features.iloc[end_train:, :]
    Y_train = Y_stat.iloc[:end_train]
    Y_test = Y_stat.iloc[end_train:]

    # Random forest; seeded so that the importances are reproducible
    rf = RandomForestRegressor(n_estimators=500, random_state=42)
    rf.fit(X_train.reset_index(drop=True), Y_train)

    # Permutation importance on the held-out rows
    return permutation_importance(rf, X_test, Y_test, n_repeats=30, random_state=42)
    

In [97]:
# iterate over station_id and store results
for idx, stat in tqdm(enumerate(data.station_id.unique())):
    result = importance_residuals(stat)
    res.station_id.iloc[idx] = stat
    res.loc[idx, res.columns[1:]] = result.importances_mean

100it [24:47, 14.87s/it]


In [None]:
# Preview the mean permutation importances of the first five stations
res.head(n=5)

In [99]:
# Independent copy of the importance table that will hold the per-station ranks
res_rank = res.copy(deep=True)

In [113]:
# Per-station rank of every feature by mean permutation importance
# (0 = most important, ties broken by column order).
# - station_id is an identifier, so it is dropped instead of being ranked
#   together with the features.
# - The ranks are computed in one vectorised call and assigned as a whole;
#   writing into `res_rank.iloc[i, :].loc[...]` targets a temporary row
#   Series and is lost under pandas copy-on-write.
res_rank = (
    res.drop(columns="station_id")
    .rank(axis=1, ascending=False, method="first")
    .sub(1)
)

In [None]:
# Preview the feature ranks of the first five stations
res_rank.head(n=5)

In [115]:
# aggregate ranks in a robust way
! pip install ranky
import ranky as rk
# average rank
rk.show(np.sort(rk.borda(res.T)))

In [None]:
# Bar chart of the mean rank of every column of res_rank, lowest mean first
sns.set_style("whitegrid")
res_rank.mean(axis=0).sort_values().plot(kind="bar", figsize=(16, 9))
plt.title("Rank aggregation of Permutation Feature Importance", fontsize=24)
plt.xticks(fontsize=20)
plt.yticks(fontsize=20)
plt.ylabel("Average Rank", fontsize=20)

In [None]:
# Other aggregation scheme: median rank
# station_id is an identifier, not a feature: drop it so that it is not
# aggregated as one of the candidates (rows of the transposed table)
plt.figure(figsize = (16,9))
rk.show(rk.majority(res.drop(columns="station_id").T))

In [None]:
# Other aggregation scheme: center ranking with method='kendalltau'
# station_id is an identifier, not a feature: drop it so that it is not
# aggregated as one of the candidates (rows of the transposed table)
kendal_res = rk.center(res.drop(columns="station_id").T, method='kendalltau')
kendal_res.plot(kind = "bar")