### Looking at PMF results and trying to predict them (at unseen stations)

In [1]:
# load packages
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import r2_score, mean_absolute_percentage_error as mape, mean_squared_error as mse
from pyarrow import feather as pq
import geopandas as gpd
import folium
from folium import Marker
from shapely import geometry
from tqdm import tqdm
pd.set_option('display.max_columns', None)
from ipywidgets import interact
import scipy
from tqdm import tqdm
from sklearn.model_selection import GroupKFold, GridSearchCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
import os

In [2]:
# Load the "converted" dataset (contains geo-spatial features and OC converted to OA using c=1.52).
path = "/Users/andrea/Desktop/Thesis/CleanCode/data_converted.csv"
data = pd.read_csv(path)
data["time"] = pd.to_datetime(data["time"])

# Only rows that carry an OA measurement are usable.
data = data.loc[data["OAtot_PMF"].notnull(), :]

# Two stations have to be given a station id by hand.
for station_name, manual_id in (
    ("Bologna (BO)", "ID1999"),
    ("San Pietro Capofiume (SPC)", "ID1998"),
):
    data.loc[data["station"] == station_name, "station_id"] = manual_id

# CLEAN DATA
# Drop observations with OA below 0.1 ...
data = data.loc[data["OAtot_PMF"] >= 0.1, :]

# ... and keep only stations with more than 30 remaining observations.
# The per-station flag is joined back on station_id as column "OAtot_PMFkeep".
obs_per_station = data.groupby("station_id")["OAtot_PMF"].size()
select = (obs_per_station > 30).reset_index()
data = (
    data.set_index("station_id")
    .join(select.set_index("station_id"), rsuffix="keep")
)
data = data.loc[data["OAtot_PMFkeep"] == True, :].reset_index()

# London: replace the total OA by the sum of the three PMF components.
data["OAtot_2"] = data["HOA_PMF"] + data["BBOA_PMF"] + data["OOAtot_PMF"]
is_london = data["station"] == "London"
data.loc[is_london, "OAtot_PMF"] = data.loc[is_london, "OAtot_2"]

# Remove Zurich 2017
is_zurich_2017 = (data["station"] == "Zurich") & (data["year"] == 2017)
data = data.loc[~is_zurich_2017, :]

In [3]:
# Road classes: difference between the "_1000" and the "_100" column of each class.
for road_class in (1, 2, 3):
    data[f"rc_{road_class}_1000-rc_{road_class}_100"] = (
        data[f"road_class_{road_class}_1000"] - data[f"road_class_{road_class}_100"]
    )

# CAMX proportions of the components of OA
for component in ("HOA", "BBOA", "OOAtot"):
    data["p_" + component] = data[component + "_CAMX"] / data["OAtot_CAMX"]

# Decorrelate the land-use variables: keep the "500" column as is and store the
# difference between the "1000" and the "500" column as "diff_<class>".
land_use_classes = [
    "agriculture", "airports", "barren", "industrial", "industrial_transport",
    "natural_green", "ports", "roads_rails", "snow_ice", "transport",
    "urban_fabric", "urban_green", "water", "wetlands",
]
for land_use in land_use_classes:
    data["diff_" + land_use] = data[land_use + "1000"] - data[land_use + "500"]

# Same treatment for population and IMD (note the different column spellings).
data["diff_population"] = data["population_1000"] - data["population_500"]
data["diff_imd"] = data["imd1000"] - data["imd500"]

# Day of the week taken from the timestamp.
data["day_of_week"] = data["time"].dt.day_of_week

In [4]:
# Number of rows with a non-missing OOAtot_PMF value.
data["OOAtot_PMF"].notnull().sum()

4119

In [5]:
# PMF results: rows where at least one of the three PMF components is present.
pmf_components = ["HOA_PMF", "BBOA_PMF", "OOAtot_PMF"]
PMF = data.loc[data[pmf_components].notnull().any(axis=1), :]

In [None]:
# More cleaning is needed: stations that are not in Chen's paper should be removed
# (Finokalia, Preila, Rugsteliskis, Vilnius, SMEARII).
# List the stations currently present in the PMF subset.
PMF["station"].unique()

In [7]:
# Drop SMEARII, then treat every remaining missing value as zero.
PMF = PMF.loc[PMF["station"] != "SMEARII", :].fillna(0)

In [None]:
# OVERLAP WITH CAMX
# One panel per component: density of the PMF values with the density of the
# corresponding CAMX values (semi-transparent) drawn on top.
fig, ax = plt.subplots(3, 1, figsize=(16, 9))
panels = [("HOA", "HOA"), ("BBOA", "BBOA"), ("OOAtot", "OOA")]
for axis, (column_prefix, panel_title) in zip(ax, panels):
    PMF[column_prefix + "_PMF"].plot(ax=axis, kind="density")
    PMF[column_prefix + "_CAMX"].plot(ax=axis, kind="density", alpha=0.5)
    axis.set_title(panel_title)
    axis.legend()

In [9]:
# MODELLING

pmf_parts = ["HOA", "BBOA", "OOAtot"]

# Y: each PMF component as a proportion of the summed components (suffix _p).
PMF["SUM"] = PMF["HOA_PMF"] + PMF["BBOA_PMF"] + PMF["OOAtot_PMF"]
for part in pmf_parts:
    PMF[part + "_p"] = PMF[part + "_PMF"] / PMF["SUM"]
Y = PMF.loc[:, [part + "_p" for part in pmf_parts]]

# X: predictors, including each CAMX component as a proportion of CAMX total OA.
for part in pmf_parts:
    PMF[part + "_p_CAMX"] = PMF[part + "_CAMX"] / PMF["OAtot_CAMX"]

covars = [
    # grouping key (removed from X below)
    "station_id",
    # CAMX components, as proportions and as given
    "HOA_p_CAMX", "BBOA_p_CAMX", "OOAtot_p_CAMX",
    "HOA_CAMX", "BBOA_CAMX", "OOAtot_CAMX",
    # calendar
    "year", "month", "day_of_week",
    # remaining CAMX variables
    "temp_CAMX", "rh_CAMX", "press_CAMX", "ws_CAMX", "wd_CAMX", "pblh_CAMX",
    "wind_x_CAMX", "wind_y_CAMX",
    # land use: differences between the 1000 and the 500 columns
    "diff_agriculture", "diff_airports", "diff_barren", "diff_industrial",
    "diff_industrial_transport", "diff_natural_green", "diff_ports",
    "diff_roads_rails", "diff_snow_ice", "diff_transport", "diff_urban_fabric",
    "diff_urban_green", "diff_water", "diff_wetlands",
    # land use: the 500 columns
    "agriculture500", "airports500", "barren500", "industrial500",
    "industrial_transport500", "natural_green500", "ports500", "roads_rails500",
    "snow_ice500", "transport500", "urban_fabric500", "urban_green500",
    "water500", "wetlands500",
    # IMD, population and location
    "diff_imd", "imd500", "diff_population", "population_500",
    "elevation", "Lat", "Lon",
]

# Groups for the grouped cross-validation; station_id itself is not a predictor.
groups = PMF["station_id"].copy()
X = PMF.loc[:, covars].drop(columns="station_id")

In [10]:
# TODO: find out how a missing value can arise in Y.
# Until then, every missing proportion is replaced by zero.
Y = Y.fillna(value=0)

In [11]:
# How well does CAMx align with PMF
def compare_component(component):
    """Scatter the PMF proportion of a component against its CAMx proportion.

    One scatter series is drawn per station of the global ``PMF`` frame.

    Parameters
    ----------
    component : str
        Column prefix; the columns ``component + "_p"`` (x axis) and
        ``component + "_p_CAMX"`` (y axis) are read from ``PMF``.
    """
    plt.figure(figsize = (16,9))
    for name in PMF.station.unique():
        at_station = PMF.station == name
        plt.scatter(PMF.loc[at_station, component + "_p"], PMF.loc[at_station, component + "_p_CAMX"])
    # The identity line and the axis decorations do not depend on the station,
    # so they are drawn once instead of once per loop iteration.
    plt.axline(xy1 = (0,0), slope = 1)
    plt.xlabel("PMF")
    plt.ylabel("CAMX")
    plt.title(component)

In [None]:
# Interactive selector: pick the component to compare.
component_choices = ["HOA", "BBOA", "OOAtot"]
interact(compare_component, component=component_choices)

In [52]:
# Nested, station-grouped cross-validation.
# Outer loop: one fold per station id, so every station is predicted by a model
# that has not seen it. Inner loop: 5-fold grouped grid search to tune the forest.
# The number of outer folds is taken from `groups` (the station ids that are
# actually used for splitting), which guarantees exactly one station per test fold.
n_stations = groups.nunique()
kfold = GroupKFold(n_splits=n_stations)
# store results, keyed by the station id of the held-out fold
res_forest = dict()
observed = dict()
camx = dict()

for train_idx, test_idx in tqdm(kfold.split(X, Y, groups), total=n_stations):
    # get data
    X_train, X_test = X.iloc[train_idx, :], X.iloc[test_idx, :]
    Y_train, Y_test = Y.iloc[train_idx], Y.iloc[test_idx]

    # train regressors
    rf = RandomForestRegressor(random_state=99, n_jobs=-1, n_estimators=500)

    # GridSearch, with inner folds that are grouped by station as well
    inner_kfold = GroupKFold(n_splits=5)
    inner_groups = groups.iloc[train_idx]
    param_rf = {"max_features": ["sqrt", "log2"]}

    clf = GridSearchCV(rf, param_grid=param_rf, cv=inner_kfold.split(X_train, Y_train, inner_groups))
    clf.fit(X_train, Y_train)

    # predict the held-out station once and reuse the result
    y_pred = clf.predict(X_test)

    # id of the held-out station (the test fold holds a single station)
    station = groups.iloc[test_idx].iloc[0]

    # store result per station
    res_forest[station] = y_pred
    observed[station] = Y_test
    # raw CAMx proportions as a (3, n_rows) array: rows are HOA, BBOA, OOAtot
    camx[station] = np.array((X_test.HOA_p_CAMX, X_test.BBOA_p_CAMX, X_test.OOAtot_p_CAMX))


22it [04:18, 11.74s/it]


In [53]:
# target metric of choice
def my_mape(Y_true, Y_pred):
    """Mean absolute error relative to ``Y_true + 1``.

    Computes ``mean(|(Y_true - Y_pred) / (Y_true + 1)|)``.

    Parameters
    ----------
    Y_true, Y_pred
        Array-like objects supporting element-wise arithmetic
        (e.g. numpy arrays or pandas Series).

    Returns
    -------
    The mean of the scaled absolute errors.
    """
    residual = Y_true - Y_pred
    scaled = residual / (Y_true + 1)
    return np.abs(scaled).mean()

In [None]:
# Unweighted results: one score per station for a single component, then the
# plain mean and standard deviation over stations.
mse_forest, mape_forest = {}, {}
mse_camx, mape_camx = {}, {}

# 0 for HOA; 1 for BBOA; and 2 for OOA.
i = 2
for stat in PMF.station_id.unique():
    obs_i = observed[stat].iloc[:, i]
    rf_i = res_forest[stat][:, i]
    camx_i = camx[stat][i]
    mse_forest[stat] = mse(obs_i, rf_i)
    mape_forest[stat] = my_mape(obs_i, rf_i)
    mse_camx[stat] = mse(obs_i, camx_i)
    mape_camx[stat] = my_mape(obs_i, camx_i)

# unweighted
for label, scores in (
    ("mse rf:", mse_forest),
    ("mse camx:", mse_camx),
    ("mape rf:", mape_forest),
    ("mape camx:", mape_camx),
):
    values = list(scores.values())
    print(label, np.mean(values), "std.dev: ", np.std(values))

In [None]:
# Per-station MSE of the forest and of the CAMx proportions, for every component.
rf_hoa, rf_bboa, rf_ooa = {}, {}, {}
camx_hoa, camx_bboa, camx_ooa = {}, {}, {}

# Position in this list = column index: 0 for HOA; 1 for BBOA; and 2 for OOA.
score_pairs = [(rf_hoa, camx_hoa), (rf_bboa, camx_bboa), (rf_ooa, camx_ooa)]
for stat in PMF.station_id.unique():
    for col, (rf_scores, camx_scores) in enumerate(score_pairs):
        truth = observed[stat].iloc[:, col]
        rf_scores[stat] = mse(truth, res_forest[stat][:, col])
        camx_scores[stat] = mse(truth, camx[stat][col])

# Box plot: RF next to CAMx for each component, on a logarithmic y axis.
plt.figure(figsize = (16,9))
sns.set_style("whitegrid")
labels = ["RF HOA","CAMx HOA","RF BBOA","CAMx BBOA", "RF OOA", "CAMx OOA"]
positions = [1, 2.5, 6, 7.5, 11,12.5]
box_data = [list(scores.values()) for pair in score_pairs for scores in pair]
plt.boxplot(box_data, positions=positions, labels=labels, patch_artist=True)
plt.yscale("log")
plt.ylabel("log(MSE)", fontsize=20)
plt.tick_params(axis='x', labelsize=20)
plt.tick_params(axis='y', labelsize=24)
plt.title("MSE for OA proportions", fontsize=30)

In [None]:
# Box plot of the per-station MAPE values stored in mape_forest / mape_camx,
# drawn on a logarithmic y axis.
labels = ["Random Forest","CAMx"]
sns.set_style("whitegrid")
plt.figure(figsize=(16,9))
mape_per_model = [list(mape_forest.values()), list(mape_camx.values())]
plt.boxplot(mape_per_model, patch_artist=True, labels=labels)
plt.yscale("log")
plt.ylabel("log(MAPE)", fontsize=20)
plt.tick_params(axis='x', labelsize=28)
plt.tick_params(axis='y', labelsize=24)
plt.title("MAPE of OOA", fontsize=30)

In [38]:
def compare_component(component):
    """Scatter the PMF proportion of a component against the forest prediction.

    One scatter series is drawn per station of the global ``PMF`` frame; the
    predictions are read from the global ``res_forest`` dict (keyed by station id).

    Parameters
    ----------
    component : str
        ``"HOA"``, ``"BBOA"`` or ``"OOAtot"``. The column ``component + "_p"``
        of ``PMF`` provides the x values.
    """
    # Column of the forest output for each component; any other value falls
    # back to column 2 (OOAtot).
    column = {"HOA": 0, "BBOA": 1}.get(component, 2)
    plt.figure(figsize = (16,9))
    for name in PMF.station.unique():
        at_station = PMF.station == name
        stat_id = PMF.loc[at_station, "station_id"].iloc[0]
        plt.scatter(PMF.loc[at_station, component + "_p"], res_forest[stat_id][:, column])
    # The identity line and the axis decorations do not depend on the station,
    # so they are drawn once instead of once per loop iteration.
    plt.axline(xy1 = (0,0), slope = 1)
    plt.xlabel("PMF")
    plt.ylabel("Predicted")
    plt.title(component)

In [None]:
# Interactive selector: pick the component to compare.
component_choices = ["HOA", "BBOA", "OOAtot"]
interact(compare_component, component=component_choices)

In [None]:
# visualize results
def predict_comp(station_name, compare: bool):
    """Plot predicted and observed component proportions over time for one station.

    Three stacked panels (HOA, BBOA, OOAtot) are drawn. Data come from the
    globals ``PMF``, ``res_forest`` and ``observed``.

    Parameters
    ----------
    station_name
        Value of ``PMF.station`` identifying the station.
    compare : bool
        If true, the raw CAMx proportion (``<component>_p_CAMX``) is added to
        every panel.
    """
    fig, ax = plt.subplots(3,1, figsize = (16,9))
    # from station name to station ID (the first id found is used)
    stat_id = PMF.loc[PMF.station == station_name, "station_id"].unique()[0]
    station_rows = PMF.station_id == stat_id
    times = PMF.loc[station_rows, "time"]
    predicted = res_forest[stat_id]
    truth = observed[stat_id]

    # Panel / column order: 0 = HOA, 1 = BBOA, 2 = OOAtot.
    for col, (axis, component) in enumerate(zip(ax, ("HOA", "BBOA", "OOAtot"))):
        axis.plot(times, predicted[:, col], label="Predicted")
        axis.plot(times, truth.iloc[:, col], label="Observed")
        if compare:
            # for comparison with raw CAMx; _p for proportion
            axis.plot(times, PMF.loc[station_rows, component + "_p_CAMX"], label="CAMx")
        axis.legend()
        axis.set_title(component)

In [None]:
# Interactive selector: pick a station and whether to overlay raw CAMx.
station_choices = PMF.station.unique()
interact(predict_comp, station_name=station_choices, compare=[True, False])

In [None]:
# Weighted performance: every station's score is multiplied by its number of
# predicted rows, summed over stations and divided by the total number of rows.
n = groups.nunique()
hoa = np.zeros(n)
bboa = np.zeros(n)
ooa = np.zeros(n)
hoa_camx = np.zeros(n)
bboa_camx = np.zeros(n)
ooa_camx = np.zeros(n)
# choose metric
metric = my_mape
# choose results (i.e. monthly or daily and RF or Boosting)
result = res_forest

for i, station_id in enumerate(groups.unique()):
    truth = observed[station_id]
    weight = len(result[station_id])
    # model predictions (columns: 0 = HOA, 1 = BBOA, 2 = OOA)
    hoa[i] = metric(truth.iloc[:,0], result[station_id][:,0]) * weight
    bboa[i] = metric(truth.iloc[:,1], result[station_id][:,1]) * weight
    ooa[i] = metric(truth.iloc[:,2], result[station_id][:,2]) * weight
    # raw CAMx proportions, weighted the same way
    hoa_camx[i] = metric(truth.iloc[:,0], camx[station_id][0]) * weight
    bboa_camx[i] = metric(truth.iloc[:,1], camx[station_id][1]) * weight
    ooa_camx[i] = metric(truth.iloc[:,2], camx[station_id][2]) * weight

# Print the metric by name rather than by its function repr.
metric_name = metric.__name__
total_rows = len(PMF)
print(metric_name, "HOA", hoa.sum()/total_rows)
print(metric_name, "BBOA", bboa.sum()/total_rows)
print(metric_name, "OOA", ooa.sum()/total_rows)
# same metric for raw CAMx, from the per-station values computed above
print(metric_name, " of HOA CAMx", hoa_camx.sum()/total_rows)
print(metric_name, " of BBOA CAMx", bboa_camx.sum()/total_rows)
print(metric_name, " of OOA CAMx", ooa_camx.sum()/total_rows)

In [None]:
# Compare the absolute PMF values with raw CAMx and with the reconstructed
# values (predicted proportion times CAMx total OA), one score per station.
# try mse, mae, my_mape
metric = my_mape
score_hoa, score_bboa, score_ooa = {}, {}, {}
camx_hoa, camx_bboa, camx_ooa = {}, {}, {}

for i, id in enumerate(groups.unique()):
    station_rows = PMF.loc[PMF.station_id == id]
    # CAMx total OA and components, observed total OA and the PMF component sum
    oa_camx = station_rows["OAtot_CAMX"]
    hoa_camx = station_rows["HOA_CAMX"]
    bboa_camx = station_rows["BBOA_CAMX"]
    ooa_camx = station_rows["OOAtot_CAMX"]
    oa_obs = station_rows["OAtot_PMF"]
    oa_sum = station_rows["SUM"]

    # my predictions: predicted proportion scaled by CAMx total OA
    p_hoa, p_bboa, p_ooa = (res_forest[id][:, col] for col in range(3))
    pred_hoa = p_hoa * oa_camx
    pred_bboa = p_bboa * oa_camx
    pred_ooa = p_ooa * oa_camx

    # Observed absolute values: the component sum does not always match the
    # observed total, so the observed proportions are rescaled by SUM.
    obs_hoa = observed[id].iloc[:, 0] * oa_sum
    obs_bboa = observed[id].iloc[:, 1] * oa_sum
    obs_ooa = observed[id].iloc[:, 2] * oa_sum

    score_hoa[id] = metric(obs_hoa, pred_hoa)
    score_bboa[id] = metric(obs_bboa, pred_bboa)
    score_ooa[id] = metric(obs_ooa, pred_ooa)
    # CAMX
    camx_hoa[id] = metric(obs_hoa, hoa_camx)
    camx_bboa[id] = metric(obs_bboa, bboa_camx)
    camx_ooa[id] = metric(obs_ooa, ooa_camx)

# mean and standard deviation over stations
for label, scores in (
    ("rf HOA", score_hoa),
    ("rf BBOA", score_bboa),
    ("rf OOA", score_ooa),
    ("camx HOA", camx_hoa),
    ("camx BBOA", camx_bboa),
    ("camx OOA", camx_ooa),
):
    values = list(scores.values())
    print(label, np.mean(values), "std.dev: ", np.std(values))



In [None]:
# Box plot of the per-station scores of the absolute components (RF next to
# CAMx for each component), on a logarithmic y axis.
# The scores were computed with whatever `metric` is currently selected, so the
# axis label and the title are derived from it instead of being fixed to "MSE".
metric_name = metric.__name__
plt.figure(figsize = (16,9))
sns.set_style("whitegrid")
labels = ["RF HOA","CAMx HOA","RF BBOA","CAMx BBOA", "RF OOA", "CAMx OOA"]
positions = [1, 2.5, 6, 7.5, 11,12.5]
plt.boxplot([list(score_hoa.values()), list(camx_hoa.values()), list(score_bboa.values()), list(camx_bboa.values()),
              list(score_ooa.values()), list(camx_ooa.values())],positions = positions, labels = labels, patch_artist=True)
plt.yscale("log")
plt.ylabel(f"{metric_name} (log scale)", fontsize = 20)
plt.tick_params(axis='x', labelsize=20)
plt.tick_params(axis='y', labelsize=24)
plt.title(f"{metric_name} for OA components", fontsize = 30)


In [47]:
# visualize results
def predict_comp(station_name, compare: bool):
    """Plot predicted and observed absolute component values over time for one station.

    Three stacked panels (HOA, BBOA, OOAtot) are drawn. Predicted proportions
    are multiplied by ``OAtot_CAMX``, observed proportions by ``SUM``. Data come
    from the globals ``PMF``, ``res_forest`` and ``observed``.

    Parameters
    ----------
    station_name
        Value of ``PMF.station`` identifying the station.
    compare : bool
        If true, the CAMx proportion (``<component>_p_CAMX``) multiplied by
        ``OAtot_CAMX`` is added to every panel.
    """
    fig, ax = plt.subplots(3,1, figsize = (16,9))
    # from station name to station ID (the first id found is used)
    stat_id = PMF.loc[PMF.station == station_name, "station_id"].unique()[0]
    station_rows = PMF.station_id == stat_id
    times = PMF.loc[station_rows, "time"]
    oa_camx = PMF.loc[station_rows, "OAtot_CAMX"]
    oa_sum = PMF.loc[station_rows, "SUM"]
    predicted = res_forest[stat_id]
    truth = observed[stat_id]

    # Panel / column order: 0 = HOA, 1 = BBOA, 2 = OOAtot.
    for col, (axis, component) in enumerate(zip(ax, ("HOA", "BBOA", "OOAtot"))):
        axis.plot(times, predicted[:, col] * oa_camx, label="Predicted")
        axis.plot(times, truth.iloc[:, col] * oa_sum, label="Observed")
        if compare:
            # for comparison with raw CAMx
            axis.plot(times, PMF.loc[station_rows, component + "_p_CAMX"] * oa_camx, label="CAMx")
        axis.legend()
        axis.set_title(component)

In [None]:
# Interactive selector: pick a station and whether to overlay raw CAMx.
station_choices = PMF.station.unique()
interact(predict_comp, station_name=station_choices, compare=[True, False])