# Analysis of CAMX Performance
### Code to reproduce content in Chapter 2

In [None]:
# load packages
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import r2_score, mean_absolute_percentage_error as mape, mean_squared_error as mse
from pyarrow import feather as pq
import geopandas as gpd
import folium
from folium import Marker
from shapely import geometry
from tqdm import tqdm
pd.set_option('display.max_columns', None)
from ipywidgets import interact
import seaborn as sns

In [None]:
# Make the Google Drive folder that holds the thesis files available
from google.colab import drive
drive.mount("/content/gdrive")

# Read the measurement table and parse its timestamps
data = pd.read_csv("/content/gdrive/MyDrive/Aurora_Thesis/data_converted.csv")
data["time"] = pd.to_datetime(data["time"])
data = data.reset_index()

# Turn it into a GeoDataFrame with one point per row, built from Lon/Lat (CRS 4326)
data_gdf = gpd.GeoDataFrame(
    data,
    geometry=gpd.points_from_xy(x=data["Lon"], y=data["Lat"]),
    crs=4326,
)

# Read the shapefile used as the Europe base map
path2 = "/content/gdrive/MyDrive/Aurora_Thesis/NUTS_RG_20M_2021_3035.shp"
europe = gpd.read_file(path2)
# Show the Coordinate Reference System (CRS) of the map
print(europe.crs)
# Keep only the regions with level code 1
europe = europe[europe["LEVL_CODE"] == 1]
# Remove the remote French islands
europe = europe[europe["NAME_LATN"] != "RUP FR — Régions Ultrapériphériques Françaises"]

In [None]:
# CLEAN DATA
# Drop observations whose OAtot_PMF is below 0.1
data = data[data["OAtot_PMF"] >= 0.1]

# Keep only stations with more than 30 remaining observations.
# `select` holds one boolean per station_id; it is joined back onto the rows
# (as column "OAtot_PMFkeep") and used as the row filter.
select = (data.groupby("station_id")["OAtot_PMF"].size() > 30).reset_index()
data = (
    data.set_index("station_id")
    .join(select.set_index("station_id"), rsuffix="keep")
)
data = data[data["OAtot_PMFkeep"].eq(True)].reset_index()

# For London, replace OAtot_PMF by the sum HOA_PMF + BBOA_PMF + OOAtot_PMF
data["OAtot_2"] = data["HOA_PMF"] + data["BBOA_PMF"] + data["OOAtot_PMF"]
is_london = data["station"] == "London"
data.loc[is_london, "OAtot_PMF"] = data.loc[is_london, "OAtot_2"]

# Remove the Zurich rows of year 2017
data = data[~((data["station"] == "Zurich") & (data["year"] == 2017))]

In [None]:
# define MAPE with + 1 in denominator
def my_mape(Y_true, Y_pred):
    """Return the mean absolute relative error with ``Y_true + 1`` as denominator.

    Computes ``mean(|Y_true - Y_pred| / |Y_true + 1|)`` element-wise. The
    ``+ 1`` in the denominator is part of the metric's definition here.
    The inputs must support element-wise arithmetic and the result of
    ``np.abs`` must expose ``.mean()`` (e.g. NumPy arrays or pandas Series).
    """
    relative_error = (Y_true - Y_pred) / (Y_true + 1)
    return np.abs(relative_error).mean()

In [None]:
# metric used to score each station
metric = my_mape

# one score per station: OAtot_PMF is passed as Y_true, OAtot_CAMX as Y_pred
station_perf = (
    data.groupby(["station_id"])[["OAtot_PMF", "OAtot_CAMX"]]
    .apply(lambda pair: metric(pair.iloc[:, 0], pair.iloc[:, 1]))
    .reset_index()
    .rename(columns={0: "score"})
)
station_perf.head()

In [None]:
# attach each station's score to its rows (left join on station_id)
data_gdf = data_gdf.set_index("station_id").join(
    station_perf.set_index("station_id"),
    how="left",
)

In [None]:
# preview the first rows of data_gdf
data_gdf.head()

In [None]:
# number of rows per station, stored in a column called "size"
station_size = (
    data.groupby(["station_id"])["OAtot_PMF"]
    .size()
    .reset_index()
    .rename(columns={"OAtot_PMF": "size"})
)
station_size.head()

In [None]:
# attach the per-station size, then turn station_id back into a column
data_gdf = (
    data_gdf.join(station_size.set_index("station_id"))
    .reset_index()
)

In [None]:
# number of rows that received a score
data_gdf["score"].notnull().sum()

In [None]:
# one GeoDataFrame per value of the `dataset` column
online = data_gdf[data_gdf["dataset"] == "online"]
riurban = data_gdf[data_gdf["dataset"] == "riurban"]
ebas = data_gdf[data_gdf["dataset"] == "ebas"]

In [None]:
# plot station performance on Europe Map
def plot_performance(dataset_name, zoom, annotate):
    """Draw the ``score`` column of ``data_gdf`` as coloured points on the Europe map.

    Parameters
    ----------
    dataset_name : str
        ``"full"`` plots every row of ``data_gdf``; any other value keeps only
        the rows whose ``dataset`` column equals it. Also used in the title.
    zoom : bool
        When true, restrict the axes to a fixed window given in map
        coordinates (presumably EPSG:3035, the CRS the points are
        reprojected to -- confirm against ``europe.crs``).
    annotate : bool
        When true, write each point's ``station`` value next to it.

    Marker size is ``size / 5`` and the colour limits come from the full
    ``data_gdf`` so that every subset shares the same colour scale.
    """
    sns.set_style("whitegrid")
    if dataset_name == "full":
        dataset = data_gdf
    else:
        dataset = data_gdf.loc[data_gdf.dataset == dataset_name, :]

    # Reproject once and reuse the result for the scatter and the labels
    # (the reprojection was previously repeated three times per call).
    projected = dataset.to_crs("epsg:3035")

    ax = europe.plot(figsize=(16, 9))
    if zoom:
        # Fixed window used to zoom into the map
        ax.set_xlim(2 * 10**6, 6.5 * 10**6)
        ax.set_ylim(1 * 10**6, 6 * 10**6)

    projected.plot(
        ax=ax,
        markersize=projected["size"] / 5,
        column=projected["score"],
        cmap="plasma",
        legend=True,
        vmin=data_gdf.score.min(),
        vmax=data_gdf.score.max(),
    )
    # Address the map axes explicitly rather than pyplot's "current axes"
    ax.set_title(dataset_name + " dataset", fontsize=24)

    # If annotate = True, then show station name on map
    if annotate:
        points = projected.geometry
        for x, y, label in zip(points.x, points.y, projected["station"]):
            ax.annotate(label, xy=(x, y), xytext=(3, 3), textcoords="offset points")
    plt.show()

In [None]:
# Interactive viewer: choose the dataset, whether to zoom, and whether to label stations
interact(
    plot_performance,
    dataset_name=["online", "riurban", "ebas", "full"],
    zoom=[False, True],
    annotate=[False, True],
)

In [None]:
# PLOT PERFORMANCE IN MODELING RELATIVE CHANGE

In [None]:
# order the rows chronologically so successive rows are successive measurements
data = data.sort_values("time")
# group by station: the relative change (alternative response) is computed per station
data_grouped = data.groupby("station_id")

def get_relative(group):
    """Add the row-to-row relative change of ``OAtot_PMF`` to one group.

    For every row after the first, the new column ``rate_of_change_obs`` holds
    ``(current - previous) / previous`` using the rows in their given order;
    the first row is NaN because it has no predecessor.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows of a single group; must contain an ``OAtot_PMF`` column.

    Returns
    -------
    pandas.DataFrame
        The same ``group`` object with ``rate_of_change_obs`` added.
    """
    values = group["OAtot_PMF"].to_numpy()
    rate_of_change = np.full(len(values), np.nan)
    # Difference between successive measurements, relative to the earlier one
    rate_of_change[1:] = (values[1:] - values[:-1]) / values[:-1]
    # TO DO: replace zeros with NANs
    # Build the full array first and assign it in a single step: the previous
    # chained `group[col].iloc[1:] = ...` writes to a temporary object under
    # pandas Copy-on-Write and would leave the column entirely NaN.
    group["rate_of_change_obs"] = rate_of_change
    return group

# Run get_relative on every station group and stack the results into one DataFrame
df_obs = pd.concat(
    [get_relative(station_rows) for _station_id, station_rows in data_grouped]
)
# regroup the result by station for the next step
data_grouped = df_obs.groupby("station_id")

def get_relative2(group):
    """Add the row-to-row relative change of ``OAtot_CAMX`` to one group.

    For every row after the first, the new column ``rate_of_change_CAMX``
    holds ``(current - previous) / previous`` using the rows in their given
    order; the first row is NaN because it has no predecessor.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows of a single group; must contain an ``OAtot_CAMX`` column.

    Returns
    -------
    pandas.DataFrame
        The same ``group`` object with ``rate_of_change_CAMX`` added.
    """
    values = group["OAtot_CAMX"].to_numpy()
    rate_of_change = np.full(len(values), np.nan)
    # Difference between successive values, relative to the earlier one
    rate_of_change[1:] = (values[1:] - values[:-1]) / values[:-1]
    # Build the full array first and assign it in a single step: the previous
    # chained `group[col].iloc[1:] = ...` writes to a temporary object under
    # pandas Copy-on-Write and would leave the column entirely NaN.
    group["rate_of_change_CAMX"] = rate_of_change
    return group

# Run get_relative2 on every station group and stack the results back into `data`
data = pd.concat(
    [get_relative2(station_rows) for _station_id, station_rows in data_grouped]
)
# preview the two relative-change columns that were just added
data[["rate_of_change_obs", "rate_of_change_CAMX"]].head()

In [None]:
# metric used to score each station
metric = my_mape

# one score per station on the relative changes:
# rate_of_change_obs is passed as Y_true, rate_of_change_CAMX as Y_pred
station_perf = (
    data.groupby(["station_id"])[["rate_of_change_obs", "rate_of_change_CAMX"]]
    .apply(lambda pair: metric(pair.iloc[:, 0], pair.iloc[:, 1]))
    .reset_index()
    .rename(columns={0: "score_rel"})
)
# attach the score to the rows of data_gdf (left join on station_id)
# NOTE: station_id stays as the index of data_gdf after this step
data_gdf = data_gdf.set_index("station_id").join(
    station_perf.set_index("station_id"),
    how="left",
)

In [None]:
# number of rows that received a relative-change score
data_gdf["score_rel"].notnull().sum()

103277

In [None]:
# plot station performance on Europe Map (NOW WITH RELATIVE PERFORMANCE)
def plot_performance(dataset_name, zoom, annotate):
    """Draw the ``score_rel`` column of ``data_gdf`` as coloured points on the Europe map.

    Parameters
    ----------
    dataset_name : str
        ``"full"`` plots every row of ``data_gdf``; any other value keeps only
        the rows whose ``dataset`` column equals it. Also used in the title.
    zoom : bool
        When true, restrict the axes to a fixed window given in map
        coordinates (presumably EPSG:3035, the CRS the points are
        reprojected to -- confirm against ``europe.crs``).
    annotate : bool
        When true, write each point's ``station`` value next to it.

    Marker size is ``size / 5`` and the colour limits come from the full
    ``data_gdf`` so that every subset shares the same colour scale.
    """
    if dataset_name == "full":
        dataset = data_gdf
    else:
        dataset = data_gdf.loc[data_gdf.dataset == dataset_name, :]
    sns.set_style("whitegrid")

    # Reproject once and reuse the result for the scatter and the labels
    # (the reprojection was previously repeated three times per call).
    projected = dataset.to_crs("epsg:3035")

    ax = europe.plot(figsize=(16, 9))
    if zoom:
        # Fixed window used to zoom into the map
        ax.set_xlim(2 * 10**6, 6.5 * 10**6)
        ax.set_ylim(1 * 10**6, 6 * 10**6)

    projected.plot(
        ax=ax,
        markersize=projected["size"] / 5,
        column=projected["score_rel"],
        cmap="plasma",
        legend=True,
        vmin=data_gdf.score_rel.min(),
        vmax=data_gdf.score_rel.max(),
    )
    # Address the map axes explicitly rather than pyplot's "current axes"
    ax.set_title(dataset_name + " dataset")

    # If annotate = True, then show station name on map
    if annotate:
        points = projected.geometry
        for x, y, label in zip(points.x, points.y, projected["station"]):
            ax.annotate(label, xy=(x, y), xytext=(3, 3), textcoords="offset points")
    plt.show()

In [None]:
# Interactive viewer: choose the dataset, whether to zoom, and whether to label stations
interact(
    plot_performance,
    dataset_name=["online", "riurban", "ebas", "full"],
    zoom=[False, True],
    annotate=[False, True],
)

interactive(children=(Dropdown(description='dataset_name', options=('online', 'riurban', 'ebas', 'full'), valu…

<function __main__.plot_performance(dataset_name, zoom, annotate)>

In [None]:
# compute score per station (each rounded to 2 decimals)
# Every lambda receives one station's two-column frame:
# column 0 is OAtot_PMF (passed as y_true), column 1 is OAtot_CAMX (passed as y_pred).
scores_mape = data.groupby(["station_id"])[["OAtot_PMF","OAtot_CAMX"]].apply(lambda x: np.round(my_mape(x.iloc[:,0], x.iloc[:,1]),2)).reset_index()
# RMSE is the square root of the MSE. The `squared=False` shortcut of
# mean_squared_error is deprecated and no longer accepted by recent
# scikit-learn releases, so the root is taken explicitly.
scores_rmse = data.groupby(["station_id"])[["OAtot_PMF","OAtot_CAMX"]].apply(lambda x: np.round(np.sqrt(mse(x.iloc[:,0], x.iloc[:,1])),2)).reset_index()
# Bias is mean(OAtot_PMF - OAtot_CAMX)
scores_bias = data.groupby(["station_id"])[["OAtot_PMF","OAtot_CAMX"]].apply(lambda x: np.round(np.mean(x.iloc[:,0] - x.iloc[:,1]),2)).reset_index()

In [None]:
# give the unnamed result column (labelled 0) of each score table its metric name
scores_mape, scores_rmse, scores_bias = (
    table.rename(columns={0: metric_name})
    for table, metric_name in (
        (scores_mape, "Mape"),
        (scores_rmse, "RMSE"),
        (scores_bias, "Bias"),
    )
)

In [None]:
# station name per station_id (first distinct value within the group)
station_names = (
    data.groupby(["station_id"])["station"]
    .apply(lambda labels: labels.unique()[0])
    .reset_index()
)
# source dataset per station_id (first distinct value within the group)
station_dataset = (
    data.groupby(["station_id"])["dataset"]
    .apply(lambda labels: labels.unique()[0])
    .reset_index()
)

In [None]:
# start the summary table: Mape per station plus the station name (left join on station_id)
scores = scores_mape.set_index("station_id").join(
    station_names.set_index("station_id"),
    how="left",
)

In [None]:
# add the dataset each station belongs to (left join on station_id)
scores = scores.join(
    station_dataset.set_index("station_id"),
    how="left",
)

In [None]:
# add RMSE and Bias, then turn station_id back into a regular column
scores = (
    scores.join(scores_rmse.set_index("station_id"))
    .join(scores_bias.set_index("station_id"))
    .reset_index()
)

In [None]:
# keep the summary columns in presentation order
scores = scores.loc[:, ["station", "station_id", "Mape", "Bias", "RMSE", "dataset"]]

In [None]:
# order the summary table by station name
scores = scores.sort_values("station")

In [None]:
# ten stations with the lowest Mape and ten at the other end of the ascending sort
top10_mape = scores.sort_values(by="Mape", ascending=True).head(10)
worse10_mape = scores.sort_values(by="Mape", ascending=True).tail(10)

In [None]:
# write the three tables as CSV files (without the index) to the working directory
scores.to_csv(path_or_buf="mape_stations.csv", index=False)
top10_mape.to_csv(path_or_buf="top10_mape.csv", index=False)
worse10_mape.to_csv(path_or_buf="worse10_mape.csv", index=False)

In [None]:
# time series of yearly/monthly performance aggregated by station
# The two columns are selected with a list: selecting several columns of a
# groupby with a bare tuple (`[...]["a", "b"]`) raises in pandas >= 2.0.
# Each lambda gets OAtot_PMF as column 0 (Y_true) and OAtot_CAMX as column 1 (Y_pred).
year_perf = data.groupby(["station","year"])[["OAtot_PMF","OAtot_CAMX"]].apply(lambda x: my_mape(x.iloc[:,0], x.iloc[:,1])).reset_index()
year_perf = year_perf.rename(columns = {0: "Mape"})

month_perf = data.groupby(["station", "month"])[["OAtot_PMF","OAtot_CAMX"]].apply(lambda x: my_mape(x.iloc[:,0], x.iloc[:,1])).reset_index()
month_perf = month_perf.rename(columns = {0: "Mape"})

In [None]:
# SAME BUT FOR RELATIVE
# time series of yearly/monthly performance aggregated by station
# NOTE: this reuses the names year_perf / month_perf, so running this cell
# replaces any tables previously stored under those names.
# The two columns are selected with a list: selecting several columns of a
# groupby with a bare tuple (`[...]["a", "b"]`) raises in pandas >= 2.0.
year_perf = data.groupby(["station","year"])[["rate_of_change_obs","rate_of_change_CAMX"]].apply(lambda x: my_mape(x.iloc[:,0], x.iloc[:,1])).reset_index()
year_perf = year_perf.rename(columns = {0: "Mape"})

month_perf = data.groupby(["station", "month"])[["rate_of_change_obs","rate_of_change_CAMX"]].apply(lambda x: my_mape(x.iloc[:,0], x.iloc[:,1])).reset_index()
month_perf = month_perf.rename(columns = {0: "Mape"})

In [None]:
# Boxplot of the per-station Mape, one box for each of the listed years
plt.boxplot(
    [
        year_perf.loc[year_perf.year == yr, "Mape"]
        for yr in (2011, 2013, 2015, 2017, 2019)
    ],
    labels=['2011', '2013', '2015', "2017", "2019"],
    notch=False,
    patch_artist=True,
    boxprops={'facecolor': 'skyblue', 'linewidth': 2},
    medianprops={'color': 'red', 'linewidth': 2},
    flierprops={'marker': 'o', 'markerfacecolor': 'black', 'markersize': 5},
)
plt.ylabel("Mape")

In [None]:
# Same boxplot with seaborn: distribution of Mape for each year
sns.set_style("whitegrid")
plt.figure(figsize=(16, 9))
sns.boxplot(
    data=year_perf,
    x="year",
    y="Mape",
    color="skyblue",
    width=0.5,
    linewidth=2,
    notch=False,
    fliersize=5,
    medianprops={'color': 'red'},
    whiskerprops={'color': 'black'},
    capprops={'color': 'black'},
    flierprops={'markerfacecolor': 'black', 'marker': 'o', 'markeredgecolor': 'black'},
)
# larger fonts for axis labels and tick labels
plt.xlabel("Year", fontsize=18)
plt.ylabel("Mape", fontsize=18)
plt.xticks(fontsize=18)
plt.yticks(fontsize=18)

In [None]:
# Seaborn boxplot: distribution of Mape for each month
plt.figure(figsize=(16, 9))
sns.set_style("whitegrid")
sns.boxplot(
    data=month_perf,
    x="month",
    y="Mape",
    color="skyblue",
    width=0.5,
    linewidth=2,
    notch=False,
    fliersize=5,
    medianprops={'color': 'red'},
    whiskerprops={'color': 'black'},
    capprops={'color': 'black'},
    flierprops={'markerfacecolor': 'black', 'marker': 'o', 'markeredgecolor': 'black'},
)
# larger fonts for axis labels and tick labels
plt.xlabel("Month", fontsize=18)
plt.ylabel("Mape", fontsize=18)
plt.xticks(fontsize=18)
plt.yticks(fontsize=18)