# Exploratory Data Analysis
#### Code to reproduce results of Thesis Chapter: Introduction

In [1]:
# load packages
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import r2_score, mean_absolute_percentage_error as mape
from pyarrow import feather as pq
import geopandas as gpd
import folium
from folium import Marker
from shapely import geometry
from tqdm import tqdm
pd.set_option('display.max_columns', None)
from ipywidgets import interact
from IPython.display import display
import ipywidgets as widgets

In [2]:
# mount drive
from google.colab import drive
drive.mount("/content/gdrive")

Mounted at /content/gdrive


In [3]:
# load data (original data)
data = pq.read_feather("/content/gdrive/MyDrive/Aurora_Thesis/ebas_riurban_and_source_data.feather")
# add year, month, week, day
data["year"] = data.time.dt.year
data["month"] = data.time.dt.month
data["week"] = data.time.dt.week
data["day"] = data.time.dt.dayofyear
# make it a GeoPandas
data_gdf = gpd.GeoDataFrame(data, geometry= gpd.points_from_xy(x = data.Lon, y = data.Lat), crs = 4326)
# read locations of CAMx's Grid
location = pd.read_excel("/content/gdrive/MyDrive/Aurora_Thesis/camx_latitude_longitude.xlsx", index_col=0)
# make it a GeoPandas
location_gdf = gpd.GeoDataFrame(location, geometry = gpd.points_from_xy(x = location.Longitude, y= location.Latitude), crs = 4326)

In [None]:
# SOME STATS
print("Number of stations", data.station_id.nunique())
print("Number of datasets", data.dataset.nunique(),data.dataset.unique() )
print("Number of years", data.year.nunique(), data.year.unique())
print("Number of OA measurements", (data.OAtot_PMF.isnull()==False).sum())
print("Number of OC measurements", (data.OC_PMF.isnull()==False).sum())

In [None]:
# STATS ON DATASETS
print("Number of stations ONLINE", data.loc[data.dataset == "online", "station_id"].nunique())
print("Number of stations RIURBAN", data.loc[data.dataset == "riurban","station_id"].nunique())
print("Measurements (OC) in RIURBAN", data.loc[(data.dataset == "riurban") & (data.OC_PMF.isnull()==False),:].shape[0])
print("Number of stations EBAS", data.loc[data.dataset == "ebas","station_id"].nunique())
print("Measurements (OC) in EBAS", data.loc[(data.dataset == "ebas") & (data.OC_PMF.isnull()==False),:].shape[0])


In [None]:
# MORE STATS
print("Years in ONLINE", data.loc[data.dataset == "online", "year"].unique())
print("Years in RIURBAN", data.loc[data.dataset == "riurban", "year"].unique())
print("Years in EBAS", data.loc[data.dataset == "ebas", "year"].unique())


In [None]:
# PLOTTING STATION LOCATIONS USING FOLIUM

# create GeoDataFrame, original coordinates (Lat and Lon) are referenced with crs = 4326
data_gdf = gpd.GeoDataFrame(data, geometry= gpd.points_from_xy(x = data.Lon, y = data.Lat), crs = 4326)
# remove stations without info on coordinates
data_gdf = data_gdf.loc[(data_gdf.Lat.isnull()== False) & (data_gdf.Lat.isnull() == False),:]

# Plot stations
map = folium.Map(location = [data_gdf.geometry.y.mean(), data_gdf.geometry.x.mean()],
                   tiles = "OpenStreetMap", zoom_start = 5)
# add marker per station
for geom in data_gdf.geometry.unique():
    folium.Marker([geom.y, geom.x], radius =1, popup = "marker").add_to(map)
map

In [8]:
# Plot only from one dataset
def plot_dataset(dataset_name):
    # get data
    dataset_data = data_gdf.loc[data_gdf.dataset == dataset_name,:]

    map = folium.Map(location = [dataset_data.geometry.y.mean(), dataset_data.geometry.x.mean()],
                    zoom_start = 4.5)

    # add marker per station
    for geom  in dataset_data.geometry.unique():
        folium.CircleMarker([geom.y, geom.x], radius = 4, color = "red").add_to(map)

    return map

In [None]:
interact(plot_dataset, dataset_name = data.dataset.unique())

In [10]:
# Plot with station names (slower than above)
def plot_dataset_name(dataset_name):
    # get data
    dataset_data = data_gdf.loc[data_gdf.dataset == dataset_name,:]

    map = folium.Map(location = [dataset_data.geometry.y.mean(), dataset_data.geometry.x.mean()],
                    zoom_start = 4.5)

    # add marker per station
    for i, r in dataset_data.iterrows():
        # add Icon ?
        folium.CircleMarker([r.geometry.y, r.geometry.x], radius = 4, color = "red", popup = r.station).add_to(map)

    return map

In [None]:
# plot with station name if click
plot_dataset_name("ebas")

In [None]:
# Plot using GeoPandas
# load background map of Europe
path2 = "/content/gdrive/MyDrive/Aurora_Thesis/NUTS_RG_20M_2021_3035.shp"
europe = gpd.read_file(path2)
# Look at Coordinate Reference System (CRS)
print(europe.crs)
# Select Level Code
europe = europe.loc[europe.LEVL_CODE == 1,:]
# Remove remore French island
europe = europe.loc[europe.NAME_LATN != "RUP FR — Régions Ultrapériphériques Françaises",:]
europe.plot()

In [None]:
sns.set_theme(style="whitegrid")
# make sure to project station coordinate to same CRS at the Europe map (i.e. epsg:3035)
fig, ax = plt.subplots(1,3, figsize = (20,30))
# online
europe.to_crs("epsg:4326").plot(figsize = (16,9), ax = ax[0])
data_gdf.to_crs("epsg:4326").loc[data_gdf.dataset == "online",:].plot(ax = ax[0], color = "darkorange",edgecolor='black', marker = "o",markersize = 60,
                                                                      label = "Online Stations",rasterized=True, figsize = (16,9), aspect = "equal")
ax[0].set_title("Online", fontsize = 24)
ax[0].set_ylabel("Lat", fontsize = 20)
ax[0].set_xlabel("Lon", fontsize = 20)
# ebas
europe.to_crs("epsg:4326").plot(figsize = (16,9), ax = ax[1])
data_gdf.to_crs("epsg:4326").loc[data_gdf.dataset == "ebas",:].plot(ax = ax[1], color = "darkorange",edgecolor='black', marker = "o",markersize = 60,
                                                                      label = "Ebas Stations",rasterized=True, figsize = (16,9), aspect = "equal")
ax[1].set_title("Ebas", fontsize = 24)
ax[1].set_ylabel("Lat", fontsize = 20)
ax[1].set_xlabel("Lon", fontsize = 20)
# riurban
europe.to_crs("epsg:4326").plot(figsize = (16,9), ax = ax[2])
data_gdf.to_crs("epsg:4326").loc[data_gdf.dataset == "riurban",:].plot(ax = ax[2], color = "darkorange",edgecolor='black', marker = "o",markersize = 60,
                                                                      label = "Riurban Stations",rasterized=True, figsize = (16,9), aspect = "equal")
ax[2].set_title("Riurban", fontsize = 24)
ax[2].set_ylabel("Lat", fontsize = 20)
ax[2].set_xlabel("Lon", fontsize = 20)
#xmin, xmax = 2*10**6 ,6.5*10**6
#ymin, ymax = 1*10**6, 6*10**6
#plt.xlim([xmin, xmax])
#plt.ylim([ymin, ymax])
#plt.xlim([-25, 45])
#plt.ylim([30, 75])

In [None]:
# plot stations using Geopandas
sns.set_theme(style="whitegrid")
ax = europe.plot(figsize = (16,9))
# make sure to project station coordinate to same CRS at the Europe map (i.e. epsg:3035)
data_gdf.to_crs("epsg:3035").plot(ax = ax, color = "darkorange",edgecolor='black', marker = "o",markersize = 60, rasterized = True, figsize = (16,9))
# add grid centroids
location_gdf.to_crs("epsg:3035").plot(ax = ax, color = "green", markersize = 0.01, figsize = (16,9), rasterized = True)
#plt.savefig("/Users/andrea/Desktop/Thesis/Pic/p_grid.pdf", bbox_inches = "tight")

# For map in Lat and Lon plane: project europe map to epsg:4326 and leave stations and grid in original CRS.
ax = europe.to_crs("epsg:4326").plot(figsize = (16,9))
data_gdf.plot(ax = ax, color = "darkorange",edgecolor='black', marker = "o",markersize = 60, rasterized = True, figsize = (16,9))
location_gdf.plot(ax = ax, color = "green", markersize = 0.01, figsize = (16,9), rasterized = True )
plt.xlabel("Lon")
plt.ylabel("Lat")
#plt.savefig("/Users/andrea/Desktop/Thesis/Pic/non_p_grid.pdf", bbox_inches = "tight")

In [16]:
# Zurich OA data
Zurich_11 = data.loc[(data.OAtot_PMF.isnull()==False)&(data.year == 2011)&(data.station == "Zurich"),:]
Zurich_17 = data.loc[(data.OAtot_PMF.isnull()==False)&(data.year == 2017)&(data.station == "Zurich"),:]
Zurich_11 = Zurich_11.set_index("day")
Zurich_17 = Zurich_17.set_index("day")

In [None]:
#  Plot of 2011 vs 2017
sns.set_theme(style="whitegrid")
fig, ax = plt.subplots(figsize = (16,9))
plt.rcParams["font.size"] ="24"
Zurich_11.OAtot_PMF.plot(ax = ax, label = "2011")
Zurich_17.OAtot_PMF.plot(ax = ax, label = "2017")
plt.legend(loc='upper center', bbox_to_anchor=(0.5, -0.1),
          fancybox=True, shadow=True, ncol=5, fontsize = 20)

plt.title("Zürich", fontsize = 24)
plt.xlabel("Day of the Year", fontsize = 20)
plt.ylabel("OA concentration", fontsize = 20)
plt.xticks(fontsize = 18)
plt.yticks(fontsize = 18)
#plt.savefig("/Users/andrea/Desktop/Thesis/Pic/zu.pdf", bbox_inches = "tight")

In [19]:
# Remove Svalbard (Norway) since outside of CAMx's grid
data = data.loc[data.station != "Zeppelin mountain (Ny-Ålesund)",:]
data_gdf = data_gdf.loc[data_gdf.station != "Zeppelin mountain (Ny-Ålesund)",:]

In [20]:
# CAMx GRID INFORMATION
print("Number of grid centroids:", location.shape[0])

Number of grid centroids: 55521


In [21]:
# Compute GeoSpatial Features
# Will work with the following projection: crs_ = "epsg:3035" (can choose others epsg:3857, epsg 4326)
crs_ = "epsg:3035"
data_gdf_mt = data_gdf.to_crs(crs_)
data_gdf_mt = data_gdf_mt.fillna(0)

In [22]:
# distance to closest center
distance_center = dict()
for i, stat in enumerate(tqdm(data_gdf_mt.station_id.unique())):
    # get station coordinates
    stat_loc = data_gdf_mt.loc[data_gdf_mt.station_id == stat,"geometry"].iloc[0]
    # get smallest distance to any centroid as distance to closest center
    distance_center[stat] = min(np.abs(stat_loc.distance(location_gdf.to_crs(crs_).geometry)))
distance_center = pd.DataFrame(distance_center.items())
# rename
distance_center = distance_center.rename(columns = {0:"station_id",1:"distance_mt"})
data = data.set_index("station_id")
# join on station ID to add the feature
data = data.join(distance_center.set_index("station_id"), rsuffix=2)
data.loc[:,["station","distance_mt"]]
data = data.reset_index()

100%|██████████| 128/128 [00:20<00:00,  6.14it/s]


In [23]:
# create border of grid
min_lat = min(location.Latitude)
max_lat = max(location.Latitude)
min_lon = min(location.Longitude)
max_lon = max(location.Longitude)
border = []
for row in location_gdf.loc[:,["Latitude","Longitude","geometry"]].iterrows():
    if (row[1].Latitude == min_lat) |  (row[1].Latitude == max_lat) | (row[1].Longitude == max_lon) | (row[1].Longitude == min_lon):
        border.append(row[1])

df_border = pd.DataFrame(border)
gdf_border = gpd.GeoDataFrame(df_border, geometry = df_border.geometry, crs = 4326)

In [None]:
# Plot Grid's border
sns.set_style("whitegrid")
ax = europe.to_crs("epsg:3035").plot(figsize=(16,9))
gdf_border.to_crs("epsg:3035").plot(ax = ax, color = "green",markersize = 3, rasterized = True, figsize = (16,9))
data_gdf.to_crs("epsg:3035").plot(ax= ax, color = "darkorange",edgecolor='black', marker = "o",markersize = 60, rasterized = True, figsize = (16,9))
#plt.savefig("/Users/andrea/Desktop/Thesis/Pic/border.pdf", bbox_inches = "tight")

In [25]:
# Distance to Grid Border
crs_ = "epsg:3035"
distance_border = dict()
for i, stat in enumerate(tqdm(data_gdf_mt.station_id.unique())):
    stat_loc = data_gdf_mt.loc[data_gdf_mt.station_id == stat,"geometry"].iloc[0]
    distance_border[stat] = min(np.abs(stat_loc.distance(gdf_border.to_crs(crs_).geometry)))
distance_border = pd.DataFrame(distance_border.items())
distance_border = distance_border.rename(columns = {0:"station_id",1:"distance_border"})
data = data.set_index("station_id").join(distance_border.set_index("station_id"), rsuffix=2)
data = data.reset_index()

100%|██████████| 128/128 [00:01<00:00, 77.12it/s]


In [26]:
# check spacing of grid centroids
# sort by Longitude
location_gdf_sorted_lon = location_gdf.sort_values(by= "Longitude")
# Group by Latitude and check difference in Longitude
print("Difference in Longitude between centers:",
location_gdf_sorted_lon.groupby("Latitude").apply(lambda x: x.iloc[1:,1].reset_index() - x.iloc[:-1,1].reset_index()).Longitude.unique())

# sort by Latitude
location_gdf_sorted_lat = location_gdf.sort_values(by= "Latitude")
# Group by Latitude and check difference in Latitude
print("Difference in Latitude between centers:",
    location_gdf_sorted_lat.groupby("Longitude").apply(lambda x: x.iloc[1:,0].reset_index() - x.iloc[:-1,0].reset_index()).Latitude.unique())


Difference in Longitude between centers: [0.25]
Difference in Latitude between centers: [0.125]


In [27]:
# From the information above, reconstruct the boxes of the grid and then compute areas
import shapely
from shapely import geometry
from shapely.geometry import Polygon
location_gdf["box"] = location_gdf.apply(lambda s: shapely.geometry.box(s.geometry.x - 0.125,
                                                            s.geometry.y - 0.0625,s.geometry.x + 0.125,
                                                            s.geometry.y  + 0.0625), axis=1)

In [28]:
# make another GeoPandas with box as main geometry
box_gdf = location_gdf.set_geometry("box")
box_gdf = box_gdf.set_crs("epsg:4326")

In [None]:
# Plot boxes
sns.set_style("whitegrid")
ax = europe.plot(figsize=(30,15))
box_gdf.to_crs("epsg:3035").plot(ax=ax, markersize = 0.0000000001,facecolor = "none", edgecolor = "green")

In [30]:
# Compute Areas
# now compute area of boxes in the right projection
areas = box_gdf.to_crs("epsg:3035").area
# make it in km^2
print("mean",np.round(np.sqrt(areas.mean()/1000000),2),
      "min", np.round(np.sqrt(areas.min()/1000000),2),
      "max",np.round(np.sqrt(areas.max()/1000000),2),
      "std dev", np.round(np.sqrt(areas/1000000).std(),2))

mean 15.25 min 11.59 max 17.78 std dev 1.78


In [None]:
# Distribution of Area
plt.hist(np.sqrt(areas/1000000), lw=1,ec="black", alpha=0.5)
plt.xlabel("Area")
plt.ylabel("Frequency")

In [32]:
# add area of each box to its center
location_gdf["area"] = box_gdf.to_crs("epsg:3035").area

In [None]:
location_gdf.head()

In [None]:
# Correlation with Latitude
plt.figure(figsize = (16,9))
plt.scatter(location_gdf["Latitude"], location_gdf["area"])
plt.ylabel("Box's area")
plt.xlabel("Latitude")
plt.title("Correlation between Box's area and Latitude")

In [34]:
# assign area of box to stations
crs_ = "epsg:3035"
area_grid = dict()
for i, stat in enumerate(tqdm(data_gdf_mt.station_id.unique())):
    stat_loc = data_gdf_mt.loc[data_gdf_mt.station_id == stat,"geometry"].iloc[0]
    index = np.argmin(np.abs(stat_loc.distance(location_gdf.to_crs(crs_).geometry)))
    area_grid[stat] = location_gdf["area"].iloc[index]
# make dataframe and join to data
area_grid = pd.DataFrame(area_grid.items())
area_grid = area_grid.rename(columns = {0:"station_id",1:"area_grid"})
data = data.set_index("station_id").join(area_grid.set_index("station_id"), rsuffix=3)
data = data.reset_index()

100%|██████████| 128/128 [00:20<00:00,  6.19it/s]


In [35]:
# Plot individual countries
def plot_country(country_code):
    sns.set_style("whitegrid")
    country = europe.loc[europe.CNTR_CODE == country_code,:]
    fig, ax = plt.subplots(figsize=(16,9))

    # Plot the first geospatial data layer
    country.to_crs("epsg:3035").plot(ax = ax)
    # Save the original xlim and ylim values
    xlim = ax.get_xlim()
    ylim = ax.get_ylim()

    # Plot the other geospatial data layers on top
    data_gdf.to_crs("epsg:3035").plot(ax = ax, color = "darkorange",edgecolor='black', marker = "o",markersize = 60, rasterized = True, figsize = (16,9))
    box_gdf.to_crs("epsg:3035").plot(ax = ax, facecolor = "none",edgecolor = "green", markersize = 1, rasterized = True)
    location_gdf.to_crs("epsg:3035").plot(ax = ax,color = "black", markersize = 1, rasterized = True)

    # Restore the original xlim and ylim values
    ax.set_xlim(xlim)
    ax.set_ylim(ylim)
    # Show the plot
    #plt.savefig("/Users/andrea/Desktop/Thesis/Pic/border_country.pdf", bbox_inches = "tight")

In [None]:
sns.set_style("whitegrid")
country = europe.loc[europe.CNTR_CODE == "CH",:]
fig, ax = plt.subplots(figsize=(16,9))

# Plot the first geospatial data layer
country.to_crs("epsg:3035").plot(ax = ax)
# Save the original xlim and ylim values
xlim = ax.get_xlim()
ylim = ax.get_ylim()

# Plot the other geospatial data layers on top
data_gdf.to_crs("epsg:3035").plot(ax = ax, color = "darkorange",edgecolor='black', marker = "o",markersize = 60, rasterized = True, figsize = (16,9))
box_gdf.to_crs("epsg:3035").plot(ax = ax, facecolor = "none",edgecolor = "green", markersize = 1, rasterized = True)
location_gdf.to_crs("epsg:3035").plot(ax = ax,color = "black", markersize = 1, rasterized = True)

# Restore the original xlim and ylim values
ax.set_xlim(xlim)
ax.set_ylim(ylim)
# Show the plot
#plt.savefig("/Users/andrea/Desktop/Thesis/Pic/border_country.pdf", bbox_inches = "tight")

In [None]:
# interactive plotting
interact(plot_country, country_code = europe.CNTR_CODE.unique())

In [39]:
# transform OC to OA

# conversion factor
c = 1.52

# to do: make it faster
for i in tqdm(range(len(data))):
    if data.OAtot_PMF.isnull().iloc[i]:
        data.OAtot_PMF.iloc[i] = data.OC_PMF.iloc[i] * c

100%|██████████| 123292/123292 [00:36<00:00, 3349.61it/s]


In [None]:
# save Data
#data.to_csv("data_converted.csv",index=False)
# read in data
#data2 = pd.read_csv("data_converted.csv")
#data2.time = pd.to_datetime(data2.time)

In [40]:
data2 = data.copy()

In [41]:
# Function to visualize Time Series of OA vs CAMx
# TO DO: look by station name and if two results, then 2 plots
def visual_ts(station_id):
    sns.set_style("whitegrid")
    plt.figure(figsize=(16,9))
    station_data = data2.loc[data2.station_id == station_id,:]
    plt.plot(station_data.time, station_data.OAtot_PMF, label = "OA")
    plt.plot(station_data.time, station_data.OAtot_CAMX, label = "CAMX")
    plt.legend()
    plt.legend(loc='upper center', bbox_to_anchor=(0.5, -0.15),
          fancybox=True, shadow=True, ncol=5)

    plt.xticks(rotation = 45)
    plt.ylabel("OA concentration", fontsize = 18)
    plt.title(station_id)

In [None]:
# interact
list_oa = data2.loc[data2.OAtot_PMF.isnull()==False, "station_id"].unique()
interact(visual_ts, station_id = list_oa)

In [44]:
# Simple plots from Online dataset
def plot_online(station_name:str):
    sns.set_style("whitegrid")
    plt.figure(figsize = (16,9))
    data_online = data2.loc[data.dataset == "online",:]
    station = data_online.loc[data_online.station == station_name,:]
    station = station.loc[station.OAtot_PMF.isnull()==False,:]


    plt.plot(station.time, station.OAtot_PMF, label = "Observed OA")
    plt.plot(station.time, station.OAtot_CAMX, label = "CAMx OA")
    plt.legend(loc='upper center', bbox_to_anchor=(0.5, -0.25),
          fancybox=True, shadow=True, ncol=5, fontsize = 24)

    plt.title(station_name, fontsize = 30)
    plt.xlabel("Time", fontsize = 24)
    plt.ylabel("OA concentration", fontsize = 24)
    plt.xticks(fontsize=20, rotation = 30)
    plt.yticks(fontsize=20)
    #plt.savefig("/Users/andrea/Desktop/Thesis/Pic/" + str(station_name) + ".pdf", bbox_inches = "tight")


In [None]:
interact(plot_online, station_name= data2.loc[data2.dataset == "online",:].station.unique())

In [None]:
sns.set_style("whitegrid")
fig, ax = plt.subplots(1,2, figsize = (24,9))
data_online = data2.loc[data.dataset == "online",:]
station = data_online.loc[data_online.station == "Dublin",:]
station = station.loc[station.OAtot_PMF.isnull()==False,:]

ax[0].plot(station.time, station.OAtot_PMF, label = "Observed OA")
ax[0].plot(station.time, station.OAtot_CAMX, label = "CAMx OA")
ax[0].set_title("Dublin", fontsize = 30)
ax[0].set_xlabel("Time", fontsize = 24)
ax[0].set_ylabel("OA", fontsize = 24)
ax[0].tick_params(axis="x", labelsize=20, rotation = 30)
ax[0].tick_params(axis="y", labelsize=18)


station = data_online.loc[data_online.station == "Barcelona",:]
station = station.loc[station.OAtot_PMF.isnull()==False,:]

ax[1].plot(station.time, station.OAtot_PMF, label = "Observed OA")
ax[1].plot(station.time, station.OAtot_CAMX, label = "CAMx OA")
ax[1].set_title("Barcelona", fontsize = 30)
ax[1].set_xlabel("Time", fontsize = 24)
ax[1].set_ylabel("OA", fontsize = 24)
ax[1].tick_params(axis="x", labelsize=18, rotation = 30)
ax[1].tick_params(axis="y", labelsize=18)

plt.legend(loc='upper center', bbox_to_anchor=(-0.15, -0.25),
          fancybox=True, shadow=True, ncol=5, fontsize = 24)


