# Estimation of conversion factor C
### Code for content of Chapter 2

In [None]:
# load packages
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import r2_score, mean_absolute_percentage_error as mape
from pyarrow import feather as pq
import geopandas as gpd
import folium
from folium import Marker
from shapely import geometry
from tqdm import tqdm
pd.set_option('display.max_columns', None)
from ipywidgets import interact
from IPython.display import display
import ipywidgets as widgets
from pyarrow import feather as pq

In [None]:
# Mount Google Drive so the CSV stored there is reachable from the Colab runtime.
from google.colab import drive
drive.mount("/content/gdrive")
# load data for monthly performance
# `data` is the notebook's main DataFrame; the cells below read its
# station, time, OAtot_PMF, OC_PMF and OAtot_CAMX columns.
data = pd.read_csv("/content/gdrive/MyDrive/Aurora_Thesis/data_converted.csv")

In [None]:
# find overlap
# SAME STATION AND TIME, BUT DIFFERENT DATASET
# Keep only the rows that actually carry an OA (resp. OC) value.
data_oa = data.loc[data.OAtot_PMF.notna(), :]
data_oc = data.loc[data.OC_PMF.notna(), :]

# first check which stations overlap
# The OC station list is computed once instead of once per OA station.
oc_stations = data_oc.station.unique()
station_overlap = [s for s in data_oa.station.unique() if s in oc_stations]
print(station_overlap)

In [None]:
# count overlap
def overlap(station_name):
    """Count the distinct timestamps at which a station has both OA and OC data.

    Reads the module-level ``data_oa`` and ``data_oc`` frames.

    Args:
        station_name: value matched against the ``station`` column.

    Returns:
        int: number of unique ``time`` values of the station in ``data_oa``
        that also occur for the same station in ``data_oc`` (0 if the
        station is absent from either frame).
    """
    # Filter and de-duplicate each side once; the original redid the OC
    # filtering for every single OA timestamp.
    oa_times = data_oa.loc[data_oa.station == station_name, "time"].unique()
    oc_times = data_oc.loc[data_oc.station == station_name, "time"].unique()
    return sum(1 for t in oa_times if t in oc_times)

In [None]:
# Report the number of shared OA/OC timestamps for each station of interest.
for station_label in (
    "Melpitz",
    "Montesec",
    "Finokalia",
    "Zurich",
    "Tartu",
    "Hyytiälä",
    "Barcelona",
):
    print(f"Overlap in {station_label}: ", overlap(station_label))

In [None]:
# get overlap
def get_over(station_name):
    """Collect the OA and OC values a station reports at shared timestamps.

    Reads the module-level ``data_oa`` and ``data_oc`` frames.

    Args:
        station_name: value matched against the ``station`` column.

    Returns:
        tuple ``(oa, oc, tim)`` where

        * ``oa`` is a float array built from the ``OAtot_PMF`` rows matching
          each shared timestamp -- one row per timestamp, i.e. shape
          ``(n, 1)`` when every timestamp matches exactly one OA row;
        * ``oc`` is a 1-D float array holding the first ``OC_PMF`` value per
          shared timestamp;
        * ``tim`` is the list of shared timestamps, in the order
          ``unique()`` yields them for the OA rows.
    """
    station_oa = data_oa.loc[data_oa.station == station_name, :]
    station_oc = data_oc.loc[data_oc.station == station_name, :]

    # Filter and de-duplicate each side once; the original redid the OC
    # filtering for every single OA timestamp.
    oa_times = station_oa["time"].unique()
    oc_times = station_oc["time"].unique()
    tim = [t for t in oa_times if t in oc_times]

    # NOTE: OA keeps the whole matching Series per timestamp (giving a 2-D
    # result) while OC takes only the first match. The asymmetry is kept on
    # purpose: the Melpitz cell reshapes OC into a column to line up with it.
    get_data_oa = [station_oa.loc[station_oa.time == t, "OAtot_PMF"] for t in tim]
    get_data_oc = [station_oc.loc[station_oc.time == t, "OC_PMF"].iloc[0] for t in tim]

    return np.array(get_data_oa, dtype="float"), np.array(get_data_oc, dtype="float"), tim

In [None]:
# get overlap for Melpitz
mel_oa, mel_oc, mel_time = get_over("Melpitz")
# Turn OC into a column vector (presumably to match the 2-D OA array that
# get_over builds from per-timestamp Series). -1 lets numpy infer the row
# count instead of hard-coding 227 samples, so the cell survives a data update.
mel_oc = mel_oc.reshape(-1, 1)

In [None]:
# Plot for Melpitz: OA and OC over their shared timestamps
sns.set_style("whitegrid")
plt.figure(figsize = (16,9))
plt.plot(mel_time, mel_oa, label = "OA")
plt.plot(mel_time, mel_oc, label = "OC", color = "red")
plt.title("Melpitz", fontsize = 24)
plt.xticks(fontsize = 18, rotation = 45)
plt.ylabel("OA/OC", fontsize = 20)
plt.yticks(fontsize = 18)
# Single legend call: the earlier bare plt.legend() was replaced by this one
# anyway, so only the styled legend below the axes is created.
plt.legend(loc='upper center', bbox_to_anchor=(0.5, -0.175), fancybox=True, shadow=True, ncol=5, fontsize = 18)

In [None]:
# get Barcelona data
# get_over returns the OA values, the OC values and the shared timestamps.
bar_oa, bar_oc, time_bar = get_over("Barcelona")
# Plot for Barcelona
# OA and OC drawn against the shared timestamps on one figure.
plt.figure(figsize = (16,9))
plt.plot(time_bar, bar_oa, label = "OA")
plt.plot(time_bar, bar_oc, label = "OC", color = "red")
plt.legend()
plt.title("Barcelona")
plt.xticks(rotation = 45)

In [None]:
# Melpitz Ratios
# Element-wise OA/OC ratio over the shared timestamps; the y-axis labels this
# ratio as the conversion factor C.
plt.figure(figsize=(16,9))
plt.plot(mel_time,  mel_oa / mel_oc, color = "black" )
plt.xticks(rotation = 45)
plt.ylabel("C")
plt.title("OA/OC in Melpitz")

In [None]:
# change point detection
#!pip install ruptures
import ruptures as rpt

# Define signal: the OA/OC ratio series for Melpitz
signal = mel_oa/mel_oc

# detection: dynamic-programming search for exactly 2 change points.
# (A PELT fit with model="l2", pen=5 used to run first, but its result was
# overwritten before being used, so that dead computation has been dropped.)
# available cost models: "l1", "l2", "normal", "rbf", "linear",
algo = rpt.Dynp(model="rbf").fit(signal) # fit method
result = algo.predict(2) # predict

# display
rpt.display(signal, result)
#plt.show()
print("Break-point", result)
# The first two entries of `result` are used as indices into mel_time.
print("start", mel_time[result[0]])
print("end", mel_time[result[1]])
# NOTE(review): the subtraction needs mel_time entries that support `-`
# (e.g. datetimes); plain strings from read_csv would raise TypeError -- confirm.
print("length", mel_time[result[1]] - mel_time[result[0]] )


In [None]:
# estimate C with 2 change points
# Recompute the ratio here instead of reading `signal`: a later cell rebinds
# `signal` to a shorter slice, so re-running this cell out of order would
# otherwise silently change c_2.
mel_ratio = mel_oa / mel_oc
# 20 and 45 are hard-coded segment boundaries -- presumably the two change
# points reported by the detection cell; TODO confirm.
c_1 = mel_ratio[20:45].mean()
c_2 = np.concatenate((mel_ratio[:20], mel_ratio[45:])).mean()
print("Winter C", c_1)
print("Other season C", c_2)

In [None]:
# repeat with only one change point
# NOTE(review): this reuses whatever `signal` currently holds; a later cell
# rebinds `signal` to a slice, so the outcome depends on execution order.
# "l1", "l2", "normal", "rbf", "linear",
algo = rpt.Dynp(model="l2").fit(signal) # fit method
result = algo.predict(1) # predict

# display
rpt.display(signal, result)
#plt.show()
print("Break-point", result)
# result[0] is used as the index of the detected break into mel_time.
print("Break-point Day", mel_time[result[0]])

In [None]:
# Global estimate from 1st of March, i.e. only one change point
# Index 55 is hard-coded; presumably it is the break point reported by the
# single-change-point cell -- TODO confirm.
c_global = (mel_oa[55:]/mel_oc[55:]).mean()
print("Global Conversion Factor", c_global)
# Mean ratio of the samples before index 55, printed for comparison.
print("First Part", (mel_oa[:55]/mel_oc[:55]).mean())

# Melpitz Ratios used to estimate global C
# Same ratio plot as before, restricted to the samples from index 55 onwards.
plt.figure(figsize=(16,9))
plt.plot(mel_time[55:],  mel_oa[55:] / mel_oc[55:], color = "black" )
plt.xticks(rotation = 45)
plt.ylabel("C")
plt.title("OA/OC in Melpitz")

In [None]:
# Block Bootstrap
# Rebinds `signal` to the ratio series from index 55 onwards, i.e. the same
# samples that c_global is averaged over.
signal = mel_oa[55:]/mel_oc[55:]

# import test to check for stationarity, if unit root then not stationary
from statsmodels.tsa.stattools import adfuller
# Element [1] of the adfuller result is printed as the p-value.
print("P-value for Ratios",adfuller(signal)[1]) # --> stationary!

In [None]:
# Estimate optimal block length
from arch.bootstrap import optimal_block_length
# Prints arch's block-length estimate for the ratio series held in `signal`.
print(optimal_block_length(signal))

In [None]:
# BLOCK BOOTSTRAP USING PACKAGE
#!pip install arch
import arch
from arch.bootstrap import CircularBlockBootstrap, StationaryBootstrap

# block length [ T^(1/3) ]
l =  5
bs = CircularBlockBootstrap(l, signal) # Have to set a random state
c = []
# Each draw is a (positional_data, keyword_data) pair. The loop variables are
# deliberately NOT named `data`: that would overwrite the notebook's main
# DataFrame, which plot_reconstruction and the interact cell still read.
for pos_data, _kw_data in tqdm(bs.bootstrap(500)):
    c_s = pos_data[0]  # resampled ratio series for this replicate
    c.append(c_s.mean())
# Variance of the 500 bootstrap means.
var_bb = np.array(c).var()
print(var_bb)

In [None]:
# Wald type CI
# Interval = c_global +/- 2 * sqrt(bootstrap variance of the mean).
half_width = 2*np.sqrt(var_bb)
lower_bb, upper_bb = c_global - half_width, c_global + half_width
print("Wald Type Confidence Interval: ", lower_bb, upper_bb)

In [None]:
# Check reconstruction
def plot_reconstruction(station_name):
    """Plot reconstructed OA, measured OC and CAMx OA for one station.

    Reads the module-level ``data`` frame, keeps the station's rows that have
    an ``OC_PMF`` value, and draws three series against ``time``:
    ``OC_PMF`` scaled by the conversion factor, the raw ``OC_PMF``, and
    ``OAtot_CAMX``.

    Args:
        station_name: value matched against ``data.station``; also used as
            the figure title.
    """
    # Factor applied to OC to reconstruct OA. Presumably the rounded global
    # estimate from the Melpitz analysis -- TODO confirm.
    conversion_factor = 1.52

    plt.figure(figsize = (16,9))
    station = data.loc[(data.station == station_name) & data.OC_PMF.notna(), :]

    plt.plot(station.time, station.OC_PMF * conversion_factor, label = "Reconstructed OA")
    plt.plot(station.time, station.OC_PMF, label = "OC", color = "red")
    plt.plot(station.time, station.OAtot_CAMX, label = "CAMx OA", color = "orange")
    plt.title(station_name, fontsize = 24)
    # One xticks call sets both font size and rotation; the trailing duplicate
    # plt.xticks(rotation = 45) was redundant and has been removed.
    plt.xticks(fontsize = 18, rotation = 45)
    plt.ylabel("OA/OC", fontsize = 20)
    plt.yticks(fontsize = 18)
    plt.legend(loc='upper center', bbox_to_anchor=(0.5, -0.175), fancybox=True, shadow=True, ncol=5, fontsize = 24)


In [None]:
# Interactive station picker (ipywidgets): calls plot_reconstruction with the
# station chosen from the unique values of data.station.
interact(plot_reconstruction, station_name = data.station.unique())

In [None]:
# Static (non-interactive) version of the reconstruction plot.
# The original cell was a pasted function body: every line after the first
# was indented (IndentationError) and `station_name` was never defined.
# NOTE(review): Melpitz is chosen here because it is the station the rest of
# the notebook analyses -- confirm which station this figure was meant for.
station_name = "Melpitz"

plt.figure(figsize = (16,9))
station = data.loc[(data.station == station_name) & data.OC_PMF.notna(), :]

plt.plot(station.time, station.OC_PMF * 1.52, label = "Reconstructed OA")
plt.plot(station.time, station.OC_PMF, label = "OC", color = "red")
plt.plot(station.time, station.OAtot_CAMX, label = "CAMx OA", color = "orange")
plt.title(station_name, fontsize = 24)
# One xticks call sets both font size and rotation; the trailing duplicate
# plt.xticks(rotation = 45) was redundant and has been removed.
plt.xticks(fontsize = 18, rotation = 45)
plt.ylabel("OA/OC", fontsize = 20)
plt.yticks(fontsize = 18)
plt.legend(loc='upper center', bbox_to_anchor=(0.5, -0.175), fancybox=True, shadow=True, ncol=5, fontsize = 18)