# Compare load data from OPSD
See https://github.com/calliope-project/euro-calliope/issues/41 for more info.

In [8]:
# Load data sources in descending order of preference: the first entry is the
# preferred source, later entries are only used to fill the gaps it leaves.
SOURCE_PRIORITY = [
    "actual_entsoe_power_statistics",
    "actual_entsoe_transparency",
    "actual_tso",
    "actual_net_consumption_tso",
]

In [1]:
import os
# Change the working directory to the sibling `euro-calliope` directory; the
# CSV read later in this notebook uses a path relative to it.
os.chdir('../euro-calliope')

In [2]:
import pandas as pd

In [4]:
# Read only the first 20,950,674 data rows of the raw load CSV and parse the
# column at position 3 as datetimes.
# NOTE(review): position 3 is presumably `utc_timestamp` (the functions below
# index on that column) -- confirm against the CSV header.
opsd_data = pd.read_csv('data/automatic/raw-load-data.csv', nrows=20950674, parse_dates=[3])

In [6]:
# Keep only the rows whose `variable` column equals "load".
load_data = opsd_data.loc[opsd_data["variable"] == "load"]

In [7]:
# old method
def remove_entsoe_power_statistic_data_where_possible(load):
    """Keep a single row per region and timestamp, preferring later-sorting sources.

    Rows are ordered by descending `attribute` and only the first row of each
    (`region`, `utc_timestamp`) pair is kept, so `actual_entsoe_transparency`
    wins over `actual_entsoe_power_statistics` whenever both are present.
    """
    by_attribute_desc = load.sort_values(by="attribute", ascending=False)
    return by_attribute_desc.drop_duplicates(
        subset=["region", "utc_timestamp"],
        keep="first",
    )

# new method
def select_statistics_by_source_priority(load, source_priority=None):
    """
    Pick, for every region and timestamp, the value of the highest-priority source.

    `entsoe_power_statistics` (PS) is the default main source since OPSD states:
        The two sources differ. Values on PS (~500 TWh annually in Germany) are
        usually slightly higher than on the TP (~490 TWh). The reason probably
        lies with different reporting deadlines: Values on the TP have to be
        reported "no later than one hour after the end of the operating period".
        For the PS, the data is published with a delay of up to 3 months,
        which might allow for more accurate metering.
        For a comparison of the two sources see Hirth, et al. (2018).
    See https://nbviewer.jupyter.org/github/Open-Power-System-Data/datapackage_timeseries/blob/2020-10-06/main.ipynb for more info.

    Parameters:
        load: long-format DataFrame with the columns `region`, `utc_timestamp`,
            `attribute` (the source name) and `data` (the value).
        source_priority: source names in descending order of preference.
            Defaults to the module-level `SOURCE_PRIORITY`.

    Returns:
        Series indexed by (`region`, `utc_timestamp`) and named after the first
        source in the priority list. A value is NaN only where none of the
        listed sources provides data.

    Raises:
        ValueError: if `source_priority` is empty.
    """
    priority = list(SOURCE_PRIORITY if source_priority is None else source_priority)
    if not priority:
        raise ValueError("source_priority must contain at least one source")

    load_by_attribute = (
        load
        .set_index(["region", "utc_timestamp", "attribute"])
        ["data"]
        .unstack("attribute")
        # Sources that never occur in `load` (e.g. after a partial read) become
        # all-NaN columns here instead of raising KeyError further down.
        .reindex(columns=priority)
    )
    load_top_priority = load_by_attribute[priority[0]]
    # Fill the remaining gaps from each lower-priority source in turn.
    for source in priority[1:]:
        load_top_priority = load_top_priority.fillna(load_by_attribute[source])

    return load_top_priority

In [9]:
# Apply both selection strategies to the same load rows so they can be compared.
# The old method returns de-duplicated rows of `load_data`; the new method
# returns a Series of `data` values indexed by (region, utc_timestamp).
old_data = remove_entsoe_power_statistic_data_where_possible(load_data)
new_data = select_statistics_by_source_priority(load_data)