# Wgrywanie bibliotek

Import necessary libraries

In [1]:
import csv
import glob
import os
import time
import pickle
import math
import numpy as np
import pandas as pd
import zipfile
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from tbats import TBATS
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, MaxAbsScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error
from statsmodels.tsa.statespace.sarimax import SARIMAX

# import tensorflow as tf
# from tensorflow.keras.models import Sequential
# from tensorflow.keras.layers import LSTM, Dense

# Ustawienia

Settings

In [2]:
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', lambda x: '%.2f' % x)

Output folders to save files

In [3]:
# Get the path to the current folder where the notebook is located
current_folder = os.path.dirname(os.path.abspath("__file__"))

images_output_folder = os.path.join(current_folder, "../02_paper/out_figures/")
tables_output_folder = os.path.join(current_folder, "../02_paper/out_tables/")
models_output_folder = os.path.join(current_folder, "../../models/models")

# Wgranie danych
* Get the path to the "data.zip" file
* Unpacking the file
* Uploading the CSV (both: **_data_** and **_data_dict_**)

In [4]:
# Get the path to the "data" folder inside the repository
data_folder = os.path.join(current_folder, "..", "00_data")

# Get the path to the "data.zip" file inside the "data" folder
data_zip_path = os.path.join(data_folder, "data.zip")

# Check if both entsoe_country_file and entsoe_country_dict_file exist in the target location
entsoe_country_file = os.path.join(data_folder, "entsoe_country.csv")
entsoe_country_dict_file = os.path.join(data_folder, "entsoe_country_dict.csv")

if not (os.path.exists(entsoe_country_file) and os.path.exists(entsoe_country_dict_file)):
    # Extract the data.zip file
    with zipfile.ZipFile(data_zip_path, 'r') as zip_ref:
        zip_ref.extractall(data_folder)
    
# Read the CSV files and create DataFrames
data = pd.read_csv(entsoe_country_file, sep=';')
data_dict = pd.read_csv(entsoe_country_dict_file, sep=',')

# Data preparation (part 1)

In [5]:
data = data.drop_duplicates()

# Country name mapping
data['CountryCode'] = data['Variable'].map(lambda x: x.lstrip('BZN_'))
data = pd.merge(data, data_dict, on="CountryCode")
data = data.drop(['Variable', 'CountryCode'], axis=1)

data['Timestamp'] = pd.to_datetime(data['Timestamp'])

## Analysis of missing data

The assumption has been made that the present values of $0.00$ in the data also qualify as missing values. Originally, the function also checked information about TotalLoad_Forecast_MW. Currently, these code sections are commented out.

In [6]:
def find_missing_value_date_ranges(dataframe):
    dataframe['Timestamp'] = pd.to_datetime(dataframe['Timestamp'])

    # Creation the resulting DataFrame.
    result = []

    # Iteration over unique countries.
    for country in dataframe['Country'].unique():
        country_data = dataframe[dataframe["Country"] == country][["Timestamp", "TotalLoad_Actual_MW"]]  # [["Timestamp", "TotalLoad_Forecast_MW", "TotalLoad_Actual_MW"]]
        country_data['Timestamp'] = pd.to_datetime(country_data['Timestamp'])

        # Aggregation data to full days and sorting.
        agg_df = country_data.resample('D', on='Timestamp').sum().reset_index()

        for metric in ["TotalLoad_Actual_MW"]:  #  ["TotalLoad_Forecast_MW", "TotalLoad_Actual_MW"]
            start_date = None
            end_date = None

            for index, row in agg_df.iterrows():
                if row[metric] == 0.00:
                    if start_date is None:
                        start_date = row['Timestamp']
                elif start_date is not None:
                    end_date = row['Timestamp']
                    number_of_days = (end_date - start_date).days + 1
                    result.append({'country': country, 'metric': metric, 'start_date': start_date, 'end_date': end_date, 'number_of_days': number_of_days})
                    start_date = None
                    end_date = None
            
            # Handling missing end_date - if we have reached the last record
            if end_date is None and start_date is not None:
                end_date = agg_df['Timestamp'].iloc[-1]
                number_of_days = (end_date - start_date).days + 1
                result.append({'country': country, 'metric': metric, 'start_date': start_date, 'end_date': end_date, 'number_of_days': number_of_days})

    missing_value_date_ranges = pd.DataFrame(result)
    missing_value_date_ranges = missing_value_date_ranges.sort_values(by=['country', 'metric']).reset_index(drop=True)

    missing_value_date_ranges.drop(columns=['metric'], inplace=True)  # This line should be removed if we want to obtain data regarding TotalLoad_Forecast_MW
    
    return missing_value_date_ranges

missing_value_date_ranges = find_missing_value_date_ranges(data)
display(missing_value_date_ranges)

missing_days_per_country = missing_value_date_ranges.groupby(["country"])["number_of_days"].sum().sort_values(ascending=False).reset_index()  # groupby(["country", "metric"])["number_of_days"].sum().sort_values(ascending=False).reset_index()
display(missing_days_per_country)

Unnamed: 0,country,start_date,end_date,number_of_days
0,Austria,2017-12-30,2017-12-31,2
1,Croatia,2018-01-05,2018-01-06,2
2,Cyprus,2015-01-01,2016-09-21,630
3,Cyprus,2018-06-03,2018-06-07,5
4,Cyprus,2019-03-03,2019-03-04,2
5,Cyprus,2019-06-08,2019-07-23,46
6,Cyprus,2020-03-13,2020-04-05,24
7,Cyprus,2020-04-07,2020-04-09,3
8,Cyprus,2020-07-23,2020-07-27,5
9,Cyprus,2020-07-28,2020-08-04,8


Unnamed: 0,country,number_of_days
0,Cyprus,729
1,Lithuania,10
2,Romania,10
3,Luxembourg,9
4,Czech Republic,4
5,Denmark,4
6,France,4
7,Ireland,4
8,Sweden,4
9,Austria,2


In [7]:
missing_value_date_ranges['start_date'] = missing_value_date_ranges['start_date'].dt.date
missing_value_date_ranges['end_date'] = missing_value_date_ranges['end_date'].dt.date

missing_value_date_ranges.style.hide(axis = 0).to_latex(os.path.join(tables_output_folder, "tab_00.tex"))

In [8]:
missing_days_per_country.style.hide(axis = 0).to_latex(os.path.join(tables_output_folder, "tab_01.tex"))

## Data analysis
* For each country, measurements start on **2015-01-01** and end on **2021-08-30**.
* **Frequency of measurements**:
  * In most of the surveyed countries, measurements were taken hourly, 
  * in five countries (Austria, Hungary, Germany, Luxembourg, Romania) every 15 minutes, 
  * in two (Ireland, Cyprus) every half an hour.
* **Information about specific countries**:
  * Cyprus:
    * Lack of data:
        * **TotalLoad_Actual_MW**:
            * from 2015-01-01 00:00:00 to 2016-09-21 00:00:00
    * Generally, the data for Cyprus is:
        * from 2016-09-21 00:00:00 to 2018-01-16 22:00:00 (),
        * from 2015-07-24 21:30:00 to 2017-10-09 21:00:00
          and from 2017-10-10 21:30:00 to 2018-01-16 22:00:00 (**TotalLoad_Forecast_MW**),
        * 0.00 from 2018-01-10 11:00:00 to 2018-01-16 22:00:00 (**TotalLoad_Forecast_MW**).
  * Romania - there are fewer measurements, **need to check why!**

# Data preparation (part 2 - weekly aggregation)

In [9]:
data_prepared = data.set_index('Timestamp')

def remove_first_last(group):
    return group.iloc[1:-1]

# Domyślnie w pandas tydzień jest definiowany jako tydzień kalendarzowy, który rozpoczyna się od poniedziałku i kończy się w niedzielę.
# Jeśli tydzień jest częściowo w jednym roku i częściowo w drugim, domyślnie będzie on przypisywany do roku, w którym większość dni tygodnia 
# przypada na ten rok.
weekly_data = data_prepared.groupby('Country')[['TotalLoad_Actual_MW']].resample('W').sum().reset_index()
weekly_data = weekly_data.rename(columns={'Timestamp': 'Date'})
weekly_data = weekly_data.groupby('Country').apply(remove_first_last).reset_index(drop=True)

In [13]:
weekly_data.head(5)

Unnamed: 0,Country,Date,TotalLoad_Actual_MW
0,Austria,2015-01-11,4345870.0
1,Austria,2015-01-18,5067420.0
2,Austria,2015-01-25,4607080.0
3,Austria,2015-02-01,4781652.0
4,Austria,2015-02-08,5298053.0


### Plots
#### _TotalLoad_Actual_MW per country **before imputation**._

In [14]:
country_list = weekly_data['Country'].unique()

for country in country_list:
    electricity_consumption_per_country = weekly_data[weekly_data['Country'] == country].copy()
    electricity_consumption_per_country['Date'] = pd.to_datetime(electricity_consumption_per_country['Date'])
    
    plt.style.use('seaborn-v0_8')
    plt.rcParams['font.family'] = 'Times New Roman'
    
    plt.figure(figsize=(20, 5))
    plt.title(f"Actual Electricity Consumption in Terawatts for Country: {country}", fontsize=16)
    
    formatter = ticker.ScalarFormatter(useMathText=True)
    formatter.set_scientific(False)
    formatter.set_powerlimits((-6, 6))
    plt.gca().yaxis.set_major_formatter(formatter)
    
    plt.plot(electricity_consumption_per_country["Date"], electricity_consumption_per_country["TotalLoad_Actual_MW"], color='steelblue')
    
    plt.xlim(electricity_consumption_per_country["Date"].iloc[0], electricity_consumption_per_country["Date"].iloc[-1])
    
    ticks = plt.gca().get_yticks()
    tick_labels = [f'{int(tick) / 1000000:.1f}' for tick in ticks]
    plt.gca().yaxis.set_major_locator(ticker.FixedLocator(ticks))
    plt.gca().set_yticklabels(tick_labels)

    output_file_path = os.path.join(images_output_folder, f"actual_electricity_consumption_{country}.jpeg")
    if not os.path.exists(output_file_path):
        plt.savefig(output_file_path)
    
    plt.close()

## Imputation

Imputation was performed using a weighted average with a 7-day window.

Currently, due to a significant amount of missing data for Cyprus, I am refraining from performing imputation for this country. This method is not efficient for Cyprus.

Kolejność czynności przy imputacji danych:
- zamiana wartości 0 na NA w oryginalnych danych
- imputacja danych na poziomie oryginalnych danych
     * imputację przeprowadzamy jako średnia z ostatnich 30 obserwacji tego samego typu (ta sama godzina i dzień tego samego typu)
     * imputację wykonujemy sekwencyjnie zgodnie z kierunkiem czasu
- agregacja danych do tygodni

In [15]:
imputed_data_list = []

for country in data['Country'].unique():
    country_data = data[data["Country"] == country][["Timestamp", "TotalLoad_Actual_MW"]]
    country_data['Timestamp'] = pd.to_datetime(country_data['Timestamp'])
    
    # Aggregation data to full days and sorting.
    agg_df = country_data.resample('D', on='Timestamp').sum().reset_index()
    
    # Replacing values of 0.00 with null values
    agg_df['TotalLoad_Actual_MW'].replace(0.00, pd.NA, inplace=True)
    agg_df['TotalLoad_Actual_MW'] = pd.to_numeric(agg_df['TotalLoad_Actual_MW'], errors='coerce')
    
    window_size = 30

    # Imputing missing values using a moving average
    agg_df['TotalLoad_Imputed_MW'] = agg_df['TotalLoad_Actual_MW'].fillna(agg_df['TotalLoad_Actual_MW'].rolling(window=window_size, min_periods=1, center=True).mean())

    agg_df['Country'] = country
    imputed_data_list.append(agg_df)

daily_imputed_data = pd.concat(imputed_data_list, ignore_index=True)

In [16]:
# Setting the index to 'Timestamp'.
daily_imputed_data.set_index('Timestamp', inplace=True)

# Resampling and grouping by weeks
weekly_imputed_data = daily_imputed_data.groupby('Country')[['TotalLoad_Actual_MW', 'TotalLoad_Imputed_MW']].resample('W').sum().reset_index()  # weekly_data = data_prep.groupby('Country')[['TotalLoad_Forecast_MW', 'TotalLoad_Actual_MW']].resample('W').sum().reset_index()  # version with TotalLoad_Forecast_MW
weekly_imputed_data = weekly_imputed_data.rename(columns={'Timestamp': 'Date'})
weekly_imputed_data = weekly_imputed_data.groupby('Country').apply(remove_first_last).reset_index(drop=True)
weekly_imputed_data = weekly_imputed_data.reset_index()

### Plots 

_TotalLoad_Actual_MW per country **after imputation**._

In [17]:
country_list = weekly_imputed_data['Country'].unique()
for country in country_list:
    electricity_consumption_per_country = weekly_imputed_data[weekly_imputed_data['Country'] == country]
    electricity_consumption_per_country = electricity_consumption_per_country.reset_index(drop=True)
    electricity_consumption_per_country.drop(columns="index", inplace=True)

    electricity_consumption_per_country = weekly_imputed_data[weekly_imputed_data['Country'] == country].copy()
    electricity_consumption_per_country['Date'] = pd.to_datetime(electricity_consumption_per_country['Date'])
    
    plt.style.use('seaborn-v0_8')
    plt.rcParams['font.family'] = 'Times New Roman'
    
    plt.figure(figsize=(20, 5))
    plt.title(f"Imputed Electricity Consumption in Terawatts for Country: {country}", fontsize=16)
    
    formatter = ticker.ScalarFormatter(useMathText=True)
    formatter.set_scientific(False)
    formatter.set_powerlimits((-6, 6))
    plt.gca().yaxis.set_major_formatter(formatter)
    
    plt.plot(electricity_consumption_per_country["Date"], electricity_consumption_per_country["TotalLoad_Actual_MW"], color='firebrick', label='Actual')
    plt.plot(electricity_consumption_per_country["Date"], electricity_consumption_per_country["TotalLoad_Imputed_MW"], color='steelblue', label='Imputed')
    
    plt.xlim(electricity_consumption_per_country["Date"].iloc[0], electricity_consumption_per_country["Date"].iloc[-1])
    plt.legend()
    
    ticks = plt.gca().get_yticks()
    tick_labels = [f'{int(tick) / 1000000:.1f}' for tick in ticks]
    plt.gca().yaxis.set_major_locator(ticker.FixedLocator(ticks))
    plt.gca().set_yticklabels(tick_labels)

    output_file_path = os.path.join(images_output_folder, f"imputed_electricity_consumption_{country}.jpeg")
    if not os.path.exists(output_file_path):
        plt.savefig(output_file_path)
    
    plt.close()

### Final data preparation for models

In [18]:
data_for_the_model = weekly_imputed_data[['Country', 'Date', 'TotalLoad_Imputed_MW']]
data_for_the_model.loc[:, 'Date'] = pd.to_datetime(data_for_the_model['Date'])

# PREDICTIONS

## METRICS

In [None]:
def MAPE(y, y_pred):
    mape = np.mean(np.abs((y - y_pred)/y))*100
    return round(mape, 2)

def ME(y, y_pred):
    me = np.mean(y_pred - y)
    return round(me, 2)

def RMSE(MSE):
    rmse = math.sqrt(MSE)
    return round(rmse, 2)

## MODELS

### SARIMA

#### Searching for optimal parameters 

The **sarima_split_best_params_search_fit_predict_plot()** function was employed to discover the optimal parameters for SARIMA models for each country, as well as to evaluate the model performance using the selected parameters.

Dlaczego nie wykorzystać istniejących bibliotek `pmdarima`, np. auto.arima()?

In [None]:
# def sarima_split_best_params_search_fit_predict_plot(country_name, data):
    
#     dataset = data.values
#     train_data = data[data.index <= '2019-12-31']
#     test_data = data[data.index >= '2020-01-01']
    
#     p = range(0, 2)
#     d = range(0, 2)
#     q = range(0, 2)
#     P = range(0, 2)
#     D = range(1, 2)
#     Q = range(0, 2)
#     s = 12

#     best_aic = float("inf")
#     best_params = None

#     for p_val in p:
#         for d_val in d:
#             for q_val in q:
#                 for P_val in P:
#                     for D_val in D:
#                         for Q_val in Q:
#                             try:
#                                 model = SARIMAX(train_data, order=(p_val, d_val, q_val), seasonal_order=(P_val, D_val, Q_val, s))
#                                 fit_model = model.fit()
#                                 aic = fit_model.aic
#                                 if aic < best_aic:
#                                     best_aic = aic
#                                     best_params = (p_val, d_val, q_val, P_val, D_val, Q_val)
#                             except:
#                                 continue

#     print(f"Best SARIMA parameters for {country_name}:", best_params)

#     results = {
#         'country': country_name,
#         'best_params': best_params,
#     }
    
#     try:
#         p_val, d_val, q_val, P_val, D_val, Q_val = best_params
#         s = 52 
#         model = SARIMAX(train_data, order=(p_val, d_val, q_val), seasonal_order=(P_val, D_val, Q_val, s))
#         fit_model = model.fit()
#         yhat = fit_model.predict(start=len(train_data), end=(len(dataset)-1))
    
#         pd.DataFrame(yhat).plot()
#         data.plot(figsize=(20, 5))
#         plt.legend()
#     except TypeError:
#         pass
    
#     return results

# results_list = []

# for country in data_for_the_model['Country'].unique():
#     try: 
#         print(f'Evaluation for country: {country}')
#         country_data = data_for_the_model[data_for_the_model['Country'] == country].set_index('Date').asfreq('W')
#         results = sarima_split_best_params_search_fit_predict_plot(country, country_data["TotalLoad_Imputed_MW"])
#         results_list.append(results)
#     except Exception as e:
#         print(f"An error occurred for country {country}: {e}")

# sarima_best_params = {results['country']: results['best_params'] for results in results_list}

# sarima_best_params_df = pd.DataFrame(results_list)
# display(sarima_best_params_df)

#### Model Fitting, Prediction, Plotting, and Evaluation

In [None]:
def sarima_fit_predict_plot_evaluate(country_name, country_data, best_params):
    dataset = country_data.values
    train_data = country_data[country_data.index <= '2019-12-31']
    test_data = country_data[country_data.index >= '2020-01-01']
    
    s = 52

    p_val, d_val, q_val, P_val, D_val, Q_val = best_params

    model = SARIMAX(train_data, order=(p_val, d_val, q_val), seasonal_order=(P_val, D_val, Q_val, s))
    fit_model = model.fit()
    yhat = fit_model.predict(start=len(train_data), end=(len(dataset)-1))

    model_filename = f"sarima_model_{country_name}.pkl"
    model_file_path = os.path.join(models_output_folder, model_filename)

    if not os.path.exists(model_file_path):
        with open(model_file_path, 'wb') as model_file:
            pickle.dump(fit_model, model_file)

    plt.style.use('seaborn-v0_8')
    plt.rcParams['font.family'] = 'Times New Roman'
    
    plt.figure(figsize=(20, 5))
    plt.title(f"SARIMA Prediction for Electricity Consumption in Terawatts for Country: {country_name}", fontsize=16)

    yhat_df = pd.DataFrame(yhat)
    country_data_subset = country_data.iloc[1:-1]  
    yhat_df_subset = yhat_df.iloc[1:-1]
    
    formatter = ticker.ScalarFormatter(useMathText=True)
    formatter.set_scientific(False)  
    formatter.set_powerlimits((-6, 6))  
    plt.gca().yaxis.set_major_formatter(formatter)
    
    plt.plot(yhat_df_subset, color="firebrick", label='Predicted')
    plt.plot(country_data_subset, color='steelblue', label='Actual')
    
    plt.xlim(country_data_subset.index[0], country_data_subset.index[-1])
    plt.legend()
    
    # Changing the Y-axis value labels from 5000000 to 5.0
    plt.gca().set_yticklabels([f'{int(tick) / 1000000:.1f}' for tick in plt.gca().get_yticks()])

    output_file_path = os.path.join(images_output_folder, f"sarima_predictions_{country}.jpeg")
    if not os.path.exists(output_file_path):
        plt.savefig(output_file_path)
    
    plt.close()
    
    MAPE_metric = MAPE(test_data, yhat)
    ME_metric = ME(test_data, yhat)
    MAE_metric = round(mean_absolute_error(test_data, yhat), 2)
    MSE_metric = round(mean_squared_error(test_data, yhat), 2)
    RMSE_metric = RMSE(MSE_metric)

    results = {
        'model': 'sarima',
        'country': country_name,
        'mape': MAPE_metric,
        'me': ME_metric,
        'mae': MAE_metric,
        'mse': MSE_metric,
        'rmse': RMSE_metric
    }
    
    return results

# Dictionary of best SARIMA parameters obtained using sarima_split_best_params_search_fit_predict_plot() function.
sarima_best_params = {
    "Austria": (0, 1, 0, 1, 1, 1),
    "Croatia": (0, 1, 0, 1, 1, 1),
    "Cyprus": (1, 1, 1, 0, 1, 1),
    "Czech Republic": (0, 1, 0, 1, 1, 1),
    "Denmark": (0, 1, 0, 1, 1, 1),
    "Estonia": (0, 1, 0, 1, 1, 1),
    "Finland": (0, 1, 0, 1, 1, 1),
    "France": (0, 1, 0, 1, 1, 1),
    "Germany": (0, 1, 0, 1, 1, 1),
    "Greece": (0, 1, 0, 1, 1, 1),
    "Hungary": (0, 1, 0, 1, 1, 1),
    "Ireland": (0, 1, 0, 1, 1, 1),
    "Italy": (0, 1, 0, 1, 1, 1),
    "Latvia": (0, 1, 0, 1, 1, 1),
    "Lithuania": (0, 1, 1, 1, 1, 1),
    "Luxembourg": (0, 1, 0, 1, 1, 1),
    "Poland": (0, 1, 0, 1, 1, 1),
    "Portugal": (0, 1, 0, 1, 1, 1),
    "Romania": (0, 1, 0, 1, 1, 1),
    "Slovakia": (0, 1, 0, 1, 1, 1),
    "Slovenia": (0, 1, 0, 1, 1, 1),
    "Spain": (0, 1, 0, 1, 1, 1),
    "Sweden": (0, 1, 0, 1, 1, 1)
}


results_list = []

for country in data_for_the_model['Country'].unique():
    try: 
        print(f'Evaluation for country: {country}')
        country_data = data_for_the_model[data_for_the_model['Country'] == country].set_index('Date').asfreq('W')
        best_params = sarima_best_params.get(country)
        if best_params is None:
            print(f"No parameters found for country: {country}")
            continue
        results = sarima_fit_predict_plot_evaluate(country, country_data["TotalLoad_Imputed_MW"], best_params)
        results_list.append(results)
    except Exception as e:
        print(f"An error occurred for country {country}: {e}")

sarima_results = pd.DataFrame(results_list)

### TBATS

#### Searching for optimal parameters 

The **tbats_split_best_params_search_fit_predict()** function was employed to discover the optimal parameters for TBATS models for each country, as well as to evaluate the model performance using the selected parameters.

In [None]:
# tbats_best_params = {}

# def tbats_split_best_params_search_fit_predict(country_name, data):
    
#     dataset = data.values
#     train_data = data[data.index <= '2019-12-31']
#     test_data = data[data.index >= '2020-01-01']
    
#     seasonal_periods = 52
#     use_arma_errors = True
#     use_box_cox_options = [True, False]
#     use_trend_options = [True, False]
#     n_jobs_option = os.cpu_count()
#     use_damped_trend = True
    
#     best_aic = float("inf")
#     best_params = None
#     best_result_dict = None

#     for use_box_cox in use_box_cox_options:
#         for use_trend in use_trend_options:
#             try:
#                 model = TBATS(seasonal_periods=[seasonal_periods],
#                               use_arma_errors=use_arma_errors,
#                               use_box_cox=use_box_cox,
#                               use_trend=use_trend,
#                               n_jobs=n_jobs_option,
#                               use_damped_trend=use_damped_trend
#                              )
#                 fit_model = model.fit(train_data)
#                 aic = fit_model.aic
#                 if aic < best_aic:
#                     best_aic = aic
#                     best_params = (
#                         seasonal_periods, 
#                         use_arma_errors, 
#                         use_box_cox,
#                         use_trend, 
#                         n_jobs_option,
#                         use_damped_trend
#                         )
#                     best_result_dict = {
#                     "country": country_name,
#                     "seasonal_period": seasonal_periods,
#                     "use_arma_errors": use_arma_errors,
#                     "use_box_cox": use_box_cox,
#                     "use_trend": use_trend,
#                     "use_damped_trend": use_damped_trend
#                 }
#             except:
#                 continue

#     tbats_best_params[country_name] = best_result_dict

#     print(f'Best params for country {country_name}: {best_params}')
    
#     results = {
#         'country': country_name,
#         'best_params': best_params,
#     }
    
#     return(results)

# results_list = []

# for country in data_for_the_model['Country'].unique():
#     try: 
#         print(f'Evaluation for country: {country}')
#         country_data = data_for_the_model[data_for_the_model['Country'] == country].set_index('Date').asfreq('W')
#         results = tbats_split_best_params_search_fit_predict(country, country_data["TotalLoad_Imputed_MW"])
#         results_list.append(results)
#     except Exception as e:
#         print(f"An error occurred for country {country}: {e}")

# tbats_best_params = {results['country']: results['best_params'] for results in results_list}

# tbats_best_params_df = pd.DataFrame(results_list)
# display(tbats_best_params_df)

#### Model Fitting, Prediction, Plotting, and Evaluation

In [None]:
def tbats_fit_predict_plot_evaluate(country_name, country_data, best_params):

    dataset = country_data.values
    train_data = country_data[country_data.index <= '2019-12-31']
    test_data = country_data[country_data.index >= '2020-01-01']

    (
    seasonal_period, 
    use_arma_errors, 
    use_box_cox,
    use_trend, 
    n_jobs_option,
    use_damped_trend
    ) = best_params
    
    model = TBATS(seasonal_periods=[seasonal_period],
                                    use_arma_errors=use_arma_errors,
                                    use_box_cox=use_box_cox,
                                    use_trend=use_trend,
                                    n_jobs=n_jobs_option,
                                    use_damped_trend=use_damped_trend
                                    )
    fit_model = model.fit(train_data)
    yhat = fit_model.forecast(steps=len(test_data))

    model_filename = f"tbats_model_{country_name}.pkl"
    model_file_path = os.path.join(models_output_folder, model_filename)

    if not os.path.exists(model_file_path):
        with open(model_file_path, 'wb') as model_file:
            pickle.dump(fit_model, model_file)

    start_date = '2020-01-07'
    end_date = '2021-09-05'
    date_range = pd.date_range(start=start_date, end=end_date, freq='7D')

    yhat_df = pd.DataFrame(yhat, index=date_range)
    yhat_df_subset = yhat_df.iloc[1:-1]

    country_data_subset = country_data.iloc[1:-1] 

    plt.style.use('seaborn-v0_8')
    plt.rcParams['font.family'] = 'Times New Roman'
    
    plt.figure(figsize=(20, 5))
    plt.title(f"TBATS Prediction for Electricity Consumption in Terawatts for Country: {country_name}", fontsize=16)
    
    formatter = ticker.ScalarFormatter(useMathText=True)
    formatter.set_scientific(False)
    formatter.set_powerlimits((-6, 6))
    plt.gca().yaxis.set_major_formatter(formatter)
    
    plt.plot(yhat_df_subset, color="firebrick", label='Predicted')
    plt.plot(country_data_subset, color='steelblue', label='Actual')
    
    plt.xlim(country_data_subset.index[0], country_data_subset.index[-1])
    plt.legend()
    
    plt.gca().set_yticklabels([f'{int(tick) / 1000000:.1f}' for tick in plt.gca().get_yticks()])

    output_file_path = os.path.join(images_output_folder, f"tbats_predictions_{country}.jpeg")
    if not os.path.exists(output_file_path):
        plt.savefig(output_file_path)
    
    plt.close()

    MAPE_metric = MAPE(test_data, yhat)
    ME_metric = ME(test_data, yhat)
    MAE_metric = round(mean_absolute_error(test_data, yhat), 2)
    MSE_metric = round(mean_squared_error(test_data, yhat), 2)
    RMSE_metric = RMSE(MSE_metric)

    results = {
        'model': 'tbats',
        'country': country_name,
        'mape': MAPE_metric,
        'me': ME_metric,
        'mae': MAE_metric,
        'mse': MSE_metric,
        'rmse': RMSE_metric
    }

    return results

# Dictionary of best TBATS parameters obtained using tbats_split_best_params_search_fit_predict() function
tbats_best_params = {
    "Austria": (52, True, False, True, 4, True),
    "Croatia": (52, True, True, False, 4, True),
    "Cyprus": (52, True, True, False, 4, True),
    "Czech Republic": (52, True, True, True, 4, True),
    "Denmark": (52, True, True, False, 4, True),
    "Estonia": (52, True, True, False, 4, True),
    "Finland": (52, True, True, False, 4, True),
    "France": (52, True, True, True, 4, True),
    "Germany": (52, True, False, False, 4, True),
    "Greece": (52, True, True, False, 4, True),
    "Hungary": (52, True, False, False, 4, True),
    "Ireland": (52, True, False, False, 4, True),
    "Italy": (52, True, True, False, 4, True),
    "Latvia": (52, True, True, False, 4, True),
    "Lithuania": (52, True, True, True, 4, True),
    "Luxembourg": (52, True, False, False, 4, True),
    "Poland": (52, True, False, False, 4, True),
    "Portugal": (52, True, False, False, 4, True),
    "Romania": (52, True, True, True, 4, True),
    "Slovakia": (52, True, True, True, 4, True),
    "Slovenia": (52, True, True, False, 4, True),
    "Spain": (52, True, True, True, 4, True),
    "Sweden": (52, True, True, False, 4, True)
}

results_list = []

for country in data_for_the_model['Country'].unique():
    try: 
        print(f'Evaluation for country: {country}')
        country_data = data_for_the_model[data_for_the_model['Country'] == country].set_index('Date').asfreq('W')
        best_params = tbats_best_params.get(country)
        if best_params is None:
            print(f"No parameters found for country: {country}")
            continue
        results = tbats_fit_predict_plot_evaluate(country, country_data["TotalLoad_Imputed_MW"], best_params)
        results_list.append(results)
    except Exception as e:
        print(f"An error occurred for country {country}: {e}")

tbats_results = pd.DataFrame(results_list)

### LSTM

#### Model Fitting, Prediction, Plotting, and Evaluation

In [None]:
def lstm_fit_predict_plot_evaluate(country_name, country_data):
    
    dataset = country_data.values
    train_data = country_data[country_data.index <= '2019-12-31']
    test_data = country_data[country_data.index >= '2020-01-01']
    
    look_back = 10

    scaler = MinMaxScaler(feature_range=(-1, 1))
    train_data = scaler.fit_transform(train_data.values.reshape(-1, 1))
    test_data_rescaled = scaler.transform(test_data.values.reshape(-1, 1))
    
    def prepare_data(data, look_back=10):
        X, y = [], []
        for i in range(len(data) - look_back):
            X.append(data[i:i + look_back])
            y.append(data[i + look_back])
        return np.array(X), np.array(y)

    X_train, y_train = prepare_data(train_data, look_back)
    X_test, y_test = prepare_data(test_data_rescaled, look_back)

    model = Sequential()
    model.add(LSTM(64, input_shape=(look_back, 1), return_sequences=True))  # Dodana warstwa LSTM
    model.add(LSTM(64))
    model.add(Dense(1))
    model.compile(loss='mean_squared_error', metrics=["mape"], optimizer='adam')
    
    fit_model = model.fit(X_train, y_train, epochs=500, batch_size=1, verbose=2)

    yhat = model.predict(X_test)
    predictions_rescaled = scaler.inverse_transform(yhat)
    yhat_df = pd.DataFrame(predictions_rescaled, index=test_data.index[look_back:])

    model_filename = f"lstm_model_{country_name}.pkl"
    model_file_path = os.path.join(models_output_folder, model_filename)

    if not os.path.exists(model_file_path):
        with open(model_file_path, 'wb') as model_file:
            pickle.dump(fit_model, model_file)

    country_data_subset = country_data.iloc[1:-1] 

    plt.style.use('seaborn-v0_8')
    plt.rcParams['font.family'] = 'Times New Roman'
    
    plt.figure(figsize=(20, 5))
    plt.title(f"LSTM Prediction for Electricity Consumption in Terawatts for Country: {country_name}", fontsize=16)
    
    country_data_subset = country_data.iloc[1:-1] 
    yhat_df_subset = yhat_df.iloc[1:-1]
    
    formatter = ticker.ScalarFormatter(useMathText=True)
    formatter.set_scientific(False)
    formatter.set_powerlimits((-6, 6))
    plt.gca().yaxis.set_major_formatter(formatter)
    
    plt.plot(yhat_df_subset, color="firebrick", label='Predicted')
    plt.plot(country_data_subset, color='steelblue', label='Actual')
    
    plt.xlim(country_data_subset.index[0], country_data_subset.index[-1])
    plt.legend()
    
    plt.gca().set_yticklabels([f'{int(tick) / 1000000:.1f}' for tick in plt.gca().get_yticks()])

    output_file_path = os.path.join(images_output_folder, f"lstm_predictions_{country}.jpeg")
    if not os.path.exists(output_file_path):
        plt.savefig(output_file_path)
    
    plt.close()

    MAPE_metric = MAPE(test_data.iloc[look_back:], yhat_df)
    ME_metric = ME(test_data.iloc[look_back:], yhat_df)
    MAE_metric = round(mean_absolute_error(test_data.iloc[look_back:], yhat_df), 2)
    MSE_metric = round(mean_squared_error(test_data.iloc[look_back:], yhat_df), 2)
    RMSE_metric = RMSE(MSE_metric)

    results = {
        'model': 'lstm',
        'country': country_name,
        'mape': MAPE_metric,
        'me': ME_metric,
        'mae': MAE_metric,
        'mse': MSE_metric,
        'rmse': RMSE_metric
    }
    
    return fit_model, results

results_list = []

for country in data_for_the_model['Country'].unique():
    try: 
        print(f'Evaluation for country: {country}')
        country_data = data_for_the_model[data_for_the_model['Country'] == country].set_index('Date').asfreq('W')
        country_history, country_results = lstm_fit_predict_plot_evaluate(country, country_data["TotalLoad_Imputed_MW"])
        locals()[f'history_{country}'] = country_history
        results_list.append(country_results)
    except Exception as e:
        print(f"An error occurred for country {country}: {e}")

lstm_results = pd.DataFrame(results_list)

In [None]:
evaluation_results = pd.concat([sarima_results, tbats_results, lstm_results], ignore_index=True).sort_values(by=['country', 'model'], ascending=[True, True])
display(results)

evaluation_results.style.hide(axis = 0).to_latex(os.path.join(tables_output_folder, "tab_02.tex"))

In [None]:
# mape=history.history['mape']
# loss=history.history['loss']