# Import of the required modules

In [2]:
# Data preprocessing and visualization
from datetime import datetime
from dateutil.relativedelta import relativedelta
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go

# Deep learning modules
from keras.models import Sequential
from keras import layers
import tensorflow as tf
from keras import backend as K

# from metricas import calculo_metricas
import gc

# Report the installed versions of the main packages used in this notebook
print('Version pandas:', pd.__version__)
print('Version numpy:', np.__version__)
print('Version seaborn', sns.__version__)
print('Version tensorflow: ', tf.version.VERSION)

# Functions

In [3]:
def mae(predictions, targets):
    """Absolute error between the predictions and the real values, element by element.

    No averaging is done here: the result has the same shape as the inputs and
    must be aggregated by the caller (e.g. with ``.mean()``) to obtain the
    Mean Absolute Error.

    Params:
        - predictions: array with the predictions
        - targets: array with real values
    """
    error = predictions - targets
    return abs(error)

def mape(predictions, targets):
    """Absolute percentage error between the predictions and the real values, element by element.

    The result is not averaged; aggregate it (e.g. with ``.mean()``) to obtain
    the MAPE (Mean Absolute Percentage Error). The error is divided by
    ``targets``, so real values equal to zero produce inf/nan entries.

    Params:
        - predictions: array with the predictions
        - targets: array with real values
    """
    relative_error = (targets - predictions) / targets
    return abs(relative_error) * 100

def wmape(predictions, targets):
    """Function that computes the WMAPE (Weighted Mean Absolute Percentage Error) between
    the prediction and the target variable: total absolute error divided by the
    total of the real values, expressed as a percentage.

    Params:
        - predictions: array with the predictions
        - targets: array with real values
    """
    total_abs_error = np.sum(abs(targets - predictions))
    total_real = np.sum(targets)
    return total_abs_error / total_real * 100


def rmse(predictions, targets):
    """Function that computes the RMSE (Root Mean Squared Error) between the prediction
    and the target variable.

    Params:
        - predictions: array with the predictions
        - targets: array with real values
    """
    squared_error = (predictions - targets) ** 2
    return np.sqrt(squared_error.mean())

def rmse_lstm(y_true, y_pred):
    """Keras metric that monitors the RMSE on de-normalized values while the LSTM trains.

    Both tensors are mapped back to the original scale with entry 0 of the
    module-level ``std`` and ``mean`` arrays before the error is computed.

    Params:
        - y_true: tensor with real (normalized) values
        - y_pred: tensor with the (normalized) predictions
    """
    # NOTE(review): entry 0 is presumably the target column; confirm against the column order.
    scale, offset = std[0], mean[0]
    real_values = y_true * scale + offset
    predicted_values = y_pred * scale + offset

    # NOTE(review): the mean is taken over the last axis only; if that axis has a
    # single element this yields the absolute error per sample rather than a
    # batch-level RMSE. Confirm the tensor shapes Keras passes here.
    squared_error = K.square(predicted_values - real_values)
    return K.sqrt(K.mean(squared_error, axis= -1))


def wmape_lstm(predictions, targets):
    """Keras metric that monitors the WMAPE on de-normalized values while the LSTM trains.

    IMPORTANT: Keras calls metric functions positionally as ``fn(y_true, y_pred)``.
    The parameter names are kept for backward compatibility, but when used in
    ``model.compile(metrics=[...])`` the FIRST argument carries the real values
    and the SECOND the model output. The body follows that convention, so the
    denominator is the sum of the real values (previously it summed the
    predictions).

    Params:
        - predictions: first positional argument; Keras passes the real values here
        - targets: second positional argument; Keras passes the predictions here
    """
    y_true, y_pred = predictions, targets

    # Back to the original scale. NOTE(review): entry 0 of std/mean is presumably
    # the target column; confirm against the column order.
    y_real = y_true * std[0] + mean[0]
    y_p = y_pred * std[0] + mean[0]

    # Total absolute error weighted by the total of the real values, in percent
    return K.sum(K.abs(y_real - y_p)) / K.sum(y_real) * 100

def calculo_metricas(dataframe):
    """Function that computes all the metrics that we want to compare and prints a summary.

    Params:
        - dataframe: a pd.DataFrame with two columns
            + Pred: column with the predictions
            + Real: column with real values
          The dataframe is modified in place: the per-row "MAE" and "MAPE"
          columns are added to it.
    Output:
        - dataframe: the input dataframe, including the new "MAE" and "MAPE" columns
        - metrica: one-row pd.DataFrame with MAE, MAE (median), MAPE, WMAPE, RMSE
                   and "% Trend" (share of steps where prediction and real value
                   move in the same direction)
    """
    # Per-row absolute error and absolute percentage error
    dataframe.loc[:, "MAE"] = mae(dataframe.Pred, dataframe.Real)
    dataframe.loc[:, "MAPE"] = mape(dataframe.Pred, dataframe.Real)

    # Trend prediction: 1 when the series goes up with respect to the previous row, 0 otherwise
    up_down_price = dataframe[["Pred", "Real"]].diff(1).dropna()
    up_down_price["Ind_Pred"] = (up_down_price["Pred"] > 0).astype(int)
    up_down_price["Ind_Real"] = (up_down_price["Real"] > 0).astype(int)
    up_down_price["Aciertos"] = up_down_price["Ind_Pred"] == up_down_price["Ind_Real"]

    # Share of hits in percent. The mean of a boolean column is the fraction of
    # True values, which also works when there are no hits at all (the previous
    # value_counts()[1] lookup failed in that case).
    trend_hit_rate = up_down_price["Aciertos"].mean() * 100

    # Summary dataframe shown on screen
    metrica = pd.DataFrame({"MAE" : round(float(dataframe["MAE"].mean()), 2),
                            "MAE (median)" : round(float(np.median(dataframe["MAE"].to_numpy())), 2),
                            "MAPE" : round(float(dataframe["MAPE"].mean()), 2),
                            "WMAPE" : round(wmape(dataframe.Pred, dataframe.Real), 2),
                            "RMSE" : round(rmse(dataframe.Pred, dataframe.Real), 2),
                            "% Trend" : round(float(trend_hit_rate), 2)}, index = [0])

    # Print the metrics dataframe
    print(metrica)

    return dataframe, metrica


# Reading and preparation of the database

We read the already preprocessed dataframe, drop the variables that our earlier analysis showed to be unimportant, and create lagged copies of the price at 24 hours, 48 hours and 1 week.

In [4]:
# Load the preprocessed dataset; the first CSV column becomes the index
df = pd.read_csv('/kaggle/input/spain-electricity-price/dataframe.csv', index_col = 0)

# Parse the index as timestamps
df.index = pd.to_datetime(df.index)

# Discard the columns that are not used by the model
unused_columns = ["Festivo_Regional", "Humedad_Relativa", "Precipitacion", "Radiacion", "Velocidad_Viento"]
df = df.drop(columns = unused_columns)

# Lagged copies of the price, shifted by a number of rows
# (24, 48 and 24*7 rows: 1 day, 2 days and 1 week assuming hourly data)
for lag_name, lag_rows in (("lag_24", 24), ("lag_48", 48), ("lag_1_semana", 24*7)):
    df[lag_name] = df["Spot_electricidad"].shift(lag_rows)

# Brief exploration of the dataframe and the target variable

48 variables (all numeric)

In [5]:
# Summary statistics of every numeric column
df.describe()

In [6]:
# Distribution of the price per calendar year, drawn on an explicit axes object
box_fig, box_ax = plt.subplots(figsize = (15, 6))
sns.boxplot(x = df.Spot_electricidad.index.year, y = df.Spot_electricidad, ax = box_ax)
box_ax.set_ylabel('Electricity Price €/MWh')
box_ax.set_xlabel('Year')
box_ax.set_title('Evolution of the average price of electricity in Spain from 2014 to 2021')
plt.show()

**We note the importance of predicting this variable, since in the last year it has deviated a lot from its historical values, and of trying to explain this increase.**

In [7]:
# Interactive line chart of the whole price series
time_series = df["Spot_electricidad"].copy()

price_trace = go.Scatter(x = time_series.index.tolist(),
                         y = time_series,
                         mode = 'lines',
                         name = 'Real')

fig = go.Figure()
fig.add_trace(price_trace)
fig.update_layout(title = "Electricity hourly price evolution in Spain",
                  xaxis_title = "Date",
                  yaxis_title = "€/MWh",
                  legend_title = "Spot price")

fig.show()

## Data normalization

We will perform an hourly backtesting of the full year 2021.

In [8]:
# Drop nulls, it's important: the lag columns are NaN in the first rows
df = df.dropna()

print('First date of the dataframe: ', df.index[0])
print('Last date of the dataframe: ', df.index[-1])

# Work with numpy arrays. An explicit float64 COPY is taken because `df.values`
# may share memory with `df` (or be read-only, depending on the pandas
# configuration), so the in-place operations below could otherwise modify the
# dataframe itself or fail.
raw_data = df.to_numpy(dtype = "float64", copy = True)

print("Dataframe dimensions", np.shape(raw_data))

# Standardization of the data (per column). `mean` and `std` are kept as
# module-level arrays because they are needed later to de-normalize predictions.
# NOTE(review): the statistics are computed over ALL rows, including the period
# later used for backtesting; consider computing them on the training rows only.
mean = raw_data.mean(axis=0)
raw_data -= mean
std = raw_data.std(axis=0)
raw_data /= std

## Generator construction

In [9]:
def generator(data, order, target, lookback, min_index, max_index,
              shuffle=False, delay = 24, batch_size =24, step = 1):
    """Infinite generator that creates the input batches for the LSTM.

    For every selected row ``r`` the sample is the window ``data[r - lookback : r : step]``
    and the target is the value of the target column at row ``r + delay``.

    - Params:
        - data: The original array of floating-point data (rows are timesteps, columns are variables)
        - order: list with the variable names, in the same order as the columns of `data`
        - target: name (present in `order`) of the variable to be predicted
        - lookback: How many timesteps back the input data should go.
        - min_index and max_index: Indices in the data array that delimit which timesteps to draw from.
                                   This is useful for keeping a segment of the data for validation and
                                   another for testing. If max_index is None, ``len(data) - delay`` is used.
        - shuffle: Whether to shuffle the samples or draw them in chronological order.
        - delay: How many timesteps in the future the target should be. We are going to predict the next
                 24 hours each day, so this value must be 24.
        - batch_size: The number of samples per batch. 24 because of daily data.
        - step: Sampling period, in timesteps, inside each window.

    - Output (yielded forever):
        - samples: regressor variables, shape (rows in the batch, lookback // step, number of variables)
        - targets: target variable, shape (rows in the batch,)

    - Raises:
        - ValueError: if `target` is not in `order`, or if there is no row between
                      ``min_index + lookback`` and the end of the usable range.

    Rows whose target would fall beyond the end of `data` are never drawn, so a
    `max_index` closer than `delay` rows to the end of the array no longer
    raises an IndexError; the last batch before that limit may be shorter than
    `batch_size`.
    """
    target_index = order.index(target)

    # First row position whose target (row + delay) would be outside `data`
    last_valid_row = len(data) - delay
    if max_index is None:
        max_index = last_valid_row

    first_row = min_index + lookback
    upper_bound = min(max_index, last_valid_row)
    if first_row >= upper_bound:
        raise ValueError(
            "No rows available: min_index + lookback = %d but rows must be below %d"
            % (first_row, upper_bound))

    i = first_row
    while True:
        if shuffle:
            rows = np.random.randint(first_row, upper_bound, size = batch_size)
        else:
            # Start over when the next batch would reach max_index (original
            # behaviour) or when no row with an in-range target is left.
            if i + batch_size >= max_index or i >= last_valid_row:
                i = first_row
            rows = np.arange(i, min(i + batch_size, upper_bound))
            i += len(rows)

        samples = np.zeros((len(rows),
                            lookback // step,
                            data.shape[-1]))
        targets = np.zeros((len(rows),))

        for j, row in enumerate(rows):
            samples[j] = data[np.arange(row - lookback, row, step)]
            targets[j] = data[row + delay][target_index]

        yield samples, targets
        

def get_index(df, date):
    """Return the row position of the dataframe whose index label matches a date.

    The comparison is done on the string representation of each index label
    (``str(label) == date``). If several labels match, the position of the last
    one is returned.

    Params:
        - df: original dataframe without nan's
        - date: str with the date in the format: "%Y-%m-%d %H:%M:%S"
    Output:
        - Index: position (0-based) of the row of the dataframe which matches the date
    Raises:
        - ValueError: if no index label matches `date`. Previously 0 was returned
                      in that case, which is indistinguishable from a match on
                      the first row.
    """
    matches = [position for position, label in enumerate(df.index) if str(label) == date]
    if not matches:
        raise ValueError("Date %r not found in the dataframe index" % (date,))
    return matches[-1]
    

In [10]:
# PARAMETERS
# ====================================================================================================
lookback_days = 20
lookback = 24 * lookback_days  # window length in rows (24 rows per day)

# Position of the first row of 2021 in the dataframe without nulls: training stops there
split_index = get_index(df.dropna(), '2021-01-01 00:00:00')
total_rows = len(df.dropna())
feature_order = df.columns.tolist()

train_min_index = 0
train_max_index = split_index

# The validation split is ignored during training because a backtesting is performed instead
# val_min_index = get_index(df.dropna(), '2021-06-01 00:00:00') - lookback
# val_max_index = len(df.dropna())

# The test range starts `lookback` rows earlier so that the first test row has a full window
test_min_index = split_index - lookback
test_max_index = total_rows

print(train_min_index, train_max_index, test_min_index, test_max_index)

# Originally shuffled (shuffle = True); chronological order is being tried here
train_gen = generator(data = raw_data,
                      order = feature_order,
                      target = 'Spot_electricidad',
                      lookback = lookback,
                      min_index = train_min_index,
                      max_index = train_max_index,
                      shuffle = False)

# val_gen = generator(data = raw_data,
#                      order = df.columns.tolist(),
#                      target = 'Spot_electricidad',
#                      lookback = lookback,
#                      min_index = val_min_index,
#                      max_index = val_max_index)

test_gen = generator(data = raw_data,
                     order = feature_order,
                     target = 'Spot_electricidad',
                     lookback = lookback,
                     min_index = test_min_index,
                     max_index = test_max_index)

# Number of 24-row batches available in each range
train_steps = int((train_max_index - train_min_index - lookback)/24)
# val_steps = int((val_max_index - val_min_index - lookback)/24)
test_steps = int((test_max_index - test_min_index - lookback)/24)

# NOTE(review): the original comment described the training range as "Two years (2019, 2020)"; confirm against the data
print("Training steps", train_steps)
# print("Steps de validación",val_steps)
print("Test steps", test_steps) # Test data (backtesting, 2021)

# Model 

In [10]:
# Four stacked LSTM layers (128 -> 64 -> 32 -> 16 units, dropout 0.1 each) followed by
# two dense layers; the last one outputs a single value per sample.
n_features = raw_data.shape[-1]

model = Sequential([
    layers.LSTM(128, dropout = 0.1, return_sequences = True, input_shape = (lookback, n_features)),
    layers.LSTM(64, return_sequences = True, dropout = 0.1),
    layers.LSTM(32, return_sequences = True, dropout = 0.1),
    layers.LSTM(16, dropout = 0.1),
    layers.Dense(50),
    layers.Dense(1),
])

# MAE on normalized values is the loss; the custom metrics report de-normalized errors
model.compile(optimizer = 'adam', loss = 'mae', metrics = [rmse_lstm, wmape_lstm])

history = model.fit(train_gen,
                    steps_per_epoch = 100,
                    epochs = 150)

# ============================================================================
#If we want to see the validation data:

# early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_wmape',
#                                             patience=50,
#                                             restore_best_weights=True)

# history = model.fit(train_gen,
#                     steps_per_epoch = 100,
#                     epochs = 200,
#                     validation_data = val_gen,
#                     validation_steps = val_steps,
#                     callbacks = [early_stop])

# Backtesting

In [11]:
def backtesting_lstm(raw_data, df, target_variable, date, lookback,
                     retraining_epochs, retraining_steps_per_epoch, backtesting_days):

    """Function that performs a day-by-day backtesting with daily retraining.

    For each day, the module-level `model` is first retrained (except on the
    first day) on the most recent 24-row batch and then used to predict the
    following 24-row batch. Predictions and real values are de-normalized with
    the module-level `std` and `mean` arrays.

    Params:
        - raw_data: normalized data in np.array format
        - df: original dataframe (its columns give the variable order of `raw_data`)
        - target_variable: name of the variable to be predicted
        - date: this is the date which we want to start our backtesting
        - lookback: how many timesteps back the input windows go
        - retraining_epochs: number of epochs that we want to retrain every day
        - retraining_steps_per_epoch: number of steps_per_epoch that we want to use for each retraining
        - backtesting_days: number of days we want to perform our backtesting since the 'date' parameter
                            (the last one is skipped, see below)
    Output:
        - dataframe: pd.Dataframe with two columns, "Real" (original value) and "Pred"
                     (predicted value), with 24 rows per processed day
    """
    hours_per_day = 24

    clean_df = df.dropna()
    columns = clean_df.columns.tolist()
    index = get_index(clean_df, date)
    index_normalization = columns.index(target_variable)

    prediction, target = [], []

    # The minus 1 avoids an index error (original note); presumably because the
    # generator reads each target 24 rows ahead of its sample row.
    for i in range(backtesting_days - 1):
        gc.collect()
        print(i)

        # Retraining on the previous day's batch (nothing to retrain on for the first day)
        if i > 0:
            train_gen = generator(data = raw_data,
                                  order = columns,
                                  target = target_variable,
                                  lookback = lookback,
                                  min_index = index - lookback + (i - 1) * hours_per_day,
                                  max_index = index + i * hours_per_day,
                                  shuffle = False)
            model.fit(train_gen,
                      steps_per_epoch = retraining_steps_per_epoch,
                      epochs = retraining_epochs,
                      verbose = 0)

        # Batch to be predicted for this day
        test_min_index = index - lookback + i * hours_per_day
        test_max_index = index + (i + 1) * hours_per_day
        test_steps = int((test_max_index - test_min_index - lookback) / hours_per_day)

        test_gen = generator(data = raw_data,
                             order = columns,
                             target = target_variable,
                             lookback = lookback,
                             min_index = test_min_index,
                             max_index = test_max_index)

        for _ in range(test_steps):
            data_sample, target_sample = next(test_gen)
            preds = np.asarray(model.predict(data_sample)).flatten()

            # Denormalization
            prediction.append(preds * std[index_normalization] + mean[index_normalization])
            target.append(np.asarray(target_sample).flatten() * std[index_normalization]
                          + mean[index_normalization])

    # Flatten the data and get the dataframe
    final_target = np.concatenate(target) if target else np.array([])
    final_prediction = np.concatenate(prediction) if prediction else np.array([])
    dataframe = pd.DataFrame({"Real":pd.Series(final_target),
                              "Pred":pd.Series(final_prediction)})

    return dataframe

# Run the backtesting from the first hour of 2021.
# NOTE(review): the result is stored under the same name as the function, so the
# function is no longer callable after this cell runs; the definition cell has
# to be executed again before this one can be re-run.
backtesting_lstm = backtesting_lstm(
    raw_data,
    df.dropna(),
    target_variable = 'Spot_electricidad',
    date = "2021-01-01 00:00:00",
    lookback = lookback,
    retraining_epochs = 5,
    retraining_steps_per_epoch = 24,
    backtesting_days = test_steps,
)


In [12]:
# Metrics of the backtesting: `a` is the dataframe with the per-row MAE/MAPE columns
# added and `b` is the one-row summary table (also printed by the function)
a, b = calculo_metricas(backtesting_lstm)

In [17]:
# Real vs. predicted values over the whole backtesting
backtesting_lstm.loc[:, ["Real", "Pred"]].plot(figsize = (15, 6))

In [21]:
# Zoom on rows 24*100 to 24*150 of the backtesting (24 rows per day)
backtesting_lstm.loc[24*100:24*150, ["Real", "Pred"]].plot(figsize = (15, 6))

# Without Retraining

This is going to make bad predictions because the correlations between the variables changed a lot over the periods that we are studying, so the more epochs we train, the worse the metrics on the test data.

In [11]:
# New network: four stacked LSTM layers (128 -> 64 -> 32 -> 16 units, dropout 0.1 each)
# followed by two dense layers; the last one outputs a single value per sample.
n_features = raw_data.shape[-1]

model = Sequential([
    layers.LSTM(128, dropout = 0.1, return_sequences = True, input_shape = (lookback, n_features)),
    layers.LSTM(64, return_sequences = True, dropout = 0.1),
    layers.LSTM(32, return_sequences = True, dropout = 0.1),
    layers.LSTM(16, dropout = 0.1),
    layers.Dense(50),
    layers.Dense(1),
])

model.compile(optimizer = 'adam', loss = 'mae', metrics = [rmse_lstm, wmape_lstm])

# Stop when the validation WMAPE has not improved for 50 epochs and keep the best weights.
# NOTE(review): the validation data below is the test generator, so the test
# period influences which weights are kept; consider a separate validation range.
early_stop = tf.keras.callbacks.EarlyStopping(monitor = 'val_wmape_lstm',
                                              patience = 50,
                                              restore_best_weights = True)

# NOTE(review): train_gen is a stateful generator; if it was already consumed by an
# earlier fit it continues from where it stopped instead of from the first batch.
history = model.fit(train_gen,
                    steps_per_epoch = 100,
                    epochs = 150,
                    validation_data = test_gen,
                    validation_steps = test_steps,
                    callbacks = [early_stop])

In [14]:
# Create the test generator again so that it starts from its first batch
test_min_index = get_index(df.dropna(), '2021-01-01 00:00:00') - lookback
test_max_index = len(df.dropna())
test_gen = generator(raw_data,
                     df.columns.tolist(),
                     'Spot_electricidad',
                     lookback,
                     test_min_index,
                     test_max_index)

def predictions_lstm(raw_data, df, target_variable, date, lookback,
                     retraining_epochs, retraining_steps_per_epoch, backtesting_days):
    """Predict the test period with the module-level `model`, without any retraining.

    The function builds its own generator from its arguments, so it no longer
    depends on (or consumes) the module-level `test_gen` and `test_steps`, and
    calling it twice gives the same batches. Predictions are computed batch by
    batch instead of first stacking every sample window in memory. Predictions
    and real values are de-normalized with the module-level `std` and `mean`.

    Params:
        - raw_data: normalized data in np.array format
        - df: original dataframe (its columns give the variable order of `raw_data`)
        - target_variable: name of the variable to be predicted
        - date: date at which the predicted period starts
        - lookback: how many timesteps back the input windows go
        - retraining_epochs: not used (there is no retraining); kept in the signature
        - retraining_steps_per_epoch: not used (there is no retraining); kept in the signature
        - backtesting_days: number of 24-row batches available from `date`;
                            ``backtesting_days - 1`` batches are predicted
    Output:
        - dataframe: pd.Dataframe with two columns, "Real" (original value) and
                     "Pred" (predicted value)
    """
    clean_df = df.dropna()
    columns = clean_df.columns.tolist()
    index_normalization = columns.index(target_variable)

    batch_gen = generator(data = raw_data,
                          order = columns,
                          target = target_variable,
                          lookback = lookback,
                          min_index = get_index(clean_df, date) - lookback,
                          max_index = len(clean_df))

    preds, reals = [], []
    for _ in range(backtesting_days - 1):
        data_sample, target_sample = next(batch_gen)
        preds.append(np.asarray(model.predict(data_sample)).flatten())
        reals.append(np.asarray(target_sample).flatten())

    # Flatten the data, de-normalize it and build the dataframe
    final_target = np.concatenate(reals) if reals else np.array([])
    final_prediction = np.concatenate(preds) if preds else np.array([])
    final_target = final_target * std[index_normalization] + mean[index_normalization]
    final_prediction = final_prediction * std[index_normalization] + mean[index_normalization]

    dataframe = pd.DataFrame({"Real":pd.Series(final_target),
                              "Pred":pd.Series(final_prediction)})
    return dataframe

# Predictions of the model trained once, from the first hour of 2021
without_retraining_lstm = predictions_lstm(
    raw_data,
    df.dropna(),
    target_variable = 'Spot_electricidad',
    date = "2021-01-01 00:00:00",
    lookback = lookback,
    retraining_epochs = 5,
    retraining_steps_per_epoch = 24,
    backtesting_days = test_steps,
)
    


In [15]:
# Real vs. predicted values for the model without retraining
without_retraining_lstm.loc[:, ["Real", "Pred"]].plot(figsize = (15, 6))