<h1 align="center">Deep Learning - Master in Deep Learning of UPM</h1>

**IMPORTANTE**

Antes de empezar debemos instalar PyTorch Lightning. Por defecto, con esto bastaría:

In [None]:
!pip install pytorch-lightning

Además, si estás ejecutando este código en Google Colab, lo mejor será que montes tu Drive para tener acceso a los datos:

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import datetime

import torch
import torch.nn as nn

import pytorch_lightning as pl
import torchmetrics
from pytorch_lightning import seed_everything

import numpy as np

import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

import matplotlib.pyplot as plt


# Location of the CSV with the price series and the global random seed.
DATA_PATH = 'data/stocks.csv'
SEED = 42
seed_everything(seed=SEED) # Fix a seed so that the experiments are reproducible



Este dataset consta del precio de cierre de la acción de Amazon (AMZN) desde 2006 hasta 2017. Es el mismo que utilizamos durante la práctica de redes recurrentes.


In [None]:
# Load the price table, parse the date column and put the rows in chronological order.
data = pd.read_csv(DATA_PATH)
data = data.assign(
    date=pd.to_datetime(data['date'], format='%Y-%m-%d'),
).sort_values('date')

# Report the time span covered by the series.
print(f"Date range: {data['date'].min()} to {data['date'].max()}")

Esta es la distribución de los datos. También usaremos un escalador para evitar los problemas de magnitud.

In [None]:
# Plot the closing price over time.
# The frame prepared in the previous cell is reused on purpose: reading the CSV again here
# would discard the parsed dates and the chronological ordering, so the x axis would be
# drawn from raw strings instead of real dates.
data.plot(x='date', y='close', title='AMZN stock price', ylabel='Price', xlabel='Date', figsize=(10, 5))



Como esto ya lo hemos hecho anteriormente, ya lo tenemos disponible: Dataset y DataModule.

In [None]:
class StocksDataset(torch.utils.data.Dataset):
    """Sliding-window dataset over a table of prices.

    Sample ``i`` pairs the ``w`` consecutive rows starting at row ``i`` (the features) with
    the ``h`` rows that immediately follow them (the target).

    Args:
        df: DataFrame containing a ``date`` column, which is dropped; every remaining
            column is used as a feature. Rows are consumed in the order given.
        w: Number of rows in each input window.
        h: Forecast horizon, i.e. number of rows following the window that form the target.
    """

    def __init__(self, df, w=10, h=1):
        self.data = df.drop('date', axis=1).values
        self.w = w
        self.h = h

    def __len__(self):
        # A table shorter than one full (window + horizon) span holds no samples.
        # Clamp to zero because len() raises ValueError on a negative result.
        return max(0, len(self.data) - (self.w + self.h) + 1)

    def __getitem__(self, idx):
        """Return ``(features, target)`` for window ``idx``.

        Returns:
            features: array of shape ``(w, n_features)``.
            target: flattened array with the ``h`` rows after the window
                (shape ``(h,)`` when there is a single feature column).

        Raises:
            IndexError: if ``idx`` is outside ``[-len(self), len(self))``.
        """
        n_samples = len(self)
        if idx < 0:
            idx += n_samples
        # Slicing never fails on its own, so without this check an out-of-range index would
        # silently yield truncated windows and plain iteration would never terminate.
        if not 0 <= idx < n_samples:
            raise IndexError(f'index {idx} out of range for dataset of size {n_samples}')

        window_end = idx + self.w
        features = self.data[idx:window_end]  # rows [i, i+w)
        target = self.data[window_end:window_end + self.h].reshape(-1)  # rows [i+w, i+w+h)
        return features, target

In [None]:
class StocksDataModule(pl.LightningDataModule):
    """DataModule that splits a price table chronologically, scales it and serves windows.

    Args:
        df: DataFrame with a ``date`` column (parsed with the ``%d/%m/%Y`` format) and a
            ``close`` column. The caller's frame is not modified.
        w: Input window length forwarded to ``StocksDataset``.
        h: Forecast horizon forwarded to ``StocksDataset``.
        batch_size: Batch size used by every dataloader.
        val_size: Fraction of the rows used for validation.
        test_size: Fraction of the rows (the most recent ones) used for testing.

    Raises:
        ValueError: if the split fractions are negative or leave no rows for training.
    """

    def __init__(self, df, w=10, h=1, batch_size=16, val_size=0.2, test_size=0.2):
        super().__init__()
        self.data = df

        self.sequential_train_val_test_split(df, val_size=val_size, test_size=test_size)
        self.normalize()

        self.w = w
        self.h = h

        self.batch_size = batch_size

    def setup(self, stage=None):
        """Build the datasets required by ``stage``.

        ``stage=None`` (this method's default) builds every dataset, and ``'validate'``
        builds the validation set, so the module also works outside of ``fit``/``test``.
        """
        if stage in (None, 'fit'):
            self.train_dataset = StocksDataset(self.train_df, w=self.w, h=self.h)
        if stage in (None, 'fit', 'validate'):
            self.val_dataset = StocksDataset(self.val_df, w=self.w, h=self.h)
        if stage in (None, 'test'):
            self.test_dataset = StocksDataset(self.test_df, w=self.w, h=self.h)

    def normalize(self):
        """Scale the ``close`` column of each split to [0, 1] with its own MinMaxScaler."""
        self.scaler_train = MinMaxScaler()
        self.scaler_val = MinMaxScaler()
        self.scaler_test = MinMaxScaler()

        # Fit and transform every split independently.
        # NOTE(review): because each split uses its own min/max, the same price maps to
        # different scaled values across splits. Fitting on train only and reusing that
        # scaler is the usual approach -- confirm that per-split scaling is intended.
        self.train_df['close'] = self.scaler_train.fit_transform(self.train_df[['close']])
        self.val_df['close'] = self.scaler_val.fit_transform(self.val_df[['close']])
        self.test_df['close'] = self.scaler_test.fit_transform(self.test_df[['close']])

    def sequential_train_val_test_split(self, df, val_size=0.2, test_size=0.2):
        """Sort ``df`` by date and cut it into consecutive train / val / test frames.

        The sorted copy is stored in ``self.data`` and the three slices in
        ``self.train_df``, ``self.val_df`` and ``self.test_df``.
        """
        if val_size < 0 or test_size < 0 or val_size + test_size >= 1:
            raise ValueError(
                'val_size and test_size must be non-negative and sum to less than 1, '
                f'got val_size={val_size}, test_size={test_size}'
            )

        # Work on a copy so that the frame owned by the caller is left untouched.
        df = df.copy()

        # Make sure the date column is a datetime and order the rows by it.
        df['date'] = pd.to_datetime(df['date'], format='%d/%m/%Y')
        df = df.sort_values('date')
        self.data = df

        # Row positions where the train and validation slices end.
        n = len(df)
        train_end = int((1 - val_size - test_size) * n)
        val_end = int((1 - test_size) * n)

        self.train_df = df.iloc[:train_end].copy()
        self.val_df = df.iloc[train_end:val_end].copy()
        self.test_df = df.iloc[val_end:].copy()

    def collate_fn(self, batch):
        """Stack a list of ``(features, target)`` samples into two float32 tensors."""
        features, targets = zip(*batch)

        features = np.stack(features, axis=0)  # [batch_size, w, input_size]
        targets = np.stack(targets, axis=0)    # [batch_size, h] (targets arrive flattened)

        features = torch.tensor(features, dtype=torch.float32)
        targets = torch.tensor(targets, dtype=torch.float32)
        return features, targets

    def train_dataloader(self):
        # Windows are independent samples, so only the training loader is shuffled.
        return torch.utils.data.DataLoader(self.train_dataset, batch_size=self.batch_size, shuffle=True, collate_fn=self.collate_fn)

    def val_dataloader(self):
        return torch.utils.data.DataLoader(self.val_dataset, batch_size=self.batch_size, shuffle=False, collate_fn=self.collate_fn)

    def test_dataloader(self):
        return torch.utils.data.DataLoader(self.test_dataset, batch_size=self.batch_size, shuffle=False, collate_fn=self.collate_fn)

Vamos a crear nuestros propios módulos de atención. ¡Queremos que todos ellos sean entrenables! Así que los diseñaremos con pesos.
Hemos aprendido cómo calcular la LSTM con pesos entrenables de atención, pero no cómo se hace en otros módulos de atención.
Para ello tendremos que hacer una llamada previa a una capa de tipo `linear(q)`, `linear(k)`, `linear(v)`.

In [None]:
class TrainableAdditiveAttention(nn.Module):
    """Additive attention with trainable projections.

    The score of each position is ``V(tanh(W1(q) + W2(k)))``; the scores are normalised
    with a softmax over dim 1 and used to re-weight ``W3(v)`` position by position. The
    sequence is not summed, so the output keeps one vector per position.

    Args:
        hidden_dim: Size of the last dimension of ``q``, ``k`` and ``v``.
    """

    def __init__(self, hidden_dim):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.W1 = nn.Linear(hidden_dim, hidden_dim)
        self.W2 = nn.Linear(hidden_dim, hidden_dim)
        self.W3 = nn.Linear(hidden_dim, hidden_dim)
        self.V = nn.Linear(hidden_dim, 1)

    def _score(self, q, k):
        # One scalar score per position: the trailing dimension becomes 1.
        mixed = self.W1(q) + self.W2(k)
        return self.V(mixed.tanh())

    def forward(self, q, k, v):
        """Return ``(context_vector, attention_weights)``.

        Inputs are assumed to be ``[batch_size, seq_len, hidden_dim]``; the context then
        has that same shape and the weights are ``[batch_size, seq_len, 1]``.
        """
        energies = self._score(q, k)
        attention_weights = energies.softmax(dim=1)  # normalise across dim 1 (sequence)
        projected_values = self.W3(v)
        # Broadcast each position's weight over its projected value vector.
        context_vector = projected_values * attention_weights
        return context_vector, attention_weights

In [None]:
class TrainableGeneralAttention(nn.Module):
    """General (bilinear) attention with trainable input projections.

    Queries, keys and values are first passed through their own linear layer. The score
    between a query and a key is ``q @ W_a @ k^T``; scores are normalised with a softmax
    over the key axis and used to average the projected values.

    Args:
        query_dim: Size of the last dimension of the queries.
        key_dim: Size of the last dimension of the keys.
        value_dim: Size of the last dimension of the values.
    """

    def __init__(self, query_dim, key_dim, value_dim):
        super().__init__()
        self.W_a = nn.Parameter(torch.randn(query_dim, key_dim))
        self.Wq = nn.Linear(query_dim, query_dim)
        self.Wk = nn.Linear(key_dim, key_dim)
        self.Wv = nn.Linear(value_dim, value_dim)

    def _score(self, q, k):
        # Bilinear form q @ W_a @ k^T: one score per (query position, key position) pair.
        left = torch.matmul(q, self.W_a)
        return torch.matmul(left, k.transpose(-2, -1))

    def forward(self, q, k, v):
        """Return ``(context_vector, attention_weights)``.

        For ``q`` of shape ``[..., len_q, query_dim]`` and ``k``/``v`` of shape
        ``[..., len_k, dim]`` the context is ``[..., len_q, value_dim]`` and the weights
        are ``[..., len_q, len_k]``, each row summing to 1.
        """
        # Apply the trainable projections; they were previously declared but never used.
        score = self._score(self.Wq(q), self.Wk(k))
        # Normalise over the keys (last axis) so each query gets a proper distribution;
        # dim=1 would normalise across queries for batched inputs.
        attention_weights = torch.softmax(score, dim=-1)
        # Matrix product: weighted average of the projected values for every query.
        context_vector = torch.matmul(attention_weights, self.Wv(v))
        return context_vector, attention_weights

In [None]:
class TrainableScaledDotProductAttention(nn.Module):
    """Scaled dot-product attention with trainable input projections.

    Queries, keys and values are first passed through their own linear layer. Scores are
    ``q @ k^T / sqrt(d)`` with ``d`` the query feature size, normalised with a softmax over
    the key axis and used to average the projected values.

    Args:
        query_dim: Size of the last dimension of the queries.
        key_dim: Size of the last dimension of the keys (must match ``query_dim`` for the
            dot product to be defined).
        value_dim: Size of the last dimension of the values.
    """

    def __init__(self, query_dim, key_dim, value_dim):
        super().__init__()
        self.Wk = nn.Linear(key_dim, key_dim)
        self.Wq = nn.Linear(query_dim, query_dim)
        self.Wv = nn.Linear(value_dim, value_dim)

    def _score(self, q, k):
        # Dot product between every query and every key, divided by sqrt(d).
        return torch.matmul(q, k.transpose(-2, -1)) / (q.size(-1) ** 0.5)

    def forward(self, q, k, v):
        """Return ``(context_vector, attention_weights)``.

        For ``q`` of shape ``[..., len_q, dim]`` and ``k``/``v`` of shape
        ``[..., len_k, dim]`` the context is ``[..., len_q, value_dim]`` and the weights
        are ``[..., len_q, len_k]``, each row summing to 1.
        """
        score = self._score(self.Wq(q), self.Wk(k))
        # Normalise over the keys (last axis) so each query gets a proper distribution;
        # dim=1 would normalise across queries for batched inputs.
        attention_weights = torch.softmax(score, dim=-1)
        # Matrix product: weighted average of the projected values for every query.
        context_vector = torch.matmul(attention_weights, self.Wv(v))
        return context_vector, attention_weights

El siguiente paso será definir la LSTM compatible con atención y con múltiples métodos de pooling. Vamos a crear un nuevo módulo que incorpora las atenciones propuestas. También vamos a generalizar la definición de la LSTM para poder intercalar sucesivas atenciones.

In [None]:
class AdvancedAttentionLSTMRegressor(nn.Module):
    """Stacked LSTM regressor with optional self-attention and selectable pooling.

    The network is built from single-layer LSTMs so that dropout and an attention block
    can be placed between consecutive layers; one more attention block follows the last
    LSTM. The sequence is then pooled and mapped to ``h`` outputs by a linear layer.

    Args:
        h: Forecast horizon (number of outputs).
        input_size: Number of variables in the time series.
        hidden_size: Size of the LSTM hidden state.
        num_layers: Number of stacked LSTM layers.
        batch_first: Whether the batch is the first dimension of the input.
        p_drop: Dropout probability applied between LSTM layers (never after the last).
        attention_type: ``'additive'``, ``'general'`` or ``'sdpa'`` to enable attention;
            ``None``, ``'None'`` or ``'none'`` to disable it.
        pooling_type: How the sequence is reduced: ``'last'``, ``'mean'`` or ``'max'``.

    Raises:
        ValueError: if ``attention_type`` or ``pooling_type`` is not one of the above.
    """

    def __init__(self,  h=1,
                 input_size=1,
                 hidden_size=64,
                 num_layers=1,
                 batch_first=True,
                 p_drop=0.0,
                 attention_type='None',
                 pooling_type='last'):
        super().__init__()
        # Reject typos early: an unknown pooling used to return the unpooled sequence.
        if pooling_type not in ('last', 'mean', 'max'):
            raise ValueError(
                f"Unknown pooling_type {pooling_type!r}; expected 'last', 'mean' or 'max'"
            )

        self.batch_first = batch_first
        self.lstm_init_layer = nn.LSTM(input_size=input_size,
                                       hidden_size=hidden_size,
                                       num_layers=1,
                                       batch_first=batch_first)
        self.lstm_layers = nn.ModuleList([nn.LSTM(input_size=hidden_size,
                                                  hidden_size=hidden_size,
                                                  num_layers=1,
                                                  batch_first=batch_first,
                                                  ) for _ in range(num_layers - 1)])
        # One dropout per transition between layers: there is none after the last layer.
        self.dropout_layers = nn.ModuleList([nn.Dropout(p_drop) for _ in range(num_layers - 1)])
        self.pooling_type = pooling_type
        self.attention_type = None

        # One attention block per LSTM layer; the last one runs after the final LSTM.
        if attention_type == 'additive':
            self.attention_type = TrainableAdditiveAttention
            self.attention_layers = nn.ModuleList([self.attention_type(hidden_size) for _ in range(num_layers)])
        elif attention_type == 'general':
            self.attention_type = TrainableGeneralAttention
            self.attention_layers = nn.ModuleList([self.attention_type(hidden_size, hidden_size, hidden_size) for _ in range(num_layers)])
        elif attention_type == 'sdpa':
            self.attention_type = TrainableScaledDotProductAttention
            self.attention_layers = nn.ModuleList([self.attention_type(hidden_size, hidden_size, hidden_size) for _ in range(num_layers)])
        elif attention_type is not None and str(attention_type).lower() != 'none':
            # A misspelled attention name used to disable attention silently.
            raise ValueError(
                f"Unknown attention_type {attention_type!r}; expected 'additive', "
                "'general', 'sdpa' or 'none'"
            )

        self.fc = nn.Linear(hidden_size, h)

    def _attend(self, index, x):
        """Apply self-attention block ``index`` to ``x`` (no-op when attention is off)."""
        if self.attention_type is None:
            return x
        layer = self.attention_layers[index]
        if self.batch_first:
            attended, _ = layer(x, x, x)
            return attended
        # The attention blocks treat dim 1 as the sequence, so swap to batch-first and back.
        batch_major = x.transpose(0, 1)
        attended, _ = layer(batch_major, batch_major, batch_major)
        return attended.transpose(0, 1).contiguous()

    def forward(self, x):
        """Map a batch of sequences to predictions of shape ``[batch_size, h]``."""
        x, _ = self.lstm_init_layer(x)

        for i, lstm_layer in enumerate(self.lstm_layers):
            x = self.dropout_layers[i](x)
            x = self._attend(i, x)
            x, _ = lstm_layer(x)

        x = self._attend(-1, x)

        # The sequence axis depends on the layout the LSTMs were built with.
        seq_dim = 1 if self.batch_first else 0
        if self.pooling_type == 'last':
            x = x.select(seq_dim, -1)
        elif self.pooling_type == 'mean':
            x = x.mean(dim=seq_dim)
        elif self.pooling_type == 'max':
            x = x.max(dim=seq_dim)[0]

        return self.fc(x)  # [batch_size, h]

Declaramos el Lightning Module

In [None]:
class StockPredictor(pl.LightningModule):
    """LightningModule that trains a forecasting network with a mean squared error loss.

    Args:
        model: Network called on the input batch to produce the predictions.
        learning_rate: Learning rate handed to the AdamW optimizer.
    """

    def __init__(self, model, learning_rate=1e-3):
        super().__init__()
        # Record the constructor arguments as hyperparameters.
        # NOTE(review): this also captures `model`; Lightning offers
        # save_hyperparameters(ignore=['model']) -- confirm whether checkpoints rely on it.
        self.save_hyperparameters()
        self.model = model
        self.learning_rate = learning_rate
        self.criterion = nn.MSELoss()

    def forward(self, x):
        return self.model(x)

    def compute_batch(self, batch, split='train'):
        """Run the model on one batch, log ``{split}_loss`` and return the loss."""
        inputs, targets = batch

        # Flatten both sides so that predictions and targets line up element by element.
        flat_preds = self(inputs).view(-1)
        flat_targets = targets.view(-1)

        loss = self.criterion(flat_preds, flat_targets)
        metrics = {f'{split}_loss': loss}
        self.log_dict(metrics, on_epoch=True, prog_bar=True)
        return loss

    def training_step(self, batch, batch_idx):
        return self.compute_batch(batch, 'train')

    def validation_step(self, batch, batch_idx):
        return self.compute_batch(batch, 'val')

    def test_step(self, batch, batch_idx):
        return self.compute_batch(batch, 'test')

    def configure_optimizers(self):
        # self.parameters() covers every parameter of the wrapped model.
        return torch.optim.AdamW(self.parameters(), lr=self.learning_rate)

Explora varios parámetros y configuraciones, y observa qué ocurre en el entrenamiento con cada mecanismo.

In [None]:
# @title Seleccion parametros
# Experiment settings read by the training cells below. The trailing `#@param` markers are
# Colab form directives and must be kept exactly as written.
w = 10 #@param {type:"integer"}
h = 3 #@param {type:"integer"}
input_size = 1 #@param {type:"integer"}
batch_size = 64 #@param {type:"integer"}
num_layers = 1 #@param {type:"integer"}
hidden_size = 128 #@param {type:"integer"}
learning_rate = 1e-3 #@param {type:"number"}
p_drop = 0.2 #@param {type:"number"}
pooling = 'last' #@param ["last", "mean", "max"]
attention = 'none' #@param ["none", "additive", "general", "sdpa"]

In [None]:
# Training run on GPU: build the data module, the model and the trainer, then fit and test.

# Output directory, unique per run thanks to the timestamp
SAVE_DIR = f'lightning_logs/stock/{datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")}'

# DataModule
data = pd.read_csv(DATA_PATH)
data_module = StocksDataModule(data, w=w, h=h, batch_size=batch_size)

# Model
# The keyword names must match the constructor (`pooling_type` / `attention_type`);
# `pooling=` / `attention=` raise a TypeError.
model = AdvancedAttentionLSTMRegressor(h=h, input_size=input_size,
                                       hidden_size=hidden_size,
                                       num_layers=num_layers,
                                       batch_first=True,
                                       p_drop=p_drop,
                                       pooling_type=pooling,
                                       attention_type=attention)

# LightningModule
module = StockPredictor(model, learning_rate=learning_rate)

# Callbacks
early_stopping_callback = pl.callbacks.EarlyStopping(
    monitor='val_loss', # watch the loss on the validation set
    mode='min',
    patience=5, # number of epochs without improvement before stopping
    verbose=False, # whether to print early-stopping status messages
)
# ModelCheckpoint lives in pytorch_lightning (`pl`), not in pandas (`pd`).
model_checkpoint_callback = pl.callbacks.ModelCheckpoint(
    monitor='val_loss', # watch the loss on the validation set
    mode='min', # the loss has to be minimised
    save_top_k=1, # keep only the best model
    dirpath=SAVE_DIR, # directory where checkpoints are written
    filename='best_model' # checkpoint file name
)

# Uncomment depending on whether the forecasting callback is wanted
# forecasting_callback = ForecastingCallback()
# callbacks = [early_stopping_callback, model_checkpoint_callback, forecasting_callback]

callbacks = [early_stopping_callback, model_checkpoint_callback]

# Loggers
csv_logger = pl.loggers.CSVLogger(
    save_dir=SAVE_DIR,
    name='metrics',
    version=None
)

loggers = [csv_logger] # several loggers can be combined (see the documentation)

# Trainer
trainer = pl.Trainer(max_epochs=50, accelerator='gpu', devices=[0], callbacks=callbacks, logger=loggers)

trainer.fit(module, data_module)
results = trainer.test(module, data_module)

In [None]:
# Training run on CPU: same pipeline as the GPU cell, with the accelerator set to 'cpu'.

# Output directory, unique per run thanks to the timestamp
SAVE_DIR = f'lightning_logs/stock/{datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")}'

# DataModule
data = pd.read_csv(DATA_PATH)
data_module = StocksDataModule(data, w=w, h=h, batch_size=batch_size)

# Model
model = AdvancedAttentionLSTMRegressor(
    h=h,
    input_size=input_size,
    hidden_size=hidden_size,
    num_layers=num_layers,
    batch_first=True,
    p_drop=p_drop,
    pooling_type=pooling,
    attention_type=attention,
)

# LightningModule
module = StockPredictor(model, learning_rate=learning_rate)

# Callbacks: both track the validation loss, which has to be minimised
early_stopping_callback = pl.callbacks.EarlyStopping(
    monitor='val_loss',
    mode='min',
    patience=5,  # epochs without improvement before stopping
    verbose=False,  # no early-stopping status messages
)
model_checkpoint_callback = pl.callbacks.ModelCheckpoint(
    monitor='val_loss',
    mode='min',
    save_top_k=1,  # keep only the best model
    dirpath=SAVE_DIR,  # directory where checkpoints are written
    filename='best_model',  # checkpoint file name
)

# Uncomment depending on whether the forecasting callback is wanted
# forecasting_callback = ForecastingCallback()
# callbacks = [early_stopping_callback, model_checkpoint_callback, forecasting_callback]

callbacks = [early_stopping_callback, model_checkpoint_callback]

# Loggers (several can be combined, see the documentation)
csv_logger = pl.loggers.CSVLogger(save_dir=SAVE_DIR, name='metrics', version=None)
loggers = [csv_logger]

# Trainer
trainer = pl.Trainer(
    max_epochs=50,
    accelerator='cpu',
    callbacks=callbacks,
    logger=loggers,
)

trainer.fit(module, data_module)
results = trainer.test(module, data_module)