In [97]:
import torch
import torch.nn as nn
import torch.utils.data
from torch.nn import functional as F
import pandas as pd
import numpy as np
import random
import math

In [98]:
# -------
EPOCHS = 10
LR = 0.001
SEQ_LENGTH = 12 # Number of historical data points to consider
BATCH_SIZE = 64
D_MODEL = 1 #2  # number of features (demand + temperature)
NHEAD = 1
NUM_ENCODER_LAYERS = 2
NUM_DECODER_LAYERS = 2
# -------

In [99]:
# def create_sequences(data, seq_length, num_samples):
#     sequences = []
#     target = []
#     if (num_samples > len(data)):
#         print("num_samples too large")
#         return
    
#     for _ in range(num_samples):
#         idx = random.randint(0, len(data)-seq_length - 1)
#         seq = data[idx:idx+seq_length+1]
# #         label = data[idx+seq_length]
#         sequences.append(seq)
#         target.append(seq)
# #         target.append(label)

#     return np.array(sequences), np.array(target)

In [100]:
def create_sequences(data, seq_length, num_samples):
    res = []
    if (num_samples > len(data)):
        print("num_samples too large")
        return
    
    for _ in range(num_samples):
        idx = random.randint(0, len(data)-seq_length - 1)
        seq = data[idx:idx+seq_length+1]
        res.append([seq, seq])
    return res

def batchify_data(data, batch_size=16, padding=False, padding_token=-1):
    batches = []
    for idx in range(0, len(data), batch_size):
        # We make sure we dont get the last bit if its not batch_size size
        if idx + batch_size < len(data):
            # Here you would need to get the max length of the batch,
            # and normalize the length with the PAD token.
            if padding:
                max_batch_length = 0

                # Get longest sentence in batch
                for seq in data[idx : idx + batch_size]:
                    if len(seq) > max_batch_length:
                        max_batch_length = len(seq)

                # Append X padding tokens until it reaches the max length
                for seq_idx in range(batch_size):
                    remaining_length = max_bath_length - len(data[idx + seq_idx])
                    data[idx + seq_idx] += [padding_token] * remaining_length

            batches.append(np.array(data[idx : idx + batch_size]))

    print(f"{len(batches)} batches of size {batch_size}")

    return batches

In [101]:
# Load your data into a DataFrame
path = "/Users/aidanwiteck/Desktop/Princeton/Year 4/Thesis/electricgrid/data/final_tables/banc/banc.csv"
df = pd.read_csv(path)
df['timestamp'] = pd.to_datetime(df['timestamp'])

# Z-score normalization
mean_demand = df['Demand (MWh)'].mean()
std_demand = df['Demand (MWh)'].std()

df['Normalized Demand'] = (df['Demand (MWh)'] - mean_demand) / std_demand

In [102]:
train_data = create_sequences(df['Normalized Demand'].values, SEQ_LENGTH, 9000)
val_data = create_sequences(df['Normalized Demand'].values, SEQ_LENGTH, 3000)

train_dataloader = batchify_data(train_data)
val_dataloader = batchify_data(val_data)

562 batches of size 16
187 batches of size 16


In [103]:
# # Create sequences
# X, y = create_sequences(df['Normalized Demand'].values, SEQ_LENGTH, 5000)  # For example, creating 5000 samples
# X = torch.tensor(X, dtype=torch.float32).unsqueeze(-1)
# y = torch.tensor(y, dtype=torch.float32).unsqueeze(-1)

In [104]:
# # Split data (80/20 split)
# train_size = int(0.8 * len(X))
# X_train, X_test = X[:train_size], X[train_size:]
# y_train, y_test = y[:train_size], y[train_size:]

In [105]:
# train_data = torch.utils.data.TensorDataset(X_train, y_train)
# train_loader = torch.utils.data.DataLoader(train_data, shuffle=True, batch_size=BATCH_SIZE)

In [106]:
# train_data[1]

In [117]:
class PositionalEncoding(nn.Module):
    def __init__(self, dim_model, dropout_p, max_len):
        super().__init__()
        # Modified version from: https://pytorch.org/tutorials/beginner/transformer_tutorial.html
        # max_len determines how far the position can have an effect on a token (window)
        
        # Info
        self.dropout = nn.Dropout(dropout_p)
        
        # Encoding - From formula
        pos_encoding = torch.zeros(max_len, dim_model)
        positions_list = torch.arange(0, max_len, dtype=torch.float).view(-1, 1) # 0, 1, 2, 3, 4, 5
        division_term = torch.exp(torch.arange(0, dim_model, 2).float() * (-math.log(10000.0)) / dim_model) 
        
        # PE(pos, 2i) = sin(pos/1000^(2i/dim_model))
        pos_encoding[:, 0::2] = torch.sin(positions_list * division_term)
        
        # PE(pos, 2i + 1) = cos(pos/1000^(2i/dim_model))
        pos_encoding[:, 1::2] = torch.cos(positions_list * division_term)
        
        # Saving buffer (same as parameter without gradients needed)
        pos_encoding = pos_encoding.unsqueeze(0).transpose(0, 1)
        self.register_buffer("pos_encoding",pos_encoding)
        
    def forward(self, token_embedding: torch.tensor) -> torch.tensor:
        # Residual connection + pos encoding
        return self.dropout(token_embedding + self.pos_encoding[:token_embedding.size(0), :])

In [118]:
class Transformer(nn.Module):
    """
    Model from "A detailed guide to Pytorch's nn.Transformer() module.", by
    Daniel Melchor: 
    https://medium.com/@danielmelchor/a-detailed-guide-to-pytorchs-nn-transformer-module-c80afbc9ffb1
    """
    # Constructor
    def __init__(
        self,
        num_tokens,
        dim_model,
        num_heads,
        num_encoder_layers,
        num_decoder_layers,
        dropout_p,
    ):
        super().__init__()

        # INFO
        self.model_type = "Transformer"
        self.dim_model = dim_model

        # LAYERS
        self.positional_encoder = PositionalEncoding(
            dim_model=dim_model, dropout_p=dropout_p, max_len=5000
        )
        self.embedding = nn.Embedding(num_tokens, dim_model)
        self.transformer = nn.Transformer(
            d_model=dim_model,
            nhead=num_heads,
            num_encoder_layers=num_encoder_layers,
            num_decoder_layers=num_decoder_layers,
            dropout=dropout_p,
        )
        self.out = nn.Linear(dim_model, num_tokens)
        
    def forward(self, src, tgt, tgt_mask=None, src_pad_mask=None, tgt_pad_mask=None):
        # Src size must be (batch_size, src sequence length)
        # Tgt size must be (batch_size, tgt sequence length)

        # Embedding + positional encoding - Out size = (batch_size, sequence length, dim_model)
#         src = self.embedding(src) * math.sqrt(self.dim_model)
#         tgt = self.embedding(tgt) * math.sqrt(self.dim_model)
        src = self.positional_encoder(src)
        tgt = self.positional_encoder(tgt)
        
        # We could use the parameter batch_first=True, but our KDL version doesn't support it yet, so we
        # permute to obtain size (sequence length, batch_size, dim_model),
        src = src.permute(1,0,2)
        tgt = tgt.permute(1,0,2)

        # Transformer blocks - Out size = (sequence length, batch_size, num_tokens)
        transformer_out = self.transformer(src, 
                                           tgt, 
                                           tgt_mask=tgt_mask, 
                                           src_key_padding_mask=src_pad_mask, 
                                           tgt_key_padding_mask=tgt_pad_mask)
        out = self.out(transformer_out)
        
        return out
      
    def get_tgt_mask(self, size) -> torch.tensor:
        # Generates a squeare matrix where the each row allows one word more to be seen
        mask = torch.tril(torch.ones(size, size) == 1) # Lower triangular matrix
        mask = mask.float()
        mask = mask.masked_fill(mask == 0, float('-inf')) # Convert zeros to -inf
        mask = mask.masked_fill(mask == 1, float(0.0)) # Convert ones to 0
        
        # EX for size=5:
        # [[0., -inf, -inf, -inf, -inf],
        #  [0.,   0., -inf, -inf, -inf],
        #  [0.,   0.,   0., -inf, -inf],
        #  [0.,   0.,   0.,   0., -inf],
        #  [0.,   0.,   0.,   0.,   0.]]
        
        return mask
    
    def create_pad_mask(self, matrix: torch.tensor, pad_token: int) -> torch.tensor:
        # If matrix = [1,2,3,0,0,0] where pad_token=0, the result mask is
        # [False, False, False, True, True, True]
        return (matrix == pad_token)

In [119]:
# def generate_random_data(n):
#     SOS_token = np.array([2])
#     EOS_token = np.array([3])
#     length = 8

#     data = []

#     # 1,1,1,1,1,1 -> 1,1,1,1,1
#     for i in range(n // 3):
#         X = np.concatenate((SOS_token, np.ones(length), EOS_token))
#         y = np.concatenate((SOS_token, np.ones(length), EOS_token))
#         data.append([X, y])

#     # 0,0,0,0 -> 0,0,0,0
#     for i in range(n // 3):
#         X = np.concatenate((SOS_token, np.zeros(length), EOS_token))
#         y = np.concatenate((SOS_token, np.zeros(length), EOS_token))
#         data.append([X, y])

#     # 1,0,1,0 -> 1,0,1,0,1
#     for i in range(n // 3):
#         X = np.zeros(length)
#         start = random.randint(0, 1)

#         X[start::2] = 1

#         y = np.zeros(length)
#         if X[-1] == 0:
#             y[::2] = 1
#         else:
#             y[1::2] = 1

#         X = np.concatenate((SOS_token, X, EOS_token))
#         y = np.concatenate((SOS_token, y, EOS_token))

#         data.append([X, y])

#     np.random.shuffle(data)

#     print(len(data))
#     print(data[123])
#     return data


# def batchify_data(data, batch_size=16, padding=False, padding_token=-1):
#     batches = []
#     for idx in range(0, len(data), batch_size):
#         # We make sure we dont get the last bit if its not batch_size size
#         if idx + batch_size < len(data):
#             # Here you would need to get the max length of the batch,
#             # and normalize the length with the PAD token.
#             if padding:
#                 max_batch_length = 0

#                 # Get longest sentence in batch
#                 for seq in data[idx : idx + batch_size]:
#                     if len(seq) > max_batch_length:
#                         max_batch_length = len(seq)

#                 # Append X padding tokens until it reaches the max length
#                 for seq_idx in range(batch_size):
#                     remaining_length = max_bath_length - len(data[idx + seq_idx])
#                     data[idx + seq_idx] += [padding_token] * remaining_length

#             batches.append(np.array(data[idx : idx + batch_size]).astype(np.int64))

#     print(f"{len(batches)} batches of size {batch_size}")

#     return batches

In [120]:
# train_data = generate_random_data(9000)
# val_data = generate_random_data(3000)

# train_dataloader = batchify_data(train_data)
# val_dataloader = batchify_data(val_data)

In [121]:
print(train_dataloader[0][:, 0])
print(train_dataloader[0][:, 0][:,:-1])

[[-2.69617821e-02 -2.11041870e-01 -3.60471588e-01 -4.60091401e-01
  -5.16398251e-01 -5.56462741e-01 -5.59711213e-01 -5.08818483e-01
  -3.70217005e-01 -1.64480436e-01 -3.02102543e-02 -1.39678936e-02
  -3.02102543e-02]
 [-6.11686767e-01 -4.97990242e-01 -2.97667794e-01 -2.31615527e-01
  -2.01296454e-01 -1.34161363e-01 -3.77900226e-02  1.09370595e-02
   9.75629832e-02  1.82023259e-01  2.76228951e-01  3.46612514e-01
   4.22410197e-01]
 [-5.32640612e-01 -4.49263160e-01 -2.97667794e-01 -2.11041870e-01
  -1.35244187e-01 -9.30140490e-02 -9.19312249e-02 -4.10384947e-02
   1.52683557e-02 -9.62625211e-02 -6.59434478e-02  1.31027076e-02
   6.72439099e-02]
 [-2.86839553e-01 -2.80342609e-01 -2.26201407e-01 -1.42823955e-01
  -6.59434478e-02  6.50782618e-02  2.38330109e-01  4.28907141e-01
   6.32478062e-01  8.90190185e-01  1.18471833e+00  1.46950105e+00
   1.73262729e+00]
 [ 1.80940435e-01  1.65780898e-01  1.30047705e-01  7.04923820e-02
  -6.38812527e-03 -1.14670530e-01 -2.29449879e-01 -3.12827330e-01


In [122]:
device = "cuda" if torch.cuda.is_available() else "cpu"
model = Transformer(
    num_tokens=4, dim_model=8, num_heads=2, num_encoder_layers=3, num_decoder_layers=3, dropout_p=0.1
).to(device)
opt = torch.optim.SGD(model.parameters(), lr=0.01)
loss_fn = nn.CrossEntropyLoss()

In [123]:
def train_loop(model, opt, loss_fn, dataloader):
    """
    Method from "A detailed guide to Pytorch's nn.Transformer() module.", by
    Daniel Melchor: 
    https://medium.com/@danielmelchor/a-detailed-guide-to-pytorchs-nn-transformer-module-c80afbc9ffb1
    """
    
    model.train()
    total_loss = 0
    
    for batch in dataloader:
        X, y = batch[:, 0], batch[:, 1]
        X, y = torch.tensor(X).to(device), torch.tensor(y).to(device)

        # Now we shift the tgt by one so with the <SOS> we predict the token at pos 1
        y_input = y[:,:-1]
        y_expected = y[:,1:]
        
        # Get mask to mask out the next words
        sequence_length = y_input.size(1)
        tgt_mask = model.get_tgt_mask(sequence_length).to(device)

        # Standard training except we pass in y_input and tgt_mask
        pred = model(X, y_input, tgt_mask)

        # Permute pred to have batch size first again
        pred = pred.permute(1, 2, 0)      
        loss = loss_fn(pred, y_expected)

        opt.zero_grad()
        loss.backward()
        opt.step()
    
        total_loss += loss.detach().item()
        
    return total_loss / len(dataloader)

In [124]:
def validation_loop(model, loss_fn, dataloader):
    """
    Method from "A detailed guide to Pytorch's nn.Transformer() module.", by
    Daniel Melchor: 
    https://medium.com/@danielmelchor/a-detailed-guide-to-pytorchs-nn-transformer-module-c80afbc9ffb1
    """
    
    model.eval()
    total_loss = 0
    
    with torch.no_grad():
        for batch in dataloader:
            X, y = batch[:, 0], batch[:, 1]
            X, y = torch.tensor(X, dtype=torch.long, device=device), torch.tensor(y, dtype=torch.long, device=device)

            # Now we shift the tgt by one so with the <SOS> we predict the token at pos 1
            y_input = y[:,:-1]
            y_expected = y[:,1:]
            
            # Get mask to mask out the next words
            sequence_length = y_input.size(1)
            tgt_mask = model.get_tgt_mask(sequence_length).to(device)

            # Standard training except we pass in y_input and src_mask
            pred = model(X, y_input, tgt_mask)

            # Permute pred to have batch size first again
            pred = pred.permute(1, 2, 0)      
            loss = loss_fn(pred, y_expected)
            total_loss += loss.detach().item()
        
    return total_loss / len(dataloader)

In [125]:
def fit(model, opt, loss_fn, train_dataloader, val_dataloader, epochs):
# def fit(model, opt, loss_fn, train_dataloader, epochs):
    """
    Method from "A detailed guide to Pytorch's nn.Transformer() module.", by
    Daniel Melchor: 
    https://medium.com/@danielmelchor/a-detailed-guide-to-pytorchs-nn-transformer-module-c80afbc9ffb1
    """
    
    # Used for plotting later on
    train_loss_list, validation_loss_list = [], []
    
    print("Training and validating model")
    for epoch in range(epochs):
        print("-"*25, f"Epoch {epoch + 1}","-"*25)
        
        train_loss = train_loop(model, opt, loss_fn, train_dataloader)
        train_loss_list += [train_loss]
        
        validation_loss = validation_loop(model, loss_fn, val_dataloader)
        validation_loss_list += [validation_loss]
        
        print(f"Training loss: {train_loss:.4f}")
        print(f"Validation loss: {validation_loss:.4f}")
        print()
        
    return train_loss_list, validation_loss_list
    
# train_loss_list = fit(model, opt, loss_fn, train_loader, 10)
train_loss_list, validation_loss_list = fit(model, opt, loss_fn, train_dataloader, val_dataloader, 10)

Training and validating model
------------------------- Epoch 1 -------------------------


RuntimeError: The size of tensor a (13) must match the size of tensor b (8) at non-singleton dimension 2