In [29]:
# https://github.com/bentrevett/pytorch-seq2seq/blob/master/1%20-%20Sequence%20to%20Sequence%20Learning%20with%20Neural%20Networks.ipynb

import torch
import numpy as np
import pandas as pd
from faker import Faker
import random
import babel
from babel.dates import format_date
import tqdm
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence, pad_sequence
from sklearn import model_selection
import pytorch_lightning as pl
from torch.nn import functional as F
from torch.nn.functional import cross_entropy
from torch.nn import CrossEntropyLoss
from torchmetrics.functional import accuracy
from torchvision import transforms
from pytorch_lightning.callbacks import ModelCheckpoint

In [30]:
fake = Faker()
Faker.seed(12345)
random.seed(12345)

# Define format of the data we would like to generate
FORMATS = ['short',
           'medium',
           'long',
           'full',
           'full',
           'full',
           'full',
           'full',
           'full',
           'full',
           'full',
           'full',
           'full',
           'd MMM YYY', 
           'd MMMM YYY',
           'dd MMM YYY',
           'd MMM, YYY',
           'd MMMM, YYY',
           'dd, MMM YYY',
           'd MM YY',
           'd MMMM YYY',
           'MMMM d YYY',
           'MMMM d, YYY',
           'dd.MM.YY']

# change this if you want it to work with another language
LOCALES = ['en_US']

In [31]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

class Config:
    RANDOM_STATE_SEED = 42
    Tx_max = 30
    Ty_max = 11
    BATCH_SIZE = 128
    NUM_WORKERS = 8
    NUM_EPOCHS = 25
    PRECISION = 16
    NUM_FOLDS = 5
    FAST_DEV_RUN = False
    DEVICE = device

In [32]:
def generate_date_data():
    dt = fake.date_object()
    human_readable_dt = None
    machine_readable_dt = None
    try:
        human_readable_dt = format_date(dt, random.choice(FORMATS), "en_US")
        human_readable_dt = human_readable_dt.replace(",", "")
        machine_readable_dt = dt.isoformat()
    except AttributeError as e:
        return None, None, None
    return human_readable_dt, machine_readable_dt, dt

In [33]:
def load_date_dataset(num_examples=100):    
    dataset = []
    for row in range(num_examples):
        h_dt, m_dt, dt = generate_date_data()        
        dataset.append([h_dt, m_dt])    
    return np.array(dataset)

In [34]:
# the vocab for both source and target sequences needs to be generated from training data only
# to prevent data leakage into the validation sets leading to inflated model accuracy in validation phase
def get_source_target_vocab(human_dates, machine_dates):
    human_dt_vocab = set()
    machine_dt_vocab = set()
    for (h_dt, m_dt) in zip(human_dates, machine_dates):
        human_dt_vocab.update(tuple(h_dt))
        machine_dt_vocab.update(tuple(m_dt))
    human_dt_vocab = {value: index for index, value in enumerate(sorted(human_dt_vocab) + ['<unk>', '<pad>', '<sos>', '<eos>'])}
    machine_dt_vocab = {value: index for index, value in enumerate(sorted(machine_dt_vocab) + ['<sos>'])}
    inv_machine_dt_vocab = dict(enumerate(sorted(machine_dt_vocab)))       
    return human_dt_vocab, machine_dt_vocab, inv_machine_dt_vocab 

In [35]:
def stoi(str, length, vocab, add_sos_token=False):
    """
    Converts all strings in the vocabulary into a list of integers representing the positions of the
    input string's characters in the "vocab"
    
    Arguments:
    string -- input string, e.g. 'Wed 10 Jul 2007'
    length -- the number of time steps you'd like, determines if the output will be padded or cut
    vocab -- vocabulary, dictionary used to index every character of your "string"
    
    Returns:
    rep -- list of integers (or '<unk>') (size = length) representing the position of the string's character in the vocabulary
    """
    str = str.lower()
    str = str.replace(",", "")
    if len(str) > length:
        str = str[:length]
    unk_index = vocab.get("<unk>")            
    char_indexes = [vocab.get(char, unk_index) for char in str]
    if add_sos_token:
        sos_index = vocab.get("<sos>")        
        # We add index corresponding to <sos> token to the start of target date sequence
        char_indexes.insert(0, sos_index)
    return np.array(char_indexes)

In [36]:
# split the training dataframe into kfolds for cross validation. We do this before any processing is done
# on the data. We use stratified kfold if the target distribution is unbalanced
def strat_kfold_dataframe(df, target_col_name, num_folds=5):
    # we create a new column called kfold and fill it with -1
    df["kfold"] = -1
    # randomize of shuffle the rows of dataframe before splitting is done
    df = df.sample(frac=1, random_state=Config.RANDOM_STATE_SEED).reset_index(drop=True)
    # get the target data
    y = df[target_col_name].values
    skf = model_selection.StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=Config.RANDOM_STATE_SEED)
    for fold, (train_index, val_index) in enumerate(skf.split(X=df, y=y)):
        df.loc[val_index, "kfold"] = fold
    return df        

In [37]:
dataset = load_date_dataset(10000)
# Let us create a dates dataframe that will contain training data of human readable and machine
# readable dates
df_dates = pd.DataFrame({"h_dt": dataset[:, 0], "m_dt": dataset[:, 1]})
df_dates = strat_kfold_dataframe(df_dates, target_col_name="m_dt", num_folds=5)  
df_dates.head()



Unnamed: 0,h_dt,m_dt,kfold
0,February 3 1993,1993-02-03,0
1,Friday September 22 1995,1995-09-22,2
2,Saturday May 29 1999,1999-05-29,3
3,Thursday October 16 2008,2008-10-16,1
4,6/15/02,2002-06-15,2


In [38]:
# h_vocab, m_vocab, inv_m_vocab = get_source_target_vocab(df_dates.h_dt, df_dates.m_dt)
# h_vocab

In [39]:
# dt = stoi(df_dates.h_dt[0], 30, h_vocab)
# torch.from_numpy(dt)

In [40]:
# # Converts a vectorized date ( which is a tensor of ints where each int is the 
# # position of the corresponding char in the relevant vocab ) to one hot encoded form
# class OneHotTransform(object):
#     def __init__(self, vocab, max_seq_length):
#         self.vocab = vocab
#         self.max_seq_length = max_seq_length

#     def __call__(self, X):
#         pad_index = self.vocab["<pad>"]
#         pad_sequence()
#         return F.one_hot(X.T.long(), len(self.vocab))

In [41]:
# converts a string (sequence of chars) to a tensor of ints where each int is the 
# position of the corresponding char in the relevant vocab
class StoITensorTransform(object):
    def __init__(self, vocab, max_seq_length, add_sos_token=False):
        self.vocab = vocab
        self.max_seq_length = max_seq_length
        self.add_sos_token = add_sos_token

    def __call__(self, X):
        vectorized_str = stoi(X, self.max_seq_length, self.vocab, self.add_sos_token)
        #print(vectorized_str)
        return torch.from_numpy(vectorized_str)

In [42]:
# batch is the return value of __getitem__ method of the dataset being used. For DateDataset it is h_dt, m_dt
def pad_collate(batch):
    # we want to pad the h_dt sequences as these can be of variable length.
    # h_dt is of shape len(h_dt)
    sorted_batch = sorted(batch, key=lambda x:x[0].shape[0], reverse=True)
    h_dt_sorted = [x[0] for x in sorted_batch]
    h_dt_padded = pad_sequence(h_dt_sorted, batch_first = True, padding_value=0)
    # the original length of the padded h_dt sequences
    h_dt_len = torch.Tensor([len(x) for x in h_dt_sorted])
    # unpadded m_dt sequences    
    m_dt = torch.stack([x[1] for x in sorted_batch])        
    return h_dt_padded, h_dt_len, m_dt

In [43]:
class DateDataset(Dataset):
    def __init__(self, human_fmt_dates, machine_fmt_dates, transform, target_transform):
        super().__init__()
        self.h_dts = human_fmt_dates
        self.m_dts = machine_fmt_dates
        self.transform = transform
        self.target_transform = target_transform        

    # Returns vectorized form of human format date and its corresponding machine format date
    # with elements of the vectorized date being the index of the characters in the corresponding date vocab
    def __getitem__(self, index):
        h_dt = self.h_dts[index]
        m_dt = self.m_dts[index]
        if self.transform:
            h_dt = self.transform(h_dt)
        if self.target_transform:
            m_dt = self.target_transform(m_dt)
        return h_dt, m_dt

    def __len__(self):                
        return len(self.h_dts)

In [44]:
# Get the train and validation data loaders for a specific fold. 
# X: numpy array of input features
# y: numpy array of target labels
# fold: fold index for which to create data loaders                                     
# kfolds: Array that marks each of the data items as belonging to a specific fold
def get_fold_dls(fold, df):
    fold += 1                         
    train_df = df[df.kfold != fold].reset_index(drop=True)
    val_df = df[df.kfold == fold].reset_index(drop=True)
    h_dt_max_len = train_df.h_dt.apply(lambda x: len(x)).max()
    h_vocab, m_vocab, inv_m_vocab = get_source_target_vocab(train_df.h_dt, train_df.m_dt)    
    # transform to convert human_date and machine_date to one hot encoded forms
    transform = StoITensorTransform(h_vocab, h_dt_max_len)
    target_transform = StoITensorTransform(m_vocab, len(m_vocab), add_sos_token=True)
    ds_train = DateDataset(train_df.h_dt, train_df.m_dt, transform=transform, target_transform=target_transform)
    ds_val = DateDataset(val_df.h_dt, val_df.m_dt, transform=transform, target_transform=target_transform)
    dl_train = DataLoader(ds_train, batch_size=Config.BATCH_SIZE, shuffle=True, 
                        num_workers=Config.NUM_WORKERS, collate_fn=pad_collate)
    dl_val = DataLoader(ds_val, batch_size=Config.BATCH_SIZE, num_workers=Config.NUM_WORKERS, 
                        collate_fn=pad_collate)
    return dl_train, dl_val, ds_train, ds_val, h_vocab, m_vocab

In [45]:
dl_train, dl_val, ds_train, ds_val, h_vocab, m_vocab = get_fold_dls(0, df_dates)

In [46]:
ds_train[2][1]

tensor([11,  2, 10, 10, 10,  0,  1,  6,  0,  3, 10])

In [47]:
# test_tensor = ds_train[2][1]
# test_tensor = test_tensor.reshape(len(test_tensor), 1)
# print(test_tensor.shape)
# m_vocab_size = len(m_vocab)
# zeros_tensor = torch.zeros(len(test_tensor), m_vocab_size)
# print(zeros_tensor.shape)
# src_tensor = torch.ones((len(test_tensor), 1))
# print(src_tensor.shape)
# m_dt_oh = zeros_tensor.scatter_(1, test_tensor, 1)
# print(test_tensor)
# print(m_dt_oh)

In [48]:
# https://discuss.pytorch.org/t/what-does-the-scatter-function-do-in-layman-terms/28037/3
def one_hot_encode(input, vocab_size):        
    #print(f"input.shape = {input.shape}, vocab_size = {vocab_size}")
    batch_size = input.shape[0]
    seq_length = input.shape[1]
    input = input.reshape(batch_size, seq_length, 1)    
    zeros_tensor = torch.zeros(batch_size, seq_length, vocab_size).to(Config.DEVICE)    
    return zeros_tensor.scatter_(2, input, 1)

In [49]:
# dl_train_iter = iter(dl_train)
# h_dt, h_dt_len, m_dt = next(dl_train_iter)
# test = one_hot_encode(m_dt, len(m_vocab))
# test.shape


In [50]:
class lstm_encoder(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, drop_out, is_bidirect=False):
        super().__init__()
        # input to lstm is a sequence (of words, of chars, of anything else). The dimensions being 
        # (batch_size, sequence_length, input_size) if batch_first = True with sequence_length = length of longest sequence in the batch
        # where input_size = number of features(cols) in input X. If you use embedding layer, then each word in the
        # the sequence is represented by an embedding vector, so input_size = size of the embedding vector. If one
        # hot encoding representation is used then input_size = vocab_size with each word represented by a one hot
        # vector with size = vocab_size        
        self.input_size = input_size
        # hidden_size = number of units in the hidden layer
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.is_bidirect = is_bidirect
        self.num_directions = 2 if is_bidirect else 1
        self.lstm_layer = nn.LSTM(
            input_size = input_size, 
            hidden_size = hidden_size,
            num_layers = num_layers,
            batch_first = True,
            dropout = drop_out,
            bidirectional = is_bidirect
            )                

    def forward(self, inputs, input_lengths):       
        # inputs = [batch_size, max_seq_length] 
        # we are going to use one hot encoding representation of the human dates data. The input data is
        # vectorized form of human format date with elements of the vectorized date being the index of the 
        # characters in the corresponding date vocab (input_size = vocab_size)
        # inputs_oh = F.one_hot(inputs.T.float(), self.input_size)
        #print(f"inputs.shape = {inputs.shape}")
        inputs_oh = one_hot_encode(inputs, self.input_size)
        #print(f"inputs_oh.shape = {inputs_oh.shape}")
        # inputs_oh = [batch_size, max_seq_length, vocab_size]
        # pack_padded_sequence before feeding into LSTM. This is required so pytorch knows
        # which elements of the sequence are padded ones and ignore them in computation.
        packed_padded_inputs = pack_padded_sequence(inputs_oh, input_lengths.to("cpu"), batch_first=True)
        lstm_out_pack, (h_final, c_final) = self.lstm_layer(packed_padded_inputs)
        # h_final and c_final = [num_direction * num_layers, batch_size, hidden_size]
        # unpack the output
        # lstm_out, lstm_out_len = pad_packed_sequence(lstm_out_pack, batch_first=True)
        # lstm_out = [batch_size , seq_length , num_directions * hidden_size]
        return h_final, c_final

    # `nn.LSTM` takes a tuple of hidden states (h0, c0). h0 = initial
    # hidden state for each element in the batch, c0 = initial cell state for each element in the batch
    def init_state(self, batch_size):
        return (
            torch.zeros((self.num_directions * self.num_layers, batch_size, self.hidden_size)),
            torch.zeros((self.num_directions * self.num_layers, batch_size, self.hidden_size))
        )

In [51]:
class lstm_decoder(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, drop_out):
        super().__init__()        
        self.input_size = input_size        
        self.hidden_size = hidden_size
        self.num_layers = num_layers                
        self.lstm_layer = nn.LSTM(
            input_size = input_size, 
            hidden_size = hidden_size,
            num_layers = num_layers,
            batch_first = True,
            dropout = drop_out,
            bidirectional = False
            )   
        self.linear = nn.Linear(hidden_size, input_size)                        

    def forward(self, input_oh, encoder_hidden, encoder_cell):
        # input_oh = [batch_size, target_vocab_size]
        # The input sequence length in decoder is always 1 as we feed in one character at a time
        input_oh = input_oh.unsqueeze(1).to(Config.DEVICE)
        # input_oh = [batch_size, 1, target_vocab_size]        
        # inputs_oh = F.one_hot(inputs.T.long(), self.input_size)        
        #print(f"decoder inputs_oh.shape = {inputs_oh.shape}")
        lstm_out, (h_final, c_final) = self.lstm_layer(input_oh, (encoder_hidden, encoder_cell))
        #print(f"decoder lstm_out.shape = {lstm_out.shape}")
        # lstm_out = [batch_size , seq_length , num_directions * hidden_size]
        # h_final and c_final = [num_direction * num_layers, batch_size, hidden_size]
        # seq_length and num_direction will always be 1 for decoder. Thus
        # lstm_out = [batch_size, 1, hidden_size]
        # h_final and c_final = [num_layers, batch_size, hidden_size]
        pred = self.linear(lstm_out.squeeze(1))
        #print(f"decoder pred.shape = {pred.shape}")
        # pred = [batch_size, output_dim] where output_dim = vocab_size of target sequences ( machine dates in our case)
        # For multi class classification the number of output nodes is equal to the number of classes to predict (vocab size)
        return pred
        

Input sequence is of max 30 characters (Tx = 30), a date string like "January 3, 1984". The length of each input sequence may vary.
Output sequence is of fixed length (Ty=11), a date string like "1984-01-03" (10 for digits and one for - character).
Vectorized input data is a matrix of dimension batch_size * sequence_length(Tx) * human_date_vocab_size.
Take the example of input_date = "January 3, 1984". Consider human_date_vocab has 44 characters with each character being mapped to an index. Thus one hot encoded form of an input date character would be vector of 44 zeros with a 1 at the index corresponding to that character.
Now our input date is going to be padded to a sequence length of Tx=30. And each of these 30 characters is a vector of length human_date_vocab_size. Similarly output data is matrix of dimension batch_size * sequence_length(Ty) * machine_date_vocab_size.
An output_date like "1984-01-03" in its one hot encoded for would be a matrix of dimension sequence_length(Ty=11) * machine_date_vocab_size.
This holds true for both predicted output dates as well as actual output dates. <br/>

Prediction is done for each character in the output date. Thus for a single data row, the model outputs 11 character sequence as output. Each character prediction is a multi class classification problem with loss function (for pytorch) being cross entropy. The sum of losses for each of the 11 character predictions can be considered the loss for a single data row.

In [52]:
class EncoderDecoderLitModel(pl.LightningModule):
    def __init__(self, hparams, source_vocab_size, target_vocab_size):
        super().__init__()
        self.save_hyperparameters()
        self.lr = hparams["lr"]
        self.weight_decay = hparams["weight_decay"]
        # target_vocab_size = vocab_size for target sequence data (machine_date) as we are using one hot encoding
        self.target_vocab_size = target_vocab_size
        self.encoder = lstm_encoder(
            input_size = source_vocab_size,
            hidden_size = hparams["hidden_size"],
            num_layers = hparams["num_layers"],
            drop_out = hparams["enc_drop_out"],
            is_bidirect = hparams["enc_is_bidirect"]
            )
        self.decoder = lstm_decoder(
            input_size = target_vocab_size,
            hidden_size = hparams["hidden_size"],
            num_layers = hparams["num_layers"],
            drop_out = hparams["dec_drop_out"]
        )            

    def forward(self, src_seq, src_seq_lengths, target_seq_oh, teacher_forcing_ratio=0.5):
        #print(f"enc_inputs.shape = {enc_inputs.shape}")
        #print(f"dec_inputs.shape = {dec_inputs.shape}")
        # enc_inputs = [batch_size, max_source_seq_length]
        # target_seq = [batch_size, target_seq_length]
        batch_size = target_seq_oh.shape[0]        
        # target sequence length is 11 as it includes the <sos> token at the begining
        target_seq_length = target_seq_oh.shape[1]            
        # tensor to store decoder output
        dec_outputs = torch.zeros((batch_size, target_seq_length, self.target_vocab_size))        
        #print(f"dec_outputs.shape = {dec_outputs.shape}")
        # first input to the decoder is the <sos> token which is the first character in all target sequences
        input = target_seq_oh[:, 0, :].reshape(batch_size, -1)
        # input = [batch_size, target_vocab_size]
        #print(f"decoder input.shape = {input.shape}")
        # last hidden and cell state of the encoder is used as initial hidden and cell state of the decoder
        enc_hidden, enc_cell = self.encoder(src_seq, src_seq_lengths)
        for t in range(1, target_seq_length):            
            dec_output = self.decoder(input, enc_hidden, enc_cell)
            # dec_output = [batch_size, target_vocab_size]            
            dec_outputs[:, t, :] = dec_output
            # select the max value out of dec_input_size values for each row
            # pred_t = dec_output.argmax(1)
            #print(f"decoder pred_t.shape = {pred_t.shape}")
            # whether to use teacher forcing
            teacher_forcing = False if np.random.random() < teacher_forcing_ratio else True            
            # if teacher forcing use actual token at t as the input to t+1, otherwise use the prediction at t
            # as the input to t+1
            actual_t = target_seq_oh[:, t, :]
            #print(f"decoder actual_t.shape = {actual_t.shape}")
            input = actual_t if teacher_forcing else dec_outputs[:, t, :]
            input = input.reshape(batch_size, -1)
        return dec_outputs            

    def configure_optimizers(self):
        model_optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, self.parameters()), lr=self.lr, weight_decay=self.weight_decay)
        lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(model_optimizer, mode="min")
        return {
            "optimizer": model_optimizer,
            "lr_scheduler": {
                "scheduler": lr_scheduler,
                "monitor": "val_loss",
                "frequency": 1
            }
        }

    def training_step(self, batch, batch_idx):
        # data loader batch doesn't perform one hot encoding of either source or target sequences
        src_padded_seq, src_seq_lengths, target_seq = batch
        # target_seq = [batch_size, target_seq_length]
        # src_padded_seq = [batch_size, max_src_seq_length]
        target_seq_oh = one_hot_encode(target_seq, self.target_vocab_size)
        # target_seq_oh = [batch_size, target_seq_length, target_vocab_size]
        pred_target_seq = self(src_padded_seq, src_seq_lengths, target_seq_oh)
        # pred_target_seq = [batch_size, target_seq_length, target_vocab_size]
        # we will exclude the first character from both the predicted and actual target dates. The first character
        # in target_dates in <sos> token while the first value in pred_target_dates is 0.         
        #print(f"target_seq.shape = {target_seq.shape}")
        #print(f"pred_target_seq.shape = {pred_target_seq.shape}")
        target_seq = target_seq[:, 1:]
        #target_seq = target_seq.to(torch.float32)
        # flatten the target_seq to 1d 
        target_seq = target_seq.reshape(-1)
        pred_target_seq = pred_target_seq[:, 1:, :].to(Config.DEVICE)
        # flatten the predicted target seq to 2d
        pred_target_seq = pred_target_seq.view(-1, self.target_vocab_size)
        #print(f"target_seq.shape = {target_seq.shape}")
        #print(f"pred_target_seq.shape = {pred_target_seq.shape}")
        #print(f"target sequence row 0:")
        #print(target_seq[0])
        #print(f"pred target sequence row 0:")
        #print(pred_target_seq[0])
        loss = cross_entropy(pred_target_seq, target_seq)
        #acc = accuracy(pred_target_seq, target_seq)
        self.log("train_loss", loss, prog_bar=True, logger=True, on_step=True, on_epoch=True)
        #self.log("train_acc", acc, prog_bar=True, logger=True, on_step=True, on_epoch=True)
        return loss

    def validation_step(self, batch, batch_idx):        
        src_padded_seq, src_seq_lengths, target_seq = batch
        target_seq_oh = one_hot_encode(target_seq, self.target_vocab_size)
        # Remember to turn teacher forcing off for validation
        pred_target_seq = self(src_padded_seq, src_seq_lengths, target_seq_oh, teacher_forcing_ratio=0)
        target_seq = target_seq[:, 1:].reshape(-1)
        pred_target_seq = pred_target_seq[:, 1:, :].to(Config.DEVICE)
        # flatten the predicted target seq to 2d
        pred_target_seq = pred_target_seq.view(-1, self.target_vocab_size)
        val_loss = cross_entropy(pred_target_seq, target_seq)
        #val_acc = accuracy(pred_target_seq, target_seq)
        self.log("val_loss", val_loss, prog_bar=True, logger=True, on_step=True, on_epoch=True)
        #self.log("val_acc", val_acc, prog_bar=True, logger=True, on_step=True, on_epoch=True)
        return val_loss

In [53]:
# For results reproducibility 
# sets seeds for numpy, torch, python.random and PYTHONHASHSEED.
pl.seed_everything(42, workers=True)
# model hyperparameters
model_params = {    
    "num_layers": 2,
    "enc_is_bidirect": False,
    "hidden_size": 100,
    "enc_drop_out": 0.25,
    "dec_drop_out": 0.25,
    "lr": 0.001,
    "weight_decay": 0.001
    }

Global seed set to 42


In [54]:
from pytorch_lightning.callbacks import Callback
from pytorch_lightning import LightningModule, Trainer

class MetricsAggCallback(Callback):
    def __init__(self, metric_to_monitor, mode):
        self.metric_to_monitor = metric_to_monitor
        self.metrics = []
        self.best_metric = None
        self.mode = mode
        self.best_metric_epoch = None

    def on_epoch_end(self, trainer: Trainer, pl_module: LightningModule):
        metric_value = trainer.callback_metrics[self.metric_to_monitor].cpu().detach().item()
        print(f"metric {self.metric_to_monitor} = {metric_value}")
        self.metrics.append(metric_value)
        if self.mode == "max":
            self.best_metric = max(self.metrics)            
        elif self.mode == "min":            
            self.best_metric = min(self.metrics)           
        self.best_metric_epoch = self.metrics.index(self.best_metric)    

In [55]:
def run_training(fold, fold_loss, dl_train, dl_val, h_vocab, m_vocab, find_lr=True):
    fold_str = f"fold{fold}"
    print(f"Running training for {fold_str}")
    seq2seq_model = EncoderDecoderLitModel(
        hparams = model_params, 
        source_vocab_size = len(h_vocab),
        target_vocab_size = len(m_vocab)
        )
    tb_logger = pl.loggers.TensorBoardLogger(save_dir="logs")    
    chkpt_file_name = "best_model_{epoch}_{val_loss:.4f}"
    loss_chkpt_callback = ModelCheckpoint(dirpath="./model", verbose=True, monitor="val_loss", mode="min", filename=chkpt_file_name)    
    chkpt_callback = MetricsAggCallback(metric_to_monitor="val_loss", mode="min")
    trainer = pl.Trainer(
        gpus = 1,
        deterministic = True,
        auto_select_gpus = True,
        progress_bar_refresh_rate = 20,
        max_epochs = Config.NUM_EPOCHS,
        logger = tb_logger,
        auto_lr_find = True,    
        precision = Config.PRECISION,   
        fast_dev_run = Config.FAST_DEV_RUN, 
        callbacks = [loss_chkpt_callback, chkpt_callback]
    )        
    if find_lr:
        trainer.tune(model=seq2seq_model, train_dataloaders=dl_train)
        print(seq2seq_model.lr)
    trainer.fit(seq2seq_model, train_dataloaders=dl_train, val_dataloaders=dl_val)
    fold_loss.append(loss_chkpt_callback.best_model_score.cpu().detach().item())
    #fold_acc.append(acc_chkpt_callback.best_metric)
    print(f"Loss for {fold_str} = {fold_loss[fold]}")
    del trainer, seq2seq_model, loss_chkpt_callback 


In [56]:
find_lr = True
fold_loss = []

for fold in range(Config.NUM_FOLDS):
    dl_train, dl_val, ds_train, ds_val, h_vocab, m_vocab = get_fold_dls(fold, df_dates)
    run_training(fold, fold_loss, dl_train, dl_val, h_vocab, m_vocab)
    break

Running training for fold0


Using native 16bit precision.
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
  rank_zero_warn(f"you defined a {step_name} but have no {loader_name}. Skipping {stage} loop")
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name    | Type         | Params
-----------------------------------------
0 | encoder | lstm_encoder | 140 K 
1 | decoder | lstm_decoder | 127 K 
-----------------------------------------
267 K     Trainable params
0         Non-trainable params
267 K     Total params
1.070     Total estimated model params size (MB)
Global seed set to 42


Finding best initial lr:   0%|          | 0/100 [00:00<?, ?it/s]

Restoring states from the checkpoint file at /home/bk_anupam/code/ML/DeepLearning/DeepLearning_AI/5-RNNs/week3/lr_find_temp_model.ckpt
Restored all states from the checkpoint file at /home/bk_anupam/code/ML/DeepLearning/DeepLearning_AI/5-RNNs/week3/lr_find_temp_model.ckpt
Learning rate set to 0.005754399373371567
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name    | Type         | Params
-----------------------------------------
0 | encoder | lstm_encoder | 140 K 
1 | decoder | lstm_decoder | 127 K 
-----------------------------------------
267 K     Trainable params
0         Non-trainable params
267 K     Total params
1.070     Total estimated model params size (MB)


0.005754399373371567


Validation sanity check: 0it [00:00, ?it/s]

Global seed set to 42


metric val_loss = 2.492827892303467


Training: 36it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Epoch 0, global step 62: val_loss reached 1.84141 (best 1.84141), saving model to "/home/bk_anupam/code/ML/DeepLearning/DeepLearning_AI/5-RNNs/week3/model/best_model_epoch=0_val_loss=1.8414.ckpt" as top 1


metric val_loss = 1.8414080142974854
metric val_loss = 1.8414080142974854


Validating: 0it [00:00, ?it/s]

Epoch 1, global step 125: val_loss reached 1.62704 (best 1.62704), saving model to "/home/bk_anupam/code/ML/DeepLearning/DeepLearning_AI/5-RNNs/week3/model/best_model_epoch=1_val_loss=1.6270.ckpt" as top 1


metric val_loss = 1.6270416975021362
metric val_loss = 1.6270416975021362


Validating: 0it [00:00, ?it/s]

Epoch 2, global step 188: val_loss reached 1.49631 (best 1.49631), saving model to "/home/bk_anupam/code/ML/DeepLearning/DeepLearning_AI/5-RNNs/week3/model/best_model_epoch=2_val_loss=1.4963.ckpt" as top 1


metric val_loss = 1.4963139295578003
metric val_loss = 1.4963139295578003


Validating: 0it [00:00, ?it/s]

Epoch 3, global step 251: val_loss reached 1.46106 (best 1.46106), saving model to "/home/bk_anupam/code/ML/DeepLearning/DeepLearning_AI/5-RNNs/week3/model/best_model_epoch=3_val_loss=1.4611.ckpt" as top 1


metric val_loss = 1.4610594511032104
metric val_loss = 1.4610594511032104


Validating: 0it [00:00, ?it/s]

Epoch 4, global step 314: val_loss was not in top 1


metric val_loss = 1.474239706993103
metric val_loss = 1.474239706993103


Validating: 0it [00:00, ?it/s]

Epoch 5, global step 377: val_loss reached 1.44059 (best 1.44059), saving model to "/home/bk_anupam/code/ML/DeepLearning/DeepLearning_AI/5-RNNs/week3/model/best_model_epoch=5_val_loss=1.4406.ckpt" as top 1


metric val_loss = 1.4405889511108398
metric val_loss = 1.4405889511108398


Validating: 0it [00:00, ?it/s]

Epoch 6, global step 440: val_loss reached 1.37669 (best 1.37669), saving model to "/home/bk_anupam/code/ML/DeepLearning/DeepLearning_AI/5-RNNs/week3/model/best_model_epoch=6_val_loss=1.3767.ckpt" as top 1


metric val_loss = 1.376690149307251
metric val_loss = 1.376690149307251


Validating: 0it [00:00, ?it/s]

Epoch 7, global step 503: val_loss was not in top 1


metric val_loss = 1.4169812202453613
metric val_loss = 1.4169812202453613


Validating: 0it [00:00, ?it/s]

Epoch 8, global step 566: val_loss was not in top 1


metric val_loss = 1.4245829582214355
metric val_loss = 1.4245829582214355


Validating: 0it [00:00, ?it/s]

Epoch 9, global step 629: val_loss reached 1.33121 (best 1.33121), saving model to "/home/bk_anupam/code/ML/DeepLearning/DeepLearning_AI/5-RNNs/week3/model/best_model_epoch=9_val_loss=1.3312.ckpt" as top 1


metric val_loss = 1.3312138319015503
metric val_loss = 1.3312138319015503


Validating: 0it [00:00, ?it/s]

Epoch 10, global step 692: val_loss reached 1.32001 (best 1.32001), saving model to "/home/bk_anupam/code/ML/DeepLearning/DeepLearning_AI/5-RNNs/week3/model/best_model_epoch=10_val_loss=1.3200.ckpt" as top 1


metric val_loss = 1.3200061321258545
metric val_loss = 1.3200061321258545


Validating: 0it [00:00, ?it/s]

Epoch 11, global step 755: val_loss reached 1.27357 (best 1.27357), saving model to "/home/bk_anupam/code/ML/DeepLearning/DeepLearning_AI/5-RNNs/week3/model/best_model_epoch=11_val_loss=1.2736.ckpt" as top 1


metric val_loss = 1.2735685110092163
metric val_loss = 1.2735685110092163


Validating: 0it [00:00, ?it/s]

Epoch 12, global step 818: val_loss was not in top 1


metric val_loss = 1.2836506366729736
metric val_loss = 1.2836506366729736


Validating: 0it [00:00, ?it/s]

Epoch 13, global step 881: val_loss was not in top 1


metric val_loss = 1.288377046585083
metric val_loss = 1.288377046585083


Validating: 0it [00:00, ?it/s]

Epoch 14, global step 944: val_loss was not in top 1


metric val_loss = 1.3186700344085693
metric val_loss = 1.3186700344085693


Validating: 0it [00:00, ?it/s]

Epoch 15, global step 1007: val_loss reached 1.22411 (best 1.22411), saving model to "/home/bk_anupam/code/ML/DeepLearning/DeepLearning_AI/5-RNNs/week3/model/best_model_epoch=15_val_loss=1.2241.ckpt" as top 1


metric val_loss = 1.224107027053833
metric val_loss = 1.224107027053833


Validating: 0it [00:00, ?it/s]

Epoch 16, global step 1070: val_loss reached 1.22373 (best 1.22373), saving model to "/home/bk_anupam/code/ML/DeepLearning/DeepLearning_AI/5-RNNs/week3/model/best_model_epoch=16_val_loss=1.2237.ckpt" as top 1


metric val_loss = 1.223727822303772
metric val_loss = 1.223727822303772


Validating: 0it [00:00, ?it/s]

Epoch 17, global step 1133: val_loss was not in top 1


metric val_loss = 1.2516676187515259
metric val_loss = 1.2516676187515259


Validating: 0it [00:00, ?it/s]

Epoch 18, global step 1196: val_loss was not in top 1


metric val_loss = 1.2245814800262451
metric val_loss = 1.2245814800262451


Validating: 0it [00:00, ?it/s]

Epoch 19, global step 1259: val_loss reached 1.19415 (best 1.19415), saving model to "/home/bk_anupam/code/ML/DeepLearning/DeepLearning_AI/5-RNNs/week3/model/best_model_epoch=19_val_loss=1.1941.ckpt" as top 1


metric val_loss = 1.1941465139389038
metric val_loss = 1.1941465139389038


Validating: 0it [00:00, ?it/s]

Epoch 20, global step 1322: val_loss reached 1.18230 (best 1.18230), saving model to "/home/bk_anupam/code/ML/DeepLearning/DeepLearning_AI/5-RNNs/week3/model/best_model_epoch=20_val_loss=1.1823.ckpt" as top 1


metric val_loss = 1.1822988986968994
metric val_loss = 1.1822988986968994


Validating: 0it [00:00, ?it/s]

Epoch 21, global step 1385: val_loss was not in top 1


metric val_loss = 1.2507240772247314
metric val_loss = 1.2507240772247314


Validating: 0it [00:00, ?it/s]

Epoch 22, global step 1448: val_loss was not in top 1


metric val_loss = 1.1969553232192993
metric val_loss = 1.1969553232192993


Validating: 0it [00:00, ?it/s]

Epoch 23, global step 1511: val_loss reached 1.15944 (best 1.15944), saving model to "/home/bk_anupam/code/ML/DeepLearning/DeepLearning_AI/5-RNNs/week3/model/best_model_epoch=23_val_loss=1.1594.ckpt" as top 1


metric val_loss = 1.1594374179840088
metric val_loss = 1.1594374179840088


In [None]:
loss = nn.CrossEntropyLoss()
input = torch.randn(3, 5, requires_grad=True)
print(input)
target = torch.empty(3, dtype=torch.long).random_(5)
print(target)
output = loss(input, target)
output.backward()