In [37]:
# https://github.com/bentrevett/pytorch-seq2seq/blob/master/1%20-%20Sequence%20to%20Sequence%20Learning%20with%20Neural%20Networks.ipynb

import torch
import numpy as np
import pandas as pd
from faker import Faker
import random
import babel
from babel.dates import format_date
import tqdm
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence, pad_sequence
from sklearn import model_selection
import pytorch_lightning as pl
from torch.nn import functional as F
from torch.nn.functional import cross_entropy
from torch.nn import CrossEntropyLoss
from torchmetrics.functional import accuracy
from torchvision import transforms
from pytorch_lightning.callbacks import ModelCheckpoint

In [38]:
fake = Faker()
Faker.seed(12345)
random.seed(12345)

# Define format of the data we would like to generate
FORMATS = ['short',
           'medium',
           'long',
           'full',
           'full',
           'full',
           'full',
           'full',
           'full',
           'full',
           'full',
           'full',
           'full',
           'd MMM YYY', 
           'd MMMM YYY',
           'dd MMM YYY',
           'd MMM, YYY',
           'd MMMM, YYY',
           'dd, MMM YYY',
           'd MM YY',
           'd MMMM YYY',
           'MMMM d YYY',
           'MMMM d, YYY',
           'dd.MM.YY']

# change this if you want it to work with another language
LOCALES = ['en_US']

In [39]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

class Config:
    RANDOM_STATE_SEED = 42
    Tx_max = 30
    Ty_max = 11
    BATCH_SIZE = 2048
    NUM_WORKERS = 8
    NUM_EPOCHS = 75
    PRECISION = 16
    NUM_FOLDS = 5
    FAST_DEV_RUN = False
    DEVICE = device

In [40]:
def generate_date_data():
    dt = fake.date_object()
    human_readable_dt = None
    machine_readable_dt = None
    try:
        human_readable_dt = format_date(dt, random.choice(FORMATS), "en_US")
        human_readable_dt = human_readable_dt.replace(",", "")
        machine_readable_dt = dt.isoformat()
    except AttributeError as e:
        return None, None, None
    return human_readable_dt, machine_readable_dt, dt

In [41]:
def load_date_dataset(num_examples=100):    
    dataset = []
    for row in range(num_examples):
        h_dt, m_dt, dt = generate_date_data()        
        dataset.append([h_dt, m_dt])    
    return np.array(dataset)

In [42]:
# the vocab for both source and target sequences needs to be generated from training data only
# to prevent data leakage into the validation sets leading to inflated model accuracy in validation phase
def get_source_target_vocab(human_dates, machine_dates):
    human_dt_vocab = set()
    machine_dt_vocab = set()
    for (h_dt, m_dt) in zip(human_dates, machine_dates):
        human_dt_vocab.update(tuple(h_dt))
        machine_dt_vocab.update(tuple(m_dt))
    human_dt_vocab = {value: index for index, value in enumerate(sorted(human_dt_vocab) + ['<unk>', '<pad>', '<sos>', '<eos>'])}
    machine_dt_vocab = {value: index for index, value in enumerate(sorted(machine_dt_vocab) + ['<sos>'])}
    inv_machine_dt_vocab = dict(enumerate(sorted(machine_dt_vocab)))  
    inv_human_dt_vocab = dict(enumerate(sorted(human_dt_vocab)))       
    return human_dt_vocab, machine_dt_vocab, inv_human_dt_vocab, inv_machine_dt_vocab 

In [43]:
def stoi(str, length, vocab, add_sos_token=False):
    """
    Converts all strings in the vocabulary into a list of integers representing the positions of the
    input string's characters in the "vocab"
    
    Arguments:
    string -- input string, e.g. 'Wed 10 Jul 2007'
    length -- the number of time steps you'd like, determines if the output will be padded or cut
    vocab -- vocabulary, dictionary used to index every character of your "string"
    
    Returns:
    rep -- list of integers (or '<unk>') (size = length) representing the position of the string's character in the vocabulary
    """
    #str = str.lower()
    str = str.replace(",", "")
    if len(str) > length:
        str = str[:length]
    unk_index = vocab.get("<unk>")            
    char_indexes = [vocab.get(char, unk_index) for char in str]
    if add_sos_token:
        sos_index = vocab.get("<sos>")        
        # We add index corresponding to <sos> token to the start of target date sequence
        char_indexes.insert(0, sos_index)
    return np.array(char_indexes)

In [44]:
# split the training dataframe into kfolds for cross validation. We do this before any processing is done
# on the data. We use stratified kfold if the target distribution is unbalanced
def strat_kfold_dataframe(df, target_col_name, num_folds=5):
    # we create a new column called kfold and fill it with -1
    df["kfold"] = -1
    # randomize of shuffle the rows of dataframe before splitting is done
    df = df.sample(frac=1, random_state=Config.RANDOM_STATE_SEED).reset_index(drop=True)
    # get the target data
    y = df[target_col_name].values
    skf = model_selection.StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=Config.RANDOM_STATE_SEED)
    for fold, (train_index, val_index) in enumerate(skf.split(X=df, y=y)):
        df.loc[val_index, "kfold"] = fold
    return df        

In [45]:
dataset = load_date_dataset(80000)
# Let us create a dates dataframe that will contain training data of human readable and machine
# readable dates
df_dates = pd.DataFrame({"h_dt": dataset[:, 0], "m_dt": dataset[:, 1]})
df_dates = strat_kfold_dataframe(df_dates, target_col_name="m_dt", num_folds=5)  
df_dates.head()



Unnamed: 0,h_dt,m_dt,kfold
0,June 7 1970,1970-06-07,0
1,December 23 2006,2006-12-23,0
2,July 17 1982,1982-07-17,4
3,Wednesday June 16 1976,1976-06-16,0
4,Nov 27 2015,2015-11-27,2


In [46]:
# h_vocab, m_vocab, inv_m_vocab = get_source_target_vocab(df_dates.h_dt, df_dates.m_dt)
# h_vocab

In [47]:
# dt = stoi(df_dates.h_dt[0], 30, h_vocab)
# torch.from_numpy(dt)

In [48]:
# # Converts a vectorized date ( which is a tensor of ints where each int is the 
# # position of the corresponding char in the relevant vocab ) to one hot encoded form
# class OneHotTransform(object):
#     def __init__(self, vocab, max_seq_length):
#         self.vocab = vocab
#         self.max_seq_length = max_seq_length

#     def __call__(self, X):
#         pad_index = self.vocab["<pad>"]
#         pad_sequence()
#         return F.one_hot(X.T.long(), len(self.vocab))

In [49]:
# converts a string (sequence of chars) to a tensor of ints where each int is the 
# position of the corresponding char in the relevant vocab
class StoITensorTransform(object):
    def __init__(self, vocab, max_seq_length, add_sos_token=False):
        self.vocab = vocab
        self.max_seq_length = max_seq_length
        self.add_sos_token = add_sos_token

    def __call__(self, X):
        vectorized_str = stoi(X, self.max_seq_length, self.vocab, self.add_sos_token)        
        return torch.from_numpy(vectorized_str)

In [50]:
# batch is the return value of __getitem__ method of the dataset being used. For DateDataset it is h_dt, m_dt
def pad_collate(batch):
    # we want to pad the h_dt sequences as these can be of variable length.
    # h_dt is of shape len(h_dt)
    sorted_batch = sorted(batch, key=lambda x:x[0].shape[0], reverse=True)
    h_dt_sorted = [x[0] for x in sorted_batch]
    h_dt_padded = pad_sequence(h_dt_sorted, batch_first = True, padding_value=0)
    # the original length of the padded h_dt sequences
    h_dt_len = torch.Tensor([len(x) for x in h_dt_sorted])
    # unpadded m_dt sequences    
    m_dt = torch.stack([x[1] for x in sorted_batch])        
    return h_dt_padded, h_dt_len, m_dt

In [51]:
class DateDataset(Dataset):
    def __init__(self, human_fmt_dates, machine_fmt_dates, transform, target_transform):
        super().__init__()
        self.h_dts = human_fmt_dates
        self.m_dts = machine_fmt_dates
        self.transform = transform
        self.target_transform = target_transform        

    # Returns vectorized form of human format date and its corresponding machine format date
    # with elements of the vectorized date being the index of the characters in the corresponding date vocab
    def __getitem__(self, index):
        h_dt = self.h_dts[index]
        m_dt = self.m_dts[index]
        if self.transform:
            h_dt = self.transform(h_dt)
        if self.target_transform:
            m_dt = self.target_transform(m_dt)
        return h_dt, m_dt

    def __len__(self):                
        return len(self.h_dts)

In [52]:
# Get the train and validation data loaders for a specific fold. 
# X: numpy array of input features
# y: numpy array of target labels
# fold: fold index for which to create data loaders                                     
# kfolds: Array that marks each of the data items as belonging to a specific fold
def get_fold_dls(fold, df):
    fold += 1                         
    train_df = df[df.kfold != fold].reset_index(drop=True)
    val_df = df[df.kfold == fold].reset_index(drop=True)
    h_dt_max_len = train_df.h_dt.apply(lambda x: len(x)).max()
    h_vocab, m_vocab, inv_h_vocab, inv_m_vocab = get_source_target_vocab(train_df.h_dt, train_df.m_dt)    
    # transform to convert human_date and machine_date to one hot encoded forms
    transform = StoITensorTransform(h_vocab, h_dt_max_len)
    target_transform = StoITensorTransform(m_vocab, len(m_vocab), add_sos_token=True)
    ds_train = DateDataset(train_df.h_dt, train_df.m_dt, transform=transform, target_transform=target_transform)
    ds_val = DateDataset(val_df.h_dt, val_df.m_dt, transform=transform, target_transform=target_transform)
    dl_train = DataLoader(ds_train, batch_size=Config.BATCH_SIZE, shuffle=True, 
                        num_workers=Config.NUM_WORKERS, collate_fn=pad_collate)
    dl_val = DataLoader(ds_val, batch_size=Config.BATCH_SIZE, num_workers=Config.NUM_WORKERS, 
                        collate_fn=pad_collate)
    return dl_train, dl_val, ds_train, ds_val, h_vocab, m_vocab, inv_h_vocab, inv_m_vocab

In [53]:
dl_train, dl_val, ds_train, ds_val, h_vocab, m_vocab, inv_h_vocab, inv_m_vocab = get_fold_dls(0, df_dates)

In [54]:
# https://discuss.pytorch.org/t/what-does-the-scatter-function-do-in-layman-terms/28037/3
def one_hot_encode(input, vocab_size):        
    #print(f"input.shape = {input.shape}, vocab_size = {vocab_size}")
    batch_size = input.shape[0]
    seq_length = input.shape[1]
    input = input.reshape(batch_size, seq_length, 1).to(Config.DEVICE)    
    zeros_tensor = torch.zeros(batch_size, seq_length, vocab_size).to(Config.DEVICE)    
    return zeros_tensor.scatter_(2, input, 1)

In [55]:
# dl_train_iter = iter(dl_train)
# h_dt, h_dt_len, m_dt = next(dl_train_iter)
# test = one_hot_encode(m_dt, len(m_vocab))
# test.shape

In [56]:
class lstm_encoder(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, drop_out, is_bidirect=False):
        super().__init__()
        # input to lstm is a sequence (of words, of chars, of anything else). The dimensions being 
        # (batch_size, sequence_length, input_size) if batch_first = True with sequence_length = length of longest sequence in the batch
        # where input_size = number of features(cols) in input X. If you use embedding layer, then each word in the
        # the sequence is represented by an embedding vector, so input_size = size of the embedding vector. If one
        # hot encoding representation is used then input_size = vocab_size with each word represented by a one hot
        # vector with size = vocab_size        
        self.input_size = input_size
        # hidden_size = number of units in the hidden layer
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.is_bidirect = is_bidirect
        self.num_directions = 2 if is_bidirect else 1
        self.lstm_layer = nn.LSTM(
            input_size = input_size, 
            hidden_size = hidden_size,
            num_layers = num_layers,
            batch_first = True,
            dropout = drop_out,
            bidirectional = is_bidirect
            )                

    def forward(self, inputs, input_lengths):       
        # inputs = [batch_size, max_seq_length] 
        # we are going to use one hot encoding representation of the human dates data. The input data is
        # vectorized form of human format date with elements of the vectorized date being the index of the 
        # characters in the corresponding date vocab (input_size = vocab_size)
        # inputs_oh = F.one_hot(inputs.T.float(), self.input_size)
        #print(f"inputs.shape = {inputs.shape}")
        inputs_oh = one_hot_encode(inputs, self.input_size)
        #print(f"inputs_oh.shape = {inputs_oh.shape}")
        # inputs_oh = [batch_size, max_seq_length, vocab_size]
        # pack_padded_sequence before feeding into LSTM. This is required so pytorch knows
        # which elements of the sequence are padded ones and ignore them in computation.
        packed_padded_inputs = pack_padded_sequence(inputs_oh, input_lengths.to("cpu"), batch_first=True)
        lstm_out_pack, (h_final, c_final) = self.lstm_layer(packed_padded_inputs)
        # h_final and c_final = [num_direction * num_layers, batch_size, hidden_size]
        # unpack the output
        # lstm_out, lstm_out_len = pad_packed_sequence(lstm_out_pack, batch_first=True)
        # lstm_out = [batch_size , seq_length , num_directions * hidden_size]
        return h_final, c_final

    # `nn.LSTM` takes a tuple of hidden states (h0, c0). h0 = initial
    # hidden state for each element in the batch, c0 = initial cell state for each element in the batch
    def init_state(self, batch_size):
        return (
            torch.zeros((self.num_directions * self.num_layers, batch_size, self.hidden_size)),
            torch.zeros((self.num_directions * self.num_layers, batch_size, self.hidden_size))
        )

In [57]:
class lstm_decoder(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, drop_out):
        super().__init__()        
        self.input_size = input_size        
        self.hidden_size = hidden_size
        self.num_layers = num_layers                
        self.lstm_layer = nn.LSTM(
            input_size = input_size, 
            hidden_size = hidden_size,
            num_layers = num_layers,
            batch_first = True,
            dropout = drop_out,
            bidirectional = False
            )   
        self.linear = nn.Linear(hidden_size, input_size)                        

    def forward(self, input_oh, encoder_hidden, encoder_cell):
        # input_oh = [batch_size, target_vocab_size]
        # The input sequence length in decoder is always 1 as we feed in one character at a time
        input_oh = input_oh.unsqueeze(1).to(Config.DEVICE)
        # input_oh = [batch_size, 1, target_vocab_size]        
        # inputs_oh = F.one_hot(inputs.T.long(), self.input_size)        
        #print(f"decoder inputs_oh.shape = {inputs_oh.shape}")
        lstm_out, (h_final, c_final) = self.lstm_layer(input_oh, (encoder_hidden, encoder_cell))
        #print(f"decoder lstm_out.shape = {lstm_out.shape}")
        # lstm_out = [batch_size , seq_length , num_directions * hidden_size]
        # h_final and c_final = [num_direction * num_layers, batch_size, hidden_size]
        # seq_length and num_direction will always be 1 for decoder. Thus
        # lstm_out = [batch_size, 1, hidden_size]
        # h_final and c_final = [num_layers, batch_size, hidden_size]
        pred = self.linear(lstm_out.squeeze(1))
        #print(f"decoder pred.shape = {pred.shape}")
        # pred = [batch_size, output_dim] where output_dim = vocab_size of target sequences ( machine dates in our case)
        # For multi class classification the number of output nodes is equal to the number of classes to predict (vocab size)
        return pred
        

Input sequence is of max 30 characters (Tx = 30), a date string like "January 3, 1984". The length of each input sequence may vary.
Output sequence is of fixed length (Ty=11), a date string like "1984-01-03" (10 for digits and one for - character).
Vectorized input data is a matrix of dimension batch_size * sequence_length(Tx) * human_date_vocab_size.
Take the example of input_date = "January 3, 1984". Consider human_date_vocab has 44 characters with each character being mapped to an index. Thus one hot encoded form of an input date character would be vector of 44 zeros with a 1 at the index corresponding to that character.
Now our input date is going to be padded to a sequence length of Tx=30. And each of these 30 characters is a vector of length human_date_vocab_size. Similarly output data is matrix of dimension batch_size * sequence_length(Ty) * machine_date_vocab_size.
An output_date like "1984-01-03" in its one hot encoded for would be a matrix of dimension sequence_length(Ty=11) * machine_date_vocab_size.
This holds true for both predicted output dates as well as actual output dates. <br/>

Prediction is done for each character in the output date. Thus for a single data row, the model outputs 11 character sequence as output. Each character prediction is a multi class classification problem with loss function (for pytorch) being cross entropy. The sum of losses for each of the 11 character predictions can be considered the loss for a single data row.

In [58]:
class EncoderDecoderLitModel(pl.LightningModule):
    def __init__(self, hparams, source_vocab_size, target_vocab_size):
        super().__init__()
        self.save_hyperparameters()
        self.lr = hparams["lr"]
        self.weight_decay = hparams["weight_decay"]
        # target_vocab_size = vocab_size for target sequence data (machine_date) as we are using one hot encoding
        self.target_vocab_size = target_vocab_size
        self.encoder = lstm_encoder(
            input_size = source_vocab_size,
            hidden_size = hparams["hidden_size"],
            num_layers = hparams["num_layers"],
            drop_out = hparams["enc_drop_out"]
            )
        self.decoder = lstm_decoder(
            input_size = target_vocab_size,
            hidden_size = hparams["hidden_size"],
            num_layers = hparams["num_layers"],
            drop_out = hparams["dec_drop_out"]
        )            

    def forward(self, src_seq, src_seq_lengths, target_seq_oh, teacher_forcing_ratio=0.5):        
        # src_seq = [batch_size, max_source_seq_length]
        # target_seq = [batch_size, target_seq_length]
        batch_size = target_seq_oh.shape[0]        
        # target sequence length is 11 as it includes the <sos> token at the begining
        target_seq_length = target_seq_oh.shape[1]            
        # tensor to store decoder output
        dec_outputs = torch.zeros((batch_size, target_seq_length, self.target_vocab_size))        
        #print(f"dec_outputs.shape = {dec_outputs.shape}")
        # first input to the decoder is the <sos> token which is the first character in all target sequences
        input = target_seq_oh[:, 0, :].reshape(batch_size, -1)
        # input = [batch_size, target_vocab_size]
        #print(f"decoder input.shape = {input.shape}")
        # last hidden and cell state of the encoder is used as initial hidden and cell state of the decoder
        enc_hidden, enc_cell = self.encoder(src_seq, src_seq_lengths)
        for t in range(1, target_seq_length):            
            dec_output = self.decoder(input, enc_hidden, enc_cell)
            # dec_output = [batch_size, target_vocab_size]            
            dec_outputs[:, t, :] = dec_output
            # select the max value out of dec_input_size values for each row
            # pred_t = dec_output.argmax(1)
            #print(f"decoder pred_t.shape = {pred_t.shape}")
            # whether to use teacher forcing
            teacher_forcing = False if np.random.random() < teacher_forcing_ratio else True            
            # if teacher forcing use actual token at t as the input to t+1, otherwise use the prediction at t
            # as the input to t+1
            actual_t = target_seq_oh[:, t, :]
            #print(f"decoder actual_t.shape = {actual_t.shape}")
            input = actual_t if teacher_forcing else dec_outputs[:, t, :]
            input = input.reshape(batch_size, -1)
        return dec_outputs            

    def configure_optimizers(self):
        model_optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, self.parameters()), lr=self.lr, weight_decay=self.weight_decay)
        lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(model_optimizer, mode="min")
        return {
            "optimizer": model_optimizer,
            "lr_scheduler": {
                "scheduler": lr_scheduler,
                "monitor": "val_loss",
                "frequency": 1
            }
        }

    def training_step(self, batch, batch_idx):
        # data loader batch doesn't perform one hot encoding of either source or target sequences
        src_padded_seq, src_seq_lengths, target_seq = batch
        # target_seq = [batch_size, target_seq_length]
        # src_padded_seq = [batch_size, max_src_seq_length]
        target_seq_oh = one_hot_encode(target_seq, self.target_vocab_size)
        # target_seq_oh = [batch_size, target_seq_length, target_vocab_size]
        pred_target_seq = self(src_padded_seq, src_seq_lengths, target_seq_oh)
        # pred_target_seq = [batch_size, target_seq_length, target_vocab_size]
        # we will exclude the first character from both the predicted and actual target dates. The first character
        # in target_dates in <sos> token while the first value in pred_target_dates is 0.         
        #print(f"target_seq.shape = {target_seq.shape}")
        #print(f"pred_target_seq.shape = {pred_target_seq.shape}")
        target_seq = target_seq[:, 1:]        
        # flatten the target_seq to 1d 
        target_seq = target_seq.reshape(-1)
        pred_target_seq = pred_target_seq[:, 1:, :].to(Config.DEVICE)
        # flatten the predicted target seq to 2d
        pred_target_seq = pred_target_seq.view(-1, self.target_vocab_size)
        #print(f"target_seq.shape = {target_seq.shape}")
        #print(f"pred_target_seq.shape = {pred_target_seq.shape}")                
        train_loss = cross_entropy(pred_target_seq, target_seq)
        train_perplexity = torch.exp(train_loss)
        self.log("train_loss", train_loss, prog_bar=True, logger=True, on_step=True, on_epoch=True)
        self.log("train_perplexity", train_perplexity, prog_bar=True, logger=True, on_step=True, on_epoch=True)
        return train_loss

    def validation_step(self, batch, batch_idx):        
        src_padded_seq, src_seq_lengths, target_seq = batch
        target_seq_oh = one_hot_encode(target_seq, self.target_vocab_size)
        # Remember to turn teacher forcing off for validation
        pred_target_seq = self(src_padded_seq, src_seq_lengths, target_seq_oh, teacher_forcing_ratio=0)
        target_seq = target_seq[:, 1:].reshape(-1)
        pred_target_seq = pred_target_seq[:, 1:, :].to(Config.DEVICE)
        # flatten the predicted target seq to 2d
        pred_target_seq = pred_target_seq.view(-1, self.target_vocab_size)
        val_loss = cross_entropy(pred_target_seq, target_seq)
        val_perplexity = torch.exp(val_loss)
        self.log("val_loss", val_loss, prog_bar=True, logger=True, on_step=True, on_epoch=True)
        self.log("val_perplexity", val_perplexity, prog_bar=True, logger=True, on_step=True, on_epoch=True)
        return val_loss

In [59]:
# For results reproducibility 
# sets seeds for numpy, torch, python.random and PYTHONHASHSEED.
pl.seed_everything(42, workers=True)

# Best trial number = 10
# Best trial params:
# {'lr': 0.0009729811471218791, 'hidden_size': 218, 'drop_out': 0.20683771027057984, 'num_layers': 2, 'weight_decay': 1.0661805946346311e-06}

# model hyperparameters
model_params = {    
    "num_layers": 2,    
    "hidden_size": 218,
    "enc_drop_out": 0.206,
    "dec_drop_out": 0.206,
    "lr": 0.00097,
    "weight_decay": 1.066e-06
    }

Global seed set to 42


In [60]:
from pytorch_lightning.callbacks import Callback
from pytorch_lightning import LightningModule, Trainer
# Monitor multiple metric values that are calculated either in training or validation step and return the
# best metric values for each epoch
class MetricsAggCallback(Callback):
    def __init__(self, metrics_to_monitor):
        # dictionary with metric name as key and monitor mode (min, max) as the value
        # ( the same names used to log metric values in training and validation step)
        self.metrics_to_monitor = metrics_to_monitor
        # dictionary with metric_name as key and list of metric value for each epoch
        self.metrics = {metric: [] for metric in metrics_to_monitor.keys()}
        # dictionary with metric_name as key and the best metric value for all epochs
        self.best_metric = {metric: None for metric in metrics_to_monitor.keys()}
        # dictionary with metric_name as key and the epoch number with the best metric value
        self.best_metric_epoch = {metric: None for metric in metrics_to_monitor.keys()}     
        self.epoch_counter = 0   

    def on_validation_epoch_end(self, trainer: Trainer, pl_module: LightningModule):
        self.epoch_counter += 1
        print(f"For epoch {self.epoch_counter}")            
        for metric, mode in self.metrics_to_monitor.items():
            metric_value = round(trainer.callback_metrics[metric].cpu().detach().item(), 4)            
            print(f"{metric} = {metric_value}")
            self.metrics[metric].append(metric_value)
            if mode == "max":
                self.best_metric[metric] = max(self.metrics[metric])            
            elif mode == "min":            
                self.best_metric[metric] = min(self.metrics[metric])            
            self.best_metric_epoch[metric] = self.metrics[metric].index(self.best_metric[metric])    

In [61]:
def run_training(fold, fold_loss, fold_metrics, dl_train, dl_val, h_vocab, m_vocab, find_lr=True):
    fold_str = f"fold{fold}"
    print(f"Running training for {fold_str}")
    seq2seq_model = EncoderDecoderLitModel(
        hparams = model_params, 
        source_vocab_size = len(h_vocab),
        target_vocab_size = len(m_vocab)
        )
    tb_logger = pl.loggers.TensorBoardLogger(save_dir="logs")    
    chkpt_file_name = "best_model_{epoch}_{val_loss:.4f}"
    metrics_to_monitor = {
        "val_loss": "min",
        "val_perplexity": "min",
        }
    loss_chkpt_callback = ModelCheckpoint(dirpath="./model", verbose=True, monitor="val_loss", mode="min", filename=chkpt_file_name)    
    metric_chkpt_callback = MetricsAggCallback(metrics_to_monitor = metrics_to_monitor)
    trainer = pl.Trainer(
        gpus = 1,
        deterministic = True,
        auto_select_gpus = True,
        progress_bar_refresh_rate = 20,
        max_epochs = Config.NUM_EPOCHS,
        logger = tb_logger,
        auto_lr_find = True,    
        precision = Config.PRECISION,   
        fast_dev_run = Config.FAST_DEV_RUN, 
        gradient_clip_val = 1.0,
        callbacks = [loss_chkpt_callback, metric_chkpt_callback]
    )        
    if find_lr:
        trainer.tune(model=seq2seq_model, train_dataloaders=dl_train)
        print(seq2seq_model.lr)
    trainer.fit(seq2seq_model, train_dataloaders=dl_train, val_dataloaders=dl_val)
    fold_min_loss = loss_chkpt_callback.best_model_score.cpu().detach().item()
    fold_loss.append(fold_min_loss)
    fold_metrics = {metric: (metric_chkpt_callback.best_metric[metric], metric_chkpt_callback.best_metric_epoch[metric]) 
                    for metric in metrics_to_monitor.keys()}
    print(f"Best metric value for {fold_str}")
    print(f"val_loss  = {fold_loss[fold]}")
    print(fold_metrics)
    del trainer, seq2seq_model, loss_chkpt_callback, metric_chkpt_callback 

In [62]:
from optuna.integration import PyTorchLightningPruningCallback

def run_hparam_tuning(model_params, trial):
    dl_train, dl_val, ds_train, ds_val, h_vocab, m_vocab, inv_h_vocab, inv_m_vocab = get_fold_dls(0, df_dates)
    early_stopping = PyTorchLightningPruningCallback(trial, monitor="val_loss")    
    seq2seq_model = EncoderDecoderLitModel(
        hparams = model_params, 
        source_vocab_size = len(h_vocab),
        target_vocab_size = len(m_vocab)
        )    
    trainer = pl.Trainer(
        checkpoint_callback=False,        
        gpus=1,
        # For results reproducibility 
        deterministic=True,
        auto_select_gpus=True,
        progress_bar_refresh_rate=20,
        max_epochs=Config.NUM_EPOCHS,        
        precision=Config.PRECISION,   
        weights_summary=None,         
        gradient_clip_val = 1.0,            
        callbacks=[early_stopping]
    )      
    trainer.fit(seq2seq_model, train_dataloaders=dl_train, val_dataloaders=dl_val)     
    loss = trainer.callback_metrics["val_loss"].item()
    del trainer, seq2seq_model, early_stopping, dl_train, dl_val
    return loss 

In [63]:
# import optuna

# Config.NUM_EPOCHS = 20        
# def objective(trial):
#     params = {
#         "lr": trial.suggest_loguniform("lr", 1e-6, 1e-3),
#         "hidden_size": trial.suggest_int("hidden_size", 32, 512),
#         "enc_drop_out": trial.suggest_uniform("enc_drop_out", 0.2, 0.7),
#         "dec_drop_out": trial.suggest_uniform("dec_drop_out", 0.2, 0.7),
#         "num_layers": trial.suggest_int("num_layers", 1, 2),
#         "weight_decay": trial.suggest_loguniform("weight_decay", 1e-6, 1e-2),
#     }    
#     loss = run_hparam_tuning(params, trial)
#     return loss

# study = optuna.create_study(direction="minimize", study_name="Seq2SeqModelTuning")    
# study.optimize(objective, n_trials=20)
# print(f"Best trial number = {study.best_trial.number}")
# print("Best trial params:")
# print(study.best_params)

In [64]:
%load_ext tensorboard

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


In [65]:
find_lr = True
fold_loss = []
fold_metrics = []

for fold in range(Config.NUM_FOLDS):
    dl_train, dl_val, ds_train, ds_val, h_vocab, m_vocab, inv_h_vocab, inv_m_vocab = get_fold_dls(fold, df_dates)
    run_training(fold, fold_loss, fold_metrics, dl_train, dl_val, h_vocab, m_vocab, find_lr=False)
    break

Running training for fold0


  rank_zero_warn(f"Checkpoint directory {dirpath} exists and is not empty.")
Using native 16bit precision.
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name    | Type         | Params
-----------------------------------------
0 | encoder | lstm_encoder | 613 K 
1 | decoder | lstm_decoder | 586 K 
-----------------------------------------
1.2 M     Trainable params
0         Non-trainable params
1.2 M     Total params
4.803     Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

Global seed set to 42


For epoch 1
val_loss = 2.4749
val_perplexity = 11.8808


  rank_zero_warn(


Training: -1it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Epoch 0, global step 31: val_loss reached 2.11514 (best 2.11514), saving model to "/home/bk_anupam/code/ML/DeepLearning/DeepLearning_AI/5-RNNs/week3/model/best_model_epoch=0_val_loss=2.1151.ckpt" as top 1


For epoch 2
val_loss = 2.1151
val_perplexity = 8.2908


Validating: 0it [00:00, ?it/s]

Epoch 1, global step 63: val_loss reached 2.02866 (best 2.02866), saving model to "/home/bk_anupam/code/ML/DeepLearning/DeepLearning_AI/5-RNNs/week3/model/best_model_epoch=1_val_loss=2.0287.ckpt" as top 1


For epoch 3
val_loss = 2.0287
val_perplexity = 7.6039


Validating: 0it [00:00, ?it/s]

Epoch 2, global step 95: val_loss reached 1.85643 (best 1.85643), saving model to "/home/bk_anupam/code/ML/DeepLearning/DeepLearning_AI/5-RNNs/week3/model/best_model_epoch=2_val_loss=1.8564.ckpt" as top 1


For epoch 4
val_loss = 1.8564
val_perplexity = 6.4008


Validating: 0it [00:00, ?it/s]

Epoch 3, global step 127: val_loss reached 1.70948 (best 1.70948), saving model to "/home/bk_anupam/code/ML/DeepLearning/DeepLearning_AI/5-RNNs/week3/model/best_model_epoch=3_val_loss=1.7095.ckpt" as top 1


For epoch 5
val_loss = 1.7095
val_perplexity = 5.5261


Validating: 0it [00:00, ?it/s]

Epoch 4, global step 159: val_loss reached 1.59148 (best 1.59148), saving model to "/home/bk_anupam/code/ML/DeepLearning/DeepLearning_AI/5-RNNs/week3/model/best_model_epoch=4_val_loss=1.5915.ckpt" as top 1


For epoch 6
val_loss = 1.5915
val_perplexity = 4.911


Validating: 0it [00:00, ?it/s]

Epoch 5, global step 191: val_loss reached 1.52692 (best 1.52692), saving model to "/home/bk_anupam/code/ML/DeepLearning/DeepLearning_AI/5-RNNs/week3/model/best_model_epoch=5_val_loss=1.5269.ckpt" as top 1


For epoch 7
val_loss = 1.5269
val_perplexity = 4.604


Validating: 0it [00:00, ?it/s]

Epoch 6, global step 223: val_loss reached 1.52341 (best 1.52341), saving model to "/home/bk_anupam/code/ML/DeepLearning/DeepLearning_AI/5-RNNs/week3/model/best_model_epoch=6_val_loss=1.5234.ckpt" as top 1


For epoch 8
val_loss = 1.5234
val_perplexity = 4.5879


Validating: 0it [00:00, ?it/s]

Epoch 7, global step 255: val_loss reached 1.44901 (best 1.44901), saving model to "/home/bk_anupam/code/ML/DeepLearning/DeepLearning_AI/5-RNNs/week3/model/best_model_epoch=7_val_loss=1.4490.ckpt" as top 1


For epoch 9
val_loss = 1.449
val_perplexity = 4.2589


Validating: 0it [00:00, ?it/s]

Epoch 8, global step 287: val_loss reached 1.37222 (best 1.37222), saving model to "/home/bk_anupam/code/ML/DeepLearning/DeepLearning_AI/5-RNNs/week3/model/best_model_epoch=8_val_loss=1.3722.ckpt" as top 1


For epoch 10
val_loss = 1.3722
val_perplexity = 3.9441


Validating: 0it [00:00, ?it/s]

Epoch 9, global step 319: val_loss reached 1.37080 (best 1.37080), saving model to "/home/bk_anupam/code/ML/DeepLearning/DeepLearning_AI/5-RNNs/week3/model/best_model_epoch=9_val_loss=1.3708.ckpt" as top 1


For epoch 11
val_loss = 1.3708
val_perplexity = 3.9385


Validating: 0it [00:00, ?it/s]

Epoch 10, global step 351: val_loss reached 1.36203 (best 1.36203), saving model to "/home/bk_anupam/code/ML/DeepLearning/DeepLearning_AI/5-RNNs/week3/model/best_model_epoch=10_val_loss=1.3620.ckpt" as top 1


For epoch 12
val_loss = 1.362
val_perplexity = 3.9041


Validating: 0it [00:00, ?it/s]

Epoch 11, global step 383: val_loss reached 1.28084 (best 1.28084), saving model to "/home/bk_anupam/code/ML/DeepLearning/DeepLearning_AI/5-RNNs/week3/model/best_model_epoch=11_val_loss=1.2808.ckpt" as top 1


For epoch 13
val_loss = 1.2808
val_perplexity = 3.5997


Validating: 0it [00:00, ?it/s]

Epoch 12, global step 415: val_loss was not in top 1


For epoch 14
val_loss = 1.3207
val_perplexity = 3.7462


Validating: 0it [00:00, ?it/s]

Epoch 13, global step 447: val_loss reached 1.23315 (best 1.23315), saving model to "/home/bk_anupam/code/ML/DeepLearning/DeepLearning_AI/5-RNNs/week3/model/best_model_epoch=13_val_loss=1.2332.ckpt" as top 1


For epoch 15
val_loss = 1.2332
val_perplexity = 3.4321


Validating: 0it [00:00, ?it/s]

Epoch 14, global step 479: val_loss reached 1.20975 (best 1.20975), saving model to "/home/bk_anupam/code/ML/DeepLearning/DeepLearning_AI/5-RNNs/week3/model/best_model_epoch=14_val_loss=1.2098.ckpt" as top 1


For epoch 16
val_loss = 1.2098
val_perplexity = 3.3527


Validating: 0it [00:00, ?it/s]

Epoch 15, global step 511: val_loss reached 1.18196 (best 1.18196), saving model to "/home/bk_anupam/code/ML/DeepLearning/DeepLearning_AI/5-RNNs/week3/model/best_model_epoch=15_val_loss=1.1820.ckpt" as top 1


For epoch 17
val_loss = 1.182
val_perplexity = 3.2608


Validating: 0it [00:00, ?it/s]

Epoch 16, global step 543: val_loss reached 1.17382 (best 1.17382), saving model to "/home/bk_anupam/code/ML/DeepLearning/DeepLearning_AI/5-RNNs/week3/model/best_model_epoch=16_val_loss=1.1738.ckpt" as top 1


For epoch 18
val_loss = 1.1738
val_perplexity = 3.2344


Validating: 0it [00:00, ?it/s]

Epoch 17, global step 575: val_loss reached 1.09703 (best 1.09703), saving model to "/home/bk_anupam/code/ML/DeepLearning/DeepLearning_AI/5-RNNs/week3/model/best_model_epoch=17_val_loss=1.0970.ckpt" as top 1


For epoch 19
val_loss = 1.097
val_perplexity = 2.9953


Validating: 0it [00:00, ?it/s]

Epoch 18, global step 607: val_loss reached 1.07023 (best 1.07023), saving model to "/home/bk_anupam/code/ML/DeepLearning/DeepLearning_AI/5-RNNs/week3/model/best_model_epoch=18_val_loss=1.0702.ckpt" as top 1


For epoch 20
val_loss = 1.0702
val_perplexity = 2.9161


Validating: 0it [00:00, ?it/s]

Epoch 19, global step 639: val_loss reached 1.06695 (best 1.06695), saving model to "/home/bk_anupam/code/ML/DeepLearning/DeepLearning_AI/5-RNNs/week3/model/best_model_epoch=19_val_loss=1.0669.ckpt" as top 1


For epoch 21
val_loss = 1.0669
val_perplexity = 2.9065


Validating: 0it [00:00, ?it/s]

Epoch 20, global step 671: val_loss reached 1.01292 (best 1.01292), saving model to "/home/bk_anupam/code/ML/DeepLearning/DeepLearning_AI/5-RNNs/week3/model/best_model_epoch=20_val_loss=1.0129.ckpt" as top 1


For epoch 22
val_loss = 1.0129
val_perplexity = 2.7537


Validating: 0it [00:00, ?it/s]

Epoch 21, global step 703: val_loss reached 0.99862 (best 0.99862), saving model to "/home/bk_anupam/code/ML/DeepLearning/DeepLearning_AI/5-RNNs/week3/model/best_model_epoch=21_val_loss=0.9986.ckpt" as top 1


For epoch 23
val_loss = 0.9986
val_perplexity = 2.7146


Validating: 0it [00:00, ?it/s]

Epoch 22, global step 735: val_loss reached 0.95313 (best 0.95313), saving model to "/home/bk_anupam/code/ML/DeepLearning/DeepLearning_AI/5-RNNs/week3/model/best_model_epoch=22_val_loss=0.9531.ckpt" as top 1


For epoch 24
val_loss = 0.9531
val_perplexity = 2.5939


Validating: 0it [00:00, ?it/s]

Epoch 23, global step 767: val_loss reached 0.92580 (best 0.92580), saving model to "/home/bk_anupam/code/ML/DeepLearning/DeepLearning_AI/5-RNNs/week3/model/best_model_epoch=23_val_loss=0.9258.ckpt" as top 1


For epoch 25
val_loss = 0.9258
val_perplexity = 2.5239


Validating: 0it [00:00, ?it/s]

Epoch 24, global step 799: val_loss reached 0.89387 (best 0.89387), saving model to "/home/bk_anupam/code/ML/DeepLearning/DeepLearning_AI/5-RNNs/week3/model/best_model_epoch=24_val_loss=0.8939.ckpt" as top 1


For epoch 26
val_loss = 0.8939
val_perplexity = 2.4446


Validating: 0it [00:00, ?it/s]

Epoch 25, global step 831: val_loss reached 0.89105 (best 0.89105), saving model to "/home/bk_anupam/code/ML/DeepLearning/DeepLearning_AI/5-RNNs/week3/model/best_model_epoch=25_val_loss=0.8910.ckpt" as top 1


For epoch 27
val_loss = 0.891
val_perplexity = 2.4377


Validating: 0it [00:00, ?it/s]

Epoch 26, global step 863: val_loss reached 0.83950 (best 0.83950), saving model to "/home/bk_anupam/code/ML/DeepLearning/DeepLearning_AI/5-RNNs/week3/model/best_model_epoch=26_val_loss=0.8395.ckpt" as top 1


For epoch 28
val_loss = 0.8395
val_perplexity = 2.3152


Validating: 0it [00:00, ?it/s]

Epoch 27, global step 895: val_loss reached 0.83076 (best 0.83076), saving model to "/home/bk_anupam/code/ML/DeepLearning/DeepLearning_AI/5-RNNs/week3/model/best_model_epoch=27_val_loss=0.8308.ckpt" as top 1


For epoch 29
val_loss = 0.8308
val_perplexity = 2.2951


Validating: 0it [00:00, ?it/s]

Epoch 28, global step 927: val_loss reached 0.79027 (best 0.79027), saving model to "/home/bk_anupam/code/ML/DeepLearning/DeepLearning_AI/5-RNNs/week3/model/best_model_epoch=28_val_loss=0.7903.ckpt" as top 1


For epoch 30
val_loss = 0.7903
val_perplexity = 2.204


Validating: 0it [00:00, ?it/s]

Epoch 29, global step 959: val_loss reached 0.78376 (best 0.78376), saving model to "/home/bk_anupam/code/ML/DeepLearning/DeepLearning_AI/5-RNNs/week3/model/best_model_epoch=29_val_loss=0.7838.ckpt" as top 1


For epoch 31
val_loss = 0.7838
val_perplexity = 2.1897


Validating: 0it [00:00, ?it/s]

Epoch 30, global step 991: val_loss reached 0.74211 (best 0.74211), saving model to "/home/bk_anupam/code/ML/DeepLearning/DeepLearning_AI/5-RNNs/week3/model/best_model_epoch=30_val_loss=0.7421.ckpt" as top 1


For epoch 32
val_loss = 0.7421
val_perplexity = 2.1004


Validating: 0it [00:00, ?it/s]

Epoch 31, global step 1023: val_loss reached 0.71966 (best 0.71966), saving model to "/home/bk_anupam/code/ML/DeepLearning/DeepLearning_AI/5-RNNs/week3/model/best_model_epoch=31_val_loss=0.7197.ckpt" as top 1


For epoch 33
val_loss = 0.7197
val_perplexity = 2.0538


Validating: 0it [00:00, ?it/s]

Epoch 32, global step 1055: val_loss was not in top 1


For epoch 34
val_loss = 0.7276
val_perplexity = 2.0702


Validating: 0it [00:00, ?it/s]

Epoch 33, global step 1087: val_loss reached 0.70382 (best 0.70382), saving model to "/home/bk_anupam/code/ML/DeepLearning/DeepLearning_AI/5-RNNs/week3/model/best_model_epoch=33_val_loss=0.7038.ckpt" as top 1


For epoch 35
val_loss = 0.7038
val_perplexity = 2.0215


Validating: 0it [00:00, ?it/s]

Epoch 34, global step 1119: val_loss reached 0.69293 (best 0.69293), saving model to "/home/bk_anupam/code/ML/DeepLearning/DeepLearning_AI/5-RNNs/week3/model/best_model_epoch=34_val_loss=0.6929.ckpt" as top 1


For epoch 36
val_loss = 0.6929
val_perplexity = 1.9996


Validating: 0it [00:00, ?it/s]

Epoch 35, global step 1151: val_loss reached 0.66105 (best 0.66105), saving model to "/home/bk_anupam/code/ML/DeepLearning/DeepLearning_AI/5-RNNs/week3/model/best_model_epoch=35_val_loss=0.6610.ckpt" as top 1


For epoch 37
val_loss = 0.661
val_perplexity = 1.9368


Validating: 0it [00:00, ?it/s]

Epoch 36, global step 1183: val_loss reached 0.65027 (best 0.65027), saving model to "/home/bk_anupam/code/ML/DeepLearning/DeepLearning_AI/5-RNNs/week3/model/best_model_epoch=36_val_loss=0.6503.ckpt" as top 1


For epoch 38
val_loss = 0.6503
val_perplexity = 1.9161


Validating: 0it [00:00, ?it/s]

Epoch 37, global step 1215: val_loss was not in top 1


For epoch 39
val_loss = 0.6516
val_perplexity = 1.9186


Validating: 0it [00:00, ?it/s]

Epoch 38, global step 1247: val_loss reached 0.62010 (best 0.62010), saving model to "/home/bk_anupam/code/ML/DeepLearning/DeepLearning_AI/5-RNNs/week3/model/best_model_epoch=38_val_loss=0.6201.ckpt" as top 1


For epoch 40
val_loss = 0.6201
val_perplexity = 1.8591


Validating: 0it [00:00, ?it/s]

Epoch 39, global step 1279: val_loss was not in top 1


For epoch 41
val_loss = 0.6214
val_perplexity = 1.8615


Validating: 0it [00:00, ?it/s]

Epoch 40, global step 1311: val_loss reached 0.59782 (best 0.59782), saving model to "/home/bk_anupam/code/ML/DeepLearning/DeepLearning_AI/5-RNNs/week3/model/best_model_epoch=40_val_loss=0.5978.ckpt" as top 1


For epoch 42
val_loss = 0.5978
val_perplexity = 1.8182


Validating: 0it [00:00, ?it/s]

Epoch 41, global step 1343: val_loss was not in top 1


For epoch 43
val_loss = 0.5981
val_perplexity = 1.8188


Validating: 0it [00:00, ?it/s]

Epoch 42, global step 1375: val_loss reached 0.57940 (best 0.57940), saving model to "/home/bk_anupam/code/ML/DeepLearning/DeepLearning_AI/5-RNNs/week3/model/best_model_epoch=42_val_loss=0.5794.ckpt" as top 1


For epoch 44
val_loss = 0.5794
val_perplexity = 1.785


Validating: 0it [00:00, ?it/s]

Epoch 43, global step 1407: val_loss was not in top 1


For epoch 45
val_loss = 0.6279
val_perplexity = 1.8736


Validating: 0it [00:00, ?it/s]

Epoch 44, global step 1439: val_loss was not in top 1


For epoch 46
val_loss = 0.6029
val_perplexity = 1.8274


Validating: 0it [00:00, ?it/s]

Epoch 45, global step 1471: val_loss reached 0.57022 (best 0.57022), saving model to "/home/bk_anupam/code/ML/DeepLearning/DeepLearning_AI/5-RNNs/week3/model/best_model_epoch=45_val_loss=0.5702.ckpt" as top 1


For epoch 47
val_loss = 0.5702
val_perplexity = 1.7687


Validating: 0it [00:00, ?it/s]

Epoch 46, global step 1503: val_loss reached 0.55836 (best 0.55836), saving model to "/home/bk_anupam/code/ML/DeepLearning/DeepLearning_AI/5-RNNs/week3/model/best_model_epoch=46_val_loss=0.5584.ckpt" as top 1


For epoch 48
val_loss = 0.5584
val_perplexity = 1.7478


Validating: 0it [00:00, ?it/s]

Epoch 47, global step 1535: val_loss was not in top 1


For epoch 49
val_loss = 0.5639
val_perplexity = 1.7575


Validating: 0it [00:00, ?it/s]

Epoch 48, global step 1567: val_loss reached 0.55486 (best 0.55486), saving model to "/home/bk_anupam/code/ML/DeepLearning/DeepLearning_AI/5-RNNs/week3/model/best_model_epoch=48_val_loss=0.5549.ckpt" as top 1


For epoch 50
val_loss = 0.5549
val_perplexity = 1.7417


Validating: 0it [00:00, ?it/s]

Epoch 49, global step 1599: val_loss reached 0.53757 (best 0.53757), saving model to "/home/bk_anupam/code/ML/DeepLearning/DeepLearning_AI/5-RNNs/week3/model/best_model_epoch=49_val_loss=0.5376.ckpt" as top 1


For epoch 51
val_loss = 0.5376
val_perplexity = 1.7119


Validating: 0it [00:00, ?it/s]

Epoch 50, global step 1631: val_loss was not in top 1


For epoch 52
val_loss = 0.5522
val_perplexity = 1.7371


Validating: 0it [00:00, ?it/s]

Epoch 51, global step 1663: val_loss reached 0.52035 (best 0.52035), saving model to "/home/bk_anupam/code/ML/DeepLearning/DeepLearning_AI/5-RNNs/week3/model/best_model_epoch=51_val_loss=0.5203.ckpt" as top 1


For epoch 53
val_loss = 0.5203
val_perplexity = 1.6826


Validating: 0it [00:00, ?it/s]

Epoch 52, global step 1695: val_loss was not in top 1


For epoch 54
val_loss = 0.5459
val_perplexity = 1.7261


Validating: 0it [00:00, ?it/s]

Epoch 53, global step 1727: val_loss reached 0.51488 (best 0.51488), saving model to "/home/bk_anupam/code/ML/DeepLearning/DeepLearning_AI/5-RNNs/week3/model/best_model_epoch=53_val_loss=0.5149.ckpt" as top 1


For epoch 55
val_loss = 0.5149
val_perplexity = 1.6735


Validating: 0it [00:00, ?it/s]

Epoch 54, global step 1759: val_loss reached 0.50591 (best 0.50591), saving model to "/home/bk_anupam/code/ML/DeepLearning/DeepLearning_AI/5-RNNs/week3/model/best_model_epoch=54_val_loss=0.5059.ckpt" as top 1


For epoch 56
val_loss = 0.5059
val_perplexity = 1.6585


Validating: 0it [00:00, ?it/s]

Epoch 55, global step 1791: val_loss was not in top 1


For epoch 57
val_loss = 0.51
val_perplexity = 1.6654


Validating: 0it [00:00, ?it/s]

Epoch 56, global step 1823: val_loss was not in top 1


For epoch 58
val_loss = 0.5094
val_perplexity = 1.6642


Validating: 0it [00:00, ?it/s]

Epoch 57, global step 1855: val_loss was not in top 1


For epoch 59
val_loss = 0.5146
val_perplexity = 1.673


Validating: 0it [00:00, ?it/s]

Epoch 58, global step 1887: val_loss reached 0.49749 (best 0.49749), saving model to "/home/bk_anupam/code/ML/DeepLearning/DeepLearning_AI/5-RNNs/week3/model/best_model_epoch=58_val_loss=0.4975.ckpt" as top 1


For epoch 60
val_loss = 0.4975
val_perplexity = 1.6446


Validating: 0it [00:00, ?it/s]

Epoch 59, global step 1919: val_loss was not in top 1


For epoch 61
val_loss = 0.5082
val_perplexity = 1.6623


Validating: 0it [00:00, ?it/s]

Epoch 60, global step 1951: val_loss reached 0.49621 (best 0.49621), saving model to "/home/bk_anupam/code/ML/DeepLearning/DeepLearning_AI/5-RNNs/week3/model/best_model_epoch=60_val_loss=0.4962.ckpt" as top 1


For epoch 62
val_loss = 0.4962
val_perplexity = 1.6425


Validating: 0it [00:00, ?it/s]

Epoch 61, global step 1983: val_loss was not in top 1


For epoch 63
val_loss = 0.5129
val_perplexity = 1.6701


Validating: 0it [00:00, ?it/s]

Epoch 62, global step 2015: val_loss reached 0.49515 (best 0.49515), saving model to "/home/bk_anupam/code/ML/DeepLearning/DeepLearning_AI/5-RNNs/week3/model/best_model_epoch=62_val_loss=0.4952.ckpt" as top 1


For epoch 64
val_loss = 0.4952
val_perplexity = 1.6408


Validating: 0it [00:00, ?it/s]

Epoch 63, global step 2047: val_loss reached 0.48980 (best 0.48980), saving model to "/home/bk_anupam/code/ML/DeepLearning/DeepLearning_AI/5-RNNs/week3/model/best_model_epoch=63_val_loss=0.4898.ckpt" as top 1


For epoch 65
val_loss = 0.4898
val_perplexity = 1.632


Validating: 0it [00:00, ?it/s]

Epoch 64, global step 2079: val_loss was not in top 1


For epoch 66
val_loss = 0.4936
val_perplexity = 1.6382


Validating: 0it [00:00, ?it/s]

Epoch 65, global step 2111: val_loss reached 0.48925 (best 0.48925), saving model to "/home/bk_anupam/code/ML/DeepLearning/DeepLearning_AI/5-RNNs/week3/model/best_model_epoch=65_val_loss=0.4892.ckpt" as top 1


For epoch 67
val_loss = 0.4892
val_perplexity = 1.6311


Validating: 0it [00:00, ?it/s]

Epoch 66, global step 2143: val_loss was not in top 1


For epoch 68
val_loss = 0.5094
val_perplexity = 1.6642


Validating: 0it [00:00, ?it/s]

Epoch 67, global step 2175: val_loss reached 0.48850 (best 0.48850), saving model to "/home/bk_anupam/code/ML/DeepLearning/DeepLearning_AI/5-RNNs/week3/model/best_model_epoch=67_val_loss=0.4885.ckpt" as top 1


For epoch 69
val_loss = 0.4885
val_perplexity = 1.6299


Validating: 0it [00:00, ?it/s]

Epoch 68, global step 2207: val_loss reached 0.48267 (best 0.48267), saving model to "/home/bk_anupam/code/ML/DeepLearning/DeepLearning_AI/5-RNNs/week3/model/best_model_epoch=68_val_loss=0.4827.ckpt" as top 1


For epoch 70
val_loss = 0.4827
val_perplexity = 1.6204


Validating: 0it [00:00, ?it/s]

Epoch 69, global step 2239: val_loss was not in top 1


For epoch 71
val_loss = 0.4844
val_perplexity = 1.6232


Validating: 0it [00:00, ?it/s]

Epoch 70, global step 2271: val_loss was not in top 1


For epoch 72
val_loss = 0.4849
val_perplexity = 1.624


Validating: 0it [00:00, ?it/s]

Epoch 71, global step 2303: val_loss reached 0.47719 (best 0.47719), saving model to "/home/bk_anupam/code/ML/DeepLearning/DeepLearning_AI/5-RNNs/week3/model/best_model_epoch=71_val_loss=0.4772.ckpt" as top 1


For epoch 73
val_loss = 0.4772
val_perplexity = 1.6115


Validating: 0it [00:00, ?it/s]

Epoch 72, global step 2335: val_loss was not in top 1


For epoch 74
val_loss = 0.4852
val_perplexity = 1.6245


Validating: 0it [00:00, ?it/s]

Epoch 73, global step 2367: val_loss was not in top 1


For epoch 75
val_loss = 0.4809
val_perplexity = 1.6176


Validating: 0it [00:00, ?it/s]

Epoch 74, global step 2399: val_loss was not in top 1


For epoch 76
val_loss = 0.4873
val_perplexity = 1.628
Best metric value for fold0
val_loss  = 0.47719234228134155
{'val_loss': (0.4772, 72), 'val_perplexity': (1.6115, 72)}


In [66]:
def get_key_from_value(dict, value_to_search):
    for key, value in dict.items():
        if value == value_to_search:
            return key

In [72]:
# inference on test set

def run_prediction():
    test_dataset = load_date_dataset(num_examples=10)
    Config.BATCH_SIZE = 1
    df_dates_test = pd.DataFrame({"h_dt": test_dataset[:, 0], "m_dt": test_dataset[:, 1]})
    h_dt_max_len = df_dates_test.h_dt.apply(lambda x: len(x)).max()
    transform = StoITensorTransform(h_vocab, h_dt_max_len)
    target_transform = StoITensorTransform(m_vocab, len(m_vocab), add_sos_token=True)
    ds_test = DateDataset(df_dates_test.h_dt, df_dates_test.m_dt, transform=transform, target_transform=target_transform)    
    dl_test = DataLoader(ds_test, batch_size=Config.BATCH_SIZE, num_workers=Config.NUM_WORKERS, collate_fn=pad_collate)
    seq2seq_model = EncoderDecoderLitModel.load_from_checkpoint(
        checkpoint_path = "model/best_model_epoch=49_val_loss=0.5511.ckpt",
        hparams = model_params, 
        source_vocab_size = len(h_vocab),
        target_vocab_size = len(m_vocab)
    ).to(Config.DEVICE)
    pred_table = []
    for src_seq, src_seq_len, target_seq in dl_test:        
        src_seq = src_seq.to(Config.DEVICE)
        src_seq_len = src_seq_len.to(Config.DEVICE)
        target_seq = target_seq.to(Config.DEVICE)
        target_seq_oh = one_hot_encode(target_seq, len(m_vocab))
        outputs = seq2seq_model(src_seq, src_seq_len, target_seq_oh)
        pred_target_seq = outputs.argmax(2)[:, 1:].reshape(-1).cpu().tolist()
        target_seq = target_seq[:, 1:].reshape(-1).cpu().tolist()                
        src_seq = src_seq.reshape(-1).cpu().tolist()
        source_str = ''.join([get_key_from_value(h_vocab, index) for index in src_seq])                
        target_str = ''.join([inv_m_vocab[index] for index in target_seq])        
        pred_target_str = ''.join([get_key_from_value(m_vocab, index) for index in pred_target_seq])        
        pred_table.append([source_str, target_str, pred_target_str])        
    return pred_table

In [73]:
from tabulate import tabulate

pred_table = run_prediction()    
header = ["Source sequence", "Target Sequence", "Predicted Target Sequence"]
print(tabulate(pred_table, headers=header))

Source sequence            Target Sequence    Predicted Target Sequence
-------------------------  -----------------  ---------------------------
15 Oct 1997                1997-10-15         19-7-15-10
Wednesday October 30 1985  1985-10-30         1085-10-30
1 January 2016             2017-01-01         201--01-00
01 Apr 1983                1983-04-01         1983-04-01
Wednesday January 16 1985  1985-01-16         1985-11685
1/18/14                    2014-01-18         2018-11-08
Sunday November 5 1995     1995-11-05         1995-19-05
June 3 2014                2014-06-03         201--03-03
11 May 2002                2002-05-11         2002000-15
Thursday May 20 1999       1999-05-20         1999-25-25
