In [1]:
# a piece of code needed to 
import os 
import sys
from pathlib import Path

current = os.getcwd()
while 'src' not in os.listdir(current):
    current = Path(current).parent

sys.path.append(str(current))
sys.path.append(os.path.join(current, 'src'))

import torch
import random
import numpy as np 

random.seed(69)
torch.manual_seed(69)
np.random.seed(69)

# Data Imports

In [2]:
# let's start with the data and see how it goes
import os
import pandas as pd
HOME = os.getcwd()
train_csv = os.path.join(HOME, 'data', 'train.csv')
test_csv = os.path.join(HOME, 'data', 'test.csv')

df_train = pd.read_csv(train_csv)
df_test = pd.read_csv(test_csv)
# set the columns names to lower case 

df_train.columns = [c.lower() for c in df_train.columns]
df_test.columns = [c.lower() for c in df_test.columns]

In [3]:
df_train.head()

Unnamed: 0,sentence_id,entity_id,entity,tag
0,0,0,It,PRON
1,0,1,is,VERB
2,0,2,true,ADJ
3,0,3,that,ADP
4,0,4,his,DET


# Data Preprocessing

In [4]:
# first replace any Nan objects by the word 'noun'
df_train.fillna(value='noun', inplace=True)
df_test.fillna(value='noun', inplace=True)

In [5]:
# initial implementation: iterate through the rows of 
def build_sentences(df: pd.DataFrame, train: bool = True):
    # the idea is simple
    last_id = df.iloc[0, :]['sentence_id']
    sentences = []
    labels = []
    s = []
    l = []

    for _, row in df.iterrows():
        sid = row['sentence_id']
        if sid != last_id:
            # this means the sentence is new:
            sentences.append(s)
            s = [row['entity'].lower().strip()]            

            if train:
                labels.append(l)
                l = [row['tag']]
        else:
            s.append(row['entity'].lower().strip())
            if train:
                l.append(row['tag'])

        last_id = sid

    # the last one should be added regardless
    sentences.append(s)
    s = [row['entity'].lower().strip()]            
    if train:
        labels.append(l)

    if train:
        return sentences, labels
    
    return sentences

In [6]:
# prepare the training data as sequences
TRAIN_SENTENCES, TRAIN_LABELS = build_sentences(df_train)
TEST_SENTENCES = build_sentences(df_test, train=False) 
# TRAIN_SENTENCES = [" ".join(tt) for tt in TRAIN_SENTENCES]
# TEST_SENTENCES = [" ".join(tt) for tt in TEST_SENTENCES]

In [7]:
# some assertions to make sure the data is extracted correctly
assert len(TRAIN_SENTENCES) == len(TRAIN_LABELS)
for ts, tl in zip(TRAIN_LABELS, TRAIN_LABELS):
    assert len(ts) == len(tl)

In [8]:
# let's map the non-numerical labels to numerical ones for later use
import itertools
LABELS = set(list(itertools.chain(*TRAIN_LABELS))) 
# make sure the labels are sorted and unique 
LABELS = sorted(list(LABELS))
lab2idx = dict([(l, i) for l, i in zip(LABELS, range(len(LABELS)))])

In [9]:
for index, t in enumerate(TRAIN_LABELS):
    TRAIN_LABELS[index] = [lab2idx[token] for token in t]

In [10]:
# split the data into training and test
from sklearn.model_selection import train_test_split
train_data, val_data, train_labels, val_labels = train_test_split(TRAIN_SENTENCES, TRAIN_LABELS, random_state=69, test_size=0.1)

# Vocabulary

In [11]:
from torchtext.vocab import build_vocab_from_iterator

def yield_tokens():
    for s in TRAIN_SENTENCES:
        yield s

# Define special symbols and indices
UNK_IDX, PAD_IDX, BOS_IDX, EOS_IDX = 0, 1, 2, 3
# Make sure the tokens are in order of their indices to properly insert them in vocab
special_symbols = ['<unk>', '<pad>', '<bos>', '<eos>']

vocab = build_vocab_from_iterator(yield_tokens(), 
                                  min_freq=1,
                                  specials=special_symbols,
                                  special_first=True)
vocab.set_default_index(UNK_IDX)

# DataLoaders


In [12]:
import torch
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

In [13]:
from torch.utils.data import Dataset, DataLoader

class TrainDS(Dataset):
    def __init__(self, data, labels) -> None:
        super().__init__()
        self.data = data
        self.labels = labels
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, index):
        return self.data[index], self.labels[index]

class TestDS(Dataset):
    def __init__(self) -> None:
        super().__init__()
        self.data = TEST_SENTENCES
        

    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, index):
        return self.data[index]

train_ds = TrainDS(train_data, train_labels)
val_ds = TrainDS(val_data, val_labels)
test_ds = TestDS()

In [33]:
# create the collate function
def collate_train(batch):
    label_list, text_list= [], []
    maxi = 0
    for x, y in batch:
        maxi = max(maxi, len(x))
    for x, y in batch:
        text_indices = [BOS_IDX] + [vocab[token] for token in x] + [EOS_IDX] + [PAD_IDX] * (maxi - len(x))
        labels = [-1] + y + [-1] + [-1 for _ in range(maxi - len(x))]
        label_list.append(labels)
        text_list.append(text_indices)
    
    label_list = torch.tensor(label_list, dtype=torch.float32)
    text_list = torch.tensor(text_list, dtype=torch.int64)

    return text_list.to(DEVICE), label_list.to(DEVICE)

# train the dataset
train_dl = DataLoader(train_ds, batch_size=16, shuffle=True, drop_last=True, collate_fn=collate_train)
val_dl = DataLoader(val_ds, batch_size=16, shuffle=False, drop_last=False, collate_fn=collate_train)

# Model

In [34]:
# let's create the encoder module
from torch import nn

class EncoderRNN(nn.Module):
    def __init__(self, 
                embedding_dim: int,
                hidden_size: int,
                num_layers: int = 1, 
                dropout: float=0.3, 
                *args, **kwargs) -> None:
        super().__init__(*args, **kwargs)

        self.embedding = nn.Embedding(num_embeddings=len(vocab), # the number of tokens in the vocabulary 
                                      embedding_dim=embedding_dim, # the dimension of the vector representation of each word 
                                      padding_idx=PAD_IDX # we know the padding index already
                                      )

        self.dropout = nn.Dropout(p=dropout)

        self.rnn = nn.LSTM(input_size=embedding_dim, 
                           hidden_size=hidden_size, 
                           dropout=dropout, 
                           num_layers=num_layers,
                           bidirectional=True, # bidiretional RNN are more powerful
                           batch_first=True # easier manipulation
                           )
        
        # 2: comes from the fact that the lstm is bidirectional, the rest is similar to the LSTM documention Pytorch
        self.hidden_state_dim = 2 * num_layers * hidden_size     
        self.lstm_output_dim = 2 * hidden_size

    def forward(self, x: torch.Tensor):
        # first pass it through the rnn
        rnn_output , (hidden_state, cell_state) = self.rnn(self.dropout(self.embedding(x)))
        # the shape according to LSTM documentation are: 
        # rnn_output: (batch, L, 2 * self.hidden_size)
        # hidden_state, cell_state (2 * num_layers, batch, self.hidden_size)
        return rnn_output, hidden_state, cell_state

In [35]:
import torch.nn.functional as F

class DecoderRNN(nn.Module):
    def __init__(self, 
                hidden_size, 
                output_size,
                num_layers: int = 1, 
                dropout: float=0.2): 
        self.hidden_size = hidden_size
        super(DecoderRNN, self).__init__()
        # the decoder is a sequence model as well 
        self.rnn = nn.LSTM(1, 
                           hidden_size, 
                           batch_first=True, 
                           num_layers=num_layers, 
                           dropout=dropout,
                           bidirectional=True,
                           )
        # given a variable number of tensors of size 'hidden_size' return the same number of tesnors but with dimension: 'output_size'
        self.classifier = nn.Linear(2 * hidden_size, output_size)

    def forward_step(self, decoder_input, decoder_hs, decoder_cs):
        # this function expects a decoder_input: of shape: (batch_size, 1, 1)
        # decoder_hs should be of the shape (2 * num_layers, )
        
        output, (hs, cs) = self.rnn(decoder_input, (decoder_hs, decoder_cs))
        # output at this stage will be (batch_size, 1, self.hidden_size)
        output = self.classifier(output.squeeze(dim=1))
        # output at this point is batch_Size, classes
        return output, hs, cs

    def forward(self, 
                encoder_hidden_state, 
                encoder_cell_state,
                max_seq_length: int,
                batch_size: int = None, 
                target: torch.Tensor = None):
        
        if target is None and batch_size is None:
            raise ValueError(f"either the 'batch_size' or the 'target' arguments must be explicitly passed. Both of them are {None}")

    
        batch_size = target.dim(0) if target is not None else batch_size
        # the first input is of the size (batch_size, L = 1, input_size = hidden_size)
        # according to the documentation of the nn.embedding layer, padding_idx are initialized to zero_values
        # we are using -1 as the label that represets 
        decoder_input = torch.empty(size=(batch_size, 1, 1), dtype=torch.float).fill_(value=-1).to(DEVICE)

        decoder_hidden_state = encoder_hidden_state
        decoder_cell_state = encoder_cell_state
        decoder_outputs = []

        for i in range(max_seq_length):
            decoder_output, decoder_hidden_state, decoder_cell_state  = self.forward_step(decoder_input, decoder_hidden_state, decoder_cell_state)
            # decoder_ouput will be of the shape (batch_size, num_classes)
            decoder_outputs.append(decoder_output.unsqueeze(dim=1))

            if target is not None:
                # using the target tensor is a technique known as Teacher Forcing
                # the target is expected to be of shape: (batch, L) (as each label is uni dimensional)
                decoder_input = target[:, i].unsqueeze(dim=-1)
            else:
                _, best_prediction =  decoder_output.topk(1)
                # detach (so that the error from the previous output is not propagated further to the rest of the sequence)
                # + set to float, as most optimizers work with float data (mainly as input)
                decoder_input = best_prediction.unsqueeze(dim=-1).detach().to(torch.float)  
        
        # the final output should be (batch_size, max_seq_length, classes)
        # each element inside the list is of shape: (batch_size, 1, classes)
        # they should be stacked according to dim = 1
        decoder_outputs = torch.cat(decoder_outputs, dim=1)
        # reduce to classes predictions
        # decoder_outputs = F.log_softmax(decoder_outputs, dim=-1) 
        return decoder_outputs, decoder_hidden_state, decoder_cell_state
    

# training

In [36]:
# import optimizer
from torch.optim import Adam
encoder = EncoderRNN(embedding_dim=100, hidden_size=50)
# the output of the encoder
decoder = DecoderRNN(hidden_size=50, output_size=len(lab2idx) + 1)

e_opt = Adam(encoder.parameters(), lr=0.01)
d_opt = Adam(decoder.parameters(), lr=0.01)

criterion = nn.CrossEntropyLoss()



In [37]:
from typing import Dict, List, Tuple, Optional, Union

def train_per_epoch(encoder: EncoderRNN,
                    decoder: DecoderRNN, 
                    train_dataloader: DataLoader[torch.tensor],
                    loss_function: nn.Module,
                    e_opt: torch.optim, 
                    d_opt: torch.optim,
                    device: str = DEVICE,
                    ) -> Dict[str, float]:

    # set both components to 'train' mode
    encoder.train()
    decoder.train()

    # set both components to the right 'device'
    encoder.to(device)
    decoder.to(device)

    # set the train loss and metrics
    train_loss, train_acc = 0, 0

    for _, (x, y) in enumerate(train_dataloader):
        # first set both optimizers to zero gradients
        e_opt.zero_grad()
        d_opt.zero_grad()

        # extract the batch size, sequence length (with padding) 
        batch_size, seq_length = x.shape

        _, hidden_state, cell_state = encoder.forward(x)
        # pass the outputs of the encoder to the decoder
        decoder_outputs, _ , _ = decoder.forward(hidden_state, cell_state, max_seq_length=seq_length, batch_size=batch_size)

        # the decoder's outputs are expected to be of shape (batch_size, seq_length, num_classes)
        loss = torch.zeros(size=(batch_size,))
        for seq_index in range(seq_length):
            output_index = decoder_outputs[seq_index]
            y_index = y[seq_index].squeeze().to(torch.long)

            seq_loss = loss_function(output_index, y_index)
            loss = torch.add(loss, seq_loss)
        
        # average the loss accross the batch
        loss /= batch_size
        
        train_loss += loss.item()

        # take a backward step to calculate the gradients
        loss.backward()
        # optimize both for encoder and decoder
        e_opt.step()
        d_opt.step()

        y_pred = decoder_outputs.argmax(dim=-1)
        train_acc += (y_pred == y).type(torch.float32).mean().item()

    train_acc = train_acc / len(train_dataloader)
    train_loss = train_loss / len(train_dataloader)

    return train_loss, train_acc

In [38]:
def val_per_epoch(encoder: EncoderRNN,
                  decoder: DecoderRNN,
                  dataloader: DataLoader[torch.tensor],
                  loss_function: nn.Module,
                  ) -> Dict[str, float]:

    val_loss, val_acc = 0, 0
    # set both components to 'train' mode
    encoder.eval()
    decoder.eval()

    # set both components to the right 'device'
    encoder.to(DEVICE)
    decoder.to(DEVICE)

    # Turn on inference context manager
    with torch.inference_mode():
        # Loop through DataLoader batches
        for _, (x, y) in enumerate(dataloader):
            # extract the batch size, sequence length (with padding) 
            batch_size, seq_length = x.shape

            rnn_output, hidden_state, cell_state = encoder.forward(x)
            # pass the outputs of the encoder to the decoder
            decoder_outputs, _ , _ = decoder.forward(rnn_output, hidden_state, cell_state, max_seq_length=seq_length, batch_size=batch_size)

            # the decoder's outputs are expected to be of shape (batch_size, seq_length, num_classes)
            loss = torch.zeros(size=(batch_size,))
            for seq_index in range(seq_length):
                output_index = decoder_outputs[seq_index]
                y_index = y[index].squeeze().to(torch.long)
                # iterate through each sequence in the batch
                seq_loss = loss_function(output_index, y_index)
                loss = torch.add(loss, seq_loss)
                    
            # average the loss accross the batch
            loss /= batch_size
            
            val_loss += loss.item()

            y_pred = decoder_outputs.argmax(dim=-1)
            val_acc += (y_pred == y).type(torch.float32).mean().item()

    # average by epoch
    val_acc = val_acc / len(dataloader)
    val_loss = val_loss / len(dataloader)

    return val_loss, val_acc

In [39]:
from src.pytorch_modular.image_classification import utilities as ut, engine_classification as cls
from tqdm import tqdm

def train_model(
                encoder: EncoderRNN, 
                decoder: DecoderRNN,
                train_dataloader: DataLoader[torch.Tensor],
                test_dataloader: DataLoader[torch.Tensor],
                loss_function,
                e_opt, 
                d_opt,
                epochs: int = 5,
                log_dir: Optional[Union[Path, str]] = None,
                save_path: Optional[Union[Path, str]] = None,
                ):

    save_path = save_path if save_path is not None else log_dir

    performance_dict = {ut.TRAIN_LOSS: [],
                        ut.VAL_LOSS: []}

    # best_model, best_loss = None, None
    min_training_loss, no_improve_counter, best_model = float('inf'), 0, None

    # before proceeding with the training, let's set the summary writer
    writer = None if log_dir is None else cls.create_summary_writer(log_dir)

    for _ in tqdm(range(epochs)):
        epoch_train_loss, epoch_train_acc = train_per_epoch(encoder=encoder,
                                                            decoder=decoder,
                                                            e_opt=e_opt,
                                                            d_opt=d_opt,
                                                            train_dataloader=train_dataloader,
                                                            loss_function=criterion)

        epoch_val_loss, epoch_val_acc = val_per_epoch(encoder=encoder,
                                                      decoder=decoder,
                                                      dataloader=test_dataloader,
                                                      loss_function=loss_function)

        epoch_train_metrics = {'train_loss': epoch_train_loss, 'train_accuracy': epoch_train_acc}
        epoch_val_metrics = {'val_loss': epoch_val_loss, 'val_accuracy': epoch_val_acc}

        no_improve_counter = no_improve_counter + 1 if min_training_loss < epoch_train_loss else 0

        if min_training_loss > epoch_train_loss:
            # save the model with the lowest training error
            min_training_loss = epoch_train_loss

            cls._report_performance(epoch_train_loss,
                                epoch_val_loss,
                                epoch_train_metrics,
                                epoch_val_metrics)

        # save the model's performance for this epoch
        cls._track_performance(performance_dict=performance_dict,
                           train_loss=epoch_train_loss,
                           val_loss=epoch_val_loss,
                           train_metric=epoch_train_metrics,
                           val_metrics=epoch_val_metrics)

        cls._set_summary_writer(writer,
                            epoch_train_loss=epoch_train_loss,
                            epoch_val_loss=epoch_val_loss,
                            epoch_train_metrics=epoch_train_metrics,
                            epoch_val_metrics=epoch_val_metrics,
                            epoch=_
                            )

        # abort training if 2 conditions were met:
        # 1. NO_IMPROVE_STOP is larger than the minimum value
        # 2. the training loss did not decrease for consecutive NO_IMPROVE_STOP epochs

        # if ut.MIN_NO_IMPROVE_STOP <= train_configuration[ut.NO_IMPROVE_STOP] <= no_improve_counter:
        #     warnings.warn(f"The training loss did not improve for {no_improve_counter} consecutive epochs."
        #                   f"\naborting training!!", category=RuntimeWarning)
        #     break

    # if log_dir is not None:
    #     save_info(save_path=log_dir, details=details)

    # if save_path is not None:
    #     save_model(best_model, path=save_path)

    return performance_dict


In [41]:
CUDA_LAUNCH_BLOCKING="1"
# shit
# let's see how the model trains
train_model(encoder=encoder, 
            decoder=decoder,
            loss_function=criterion, 
            train_dataloader=train_dl, 
            test_dataloader=val_dl,
            e_opt=e_opt,
            d_opt=d_opt,
            )

  0%|          | 0/5 [00:00<?, ?it/s]


RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
