# Week 39 - Sequence Supervised Classifier

## 1. Setup

### 1.1. Libraries

#### 1.1.1. New Libraries

In [6]:
!python --version
!pip3 install nltk                                      # new libraries
!pip install datasets==2.2.1 transformers==4.19.1
!pip3 install bnlp-toolkit                              # Bengali_Tokenization
!pip3 install transformers[torch]                       # hyperparameters
!pip3 install bpemb                                     # pretrain word embeddings
!pip install evaluate                                   # evaluate model huggingface
!pip install seqeval                                    # token classification metric

Python 3.10.12
Collecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: evaluate
Successfully installed evaluate-0.4.1
Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16162 sha256=6f349f7c87a6b3b9bda1ca50796d511e210196e82b7fb2f6c2d12ebb3eb801d4
  Stored in directory: /root/.cache/pip/wheels/1a/67/4a/ad4082dd7dfc30f2abfe4d80a2ed5926a506eb8a972b4767fa
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2


#### 1.1.2. Load Libraries

In [19]:
from datasets import load_dataset                                                    # library to import data from huggingface
import warnings                                                                      # ignore warnings in printing
warnings.filterwarnings("ignore")
from torch import nn                                                                 # neural networks
import torch                                                                         # torch for managing special python objects
import numpy as np                                                                   # library for math operations and matrices
import random                                                                        # library for replicating results
from typing import List, Tuple                                                       # library format functionsin dataloader and torch objects
from torch.utils.data import Dataset, DataLoader                                     # library dataloader and dataset in training nn
import heapq                                                                         # beam searching for finding the most likely sequence
from sklearn.metrics import precision_recall_fscore_support, confusion_matrix        # evaluation metrics
from torch.optim import Adam                                                         # optimizer
from tqdm.notebook import tqdm                                                       # print progress loop
from google.colab import drive                                                       # google colab
drive.mount('/content/drive')
from sklearn.metrics import f1_score, accuracy_score                                 # f1_score, accuracy
from transformers import AdamW, AutoTokenizer                                        # transformer: optimizer, tokenizer (pre-train model), model
from transformers import AutoModelForTokenClassification                             # Token Classification
from transformers import DataCollatorForTokenClassification                          # Data Collator for token classification
from transformers import TrainingArguments                                           # Hyperparametes
from transformers import Trainer                                                     # Trainer
import evaluate
from datasets import load_metric                                                     # Evaluation metric
from functools import partial

#import pandas as pd                                                                  # library to transform to dataframe. helps for statistics
#from bpemb import BPEmb                                                              # embeddings
#from sklearn.linear_model import LogisticRegression                                  # model
#from sklearn.metrics import classification_report                                    # classification report binary clasiffier
#from sklearn.metrics import recall_score, precision_score                            # recall_score, precision_score
#from torch.utils.data import Dataset, DataLoader                                     # torch for managing special data types
#from typing import List, Tuple                                                       # data structures in outputs
#from tqdm.notebook import tqdm                                                       # show progress of the loop
#
#from transformers import AdamW, AutoTokenizer, AutoModelForSequenceClassification    # transformer: optimizer, tokenizer (pre-train model), model
#from transformers import BertForSequenceClassification                               # Load trained model
#from transformers import DataCollatorWithPadding                                     # for padding in batches
#from datasets import load_metric                                                     # Evaluation metric
#

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### 1.2. Data

#### 1.2.1. Read Data

In [8]:
# define languages for the project
languages = ['arabic', 'bengali','indonesian']

# load training dataset
datasets_train = load_dataset("copenlu/answerable_tydiqa", split='train')
# load validation dataset
datasets_val = load_dataset("copenlu/answerable_tydiqa", split='validation')
# set gpu if available
device = "cuda:0" if torch.cuda.is_available() else "cpu"

def enforce_reproducibility(seed=42):
    # Sets seed manually for both CPU and CUDA
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # For atomic operations there is currently
    # no simple way to enforce determinism, as
    # the order of parallel operations is not known.
    # CUDNN
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    # System based
    random.seed(seed)
    np.random.seed(seed)
enforce_reproducibility()



In [None]:
# filter languages
train_data = [(language_i, datasets_train.filter(lambda dataset: dataset['language']==language_i)) for language_i in languages]
val_data = [(language_i, datasets_val.filter(lambda dataset: dataset['language']==language_i)) for language_i in languages]



  0%|          | 0/117 [00:00<?, ?ba/s]

  0%|          | 0/117 [00:00<?, ?ba/s]

  0%|          | 0/117 [00:00<?, ?ba/s]

  0%|          | 0/14 [00:00<?, ?ba/s]

  0%|          | 0/14 [00:00<?, ?ba/s]

  0%|          | 0/14 [00:00<?, ?ba/s]

## 2. Sequence Labelers Supervised Models

### 2.1. Encoder-Decoder Model

#### 2.1.1. Embeddings

In [None]:
# embeddings
from bpemb import BPEmb
dim_ = 100                                         # embedding vector size
vocabulary_ = 100000                               # size vocabulary

bpe_models = {
    languages[0]: BPEmb(lang='ar', dim=dim_, vs = vocabulary_),
    languages[1]: BPEmb(lang='bn', dim=dim_, vs = vocabulary_),
    languages[2]: BPEmb(lang='id', dim=dim_, vs = vocabulary_)
}

PAD_id = vocabulary_
EOS_id = vocabulary_+1
SOS_id = vocabulary_+2


downloading https://nlp.h-its.org/bpemb/ar/ar.wiki.bpe.vs100000.model


100%|██████████| 2383518/2383518 [00:00<00:00, 2747586.07B/s]


downloading https://nlp.h-its.org/bpemb/ar/ar.wiki.bpe.vs100000.d100.w2v.bin.tar.gz


100%|██████████| 38037405/38037405 [00:02<00:00, 13110985.74B/s]


downloading https://nlp.h-its.org/bpemb/bn/bn.wiki.bpe.vs100000.model


100%|██████████| 2943332/2943332 [00:01<00:00, 2914699.96B/s]


downloading https://nlp.h-its.org/bpemb/bn/bn.wiki.bpe.vs100000.d100.w2v.bin.tar.gz


100%|██████████| 38121170/38121170 [00:03<00:00, 11533421.33B/s]


downloading https://nlp.h-its.org/bpemb/id/id.wiki.bpe.vs100000.model


100%|██████████| 1959924/1959924 [00:00<00:00, 2237802.79B/s]


downloading https://nlp.h-its.org/bpemb/id/id.wiki.bpe.vs100000.d100.w2v.bin.tar.gz


100%|██████████| 37930291/37930291 [00:04<00:00, 9099070.60B/s] 


In [None]:
def span_answer_document(df_list_annotations = [], df_list_documents = []):
  """Check whether the answer of a question is fully contained"""
  output = []
  for x,y in zip(df_list_annotations, df_list_documents):
    if x['answer_text'][0] == '':
      output.append(0)
    else:
      start_position = x['answer_start'][0]
      end_position = start_position + len(x['answer_text'][0])
      if end_position > len(y):
        output.append(0)
      else:
        output.append(1)
  return output

def oracle(df_list_annotations = []):
  """Check whether a question has an answer"""
  return [0 if x['answer_text'][0] == '' else 1 for x in df_list_annotations]


answerable_train = oracle(datasets_train['annotations'])
datasets_train = datasets_train.add_column("label", answerable_train)

answerable_val = oracle(datasets_val['annotations'])
datasets_val = datasets_val.add_column("label", answerable_val)

answerable_fully_document_train = span_answer_document(datasets_train['annotations'], datasets_train['document_plaintext'])
datasets_train = datasets_train.add_column("full_answer_document", answerable_fully_document_train)

answerable_fully_document_val = span_answer_document(datasets_val['annotations'], datasets_val['document_plaintext'])
datasets_val = datasets_val.add_column("full_answer_document", answerable_fully_document_val)

In [None]:
print(sum(datasets_train['label']));sum(datasets_train['full_answer_document'])

58059


58059

In [None]:
print(sum(datasets_val['label']));sum(datasets_val['full_answer_document'])

6665


6665

#### 2.1.2. Model

##### 2.1.2.1. Encoder

In [None]:
class EncoderRNN(nn.Module):
    """
    RNN Encoder model.
    """
    def __init__(self,
            pretrained_embeddings: torch.tensor,
            lstm_dim: int,
            dropout_prob: float = 0.1):
        """
        Initializer for EncoderRNN network
        :param pretrained_embeddings: A tensor containing the pretrained embeddings
        :param lstm_dim: The dimensionality of the LSTM network
        :param dropout_prob: Dropout probability
        """
        # First thing is to call the superclass initializer
        super(EncoderRNN, self).__init__()

        # We'll define the network in a ModuleDict, which makes organizing the model a bit nicer
        # The components are an embedding layer, and an LSTM layer.
        self.model = nn.ModuleDict({
            'embeddings': nn.Embedding.from_pretrained(pretrained_embeddings, padding_idx=PAD_id),
            'lstm': nn.LSTM(pretrained_embeddings.shape[1],
                            lstm_dim,
                            2,
                            batch_first=True,
                            dropout=dropout_prob,
                            bidirectional=True,
                            dtype=torch.float64),
                            })
        # Initialize the weights of the model
        self._init_weights()

    def _init_weights(self):
        all_params = list(self.model['lstm'].named_parameters())
        for n, p in all_params:
            if 'weight' in n:
                nn.init.xavier_normal_(p)
            elif 'bias' in n:
                nn.init.zeros_(p)

    def forward(self, inputs, input_lens):
        """
        Defines how tensors flow through the model
        :param inputs: (b x sl) The IDs into the vocabulary of the input samples
        :param input_lens: (b) The length of each input sequence
        :return: (lstm output state, lstm hidden state)
        """
        embeds = self.model['embeddings'](inputs)
        lstm_in = nn.utils.rnn.pack_padded_sequence(
                    embeds,
                    input_lens.cpu(),
                    batch_first=True,
                    enforce_sorted=False
                )
        #lstm_in_2 = lstm_in.data.to(torch.float32)  # Convert data to torch.float32

        #lstm_out, hidden_states = self.model['lstm'](lstm_in_2)
        lstm_out, hidden_states = self.model['lstm'](lstm_in)
        lstm_out, _ = nn.utils.rnn.pad_packed_sequence(lstm_out, batch_first=True)
        return lstm_out, hidden_states
        #return lstm_in

##### 2.1.2.2. Decoder

In [None]:
class DecoderRNN(nn.Module):
    """
    RNN Decoder model.
    """
    def __init__(self, pretrained_embeddings: torch.tensor,
            lstm_dim: int,
            dropout_prob: float = 0.1,
            n_classes: int = 2):
        """
        Initializer for DecoderRNN network
        :param pretrained_embeddings: A tensor containing the pretrained embeddings
        :param lstm_dim: The dimensionality of the LSTM network
        :param dropout_prob: Dropout probability
        :param n_classes: Number of prediction classes
        """
        # First thing is to call the superclass initializer
        super(DecoderRNN, self).__init__()
        # We'll define the network in a ModuleDict, which makes organizing the model a bit nicer
        # The components are an embedding layer, a LSTM layer, and a feed-forward output layer
        self.model = nn.ModuleDict({
            'embeddings': nn.Embedding.from_pretrained(pretrained_embeddings, padding_idx=PAD_id),
            'lstm': nn.LSTM(pretrained_embeddings.shape[1],
                            lstm_dim,
                            2,
                            bidirectional=True,
                            dropout=dropout_prob,
                            batch_first=True,
                            dtype=torch.float64),
            'nn': nn.Linear(lstm_dim*2, n_classes, dtype=torch.float64),
        })
        # Initialize the weights of the model
        self._init_weights()
        self.dropout = nn.Dropout(p=dropout_prob)


    def _init_weights(self):
        all_params = list(self.model['lstm'].named_parameters()) + list(self.model['nn'].named_parameters())
        for n, p in all_params:
            if 'weight' in n:
                nn.init.xavier_normal_(p)
            elif 'bias' in n:
                nn.init.zeros_(p)


    def forward(self, inputs, hidden, input_lens):
        """
        Defines how tensors flow through the model
        :param inputs: (b x sl) The IDs into the vocabulary of the input samples
        :param hidden: (b) The hidden state of the previous step
        :param input_lens: (b) The length of each input sequence
        :return: (output predictions, lstm hidden states) the hidden states will be used as input at the next step
        """
        embeds = self.model['embeddings'](inputs)

        lstm_in = nn.utils.rnn.pack_padded_sequence(
                    embeds,
                    input_lens.cpu(),
                    batch_first=True,
                    enforce_sorted=False
                )
        #lstm_in_2 = lstm_in.data.to(torch.float32)  # Convert data to torch.float32
        lstm_out, hidden_states = self.model['lstm'](lstm_in, hidden)
        lstm_out, _ = nn.utils.rnn.pad_packed_sequence(lstm_out, batch_first=True)
        # Max pool along the last dimension
        features_lstm = self.dropout(torch.max(lstm_out, 0)[0])
        output = self.model['nn'](features_lstm)
        return output, hidden_states
        #return features_lstm
        #return lstm_in

##### 2.1.2.3. Sequence to Sequence Model

In [None]:
# Define the model
class Seq2Seq(nn.Module):
    """
    Basic Seq2Seq network
    """
    def __init__(
            self,
            pretrained_embeddings: torch.tensor,
            lstm_dim: int,
            dropout_prob: float = 0.1,
            n_classes: int = 2
    ):
        """
        Initializer for basic Seq2Seq network
        :param pretrained_embeddings: A tensor containing the pretrained embeddings
        :param lstm_dim: The dimensionality of the LSTM network
        :param dropout_prob: Dropout probability
        :param n_classes: The number of output classes
        """

        # First thing is to call the superclass initializer
        super(Seq2Seq, self).__init__()

        # We'll define the network in a ModuleDict, which consists of an encoder and a decoder
        self.model = nn.ModuleDict({
            'encoder': EncoderRNN(pretrained_embeddings, lstm_dim, dropout_prob),
            'decoder': DecoderRNN(pretrained_embeddings, lstm_dim, dropout_prob, n_classes),
        })
        self.loss = nn.CrossEntropyLoss()


    def forward(self, inputs, input_lens, labels=None):
        """
        Defines how tensors flow through the model.
        For the Seq2Seq model this includes 1) encoding the whole input text,
        and running *target_length* decoding steps to predict the tag of each token.

        :param inputs: (b x sl) The IDs into the vocabulary of the input samples
        :param input_lens: (b) The length of each input sequence
        :param labels: (b) The label of each sample
        :return: (loss, logits) if `labels` is not None, otherwise just (logits,)
        """

        # Get embeddings (b x sl x embedding dim)
        encoder_output, encoder_hidden = self.model['encoder'](inputs, input_lens)
        decoder_hidden = encoder_hidden  # All Context Encoded
        #decoder_input = torch.tensor([SOS_id]*inputs.shape[0], device=device)
        #decoder_input = labels[:, 0].unsqueeze(-1) # Label associated with first position
        decoder_input = inputs[:, 0].unsqueeze(-1)

        mask = (decoder_input != PAD_id)  # Create a mask to identify non-padding elements
        input_lens_step = mask.sum(1)  # Compute sequence lengths for each time step

        target_length = labels.size(1)

        loss = None

        for di in range(1,target_length):
            decoder_output, decoder_hidden = self.model['decoder'](
                decoder_input, decoder_hidden, input_lens_step)

            if loss == None:
                loss = self.loss(decoder_output.squeeze(1), labels[:, di])
            else:
                loss += self.loss(decoder_output.squeeze(1), labels[:, di])
            # Teacher forcing: Feed the target as the next input
            decoder_input = labels[:, di].unsqueeze(-1)
            mask = (decoder_input != PAD_id)  # Create a mask to identify non-padding elements
            input_lens_step = mask.sum(1)  # Compute sequence lengths for each time step

        return loss / target_length

#### 2.1.3. Model Set Up

##### 2.1.3.1. Functions Preprocessing Data

In [None]:
def offset_mapping_manual(tokens):
  """Get the token boundaries (start, end)"""
  token_boundaries = []
  start = 0
  for token in tokens:
      end = start + len(token)
      token_boundaries.append((start, end-1))
      start = end
  return token_boundaries

def end_answer(list_annotations=[]):
  """ Check where the answer ends"""
  end_list = []
  for x in list_annotations:
    start = x['answer_start'][0]
    lenght_ = len(x['answer_text'][0])

    if start>0:
      end = [start + lenght_]
    else:
      end = [start]
    end_list.append(end)
  return end_list

def list_dummies_start_function(list_annotations = [], list_offset = []):
  """Look for the token position (touple) where the answer starts"""
  list_dummies_start = []
  for x,y in zip(list_annotations, list_offset):
    start = x['answer_start'][0]
    list_verification = []
    for element_first, element_second in y:
      if element_first<=start and element_second>=start:
        list_verification.append(1)
      else:
        list_verification.append(0)
    list_dummies_start.append(list_verification)
  return list_dummies_start

def list_dummies_end_function(list_end_answer = [], list_offset = []):
  """Look for the token position (touple) where the answer ends"""
  list_dummies_end = []
  for x,y in zip(list_end_answer, list_offset):
    end = x[0]
    list_verification = []

    for element_first, element_second in y:
      if element_first<=end and element_second>=end:
        list_verification.append(1)
      else:
        list_verification.append(0)

    list_dummies_end.append(list_verification)
  return list_dummies_end

def sequence_dummies_documments(list_dummies_start=[], list_dummies_end=[]):
  """ Get dummies for all tokens between the start and the end tokens"""
  condition = 1
  list_output = []
  for x, y in zip(list_dummies_start, list_dummies_end):
    indices_start = [i for i, x in enumerate(x) if x == condition]
    indices_end = [i for i, x in enumerate(y) if x == condition]
    output = np.repeat(0,len(x))
    if len(indices_start)>0:
      for index, value in enumerate(output):
        try:
          if index>=indices_start[0] and index<=indices_end[0]:
            output[index] = 1
        except:
          output[index] = 0
    list_output.append(list(output))
  return list_output

##### 2.1.3.2. Pytorch input format and data loader

In [None]:
# Define input format for each row in the neural network

def text_to_batch_bilstm(text_question: List,
                         text_document: List,
                         tokenizer, max_len=512,
                         id_token_eos = EOS_id,
                         id_token_sos = SOS_id) -> Tuple[List, List]:
    """
    Creates a tokenized batch for input to a bilstm model
    :param text: A list of sentences to tokenize
    :param tokenizer: A tokenization function to use (i.e. fasttext)
    :return: Tokenized text as well as the length of the input sequence
    """
    # Some light preprocessing
    #input_ids = [id_token_eos]+[tokenizer.encode_ids(t) for t in text_question]+[id_token_eos]+[tokenizer.encode_ids(t) for t in text_document]
    input_ids = []
    for x, y in zip(text_question, text_document):
      input_ids.append([id_token_sos]+tokenizer.encode_ids(x)+[id_token_eos]+tokenizer.encode_ids(y))
    return input_ids, [len(ids) for ids in input_ids]


# This will load the dataset and process it
class ClassificationDatasetReader(Dataset):
  def __init__(self, df, tokenizer, column_text_questions, column_text_documents, column_label):
    self.df = df
    self.tokenizer = tokenizer
    self.column_text_questions = column_text_questions
    self.column_text_documents = column_text_documents
    self.column_label = column_label


  def __len__(self):
    return len(self.df)

  def __getitem__(self, idx):
    row = self.df[idx]
    # Calls the text_to_batch function
    input_ids_column_text_together, seq_lens_column_text_together = text_to_batch_bilstm(text_question = [row[self.column_text_questions]],
                                                                                         text_document = [row[self.column_text_documents]],
                                                                                         tokenizer = self.tokenizer)

    label = row[self.column_label]

    return input_ids_column_text_together, seq_lens_column_text_together, label


# Prepare data for pytorch object
# Asumes the output from text_to_batch_bilstm
def collate_batch_bilstm(input_data: Tuple,
                         id_pad = PAD_id):

    """
    Combines multiple data samples into a single batch
    :param input_data: The combined input_ids, seq_lens, and labels for the batch
    :return: A tuple of tensors (input_ids, seq_lens, labels)
    """
    input_ids_ = [i[0][0] for i in input_data]
    seq_lens_ = [i[1][0] for i in input_data]
    labels = [i[2] for i in input_data]

    # Pad all of the input samples to the max length question
    max_length = max([len(i) for i in input_ids_])
    input_ids_ = [(i + [id_pad] * (max_length - len(i))) for i in input_ids_]
    labels = [(i + [0] * (max_length - len(i))) for i in labels]                  # 0 ANSWER IN PADDINGS


    # Make sure each sample is max_length long
    assert (all(len(i) == max_length for i in input_ids_))
    assert (all(len(i) == max_length for i in labels))

    return torch.tensor(input_ids_, dtype=torch.int64), torch.tensor(seq_lens_, dtype=torch.int64), torch.tensor(labels)

##### 2.1.3.3. Functions Train Neural Network

In [None]:
PATH = "/content/drive/MyDrive/model_lstm_seq"
def train(
    model: nn.Module,
    train_dl: DataLoader,
    valid_dl: DataLoader,
    optimizer: torch.optim.Optimizer,
    n_epochs: int,
    device: torch.device,
    PATH = PATH
):
    """
    The main training loop which will optimize a given model on a given dataset
    :param model: The model being optimized
    :param train_dl: The training dataset
    :param valid_dl: A validation dataset
    :param optimizer: The optimizer used to update the model parameters
    :param n_epochs: Number of epochs to train for
    :param device: The device to train on
    :return: (model, losses) The best model and the losses per iteration
    """

    # Keep track of the loss and best accuracy
    losses = []
    best_f1 = 0.0

    # Iterate through epochs
    for ep in range(n_epochs):

        loss_epoch = []

        #Iterate through each batch in the dataloader
        for batch in tqdm(train_dl):
            # VERY IMPORTANT: Make sure the model is in training mode, which turns on
            # things like dropout and layer normalization
            model.train()

            # VERY IMPORTANT: zero out all of the gradients on each iteration -- PyTorch
            # keeps track of these dynamically in its computation graph so you need to explicitly
            # zero them out
            optimizer.zero_grad()

            # Place each tensor on the GPU
            batch = tuple(t.to(device) for t in batch)
            input_ids = batch[0]
            labels = batch[2]
            input_lens = batch[1]

            # Pass the inputs through the model, get the current loss and logits
            loss = model(input_ids, labels=labels, input_lens=input_lens)
            losses.append(loss.item())
            loss_epoch.append(loss.item())

            # Calculate all of the gradients and weight updates for the model
            loss.backward()

            # Optional: clip gradients
            #torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            # Finally, update the weights of the model
            optimizer.step()

        # Perform inline evaluation at the end of the epoch
        #f1 = evaluate(model, valid_dl)
        print(f'Train loss: {sum(loss_epoch) / len(loss_epoch)}')
        torch.save(model, PATH)

        # Keep track of the best model based on the accuracy
        #if f1 > best_f1:
        #    torch.save(model.state_dict(), 'best_model')
        #    best_f1 = f1

    return model, losses

In [None]:
softmax = nn.Softmax(dim=-1)

def decode(model, inputs, input_lens, labels=None, beam_size=2):
    """
    Decoding/predicting the labels for an input text by running beam search.

    :param inputs: (b x sl) The IDs into the vocabulary of the input samples
    :param input_lens: (b) The length of each input sequence
    :param labels: (b) The label of each sample
    :param beam_size: the size of the beam
    :return: predicted sequence of labels
    """

    assert inputs.shape[0] == 1
    # first, encode the input text
    encoder_output, encoder_hidden = model.model['encoder'](inputs, input_lens)
    decoder_hidden = encoder_hidden

    # the decoder starts generating after the Begining of Sentence (SOS_id) token
    decoder_input = torch.tensor([tokenizer.encode([SOS_id,]),], device=device)
    target_length = labels.shape[1]

    # we will use heapq to keep top best sequences so far sorted in heap_queue
    # these will be sorted by the first item in the tuple
    heap_queue = []
    heap_queue.append((torch.tensor(0), tokenizer.encode([SOS_id]), decoder_input, decoder_hidden))

    # Beam Decoding
    for _ in range(target_length):
        # print("next len")
        new_items = []
        # for each item on the beam
        for j in range(len(heap_queue)):
            # 1. remove from heap
            score, tokens, decoder_input, decoder_hidden = heapq.heappop(heap_queue)
            # 2. decode one more step
            decoder_output, decoder_hidden = model.model['decoder'](
                decoder_input, decoder_hidden, torch.tensor([1]))
            decoder_output = softmax(decoder_output)
            # 3. get top-k predictions
            best_idx = torch.argsort(decoder_output[0], descending=True)[0]
            # print(decoder_output)
            # print(best_idx)
            for i in range(beam_size):
                decoder_input = torch.tensor([[best_idx[i]]], device=device)

                new_items.append((score + decoder_output[0,0, best_idx[i]],
                                  tokens + [best_idx[i].item()],
                                  decoder_input,
                                  decoder_hidden))
        # add new sequences to the heap
        for item in new_items:
          # print(item)
            heapq.heappush(heap_queue, item)
        # remove sequences with lowest score (items are sorted in descending order)
        while len(heap_queue) > beam_size:
            heapq.heappop(heap_queue)

    final_sequence = heapq.nlargest(1, heap_queue)[0]
    assert labels.shape[1] == len(final_sequence[1][1:])
    return final_sequence

In [None]:
def evaluate(model: nn.Module, valid_dl: DataLoader, beam_size:int = 1):
    """
    Evaluates the model on the given dataset
    :param model: The model under evaluation
    :param valid_dl: A `DataLoader` reading validation data
    :return: The accuracy of the model on the dataset
    """
    # VERY IMPORTANT: Put your model in "eval" mode -- this disables things like
    # layer normalization and dropout
    model.eval()
    labels_all = []
    logits_all = []
    tags_all = []

    # ALSO IMPORTANT: Don't accumulate gradients during this process
    with torch.no_grad():
        for batch in tqdm(valid_dl, desc='Evaluation'):
            batch = tuple(t.to(device) for t in batch)
            input_ids = batch[0]
            input_lens = batch[1]
            labels = batch[2]

            best_seq = decode(model, input_ids, input_lens, labels=labels, beam_size=beam_size)
            mask = (input_ids != 0)
            labels_all.extend([l for seq,samp in zip(list(labels.detach().cpu().numpy()), input_ids) for l,i in zip(seq,samp) if i != 0])
            tags_all += best_seq[1][1:]
            # print(best_seq[1][1:], labels)
    P, R, F1, _ = precision_recall_fscore_support(labels_all, tags_all, average='macro')
    print(confusion_matrix(labels_all, tags_all))
    return F1

#### 2.1.4. Arabic

##### 2.1.4.1. Vocabulary

In [None]:
#parameters
language_ = languages[0]                          # filter language
#lstm_dim = 100                                    # dim neural lstm network

# 0. Choose language
datasets_train_filter = datasets_train.filter(lambda dataset: dataset["language"]==language_)
datasets_val_filter = datasets_val.filter(lambda dataset: dataset["language"]==language_)

print('language:', language_);

# 1. pretrain embeddings for each language
tokenizer = bpe_models[language_]

# 3. add index for padding [PAD], END-OF-SENTENCE [EOS], START-OF-SENTENCE [SOS]
new_tokens=['[PAD]', '[SOS]' ,'[EOS]']
pretrained_embeddings = np.concatenate([bpe_models[language_].emb.vectors,
                                        np.zeros(shape=(len(new_tokens),dim_))], axis=0)
# 4. Extract the vocab and add extra tokeNS
vocabulary = bpe_models[language_].emb.index_to_key + new_tokens



  0%|          | 0/117 [00:00<?, ?ba/s]

  0%|          | 0/14 [00:00<?, ?ba/s]

language: arabic


##### 2.1.4.2. Answer's tokens

In [None]:
# tokens plaindocument to identify tokens of answer
tokens_plaintext_train = [tokenizer.encode(x) for x in datasets_train_filter['document_plaintext']]
datasets_train_filter = datasets_train_filter.add_column("document_plaintext_tokens", tokens_plaintext_train)

tokens_plaintext_val = [tokenizer.encode(x) for x in datasets_val_filter['document_plaintext']]
datasets_val_filter = datasets_val_filter.add_column("document_plaintext_tokens", tokens_plaintext_val)

In [None]:
offset_mapping_manual_train = [offset_mapping_manual(x) for x in datasets_train_filter['document_plaintext_tokens']]
datasets_train_filter = datasets_train_filter.add_column("offset_mapping", offset_mapping_manual_train)

offset_mapping_manual_val = [offset_mapping_manual(x) for x in datasets_val_filter['document_plaintext_tokens']]
datasets_val_filter = datasets_val_filter.add_column("offset_mapping", offset_mapping_manual_val)

In [None]:
end_position_answer_train = end_answer(datasets_train_filter['annotations'])
datasets_train_filter = datasets_train_filter.add_column("end_answer", end_position_answer_train)

end_position_answer_val = end_answer(datasets_val_filter['annotations'])
datasets_val_filter = datasets_val_filter.add_column("end_answer", end_position_answer_val)

In [None]:
list_train_start_dummie = list_dummies_start_function(datasets_train_filter['annotations'], datasets_train_filter['offset_mapping'])
list_train_end_dummie = list_dummies_end_function(datasets_train_filter['end_answer'], datasets_train_filter['offset_mapping'])
list_train_sequence_dummies_documments = sequence_dummies_documments(list_train_start_dummie, list_train_end_dummie)
datasets_train_filter = datasets_train_filter.add_column("sequence_dummies", list_train_sequence_dummies_documments)

In [None]:
list_val_start_dummie = list_dummies_start_function(datasets_val_filter['annotations'], datasets_val_filter['offset_mapping'])
list_val_end_dummie = list_dummies_end_function(datasets_val_filter['end_answer'], datasets_val_filter['offset_mapping'])
list_val_sequence_dummies_documments = sequence_dummies_documments(list_val_start_dummie, list_val_end_dummie)
datasets_val_filter = datasets_val_filter.add_column("sequence_dummies", list_val_sequence_dummies_documments)

In [None]:
print(list_train_start_dummie[10][:20])
print(list_train_end_dummie[10][:20])
print(list_train_sequence_dummies_documments[10][:20])


##### 2.1.4.3. Tokenize Questions Together With Documents and Output Variable

In [None]:
tokens_questiontext_train = [tokenizer.encode(x) for x in datasets_train_filter['question_text']]
datasets_train_filter = datasets_train_filter.add_column("question_text_tokens", tokens_questiontext_train)

tokens_questiontext_val = [tokenizer.encode(x) for x in datasets_val_filter['question_text']]
datasets_val_filter = datasets_val_filter.add_column("question_text_tokens", tokens_questiontext_val)


In [None]:
# Train
tokens_together = []
for x, y in zip(datasets_train_filter['question_text_tokens'], datasets_train_filter['document_plaintext_tokens']):
  tokens_together.append(['[SOS]']+x+['[EOS]']+y)

dummies_together = []
for x, y in zip(datasets_train_filter['question_text_tokens'], datasets_train_filter['sequence_dummies']):
  # we know that the answer is not in the question, so we assign 0 to special tokens and tokens from the question
  dummies_together.append([0]+list(np.repeat(0,len(x)))+[0]+y)

datasets_train_filter = datasets_train_filter.add_column("tokens_together", tokens_together)
datasets_train_filter = datasets_train_filter.add_column("dummies_together", dummies_together)


In [None]:
# Val
tokens_together = []
for x, y in zip(datasets_val_filter['question_text_tokens'], datasets_val_filter['document_plaintext_tokens']):
  tokens_together.append(['[SOS]']+x+['[EOS]']+y)

dummies_together = []
for x, y in zip(datasets_val_filter['question_text_tokens'], datasets_val_filter['sequence_dummies']):
  dummies_together.append([0]+list(np.repeat(0,len(x)))+[0]+y)

datasets_val_filter = datasets_val_filter.add_column("tokens_together", tokens_together)
datasets_val_filter = datasets_val_filter.add_column("dummies_together", dummies_together)


In [None]:
print(datasets_train_filter['dummies_together'][0][0:30])

##### 2.1.4.4. Pytorch input format

In [None]:
datasets_train_filter.shape[0]

In [None]:
index_row = range(0,datasets_train_filter.shape[0])
datasets_train_filter = datasets_train_filter.add_column("index", index_row)

In [None]:
# sample arabic, was too long and golab failed many times
datasets_train_filter = datasets_train_filter.filter(lambda dataset: dataset["index"]<11000)
datasets_train_filter


In [None]:
batch_size = 1

# Create the dataset readers
train_dataset = ClassificationDatasetReader(datasets_train_filter,
                                            tokenizer=tokenizer,
                                            column_text_questions='question_text',
                                            column_text_documents='document_plaintext',
                                            column_label='dummies_together')
# dataset loaded lazily with N workers in parallel
train_dl = DataLoader(train_dataset, batch_size=batch_size, shuffle=True,
                      collate_fn=collate_batch_bilstm, num_workers=8)

# Create the dataset readers
val_dataset = ClassificationDatasetReader(datasets_val_filter,
                                            tokenizer=tokenizer,
                                            column_text_questions='question_text',
                                            column_text_documents='document_plaintext',
                                            column_label='dummies_together')
# dataset loaded lazily with N workers in parallel
valid_dl = DataLoader(val_dataset, batch_size=batch_size, shuffle=True,
                      collate_fn=collate_batch_bilstm, num_workers=8)

##### 2.1.4.5. Train

In [None]:
# Define some hyperparameters
lstm_dim = 100
dropout_prob = 0.1
batch_size = 1
lr = 1e-3
n_epochs = 1
n_workers = 8
# Define Model

model = Seq2Seq(
    pretrained_embeddings=torch.from_numpy(pretrained_embeddings),
    lstm_dim=lstm_dim,
    n_classes=2,
    dropout_prob=dropout_prob
).to(device)

train_dl = DataLoader(train_dataset, batch_size=batch_size, shuffle=True,
                      collate_fn=collate_batch_bilstm, num_workers=8)

valid_dl = DataLoader(val_dataset, batch_size=batch_size, shuffle=True,
                      collate_fn=collate_batch_bilstm, num_workers=8)


# Create the optimizer
optimizer = Adam(model.parameters(), lr=lr)

# Train
model_train, losses = train(model, train_dl, valid_dl, optimizer, n_epochs, device)

In [None]:
model_train

In [None]:
PATH = "Week 39/(BILSM) ENCODER-DECODER MODEL/model_encoder_decoder_seq_1_ARABIC"
torch.save(model_train,PATH)

##### 2.1.4.6. Test

In [None]:
PATH = 'Week 39/(BILSM) ENCODER-DECODER MODEL/model_encoder_decoder_seq_1_ARABIC'
# Need to define the class again
model_train = torch.load(PATH)
softmax = nn.Softmax(dim=-1)

In [None]:
#predictions
predictions_all = []
#real labels
labels_all = []

beam_size=2
model = model_train

with torch.no_grad():

    for batch in tqdm(valid_dl):
        #batch = (t.to(device) for t in batch)
        input_ids = torch.tensor(batch[0]).to("cuda")
        input_lens = torch.tensor(batch[1]).to("cuda")
        labels = torch.tensor(batch[2]).to("cuda")

        encoder_output, encoder_hidden = model.model['encoder'](input_ids, input_lens)
        decoder_hidden = encoder_hidden

        # the decoder starts generating after the Begining of Sentence (SOS_id) token
        decoder_input = torch.tensor([SOS_id], device=device).unsqueeze(-1)
        target_length = labels.shape[1]

        # we will use heapq to keep top best sequences so far sorted in heap_queue
        # these will be sorted by the first item in the tuple
        heap_queue = []
        heap_queue.append((torch.tensor(0),[SOS_id], decoder_input, decoder_hidden))
        #heap_queue.append((torch.tensor(0), torch.tensor([SOS_id]), decoder_input, decoder_hidden))


        # Beam Decoding
        for _ in range(target_length-1):
            #print("next len")
            new_items = []
            # for each item on the beam
            for j in range(len(heap_queue)):
                # 1. remove from heap
                score, tokens, decoder_input, decoder_hidden = heapq.heappop(heap_queue)
                # 2. decode one more step
                decoder_output, decoder_hidden = model.model['decoder'](
                    decoder_input, decoder_hidden, torch.tensor([1]))
                decoder_output_soft = softmax(decoder_output)
                # 3. get top-k predictions
                best_idx = torch.argsort(decoder_output_soft[0], descending=True)
                # print(decoder_output)
                # print(best_idx)
                for i in range(beam_size):
                    decoder_input = torch.tensor([[best_idx[i]]], device=device)

                    new_items.append((score + decoder_output[0, best_idx[i]],
                                      tokens + [best_idx[i].item()],
                                      decoder_input,
                                      decoder_hidden))
            # add new sequences to the heap
            for item in new_items:
              # print(item)
                heapq.heappush(heap_queue, item)
            # remove sequences with lowest score (items are sorted in descending order) NO
            while len(heap_queue) > beam_size:
                heapq.heappop(heap_queue)

        final_sequence = heapq.nlargest(1, heap_queue)[0]
        predicted_sequence = [0]+final_sequence[1][1:]
        # Predictions
        predictions_all += predicted_sequence

        # Real
        labels_all += labels.tolist()[0]

In [None]:
confusion_matrix(labels_all, predictions_all)

array([[26389,   467],
       [  337,     8]])

In [None]:
1-(337+8)/((337+8)+(26389+467))

0.9873166427704864

In [None]:
accuracy_score(labels_all, predictions_all)

0.97044226315209

In [None]:
f1_score(labels_all, predictions_all)

0.01951219512195122

#### 2.1.5. Bengali

##### 2.1.5.1. Vocabulary

In [None]:
#parameters
language_ = languages[1]                          # filter language
#lstm_dim = 100                                    # dim neural lstm network

# 0. Choose language
datasets_train_filter = datasets_train.filter(lambda dataset: dataset["language"]==language_)
datasets_val_filter = datasets_val.filter(lambda dataset: dataset["language"]==language_)

print('language:', language_);

# 1. pretrain embeddings for each language
tokenizer = bpe_models[language_]

# 3. add index for padding [PAD], END-OF-SENTENCE [EOS], START-OF-SENTENCE [SOS]
new_tokens=['[PAD]', '[SOS]' ,'[EOS]']
pretrained_embeddings = np.concatenate([bpe_models[language_].emb.vectors,
                                        np.zeros(shape=(len(new_tokens),dim_))], axis=0)
# 4. Extract the vocab and add extra tokeNS
vocabulary = bpe_models[language_].emb.index_to_key + new_tokens

  0%|          | 0/117 [00:00<?, ?ba/s]

  0%|          | 0/14 [00:00<?, ?ba/s]

language: bengali


##### 2.1.5.2. Answer's tokens

In [None]:
# tokens plaindocument to identify tokens of answer
tokens_plaintext_train = [tokenizer.encode(x) for x in datasets_train_filter['document_plaintext']]
datasets_train_filter = datasets_train_filter.add_column("document_plaintext_tokens", tokens_plaintext_train)

tokens_plaintext_val = [tokenizer.encode(x) for x in datasets_val_filter['document_plaintext']]
datasets_val_filter = datasets_val_filter.add_column("document_plaintext_tokens", tokens_plaintext_val)

Flattening the indices:   0%|          | 0/5 [00:00<?, ?ba/s]

Flattening the indices:   0%|          | 0/1 [00:00<?, ?ba/s]

In [None]:
offset_mapping_manual_train = [offset_mapping_manual(x) for x in datasets_train_filter['document_plaintext_tokens']]
datasets_train_filter = datasets_train_filter.add_column("offset_mapping", offset_mapping_manual_train)

offset_mapping_manual_val = [offset_mapping_manual(x) for x in datasets_val_filter['document_plaintext_tokens']]
datasets_val_filter = datasets_val_filter.add_column("offset_mapping", offset_mapping_manual_val)

In [None]:
end_position_answer_train = end_answer(datasets_train_filter['annotations'])
datasets_train_filter = datasets_train_filter.add_column("end_answer", end_position_answer_train)

end_position_answer_val = end_answer(datasets_val_filter['annotations'])
datasets_val_filter = datasets_val_filter.add_column("end_answer", end_position_answer_val)

In [None]:
list_train_start_dummie = list_dummies_start_function(datasets_train_filter['annotations'], datasets_train_filter['offset_mapping'])
list_train_end_dummie = list_dummies_end_function(datasets_train_filter['end_answer'], datasets_train_filter['offset_mapping'])
list_train_sequence_dummies_documments = sequence_dummies_documments(list_train_start_dummie, list_train_end_dummie)
datasets_train_filter = datasets_train_filter.add_column("sequence_dummies", list_train_sequence_dummies_documments)

In [None]:
list_val_start_dummie = list_dummies_start_function(datasets_val_filter['annotations'], datasets_val_filter['offset_mapping'])
list_val_end_dummie = list_dummies_end_function(datasets_val_filter['end_answer'], datasets_val_filter['offset_mapping'])
list_val_sequence_dummies_documments = sequence_dummies_documments(list_val_start_dummie, list_val_end_dummie)
datasets_val_filter = datasets_val_filter.add_column("sequence_dummies", list_val_sequence_dummies_documments)

##### 2.1.5.3. Tokenize Questions Together With Documents and Output Variable

In [None]:
tokens_questiontext_train = [tokenizer.encode(x) for x in datasets_train_filter['question_text']]
datasets_train_filter = datasets_train_filter.add_column("question_text_tokens", tokens_questiontext_train)

tokens_questiontext_val = [tokenizer.encode(x) for x in datasets_val_filter['question_text']]
datasets_val_filter = datasets_val_filter.add_column("question_text_tokens", tokens_questiontext_val)


In [None]:
# Train
tokens_together = []
for x, y in zip(datasets_train_filter['question_text_tokens'], datasets_train_filter['document_plaintext_tokens']):
  tokens_together.append(['[SOS]']+x+['[EOS]']+y)

dummies_together = []
for x, y in zip(datasets_train_filter['question_text_tokens'], datasets_train_filter['sequence_dummies']):
  # we know that the answer is not in the question, so we assign 0 to special tokens and tokens from the question
  dummies_together.append([0]+list(np.repeat(0,len(x)))+[0]+y)

datasets_train_filter = datasets_train_filter.add_column("tokens_together", tokens_together)
datasets_train_filter = datasets_train_filter.add_column("dummies_together", dummies_together)


In [None]:
# Val
tokens_together = []
for x, y in zip(datasets_val_filter['question_text_tokens'], datasets_val_filter['document_plaintext_tokens']):
  tokens_together.append(['[SOS]']+x+['[EOS]']+y)

dummies_together = []
for x, y in zip(datasets_val_filter['question_text_tokens'], datasets_val_filter['sequence_dummies']):
  dummies_together.append([0]+list(np.repeat(0,len(x)))+[0]+y)

datasets_val_filter = datasets_val_filter.add_column("tokens_together", tokens_together)
datasets_val_filter = datasets_val_filter.add_column("dummies_together", dummies_together)


In [None]:
print(datasets_train_filter['dummies_together'][0][0:30])

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


##### 2.1.5.4. Pytorch input format

In [None]:
batch_size = 1

# Create the dataset readers
train_dataset = ClassificationDatasetReader(datasets_train_filter,
                                            tokenizer=tokenizer,
                                            column_text_questions='question_text',
                                            column_text_documents='document_plaintext',
                                            column_label='dummies_together')
# dataset loaded lazily with N workers in parallel
train_dl = DataLoader(train_dataset, batch_size=batch_size, shuffle=True,
                      collate_fn=collate_batch_bilstm, num_workers=8)

# Create the dataset readers
val_dataset = ClassificationDatasetReader(datasets_val_filter,
                                            tokenizer=tokenizer,
                                            column_text_questions='question_text',
                                            column_text_documents='document_plaintext',
                                            column_label='dummies_together')
# dataset loaded lazily with N workers in parallel
valid_dl = DataLoader(val_dataset, batch_size=batch_size, shuffle=True,
                      collate_fn=collate_batch_bilstm, num_workers=8)

##### 2.1.5.5. Train

In [None]:
# Define some hyperparameters
lstm_dim = 100
dropout_prob = 0.1
lr = 1e-3
n_epochs = 10
n_workers = 8
# Define Model

model = Seq2Seq(
    pretrained_embeddings=torch.from_numpy(pretrained_embeddings),
    lstm_dim=lstm_dim,
    n_classes=2,
    dropout_prob=dropout_prob
).to(device)

train_dl = DataLoader(train_dataset, batch_size=batch_size, shuffle=True,
                      collate_fn=collate_batch_bilstm, num_workers=8)

valid_dl = DataLoader(val_dataset, batch_size=batch_size, shuffle=True,
                      collate_fn=collate_batch_bilstm, num_workers=8)


# Create the optimizer
optimizer = Adam(model.parameters(), lr=lr)

# Train
model_train, losses = train(model, train_dl, valid_dl, optimizer, n_epochs, device)

In [None]:
model_train

Seq2Seq(
  (model): ModuleDict(
    (encoder): EncoderRNN(
      (model): ModuleDict(
        (embeddings): Embedding(100003, 100, padding_idx=100000)
        (lstm): LSTM(100, 100, num_layers=2, batch_first=True, dropout=0.1, bidirectional=True)
      )
    )
    (decoder): DecoderRNN(
      (model): ModuleDict(
        (embeddings): Embedding(100003, 100, padding_idx=100000)
        (lstm): LSTM(100, 100, num_layers=2, batch_first=True, dropout=0.1, bidirectional=True)
        (nn): Linear(in_features=200, out_features=2, bias=True)
      )
      (dropout): Dropout(p=0.1, inplace=False)
    )
  )
  (loss): CrossEntropyLoss()
)

In [None]:
PATH = "Week 39/(BILSM) ENCODER-DECODER MODEL/model_encoder_decoder_seq_1_BENGALI"
torch.save(model_train,PATH)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


##### 2.1.5.6. Test

In [None]:
PATH = 'Week 39/(BILSM) ENCODER-DECODER MODEL/model_encoder_decoder_seq_1_BENGALI'
# Need to define the class again
model_train = torch.load(PATH)
softmax = nn.Softmax(dim=-1)

In [None]:
#predictions
predictions_all = []
#real labels
labels_all = []

beam_size=2
model = model_train

with torch.no_grad():

    for batch in tqdm(valid_dl):
        #batch = (t.to(device) for t in batch)
        input_ids = torch.tensor(batch[0]).to("cuda")
        input_lens = torch.tensor(batch[1]).to("cuda")
        labels = torch.tensor(batch[2]).to("cuda")

        encoder_output, encoder_hidden = model.model['encoder'](input_ids, input_lens)
        decoder_hidden = encoder_hidden

        # the decoder starts generating after the Begining of Sentence (SOS_id) token
        decoder_input = torch.tensor([SOS_id], device=device).unsqueeze(-1)
        target_length = labels.shape[1]

        # we will use heapq to keep top best sequences so far sorted in heap_queue
        # these will be sorted by the first item in the tuple
        heap_queue = []
        heap_queue.append((torch.tensor(0),[SOS_id], decoder_input, decoder_hidden))
        #heap_queue.append((torch.tensor(0), torch.tensor([SOS_id]), decoder_input, decoder_hidden))


        # Beam Decoding
        for _ in range(target_length-1):
            #print("next len")
            new_items = []
            # for each item on the beam
            for j in range(len(heap_queue)):
                # 1. remove from heap
                score, tokens, decoder_input, decoder_hidden = heapq.heappop(heap_queue)
                # 2. decode one more step
                decoder_output, decoder_hidden = model.model['decoder'](
                    decoder_input, decoder_hidden, torch.tensor([1]))
                decoder_output_soft = softmax(decoder_output)
                # 3. get top-k predictions
                best_idx = torch.argsort(decoder_output_soft[0], descending=True)
                # print(decoder_output)
                # print(best_idx)
                for i in range(beam_size):
                    decoder_input = torch.tensor([[best_idx[i]]], device=device)

                    new_items.append((score + decoder_output[0, best_idx[i]],
                                      tokens + [best_idx[i].item()],
                                      decoder_input,
                                      decoder_hidden))
            # add new sequences to the heap
            for item in new_items:
              # print(item)
                heapq.heappush(heap_queue, item)
            # remove sequences with lowest score (items are sorted in descending order) NO
            while len(heap_queue) > beam_size:
                heapq.heappop(heap_queue)

        final_sequence = heapq.nlargest(1, heap_queue)[0]
        predicted_sequence = [0]+final_sequence[1][1:]
        # Predictions
        predictions_all += predicted_sequence

        # Real
        labels_all += labels.tolist()[0]

In [None]:
confusion_matrix(labels_all, predictions_all)

array([[26843,    13],
       [  345,     0]])

In [None]:
accuracy_score(labels_all, predictions_all)

0.9868387191647366

In [None]:
f1_score(labels_all, predictions_all)

0.0

#### 2.1.6. Indonesian

##### 2.1.6.1. Vocabulary

In [None]:
#parameters
language_ = languages[2]                          # filter language
#lstm_dim = 100                                    # dim neural lstm network

# 0. Choose language
datasets_train_filter = datasets_train.filter(lambda dataset: dataset["language"]==language_)
datasets_val_filter = datasets_val.filter(lambda dataset: dataset["language"]==language_)

print('language:', language_);

# 1. pretrain embeddings for each language
tokenizer = bpe_models[language_]

# 3. add index for padding [PAD], END-OF-SENTENCE [EOS], START-OF-SENTENCE [SOS]
new_tokens=['[PAD]', '[SOS]' ,'[EOS]']
pretrained_embeddings = np.concatenate([bpe_models[language_].emb.vectors,
                                        np.zeros(shape=(len(new_tokens),dim_))], axis=0)
# 4. Extract the vocab and add extra tokeNS
vocabulary = bpe_models[language_].emb.index_to_key + new_tokens

  0%|          | 0/117 [00:00<?, ?ba/s]

  0%|          | 0/14 [00:00<?, ?ba/s]

language: indonesian


##### 2.1.6.2. Answer's tokens

In [None]:
# tokens plaindocument to identify tokens of answer
tokens_plaintext_train = [tokenizer.encode(x) for x in datasets_train_filter['document_plaintext']]
datasets_train_filter = datasets_train_filter.add_column("document_plaintext_tokens", tokens_plaintext_train)

tokens_plaintext_val = [tokenizer.encode(x) for x in datasets_val_filter['document_plaintext']]
datasets_val_filter = datasets_val_filter.add_column("document_plaintext_tokens", tokens_plaintext_val)

Flattening the indices:   0%|          | 0/12 [00:00<?, ?ba/s]

Flattening the indices:   0%|          | 0/2 [00:00<?, ?ba/s]

In [None]:
offset_mapping_manual_train = [offset_mapping_manual(x) for x in datasets_train_filter['document_plaintext_tokens']]
datasets_train_filter = datasets_train_filter.add_column("offset_mapping", offset_mapping_manual_train)

offset_mapping_manual_val = [offset_mapping_manual(x) for x in datasets_val_filter['document_plaintext_tokens']]
datasets_val_filter = datasets_val_filter.add_column("offset_mapping", offset_mapping_manual_val)

In [None]:
end_position_answer_train = end_answer(datasets_train_filter['annotations'])
datasets_train_filter = datasets_train_filter.add_column("end_answer", end_position_answer_train)

end_position_answer_val = end_answer(datasets_val_filter['annotations'])
datasets_val_filter = datasets_val_filter.add_column("end_answer", end_position_answer_val)

In [None]:
list_train_start_dummie = list_dummies_start_function(datasets_train_filter['annotations'], datasets_train_filter['offset_mapping'])
list_train_end_dummie = list_dummies_end_function(datasets_train_filter['end_answer'], datasets_train_filter['offset_mapping'])
list_train_sequence_dummies_documments = sequence_dummies_documments(list_train_start_dummie, list_train_end_dummie)
datasets_train_filter = datasets_train_filter.add_column("sequence_dummies", list_train_sequence_dummies_documments)

In [None]:
list_val_start_dummie = list_dummies_start_function(datasets_val_filter['annotations'], datasets_val_filter['offset_mapping'])
list_val_end_dummie = list_dummies_end_function(datasets_val_filter['end_answer'], datasets_val_filter['offset_mapping'])
list_val_sequence_dummies_documments = sequence_dummies_documments(list_val_start_dummie, list_val_end_dummie)
datasets_val_filter = datasets_val_filter.add_column("sequence_dummies", list_val_sequence_dummies_documments)

##### 2.1.6.3. Tokenize Questions Together With Documents and Output Variable

In [None]:
tokens_questiontext_train = [tokenizer.encode(x) for x in datasets_train_filter['question_text']]
datasets_train_filter = datasets_train_filter.add_column("question_text_tokens", tokens_questiontext_train)

tokens_questiontext_val = [tokenizer.encode(x) for x in datasets_val_filter['question_text']]
datasets_val_filter = datasets_val_filter.add_column("question_text_tokens", tokens_questiontext_val)


In [None]:
# Train
tokens_together = []
for x, y in zip(datasets_train_filter['question_text_tokens'], datasets_train_filter['document_plaintext_tokens']):
  tokens_together.append(['[SOS]']+x+['[EOS]']+y)

dummies_together = []
for x, y in zip(datasets_train_filter['question_text_tokens'], datasets_train_filter['sequence_dummies']):
  # we know that the answer is not in the question, so we assign 0 to special tokens and tokens from the question
  dummies_together.append([0]+list(np.repeat(0,len(x)))+[0]+y)

datasets_train_filter = datasets_train_filter.add_column("tokens_together", tokens_together)
datasets_train_filter = datasets_train_filter.add_column("dummies_together", dummies_together)


In [None]:
# Val
tokens_together = []
for x, y in zip(datasets_val_filter['question_text_tokens'], datasets_val_filter['document_plaintext_tokens']):
  tokens_together.append(['[SOS]']+x+['[EOS]']+y)

dummies_together = []
for x, y in zip(datasets_val_filter['question_text_tokens'], datasets_val_filter['sequence_dummies']):
  dummies_together.append([0]+list(np.repeat(0,len(x)))+[0]+y)

datasets_val_filter = datasets_val_filter.add_column("tokens_together", tokens_together)
datasets_val_filter = datasets_val_filter.add_column("dummies_together", dummies_together)


In [None]:
print(datasets_train_filter['dummies_together'][0][0:30])

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0]


##### 2.1.6.4. Pytorch input format

In [None]:
batch_size = 1

# Create the dataset readers
train_dataset = ClassificationDatasetReader(datasets_train_filter,
                                            tokenizer=tokenizer,
                                            column_text_questions='question_text',
                                            column_text_documents='document_plaintext',
                                            column_label='dummies_together')
# dataset loaded lazily with N workers in parallel
train_dl = DataLoader(train_dataset, batch_size=batch_size, shuffle=True,
                      collate_fn=collate_batch_bilstm, num_workers=8)

# Create the dataset readers
val_dataset = ClassificationDatasetReader(datasets_val_filter,
                                            tokenizer=tokenizer,
                                            column_text_questions='question_text',
                                            column_text_documents='document_plaintext',
                                            column_label='dummies_together')
# dataset loaded lazily with N workers in parallel
valid_dl = DataLoader(val_dataset, batch_size=batch_size, shuffle=True,
                      collate_fn=collate_batch_bilstm, num_workers=8)

##### 2.1.6.5. Train

In [None]:
# Define some hyperparameters
lstm_dim = 100
dropout_prob = 0.1
lr = 1e-3
n_epochs = 3
n_workers = 8
# Define Model

model = Seq2Seq(
    pretrained_embeddings=torch.from_numpy(pretrained_embeddings),
    lstm_dim=lstm_dim,
    n_classes=2,
    dropout_prob=dropout_prob
).to(device)

train_dl = DataLoader(train_dataset, batch_size=batch_size, shuffle=True,
                      collate_fn=collate_batch_bilstm, num_workers=8)

valid_dl = DataLoader(val_dataset, batch_size=batch_size, shuffle=True,
                      collate_fn=collate_batch_bilstm, num_workers=8)


# Create the optimizer
optimizer = Adam(model.parameters(), lr=lr)

# Train
model_train, losses = train(model, train_dl, valid_dl, optimizer, n_epochs, device)


  0%|          | 0/11394 [00:00<?, ?it/s]

Train loss: 0.049261860013665545


  0%|          | 0/11394 [00:00<?, ?it/s]

Train loss: 0.04524049638616121


  0%|          | 0/11394 [00:00<?, ?it/s]

Train loss: 0.04348221039761484


In [None]:
model_train

Seq2Seq(
  (model): ModuleDict(
    (encoder): EncoderRNN(
      (model): ModuleDict(
        (embeddings): Embedding(100003, 100, padding_idx=100000)
        (lstm): LSTM(100, 100, num_layers=2, batch_first=True, dropout=0.1, bidirectional=True)
      )
    )
    (decoder): DecoderRNN(
      (model): ModuleDict(
        (embeddings): Embedding(100003, 100, padding_idx=100000)
        (lstm): LSTM(100, 100, num_layers=2, batch_first=True, dropout=0.1, bidirectional=True)
        (nn): Linear(in_features=200, out_features=2, bias=True)
      )
      (dropout): Dropout(p=0.1, inplace=False)
    )
  )
  (loss): CrossEntropyLoss()
)

In [None]:
PATH = "Week 39/(BILSM) ENCODER-DECODER MODEL/model_encoder_decoder_seq_1_INDONESIAN"
torch.save(model_train,PATH)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


##### 2.1.5.6. Test

In [None]:
PATH = 'Week 39/(BILSM) ENCODER-DECODER MODEL/model_encoder_decoder_seq_1_INDONESIAN'
# Need to define the class again
model_train = torch.load(PATH)
softmax = nn.Softmax(dim=-1)

In [None]:
#predictions
predictions_all = []
#real labels
labels_all = []

beam_size=2
model = model_train

with torch.no_grad():

    for batch in tqdm(valid_dl):
        #batch = (t.to(device) for t in batch)
        input_ids = torch.tensor(batch[0]).to("cuda")
        input_lens = torch.tensor(batch[1]).to("cuda")
        labels = torch.tensor(batch[2]).to("cuda")

        encoder_output, encoder_hidden = model.model['encoder'](input_ids, input_lens)
        decoder_hidden = encoder_hidden

        # the decoder starts generating after the Begining of Sentence (SOS_id) token
        decoder_input = torch.tensor([SOS_id], device=device).unsqueeze(-1)
        target_length = labels.shape[1]

        # we will use heapq to keep top best sequences so far sorted in heap_queue
        # these will be sorted by the first item in the tuple
        heap_queue = []
        heap_queue.append((torch.tensor(0),[SOS_id], decoder_input, decoder_hidden))
        #heap_queue.append((torch.tensor(0), torch.tensor([SOS_id]), decoder_input, decoder_hidden))


        # Beam Decoding
        for _ in range(target_length-1):
            #print("next len")
            new_items = []
            # for each item on the beam
            for j in range(len(heap_queue)):
                # 1. remove from heap
                score, tokens, decoder_input, decoder_hidden = heapq.heappop(heap_queue)
                # 2. decode one more step
                decoder_output, decoder_hidden = model.model['decoder'](
                    decoder_input, decoder_hidden, torch.tensor([1]))
                decoder_output_soft = softmax(decoder_output)
                # 3. get top-k predictions
                best_idx = torch.argsort(decoder_output_soft[0], descending=True)
                # print(decoder_output)
                # print(best_idx)
                for i in range(beam_size):
                    decoder_input = torch.tensor([[best_idx[i]]], device=device)

                    new_items.append((score + decoder_output[0, best_idx[i]],
                                      tokens + [best_idx[i].item()],
                                      decoder_input,
                                      decoder_hidden))
            # add new sequences to the heap
            for item in new_items:
              # print(item)
                heapq.heappush(heap_queue, item)
            # remove sequences with lowest score (items are sorted in descending order) NO
            while len(heap_queue) > beam_size:
                heapq.heappop(heap_queue)

        final_sequence = heapq.nlargest(1, heap_queue)[0]
        predicted_sequence = [0]+final_sequence[1][1:]
        # Predictions
        predictions_all += predicted_sequence

        # Real
        labels_all += labels.tolist()[0]

  0%|          | 0/1191 [00:00<?, ?it/s]

In [None]:
confusion_matrix(labels_all, predictions_all)

array([[117337,     25],
       [  3422,      0]])

In [None]:
accuracy_score(labels_all, predictions_all)

0.9714614518479269

In [None]:
f1_score(labels_all, predictions_all)

0.0

### 2.2. Transformer Multilingual BERT

#### 2.2.1. Pre-processing (Functions)

In [30]:
checkpoint = "bert-base-multilingual-uncased"

tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Define function to tokenize question and documents together and the output
def get_train_features(samples):
  '''
  Tokenizes the text in the given samples, splittling inputs that are too long
  for our model across multiple features. Finds the token offsets of the answers,
  which ____ the labels for our inputs.
  '''
  answers = samples["annotations"]
  start_positions = []
  end_positions = []
  y_sequence = []


  batch = tokenizer(
        samples['question_text'],
        samples['document_plaintext'],
        truncation="only_second",
        stride=128,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

  # Since one document might give several features if it is long
  # we need a mapping that shows what example each feature is associated with.
  sample_mapping = batch.pop('overflow_to_sample_mapping')

  # This gives a map from token to character position in the original context
  # helps us computer start and end positions.
  offset_mapping = batch.pop('offset_mapping')

  id_words_list_special_characters = batch.word_ids()

  for i, offset in enumerate(offset_mapping):
      sample_idx = sample_mapping[i]                                                # id for identifying the row
      answer = answers[sample_idx]                                                  # answer associated with that id
      start_char = answer["answer_start"][0]                                        # position character where answer starts
      end_char = answer["answer_start"][0] + len(answer["answer_text"][0])          # position character where answer finishes
      sequence_ids = batch.sequence_ids(i)                                         # identify question, answer, special characters (EOS, PADDING, etc)

      # Find the start and end of the context
      idx = 0
      while sequence_ids[idx] != 1:                                                 # identify question characters or special characters
          idx += 1
      context_start = idx                                                           # identify beggining of context
      while sequence_ids[idx] == 1:
          idx += 1
      context_end = idx - 1                                                         # identify end of context

      # If the answer is not fully inside the context, label is (0, 0)
      if offset[context_start][0] > start_char or offset[context_end][1] < end_char: # when truncating, if the first part of the context is after the answe or if the last part of the context is before the end of the answer
          start_positions.append(0)
          end_positions.append(0)
      else:
          # Otherwise it's the start and end token positions
          idx = context_start
          while idx <= context_end and offset[idx][0] <= start_char:                  # between the start of the answer
              idx += 1
          start_positions.append(idx - 1)

          idx = context_end
          while idx >= context_start and offset[idx][1] >= end_char:                  # between the end of the answer
              idx -= 1
          end_positions.append(idx + 1)

      y_sequence_loop = [0] * len(offset)

      for index, token in enumerate(offset):
        if (start_positions[i]<=index)&(end_positions[i]>=index):
          y_sequence_loop[index] = 1
        if token == (0,0):
          y_sequence_loop[index] = -100

      y_sequence.append(y_sequence_loop)

  batch['labels']  = y_sequence
  return batch

#### 2.2.2. Model

In [31]:
label_names = ['no answer', 'answer']

id2label = {'0':'no answer', '1': 'answer'}
label2id = {v: k for k, v in id2label.items()}

model = AutoModelForTokenClassification.from_pretrained(checkpoint,
                                                       id2label=id2label,
                                                       label2id=label2id,)

Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertForTokenClassification: ['cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint a

In [32]:
model.to(device)

BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(105879, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, e

#### 2.2.3. Model Set Up

In [33]:
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)
optimizer = AdamW(model.parameters(), lr=2e-5)

# hyperparameters
path=f"/content/drive/MyDrive/train"
args = TrainingArguments(
    path,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=3,
    weight_decay=0.01,
)

# Evaluation metric
metric = evaluate.load("seqeval")
def compute_metrics(eval_preds):
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    # Remove ignored index (special tokens) and convert to labels
    true_labels = [[label_names[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    all_metrics = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": all_metrics["overall_precision"],
        "recall": all_metrics["overall_recall"],
        "f1": all_metrics["overall_f1"],
        "accuracy": all_metrics["overall_accuracy"],
    }

#### 2.2.4. Arabic

##### 2.2.4.1. Pre-process (Data)

In [None]:
#parameters
language_ = languages[0]                          # filter language

# 0. Choose language
datasets_train_filter = datasets_train.filter(lambda dataset: dataset["language"]==language_)
datasets_val_filter = datasets_val.filter(lambda dataset: dataset["language"]==language_)

print('language:', language_);



  0%|          | 0/117 [00:00<?, ?ba/s]

  0%|          | 0/14 [00:00<?, ?ba/s]

language: arabic


In [None]:
train_dataset = datasets_train_filter.map(get_train_features, batched = True, remove_columns = datasets_train_filter.column_names)
val_dataset = datasets_val_filter.map(get_train_features, batched = True, remove_columns = datasets_train_filter.column_names)

  0%|          | 0/30 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

##### 2.2.4.2. Train

In [None]:
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)
trainer.train()

***** Running training *****
  Num examples = 31187
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 11697


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.0519,0.045399,0.615621,0.672788,0.642937,0.983452
2,0.0396,0.045852,0.622051,0.688826,0.653738,0.98397
3,0.0272,0.050292,0.63697,0.691671,0.663194,0.984653


***** Running Evaluation *****
  Num examples = 1963
  Batch size = 8
Saving model checkpoint to /content/drive/MyDrive/arabic_sequence/checkpoint-3899
Configuration saved in /content/drive/MyDrive/arabic_sequence/checkpoint-3899/config.json
Model weights saved in /content/drive/MyDrive/arabic_sequence/checkpoint-3899/pytorch_model.bin
tokenizer config file saved in /content/drive/MyDrive/arabic_sequence/checkpoint-3899/tokenizer_config.json
Special tokens file saved in /content/drive/MyDrive/arabic_sequence/checkpoint-3899/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 1963
  Batch size = 8
Saving model checkpoint to /content/drive/MyDrive/arabic_sequence/checkpoint-7798
Configuration saved in /content/drive/MyDrive/arabic_sequence/checkpoint-7798/config.json
Model weights saved in /content/drive/MyDrive/arabic_sequence/checkpoint-7798/pytorch_model.bin
tokenizer config file saved in /content/drive/MyDrive/arabic_sequence/checkpoint-7798/tokenizer_config.json


TrainOutput(global_step=11697, training_loss=0.04288217003675488, metrics={'train_runtime': 9157.5224, 'train_samples_per_second': 10.217, 'train_steps_per_second': 1.277, 'total_flos': 2.4447185856976896e+16, 'train_loss': 0.04288217003675488, 'epoch': 3.0})

In [None]:
path = "/content/drive/MyDrive/BERT - ARABIC - SEQUENCE"
trainer.save_model(path)

Saving model checkpoint to /content/drive/MyDrive/BERT - ARABIC - SEQUENCE
Configuration saved in /content/drive/MyDrive/BERT - ARABIC - SEQUENCE/config.json
Model weights saved in /content/drive/MyDrive/BERT - ARABIC - SEQUENCE/pytorch_model.bin
tokenizer config file saved in /content/drive/MyDrive/BERT - ARABIC - SEQUENCE/tokenizer_config.json
Special tokens file saved in /content/drive/MyDrive/BERT - ARABIC - SEQUENCE/special_tokens_map.json


#### 2.2.5. Bengali

##### 2.2.5.1. Pre-process (Data)

In [34]:
#parameters
language_ = languages[1]                          # filter language

# 0. Choose language
datasets_train_filter = datasets_train.filter(lambda dataset: dataset["language"]==language_)
datasets_val_filter = datasets_val.filter(lambda dataset: dataset["language"]==language_)

print('language:', language_);

  0%|          | 0/117 [00:00<?, ?ba/s]

  0%|          | 0/14 [00:00<?, ?ba/s]

language: bengali


In [35]:
train_dataset = datasets_train_filter.map(get_train_features, batched = True, remove_columns = datasets_train_filter.column_names)
val_dataset = datasets_val_filter.map(get_train_features, batched = True, remove_columns = datasets_train_filter.column_names)

  0%|          | 0/5 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [36]:
train_dataset

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 5197
})

##### 2.2.5.2. Train

In [None]:
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)
trainer.train()

***** Running training *****
  Num examples = 5197
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 1950


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.0449,0.031714,0.41046,0.554604,0.471767,0.988442
2,0.0238,0.024912,0.55489,0.595289,0.57438,0.991705
3,0.0192,0.02711,0.582834,0.625268,0.603306,0.992392


***** Running Evaluation *****
  Num examples = 241
  Batch size = 8
Saving model checkpoint to /content/drive/MyDrive/bengali_sequence/checkpoint-650
Configuration saved in /content/drive/MyDrive/bengali_sequence/checkpoint-650/config.json
Model weights saved in /content/drive/MyDrive/bengali_sequence/checkpoint-650/pytorch_model.bin
tokenizer config file saved in /content/drive/MyDrive/bengali_sequence/checkpoint-650/tokenizer_config.json
Special tokens file saved in /content/drive/MyDrive/bengali_sequence/checkpoint-650/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 241
  Batch size = 8
Saving model checkpoint to /content/drive/MyDrive/bengali_sequence/checkpoint-1300
Configuration saved in /content/drive/MyDrive/bengali_sequence/checkpoint-1300/config.json
Model weights saved in /content/drive/MyDrive/bengali_sequence/checkpoint-1300/pytorch_model.bin
tokenizer config file saved in /content/drive/MyDrive/bengali_sequence/checkpoint-1300/tokenizer_config.jso

TrainOutput(global_step=1950, training_loss=0.02545681268740923, metrics={'train_runtime': 1631.0284, 'train_samples_per_second': 9.559, 'train_steps_per_second': 1.196, 'total_flos': 4073877734270976.0, 'train_loss': 0.02545681268740923, 'epoch': 3.0})

In [None]:
path = "/content/drive/MyDrive/BERT - BENGALI - SEQUENCE"
trainer.save_model(path)

Saving model checkpoint to /content/drive/MyDrive/BERT - BENGALI - SEQUENCE
Configuration saved in /content/drive/MyDrive/BERT - BENGALI - SEQUENCE/config.json
Model weights saved in /content/drive/MyDrive/BERT - BENGALI - SEQUENCE/pytorch_model.bin
tokenizer config file saved in /content/drive/MyDrive/BERT - BENGALI - SEQUENCE/tokenizer_config.json
Special tokens file saved in /content/drive/MyDrive/BERT - BENGALI - SEQUENCE/special_tokens_map.json


#### 2.2.6. Indonesian

##### 2.2.6.1. Pre-process (Data)

In [None]:
#parameters
language_ = languages[2]                          # filter language
#lstm_dim = 100                                    # dim neural lstm network

# 0. Choose language
datasets_train_filter = datasets_train.filter(lambda dataset: dataset["language"]==language_)
datasets_val_filter = datasets_val.filter(lambda dataset: dataset["language"]==language_)

print('language:', language_);



  0%|          | 0/117 [00:00<?, ?ba/s]

  0%|          | 0/14 [00:00<?, ?ba/s]

language: indonesian


In [None]:
train_dataset = datasets_train_filter.map(get_train_features, batched = True, remove_columns = datasets_train_filter.column_names)
val_dataset = datasets_val_filter.map(get_train_features, batched = True, remove_columns = datasets_train_filter.column_names)

  0%|          | 0/12 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

##### 2.2.6.2. Train

In [None]:
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)
trainer.train()

***** Running training *****
  Num examples = 11594
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 4350


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.0688,0.047461,0.593028,0.608822,0.600821,0.984175
2,0.0393,0.043088,0.638673,0.648772,0.643683,0.985111
3,0.024,0.047272,0.62156,0.686226,0.652294,0.984777


***** Running Evaluation *****
  Num examples = 1210
  Batch size = 8
Saving model checkpoint to /content/drive/MyDrive/indonesian_sequence/checkpoint-1450
Configuration saved in /content/drive/MyDrive/indonesian_sequence/checkpoint-1450/config.json
Model weights saved in /content/drive/MyDrive/indonesian_sequence/checkpoint-1450/pytorch_model.bin
tokenizer config file saved in /content/drive/MyDrive/indonesian_sequence/checkpoint-1450/tokenizer_config.json
Special tokens file saved in /content/drive/MyDrive/indonesian_sequence/checkpoint-1450/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 1210
  Batch size = 8
Saving model checkpoint to /content/drive/MyDrive/indonesian_sequence/checkpoint-2900
Configuration saved in /content/drive/MyDrive/indonesian_sequence/checkpoint-2900/config.json
Model weights saved in /content/drive/MyDrive/indonesian_sequence/checkpoint-2900/pytorch_model.bin
tokenizer config file saved in /content/drive/MyDrive/indonesian_sequence/ch

TrainOutput(global_step=4350, training_loss=0.04820252769294826, metrics={'train_runtime': 3555.1016, 'train_samples_per_second': 9.784, 'train_steps_per_second': 1.224, 'total_flos': 9088423792791552.0, 'train_loss': 0.04820252769294826, 'epoch': 3.0})

In [None]:
path = "/content/drive/MyDrive/BERT - INDONESIAN - SEQUENCE"
trainer.save_model(path)

Saving model checkpoint to /content/drive/MyDrive/BERT - INDONESIAN - SEQUENCE
Configuration saved in /content/drive/MyDrive/BERT - INDONESIAN - SEQUENCE/config.json
Model weights saved in /content/drive/MyDrive/BERT - INDONESIAN - SEQUENCE/pytorch_model.bin
tokenizer config file saved in /content/drive/MyDrive/BERT - INDONESIAN - SEQUENCE/tokenizer_config.json
Special tokens file saved in /content/drive/MyDrive/BERT - INDONESIAN - SEQUENCE/special_tokens_map.json


### 2.3. Transformer RoBERTa

#### 2.3.1. Pre-procesing (Functions)

In [None]:
MODEL_NAME = 'xlm-roberta-base'

# Load the pre-trained auto tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# tokenize and create features
def get_train_features(samples):

  '''
  Tokenizes the text in the given samples, splittling inputs that are too long
  for our model across multiple features. Finds the token offsets of the answers,
  which helps us find the labels for our inputs.
  '''

  answers = samples["annotations"]
  start_positions = []
  end_positions = []
  y_sequence = []

  batch = tokenizer(
        samples['question_text'],
        samples['document_plaintext'],
        truncation="only_second",
        stride=128,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

  # Since one document might give several features if it is long
  # we need a mapping that shows what example each feature is associated with.
  sample_mapping = batch.pop('overflow_to_sample_mapping')

  # This gives a map from token to character position in the original context
  # helps us computer start and end positions.
  offset_mapping = batch.pop('offset_mapping')

  for i, offset in enumerate(offset_mapping):
      sample_idx = sample_mapping[i]                                                # id for identifying the row
      answer = answers[sample_idx]                                                  # answer associated with that id
      start_char = answer["answer_start"][0]                                        # position character where answer starts
      end_char = answer["answer_start"][0] + len(answer["answer_text"][0])          # position character where answer finishes
      sequence_ids = batch.sequence_ids(i)                                          # identify question, answer, special characters (EOS, PADDING, etc)

      # Find the start and end of the context
      idx = 0
      while sequence_ids[idx] != 1:                                                 # identify question characters or special characters
          idx += 1
      context_start = idx                                                           # identify beggining of context
      while sequence_ids[idx] == 1:
          idx += 1
      context_end = idx - 1                                                         # identify end of context

      # If the answer is not fully inside the context, label is (0, 0)
      if offset[context_start][0] > start_char or offset[context_end][1] < end_char: # when truncating, if the first part of the context is after the answe or if the last part of the context is before the end of the answer
          start_positions.append(0)
          end_positions.append(0)
      else:
          # Otherwise it's the start and end token positions
          idx = context_start
          while idx <= context_end and offset[idx][0] <= start_char:                  # between the start of the answer
              idx += 1
          start_positions.append(idx - 1)

          idx = context_end
          while idx >= context_start and offset[idx][1] >= end_char:                  # between the end of the answer
              idx -= 1
          end_positions.append(idx + 1)

      y_sequence_loop = [0] * len(offset)

      for index, token in enumerate(offset):
        if (start_positions[i]<=index)&(end_positions[i]>=index):
          y_sequence_loop[index] = 1
        if token == (0,0):
          y_sequence_loop[index] = -100

      y_sequence.append(y_sequence_loop)

  batch['labels']  = y_sequence
  return batch

Downloading:   0%|          | 0.00/615 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/4.83M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/8.68M [00:00<?, ?B/s]

#### 2.3.2. Model

In [None]:
label_names = ['no answer', 'answer']

# load optimizer
id2label = {'0':'no answer', '1': 'answer'}
label2id = {v: k for k, v in id2label.items()}

# load model
model = AutoModelForTokenClassification.from_pretrained(MODEL_NAME,
                                                       id2label=id2label,
                                                       label2id=label2id,)

Downloading:   0%|          | 0.00/1.04G [00:00<?, ?B/s]

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaForTokenClassification: ['lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing XLMRobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-st

In [None]:
# send model to GPU
model.to(device)

XLMRobertaForTokenClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(250002, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
            

#### 2.3.3. Model Set Up

In [None]:
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)
optimizer = AdamW(model.parameters(), lr=2e-5)

# hyperparameters
path=f"/content/drive/MyDrive/train"
args = TrainingArguments(
    path,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=3,
    weight_decay=0.01,
)

# Evaluation metric
metric = evaluate.load("seqeval")
def compute_metrics(eval_preds):
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    # Remove ignored index (special tokens) and convert to labels
    true_labels = [[label_names[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    all_metrics = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": all_metrics["overall_precision"],
        "recall": all_metrics["overall_recall"],
        "f1": all_metrics["overall_f1"],
        "accuracy": all_metrics["overall_accuracy"],
    }

#### 2.3.4. Arabic

##### 2.3.4.1. Pre-process (Data)

In [None]:
# run the tokenizer
tokenized_train_arabic = train_data[0][1].map(partial(get_train_features),
                                              batched = True,
remove_columns = train_data[0][1].column_names)

tokenized_val_arabic = val_data[0][1].map(partial(get_train_features),
                                              batched = True,
remove_columns = val_data[0][1].column_names)

  0%|          | 0/30 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

##### 2.3.4.2. Train

In [None]:
# load trainer
trainer = Trainer(
    model=model,
    args = args,
    train_dataset=tokenized_train_arabic,
    eval_dataset=tokenized_val_arabic,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)
trainer.train()

***** Running training *****
  Num examples = 30714
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 11520


Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.06,0.050017,0.625,0.603381,0.614,0.982863
2,0.0477,0.048749,0.63229,0.693628,0.66154,0.982845
3,0.0393,0.049721,0.663027,0.694928,0.678603,0.983658


***** Running Evaluation *****
  Num examples = 1947
  Batch size = 8
Saving model checkpoint to /content/drive/MyDrive/arabic_sequence/checkpoint-3840
Configuration saved in /content/drive/MyDrive/arabic_sequence/checkpoint-3840/config.json
Model weights saved in /content/drive/MyDrive/arabic_sequence/checkpoint-3840/pytorch_model.bin
tokenizer config file saved in /content/drive/MyDrive/arabic_sequence/checkpoint-3840/tokenizer_config.json
Special tokens file saved in /content/drive/MyDrive/arabic_sequence/checkpoint-3840/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 1947
  Batch size = 8
Saving model checkpoint to /content/drive/MyDrive/arabic_sequence/checkpoint-7680
Configuration saved in /content/drive/MyDrive/arabic_sequence/checkpoint-7680/config.json
Model weights saved in /content/drive/MyDrive/arabic_sequence/checkpoint-7680/pytorch_model.bin
tokenizer config file saved in /content/drive/MyDrive/arabic_sequence/checkpoint-7680/tokenizer_config.json


TrainOutput(global_step=11520, training_loss=0.05382153172459867, metrics={'train_runtime': 9968.2916, 'train_samples_per_second': 9.244, 'train_steps_per_second': 1.156, 'total_flos': 2.407640575916851e+16, 'train_loss': 0.05382153172459867, 'epoch': 3.0})

In [None]:
path = "/content/drive/MyDrive/RoBERTa - ARABIC - SEQUENCE"
trainer.save_model(path)

Saving model checkpoint to /content/drive/MyDrive/RoBERTa - ARABIC - SEQUENCE
Configuration saved in /content/drive/MyDrive/RoBERTa - ARABIC - SEQUENCE/config.json
Model weights saved in /content/drive/MyDrive/RoBERTa - ARABIC - SEQUENCE/pytorch_model.bin
tokenizer config file saved in /content/drive/MyDrive/RoBERTa - ARABIC - SEQUENCE/tokenizer_config.json
Special tokens file saved in /content/drive/MyDrive/RoBERTa - ARABIC - SEQUENCE/special_tokens_map.json


#### 2.3.5. Bengali

##### 2.3.5.1. Pre-process (Data)

In [None]:
# run the tokenizer
tokenized_train_bengali = train_data[1][1].map(partial(get_train_features),
                                              batched = True,
remove_columns = train_data[1][1].column_names)

tokenized_val_bengali = val_data[1][1].map(partial(get_train_features),
                                              batched = True,
remove_columns = val_data[1][1].column_names)

  0%|          | 0/5 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

##### 2.3.5.2. Train

In [None]:
# load trainer
trainer = Trainer(
    model=model,
    args = args,
    train_dataset=tokenized_train_bengali,
    eval_dataset=tokenized_val_bengali,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)
trainer.train()

***** Running training *****
  Num examples = 5029
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 1887


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.0662,0.041823,0.461942,0.38512,0.420048,0.986567
2,0.0408,0.030354,0.485523,0.477024,0.481236,0.987875
3,0.0299,0.029813,0.466539,0.533917,0.497959,0.98876


***** Running Evaluation *****
  Num examples = 233
  Batch size = 8
Saving model checkpoint to /content/drive/MyDrive/bengali_sequence/checkpoint-629
Configuration saved in /content/drive/MyDrive/bengali_sequence/checkpoint-629/config.json
Model weights saved in /content/drive/MyDrive/bengali_sequence/checkpoint-629/pytorch_model.bin
tokenizer config file saved in /content/drive/MyDrive/bengali_sequence/checkpoint-629/tokenizer_config.json
Special tokens file saved in /content/drive/MyDrive/bengali_sequence/checkpoint-629/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 233
  Batch size = 8
Saving model checkpoint to /content/drive/MyDrive/bengali_sequence/checkpoint-1258
Configuration saved in /content/drive/MyDrive/bengali_sequence/checkpoint-1258/config.json
Model weights saved in /content/drive/MyDrive/bengali_sequence/checkpoint-1258/pytorch_model.bin
tokenizer config file saved in /content/drive/MyDrive/bengali_sequence/checkpoint-1258/tokenizer_config.jso

TrainOutput(global_step=1887, training_loss=0.041390248183921084, metrics={'train_runtime': 1590.5841, 'train_samples_per_second': 9.485, 'train_steps_per_second': 1.186, 'total_flos': 3942184168876032.0, 'train_loss': 0.041390248183921084, 'epoch': 3.0})

In [None]:
path = "/content/drive/MyDrive/RoBERTa - BENGALI - SEQUENCE"
trainer.save_model(path)

Saving model checkpoint to /content/drive/MyDrive/RoBERTa - BENGALI - SEQUENCE
Configuration saved in /content/drive/MyDrive/RoBERTa - BENGALI - SEQUENCE/config.json
Model weights saved in /content/drive/MyDrive/RoBERTa - BENGALI - SEQUENCE/pytorch_model.bin
tokenizer config file saved in /content/drive/MyDrive/RoBERTa - BENGALI - SEQUENCE/tokenizer_config.json
Special tokens file saved in /content/drive/MyDrive/RoBERTa - BENGALI - SEQUENCE/special_tokens_map.json


#### 2.3.6. Indonesian

##### 2.3.6.1. Pre-process (Data)

In [None]:
# run the tokenizer
tokenized_train_indonesian = train_data[2][1].map(partial(get_train_features),
                                                  batched = True,
                                                  remove_columns = train_data[2][1].column_names)

tokenized_val_indonesian = val_data[2][1].map(partial(get_train_features),
                                              batched = True,
                                              remove_columns = val_data[2][1].column_names)

  0%|          | 0/12 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

##### 2.3.6.2. Train

In [None]:
trainer = Trainer(
    model=model,
    args = args,
    train_dataset=tokenized_train_indonesian,
    eval_dataset=tokenized_val_indonesian,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)
trainer.train()

***** Running training *****
  Num examples = 11573
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 4341


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.0821,0.055658,0.605953,0.56857,0.586667,0.982357
2,0.0527,0.045156,0.633125,0.674031,0.652938,0.98388


***** Running Evaluation *****
  Num examples = 1208
  Batch size = 8
Saving model checkpoint to /content/drive/MyDrive/indonesian_sequence/checkpoint-1447
Configuration saved in /content/drive/MyDrive/indonesian_sequence/checkpoint-1447/config.json
Model weights saved in /content/drive/MyDrive/indonesian_sequence/checkpoint-1447/pytorch_model.bin
tokenizer config file saved in /content/drive/MyDrive/indonesian_sequence/checkpoint-1447/tokenizer_config.json
Special tokens file saved in /content/drive/MyDrive/indonesian_sequence/checkpoint-1447/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 1208
  Batch size = 8
Saving model checkpoint to /content/drive/MyDrive/indonesian_sequence/checkpoint-2894
Configuration saved in /content/drive/MyDrive/indonesian_sequence/checkpoint-2894/config.json
Model weights saved in /content/drive/MyDrive/indonesian_sequence/checkpoint-2894/pytorch_model.bin
tokenizer config file saved in /content/drive/MyDrive/indonesian_sequence/ch

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.0821,0.055658,0.605953,0.56857,0.586667,0.982357
2,0.0527,0.045156,0.633125,0.674031,0.652938,0.98388
3,0.0384,0.049386,0.633982,0.706128,0.668113,0.984025


***** Running Evaluation *****
  Num examples = 1208
  Batch size = 8
Saving model checkpoint to /content/drive/MyDrive/indonesian_sequence/checkpoint-4341
Configuration saved in /content/drive/MyDrive/indonesian_sequence/checkpoint-4341/config.json
Model weights saved in /content/drive/MyDrive/indonesian_sequence/checkpoint-4341/pytorch_model.bin
tokenizer config file saved in /content/drive/MyDrive/indonesian_sequence/checkpoint-4341/tokenizer_config.json
Special tokens file saved in /content/drive/MyDrive/indonesian_sequence/checkpoint-4341/special_tokens_map.json


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=4341, training_loss=0.05864648614658021, metrics={'train_runtime': 3711.3245, 'train_samples_per_second': 9.355, 'train_steps_per_second': 1.17, 'total_flos': 9071962097117184.0, 'train_loss': 0.05864648614658021, 'epoch': 3.0})

In [None]:
path = "/content/drive/MyDrive/RoBERTa - INDONESIAN - SEQUENCE"
trainer.save_model(path)

Saving model checkpoint to /content/drive/MyDrive/RoBERTa - INDONESIAN - SEQUENCE
Configuration saved in /content/drive/MyDrive/RoBERTa - INDONESIAN - SEQUENCE/config.json
Model weights saved in /content/drive/MyDrive/RoBERTa - INDONESIAN - SEQUENCE/pytorch_model.bin
tokenizer config file saved in /content/drive/MyDrive/RoBERTa - INDONESIAN - SEQUENCE/tokenizer_config.json
Special tokens file saved in /content/drive/MyDrive/RoBERTa - INDONESIAN - SEQUENCE/special_tokens_map.json
