# Week 38 - Binary Supervised Classifier

## 1. Setup

### 1.1. Libraries

#### 1.1.1. New Libraries

In [None]:
# new libraries for Google Colab
!pip3 install datasets
!pip3 install bpemb
!pip install datasets==2.2.1 transformers==4.19.1
!pip3 install transformers[torch] # hyperparameters

Collecting datasets
  Downloading datasets-2.14.6-py3-none-any.whl (493 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m493.7/493.7 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m18.8 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0.0,>=0.14.0 (from datasets)
  Downloading huggingface_hub-0.18.0-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m31.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: dill, multiprocess, huggingface-hub, datasets
Successfully installed datasets-2.14

#### 1.1.2. Load Libraries

In [None]:
from datasets import load_dataset                                                    # library to import data from huggingface
import pandas as pd                                                                  # library to transform to dataframe. helps for statistics
from bpemb import BPEmb                                                              # embeddings
from sklearn.linear_model import LogisticRegression                                  # model
import warnings                                                                      # ignore warnings in printing
warnings.filterwarnings("ignore")
import numpy as np                                                                   # library for math operations and matrices
from sklearn.metrics import classification_report                                    # classification report binary clasiffier
from sklearn.metrics import f1_score, accuracy_score                                 # f1_score, accuracy
from sklearn.metrics import recall_score, precision_score                            # recall_score, precision_score
import torch                                                                         # torch for managing special python objects
from torch import nn                                                                 # neural networks
from torch.utils.data import Dataset, DataLoader                                     # torch for managing special data types
from typing import List, Tuple                                                       # data structures in outputs
from tqdm.notebook import tqdm                                                       # show progress of the loop

from transformers import AdamW, AutoTokenizer, AutoModelForSequenceClassification    # transformer: optimizer, tokenizer (pre-train model), model
from transformers import BertForSequenceClassification                               # Load trained model
from transformers import DataCollatorWithPadding                                     # for padding in batches
from transformers import TrainingArguments                                           # Hyperparametes
from transformers import Trainer                                                     # Trainer
from datasets import load_metric                                                     # Evaluation metric


### 1.2. Data

#### 1.2.1. Read Data

In [None]:
# load training dataset
datasets_train = load_dataset("copenlu/answerable_tydiqa", split='train')
# load validation dataset
datasets_val = load_dataset("copenlu/answerable_tydiqa", split='validation')
# set gpu if available
device = "cuda:0" if torch.cuda.is_available() else "cpu"

Downloading:   0%|          | 0.00/2.47k [00:00<?, ?B/s]



Downloading and preparing dataset None/None (download: 75.43 MiB, generated: 131.78 MiB, post-processed: Unknown size, total: 207.21 MiB) to /root/.cache/huggingface/datasets/copenlu___parquet/copenlu--nlp_course_tydiqa-42333912ea665dd0/0.0.0/0b6d5799bb726b24ad7fc7be720c170d8e497f575d02d47537de9a5bac074901...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/71.6M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/7.49M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Dataset parquet downloaded and prepared to /root/.cache/huggingface/datasets/copenlu___parquet/copenlu--nlp_course_tydiqa-42333912ea665dd0/0.0.0/0b6d5799bb726b24ad7fc7be720c170d8e497f575d02d47537de9a5bac074901. Subsequent calls will reuse this data.




#### 1.2.2. Transform Data

In [None]:
def oracle(df_list_annotations = []):
  """Check whether a question has an answer"""
  return [0 if x['answer_text'][0] == '' else 1 for x in df_list_annotations]

# train
answerable_train = oracle(datasets_train['annotations'])
datasets_train = datasets_train.add_column("label", answerable_train)

# val
answerable_val = oracle(datasets_val['annotations'])
datasets_val = datasets_val.add_column("label", answerable_val)

# define languages for the project
languages = ['arabic', 'bengali','indonesian']

# transform to pandas dataframe
pandas_datasets_train = pd.DataFrame(datasets_train)
pandas_datasets_val = pd.DataFrame(datasets_val)

# filter languaje
df_train_filter = pandas_datasets_train[pandas_datasets_train['language'].isin(languages)]
df_val_filter = pandas_datasets_val[pandas_datasets_val['language'].isin(languages)]



# 2. Supervised Models

### 2.1. Logistic

#### 2.1.1. Pre-processing

In [None]:
# embeddings
bpe_models = {
    'arabic': BPEmb(lang='ar', dim=50),
    'bengali': BPEmb(lang='bn', dim=50),
    'indonesian': BPEmb(lang='id', dim=50)
}

def get_mean_embedding(sentence, model):
  """
  Function get mean embedding from a text
  """
  embeddings = model.embed(sentence)
  if len(embeddings) > 0:
      return np.mean(embeddings, axis=0)
  else:
      return np.zeros(model.dim)

downloading https://nlp.h-its.org/bpemb/ar/ar.wiki.bpe.vs10000.model


100%|██████████| 428120/428120 [00:00<00:00, 10434030.97B/s]


downloading https://nlp.h-its.org/bpemb/ar/ar.wiki.bpe.vs10000.d50.w2v.bin.tar.gz


100%|██████████| 1928527/1928527 [00:00<00:00, 26987770.37B/s]


downloading https://nlp.h-its.org/bpemb/bn/bn.wiki.bpe.vs10000.model


100%|██████████| 471203/471203 [00:00<00:00, 12286418.01B/s]


downloading https://nlp.h-its.org/bpemb/bn/bn.wiki.bpe.vs10000.d50.w2v.bin.tar.gz


100%|██████████| 1933584/1933584 [00:00<00:00, 27255963.20B/s]


downloading https://nlp.h-its.org/bpemb/id/id.wiki.bpe.vs10000.model


100%|██████████| 396303/396303 [00:00<00:00, 9845147.11B/s]

downloading https://nlp.h-its.org/bpemb/id/id.wiki.bpe.vs10000.d50.w2v.bin.tar.gz



100%|██████████| 1920574/1920574 [00:00<00:00, 27320666.55B/s]


#### 2.1.2. Model

In [None]:
# where the results are saved
results_fscore = {}
results_accuracy = {}

for language in languages:
    bpemb = bpe_models[language]
    df_language_train = df_train_filter[df_train_filter['language'] == language]
    df_language_val = df_val_filter[df_val_filter['language'] == language]

    X_train, y_train = [], df_language_train['label'].values

    for _, row in df_language_train.iterrows():
        doc_emb = get_mean_embedding(row['document_plaintext'], bpemb)
        ques_emb = get_mean_embedding(row['question_text'], bpemb)

        # Intersection embedding
        doc_tokens = set(bpemb.encode(row['document_plaintext']))
        ques_tokens = set(bpemb.encode(row['question_text']))
        intersection_tokens = doc_tokens & ques_tokens
        intersection_emb = get_mean_embedding(' '.join(intersection_tokens), bpemb)

        # Proportion of question tokens in document
        prop_ques_in_doc = len(intersection_tokens) / len(ques_tokens) if ques_tokens else 0

        # Combine all features
        features = np.hstack([doc_emb, ques_emb, intersection_emb, [prop_ques_in_doc]])
        X_train.append(features)

    # Split data for training and validation

    # Train
    clf = LogisticRegression(penalty='l2',max_iter=1000).fit(X_train, y_train)

    # Val test
    X_val, y_val = [], df_language_val['label'].values
    for _, row in df_language_val.iterrows():
        doc_emb = get_mean_embedding(row['document_plaintext'], bpemb)
        ques_emb = get_mean_embedding(row['question_text'], bpemb)

        # Intersection embedding
        doc_tokens = set(bpemb.encode(row['document_plaintext']))
        ques_tokens = set(bpemb.encode(row['question_text']))
        intersection_tokens = doc_tokens & ques_tokens
        intersection_emb = get_mean_embedding(' '.join(intersection_tokens), bpemb)

        # Proportion of question tokens in document
        prop_ques_in_doc = len(intersection_tokens) / len(ques_tokens) if ques_tokens else 0

        # Combine all features
        features = np.hstack([doc_emb, ques_emb, intersection_emb, [prop_ques_in_doc]])
        X_val.append(features)

    # Predict & Evaluate
    y_pred = clf.predict(X_val)
    print(f"\nResults for {language}:")
    print(classification_report(y_val, y_pred))
    results_fscore[language] = f1_score(y_val, y_pred)
    results_accuracy[language] = accuracy_score(y_val, y_pred)

print('f-score',results_fscore);print('accuracy',results_accuracy)


Results for arabic:
              precision    recall  f1-score   support

           0       0.82      0.80      0.81       951
           1       0.80      0.82      0.81       951

    accuracy                           0.81      1902
   macro avg       0.81      0.81      0.81      1902
weighted avg       0.81      0.81      0.81      1902


Results for bengali:
              precision    recall  f1-score   support

           0       0.79      0.66      0.72       112
           1       0.71      0.82      0.76       112

    accuracy                           0.74       224
   macro avg       0.75      0.74      0.74       224
weighted avg       0.75      0.74      0.74       224


Results for indonesian:
              precision    recall  f1-score   support

           0       0.81      0.74      0.77       594
           1       0.76      0.82      0.79       597

    accuracy                           0.78      1191
   macro avg       0.78      0.78      0.78      1191
weight

### 2.2. BI-LSTM

#### 2.2.1. Pre-processing

In [None]:
from bpemb import BPEmb
dim_ = 100                                         # embedding vector size
vocabulary_ = 100000                               # size vocabulary

bpe_models = {
    languages[0]: BPEmb(lang='ar', dim=dim_, vs = vocabulary_),
    languages[1]: BPEmb(lang='bn', dim=dim_, vs = vocabulary_),
    languages[2]: BPEmb(lang='id', dim=dim_, vs = vocabulary_)
}

downloading https://nlp.h-its.org/bpemb/ar/ar.wiki.bpe.vs100000.model


100%|██████████| 2383518/2383518 [00:00<00:00, 34731912.91B/s]


downloading https://nlp.h-its.org/bpemb/ar/ar.wiki.bpe.vs100000.d100.w2v.bin.tar.gz


100%|██████████| 38037405/38037405 [00:00<00:00, 81330131.90B/s]


downloading https://nlp.h-its.org/bpemb/bn/bn.wiki.bpe.vs100000.model


100%|██████████| 2943332/2943332 [00:00<00:00, 42940525.02B/s]


downloading https://nlp.h-its.org/bpemb/bn/bn.wiki.bpe.vs100000.d100.w2v.bin.tar.gz


100%|██████████| 38121170/38121170 [00:00<00:00, 78147403.56B/s]


downloading https://nlp.h-its.org/bpemb/id/id.wiki.bpe.vs100000.model


100%|██████████| 1959924/1959924 [00:00<00:00, 27402271.62B/s]


downloading https://nlp.h-its.org/bpemb/id/id.wiki.bpe.vs100000.d100.w2v.bin.tar.gz


100%|██████████| 37930291/37930291 [00:00<00:00, 54205474.77B/s]


#### 2.2.2. Model

##### 2.2.2.1. Arabic

###### 2.2.2.1.1. Prepare Data for Model

In [None]:
#parameters
language_ = languages[0]                          # filter language
lstm_dim = 100                                    # dim neural lstm network

# 0. Choose language
datasets_train_filter = datasets_train.filter(lambda dataset: dataset["language"]==language_)
datasets_val_filter = datasets_val.filter(lambda dataset: dataset["language"]==language_)

# 1. pretrain embeddings for each language
tokenizer = bpe_models[language_]

# 2. add index for padding [PAD], END-OF-SENTENCE [EOS]
pretrained_embeddings = np.concatenate([bpe_models[language_].emb.vectors,
                                        np.zeros(shape=(2,dim_))], axis=0)
# Extract the vocab and add extra tokeNS
vocabulary = bpe_models[language_].emb.index_to_key + ['[PAD]', '[EOS]']
PAD_id = vocabulary_
EOS_id = vocabulary_+1

Filter:   0%|          | 0/116067 [00:00<?, ? examples/s]

Filter:   0%|          | 0/13325 [00:00<?, ? examples/s]

In [None]:
print('language:', language_); print('size of hidden layer:',lstm_dim)

language: arabic
size of hidden layer: 100


In [None]:
# 3. Define input format for each row in the neural network
def text_to_batch_bilstm(text: List, tokenizer, max_len=512,
                         id_token_eos = EOS_id) -> Tuple[List, List]:
    """
    Creates a tokenized batch for input to a bilstm model
    :param text: A list of sentences to tokenize
    :param tokenizer: A tokenization function to use (i.e. fasttext)
    :return: Tokenized text as well as the length of the input sequence
    """
    # Some light preprocessing
    input_ids = [tokenizer.encode_ids(t) for t in text]

    return input_ids, [len(ids) for ids in input_ids]


# This will load the dataset and process it
class ClassificationDatasetReader(Dataset):
  def __init__(self, df, tokenizer, column_text_questions, column_text_documents, column_label):
    self.df = df
    self.tokenizer = tokenizer
    self.column_text_questions = column_text_questions
    self.column_text_documents = column_text_documents
    self.column_label = column_label


  def __len__(self):
    return len(self.df)

  def __getitem__(self, idx):
    row = self.df[idx]
    # Calls the text_to_batch function
    input_ids_column_text_questions, seq_lens_column_text_questions = text_to_batch_bilstm(text = [row[self.column_text_questions]],
                                              tokenizer = self.tokenizer)

    input_ids_column_text_documents, seq_lens_column_text_documents = text_to_batch_bilstm(text = [row[self.column_text_documents]],
                                              tokenizer = self.tokenizer)
    label = row[self.column_label]

    return input_ids_column_text_questions, seq_lens_column_text_questions, input_ids_column_text_documents , seq_lens_column_text_documents, label


# 4. Prepare data for pytorch object
# Asumes the output from text_to_batch_bilstm
# 4.1.
def collate_batch_bilstm(input_data: Tuple,
                         id_pad = PAD_id):

    """
    Combines multiple data samples into a single batch
    :param input_data: The combined input_ids, seq_lens, and labels for the batch
    :return: A tuple of tensors (input_ids, seq_lens, labels)
    """
    input_ids_question = [i[0][0] for i in input_data]
    seq_lens_question = [i[1][0] for i in input_data]
    input_ids_document = [i[2][0] for i in input_data]
    seq_lens_document = [i[3][0] for i in input_data]
    labels = [i[4] for i in input_data]

    # Pad all of the input samples to the max length question
    max_length_question = max([len(i) for i in input_ids_question])
    input_ids_question = [(i + [id_pad] * (max_length_question - len(i))) for i in input_ids_question]
    # Pad all of the input samples to the max length document
    max_length_document = max([len(i) for i in input_ids_document])
    input_ids_document = [(i + [id_pad] * (max_length_document - len(i))) for i in input_ids_document]


    # Make sure each sample is max_length long
    assert (all(len(i) == max_length_question for i in input_ids_question))
    assert (all(len(i) == max_length_document for i in input_ids_document))

    return torch.tensor(input_ids_question, dtype=torch.int64), torch.tensor(seq_lens_question, dtype=torch.int64), torch.tensor(input_ids_document, dtype=torch.int64), torch.tensor(seq_lens_document, dtype=torch.int64), torch.tensor(labels)

In [None]:
batch_size = 1
# 5. Prepare how inputs and output for the model

# Create the dataset readers
train_dataset = ClassificationDatasetReader(datasets_train_filter,
                                            tokenizer=tokenizer,
                                            column_text_questions='question_text',
                                            column_text_documents='document_plaintext',
                                            column_label='label')
# dataset loaded lazily with N workers in parallel
train_dl = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_batch_bilstm, num_workers=8)

# Create the dataset readers
val_dataset = ClassificationDatasetReader(datasets_val_filter,
                                            tokenizer=tokenizer,
                                            column_text_questions='question_text',
                                            column_text_documents='document_plaintext',
                                            column_label='label')
# dataset loaded lazily with N workers in parallel
valid_dl = DataLoader(val_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_batch_bilstm, num_workers=8)

###### 2.2.2.1.2. Define Network

In [None]:
# 6. Define the model
class RNN_Question_Document_Binary(nn.Module):
    """
    RNN Question+Document to predict answerable.
    """
    def __init__(self,
            pretrained_embeddings: torch.tensor,
            lstm_dim: int,
            dropout_prob: float = 0.1,
            n_classes: int = 2):
        """
        Initializer for RNN_Question_Document_Binary network
        :param pretrained_embeddings: A tensor containing the pretrained embeddings
        :param lstm_dim: The dimensionality of the LSTM network: Assume same dimension for questions and documents
        :param dropout_prob: Dropout probability
        """
        # First thing is to call the superclass initializer
        super(RNN_Question_Document_Binary, self).__init__()

        # We'll define the network in a ModuleDict, which makes organizing the model a bit nicer
        # The components are an embedding layer, and an LSTM layer.
        self.model = nn.ModuleDict({
            'embeddings': nn.Embedding.from_pretrained(pretrained_embeddings, padding_idx=PAD_id),
             'bilstm_questions': nn.LSTM(
                pretrained_embeddings.shape[1],           # Dim vector input
                lstm_dim,                                 # Dim Hidden Layer
                1,                                        # Not sure
                batch_first=True,                         # Not sure
                dropout=dropout_prob,                     # Dropout hyperparam
                bidirectional=True),                       # Bidirectional
              'bilstm_documents': nn.LSTM(
                pretrained_embeddings.shape[1],           # Dim vector input
                lstm_dim,                                 # Dim Hidden Layer
                1,                                        # Not sure
                batch_first=True,                         # Not sure
                dropout=dropout_prob,                     # Dropout hyperparam
                bidirectional=True),                       # Bidirectional
              'binary': nn.Linear((2*2)*lstm_dim, n_classes)
        })

        self.n_classes = n_classes
        self.dropout = nn.Dropout(p=dropout_prob)

        # Initialize the weights of the model
        self._init_weights()

    def _init_weights(self):

        all_params_questions = list(self.model['bilstm_questions'].named_parameters())
        for n, p in all_params_questions:
            if 'weight' in n:
                nn.init.xavier_normal_(p)
            elif 'bias' in n:
                nn.init.zeros_(p)

        all_params_documents = list(self.model['bilstm_documents'].named_parameters())
        for n, p in all_params_documents:
            if 'weight' in n:
                nn.init.xavier_normal_(p)
            elif 'bias' in n:
                nn.init.zeros_(p)


    def forward(self, inputs_questions, input_lens_questions, inputs_documents, input_lens_documents, labels=None):
        """
        Defines how tensors flow through the model
        :param inputs: (b x sl) The IDs into the vocabulary of the input samples
        :param input_lens: (b) The length of each input sequence
        :return: (lstm output state, lstm hidden state)
        """
        embeds_questions = self.model['embeddings'](inputs_questions)
        lstm_in_questions = nn.utils.rnn.pack_padded_sequence(
                    embeds_questions,
                    input_lens_questions.cpu(),
                    batch_first=True,
                    enforce_sorted=False)

        lstm_in_questions_2 = lstm_in_questions.data.to(torch.float32)  # Convert data to torch.float32
        lstm_out_questions, (h_questions, c_questions) = self.model['bilstm_questions'](lstm_in_questions_2)


        embeds_documents = self.model['embeddings'](inputs_documents)
        lstm_in_documents = nn.utils.rnn.pack_padded_sequence(
                    embeds_documents,
                    input_lens_documents.cpu(),
                    batch_first=True,
                    enforce_sorted=False)

        lstm_in_documents_2 = lstm_in_documents.data.to(torch.float32)  # Convert data to torch.float32
        lstm_out_documents, (h_documents, c_documents) = self.model['bilstm_documents'](lstm_in_documents_2)


        # Max pool along the last dimension
        features_questions = self.dropout(torch.max(lstm_out_questions, 0)[0])
        features_documents = self.dropout(torch.max(lstm_out_documents, 0)[0])

        ff_in = torch.cat((features_questions, features_documents))
        # Get logits (b x n_classes)
        logits = self.model['binary'](ff_in).view(-1, self.n_classes)
        outputs = (logits,)

        if labels is not None:
            # Xentropy loss
            loss_fn = nn.CrossEntropyLoss()
            loss = loss_fn(logits, labels)
            outputs = (loss,) + outputs

        return outputs


question_document = RNN_Question_Document_Binary(
    pretrained_embeddings=torch.from_numpy(pretrained_embeddings),
    lstm_dim=100,
    n_classes=2
).to(device)

###### 2.2.2.1.3. Train

In [None]:
def train(
    model: nn.Module,
    train_dl: DataLoader,
    valid_dl: DataLoader,
    optimizer: torch.optim.Optimizer,
    n_epochs: int,
    device: torch.device,
    patience: int = 10
):
  """
  The main training loop which will optimize a given model on a given dataset
  :param model: The model being optimized
  :param train_dl: The training dataset
  :param valid_dl: A validation dataset
  :param optimizer: The optimizer used to update the model parameters
  :param n_epochs: Number of epochs to train for
  :param device: The device to train on
  :return: (model, losses) The best model and the losses per iteration
  """

  # Keep track of the loss and best accuracy
  losses = []
  best_acc = 0.0
  pcounter = 0

  # Iterate through epochs
  for ep in range(n_epochs):

    loss_epoch = []

    #Iterate through each batch in the dataloader
    for batch in tqdm(train_dl):
      # VERY IMPORTANT: Make sure the model is in training mode, which turns on
      # things like dropout and layer normalization
      model.train()

      # VERY IMPORTANT: zero out all of the gradients on each iteration -- PyTorch
      # keeps track of these dynamically in its computation graph so you need to explicitly
      # zero them out
      optimizer.zero_grad()

      # Place each tensor on the GPU
      batch = tuple(t.to(device) for t in batch)
      inputs_questions = batch[0]
      input_lens_questions = batch[1]
      inputs_documents = batch[2]
      input_lens_documents = batch[3]
      labels = batch[4]

      # Pass the inputs through the model, get the current loss and logits
      loss, logits = model(inputs_questions, input_lens_questions, inputs_documents,
                           input_lens_documents, labels=labels)
      losses.append(loss.item())
      loss_epoch.append(loss.item())
      # Calculate all of the gradients and weight updates for the model
      loss.backward()

      # Optional: clip gradients
      #torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

      # Finally, update the weights of the model
      optimizer.step()

    print(f'Train loss: {sum(loss_epoch) / len(loss_epoch)}')

      #gc.collect()

  return model, losses

In [None]:
from torch.optim import Adam
# Define some hyperparameters
lr = 3e-4
n_epochs = 10

# Create the optimizer
optimizer = Adam(question_document.parameters(), lr=lr)

# Train
model_train, losses = train(question_document, train_dl, valid_dl, optimizer, n_epochs, device)

100%|██████████| 29598/29598 [04:49<00:00, 102.29it/s]


Train loss: 0.40269424654862085


100%|██████████| 29598/29598 [04:50<00:00, 102.04it/s]


Train loss: 0.34386933894684457


100%|██████████| 29598/29598 [04:43<00:00, 104.45it/s]


Train loss: 0.3179626580098798


100%|██████████| 29598/29598 [04:32<00:00, 108.53it/s]


Train loss: 0.28961997509049553


100%|██████████| 29598/29598 [04:33<00:00, 108.28it/s]


Train loss: 0.2576534286151785


100%|██████████| 29598/29598 [04:33<00:00, 108.38it/s]


Train loss: 0.22742061726965804


100%|██████████| 29598/29598 [04:35<00:00, 107.51it/s]


Train loss: 0.1974950110497442


100%|██████████| 29598/29598 [04:33<00:00, 108.16it/s]


Train loss: 0.1707309804479296


100%|██████████| 29598/29598 [04:40<00:00, 105.53it/s]


Train loss: 0.14584261887973837


100%|██████████| 29598/29598 [04:41<00:00, 105.16it/s]

Train loss: 0.1261566958168652





In [None]:
from google.colab import drive
drive.mount('/content/drive')
PATH = "Week 38/LSTMS TRAINED MODELS/model_lstm_binary_arabic"
torch.save(model_train,PATH)

Mounted at /content/drive


###### 2.2.2.1.4. Load Train Model

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
PATH = "Week 38/LSTMS TRAINED MODELS/model_lstm_binary_arabic"

# Check that the network class is defined
model_train = torch.load(PATH)


###### 2.2.2.1.5. Evaluate

In [None]:
predictions = []
labels = []

with torch.no_grad(): # for evaluation we don't backpropagate and update weights anymore
    for batch_index, (inputs1,inputs2,inputs3,inputs4, targets) in enumerate(valid_dl):
        outputs = model_train.forward(inputs1.to(device),
                                      inputs2.to(device),
                                      inputs3.to(device),
                                      inputs4.to(device)
                                      ) # apply softmax to get probabilities/logits
        # getting the indices of the logit with the highest value, which corresponds to the predicted class (as labels 0, 1)
        indices = torch.max(outputs[0],1).indices.item()
        # accumulating the predictions
        predictions.append(indices)
        # accumulating the true labels
        labels += targets.tolist()

In [None]:
print('f1-score',f1_score(predictions, labels))
print('accuracy-score',accuracy_score(predictions, labels))

f1-score 0.862137862137862
accuracy-score 0.8548895899053628


##### 2.2.2.2. Bengali

###### 2.2.2.2.1. Prepare Data for Model

In [None]:
#parameters
language_ = languages[1]                          # filter language
lstm_dim = 100                                    # dim neural lstm network

# 0. Choose language
datasets_train_filter = datasets_train.filter(lambda dataset: dataset["language"]==language_)
datasets_val_filter = datasets_val.filter(lambda dataset: dataset["language"]==language_)

# 1. pretrain embeddings for each language
tokenizer = bpe_models[language_]

# 2. add index for padding [PAD], END-OF-SENTENCE [EOS]
pretrained_embeddings = np.concatenate([bpe_models[language_].emb.vectors,
                                        np.zeros(shape=(2,dim_))], axis=0)
# Extract the vocab and add extra tokeNS
vocabulary = bpe_models[language_].emb.index_to_key + ['[PAD]', '[EOS]']
PAD_id = vocabulary_
EOS_id = vocabulary_+1

In [None]:
print('language:', language_); print('size of hidden layer:',lstm_dim)

language: bengali
size of hidden layer: 100


In [None]:
# 3. Define input format for each row in the neural network
def text_to_batch_bilstm(text: List, tokenizer, max_len=512,
                         id_token_eos = EOS_id) -> Tuple[List, List]:
    """
    Creates a tokenized batch for input to a bilstm model
    :param text: A list of sentences to tokenize
    :param tokenizer: A tokenization function to use (i.e. fasttext)
    :return: Tokenized text as well as the length of the input sequence
    """
    # Some light preprocessing
    input_ids = [tokenizer.encode_ids(t) for t in text]

    return input_ids, [len(ids) for ids in input_ids]


# This will load the dataset and process it
class ClassificationDatasetReader(Dataset):
  def __init__(self, df, tokenizer, column_text_questions, column_text_documents, column_label):
    self.df = df
    self.tokenizer = tokenizer
    self.column_text_questions = column_text_questions
    self.column_text_documents = column_text_documents
    self.column_label = column_label


  def __len__(self):
    return len(self.df)

  def __getitem__(self, idx):
    row = self.df[idx]
    # Calls the text_to_batch function
    input_ids_column_text_questions, seq_lens_column_text_questions = text_to_batch_bilstm(text = [row[self.column_text_questions]],
                                              tokenizer = self.tokenizer)

    input_ids_column_text_documents, seq_lens_column_text_documents = text_to_batch_bilstm(text = [row[self.column_text_documents]],
                                              tokenizer = self.tokenizer)
    label = row[self.column_label]

    return input_ids_column_text_questions, seq_lens_column_text_questions, input_ids_column_text_documents , seq_lens_column_text_documents, label


# 4. Prepare data for pytorch object
# Asumes the output from text_to_batch_bilstm
# 4.1.
def collate_batch_bilstm(input_data: Tuple,
                         id_pad = PAD_id):

    """
    Combines multiple data samples into a single batch
    :param input_data: The combined input_ids, seq_lens, and labels for the batch
    :return: A tuple of tensors (input_ids, seq_lens, labels)
    """
    input_ids_question = [i[0][0] for i in input_data]
    seq_lens_question = [i[1][0] for i in input_data]
    input_ids_document = [i[2][0] for i in input_data]
    seq_lens_document = [i[3][0] for i in input_data]
    labels = [i[4] for i in input_data]

    # Pad all of the input samples to the max length question
    max_length_question = max([len(i) for i in input_ids_question])
    input_ids_question = [(i + [id_pad] * (max_length_question - len(i))) for i in input_ids_question]
    # Pad all of the input samples to the max length document
    max_length_document = max([len(i) for i in input_ids_document])
    input_ids_document = [(i + [id_pad] * (max_length_document - len(i))) for i in input_ids_document]


    # Make sure each sample is max_length long
    assert (all(len(i) == max_length_question for i in input_ids_question))
    assert (all(len(i) == max_length_document for i in input_ids_document))

    return torch.tensor(input_ids_question, dtype=torch.int64), torch.tensor(seq_lens_question, dtype=torch.int64), torch.tensor(input_ids_document, dtype=torch.int64), torch.tensor(seq_lens_document, dtype=torch.int64), torch.tensor(labels)

In [None]:
batch_size = 1
# 5. Prepare how inputs and output for the model

# Create the dataset readers
train_dataset = ClassificationDatasetReader(datasets_train_filter,
                                            tokenizer=tokenizer,
                                            column_text_questions='question_text',
                                            column_text_documents='document_plaintext',
                                            column_label='label')
# dataset loaded lazily with N workers in parallel
train_dl = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_batch_bilstm, num_workers=8)

# Create the dataset readers
val_dataset = ClassificationDatasetReader(datasets_val_filter,
                                            tokenizer=tokenizer,
                                            column_text_questions='question_text',
                                            column_text_documents='document_plaintext',
                                            column_label='label')
# dataset loaded lazily with N workers in parallel
valid_dl = DataLoader(val_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_batch_bilstm, num_workers=8)

###### 2.2.2.2.2. Define Network

In [None]:
# 6. Define the model
class RNN_Question_Document_Binary(nn.Module):
    """
    RNN Question+Document to predict answerable.
    """
    def __init__(self,
            pretrained_embeddings: torch.tensor,
            lstm_dim: int,
            dropout_prob: float = 0.1,
            n_classes: int = 2):
        """
        Initializer for RNN_Question_Document_Binary network
        :param pretrained_embeddings: A tensor containing the pretrained embeddings
        :param lstm_dim: The dimensionality of the LSTM network: Assume same dimension for questions and documents
        :param dropout_prob: Dropout probability
        """
        # First thing is to call the superclass initializer
        super(RNN_Question_Document_Binary, self).__init__()

        # We'll define the network in a ModuleDict, which makes organizing the model a bit nicer
        # The components are an embedding layer, and an LSTM layer.
        self.model = nn.ModuleDict({
            'embeddings': nn.Embedding.from_pretrained(pretrained_embeddings, padding_idx=PAD_id),
             'bilstm_questions': nn.LSTM(
                pretrained_embeddings.shape[1],           # Dim vector input
                lstm_dim,                                 # Dim Hidden Layer
                1,                                        # Not sure
                batch_first=True,                         # Not sure
                dropout=dropout_prob,                     # Dropout hyperparam
                bidirectional=True),                       # Bidirectional
              'bilstm_documents': nn.LSTM(
                pretrained_embeddings.shape[1],           # Dim vector input
                lstm_dim,                                 # Dim Hidden Layer
                1,                                        # Not sure
                batch_first=True,                         # Not sure
                dropout=dropout_prob,                     # Dropout hyperparam
                bidirectional=True),                       # Bidirectional
              'binary': nn.Linear((2*2)*lstm_dim, n_classes)
        })

        self.n_classes = n_classes
        self.dropout = nn.Dropout(p=dropout_prob)

        # Initialize the weights of the model
        self._init_weights()

    def _init_weights(self):

        all_params_questions = list(self.model['bilstm_questions'].named_parameters())
        for n, p in all_params_questions:
            if 'weight' in n:
                nn.init.xavier_normal_(p)
            elif 'bias' in n:
                nn.init.zeros_(p)

        all_params_documents = list(self.model['bilstm_documents'].named_parameters())
        for n, p in all_params_documents:
            if 'weight' in n:
                nn.init.xavier_normal_(p)
            elif 'bias' in n:
                nn.init.zeros_(p)


    def forward(self, inputs_questions, input_lens_questions, inputs_documents, input_lens_documents, labels=None):
        """
        Defines how tensors flow through the model
        :param inputs: (b x sl) The IDs into the vocabulary of the input samples
        :param input_lens: (b) The length of each input sequence
        :return: (lstm output state, lstm hidden state)
        """
        embeds_questions = self.model['embeddings'](inputs_questions)
        lstm_in_questions = nn.utils.rnn.pack_padded_sequence(
                    embeds_questions,
                    input_lens_questions.cpu(),
                    batch_first=True,
                    enforce_sorted=False)

        lstm_in_questions_2 = lstm_in_questions.data.to(torch.float32)  # Convert data to torch.float32
        lstm_out_questions, (h_questions, c_questions) = self.model['bilstm_questions'](lstm_in_questions_2)


        embeds_documents = self.model['embeddings'](inputs_documents)
        lstm_in_documents = nn.utils.rnn.pack_padded_sequence(
                    embeds_documents,
                    input_lens_documents.cpu(),
                    batch_first=True,
                    enforce_sorted=False)

        lstm_in_documents_2 = lstm_in_documents.data.to(torch.float32)  # Convert data to torch.float32
        lstm_out_documents, (h_documents, c_documents) = self.model['bilstm_documents'](lstm_in_documents_2)


        # Max pool along the last dimension
        features_questions = self.dropout(torch.max(lstm_out_questions, 0)[0])
        features_documents = self.dropout(torch.max(lstm_out_documents, 0)[0])

        ff_in = torch.cat((features_questions, features_documents))
        # Get logits (b x n_classes)
        logits = self.model['binary'](ff_in).view(-1, self.n_classes)
        outputs = (logits,)

        if labels is not None:
            # Xentropy loss
            loss_fn = nn.CrossEntropyLoss()
            loss = loss_fn(logits, labels)
            outputs = (loss,) + outputs

        return outputs


question_document = RNN_Question_Document_Binary(
    pretrained_embeddings=torch.from_numpy(pretrained_embeddings),
    lstm_dim=100,
    n_classes=2
).to(device)

###### 2.2.2.2.3. Train

In [None]:
def train(
    model: nn.Module,
    train_dl: DataLoader,
    valid_dl: DataLoader,
    optimizer: torch.optim.Optimizer,
    n_epochs: int,
    device: torch.device,
    patience: int = 10
):
  """
  The main training loop which will optimize a given model on a given dataset
  :param model: The model being optimized
  :param train_dl: The training dataset
  :param valid_dl: A validation dataset
  :param optimizer: The optimizer used to update the model parameters
  :param n_epochs: Number of epochs to train for
  :param device: The device to train on
  :return: (model, losses) The best model and the losses per iteration
  """

  # Keep track of the loss and best accuracy
  losses = []
  best_acc = 0.0
  pcounter = 0

  # Iterate through epochs
  for ep in range(n_epochs):

    loss_epoch = []

    #Iterate through each batch in the dataloader
    for batch in tqdm(train_dl):
      # VERY IMPORTANT: Make sure the model is in training mode, which turns on
      # things like dropout and layer normalization
      model.train()

      # VERY IMPORTANT: zero out all of the gradients on each iteration -- PyTorch
      # keeps track of these dynamically in its computation graph so you need to explicitly
      # zero them out
      optimizer.zero_grad()

      # Place each tensor on the GPU
      batch = tuple(t.to(device) for t in batch)
      inputs_questions = batch[0]
      input_lens_questions = batch[1]
      inputs_documents = batch[2]
      input_lens_documents = batch[3]
      labels = batch[4]

      # Pass the inputs through the model, get the current loss and logits
      loss, logits = model(inputs_questions, input_lens_questions, inputs_documents,
                           input_lens_documents, labels=labels)
      losses.append(loss.item())
      loss_epoch.append(loss.item())
      # Calculate all of the gradients and weight updates for the model
      loss.backward()

      # Optional: clip gradients
      #torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

      # Finally, update the weights of the model
      optimizer.step()

    print(f'Train loss: {sum(loss_epoch) / len(loss_epoch)}')

      #gc.collect()

  return model, losses

In [None]:
from torch.optim import Adam
# Define some hyperparameters
lr = 3e-4
n_epochs = 10

# Create the optimizer
optimizer = Adam(question_document.parameters(), lr=lr)

# Train
model_train, losses = train(question_document, train_dl, valid_dl, optimizer, n_epochs, device)

100%|██████████| 4779/4779 [00:48<00:00, 98.53it/s]


Train loss: 0.5869037140533796


100%|██████████| 4779/4779 [00:48<00:00, 97.86it/s] 


Train loss: 0.5063614803760504


100%|██████████| 4779/4779 [00:45<00:00, 104.22it/s]


Train loss: 0.4678498652982927


100%|██████████| 4779/4779 [00:46<00:00, 101.81it/s]


Train loss: 0.4298292798539535


100%|██████████| 4779/4779 [00:46<00:00, 103.24it/s]


Train loss: 0.38646440335138277


100%|██████████| 4779/4779 [00:45<00:00, 106.12it/s]


Train loss: 0.3409546113178895


100%|██████████| 4779/4779 [00:48<00:00, 98.35it/s] 


Train loss: 0.2974717969959672


100%|██████████| 4779/4779 [00:45<00:00, 105.06it/s]


Train loss: 0.2572987467051835


100%|██████████| 4779/4779 [00:45<00:00, 104.95it/s]


Train loss: 0.21899424647916949


100%|██████████| 4779/4779 [00:46<00:00, 102.04it/s]

Train loss: 0.19556437589016545





In [None]:
from google.colab import drive
drive.mount('/content/drive')
PATH = "Week 38/LSTMS TRAINED MODELS/model_lstm_binary_bengali"
torch.save(model_train,PATH)

Mounted at /content/drive


###### 2.2.2.2.4. Load Train Model

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
PATH = "Week 38/LSTMS TRAINED MODELS/model_lstm_binary_bengali"

# Check that the network class is defined
model_train = torch.load(PATH)


###### 2.2.2.2.5. Evaluate

In [None]:
predictions = []
labels = []

with torch.no_grad(): # for evaluation we don't backpropagate and update weights anymore
    for batch_index, (inputs1,inputs2,inputs3,inputs4, targets) in enumerate(valid_dl):
        outputs = model_train.forward(inputs1.to(device),
                                      inputs2.to(device),
                                      inputs3.to(device),
                                      inputs4.to(device)
                                      ) # apply softmax to get probabilities/logits
        # getting the indices of the logit with the highest value, which corresponds to the predicted class (as labels 0, 1)
        indices = torch.max(outputs[0],1).indices.item()
        # accumulating the predictions
        predictions.append(indices)
        # accumulating the true labels
        labels += targets.tolist()

In [None]:
print('f1-score',f1_score(predictions, labels))
print('accuracy-score',accuracy_score(predictions, labels))

f1-score 0.7203791469194313
accuracy-score 0.7366071428571429


##### 2.2.2.3. Indonesian

###### 2.2.2.3.1. Prepare Data for Model

In [None]:
#parameters
language_ = languages[2]                          # filter language
lstm_dim = 100                                    # dim neural lstm network

# 0. Choose language
datasets_train_filter = datasets_train.filter(lambda dataset: dataset["language"]==language_)
datasets_val_filter = datasets_val.filter(lambda dataset: dataset["language"]==language_)

# 1. pretrain embeddings for each language
tokenizer = bpe_models[language_]

# 2. add index for padding [PAD], END-OF-SENTENCE [EOS]
pretrained_embeddings = np.concatenate([bpe_models[language_].emb.vectors,
                                        np.zeros(shape=(2,dim_))], axis=0)
# Extract the vocab and add extra tokeNS
vocabulary = bpe_models[language_].emb.index_to_key + ['[PAD]', '[EOS]']
PAD_id = vocabulary_
EOS_id = vocabulary_+1

Filter:   0%|          | 0/116067 [00:00<?, ? examples/s]

Filter:   0%|          | 0/13325 [00:00<?, ? examples/s]

In [None]:
print('language:', language_); print('size of hidden layer:',lstm_dim)

language: indonesian
size of hidden layer: 100


In [None]:
# 3. Define input format for each row in the neural network
def text_to_batch_bilstm(text: List, tokenizer, max_len=512,
                         id_token_eos = EOS_id) -> Tuple[List, List]:
    """
    Creates a tokenized batch for input to a bilstm model
    :param text: A list of sentences to tokenize
    :param tokenizer: A tokenization function to use (i.e. fasttext)
    :return: Tokenized text as well as the length of the input sequence
    """
    # Some light preprocessing
    input_ids = [tokenizer.encode_ids(t) for t in text]

    return input_ids, [len(ids) for ids in input_ids]


# This will load the dataset and process it
class ClassificationDatasetReader(Dataset):
  def __init__(self, df, tokenizer, column_text_questions, column_text_documents, column_label):
    self.df = df
    self.tokenizer = tokenizer
    self.column_text_questions = column_text_questions
    self.column_text_documents = column_text_documents
    self.column_label = column_label


  def __len__(self):
    return len(self.df)

  def __getitem__(self, idx):
    row = self.df[idx]
    # Calls the text_to_batch function
    input_ids_column_text_questions, seq_lens_column_text_questions = text_to_batch_bilstm(text = [row[self.column_text_questions]],
                                              tokenizer = self.tokenizer)

    input_ids_column_text_documents, seq_lens_column_text_documents = text_to_batch_bilstm(text = [row[self.column_text_documents]],
                                              tokenizer = self.tokenizer)
    label = row[self.column_label]

    return input_ids_column_text_questions, seq_lens_column_text_questions, input_ids_column_text_documents , seq_lens_column_text_documents, label


# 4. Prepare data for pytorch object
# Asumes the output from text_to_batch_bilstm
# 4.1.
def collate_batch_bilstm(input_data: Tuple,
                         id_pad = PAD_id):

    """
    Combines multiple data samples into a single batch
    :param input_data: The combined input_ids, seq_lens, and labels for the batch
    :return: A tuple of tensors (input_ids, seq_lens, labels)
    """
    input_ids_question = [i[0][0] for i in input_data]
    seq_lens_question = [i[1][0] for i in input_data]
    input_ids_document = [i[2][0] for i in input_data]
    seq_lens_document = [i[3][0] for i in input_data]
    labels = [i[4] for i in input_data]

    # Pad all of the input samples to the max length question
    max_length_question = max([len(i) for i in input_ids_question])
    input_ids_question = [(i + [id_pad] * (max_length_question - len(i))) for i in input_ids_question]
    # Pad all of the input samples to the max length document
    max_length_document = max([len(i) for i in input_ids_document])
    input_ids_document = [(i + [id_pad] * (max_length_document - len(i))) for i in input_ids_document]


    # Make sure each sample is max_length long
    assert (all(len(i) == max_length_question for i in input_ids_question))
    assert (all(len(i) == max_length_document for i in input_ids_document))

    return torch.tensor(input_ids_question, dtype=torch.int64), torch.tensor(seq_lens_question, dtype=torch.int64), torch.tensor(input_ids_document, dtype=torch.int64), torch.tensor(seq_lens_document, dtype=torch.int64), torch.tensor(labels)

In [None]:
batch_size = 1
# 5. Prepare how inputs and output for the model

# Create the dataset readers
train_dataset = ClassificationDatasetReader(datasets_train_filter,
                                            tokenizer=tokenizer,
                                            column_text_questions='question_text',
                                            column_text_documents='document_plaintext',
                                            column_label='label')
# dataset loaded lazily with N workers in parallel
train_dl = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_batch_bilstm, num_workers=8)

# Create the dataset readers
val_dataset = ClassificationDatasetReader(datasets_val_filter,
                                            tokenizer=tokenizer,
                                            column_text_questions='question_text',
                                            column_text_documents='document_plaintext',
                                            column_label='label')
# dataset loaded lazily with N workers in parallel
valid_dl = DataLoader(val_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_batch_bilstm, num_workers=8)

###### 2.2.2.3.2. Define Network

In [None]:
# 6. Define the model
class RNN_Question_Document_Binary(nn.Module):
    """
    RNN Question+Document to predict answerable.
    """
    def __init__(self,
            pretrained_embeddings: torch.tensor,
            lstm_dim: int,
            dropout_prob: float = 0.1,
            n_classes: int = 2):
        """
        Initializer for RNN_Question_Document_Binary network
        :param pretrained_embeddings: A tensor containing the pretrained embeddings
        :param lstm_dim: The dimensionality of the LSTM network: Assume same dimension for questions and documents
        :param dropout_prob: Dropout probability
        """
        # First thing is to call the superclass initializer
        super(RNN_Question_Document_Binary, self).__init__()

        # We'll define the network in a ModuleDict, which makes organizing the model a bit nicer
        # The components are an embedding layer, and an LSTM layer.
        self.model = nn.ModuleDict({
            'embeddings': nn.Embedding.from_pretrained(pretrained_embeddings, padding_idx=PAD_id),
             'bilstm_questions': nn.LSTM(
                pretrained_embeddings.shape[1],           # Dim vector input
                lstm_dim,                                 # Dim Hidden Layer
                1,                                        # Not sure
                batch_first=True,                         # Not sure
                dropout=dropout_prob,                     # Dropout hyperparam
                bidirectional=True),                       # Bidirectional
              'bilstm_documents': nn.LSTM(
                pretrained_embeddings.shape[1],           # Dim vector input
                lstm_dim,                                 # Dim Hidden Layer
                1,                                        # Not sure
                batch_first=True,                         # Not sure
                dropout=dropout_prob,                     # Dropout hyperparam
                bidirectional=True),                       # Bidirectional
              'binary': nn.Linear((2*2)*lstm_dim, n_classes)
        })

        self.n_classes = n_classes
        self.dropout = nn.Dropout(p=dropout_prob)

        # Initialize the weights of the model
        self._init_weights()

    def _init_weights(self):

        all_params_questions = list(self.model['bilstm_questions'].named_parameters())
        for n, p in all_params_questions:
            if 'weight' in n:
                nn.init.xavier_normal_(p)
            elif 'bias' in n:
                nn.init.zeros_(p)

        all_params_documents = list(self.model['bilstm_documents'].named_parameters())
        for n, p in all_params_documents:
            if 'weight' in n:
                nn.init.xavier_normal_(p)
            elif 'bias' in n:
                nn.init.zeros_(p)


    def forward(self, inputs_questions, input_lens_questions, inputs_documents, input_lens_documents, labels=None):
        """
        Defines how tensors flow through the model
        :param inputs: (b x sl) The IDs into the vocabulary of the input samples
        :param input_lens: (b) The length of each input sequence
        :return: (lstm output state, lstm hidden state)
        """
        embeds_questions = self.model['embeddings'](inputs_questions)
        lstm_in_questions = nn.utils.rnn.pack_padded_sequence(
                    embeds_questions,
                    input_lens_questions.cpu(),
                    batch_first=True,
                    enforce_sorted=False)

        lstm_in_questions_2 = lstm_in_questions.data.to(torch.float32)  # Convert data to torch.float32
        lstm_out_questions, (h_questions, c_questions) = self.model['bilstm_questions'](lstm_in_questions_2)


        embeds_documents = self.model['embeddings'](inputs_documents)
        lstm_in_documents = nn.utils.rnn.pack_padded_sequence(
                    embeds_documents,
                    input_lens_documents.cpu(),
                    batch_first=True,
                    enforce_sorted=False)

        lstm_in_documents_2 = lstm_in_documents.data.to(torch.float32)  # Convert data to torch.float32
        lstm_out_documents, (h_documents, c_documents) = self.model['bilstm_documents'](lstm_in_documents_2)


        # Max pool along the last dimension
        features_questions = self.dropout(torch.max(lstm_out_questions, 0)[0])
        features_documents = self.dropout(torch.max(lstm_out_documents, 0)[0])

        ff_in = torch.cat((features_questions, features_documents))
        # Get logits (b x n_classes)
        logits = self.model['binary'](ff_in).view(-1, self.n_classes)
        outputs = (logits,)

        if labels is not None:
            # Xentropy loss
            loss_fn = nn.CrossEntropyLoss()
            loss = loss_fn(logits, labels)
            outputs = (loss,) + outputs

        return outputs


question_document = RNN_Question_Document_Binary(
    pretrained_embeddings=torch.from_numpy(pretrained_embeddings),
    lstm_dim=100,
    n_classes=2
).to(device)

###### 2.2.2.3.3. Train

In [None]:
def train(
    model: nn.Module,
    train_dl: DataLoader,
    valid_dl: DataLoader,
    optimizer: torch.optim.Optimizer,
    n_epochs: int,
    device: torch.device,
    patience: int = 10
):
  """
  The main training loop which will optimize a given model on a given dataset
  :param model: The model being optimized
  :param train_dl: The training dataset
  :param valid_dl: A validation dataset
  :param optimizer: The optimizer used to update the model parameters
  :param n_epochs: Number of epochs to train for
  :param device: The device to train on
  :return: (model, losses) The best model and the losses per iteration
  """

  # Keep track of the loss and best accuracy
  losses = []
  best_acc = 0.0
  pcounter = 0

  # Iterate through epochs
  for ep in range(n_epochs):

    loss_epoch = []

    #Iterate through each batch in the dataloader
    for batch in tqdm(train_dl):
      # VERY IMPORTANT: Make sure the model is in training mode, which turns on
      # things like dropout and layer normalization
      model.train()

      # VERY IMPORTANT: zero out all of the gradients on each iteration -- PyTorch
      # keeps track of these dynamically in its computation graph so you need to explicitly
      # zero them out
      optimizer.zero_grad()

      # Place each tensor on the GPU
      batch = tuple(t.to(device) for t in batch)
      inputs_questions = batch[0]
      input_lens_questions = batch[1]
      inputs_documents = batch[2]
      input_lens_documents = batch[3]
      labels = batch[4]

      # Pass the inputs through the model, get the current loss and logits
      loss, logits = model(inputs_questions, input_lens_questions, inputs_documents,
                           input_lens_documents, labels=labels)
      losses.append(loss.item())
      loss_epoch.append(loss.item())
      # Calculate all of the gradients and weight updates for the model
      loss.backward()

      # Optional: clip gradients
      #torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

      # Finally, update the weights of the model
      optimizer.step()

    print(f'Train loss: {sum(loss_epoch) / len(loss_epoch)}')

      #gc.collect()

  return model, losses

In [None]:
from torch.optim import Adam
# Define some hyperparameters
lr = 3e-4
n_epochs = 10

# Create the optimizer
optimizer = Adam(question_document.parameters(), lr=lr)

# Train
model_train, losses = train(question_document, train_dl, valid_dl, optimizer, n_epochs, device)

100%|██████████| 4779/4779 [00:48<00:00, 98.53it/s]


Train loss: 0.5869037140533796


100%|██████████| 4779/4779 [00:48<00:00, 97.86it/s] 


Train loss: 0.5063614803760504


100%|██████████| 4779/4779 [00:45<00:00, 104.22it/s]


Train loss: 0.4678498652982927


100%|██████████| 4779/4779 [00:46<00:00, 101.81it/s]


Train loss: 0.4298292798539535


100%|██████████| 4779/4779 [00:46<00:00, 103.24it/s]


Train loss: 0.38646440335138277


100%|██████████| 4779/4779 [00:45<00:00, 106.12it/s]


Train loss: 0.3409546113178895


100%|██████████| 4779/4779 [00:48<00:00, 98.35it/s] 


Train loss: 0.2974717969959672


100%|██████████| 4779/4779 [00:45<00:00, 105.06it/s]


Train loss: 0.2572987467051835


100%|██████████| 4779/4779 [00:45<00:00, 104.95it/s]


Train loss: 0.21899424647916949


100%|██████████| 4779/4779 [00:46<00:00, 102.04it/s]

Train loss: 0.19556437589016545





In [None]:
from google.colab import drive
drive.mount('/content/drive')
PATH = "Week 38/LSTMS TRAINED MODELS/model_lstm_binary_indonesian"
torch.save(model_train,PATH)

Mounted at /content/drive


###### 2.2.2.3.4. Load Train Model

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
PATH = "Week 38/LSTMS TRAINED MODELS/model_lstm_binary_indonesian"

# Check that the network class is defined
model_train = torch.load(PATH)


###### 2.2.2.3.5. Evaluate

In [None]:
predictions = []
labels = []

with torch.no_grad(): # for evaluation we don't backpropagate and update weights anymore
    for batch_index, (inputs1,inputs2,inputs3,inputs4, targets) in enumerate(valid_dl):
        outputs = model_train.forward(inputs1.to(device),
                                      inputs2.to(device),
                                      inputs3.to(device),
                                      inputs4.to(device)
                                      ) # apply softmax to get probabilities/logits
        # getting the indices of the logit with the highest value, which corresponds to the predicted class (as labels 0, 1)
        indices = torch.max(outputs[0],1).indices.item()
        # accumulating the predictions
        predictions.append(indices)
        # accumulating the true labels
        labels += targets.tolist()

In [None]:
print('f1-score',f1_score(predictions, labels))
print('accuracy-score',accuracy_score(predictions, labels))

f1-score 0.7416974169741697
accuracy-score 0.7649034424853065


### 2.3. Transformer (BERT)

#### 2.3.1. Pre-processing

In [None]:
# call bert model tokenizer (it was train with arabic, bengali, and indonesian)
checkpoint = "bert-base-multilingual-uncased"                                                   # Name of the model
tokenizer = AutoTokenizer.from_pretrained(checkpoint,                                           # Set up tokenizer associated with the model
                                          num_labels=len(set(datasets_train['label'])))         # Number of binary options

# Define function to tokenize question and documents together
def tokenize_function(dataset_, variable1= 'question_text', variable2= 'document_plaintext'):
    """
    Use together question and document to create the tokenizer object
    that will be input of the model
    - We don't pad here but later in the batches.
    - We truncate as the length of text how the model learnt
    """
    return tokenizer(dataset_["question_text"], dataset_["document_plaintext"], truncation=True, padding="max_length")

# Apply the tokenizer to the questions+document in train and val datasets
datasets_train_tokenize = datasets_train.map(tokenize_function, batched=True)
datasets_val_tokenize = datasets_val.map(tokenize_function, batched=True)

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/625 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/851k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.64M [00:00<?, ?B/s]



  0%|          | 0/117 [00:00<?, ?ba/s]

  0%|          | 0/14 [00:00<?, ?ba/s]

#### 2.3.2. Model

##### 2.3.2.1. Model Setup

In [None]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)                                    # Dinamically Padding
model_bert_pretrain = AutoModelForSequenceClassification.from_pretrained(checkpoint)            # Pre-train model to finetune
model_bert_pretrain.to(device)                                                                  # move to GPU

# Evaluation metric
metric = load_metric("f1")
def compute_metrics(eval_pred):
    outputs, labels = eval_pred
    predictions = np.argmax(outputs, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

Downloading:   0%|          | 0.00/641M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model 

Downloading builder script:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

##### 2.3.2.2. Arabic

###### 2.3.2.2.1. Filter Language

In [None]:
datasets_train_tokenize_filter = datasets_train_tokenize.filter(lambda dataset: dataset["language"]==languages[0])
datasets_val_tokenize_filter = datasets_val_tokenize.filter(lambda dataset: dataset["language"]==languages[0])

  0%|          | 0/117 [00:00<?, ?ba/s]

  0%|          | 0/14 [00:00<?, ?ba/s]

###### 2.3.2.2.2. Train

In [None]:
# path to save the arguments
path=f"/content/drive/MyDrive/{languages[0]}"
training_args = TrainingArguments(output_dir=path,
                                  evaluation_strategy="steps",
                                  num_train_epochs=3.0,
                                  per_device_train_batch_size=16,
                                  eval_steps=500)

In [None]:
# Define the model
trainer_language_1 = Trainer(
    model = model_bert_pretrain,
    args = training_args,
    train_dataset = datasets_train_tokenize_filter,
    eval_dataset = datasets_train_tokenize_filter, # this is a mistake, should be validation set
    data_collator = data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

# Train the model
trainer_language_1.train(path+"/checkpoint-4000")

Loading model from /content/drive/MyDrive/my_trainer/checkpoint-4000).
The following columns in the training set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: document_title, document_plaintext, document_url, question_text, language, annotations. If document_title, document_plaintext, document_url, question_text, language, annotations are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 29598
  Num Epochs = 3
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 5550
  Continuing training from checkpoint, will skip to saved global_step
  Continuing training from epoch 2
  Continuing training from global step 4000
  Will skip the first 2 epochs then the first 300 batches in the first epoch. If this takes a lot of time, y

  0%|          | 0/300 [00:00<?, ?it/s]

Step,Training Loss,Validation Loss,F1
4500,0.1187,0.079874,0.979327
5000,0.1168,0.064046,0.984498
5500,0.1102,0.053517,0.986943


The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: document_title, document_plaintext, document_url, question_text, language, annotations. If document_title, document_plaintext, document_url, question_text, language, annotations are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 29598
  Batch size = 8
Saving model checkpoint to /content/drive/MyDrive/my_trainer/checkpoint-4500
Configuration saved in /content/drive/MyDrive/my_trainer/checkpoint-4500/config.json
Model weights saved in /content/drive/MyDrive/my_trainer/checkpoint-4500/pytorch_model.bin
tokenizer config file saved in /content/drive/MyDrive/my_trainer/checkpoint-4500/tokenizer_config.json
Special tokens file saved in /content/drive/MyDrive/my_trainer/checkpoint-4500/special_tokens_map.json
The following columns in the evaluation set don'

TrainOutput(global_step=5550, training_loss=0.03197795111853797, metrics={'train_runtime': 4445.4309, 'train_samples_per_second': 19.974, 'train_steps_per_second': 1.248, 'total_flos': 2.006756960895492e+16, 'train_loss': 0.03197795111853797, 'epoch': 3.0})

In [None]:
path = "Week 38/TRANSFORMER TRAINED MODELS/Week 38 - BERT - ARABIC"
trainer_language_1.save_model(path)

###### 2.3.2.2.3. Test

In [None]:
# Load Train Model
checkpoint = "Week 38/TRANSFORMER TRAINED MODELS/Week 38 - BERT - ARABIC"
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Send to GPU
model.to(device)

# Define function to tokenize question and documents together
def tokenize_function(dataset_, variable1= 'question_text', variable2= 'document_plaintext'):
    """
    Use together question and document to create the tokenizer object
    that will be input of the model
    - We don't pad here but later in the batches.
    - We truncate as the length of text how the model learnt
    """
    return tokenizer(dataset_["question_text"], dataset_["document_plaintext"], truncation=True, padding="max_length")

In [None]:
datasets_val_filter = datasets_val.filter(lambda dataset: dataset["language"]==languages[0])
datasets_val_tokenize_filter = datasets_val_filter.map(tokenize_function, batched=True,remove_columns = ['question_text',
                                                                                                         'document_title',
                                                                                                         'language',
                                                                                                         'annotations',
                                                                                                         'document_plaintext',
                                                                                                         'document_url'])

  0%|          | 0/14 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

In [None]:
# Data Loader
val_dataloader = DataLoader(
    datasets_val_tokenize_filter, collate_fn=data_collator
    , batch_size=8
)

In [None]:
metric = load_metric("f1")
metric2 = load_metric("accuracy")

for batch in val_dataloader:
    batch = {key: value.to(device) for key, value in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)

    predictions = outputs.logits.argmax(dim=-1)
    labels = batch["labels"]

    metric.add_batch(predictions=predictions, references=labels)
    metric2.add_batch(predictions=predictions, references=labels)

results = metric.compute()
print(results)

results2 = metric2.compute()
print(results2)

Downloading builder script:   0%|          | 0.00/1.65k [00:00<?, ?B/s]

{'f1': 0.9357326478149101}
{'accuracy': 0.9342797055730809}


##### 2.3.2.3. Bengali

###### 2.3.2.3.1. Filter Language

In [None]:
datasets_train_tokenize_filter = datasets_train_tokenize.filter(lambda dataset: dataset["language"]==languages[1])
datasets_val_tokenize_filter = datasets_val_tokenize.filter(lambda dataset: dataset["language"]==languages[1])

  0%|          | 0/117 [00:00<?, ?ba/s]

  0%|          | 0/14 [00:00<?, ?ba/s]

###### 2.3.2.3.2. Train

In [None]:
# path to save the arguments
path=f"/content/drive/MyDrive/{languages[1]}"
training_args = TrainingArguments(output_dir=path,
                                  evaluation_strategy="steps",
                                  num_train_epochs=3.0,
                                  per_device_train_batch_size=16,
                                  eval_steps=500)

In [None]:
# Define the model
trainer_language_1 = Trainer(
    model = model_bert_pretrain,
    args = training_args,
    train_dataset = datasets_train_tokenize_filter,
    eval_dataset = datasets_val_tokenize_filter,
    data_collator = data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

# Train the model
trainer_language_1.train()


The following columns in the training set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: document_url, annotations, document_title, document_plaintext, question_text, language. If document_url, annotations, document_title, document_plaintext, question_text, language are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 4779
  Num Epochs = 3
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 897


Step,Training Loss,Validation Loss,F1
500,0.3494,0.407713,0.890756


The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: document_url, annotations, document_title, document_plaintext, question_text, language. If document_url, annotations, document_title, document_plaintext, question_text, language are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 224
  Batch size = 8
Saving model checkpoint to /content/drive/MyDrive/bengali/checkpoint-500
Configuration saved in /content/drive/MyDrive/bengali/checkpoint-500/config.json
Model weights saved in /content/drive/MyDrive/bengali/checkpoint-500/pytorch_model.bin
tokenizer config file saved in /content/drive/MyDrive/bengali/checkpoint-500/tokenizer_config.json
Special tokens file saved in /content/drive/MyDrive/bengali/checkpoint-500/special_tokens_map.json


Training completed. Do not forget to share your model on huggingface

TrainOutput(global_step=897, training_loss=0.29314036459694204, metrics={'train_runtime': 1353.5194, 'train_samples_per_second': 10.592, 'train_steps_per_second': 0.663, 'total_flos': 3772223200696320.0, 'train_loss': 0.29314036459694204, 'epoch': 3.0})

In [None]:
path = "/content/drive/MyDrive/2Education/University of Copenhagen/3 Semester/Natural Language Processing/NLP Group Work/Week 38/TRANSFORMER TRAINED MODELS/Week 38 - BERT - BENGALI"
trainer_language_1.save_model(path)

Saving model checkpoint to /content/drive/MyDrive/2Education/University of Copenhagen/3 Semester/Natural Language Processing/NLP Group Work/Week 38/TRANSFORMER TRAINED MODELS/Week 38 - BERT - BENGALI
Configuration saved in /content/drive/MyDrive/2Education/University of Copenhagen/3 Semester/Natural Language Processing/NLP Group Work/Week 38/TRANSFORMER TRAINED MODELS/Week 38 - BERT - BENGALI/config.json
Model weights saved in /content/drive/MyDrive/2Education/University of Copenhagen/3 Semester/Natural Language Processing/NLP Group Work/Week 38/TRANSFORMER TRAINED MODELS/Week 38 - BERT - BENGALI/pytorch_model.bin
tokenizer config file saved in /content/drive/MyDrive/2Education/University of Copenhagen/3 Semester/Natural Language Processing/NLP Group Work/Week 38/TRANSFORMER TRAINED MODELS/Week 38 - BERT - BENGALI/tokenizer_config.json
Special tokens file saved in /content/drive/MyDrive/2Education/University of Copenhagen/3 Semester/Natural Language Processing/NLP Group Work/Week 38/TR

###### 2.3.2.3.3. Test

In [None]:
# Load Train Model
checkpoint = "/content/drive/MyDrive/2Education/University of Copenhagen/3 Semester/Natural Language Processing/NLP Group Work/Week 38/TRANSFORMER TRAINED MODELS/Week 38 - BERT - BENGALI"
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Send to GPU
model.to(device)

# Define function to tokenize question and documents together
def tokenize_function(dataset_, variable1= 'question_text', variable2= 'document_plaintext'):
    """
    Use together question and document to create the tokenizer object
    that will be input of the model
    - We don't pad here but later in the batches.
    - We truncate as the length of text how the model learnt
    """
    return tokenizer(dataset_["question_text"], dataset_["document_plaintext"], truncation=True, padding="max_length")

loading configuration file /content/drive/MyDrive/2Education/University of Copenhagen/3 Semester/Natural Language Processing/NLP Group Work/Week 38/TRANSFORMER TRAINED MODELS/Week 38 - BERT - BENGALI/config.json
Model config BertConfig {
  "_name_or_path": "/content/drive/MyDrive/2Education/University of Copenhagen/3 Semester/Natural Language Processing/NLP Group Work/Week 38/TRANSFORMER TRAINED MODELS/Week 38 - BERT - BENGALI",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_hea

In [None]:
datasets_val_filter = datasets_val.filter(lambda dataset: dataset["language"]==languages[1])
datasets_val_tokenize_filter = datasets_val_filter.map(tokenize_function, batched=True,remove_columns = ['question_text',
                                                                                                         'document_title',
                                                                                                         'language',
                                                                                                         'annotations',
                                                                                                         'document_plaintext',
                                                                                                         'document_url'])

  0%|          | 0/14 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [None]:
# Data Loader
val_dataloader = DataLoader(
    datasets_val_tokenize_filter, collate_fn=data_collator
    , batch_size=8
)

In [None]:
metric = load_metric("f1")
metric2 = load_metric("accuracy")

for batch in val_dataloader:
    batch = {key: value.to(device) for key, value in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)

    predictions = outputs.logits.argmax(dim=-1)
    labels = batch["labels"]

    metric.add_batch(predictions=predictions, references=labels)
    metric2.add_batch(predictions=predictions, references=labels)

results = metric.compute()
print(results)

results2 = metric2.compute()
print(results2)

Downloading builder script:   0%|          | 0.00/1.65k [00:00<?, ?B/s]

{'f1': 0.8860759493670887}
{'accuracy': 0.8794642857142857}


##### 2.3.2.4. Indonesian

###### 2.3.2.3.1. Filter Language

In [None]:
datasets_train_tokenize_filter = datasets_train_tokenize.filter(lambda dataset: dataset["language"]==languages[2])
datasets_val_tokenize_filter = datasets_val_tokenize.filter(lambda dataset: dataset["language"]==languages[2])

  0%|          | 0/117 [00:00<?, ?ba/s]

  0%|          | 0/14 [00:00<?, ?ba/s]

###### 2.3.2.3.2. Train

In [None]:
# path to save the arguments
path=f"/content/drive/MyDrive/{languages[2]}"
training_args = TrainingArguments(output_dir=path,
                                  evaluation_strategy="steps",
                                  num_train_epochs=3.0,
                                  per_device_train_batch_size=16,
                                  eval_steps=500)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [None]:
# Define the model
trainer_language_1 = Trainer(
    model = model_bert_pretrain,
    args = training_args,
    train_dataset = datasets_train_tokenize_filter,
    eval_dataset = datasets_train_tokenize_filter, # error should be val
    data_collator = data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

# Train the model
trainer_language_1.train()


The following columns in the training set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: document_url, annotations, document_title, document_plaintext, question_text, language. If document_url, annotations, document_title, document_plaintext, question_text, language are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 11394
  Num Epochs = 3
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 2139


Step,Training Loss,Validation Loss,F1
500,0.3449,0.2362,0.914759
1000,0.2779,0.178111,0.936926
1500,0.2067,0.112943,0.968943
2000,0.1356,0.07837,0.981371


The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: document_url, annotations, document_title, document_plaintext, question_text, language. If document_url, annotations, document_title, document_plaintext, question_text, language are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 11394
  Batch size = 8
Saving model checkpoint to /content/drive/MyDrive/indonesian/checkpoint-500
Configuration saved in /content/drive/MyDrive/indonesian/checkpoint-500/config.json
Model weights saved in /content/drive/MyDrive/indonesian/checkpoint-500/pytorch_model.bin
tokenizer config file saved in /content/drive/MyDrive/indonesian/checkpoint-500/tokenizer_config.json
Special tokens file saved in /content/drive/MyDrive/indonesian/checkpoint-500/special_tokens_map.json
The following columns in the evaluation set don't hav

TrainOutput(global_step=2139, training_loss=0.23586812672519195, metrics={'train_runtime': 4669.6592, 'train_samples_per_second': 7.32, 'train_steps_per_second': 0.458, 'total_flos': 8993662094315520.0, 'train_loss': 0.23586812672519195, 'epoch': 3.0})

In [None]:
path = "/content/drive/MyDrive/2Education/University of Copenhagen/3 Semester/Natural Language Processing/NLP Group Work/Week 38/TRANSFORMER TRAINED MODELS/Week 38 - BERT - INDONESIAN"
trainer_language_1.save_model(path)

Saving model checkpoint to /content/drive/MyDrive/2Education/University of Copenhagen/3 Semester/Natural Language Processing/NLP Group Work/Week 38/TRANSFORMER TRAINED MODELS/Week 38 - BERT - INDONESIAN
Configuration saved in /content/drive/MyDrive/2Education/University of Copenhagen/3 Semester/Natural Language Processing/NLP Group Work/Week 38/TRANSFORMER TRAINED MODELS/Week 38 - BERT - INDONESIAN/config.json
Model weights saved in /content/drive/MyDrive/2Education/University of Copenhagen/3 Semester/Natural Language Processing/NLP Group Work/Week 38/TRANSFORMER TRAINED MODELS/Week 38 - BERT - INDONESIAN/pytorch_model.bin
tokenizer config file saved in /content/drive/MyDrive/2Education/University of Copenhagen/3 Semester/Natural Language Processing/NLP Group Work/Week 38/TRANSFORMER TRAINED MODELS/Week 38 - BERT - INDONESIAN/tokenizer_config.json
Special tokens file saved in /content/drive/MyDrive/2Education/University of Copenhagen/3 Semester/Natural Language Processing/NLP Group Wor

###### 2.3.2.3.3. Test

In [None]:
# Load Train Model
checkpoint = "/content/drive/MyDrive/2Education/University of Copenhagen/3 Semester/Natural Language Processing/NLP Group Work/Week 38/TRANSFORMER TRAINED MODELS/Week 38 - BERT - INDONESIAN"
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Send to GPU
model.to(device)

# Define function to tokenize question and documents together
def tokenize_function(dataset_, variable1= 'question_text', variable2= 'document_plaintext'):
    """
    Use together question and document to create the tokenizer object
    that will be input of the model
    - We don't pad here but later in the batches.
    - We truncate as the length of text how the model learnt
    """
    return tokenizer(dataset_["question_text"], dataset_["document_plaintext"], truncation=True, padding="max_length")

loading configuration file /content/drive/MyDrive/2Education/University of Copenhagen/3 Semester/Natural Language Processing/NLP Group Work/Week 38/TRANSFORMER TRAINED MODELS/Week 38 - BERT - INDONESIAN/config.json
Model config BertConfig {
  "_name_or_path": "/content/drive/MyDrive/2Education/University of Copenhagen/3 Semester/Natural Language Processing/NLP Group Work/Week 38/TRANSFORMER TRAINED MODELS/Week 38 - BERT - INDONESIAN",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_p

In [None]:
datasets_val_filter = datasets_val.filter(lambda dataset: dataset["language"]==languages[2])
datasets_val_tokenize_filter = datasets_val_filter.map(tokenize_function, batched=True,remove_columns = ['question_text',
                                                                                                         'document_title',
                                                                                                         'language',
                                                                                                         'annotations',
                                                                                                         'document_plaintext',
                                                                                                         'document_url'])



In [None]:
# Data Loader
val_dataloader = DataLoader(
    datasets_val_tokenize_filter, collate_fn=data_collator
    , batch_size=8
)

In [None]:
metric = load_metric("f1")
metric2 = load_metric("accuracy")

for batch in val_dataloader:
    batch = {key: value.to(device) for key, value in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)

    predictions = outputs.logits.argmax(dim=-1)
    labels = batch["labels"]

    metric.add_batch(predictions=predictions, references=labels)
    metric2.add_batch(predictions=predictions, references=labels)

results = metric.compute()
print(results)

results2 = metric2.compute()
print(results2)

{'f1': 0.8702928870292886}
{'accuracy': 0.8616071428571429}
