# Week 41 - Zero-shot Cross-lingual Evaluation

## 1. Setup

### 1.1. Libraries

#### 1.1.1. New Libraries

In [None]:
# Environment setup for Colab: print the interpreter version, then install the
# packages this notebook needs. Only datasets/transformers are version-pinned.
!python --version      # you can also write shell commands in code blocks
!pip3 install nltk     # new libraries
!pip install datasets==2.2.1 transformers==4.19.1
!pip3 install bnlp-toolkit # Bengali tokenization
!pip3 install transformers[torch] # transformers together with its PyTorch extras
!pip3 install bpemb # pre-trained subword embeddings
!pip install evaluate # evaluation metrics (loaded below with evaluate.load)
!pip install seqeval # backend of the "seqeval" sequence-labeling metric
!pip install accelerate # pad_across_processes / gather helpers used in the evaluation loops


Python 3.10.12
Collecting datasets==2.2.1
  Downloading datasets-2.2.1-py3-none-any.whl (342 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m342.2/342.2 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting transformers==4.19.1
  Downloading transformers-4.19.1-py3-none-any.whl (4.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.2/4.2 MB[0m [31m65.7 MB/s[0m eta [36m0:00:00[0m
Collecting dill (from datasets==2.2.1)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m13.7 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets==2.2.1)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m16.2 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0.0,>=0.1.0 (from datasets==2.2.1)
  Downloading huggingface_hub-0.18.0-py3-none-any.whl 

#### 1.1.2. Load Libraries

In [None]:
from datasets import load_dataset                       # library to import data from huggingface
from tqdm.notebook import tqdm                          # Check progress loop
import torch                                            # Torch objects
from torch.utils.data import DataLoader                 # Dataloader to iterate
from transformers import AutoTokenizer
from transformers import DataCollatorForTokenClassification
from torch.optim import AdamW
from transformers import AutoModelForTokenClassification
from transformers import AutoTokenizer, BertForSequenceClassification
from datasets import load_metric                                                     # Evaluation metric
# for padding in batches
from transformers import DataCollatorWithPadding
import numpy as np
from sklearn.metrics import f1_score



# Make Google Drive available inside the Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

# Prefer the first CUDA device; fall back to the CPU when no GPU is visible.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

Mounted at /content/drive


### 1.2. Data

#### 1.2.1. Read Data

In [None]:
# Languages covered by the experiments; later cells pick one by index.
languages = ['arabic', 'bengali', 'indonesian']

# Fetch the train and validation splits of the answerable TyDiQA dataset.
datasets_train, datasets_val = (
    load_dataset("copenlu/answerable_tydiqa", split=split_name)
    for split_name in ('train', 'validation')
)

Downloading:   0%|          | 0.00/2.47k [00:00<?, ?B/s]



Downloading and preparing dataset None/None (download: 75.43 MiB, generated: 131.78 MiB, post-processed: Unknown size, total: 207.21 MiB) to /root/.cache/huggingface/datasets/copenlu___parquet/copenlu--nlp_course_tydiqa-42333912ea665dd0/0.0.0/0b6d5799bb726b24ad7fc7be720c170d8e497f575d02d47537de9a5bac074901...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/71.6M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/7.49M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Dataset parquet downloaded and prepared to /root/.cache/huggingface/datasets/copenlu___parquet/copenlu--nlp_course_tydiqa-42333912ea665dd0/0.0.0/0b6d5799bb726b24ad7fc7be720c170d8e497f575d02d47537de9a5bac074901. Subsequent calls will reuse this data.




#### 1.2.2. Transform Data

In [None]:
def oracle(df_list_annotations = None):
  """
  Flag, for every annotation, whether the question has an answer.

  Args:
    df_list_annotations: iterable of annotation dicts, each with an
      'answer_text' list. Only the first entry of that list is inspected.
      None (the default) is treated as an empty input.

  Returns:
    A list with one int per annotation: 0 when the first answer text is the
    empty string (or when there is no answer text at all), 1 otherwise.
  """
  if df_list_annotations is None:
    return []
  # An empty 'answer_text' list is treated like an empty answer instead of
  # raising IndexError.
  return [
      1 if x['answer_text'] and x['answer_text'][0] != '' else 0
      for x in df_list_annotations
  ]

# Attach the binary answerability label (see `oracle`) to both splits.
# The column is only added when it is missing, so re-running this cell in the
# same kernel does not fail on an already existing "label" column.

# train
answerable_train = oracle(datasets_train['annotations'])
if "label" not in datasets_train.column_names:
  datasets_train = datasets_train.add_column("label", answerable_train)

# val
answerable_val = oracle(datasets_val['annotations'])
if "label" not in datasets_val.column_names:
  datasets_val = datasets_val.add_column("label", answerable_val)

## 2. Zero-shot Cross-lingual Evaluator

### 2.1. Sequence Labeler

#### 2.1.1. From Bengali To Arabic

##### 2.1.1.1. Filter Test Language

In [None]:
# Test language for this run: first entry of `languages`.
language_ = languages[0]

# Keep only the validation rows written in the test language.
datasets_val_filter = datasets_val.filter(
    lambda example: example["language"] == language_
)

print('language:', language_)



  0%|          | 0/14 [00:00<?, ?ba/s]

language: arabic


##### 2.1.1.2. Tokenize

In [None]:
# Saved sequence-labeler checkpoint from Week 39 (the Bengali one, going by the
# folder name and the section title).
# NOTE(review): the path is relative -- presumably the working directory points
# at the mounted Drive folder; confirm.
checkpoint = "Week 39/ROBERTA SEQUENCE LABELER/MODELS/RoBERTa - BENGALI - SEQUENCE"

# Tokenizer stored alongside the checkpoint.
tokenizer = AutoTokenizer.from_pretrained(
    checkpoint
)

In [None]:
def get_train_features(samples):
  '''
  Tokenize question/document pairs and build token-level answer labels.

  Documents that do not fit into one input are split into several overlapping
  features (stride of 128 tokens), so one sample can produce several rows.
  Every token of every feature receives one label:

    *    1 -> the token lies inside the answer span of this feature
    *    0 -> any other token
    * -100 -> tokens whose character offset is (0, 0); these positions are
              filtered out by the evaluation code

  Uses the module-level `tokenizer` (it must support offset mappings).

  Args:
    samples: batch as a dict of lists with the keys 'question_text',
      'document_plaintext' and 'annotations'. Each annotation provides the
      lists 'answer_start' and 'answer_text'; only their first entry is used.

  Returns:
    The tokenizer output without 'offset_mapping' and
    'overflow_to_sample_mapping', plus a 'labels' entry holding one label
    list per feature.
  '''
  answers = samples["annotations"]
  y_sequence = []

  batch = tokenizer(
        samples['question_text'],
        samples['document_plaintext'],
        truncation="only_second",
        stride=128,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

  # One document may yield several features; this maps each feature back to
  # the index of the sample it was cut from.
  sample_mapping = batch.pop('overflow_to_sample_mapping')

  # Per feature: (start_char, end_char) of every token in its source text.
  offset_mapping = batch.pop('offset_mapping')

  for i, offset in enumerate(offset_mapping):
      answer = answers[sample_mapping[i]]
      start_char = answer["answer_start"][0]                  # first character of the answer
      end_char = start_char + len(answer["answer_text"][0])   # one past its last character
      sequence_ids = batch.sequence_ids(i)                    # the context tokens are the ones with id 1

      # Locate the first and last context token of this feature.
      idx = 0
      while sequence_ids[idx] != 1:
          idx += 1
      context_start = idx
      while sequence_ids[idx] == 1:
          idx += 1
      context_end = idx - 1

      if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
          # The answer is not fully inside this feature: fall back to (0, 0).
          start_token, end_token = 0, 0
      else:
          # Last context token that starts at or before the answer start.
          idx = context_start
          while idx <= context_end and offset[idx][0] <= start_char:
              idx += 1
          start_token = idx - 1

          # First context token that ends at or after the answer end.
          idx = context_end
          while idx >= context_start and offset[idx][1] >= end_char:
              idx -= 1
          end_token = idx + 1

      labels = []
      for index, token in enumerate(offset):
          if tuple(token) == (0, 0):
              # Checked first so it overrides the span label, e.g. for the
              # (0, 0) fallback span above.
              labels.append(-100)
          elif start_token <= index <= end_token:
              labels.append(1)
          else:
              labels.append(0)

      y_sequence.append(labels)

  batch['labels'] = y_sequence
  return batch

In [None]:
# Tokenize and label the filtered validation rows; the raw columns are dropped
# so that only the model inputs and labels remain.
val_dataset = datasets_val_filter.map(
    get_train_features,
    batched=True,
    remove_columns=datasets_val_filter.column_names,
)

  0%|          | 0/2 [00:00<?, ?ba/s]

In [None]:
val_dataset

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 1947
})

##### 2.1.1.3. Load Trained Model

In [None]:
# Collates the tokenized features (labels included) into tensor batches.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

# Class index -> readable name; index 1 marks tokens inside the answer span.
label_names = ['no answer', 'answer']
# Both mappings are derived from label_names and use integer ids, so that
# label2id maps to ints rather than to the strings '0' / '1'.
id2label = dict(enumerate(label_names))
label2id = {name: idx for idx, name in id2label.items()}

# Load the fine-tuned token classifier and move it to the selected device.
model = AutoModelForTokenClassification.from_pretrained(
    checkpoint,
    id2label=id2label,
    label2id=label2id,
).to(device)

In [None]:
model

XLMRobertaForTokenClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(250002, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
            

##### 2.1.1.4. Evaluate the Trained Model on the Test Language

In [None]:
import evaluate

# Metric object that accumulates batches (add_batch) and is scored with compute().
# NOTE(review): the label names used in this notebook ('no answer' / 'answer') are
# not IOB-style tags, and the printed results are keyed 'nswer' / 'o answer' --
# it looks like seqeval treats the first character as a tag prefix. Confirm that
# these entity-level scores are the intended measure.
metric = evaluate.load("seqeval")

Downloading builder script:   0%|          | 0.00/6.34k [00:00<?, ?B/s]

In [None]:
import numpy as np


def compute_metrics(eval_preds):
    """
    Score raw model outputs with the module-level `metric`.

    Args:
        eval_preds: pair (logits, labels). The predicted class is the argmax
            over the last axis of the logits; positions labelled -100 are
            dropped from both the references and the predictions.

    Returns:
        dict with the keys "precision", "recall", "f1" and "accuracy", taken
        from the corresponding overall_* entries of the metric result.
    """
    logits, labels = eval_preds
    predicted_ids = np.argmax(logits, axis=-1)

    # References: readable names of every non-ignored label.
    true_labels = []
    for label_row in labels:
        true_labels.append([label_names[l] for l in label_row if l != -100])

    # Predictions: same positions as the references, named the same way.
    true_predictions = []
    for pred_row, label_row in zip(predicted_ids, labels):
        kept = [label_names[p] for p, l in zip(pred_row, label_row) if l != -100]
        true_predictions.append(kept)

    all_metrics = metric.compute(predictions=true_predictions, references=true_labels)

    report = {}
    for short_name, full_name in (
        ("precision", "overall_precision"),
        ("recall", "overall_recall"),
        ("f1", "overall_f1"),
        ("accuracy", "overall_accuracy"),
    ):
        report[short_name] = all_metrics[full_name]
    return report

In [None]:
def postprocess(predictions, labels):
    """
    Convert predicted and gold label ids into lists of label names.

    Positions whose gold label is -100 are removed from both outputs.

    Args:
        predictions: tensor of predicted class ids, one row per feature.
        labels: tensor of gold class ids aligned with `predictions`.

    Returns:
        Tuple (true_labels, true_predictions) -- gold names first, predicted
        names second -- each a list with one list of names per row.
    """
    pred_rows = predictions.detach().cpu().clone().numpy()
    label_rows = labels.detach().cpu().clone().numpy()

    def names_where_labelled(values, gold):
        # Name every value whose aligned gold label is not the ignore index.
        return [label_names[v] for v, g in zip(values, gold) if g != -100]

    true_labels = [names_where_labelled(row, row) for row in label_rows]
    true_predictions = [
        names_where_labelled(pred_row, label_row)
        for pred_row, label_row in zip(pred_rows, label_rows)
    ]
    return true_labels, true_predictions

In [None]:
# Iterate over the validation features in batches of 8, collated into tensors.
val_dataloader = DataLoader(val_dataset, batch_size=8, collate_fn=data_collator)

In [None]:
from accelerate import Accelerator

# Provides pad_across_processes / gather, which the evaluation loop calls on
# the predictions and labels of every batch.
accelerator = Accelerator()

In [None]:
# Score the loaded checkpoint on the filtered validation features.
model.eval()  # inference mode (no dropout); harmless if the model is already in it

for batch in val_dataloader:
    # Send the batch to whatever device the model is on.
    batch = {key: value.to(model.device) for key, value in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)

    predictions = outputs.logits.argmax(dim=-1)
    labels = batch["labels"]

    predictions = accelerator.pad_across_processes(predictions, dim=1, pad_index=-100)
    labels = accelerator.pad_across_processes(labels, dim=1, pad_index=-100)

    predictions_gathered = accelerator.gather(predictions)
    labels_gathered = accelerator.gather(labels)

    # postprocess returns (true_labels, true_predictions), in that order.
    # Unpacking them the other way round feeds the gold labels to the metric
    # as predictions, which swaps precision and recall.
    true_labels, true_predictions = postprocess(predictions_gathered, labels_gathered)
    metric.add_batch(predictions=true_predictions, references=true_labels)

results = metric.compute()
print(results)



{'nswer': {'precision': 0.13760504201680673, 'recall': 0.27291666666666664, 'f1': 0.1829608938547486, 'number': 480}, 'o answer': {'precision': 0.45661942620117524, 'recall': 0.5447422680412372, 'f1': 0.4968033095148552, 'number': 2425}, 'overall_precision': 0.3776332899869961, 'overall_recall': 0.4998278829604131, 'overall_f1': 0.43022222222222223, 'overall_accuracy': 0.967001994465086}


#### 2.1.2. From Indonesian To Arabic

##### 2.1.2.1. Filter Test Language

In [None]:
# Test language for this run: first entry of `languages`.
language_ = languages[0]

# Keep only the validation rows written in the test language.
datasets_val_filter = datasets_val.filter(
    lambda example: example["language"] == language_
)

print('language:', language_)

  0%|          | 0/14 [00:00<?, ?ba/s]

language: arabic


##### 2.1.2.2. Tokenize

In [None]:
from transformers import AutoTokenizer

# Saved sequence-labeler checkpoint from Week 39 (the Indonesian one, going by
# the folder name and the section title).
checkpoint = "Week 39/ROBERTA SEQUENCE LABELER/MODELS/RoBERTa - INDONESIAN - SEQUENCE"

# Tokenizer stored alongside the checkpoint.
tokenizer = AutoTokenizer.from_pretrained(
    checkpoint
)

In [None]:
def get_train_features(samples):
  '''
  Tokenize question/document pairs and build token-level answer labels.

  Documents that do not fit into one input are split into several overlapping
  features (stride of 128 tokens), so one sample can produce several rows.
  Every token of every feature receives one label:

    *    1 -> the token lies inside the answer span of this feature
    *    0 -> any other token
    * -100 -> tokens whose character offset is (0, 0); these positions are
              filtered out by the evaluation code

  Uses the module-level `tokenizer` (it must support offset mappings).

  Args:
    samples: batch as a dict of lists with the keys 'question_text',
      'document_plaintext' and 'annotations'. Each annotation provides the
      lists 'answer_start' and 'answer_text'; only their first entry is used.

  Returns:
    The tokenizer output without 'offset_mapping' and
    'overflow_to_sample_mapping', plus a 'labels' entry holding one label
    list per feature.
  '''
  answers = samples["annotations"]
  y_sequence = []

  batch = tokenizer(
        samples['question_text'],
        samples['document_plaintext'],
        truncation="only_second",
        stride=128,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

  # One document may yield several features; this maps each feature back to
  # the index of the sample it was cut from.
  sample_mapping = batch.pop('overflow_to_sample_mapping')

  # Per feature: (start_char, end_char) of every token in its source text.
  offset_mapping = batch.pop('offset_mapping')

  for i, offset in enumerate(offset_mapping):
      answer = answers[sample_mapping[i]]
      start_char = answer["answer_start"][0]                  # first character of the answer
      end_char = start_char + len(answer["answer_text"][0])   # one past its last character
      sequence_ids = batch.sequence_ids(i)                    # the context tokens are the ones with id 1

      # Locate the first and last context token of this feature.
      idx = 0
      while sequence_ids[idx] != 1:
          idx += 1
      context_start = idx
      while sequence_ids[idx] == 1:
          idx += 1
      context_end = idx - 1

      if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
          # The answer is not fully inside this feature: fall back to (0, 0).
          start_token, end_token = 0, 0
      else:
          # Last context token that starts at or before the answer start.
          idx = context_start
          while idx <= context_end and offset[idx][0] <= start_char:
              idx += 1
          start_token = idx - 1

          # First context token that ends at or after the answer end.
          idx = context_end
          while idx >= context_start and offset[idx][1] >= end_char:
              idx -= 1
          end_token = idx + 1

      labels = []
      for index, token in enumerate(offset):
          if tuple(token) == (0, 0):
              # Checked first so it overrides the span label, e.g. for the
              # (0, 0) fallback span above.
              labels.append(-100)
          elif start_token <= index <= end_token:
              labels.append(1)
          else:
              labels.append(0)

      y_sequence.append(labels)

  batch['labels'] = y_sequence
  return batch

In [None]:
# Tokenize and label the filtered validation rows; the raw columns are dropped
# so that only the model inputs and labels remain.
val_dataset = datasets_val_filter.map(
    get_train_features,
    batched=True,
    remove_columns=datasets_val_filter.column_names,
)

  0%|          | 0/2 [00:00<?, ?ba/s]

In [None]:
val_dataset

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 1947
})

##### 2.1.2.3. Load Trained Model

In [None]:
from transformers import DataCollatorForTokenClassification
from torch.optim import AdamW
from transformers import AutoModelForTokenClassification

# Collates the tokenized features (labels included) into tensor batches.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

# Class index -> readable name; index 1 marks tokens inside the answer span.
label_names = ['no answer', 'answer']
# Both mappings are derived from label_names and use integer ids, so that
# label2id maps to ints rather than to the strings '0' / '1'.
id2label = dict(enumerate(label_names))
label2id = {name: idx for idx, name in id2label.items()}

# Load the fine-tuned token classifier and move it to the selected device.
model = AutoModelForTokenClassification.from_pretrained(
    checkpoint,
    id2label=id2label,
    label2id=label2id,
).to(device)

In [None]:
model

XLMRobertaForTokenClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(250002, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
            

##### 2.1.2.4. Evaluate the Trained Model on the Test Language

In [None]:
import evaluate

# Fresh metric object for this run; batches are accumulated with add_batch and
# scored with compute().
# NOTE(review): the label names used in this notebook ('no answer' / 'answer') are
# not IOB-style tags, and the printed results are keyed 'nswer' / 'o answer' --
# it looks like seqeval treats the first character as a tag prefix. Confirm that
# these entity-level scores are the intended measure.
metric = evaluate.load("seqeval")

Downloading builder script:   0%|          | 0.00/6.34k [00:00<?, ?B/s]

In [None]:
import numpy as np


def compute_metrics(eval_preds):
    """
    Score raw model outputs with the module-level `metric`.

    Args:
        eval_preds: pair (logits, labels). The predicted class is the argmax
            over the last axis of the logits; positions labelled -100 are
            dropped from both the references and the predictions.

    Returns:
        dict with the keys "precision", "recall", "f1" and "accuracy", taken
        from the corresponding overall_* entries of the metric result.
    """
    logits, labels = eval_preds
    predicted_ids = np.argmax(logits, axis=-1)

    # References: readable names of every non-ignored label.
    true_labels = []
    for label_row in labels:
        true_labels.append([label_names[l] for l in label_row if l != -100])

    # Predictions: same positions as the references, named the same way.
    true_predictions = []
    for pred_row, label_row in zip(predicted_ids, labels):
        kept = [label_names[p] for p, l in zip(pred_row, label_row) if l != -100]
        true_predictions.append(kept)

    all_metrics = metric.compute(predictions=true_predictions, references=true_labels)

    report = {}
    for short_name, full_name in (
        ("precision", "overall_precision"),
        ("recall", "overall_recall"),
        ("f1", "overall_f1"),
        ("accuracy", "overall_accuracy"),
    ):
        report[short_name] = all_metrics[full_name]
    return report

In [None]:
def postprocess(predictions, labels):
    """
    Convert predicted and gold label ids into lists of label names.

    Positions whose gold label is -100 are removed from both outputs.

    Args:
        predictions: tensor of predicted class ids, one row per feature.
        labels: tensor of gold class ids aligned with `predictions`.

    Returns:
        Tuple (true_labels, true_predictions) -- gold names first, predicted
        names second -- each a list with one list of names per row.
    """
    pred_rows = predictions.detach().cpu().clone().numpy()
    label_rows = labels.detach().cpu().clone().numpy()

    def names_where_labelled(values, gold):
        # Name every value whose aligned gold label is not the ignore index.
        return [label_names[v] for v, g in zip(values, gold) if g != -100]

    true_labels = [names_where_labelled(row, row) for row in label_rows]
    true_predictions = [
        names_where_labelled(pred_row, label_row)
        for pred_row, label_row in zip(pred_rows, label_rows)
    ]
    return true_labels, true_predictions

In [None]:
# Iterate over the validation features in batches of 8, collated into tensors.
val_dataloader = DataLoader(val_dataset, batch_size=8, collate_fn=data_collator)

In [None]:
from accelerate import Accelerator

# Provides pad_across_processes / gather, which the evaluation loop calls on
# the predictions and labels of every batch.
accelerator = Accelerator()

In [None]:
# Score the loaded checkpoint on the filtered validation features.
model.eval()  # inference mode (no dropout); harmless if the model is already in it

for batch in val_dataloader:
    # Send the batch to whatever device the model is on.
    batch = {key: value.to(model.device) for key, value in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)

    predictions = outputs.logits.argmax(dim=-1)
    labels = batch["labels"]

    predictions = accelerator.pad_across_processes(predictions, dim=1, pad_index=-100)
    labels = accelerator.pad_across_processes(labels, dim=1, pad_index=-100)

    predictions_gathered = accelerator.gather(predictions)
    labels_gathered = accelerator.gather(labels)

    # postprocess returns (true_labels, true_predictions), in that order.
    # Unpacking them the other way round feeds the gold labels to the metric
    # as predictions, which swaps precision and recall.
    true_labels, true_predictions = postprocess(predictions_gathered, labels_gathered)
    metric.add_batch(predictions=true_predictions, references=true_labels)

results = metric.compute()
print(results)

{'nswer': {'precision': 0.3560924369747899, 'recall': 0.2848739495798319, 'f1': 0.31652661064425774, 'number': 1190}, 'o answer': {'precision': 0.6069823712409264, 'recall': 0.5613810741687979, 'f1': 0.5832918119913635, 'number': 3128}, 'overall_precision': 0.5448634590377113, 'overall_recall': 0.48517832329782307, 'overall_f1': 0.5132916819796643, 'overall_accuracy': 0.9762086203187498}


#### 2.1.3. From Indonesian To Bengali

##### 2.1.3.1. Filter Test Language

In [None]:
# Test language for this run: second entry of `languages`.
language_ = languages[1]

# Keep only the validation rows written in the test language.
datasets_val_filter = datasets_val.filter(
    lambda example: example["language"] == language_
)

print('language:', language_)



  0%|          | 0/117 [00:00<?, ?ba/s]

  0%|          | 0/14 [00:00<?, ?ba/s]

language: bengali


##### 2.1.3.2. Tokenize

In [None]:
from transformers import AutoTokenizer

# Saved sequence-labeler checkpoint from Week 39 (the Indonesian one, going by
# the folder name and the section title).
checkpoint = "Week 39/ROBERTA SEQUENCE LABELER/MODELS/RoBERTa - INDONESIAN - SEQUENCE"

# Tokenizer stored alongside the checkpoint.
tokenizer = AutoTokenizer.from_pretrained(
    checkpoint
)

In [None]:
def get_train_features(samples):
  '''
  Tokenize question/document pairs and build token-level answer labels.

  Documents that do not fit into one input are split into several overlapping
  features (stride of 128 tokens), so one sample can produce several rows.
  Every token of every feature receives one label:

    *    1 -> the token lies inside the answer span of this feature
    *    0 -> any other token
    * -100 -> tokens whose character offset is (0, 0); these positions are
              filtered out by the evaluation code

  Uses the module-level `tokenizer` (it must support offset mappings).

  Args:
    samples: batch as a dict of lists with the keys 'question_text',
      'document_plaintext' and 'annotations'. Each annotation provides the
      lists 'answer_start' and 'answer_text'; only their first entry is used.

  Returns:
    The tokenizer output without 'offset_mapping' and
    'overflow_to_sample_mapping', plus a 'labels' entry holding one label
    list per feature.
  '''
  answers = samples["annotations"]
  y_sequence = []

  batch = tokenizer(
        samples['question_text'],
        samples['document_plaintext'],
        truncation="only_second",
        stride=128,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

  # One document may yield several features; this maps each feature back to
  # the index of the sample it was cut from.
  sample_mapping = batch.pop('overflow_to_sample_mapping')

  # Per feature: (start_char, end_char) of every token in its source text.
  offset_mapping = batch.pop('offset_mapping')

  for i, offset in enumerate(offset_mapping):
      answer = answers[sample_mapping[i]]
      start_char = answer["answer_start"][0]                  # first character of the answer
      end_char = start_char + len(answer["answer_text"][0])   # one past its last character
      sequence_ids = batch.sequence_ids(i)                    # the context tokens are the ones with id 1

      # Locate the first and last context token of this feature.
      idx = 0
      while sequence_ids[idx] != 1:
          idx += 1
      context_start = idx
      while sequence_ids[idx] == 1:
          idx += 1
      context_end = idx - 1

      if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
          # The answer is not fully inside this feature: fall back to (0, 0).
          start_token, end_token = 0, 0
      else:
          # Last context token that starts at or before the answer start.
          idx = context_start
          while idx <= context_end and offset[idx][0] <= start_char:
              idx += 1
          start_token = idx - 1

          # First context token that ends at or after the answer end.
          idx = context_end
          while idx >= context_start and offset[idx][1] >= end_char:
              idx -= 1
          end_token = idx + 1

      labels = []
      for index, token in enumerate(offset):
          if tuple(token) == (0, 0):
              # Checked first so it overrides the span label, e.g. for the
              # (0, 0) fallback span above.
              labels.append(-100)
          elif start_token <= index <= end_token:
              labels.append(1)
          else:
              labels.append(0)

      y_sequence.append(labels)

  batch['labels'] = y_sequence
  return batch

In [None]:
# Tokenize and label the filtered validation rows; the raw columns are dropped
# so that only the model inputs and labels remain.
val_dataset = datasets_val_filter.map(
    get_train_features,
    batched=True,
    remove_columns=datasets_val_filter.column_names,
)

  0%|          | 0/1 [00:00<?, ?ba/s]

In [None]:
val_dataset

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 233
})

##### 2.1.3.3. Load Trained Model

In [None]:
from transformers import DataCollatorForTokenClassification
from torch.optim import AdamW
from transformers import AutoModelForTokenClassification

# Collates the tokenized features (labels included) into tensor batches.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

# Class index -> readable name; index 1 marks tokens inside the answer span.
label_names = ['no answer', 'answer']
# Both mappings are derived from label_names and use integer ids, so that
# label2id maps to ints rather than to the strings '0' / '1'.
id2label = dict(enumerate(label_names))
label2id = {name: idx for idx, name in id2label.items()}

# NOTE(review): unlike the other sections, the model is not moved to `device`
# here, so it stays on its default device. The evaluation loop of this section
# feeds batches without moving them -- keep the two consistent if this changes.
model = AutoModelForTokenClassification.from_pretrained(
    checkpoint,
    id2label=id2label,
    label2id=label2id,
)

In [None]:
model

XLMRobertaForTokenClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(250002, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
            

##### 2.1.3.4. Evaluate the Trained Model on the Test Language

In [None]:
import evaluate

# Fresh metric object for this run; batches are accumulated with add_batch and
# scored with compute().
# NOTE(review): the label names used in this notebook ('no answer' / 'answer') are
# not IOB-style tags, and the printed results are keyed 'nswer' / 'o answer' --
# it looks like seqeval treats the first character as a tag prefix. Confirm that
# these entity-level scores are the intended measure.
metric = evaluate.load("seqeval")

Downloading builder script:   0%|          | 0.00/6.34k [00:00<?, ?B/s]

In [None]:
import numpy as np


def compute_metrics(eval_preds):
    """
    Score raw model outputs with the module-level `metric`.

    Args:
        eval_preds: pair (logits, labels). The predicted class is the argmax
            over the last axis of the logits; positions labelled -100 are
            dropped from both the references and the predictions.

    Returns:
        dict with the keys "precision", "recall", "f1" and "accuracy", taken
        from the corresponding overall_* entries of the metric result.
    """
    logits, labels = eval_preds
    predicted_ids = np.argmax(logits, axis=-1)

    # References: readable names of every non-ignored label.
    true_labels = []
    for label_row in labels:
        true_labels.append([label_names[l] for l in label_row if l != -100])

    # Predictions: same positions as the references, named the same way.
    true_predictions = []
    for pred_row, label_row in zip(predicted_ids, labels):
        kept = [label_names[p] for p, l in zip(pred_row, label_row) if l != -100]
        true_predictions.append(kept)

    all_metrics = metric.compute(predictions=true_predictions, references=true_labels)

    report = {}
    for short_name, full_name in (
        ("precision", "overall_precision"),
        ("recall", "overall_recall"),
        ("f1", "overall_f1"),
        ("accuracy", "overall_accuracy"),
    ):
        report[short_name] = all_metrics[full_name]
    return report

In [None]:
def postprocess(predictions, labels):
    """
    Convert predicted and gold label ids into lists of label names.

    Positions whose gold label is -100 are removed from both outputs.

    Args:
        predictions: tensor of predicted class ids, one row per feature.
        labels: tensor of gold class ids aligned with `predictions`.

    Returns:
        Tuple (true_labels, true_predictions) -- gold names first, predicted
        names second -- each a list with one list of names per row.
    """
    pred_rows = predictions.detach().cpu().clone().numpy()
    label_rows = labels.detach().cpu().clone().numpy()

    def names_where_labelled(values, gold):
        # Name every value whose aligned gold label is not the ignore index.
        return [label_names[v] for v, g in zip(values, gold) if g != -100]

    true_labels = [names_where_labelled(row, row) for row in label_rows]
    true_predictions = [
        names_where_labelled(pred_row, label_row)
        for pred_row, label_row in zip(pred_rows, label_rows)
    ]
    return true_labels, true_predictions

In [None]:
# Iterate over the validation features in batches of 8, collated into tensors.
val_dataloader = DataLoader(val_dataset, batch_size=8, collate_fn=data_collator)

In [None]:
from accelerate import Accelerator

# Provides pad_across_processes / gather, which the evaluation loop calls on
# the predictions and labels of every batch.
accelerator = Accelerator()

In [None]:
# Score the loaded checkpoint on the filtered validation features.
model.eval()  # inference mode (no dropout); harmless if the model is already in it

for batch in val_dataloader:
    # Send the batch to whatever device the model is on, so the loop works
    # whether or not the model was moved to an accelerator.
    batch = {key: value.to(model.device) for key, value in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)

    predictions = outputs.logits.argmax(dim=-1)
    labels = batch["labels"]

    predictions = accelerator.pad_across_processes(predictions, dim=1, pad_index=-100)
    labels = accelerator.pad_across_processes(labels, dim=1, pad_index=-100)

    predictions_gathered = accelerator.gather(predictions)
    labels_gathered = accelerator.gather(labels)

    # postprocess returns (true_labels, true_predictions), in that order.
    # Unpacking them the other way round feeds the gold labels to the metric
    # as predictions, which swaps precision and recall.
    true_labels, true_predictions = postprocess(predictions_gathered, labels_gathered)
    metric.add_batch(predictions=true_predictions, references=true_labels)

results = metric.compute()
print(results)



{'nswer': {'precision': 0.125, 'recall': 0.2222222222222222, 'f1': 0.16, 'number': 63}, 'o answer': {'precision': 0.4492753623188406, 'recall': 0.5236486486486487, 'f1': 0.483619344773791, 'number': 296}, 'overall_precision': 0.36980306345733044, 'overall_recall': 0.47075208913649025, 'overall_f1': 0.41421568627450983, 'overall_accuracy': 0.9863143688583358}


#### 2.1.4. From Arabic to Bengali

##### 2.1.4.1. Filter Test Language

In [None]:
# Test language for this run: second entry of `languages`.
language_ = languages[1]

# Keep only the validation rows written in the test language.
datasets_val_filter = datasets_val.filter(
    lambda example: example["language"] == language_
)

print('language:', language_)

  0%|          | 0/14 [00:00<?, ?ba/s]

language: bengali


##### 2.1.4.2. Tokenize

In [None]:
from transformers import AutoTokenizer

# Saved sequence-labeler checkpoint from Week 39 (the Arabic one, going by the
# folder name and the section title).
checkpoint = "Week 39/ROBERTA SEQUENCE LABELER/MODELS/RoBERTa - ARABIC - SEQUENCE"

# Tokenizer stored alongside the checkpoint.
tokenizer = AutoTokenizer.from_pretrained(
    checkpoint
)

In [None]:
def get_train_features(samples):
    '''
    Tokenizes the question/document pairs in `samples`, splitting documents that
    are too long for the model into several overlapping features (windows).
    Finds the token span of the answer inside each window; that span becomes
    the per-token labels for our inputs.

    Reads `samples["question_text"]`, `samples["document_plaintext"]` and
    `samples["annotations"]` (only the first `answer_start` / `answer_text` of
    each annotation is used) and relies on the module-level `tokenizer`.

    Returns the tokenizer output with 'overflow_to_sample_mapping' and
    'offset_mapping' removed and a 'labels' entry added. Each label list has one
    value per token:
      *  1   token lies inside the answer span
      *  0   any other token
      * -100 token whose offset is (0, 0) (special / padding tokens), so it is
             ignored downstream
    Windows that do not fully contain the answer fall back to the token span
    (0, 0); position 0 is expected to be a special token with offset (0, 0), so
    such windows end up without any positive label.
    '''
    answers = samples["annotations"]
    y_sequence = []

    batch = tokenizer(
        samples['question_text'],
        samples['document_plaintext'],
        truncation="only_second",
        stride=128,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # One long document can yield several features, so we need the mapping
    # from each feature back to the example it came from.
    sample_mapping = batch.pop('overflow_to_sample_mapping')

    # Token -> character span in the original text; used to locate the answer.
    offset_mapping = batch.pop('offset_mapping')

    for i, offset in enumerate(offset_mapping):
        answer = answers[sample_mapping[i]]                       # annotation of the source example
        start_char = answer["answer_start"][0]                    # first character of the answer
        end_char = start_char + len(answer["answer_text"][0])     # one past its last character
        sequence_ids = batch.sequence_ids(i)                      # None = special, 0 = question, 1 = context

        # Locate the first and last context token of this window.
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        # Bounded scan: the context may run up to the very last position.
        while idx < len(sequence_ids) and sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
            # The answer is not fully inside this window.
            start_token = end_token = 0
        else:
            # Last context token that starts at or before the answer start.
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_token = idx - 1

            # First context token that ends at or after the answer end.
            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_token = idx + 1

        labels = []
        for index, token in enumerate(offset):
            if token == (0, 0):
                labels.append(-100)        # ignored position, even inside the span
            elif start_token <= index <= end_token:
                labels.append(1)
            else:
                labels.append(0)
        y_sequence.append(labels)

    batch['labels'] = y_sequence
    return batch

In [None]:
val_dataset = datasets_val_filter.map(get_train_features, batched = True, remove_columns = datasets_val_filter.column_names)

  0%|          | 0/1 [00:00<?, ?ba/s]

In [None]:
val_dataset

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 233
})

##### 2.1.4.3. Load Train Model

In [None]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

from torch.optim import AdamW
from transformers import AutoModelForTokenClassification

# Class names indexed by class id.
# NOTE(review): seqeval reads the first character of each tag as an IOB-style prefix,
# which is why the reported entity types show up as 'o answer' / 'nswer'.
label_names = ['no answer', 'answer']

# The model config expects integer class ids: id2label maps int -> name and
# label2id maps name -> int (string ids such as '0' do not match predicted indices).
id2label = dict(enumerate(label_names))
label2id = {name: idx for idx, name in id2label.items()}

model = AutoModelForTokenClassification.from_pretrained(checkpoint,
                                                       id2label=id2label,
                                                       label2id=label2id,)

In [None]:
model

XLMRobertaForTokenClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(250002, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
            

##### 2.1.4.4. Evaluate the Trained Model in the Test Language

In [None]:
import evaluate

metric = evaluate.load("seqeval")

In [None]:
import numpy as np


def compute_metrics(eval_preds):
    """Score a (logits, labels) pair with the module-level seqeval `metric`.

    The predicted class of each token is the arg-max over the last logits axis.
    Positions whose label is -100 are dropped from both predictions and
    references; the remaining ids are translated through `label_names`.

    Returns a dict with the overall "precision", "recall", "f1" and "accuracy".
    """
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    # References: every labelled (non -100) position, as label strings.
    true_labels = []
    for label_row in labels:
        true_labels.append([label_names[tag] for tag in label_row if tag != -100])

    # Predictions: the same positions, taken from the arg-max ids.
    true_predictions = []
    for pred_row, label_row in zip(predictions, labels):
        kept = []
        for pred_id, tag in zip(pred_row, label_row):
            if tag != -100:
                kept.append(label_names[pred_id])
        true_predictions.append(kept)

    all_metrics = metric.compute(predictions=true_predictions, references=true_labels)

    summary = {}
    summary["precision"] = all_metrics["overall_precision"]
    summary["recall"] = all_metrics["overall_recall"]
    summary["f1"] = all_metrics["overall_f1"]
    summary["accuracy"] = all_metrics["overall_accuracy"]
    return summary

In [None]:
def postprocess(predictions, labels):
    """Convert prediction / label id tensors into lists of label strings.

    Both tensors are detached, moved to the CPU, copied and turned into NumPy
    arrays. Positions whose label id is -100 are removed from both outputs;
    the remaining ids are looked up in `label_names`.

    Returns `(true_labels, true_predictions)` -- references first,
    predictions second.
    """
    pred_rows = predictions.detach().cpu().clone().numpy()
    label_rows = labels.detach().cpu().clone().numpy()

    # References: every labelled (non -100) position.
    true_labels = []
    for label_row in label_rows:
        true_labels.append([label_names[tag] for tag in label_row if tag != -100])

    # Predictions: the same positions, taken from the predicted ids.
    true_predictions = []
    for pred_row, label_row in zip(pred_rows, label_rows):
        kept = []
        for pred_id, tag in zip(pred_row, label_row):
            if tag != -100:
                kept.append(label_names[pred_id])
        true_predictions.append(kept)

    return true_labels, true_predictions

In [None]:
val_dataloader = DataLoader(
    val_dataset, collate_fn=data_collator, batch_size=8
)

In [None]:
from accelerate import Accelerator

accelerator = Accelerator()

In [None]:
# Zero-shot evaluation: run the model over every validation batch and
# accumulate seqeval statistics.
for batch in val_dataloader:
    # Keep the inputs on the same device as the model weights.
    batch = {key: value.to(model.device) for key, value in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)

    predictions = outputs.logits.argmax(dim=-1)
    labels = batch["labels"]

    # Pad to a common sequence length and gather across processes before scoring.
    predictions = accelerator.pad_across_processes(predictions, dim=1, pad_index=-100)
    labels = accelerator.pad_across_processes(labels, dim=1, pad_index=-100)

    predictions_gathered = accelerator.gather(predictions)
    labels_gathered = accelerator.gather(labels)

    # postprocess returns (labels, predictions) in that order; unpacking them the
    # other way round swaps predictions and references (and so precision and recall).
    true_labels, true_predictions = postprocess(predictions_gathered, labels_gathered)
    metric.add_batch(predictions=true_predictions, references=true_labels)

results = metric.compute()
print(results)

{'nswer': {'precision': 0.20535714285714285, 'recall': 0.3026315789473684, 'f1': 0.24468085106382978, 'number': 76}, 'o answer': {'precision': 0.46956521739130436, 'recall': 0.5242718446601942, 'f1': 0.4954128440366973, 'number': 309}, 'overall_precision': 0.4048140043763676, 'overall_recall': 0.4805194805194805, 'overall_f1': 0.43942992874109266, 'overall_accuracy': 0.9845641263548564}


#### 2.1.5. From Bengali To Indonesian

##### 2.1.5.1. Filter Test Language

In [None]:
# Test language for this zero-shot run
language_ = languages[2]

# Keep only the validation examples written in the test language
datasets_val_filter = datasets_val.filter(lambda example: example["language"] == language_)

print('language:', language_)



  0%|          | 0/14 [00:00<?, ?ba/s]

language: indonesian


##### 2.1.5.2. Tokenize

In [None]:
from transformers import AutoTokenizer

# Path of the fine-tuned sequence-labeling checkpoint to evaluate. The path names the
# BENGALI model (presumably fine-tuned on Bengali only, matching this section's
# "From Bengali To Indonesian" title -- TODO confirm).
checkpoint = "Week 39/ROBERTA SEQUENCE LABELER/MODELS/RoBERTa - BENGALI - SEQUENCE"
# Load the tokenizer from the same checkpoint path.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [None]:
def get_train_features(samples):
    '''
    Tokenizes the question/document pairs in `samples`, splitting documents that
    are too long for the model into several overlapping features (windows).
    Finds the token span of the answer inside each window; that span becomes
    the per-token labels for our inputs.

    Reads `samples["question_text"]`, `samples["document_plaintext"]` and
    `samples["annotations"]` (only the first `answer_start` / `answer_text` of
    each annotation is used) and relies on the module-level `tokenizer`.

    Returns the tokenizer output with 'overflow_to_sample_mapping' and
    'offset_mapping' removed and a 'labels' entry added. Each label list has one
    value per token:
      *  1   token lies inside the answer span
      *  0   any other token
      * -100 token whose offset is (0, 0) (special / padding tokens), so it is
             ignored downstream
    Windows that do not fully contain the answer fall back to the token span
    (0, 0); position 0 is expected to be a special token with offset (0, 0), so
    such windows end up without any positive label.
    '''
    answers = samples["annotations"]
    y_sequence = []

    batch = tokenizer(
        samples['question_text'],
        samples['document_plaintext'],
        truncation="only_second",
        stride=128,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # One long document can yield several features, so we need the mapping
    # from each feature back to the example it came from.
    sample_mapping = batch.pop('overflow_to_sample_mapping')

    # Token -> character span in the original text; used to locate the answer.
    offset_mapping = batch.pop('offset_mapping')

    for i, offset in enumerate(offset_mapping):
        answer = answers[sample_mapping[i]]                       # annotation of the source example
        start_char = answer["answer_start"][0]                    # first character of the answer
        end_char = start_char + len(answer["answer_text"][0])     # one past its last character
        sequence_ids = batch.sequence_ids(i)                      # None = special, 0 = question, 1 = context

        # Locate the first and last context token of this window.
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        # Bounded scan: the context may run up to the very last position.
        while idx < len(sequence_ids) and sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
            # The answer is not fully inside this window.
            start_token = end_token = 0
        else:
            # Last context token that starts at or before the answer start.
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_token = idx - 1

            # First context token that ends at or after the answer end.
            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_token = idx + 1

        labels = []
        for index, token in enumerate(offset):
            if token == (0, 0):
                labels.append(-100)        # ignored position, even inside the span
            elif start_token <= index <= end_token:
                labels.append(1)
            else:
                labels.append(0)
        y_sequence.append(labels)

    batch['labels'] = y_sequence
    return batch

In [None]:
val_dataset = datasets_val_filter.map(get_train_features, batched = True, remove_columns = datasets_val_filter.column_names)

  0%|          | 0/2 [00:00<?, ?ba/s]

In [None]:
val_dataset

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 1208
})

##### 2.1.5.3. Load Train Model

In [None]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

from torch.optim import AdamW
from transformers import AutoModelForTokenClassification

# Class names indexed by class id.
# NOTE(review): seqeval reads the first character of each tag as an IOB-style prefix,
# which is why the reported entity types show up as 'o answer' / 'nswer'.
label_names = ['no answer', 'answer']

# The model config expects integer class ids: id2label maps int -> name and
# label2id maps name -> int (string ids such as '0' do not match predicted indices).
id2label = dict(enumerate(label_names))
label2id = {name: idx for idx, name in id2label.items()}

model = AutoModelForTokenClassification.from_pretrained(checkpoint,
                                                       id2label=id2label,
                                                       label2id=label2id,).to(device)

In [None]:
model

XLMRobertaForTokenClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(250002, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
            

##### 2.1.5.4. Evaluate the Trained Model in the Test Language

In [None]:
import evaluate

metric = evaluate.load("seqeval")

Downloading builder script:   0%|          | 0.00/6.34k [00:00<?, ?B/s]

In [None]:
import numpy as np


def compute_metrics(eval_preds):
    """Score a (logits, labels) pair with the module-level seqeval `metric`.

    The predicted class of each token is the arg-max over the last logits axis.
    Positions whose label is -100 are dropped from both predictions and
    references; the remaining ids are translated through `label_names`.

    Returns a dict with the overall "precision", "recall", "f1" and "accuracy".
    """
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    # References: every labelled (non -100) position, as label strings.
    true_labels = []
    for label_row in labels:
        true_labels.append([label_names[tag] for tag in label_row if tag != -100])

    # Predictions: the same positions, taken from the arg-max ids.
    true_predictions = []
    for pred_row, label_row in zip(predictions, labels):
        kept = []
        for pred_id, tag in zip(pred_row, label_row):
            if tag != -100:
                kept.append(label_names[pred_id])
        true_predictions.append(kept)

    all_metrics = metric.compute(predictions=true_predictions, references=true_labels)

    summary = {}
    summary["precision"] = all_metrics["overall_precision"]
    summary["recall"] = all_metrics["overall_recall"]
    summary["f1"] = all_metrics["overall_f1"]
    summary["accuracy"] = all_metrics["overall_accuracy"]
    return summary

In [None]:
def postprocess(predictions, labels):
    """Convert prediction / label id tensors into lists of label strings.

    Both tensors are detached, moved to the CPU, copied and turned into NumPy
    arrays. Positions whose label id is -100 are removed from both outputs;
    the remaining ids are looked up in `label_names`.

    Returns `(true_labels, true_predictions)` -- references first,
    predictions second.
    """
    pred_rows = predictions.detach().cpu().clone().numpy()
    label_rows = labels.detach().cpu().clone().numpy()

    # References: every labelled (non -100) position.
    true_labels = []
    for label_row in label_rows:
        true_labels.append([label_names[tag] for tag in label_row if tag != -100])

    # Predictions: the same positions, taken from the predicted ids.
    true_predictions = []
    for pred_row, label_row in zip(pred_rows, label_rows):
        kept = []
        for pred_id, tag in zip(pred_row, label_row):
            if tag != -100:
                kept.append(label_names[pred_id])
        true_predictions.append(kept)

    return true_labels, true_predictions

In [None]:
val_dataloader = DataLoader(
    val_dataset, collate_fn=data_collator
    , batch_size=8
)

In [None]:
from accelerate import Accelerator

accelerator = Accelerator()

In [None]:
# Zero-shot evaluation: run the model over every validation batch and
# accumulate seqeval statistics.
for batch in val_dataloader:
    # Keep the inputs on the same device as the model weights.
    batch = {key: value.to(model.device) for key, value in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)

    predictions = outputs.logits.argmax(dim=-1)
    labels = batch["labels"]

    # Pad to a common sequence length and gather across processes before scoring.
    predictions = accelerator.pad_across_processes(predictions, dim=1, pad_index=-100)
    labels = accelerator.pad_across_processes(labels, dim=1, pad_index=-100)

    predictions_gathered = accelerator.gather(predictions)
    labels_gathered = accelerator.gather(labels)

    # postprocess returns (labels, predictions) in that order; unpacking them the
    # other way round swaps predictions and references (and so precision and recall).
    true_labels, true_predictions = postprocess(predictions_gathered, labels_gathered)
    metric.add_batch(predictions=true_predictions, references=true_labels)

results = metric.compute()
print(results)



{'nswer': {'precision': 0.15862068965517243, 'recall': 0.35384615384615387, 'f1': 0.21904761904761905, 'number': 325}, 'o answer': {'precision': 0.4375303840544482, 'recall': 0.5418422636965683, 'f1': 0.48413125336202256, 'number': 1661}, 'overall_precision': 0.3648454349388929, 'overall_recall': 0.5110775427995972, 'overall_f1': 0.42575503355704697, 'overall_accuracy': 0.9692545293282142}


#### 2.1.6. From Arabic To Indonesian

##### 2.1.6.1. Filter Test Language

In [None]:
# Test language for this zero-shot run
language_ = languages[2]

# Keep only the validation examples written in the test language
datasets_val_filter = datasets_val.filter(lambda example: example["language"] == language_)

print('language:', language_)

  0%|          | 0/14 [00:00<?, ?ba/s]

language: indonesian


##### 2.1.6.2. Tokenize

In [None]:
from transformers import AutoTokenizer

# Path of the fine-tuned sequence-labeling checkpoint to evaluate. The path names the
# ARABIC model (presumably fine-tuned on Arabic only, matching this section's
# "From Arabic To Indonesian" title -- TODO confirm).
checkpoint = "Week 39/ROBERTA SEQUENCE LABELER/MODELS/RoBERTa - ARABIC - SEQUENCE"
# Load the tokenizer from the same checkpoint path.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [None]:
def get_train_features(samples):
    '''
    Tokenizes the question/document pairs in `samples`, splitting documents that
    are too long for the model into several overlapping features (windows).
    Finds the token span of the answer inside each window; that span becomes
    the per-token labels for our inputs.

    Reads `samples["question_text"]`, `samples["document_plaintext"]` and
    `samples["annotations"]` (only the first `answer_start` / `answer_text` of
    each annotation is used) and relies on the module-level `tokenizer`.

    Returns the tokenizer output with 'overflow_to_sample_mapping' and
    'offset_mapping' removed and a 'labels' entry added. Each label list has one
    value per token:
      *  1   token lies inside the answer span
      *  0   any other token
      * -100 token whose offset is (0, 0) (special / padding tokens), so it is
             ignored downstream
    Windows that do not fully contain the answer fall back to the token span
    (0, 0); position 0 is expected to be a special token with offset (0, 0), so
    such windows end up without any positive label.
    '''
    answers = samples["annotations"]
    y_sequence = []

    batch = tokenizer(
        samples['question_text'],
        samples['document_plaintext'],
        truncation="only_second",
        stride=128,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # One long document can yield several features, so we need the mapping
    # from each feature back to the example it came from.
    sample_mapping = batch.pop('overflow_to_sample_mapping')

    # Token -> character span in the original text; used to locate the answer.
    offset_mapping = batch.pop('offset_mapping')

    for i, offset in enumerate(offset_mapping):
        answer = answers[sample_mapping[i]]                       # annotation of the source example
        start_char = answer["answer_start"][0]                    # first character of the answer
        end_char = start_char + len(answer["answer_text"][0])     # one past its last character
        sequence_ids = batch.sequence_ids(i)                      # None = special, 0 = question, 1 = context

        # Locate the first and last context token of this window.
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        # Bounded scan: the context may run up to the very last position.
        while idx < len(sequence_ids) and sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
            # The answer is not fully inside this window.
            start_token = end_token = 0
        else:
            # Last context token that starts at or before the answer start.
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_token = idx - 1

            # First context token that ends at or after the answer end.
            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_token = idx + 1

        labels = []
        for index, token in enumerate(offset):
            if token == (0, 0):
                labels.append(-100)        # ignored position, even inside the span
            elif start_token <= index <= end_token:
                labels.append(1)
            else:
                labels.append(0)
        y_sequence.append(labels)

    batch['labels'] = y_sequence
    return batch

In [None]:
val_dataset = datasets_val_filter.map(get_train_features, batched = True, remove_columns = datasets_val_filter.column_names)

  0%|          | 0/2 [00:00<?, ?ba/s]

In [None]:
val_dataset

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 1208
})

##### 2.1.6.3. Load Train Model

In [None]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

from torch.optim import AdamW
from transformers import AutoModelForTokenClassification

# Class names indexed by class id.
# NOTE(review): seqeval reads the first character of each tag as an IOB-style prefix,
# which is why the reported entity types show up as 'o answer' / 'nswer'.
label_names = ['no answer', 'answer']

# The model config expects integer class ids: id2label maps int -> name and
# label2id maps name -> int (string ids such as '0' do not match predicted indices).
id2label = dict(enumerate(label_names))
label2id = {name: idx for idx, name in id2label.items()}

model = AutoModelForTokenClassification.from_pretrained(checkpoint,
                                                       id2label=id2label,
                                                       label2id=label2id,).to(device)

In [None]:
model

XLMRobertaForTokenClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(250002, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
            

##### 2.1.6.4. Evaluate the Trained Model in the Test Language

In [None]:
import evaluate

metric = evaluate.load("seqeval")

In [None]:
import numpy as np


def compute_metrics(eval_preds):
    """Score a (logits, labels) pair with the module-level seqeval `metric`.

    The predicted class of each token is the arg-max over the last logits axis.
    Positions whose label is -100 are dropped from both predictions and
    references; the remaining ids are translated through `label_names`.

    Returns a dict with the overall "precision", "recall", "f1" and "accuracy".
    """
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    # References: every labelled (non -100) position, as label strings.
    true_labels = []
    for label_row in labels:
        true_labels.append([label_names[tag] for tag in label_row if tag != -100])

    # Predictions: the same positions, taken from the arg-max ids.
    true_predictions = []
    for pred_row, label_row in zip(predictions, labels):
        kept = []
        for pred_id, tag in zip(pred_row, label_row):
            if tag != -100:
                kept.append(label_names[pred_id])
        true_predictions.append(kept)

    all_metrics = metric.compute(predictions=true_predictions, references=true_labels)

    summary = {}
    summary["precision"] = all_metrics["overall_precision"]
    summary["recall"] = all_metrics["overall_recall"]
    summary["f1"] = all_metrics["overall_f1"]
    summary["accuracy"] = all_metrics["overall_accuracy"]
    return summary

In [None]:
def postprocess(predictions, labels):
    """Convert prediction / label id tensors into lists of label strings.

    Both tensors are detached, moved to the CPU, copied and turned into NumPy
    arrays. Positions whose label id is -100 are removed from both outputs;
    the remaining ids are looked up in `label_names`.

    Returns `(true_labels, true_predictions)` -- references first,
    predictions second.
    """
    pred_rows = predictions.detach().cpu().clone().numpy()
    label_rows = labels.detach().cpu().clone().numpy()

    # References: every labelled (non -100) position.
    true_labels = []
    for label_row in label_rows:
        true_labels.append([label_names[tag] for tag in label_row if tag != -100])

    # Predictions: the same positions, taken from the predicted ids.
    true_predictions = []
    for pred_row, label_row in zip(pred_rows, label_rows):
        kept = []
        for pred_id, tag in zip(pred_row, label_row):
            if tag != -100:
                kept.append(label_names[pred_id])
        true_predictions.append(kept)

    return true_labels, true_predictions

In [None]:
val_dataloader = DataLoader(
    val_dataset, collate_fn=data_collator, batch_size=8
)

In [None]:
from accelerate import Accelerator

accelerator = Accelerator()

In [None]:
# Zero-shot evaluation: run the model over every validation batch and
# accumulate seqeval statistics.
for batch in val_dataloader:
    # Keep the inputs on the same device as the model weights.
    batch = {key: value.to(model.device) for key, value in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)

    predictions = outputs.logits.argmax(dim=-1)
    labels = batch["labels"]

    # Pad to a common sequence length and gather across processes before scoring.
    predictions = accelerator.pad_across_processes(predictions, dim=1, pad_index=-100)
    labels = accelerator.pad_across_processes(labels, dim=1, pad_index=-100)

    predictions_gathered = accelerator.gather(predictions)
    labels_gathered = accelerator.gather(labels)

    # postprocess returns (labels, predictions) in that order; unpacking them the
    # other way round swaps predictions and references (and so precision and recall).
    true_labels, true_predictions = postprocess(predictions_gathered, labels_gathered)
    metric.add_batch(predictions=true_predictions, references=true_labels)

results = metric.compute()
print(results)



{'nswer': {'precision': 0.37018425460636517, 'recall': 0.38704028021015763, 'f1': 0.37842465753424664, 'number': 571}, 'o answer': {'precision': 0.6082130965593785, 'recall': 0.6164229471316085, 'f1': 0.6122905027932961, 'number': 1778}, 'overall_precision': 0.5489787411421425, 'overall_recall': 0.5606641123882503, 'overall_f1': 0.554759898904802, 'overall_accuracy': 0.9802599703761127}


### 2.2. Binary Classifier

#### 2.2.1. Setup

In [None]:
# Define function to tokenize question and documents together
def tokenize_function(dataset_, variable1= 'question_text', variable2= 'document_plaintext'):
    """
    Tokenize question/document pairs with the module-level `tokenizer`.

    Args:
        dataset_: batch (mapping of column name -> values) taken from the dataset.
        variable1: column holding the first sequence (the question).
        variable2: column holding the second sequence (the document).

    Returns:
        The tokenizer output for the pair. Inputs are truncated (`truncation=True`)
        and padded here (`padding="max_length"`), so no later dynamic padding
        is needed.
    """
    return tokenizer(dataset_[variable1], dataset_[variable2], truncation=True, padding="max_length")

# Evaluation metric
metric1 = load_metric("f1")


#### 2.2.2. Trained in Bengali, tested in Arabic and Indonesian

##### 2.2.2.1. Tokenize

In [None]:
# Path of the fine-tuned binary classifier to evaluate. The path names the BENGALI model
# (presumably fine-tuned on Bengali only, matching this section's title -- TODO confirm).
checkpoint = "Week 38/TRANSFORMER TRAINED MODELS/Week 38 - BERT - BENGALI"
# Load the tokenizer from the same checkpoint path.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# Tokenize the whole validation set (all languages; filtered per language later)
datasets_val_tokenize = datasets_val.map(tokenize_function, batched=True)

# Collator for dynamic padding.
# NOTE(review): tokenize_function already pads to max length and the evaluation cell
# below does not use this collator -- confirm whether it is still needed.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)



  0%|          | 0/14 [00:00<?, ?ba/s]

##### 2.2.2.2. Load Train Model

In [None]:
model = BertForSequenceClassification.from_pretrained(checkpoint).to(device)

##### 2.2.2.3. Evaluate Train Model in the Test Language

In [None]:
# Evaluate the loaded classifier on each held-out language
model.eval()  # evaluation mode (disables dropout)
for language in ['arabic', 'indonesian']:
    datasets_val_tokenize_filter = datasets_val_tokenize.filter(lambda example: example["language"] == language)

    # Manual evaluation, one example at a time
    logits_list = []
    with torch.no_grad():  # no autograd bookkeeping: less memory, faster
        for i in range(len(datasets_val_tokenize_filter)):
            instance = datasets_val_tokenize_filter[i]
            # Build the inputs on the model's device; the model may live on a GPU.
            # NOTE(review): token_type_ids are not passed -- confirm this matches training.
            inputs = {
                key: torch.tensor(val).unsqueeze(0).to(model.device)
                for key, val in instance.items()
                if key in ['input_ids', 'attention_mask']
            }
            outputs = model(**inputs)
            # Bring the logits back to the CPU so they can be converted to NumPy.
            logits_list.append(outputs.logits.cpu())

    # Stack the per-example logits and take the most likely class
    predicted_labels = torch.softmax(torch.cat(logits_list), dim=-1).argmax(dim=-1).numpy()

    # Micro-averaged F1 against the gold labels
    true_labels = [example['label'] for example in datasets_val_tokenize_filter]
    print(f'F1 score for {language}:', f1_score(true_labels, predicted_labels, average='micro'))



  0%|          | 0/14 [00:00<?, ?ba/s]

  0%|          | 0/14 [00:00<?, ?ba/s]

F1 score for arabic: 0.6640378548895899


  0%|          | 0/14 [00:00<?, ?ba/s]

F1 score for indonesian: 0.6658270361041142


#### 2.2.3. Trained in Indonesian, tested in Arabic and Bengali

##### 2.2.3.1. Tokenize

In [None]:
# Path of the fine-tuned binary classifier to evaluate. The path names the INDONESIAN model
# (presumably fine-tuned on Indonesian only, matching this section's title -- TODO confirm).
checkpoint = "Week 38/TRANSFORMER TRAINED MODELS/Week 38 - BERT - INDONESIAN"
# Load the tokenizer from the same checkpoint path.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# Tokenize the whole validation set (all languages; filtered per language later)
datasets_val_tokenize = datasets_val.map(tokenize_function, batched=True)

# Collator for dynamic padding.
# NOTE(review): tokenize_function already pads to max length and the evaluation cell
# below does not use this collator -- confirm whether it is still needed.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

  0%|          | 0/14 [00:00<?, ?ba/s]

##### 2.2.3.2. Load Train Model

In [None]:
model = BertForSequenceClassification.from_pretrained(checkpoint).to(device)

##### 2.2.3.3. Evaluate Train Model in the Test Language

In [None]:
# Evaluate the loaded classifier on each held-out language
model.eval()  # evaluation mode (disables dropout)
for language in ['arabic', 'bengali']:
    datasets_val_tokenize_filter = datasets_val_tokenize.filter(lambda example: example["language"] == language)

    # Manual evaluation, one example at a time
    logits_list = []
    with torch.no_grad():  # no autograd bookkeeping: less memory, faster
        for i in range(len(datasets_val_tokenize_filter)):
            instance = datasets_val_tokenize_filter[i]
            # Build the inputs on the model's device; the model may live on a GPU.
            # NOTE(review): token_type_ids are not passed -- confirm this matches training.
            inputs = {
                key: torch.tensor(val).unsqueeze(0).to(model.device)
                for key, val in instance.items()
                if key in ['input_ids', 'attention_mask']
            }
            outputs = model(**inputs)
            # Bring the logits back to the CPU so they can be converted to NumPy.
            logits_list.append(outputs.logits.cpu())

    # Stack the per-example logits and take the most likely class
    predicted_labels = torch.softmax(torch.cat(logits_list), dim=-1).argmax(dim=-1).numpy()

    # Micro-averaged F1 against the gold labels
    true_labels = [example['label'] for example in datasets_val_tokenize_filter]
    print(f'F1 score for {language}:', f1_score(true_labels, predicted_labels, average='micro'))

  0%|          | 0/14 [00:00<?, ?ba/s]

  0%|          | 0/14 [00:00<?, ?ba/s]

F1 score for arabic: 0.7066246056782335


  0%|          | 0/14 [00:00<?, ?ba/s]

F1 score for bengali: 0.6339285714285714


#### 2.2.4. Trained in Arabic, tested in Bengali and Indonesian

##### 2.2.4.1. Tokenize

In [None]:
# Path of the fine-tuned binary classifier to evaluate. The path names the ARABIC model
# (presumably fine-tuned on Arabic only, matching this section's title -- TODO confirm).
checkpoint = "Week 38/TRANSFORMER TRAINED MODELS/Week 38 - BERT - ARABIC"
# Load the tokenizer from the same checkpoint path.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# Tokenize the whole validation set (all languages; filtered per language later)
datasets_val_tokenize = datasets_val.map(tokenize_function, batched=True)

# Collator for dynamic padding.
# NOTE(review): tokenize_function already pads to max length -- confirm whether this
# collator is still needed.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

  0%|          | 0/14 [00:00<?, ?ba/s]

##### 2.2.4.2. Load Trained Model

In [None]:
model = BertForSequenceClassification.from_pretrained(checkpoint).to(device)

##### 2.2.4.3. Evaluate Trained Model in the Test Language

In [None]:
# Load model and tokenizer
# NOTE(review): this absolute Google Drive path replaces the model/tokenizer loaded from
# `checkpoint` in the previous cells -- confirm which checkpoint is intended and prefer a
# configurable, relative path.
model_directory = "/content/drive/MyDrive/NLP/Binary_Arabic"
model = BertForSequenceClassification.from_pretrained(model_directory)
tokenizer = AutoTokenizer.from_pretrained(model_directory)

# Define function to tokenize question and documents together
def tokenize_function(examples):
    """Tokenize question/document pairs, truncated and padded to 512 tokens."""
    return tokenizer(examples["question_text"], examples["document_plaintext"], padding="max_length", truncation=True, max_length=512)

# Tokenizing the validation dataset
datasets_val_tokenize = datasets_val.map(tokenize_function, batched=True)

# Evaluate for each held-out language. The list is written inline (as in the sibling
# evaluation cells) instead of rebinding the notebook-wide `languages`, which earlier
# cells index as languages[1] / languages[2].
model.eval()  # evaluation mode (disables dropout)
for language in ['bengali', 'indonesian']:
    datasets_val_tokenize_filter = datasets_val_tokenize.filter(lambda example: example["language"] == language)

    # Manual evaluation, one example at a time
    logits_list = []
    with torch.no_grad():  # no autograd bookkeeping: less memory, faster
        for i in range(len(datasets_val_tokenize_filter)):
            instance = datasets_val_tokenize_filter[i]
            # Build the inputs on the model's device so this also works on a GPU.
            # NOTE(review): token_type_ids are not passed -- confirm this matches training.
            inputs = {
                key: torch.tensor(val).unsqueeze(0).to(model.device)
                for key, val in instance.items()
                if key in ['input_ids', 'attention_mask']
            }
            outputs = model(**inputs)
            # Bring the logits back to the CPU so they can be converted to NumPy.
            logits_list.append(outputs.logits.cpu())

    # Stack the per-example logits and take the most likely class
    predicted_labels = torch.softmax(torch.cat(logits_list), dim=-1).argmax(dim=-1).numpy()

    # Micro-averaged F1 against the gold labels
    true_labels = [example['label'] for example in datasets_val_tokenize_filter]
    print(f'F1 score for {language}:', f1_score(true_labels, predicted_labels, average='micro'))



  0%|          | 0/14 [00:00<?, ?ba/s]

  0%|          | 0/14 [00:00<?, ?ba/s]

F1 score for bengali: 0.7321428571428571


  0%|          | 0/14 [00:00<?, ?ba/s]

F1 score for indonesian: 0.8295549958018472
