## Intallations and imports


In [26]:
# NOTE: Run this colab on CPU to avoid issues

%%capture
!pip install -U transformers
!pip install -U accelerate
!pip install datasets
!pip install bertviz

In [None]:
from bertviz import head_view
import pandas as pd
from transformers import AutoModelForQuestionAnswering, AutoConfig, AutoTokenizer, AutoModel, utils, TrainingArguments, Trainer, pipeline
from datasets import load_dataset

In [27]:
try:
    from google.colab import drive
    drive.mount('/content/drive')
    IN_COLAB = True
except:
    IN_COLAB = False

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Downloading question-answering dataset

In [28]:
dataset = load_dataset("copenlu/answerable_tydiqa")
train = load_dataset("copenlu/answerable_tydiqa", split='train')
val = load_dataset("copenlu/answerable_tydiqa", split='validation')


Downloading readme:   0%|          | 0.00/4.94k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/2.47k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/71.6M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/7.49M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/116067 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/13325 [00:00<?, ? examples/s]

## Filtering dataset by language

In [30]:
def language_filter(dataset, lang):
    return dataset['language'] == lang

eng = dataset.filter(lambda row: language_filter(row, lang='english'))
ben = dataset.filter(lambda row: language_filter(row, lang='bengali'))
ar = dataset.filter(lambda row: language_filter(row, lang='arabic'))
ind = dataset.filter(lambda row: language_filter(row, lang='indonesian'))
eng_train = eng['train']
eng_val = eng['validation']
ben_train = ben['train']
ben_val = ben['validation']
ar_train = ar['train']
ar_val = ar['validation']
ind_train = ind['train']
ind_val = ind['validation']

Filter:   0%|          | 0/116067 [00:00<?, ? examples/s]

Filter:   0%|          | 0/13325 [00:00<?, ? examples/s]

Filter:   0%|          | 0/116067 [00:00<?, ? examples/s]

Filter:   0%|          | 0/13325 [00:00<?, ? examples/s]

Filter:   0%|          | 0/116067 [00:00<?, ? examples/s]

Filter:   0%|          | 0/13325 [00:00<?, ? examples/s]

Filter:   0%|          | 0/116067 [00:00<?, ? examples/s]

Filter:   0%|          | 0/13325 [00:00<?, ? examples/s]

In [31]:
ar

DatasetDict({
    train: Dataset({
        features: ['question_text', 'document_title', 'language', 'annotations', 'document_plaintext', 'document_url'],
        num_rows: 29598
    })
    validation: Dataset({
        features: ['question_text', 'document_title', 'language', 'annotations', 'document_plaintext', 'document_url'],
        num_rows: 1902
    })
})

## Analyzing dataset

In [33]:
# Analize the data shape/format
print('Shape---->',ar_train.shape)

print('Columns----->',ar_train.column_names)

print(ar_train.description)

Shape----> (29598, 6)
Columns-----> ['question_text', 'document_title', 'language', 'annotations', 'document_plaintext', 'document_url']
TyDi QA is a question answering dataset covering 11 typologically diverse languages with 204K question-answer pairs.
The languages of TyDi QA are diverse with regard to their typology -- the set of linguistic features that each language
expresses -- such that we expect models performing well on this set to generalize across a large number of the languages
in the world. It contains language phenomena that would not be found in English-only corpora. To provide a realistic
information-seeking task and avoid priming effects, questions are written by people who want to know the answer, but
don’t know the answer yet, (unlike SQuAD and its descendents) and the data is collected directly in each language without
the use of translation (unlike MLQA and XQuAD).


## Importing 'distilbert' pretrained tokenizer

In [None]:
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

## Preprocessing function

In [34]:
def preprocess_function(examples):
    questions = [q.strip() for q in examples["question_text"]]
    print(questions[0])
    inputs = tokenizer(
        questions,
        examples["document_plaintext"],
        max_length=384,
        truncation="only_second",
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    answers = examples["annotations"]
    print(answers[0])
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        answer = answers[i]
        start_char = answer["answer_start"][0]
        end_char = answer["answer_start"][0] + len(answer["answer_text"][0])
        sequence_ids = inputs.sequence_ids(i)

        # Find the start and end of the context
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside the context, label it (0, 0)
        if offset[context_start][0] > end_char or offset[context_end][1] < start_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise it's the start and end token positions
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

## Subsampling and preprocessing training and validation datasets for computational ease

In [35]:
num_train_samples = 600
num_test_samples = 200

ar_train = ar_train.shuffle(seed=42).select(range(num_train_samples))
ar_val = ar_val.shuffle(seed=42).select(range(num_test_samples))

In [36]:
tokenized_ar_train = ar_train.map(preprocess_function, batched=True)
tokenized_ar_val = ar_val.map(preprocess_function, batched=True)

Map:   0%|          | 0/600 [00:00<?, ? examples/s]

ما هي الشارنية؟
{'answer_start': [35], 'answer_text': ['شكل من أشكال الحياة الإدياكارية تشبه السعف']}


Map:   0%|          | 0/200 [00:00<?, ? examples/s]

من هو أوزبك خان؟
{'answer_start': [-1], 'answer_text': ['']}


## Importing data collator and pretrained 'distilbert' model

In [37]:
from transformers import DefaultDataCollator
data_collator = DefaultDataCollator()

In [38]:
model = AutoModelForQuestionAnswering.from_pretrained("distilbert-base-uncased")

Downloading model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

## Fine-tuning

In [39]:
training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/MASTERS KU/AUTUMN 2023/NLP/Week 39/qa_model_arabic",
    evaluation_strategy="epoch",
    learning_rate=2e-4,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.001,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ar_train,
    eval_dataset=tokenized_ar_val,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

trainer.train()

{'eval_loss': 2.211880683898926, 'eval_runtime': 35.4636, 'eval_samples_per_second': 5.64, 'eval_steps_per_second': 0.367, 'epoch': 1.0}
{'eval_loss': 2.114198684692383, 'eval_runtime': 35.3617, 'eval_samples_per_second': 5.656, 'eval_steps_per_second': 0.368, 'epoch': 2.0}
{'eval_loss': 2.2065062522888184, 'eval_runtime': 34.5482, 'eval_samples_per_second': 5.789, 'eval_steps_per_second': 0.376, 'epoch': 3.0}
{'eval_loss': 2.27561092376709, 'eval_runtime': 36.2712, 'eval_samples_per_second': 5.514, 'eval_steps_per_second': 0.358, 'epoch': 4.0}
{'eval_loss': 2.438361644744873, 'eval_runtime': 34.12, 'eval_samples_per_second': 5.862, 'eval_steps_per_second': 0.381, 'epoch': 5.0}
{'train_runtime': 1911.5216, 'train_samples_per_second': 1.569, 'train_steps_per_second': 0.099, 'train_loss': 1.9973197535464637, 'epoch': 5.0}


TrainOutput(global_step=190, training_loss=1.9973197535464637, metrics={'train_runtime': 1911.5216, 'train_samples_per_second': 1.569, 'train_steps_per_second': 0.099, 'train_loss': 1.9973197535464637, 'epoch': 5.0})

## Evaluation

In [40]:
results = trainer.evaluate()
print(results)

{'eval_loss': 2.438361644744873, 'eval_runtime': 35.8639, 'eval_samples_per_second': 5.577, 'eval_steps_per_second': 0.362, 'epoch': 5.0}
{'eval_loss': 2.438361644744873, 'eval_runtime': 35.8639, 'eval_samples_per_second': 5.577, 'eval_steps_per_second': 0.362, 'epoch': 5.0}


## Getting the outputs and analyzing its format

In [144]:
outputs = trainer.predict(tokenized_ar_val)

In [146]:
type(outputs)

transformers.trainer_utils.PredictionOutput

In [147]:
len(outputs)

3

## Predictions are in a tuple format, let's see each component

In [149]:
print(outputs[1][0].shape)

print(outputs[1][1].shape)

(200,)
(200,)


## Using a pipeline to get the answer

In [44]:
from transformers import pipeline
question_answerer = pipeline("question-answering", model=model, tokenizer=tokenizer)

In [152]:
for i in range(0,10):
    print('Answers, annotation  and response')
    question = tokenized_ar_val['question_text'][i]
    context = tokenized_ar_val['document_plaintext'][i]
    answer = question_answerer(question=question, context=context)
    print(question,answer)

Answers, annotation  and response
من هو أوزبك خان؟ {'score': 0.002334434539079666, 'start': 47, 'end': 60, 'answer': 'سيد البخاريين'}
Answers, annotation  and response
ماهى عاصمة أيرلندا الشمالية ؟ {'score': 0.026353929191827774, 'start': 85, 'end': 93, 'answer': 'الشمالية'}
Answers, annotation  and response
متى نشبت معركة دوروستولون؟ {'score': 0.04798007011413574, 'start': 41, 'end': 43, 'answer': '10'}
Answers, annotation  and response
أين تقع مديرية رغوان؟ {'score': 0.5703692436218262, 'start': 82, 'end': 86, 'answer': '2004'}
Answers, annotation  and response
كم تبلغ مساحة رومانيا؟ {'score': 0.015751240774989128, 'start': 371, 'end': 375, 'answer': '1969'}
Answers, annotation  and response
من هو أورخان غازي؟ {'score': 0.7683903574943542, 'start': 409, 'end': 419, 'answer': 'بلاد الروم'}
Answers, annotation  and response
متى توفي توماس كوهين ؟ {'score': 0.01593385450541973, 'start': 76, 'end': 80, 'answer': '1977'}
Answers, annotation  and response
ما هي السفسطة أو السفسطائية؟ {'sco

In [154]:
# Extract predicted start and end positions
start_logits, end_logits = outputs[1]
predicted_start = start_logits.argmax()
predicted_end = end_logits.argmax()

# Convert token positions to text spans
predicted_answer = tokenizer.decode(tokenized_ar_val["input_ids"][0][predicted_start:predicted_end + 1])

## Test the model on a random question and context

In [155]:
question = 'كم كان عمر إيمي واينهاوس عندما ماتت؟'
context = 'كانت إيمي واينهاوس مغنية بريطانية توفيت عن عمر يناهز 28 عاما بسبب تعاطي المخدرات'
question_answerer = pipeline("question-answering", model=model,tokenizer=tokenizer)
question_answerer(question=question, context=context,tokenizer=tokenizer)

{'score': 0.8692898750305176, 'start': 53, 'end': 55, 'answer': '28'}

## Save the fine tuned model:

In [49]:
model.save_pretrained("./qa_model_arabic")

# Load the fine tuned model

In [125]:
model_path = "/content/drive/MyDrive/MASTERS KU/AUTUMN 2023/NLP/Week 39/qa_model_arabic"
# Create a model configuration
config = AutoConfig.from_pretrained(model_path)
# Load fine-tuned model
fine_tuned_model = AutoModelForQuestionAnswering.from_pretrained(model_path, output_attentions=True) # output attentions for posterior visualizatoin
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
#tokenizer = AutoTokenizer.from_pretrained(model_path)

In [128]:
from transformers import pipeline
question = 'متى تأسست الجامعة؟'
context = 'تأسست الجامعة في عام 196 من قبل سوزانا'
question_answerer = pipeline("question-answering", model=fine_tuned_model,tokenizer=tokenizer)
question_answerer(question=question, context=context,tokenizer=tokenizer)

{'score': 0.3487803041934967,
 'start': 21,
 'end': 38,
 'answer': '196 من قبل سوزانا'}

In [None]:
# TO DO
#from datasets import load_metric
#compute_squad = load_metric("squad_v2")
#compute_squad(references=gold, predictions=formatted_predictions)

## Week 40

In [129]:
utils.logging.set_verbosity_error()  # Suppress standard warnings

input_text = "The cat sat on the mat"
inputs = tokenizer.encode_plus(input_text, return_tensors='pt')  # Tokenize input text
input_ids = inputs['input_ids']
attention_mask = inputs['attention_mask']  # If 'attention_mask' is returned by 'encode_plus'
print(inputs)
outputs = fine_tuned_model(input_ids, attention_mask=attention_mask)  # Run the model with input tensors
attention = outputs[-1]  # Retrieve attention from model outputs
print(len(attention))
tokens = tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])  # Convert input ids to token strings
head_view(attention, tokens)  # Display model view


{'input_ids': tensor([[  101,  1996,  4937,  2938,  2006,  1996, 13523,   102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1]])}
6


<IPython.core.display.Javascript object>

In [122]:
model_view(attention, tokens)

<IPython.core.display.Javascript object>