## Intallations and imports


In [None]:
%load_ext autoreload
%autoreload 2

In [1]:
# NOTE: Run this colab on CPU to avoid issues

%%capture
!pip install -U transformers
!pip install -U accelerate
!pip install datasets
!pip install bertviz

In [38]:
from bertviz import head_view, model_view
import pandas as pd
from transformers import AutoModelForQuestionAnswering, AutoConfig, AutoTokenizer, AutoModel, utils, TrainingArguments, Trainer, pipeline, DefaultDataCollator
from datasets import load_dataset

In [28]:
try:
    from google.colab import drive
    drive.mount('/content/drive')
    IN_COLAB = True
except:
    IN_COLAB = False

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Downloading question-answering dataset

In [4]:
dataset = load_dataset("copenlu/answerable_tydiqa")
train = load_dataset("copenlu/answerable_tydiqa", split='train')
val = load_dataset("copenlu/answerable_tydiqa", split='validation')


Downloading readme:   0%|          | 0.00/4.94k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/2.47k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/71.6M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/7.49M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/116067 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/13325 [00:00<?, ? examples/s]

## Filtering dataset by language

In [5]:
def language_filter(dataset, lang):
    return dataset['language'] == lang

ben = dataset.filter(lambda row: language_filter(row, lang='bengali'))
ben_train = ben['train']
ben_val = ben['validation']

Filter:   0%|          | 0/116067 [00:00<?, ? examples/s]

Filter:   0%|          | 0/13325 [00:00<?, ? examples/s]

In [6]:
ben

DatasetDict({
    train: Dataset({
        features: ['question_text', 'document_title', 'language', 'annotations', 'document_plaintext', 'document_url'],
        num_rows: 4779
    })
    validation: Dataset({
        features: ['question_text', 'document_title', 'language', 'annotations', 'document_plaintext', 'document_url'],
        num_rows: 224
    })
})

## Analyzing dataset

In [7]:
# Analize the data shape/format
print('Shape---->',ben_train.shape)

print('Columns----->',ben_train.column_names)

print(ben_train.description)

Shape----> (4779, 6)
Columns-----> ['question_text', 'document_title', 'language', 'annotations', 'document_plaintext', 'document_url']
TyDi QA is a question answering dataset covering 11 typologically diverse languages with 204K question-answer pairs.
The languages of TyDi QA are diverse with regard to their typology -- the set of linguistic features that each language
expresses -- such that we expect models performing well on this set to generalize across a large number of the languages
in the world. It contains language phenomena that would not be found in English-only corpora. To provide a realistic
information-seeking task and avoid priming effects, questions are written by people who want to know the answer, but
don’t know the answer yet, (unlike SQuAD and its descendents) and the data is collected directly in each language without
the use of translation (unlike MLQA and XQuAD).


## Importing 'distilbert' pretrained tokenizer

In [8]:
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

## Preprocessing function

In [9]:
def preprocess_function(examples):
    questions = [q.strip() for q in examples["question_text"]]
    print(questions[0])
    inputs = tokenizer(
        questions,
        examples["document_plaintext"],
        max_length=384,
        truncation="only_second",
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    answers = examples["annotations"]
    print(answers[0])
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        answer = answers[i]
        start_char = answer["answer_start"][0]
        end_char = answer["answer_start"][0] + len(answer["answer_text"][0])
        sequence_ids = inputs.sequence_ids(i)

        # Find the start and end of the context
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside the context, label it (0, 0)
        if offset[context_start][0] > end_char or offset[context_end][1] < start_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise it's the start and end token positions
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

## Subsampling and preprocessing training and validation datasets for computational ease

In [10]:
num_train_samples = 600
num_test_samples = 200

ben_train = ben_train.shuffle(seed=42).select(range(num_train_samples))
ben_val = ben_val.shuffle(seed=42).select(range(num_test_samples))

In [11]:
tokenized_ben_train = ben_train.map(preprocess_function, batched=True)
tokenized_ben_val = ben_val.map(preprocess_function, batched=True)

Map:   0%|          | 0/600 [00:00<?, ? examples/s]

আবহাওয়া দপ্তরের অফিসটি কলকাতার কোথায় অবস্থিত ?
{'answer_start': [-1], 'answer_text': ['']}


Map:   0%|          | 0/200 [00:00<?, ? examples/s]

পশ্চিমবঙ্গের হুগলী জেলার সদর শহর কোথায় ?
{'answer_start': [138], 'answer_text': ['চুঁচুড়া']}


## Importing data collator and pretrained 'distilbert' model

In [12]:
data_collator = DefaultDataCollator()

In [13]:
model = AutoModelForQuestionAnswering.from_pretrained("distilbert-base-uncased")

Downloading model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Fine-tuning

In [15]:
training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/MASTERS KU/AUTUMN 2023/NLP/Week 39/qa_model_bengali",
    evaluation_strategy="epoch",
    learning_rate=2e-4,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.001,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ben_train,
    eval_dataset=tokenized_ben_val,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

trainer.train()

Epoch,Training Loss,Validation Loss
1,No log,2.739694
2,No log,2.75966
3,No log,2.679967
4,No log,2.713308
5,No log,2.942724


TrainOutput(global_step=190, training_loss=2.1533289859169407, metrics={'train_runtime': 2043.1335, 'train_samples_per_second': 1.468, 'train_steps_per_second': 0.093, 'total_flos': 293969475072000.0, 'train_loss': 2.1533289859169407, 'epoch': 5.0})

## Evaluation

In [16]:
results = trainer.evaluate()
print(results)

{'eval_loss': 2.9427242279052734, 'eval_runtime': 36.419, 'eval_samples_per_second': 5.492, 'eval_steps_per_second': 0.357, 'epoch': 5.0}


## Getting the outputs and analyzing its format

In [17]:
outputs = trainer.predict(tokenized_ben_val)

In [18]:
type(outputs)

transformers.trainer_utils.PredictionOutput

In [19]:
len(outputs)

3

## Predictions are in a tuple format, let's see each component

In [20]:
print(outputs[1][0].shape)

print(outputs[1][1].shape)

(200,)
(200,)


## Using a pipeline to get the answer

In [21]:
from transformers import pipeline
question_answerer = pipeline("question-answering", model=model, tokenizer=tokenizer)

In [22]:
for i in range(0,10):
    print('Answers, annotation and response')
    question = tokenized_ben_val['question_text'][i]
    context = tokenized_ben_val['document_plaintext'][i]
    answer = question_answerer(question=question, context=context)
    print(question,answer)

Answers, annotation  and response
পশ্চিমবঙ্গের হুগলী জেলার সদর শহর কোথায় ? {'score': 0.004799097776412964, 'start': 82, 'end': 98, 'answer': 'জেলা। হুগলী নদীর'}
Answers, annotation  and response
নর্স পুরাণ কবে রচিত হয় ? {'score': 0.0018744951812550426, 'start': 0, 'end': 11, 'answer': '১৭শ শতাব্দী'}
Answers, annotation  and response
চীনের রাজধানী কোথায় ? {'score': 0.0019455363508313894, 'start': 0, 'end': 18, 'answer': 'চীনারা তাদের দেশকে'}
Answers, annotation  and response
বিখ্যাত জ্যোতির্বিজ্ঞানী নিকোলাউস কোপের্নিকুসের জন্ম কবে হয় ? {'score': 0.009640130214393139, 'start': 81, 'end': 106, 'answer': '১৪৭৩ সালের ১৮ ফেব্রুয়ারী'}
Answers, annotation  and response
পশ্চিম ভারতের মহারাষ্ট্র রাজ্যের মুম্বাই শহরে নির্মিত গেটওয়ে অব ইন্ডিয়ার নির্মাণ কাজ কত সাল থেকে শুরু হয়েছিল ? {'score': 0.026003047823905945, 'start': 0, 'end': 6, 'answer': 'দিল্লী'}
Answers, annotation  and response
বিখ্যাত জ্যোতির্বিজ্ঞানী নিকোলাউস কোপের্নিকুসের জন্ম কোথায় হয় ? {'score': 0.02911493554711342, 'start': 61, 

In [23]:
# Extract predicted start and end positions
start_logits, end_logits = outputs[1]
predicted_start = start_logits.argmax()
predicted_end = end_logits.argmax()

# Convert token positions to text spans
predicted_answer = tokenizer.decode(tokenized_ben_val["input_ids"][0][predicted_start:predicted_end + 1])

## Test the model on a random question and context

In [24]:
question = 'স্পেন কবে বিশ্বকাপ জিতেছে?'
context = 'দক্ষিণ আফ্রিকার ২০১০ বিশ্বকাপ জিতেছিল স্পেন, আর ২০১৪ বিশ্বকাপ ব্রাজিলে অনুষ্ঠিত হয়েছিল এবং জার্মানি জিতেছিল।'
question_answerer = pipeline("question-answering", model=model,tokenizer=tokenizer)
question_answerer(question=question, context=context,tokenizer=tokenizer)

{'score': 0.000857316073961556,
 'start': 30,
 'end': 43,
 'answer': 'জিতেছিল স্পেন'}

## Save the fine tuned model:

In [33]:
model.save_pretrained("/content/drive/MyDrive/MASTERS KU/AUTUMN 2023/NLP/Week 39/qa_model_bengali")

# Load the fine tuned model

In [34]:
model_path = '/content/drive/MyDrive/MASTERS KU/AUTUMN 2023/NLP/Week 39/qa_model_bengali'
# Create a model configuration
config = AutoConfig.from_pretrained(model_path)
# Load fine-tuned model
fine_tuned_model = AutoModelForQuestionAnswering.from_pretrained(model_path, output_attentions=True) # output attentions for posterior visualization
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
#tokenizer = AutoTokenizer.from_pretrained(model_path)

In [35]:
# in this code snippet, we can see that the model also makes lots of mistakes when the context is a bit complex
question = 'স্প্যানিশ মহিলা ফুটবল দলও কি বিশ্বকাপ জিতেছে?'
context = 'যদিও এটি সাধারণত বলা হয় যে স্প্যানিশ ফুটবল জাতীয় দল ২০১০ সালে বিশ্বকাপ জিতেছিল, তবে সত্যটি হল ২০২৩ সালে মহিলা জাতীয় দলও বিশ্বকাপ জিতেছিল।'
question_answerer = pipeline("question-answering", model=fine_tuned_model,tokenizer=tokenizer)
question_answerer(question=question, context=context,tokenizer=tokenizer)

{'score': 0.08900346606969833, 'start': 54, 'end': 58, 'answer': '২০১০'}

In [None]:
# TO DO
#from datasets import load_metric
#compute_squad = load_metric("squad_v2")
#compute_squad(references=gold, predictions=formatted_predictions)

## Week 40

In [36]:
utils.logging.set_verbosity_error()  # Suppress standard warnings

input_text = "The cat sat on the mat"
inputs = tokenizer.encode_plus(input_text, return_tensors='pt')  # Tokenize input text
input_ids = inputs['input_ids']
attention_mask = inputs['attention_mask']  # If 'attention_mask' is returned by 'encode_plus'
print(inputs)
outputs = fine_tuned_model(input_ids, attention_mask=attention_mask)  # Run the model with input tensors
attention = outputs[-1]  # Retrieve attention from model outputs
print(len(attention))
tokens = tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])  # Convert input ids to token strings
head_view(attention, tokens)  # Display model view


{'input_ids': tensor([[  101,  1996,  4937,  2938,  2006,  1996, 13523,   102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1]])}
6


<IPython.core.display.Javascript object>

In [39]:
model_view(attention, tokens)

<IPython.core.display.Javascript object>