## Intallations and imports


In [9]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [10]:
# NOTE: Run this colab on CPU to avoid issues

%%capture
!pip install -U transformers
!pip install -U accelerate
!pip install datasets
!pip install bertviz

In [11]:
from bertviz import head_view
import pandas as pd
from transformers import AutoModelForQuestionAnswering, AutoConfig, AutoTokenizer, AutoModel, utils, TrainingArguments, Trainer, pipeline, DefaultDataCollator
from datasets import load_dataset

In [12]:
try:
    from google.colab import drive
    drive.mount('/content/drive')
    IN_COLAB = True
except:
    IN_COLAB = False

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Downloading question-answering dataset

In [13]:
dataset = load_dataset("copenlu/answerable_tydiqa")
train = load_dataset("copenlu/answerable_tydiqa", split='train')
val = load_dataset("copenlu/answerable_tydiqa", split='validation')


## Filtering dataset by language

In [14]:
def language_filter(dataset, lang):
    return dataset['language'] == lang

ind = dataset.filter(lambda row: language_filter(row, lang='indonesian'))
ind_train = ind['train']
ind_val = ind['validation']

In [15]:
ind

DatasetDict({
    train: Dataset({
        features: ['question_text', 'document_title', 'language', 'annotations', 'document_plaintext', 'document_url'],
        num_rows: 11394
    })
    validation: Dataset({
        features: ['question_text', 'document_title', 'language', 'annotations', 'document_plaintext', 'document_url'],
        num_rows: 1191
    })
})

## Analyzing dataset

In [16]:
# Analize the data shape/format
print('Shape---->',ind_train.shape)

print('Columns----->',ind_train.column_names)

print(ind_train.description)

Shape----> (11394, 6)
Columns-----> ['question_text', 'document_title', 'language', 'annotations', 'document_plaintext', 'document_url']
TyDi QA is a question answering dataset covering 11 typologically diverse languages with 204K question-answer pairs.
The languages of TyDi QA are diverse with regard to their typology -- the set of linguistic features that each language
expresses -- such that we expect models performing well on this set to generalize across a large number of the languages
in the world. It contains language phenomena that would not be found in English-only corpora. To provide a realistic
information-seeking task and avoid priming effects, questions are written by people who want to know the answer, but
don’t know the answer yet, (unlike SQuAD and its descendents) and the data is collected directly in each language without
the use of translation (unlike MLQA and XQuAD).


## Importing 'distilbert' pretrained tokenizer

In [17]:
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

## Preprocessing function

In [18]:
def preprocess_function(examples):
    questions = [q.strip() for q in examples["question_text"]]
    print(questions[0])
    inputs = tokenizer(
        questions,
        examples["document_plaintext"],
        max_length=384,
        truncation="only_second",
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    answers = examples["annotations"]
    print(answers[0])
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        answer = answers[i]
        start_char = answer["answer_start"][0]
        end_char = answer["answer_start"][0] + len(answer["answer_text"][0])
        sequence_ids = inputs.sequence_ids(i)

        # Find the start and end of the context
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside the context, label it (0, 0)
        if offset[context_start][0] > end_char or offset[context_end][1] < start_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise it's the start and end token positions
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

## Subsampling and preprocessing training and validation datasets for computational ease

In [19]:
num_train_samples = 600
num_test_samples = 200

ind_train = ind_train.shuffle(seed=42).select(range(num_train_samples))
ind_val = ind_val.shuffle(seed=42).select(range(num_test_samples))

In [20]:
tokenized_ind_train = ind_train.map(preprocess_function, batched=True)
tokenized_ind_val = ind_val.map(preprocess_function, batched=True)

Map:   0%|          | 0/600 [00:00<?, ? examples/s]

apakah yang dimaksud dengan nirlaba?
{'answer_start': [16], 'answer_text': ['istilah yang biasa digunakan sebagai sesuatu yang bertujuan sosial, kemasyarakatan atau lingkungan yang tidak semata-mata untuk mencari keuntungan materi']}


Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Siapa pendiri Institut Teknologi Sepuluh Nopember Surabaya?
{'answer_start': [-1], 'answer_text': ['']}


## Importing data collator and pretrained 'distilbert' model

In [21]:
data_collator = DefaultDataCollator()

In [22]:
model = AutoModelForQuestionAnswering.from_pretrained("distilbert-base-uncased")

Downloading model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Fine-tuning

In [23]:
training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/MASTERS KU/AUTUMN 2023/NLP/Week 39/qa_model_indonesian",
    evaluation_strategy="epoch",
    learning_rate=2e-4,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.001,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ind_train,
    eval_dataset=tokenized_ind_val,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

trainer.train()

Epoch,Training Loss,Validation Loss
1,No log,2.247285
2,No log,2.283199
3,No log,2.445383
4,No log,2.915856
5,No log,3.181023


TrainOutput(global_step=190, training_loss=1.4421041388260691, metrics={'train_runtime': 1891.8988, 'train_samples_per_second': 1.586, 'train_steps_per_second': 0.1, 'total_flos': 293969475072000.0, 'train_loss': 1.4421041388260691, 'epoch': 5.0})

## Evaluation

In [24]:
results = trainer.evaluate()
print(results)

{'eval_loss': 3.181022882461548, 'eval_runtime': 34.6769, 'eval_samples_per_second': 5.768, 'eval_steps_per_second': 0.375, 'epoch': 5.0}


## Getting the outputs and analyzing its format

In [25]:
outputs = trainer.predict(tokenized_ind_val)

In [26]:
type(outputs)

transformers.trainer_utils.PredictionOutput

In [27]:
len(outputs)

3

## Predictions are in a tuple format, let's see each component

In [28]:
print(outputs[1][0].shape)

print(outputs[1][1].shape)

(200,)
(200,)


## Using a pipeline to get the answer

In [29]:
from transformers import pipeline
question_answerer = pipeline("question-answering", model=model, tokenizer=tokenizer)

In [30]:
for i in range(0,10):
    print('Answers, annotation  and response')
    question = tokenized_ind_val['question_text'][i]
    context = tokenized_ind_val['document_plaintext'][i]
    answer = question_answerer(question=question, context=context)
    print(question,answer)

Answers, annotation  and response
Siapa pendiri Institut Teknologi Sepuluh Nopember Surabaya? {'score': 3.404992094147019e-05, 'start': 48, 'end': 91, 'answer': 'termuda di ITS yang berdiri pada tahun 2001'}
Answers, annotation  and response
Kapan Su Dingfang lahir? {'score': 0.009049175307154655, 'start': 743, 'end': 746, 'answer': 'dua'}
Answers, annotation  and response
apakah yang di maksud dengan masyarakat? {'score': 0.0005556134856306016, 'start': 28, 'end': 33, 'answer': 'utama'}
Answers, annotation  and response
Dimana letak Museum Anak Kolong Tangga? {'score': 0.017521733418107033, 'start': 334, 'end': 337, 'answer': 'dua'}
Answers, annotation  and response
Apa definisi dari wilayah ? {'score': 0.0005112206563353539, 'start': 63, 'end': 70, 'answer': 'daratan'}
Answers, annotation  and response
Apakah warna bendera Myanmar ? {'score': 0.811949610710144, 'start': 46, 'end': 60, 'answer': '3 Januari 1974'}
Answers, annotation  and response
Berapa luas SMK Negeri 1 Cikampek? {'s

In [31]:
# Extract predicted start and end positions
start_logits, end_logits = outputs[1]
predicted_start = start_logits.argmax()
predicted_end = end_logits.argmax()

# Convert token positions to text spans
predicted_answer = tokenizer.decode(tokenized_ind_val["input_ids"][0][predicted_start:predicted_end + 1])

## Test the model on a random question and context

In [44]:
question = 'Apa kota terbesar di Bali'
context = 'Denpasar adalah kota terbesar dan ibukota Kabupaten Badung di Bali, Indonesia, yang terkenal dengan atraksi budaya dan Lapangan Puputan'
question_answerer = pipeline("question-answering", model=model,tokenizer=tokenizer)
question_answerer(question=question, context=context,tokenizer=tokenizer)

{'score': 0.0006051050149835646, 'start': 16, 'end': 20, 'answer': 'kota'}

## Save the fine tuned model:

In [40]:
model.save_pretrained("/content/drive/MyDrive/MASTERS KU/AUTUMN 2023/NLP/Week 39/qa_model_indonesian")

# Load the fine tuned model

In [41]:
model_path = "/content/drive/MyDrive/MASTERS KU/AUTUMN 2023/NLP/Week 39/qa_model_indonesian"
# Create a model configuration
config = AutoConfig.from_pretrained(model_path)
# Load fine-tuned model
fine_tuned_model = AutoModelForQuestionAnswering.from_pretrained(model_path, output_attentions=True) # output attentions for posterior visualizatoin
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
#tokenizer = AutoTokenizer.from_pretrained(model_path)

In [43]:
from transformers import pipeline
question = 'Apa kota terbesar di Bali'
context = 'Denpasar adalah kota terbesar dan ibukota Kabupaten Badung di Bali, Indonesia, yang terkenal dengan atraksi budaya dan Lapangan Puputan'
question_answerer = pipeline("question-answering", model=fine_tuned_model,tokenizer=tokenizer)
question_answerer(question=question, context=context,tokenizer=tokenizer)

{'score': 0.0006051050149835646, 'start': 16, 'end': 20, 'answer': 'kota'}

In [63]:
gold_answers = []
predicted_answers = []
for i in range(5):
    question = tokenized_ind_val['question_text'][i]
    context = tokenized_ind_val['document_plaintext'][i]
    annotations = tokenized_ind_val['annotations'][i]
    val_answers = annotations['answer_text']
    gold_answers.append(val_answers)
    answers = (question_answerer(question=question, context=context)['answer'])
    predicted_answers.append(answers)

In [67]:
gold_answers

[[''], [''], [''], ['alan Sriwedani No. 1, Yogyakarta'], ['']]

In [68]:
predicted_answers

['termuda di ITS yang berdiri pada tahun 2001',
 'dua',
 'utama',
 'dua',
 'daratan']

In [66]:
# https://huggingface.co/spaces/evaluate-metric/squad_v2
from datasets import load_metric
compute_squad = load_metric("squad_v2")
compute_squad.compute(references=gold_answers, predictions=predicted_answers)

TypeError: ignored

## Week 40

In [37]:
utils.logging.set_verbosity_error()  # Suppress standard warnings

input_text = "The cat sat on the mat"
inputs = tokenizer.encode_plus(input_text, return_tensors='pt')  # Tokenize input text
input_ids = inputs['input_ids']
attention_mask = inputs['attention_mask']  # If 'attention_mask' is returned by 'encode_plus'
print(inputs)
outputs = fine_tuned_model(input_ids, attention_mask=attention_mask)  # Run the model with input tensors
attention = outputs[-1]  # Retrieve attention from model outputs
print(len(attention))
tokens = tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])  # Convert input ids to token strings
head_view(attention, tokens)  # Display model view


{'input_ids': tensor([[  101,  1996,  4937,  2938,  2006,  1996, 13523,   102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1]])}
6


<IPython.core.display.Javascript object>

In [38]:
model_view(attention, tokens)

NameError: ignored