# Depedencies

## Libraries

In [1]:
!pip3 install transformers
!pip3 install torch



In [2]:
import torch
from torch.utils.data import Dataset, DataLoader

from transformers import BertForQuestionAnswering, BertTokenizer, BertConfig

import seaborn as sns 
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

## Dataset

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
TRAIN_PATH = '/content/drive/MyDrive/Data Science/Project/GEMASTIK 2021/Final/train_preprocess.csv'
VAL_PATH = '/content/drive/MyDrive/Data Science/Project/GEMASTIK 2021/Final/valid_preprocess.csv'

In [5]:
train = pd.read_csv(TRAIN_PATH)
val = pd.read_csv(VAL_PATH)

# Pre-Process

## Question and Passage

In [6]:
train.sample(1)

Unnamed: 0,question,passage,seq_label
1501,"['Berapa', 'orangkah', 'anggota', 'Dewan', 'IA...","['Resolusi', 'IAEA', 'itu', 'pada', 'dasarnya'...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."


In [7]:
train['question'] = train['question'].apply(lambda x: " ".join(eval(x)))
train['passage'] = train['passage'].apply(lambda x: " ".join(eval(x)))

In [8]:
train.sample(1)

Unnamed: 0,question,passage,seq_label
1361,apa nama latin biawak,"Ketika tim tiba siang hari , para orangutan it...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."


## Sequence Label

In [9]:
from nltk.tokenize import word_tokenize
import nltk

In [10]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [11]:
def BI(seq_label):
    B_idx, I_idx = 0,0
    curr_idx = 0
    for i in seq_label.replace("', '", "").replace("['", "").replace("']", ""):
        if i == 'B':
            B_idx = curr_idx 
        if i == 'I':
            I_idx = curr_idx 
        curr_idx += 1

    return B_idx, I_idx

In [12]:
train['tokenized'] = train['passage'].apply(word_tokenize)

train['answer'] = np.nanargmax
for idx, rows in train.iterrows():
    B, I = BI(rows['seq_label'])
    if I == 0:
        rows['answer'] = rows["tokenized"][B]
    else:
        rows['answer'] = " ".join(rows["tokenized"][B:I+1])

In [13]:
train.drop(['seq_label', 'tokenized'], axis=1, inplace=True)

In [14]:
train.sample(1)

Unnamed: 0,question,passage,answer
2192,Siapakah Mantan Deputi Perdana Menteri Malaysi...,"Kuala lumpur , kamis - Mantan Deputi Perdana M...",Anwar Ibrahim


# Input Data

## Model

In [15]:
MODEL_NAME = "indobenchmark/indobert-base-p2"
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
model = BertForQuestionAnswering.from_pretrained(MODEL_NAME)

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at indobenchmark/indobert-base-p2 and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:
from transformers import BertConfig

# Initializing a BERT bert-base-uncased style configuration
configuration = BertConfig()

# Initializing a model from the bert-base-uncased style configuration
model = BertForQuestionAnswering(configuration)

# Accessing the model configuration
configuration = model.config

In [17]:
configuration

BertConfig {
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.11.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

## Instance Trial

In [18]:
question = train['question'][0]
passage = train['passage'][0]

print("Question: {}".format(question))
print("Passage: {}".format(passage))

Question: Kelompok apakah yang menyatakan bertanggung jawab atas ledakan di Srinagar ?
Passage: Lewat telepon ke kantor berita lokal Current News Service , Hezb-ul Mujahedeen , kelompok militan Kashmir yang terbesar , menyatakan bertanggung jawab atas ledakan di Srinagar .


In [19]:
# Apply tokenizer (Token Ids)
input_ids = tokenizer.encode(question, passage)
for ids in input_ids:
  print(ids, end=" ")

2 1311 937 34 2195 3987 1024 441 10884 26 4750 23620 12 30477 3 2145 3178 43 1571 2140 2752 24072 4425 4266 30468 1991 30387 30368 30469 601 18642 15188 9 30468 1311 26762 575 13461 72 34 2805 30468 2195 3987 1024 441 10884 26 4750 23620 12 30470 3 

In [20]:
# String representations of Token Ids
tokens = tokenizer.convert_ids_to_tokens(input_ids)

for token, id in zip(tokens, input_ids):
    if id == tokenizer.sep_token_id:
        print('')

    print('{:<12} {:>6,}'.format(token, id))
    if id == tokenizer.sep_token_id:
        print('')

[CLS]             2
kelompok      1,311
apakah          937
yang             34
menyatakan    2,195
bertanggung   3,987
jawab         1,024
atas            441
ledakan      10,884
di               26
sri           4,750
##nag        23,620
##ar             12
?            30,477

[SEP]             3

lewat         2,145
telepon       3,178
ke               43
kantor        1,571
berita        2,140
lokal         2,752
current      24,072
news          4,425
service       4,266
,            30,468
he            1,991
##z          30,387
##b          30,368
-            30,469
ul              601
mujah        18,642
##ede        15,188
##en              9
,            30,468
kelompok      1,311
militan      26,762
kas             575
##hm         13,461
##ir             72
yang             34
terbesar      2,805
,            30,468
menyatakan    2,195
bertanggung   3,987
jawab         1,024
atas            441
ledakan      10,884
di               26
sri           4,750
##nag        23,62

In [21]:
# Question and passage segmentation
sep_index = input_ids.index(tokenizer.sep_token_id)

num_seg_a = sep_index + 1
num_seg_b = len(input_ids) - num_seg_a

segment_ids = [0]*num_seg_a + [1]*num_seg_b
assert len(segment_ids) == len(input_ids)

for ids in segment_ids:
  print(ids, end=" ")

0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 

In [33]:
scores = model(input_ids=torch.tensor([input_ids]), token_type_ids=torch.tensor([segment_ids]))

In [34]:
scores

QuestionAnsweringModelOutput([('start_logits',
                               tensor([[ 0.3262,  0.4315,  0.5480, -0.3025,  0.1049,  0.6590,  0.8419,  0.0899,
                                         0.2076,  0.2457,  0.0792, -0.2819,  0.2909,  0.1270,  0.4943,  0.4531,
                                         1.2032,  0.8738,  0.6414,  0.8590,  0.5857,  0.8825,  0.5730,  0.2870,
                                         1.0768, -0.1091, -0.2752, -0.5461,  0.0358,  0.3461,  0.1188,  0.1227,
                                         0.3420,  0.9194,  0.8786,  1.2745,  0.2697,  0.5761, -0.0050, -0.5668,
                                         0.9843,  0.3420, -0.4176,  0.5979,  0.5708,  0.7775,  0.7004,  0.5933,
                                         0.9412, -0.2885, -0.1101,  0.1245,  0.4167]],
                                      grad_fn=<CopyBackwards>)),
                              ('end_logits',
                               tensor([[-0.4654,  0.3069,  0.7186,  0.5722, -0.1729,

In [35]:
answer_start = torch.argmax(scores['start_logits'])
answer_end = torch.argmax(scores['end_logits'])

answer = ' '.join(tokens[answer_start:answer_end+1])
print('Answer: "' + answer + '"')

Answer: "militan kas ##hm ##ir yang terbesar , menyatakan bertanggung jawab atas ledakan di"


In [36]:
answer = tokens[answer_start]

for i in range(answer_start + 1, answer_end + 1):
    
    if tokens[i][0:2] == '##':
        answer += tokens[i][2:]
    else:
        answer += ' ' + tokens[i]

print('Answer: "' + answer + '"')

Answer: "militan kashmir yang terbesar , menyatakan bertanggung jawab atas ledakan di"


## Input Loader

In [129]:
class QADataset(Dataset):
  def __init__(self, dataframe, tokenizer):
    self.len = len(dataframe)
    self.data = dataframe
    self.tokenizer = tokenizer
    self.maxlen = 128

  def __get__item(self, index):
    question = self.data.question[index]
    passage = self.data.question[index]

    inputs = tokenizer.encode_plus(question, passage,
                                   max_length=128,
                                   add_special_tokens=True,
                                   pad_to_max_length=True,
                                   return_attetion_mask=True,
                                   truncation=True)

    ids = inputs['input_ids']
    segment = inputs['token_type_ids']
    mask = inputs['attention_mask']

    return {
        'ids': torch.tensor([ids], dtype=torch.long),
        'segments': torch.tensor([segment], dtype=torch.long),
        'mask': torch.tensor([mask], dtype=torch.long)
    }
  
  def __len__(self):
    return self.len

In [130]:
trainset = QADataset(train, tokenizer)
trainload = DataLoader(trainset)

loss_function = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-5)

trainload

<torch.utils.data.dataloader.DataLoader at 0x7fb4aaba01d0>

In [123]:
def evaluate(data, output):
    answer = ''
    max_prob = float('-inf')
    num_of_windows = data[0].shape[1]
    
    for k in range(num_of_windows):
        # Obtain answer by choosing the most probable start position / end position
        start_prob, start_index = torch.max(output.start_logits[k], dim=0)
        end_prob, end_index = torch.max(output.end_logits[k], dim=0)
        
        # Probability of answer is calculated as sum of start_prob and end_prob
        prob = start_prob + end_prob
        
        # Replace answer if calculated probability is larger than previous windows
        if prob > max_prob:
            max_prob = prob
            answer = tokenizer.decode(data[0][0][k][start_index : end_index + 1])
    
    return answer.replace(' ','')