In [None]:
import os
import sys
import random
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
import json

In [None]:
# Print the full path of every file available under the Kaggle input mount.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

In [None]:
# Run on the GPU when one is visible to torch, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print('Using device:', device)

In [None]:
# Count the question/answer entries in the small SQuAD-format training file.
with open("indonesian_datasets/question-answering/squad/data/tar/train-v2.0_small.json") as f:
    content = json.load(f)

num = 0
for article in content['data']:
    num += sum(len(paragraph['qas']) for paragraph in article['paragraphs'])
    # Running total, printed once per article.
    print(num)

# **Question answering: SQuAD dataset**

In [None]:
# Successfully preprocessed from hasil.json.
with open("/kaggle/input/questionanswer/hf_train-v2.0.json") as f:
    content = json.load(f)

qa_records = pd.DataFrame(content['data'])

# Each record's 'answers' entry is indexed as answers['text'][0] below.
# NOTE(review): the "v2.0" file name suggests some questions may carry an
# empty answers['text'] list; such rows would raise IndexError, so they are
# dropped here. Confirm this matches the intended preprocessing.
has_answer = qa_records['answers'].map(lambda answer: len(answer['text']) > 0)
qa_records = qa_records[has_answer]

# Keep only the first listed answer text for every question.
list_answer = [answer['text'][0] for answer in qa_records['answers']]
df = pd.DataFrame({
    'question': qa_records['question'].tolist(),
    'answer': list_answer,
})

In [None]:
!pip install --quiet SentencePiece
!pip install --quiet transformers[torch]==4.3

In [None]:
# Imported here, after the install cell, so the pinned transformers version is
# the one that gets loaded; without this import the name is undefined.
from transformers import BertTokenizer

# WordPiece tokenizer matching the pretrained IndoBERT checkpoint.
tokenizer = BertTokenizer.from_pretrained("indolem/indobert-base-uncased")

In [None]:
!pip install -U --quiet spacy[cuda92]
!python -m spacy download en_core_web_sm
import spacy
import en_core_web_sm
spacy.prefer_gpu()
spacy_nlp = en_core_web_sm.load()

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Split the question/answer frame once (80% train, 20% validation). Splitting
# both columns together and copying the result avoids assigning a new column
# onto a slice returned by train_test_split, which triggers pandas'
# chained-assignment warning.
X_train, X_test = train_test_split(df[['question', 'answer']], test_size=0.2, shuffle=True)
X_train = X_train.copy()
X_test = X_test.copy()

# Kept for any later cell that expects the separate target frames.
y_train = X_train[['answer']]
y_test = X_test[['answer']]

In [None]:
# Hugging Face model id of the pretrained checkpoint.
PRETRAINED_MODEL = 'indolem/indobert-base-uncased'
# Number of samples per batch fed to the DataLoaders.
BATCH_SIZE = 8
# Every encoded question/answer pair is padded or truncated to this many tokens.
SEQ_LENGTH = 512

class QAEvalDataset(Dataset):
    """Question/answer pairs for training a "does this answer fit?" classifier.

    Every access draws a random label. Label 1 returns the stored pair as is;
    label 0 returns a negative example produced by one of ``self.transforms``.
    Because the label and the transform are drawn on each call, the same index
    can yield different samples from one epoch to the next.

    The class reads the module-level ``tokenizer``, ``spacy_nlp``, ``device``
    and ``SEQ_LENGTH``.
    """

    # Cap on random draws in ``shuffle`` before switching to an explicit
    # filter, so a frame with (almost) no differing answers cannot hang.
    MAX_SHUFFLE_TRIES = 32

    def __init__(self, csv):
        """Store the source frame.

        Args:
            csv: DataFrame with a 'question' and an 'answer' column.
        """
        self.df = csv
        self.transforms = [self.shuffle, self.corrupt]

    def __len__(self):
        """Return the number of stored question/answer pairs."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``(encoded_pair, label)`` for the row at position ``idx``.

        Both elements are moved to ``device`` before being returned.
        """
        # Select by column name so extra or reordered columns cannot silently
        # swap question and answer.
        row = self.df.iloc[idx]
        question, answer = row['question'], row['answer']
        label = random.choice([0, 1])

        if label == 0:
            question, answer = random.choice(self.transforms)(question, answer)

        encoded_data = tokenizer(
            text=question,
            text_pair=answer,
            # `pad_to_max_length=True` is deprecated in favour of this form.
            padding='max_length',
            max_length=SEQ_LENGTH,
            truncation=True,
            return_tensors="pt"
        )

        # Drop only the leading batch dimension added by return_tensors="pt";
        # a bare squeeze() would also drop any other size-1 dimension.
        for key in ('input_ids', 'token_type_ids', 'attention_mask'):
            encoded_data[key] = encoded_data[key].squeeze(0)
        return (encoded_data.to(device), torch.tensor(label).to(device))

    def shuffle(self, question, answer):
        """Pair ``question`` with a different answer drawn from the frame.

        Raises:
            ValueError: if the frame holds no answer that differs from
                ``answer`` (the original loop would never terminate).
        """
        for _ in range(self.MAX_SHUFFLE_TRIES):
            candidate = self.df.sample(1)['answer'].item()
            if candidate != answer:
                return question, candidate

        # Random draws kept hitting the same answer: look for alternatives
        # explicitly instead of looping indefinitely.
        others = self.df.loc[self.df['answer'] != answer, 'answer']
        if others.empty:
            raise ValueError(
                "cannot build a negative example: every answer in the dataset "
                "equals {!r}".format(answer)
            )
        return question, others.sample(1).item()

    def corrupt(self, question, answer):
        """Build a negative example by tampering with named entities.

        With several entities in the question, all of them are replaced by one
        randomly chosen entity. With exactly one, that entity becomes the
        answer. If the pair ends up unchanged (no entities, or the edit was a
        no-op), fall back to ``shuffle`` so a label-0 sample never carries the
        original pair.
        """
        original_pair = (question, answer)
        doc = spacy_nlp(question)
        if len(doc.ents) > 1:
            # Replace all entities in the sentence with the same thing
            copy_ent = str(random.choice(doc.ents))
            for ent in doc.ents:
                question = question.replace(str(ent), copy_ent)
        elif len(doc.ents) == 1:
            # Replace the answer with an entity from the question
            answer = str(doc.ents[0])

        if (question, answer) == original_pair:
            question, answer = self.shuffle(question, answer)
        return question, answer


# Wrap each split in the dataset class, then batch it. Only the training
# loader reshuffles its samples; the validation loader keeps a fixed order.
train_set = QAEvalDataset(X_train)
valid_set = QAEvalDataset(X_test)

train_loader = DataLoader(dataset=train_set, batch_size=BATCH_SIZE, shuffle=True)
valid_loader = DataLoader(dataset=valid_set, batch_size=BATCH_SIZE, shuffle=False)

In [None]:
# Imported in this cell (after the install cell) so the pinned transformers
# version is used; without this import the class name is undefined.
from transformers import BertForSequenceClassification

# Learning rate for plain SGD.
LR = 0.001
# Number of training epochs.
EPOCHS = 10
# Emit a training log line every LOG_INTERVAL batches.
LOG_INTERVAL = 6515

# Pretrained IndoBERT encoder with a sequence-classification head on top.
model = BertForSequenceClassification.from_pretrained('indolem/indobert-base-uncased').to(device)
optimizer = torch.optim.SGD(model.parameters(), lr=LR)

In [None]:
# Base path for checkpoints written by the training loop and read by load().
SAVED_MODEL_PATH = "qa_eval_model_trained"

def train():
    """Run one optimisation pass over ``train_loader``.

    Works on the module-level ``model`` and ``optimizer``. Every
    ``LOG_INTERVAL`` completed batches it prints the mean loss of that window.
    The epoch number in the log line is read from the global ``epoch`` set by
    the training loop; it falls back to 0 when ``train()`` is called outside
    such a loop.
    """
    model.train()
    window_loss = 0.
    batch_count = len(train_loader)
    for batch_index, (data, labels) in enumerate(train_loader):
        optimizer.zero_grad()
        output = model(**data, labels=labels)
        loss = output[0]
        loss.backward()
        # Clip the global gradient norm to 0.5 before the update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
        optimizer.step()
        window_loss += loss.item()

        # Count completed batches (1-based) so each window holds exactly
        # LOG_INTERVAL losses and an epoch of exactly LOG_INTERVAL batches
        # still produces a log line.
        batches_done = batch_index + 1
        if batches_done % LOG_INTERVAL == 0:
            cur_loss = window_loss / LOG_INTERVAL
            print('| epoch {:3d} | '
                  '{:5d}/{:5d} batches | '
                  'loss {:5.2f}'.format(
                    globals().get('epoch', 0),
                    batches_done, batch_count,
                    cur_loss))
            window_loss = 0.

def evaluate(eval_model, data_loader):
    """Return the classification accuracy of ``eval_model`` on ``data_loader``.

    Args:
        eval_model: model that is callable as ``eval_model(**data, labels=labels)``
            and returns a sequence whose element 1 holds the per-class logits.
        data_loader: iterable of ``(data, labels)`` batches, where ``data`` is
            a mapping of keyword arguments for the model.

    Returns:
        Fraction of correctly classified samples as a float in ``[0, 1]``
        (multiply by 100 for a percentage). Returns 0.0 for an empty loader.
    """
    eval_model.eval()
    correct = 0
    seen = 0
    with torch.no_grad():
        for data, labels in data_loader:
            output = eval_model(**data, labels=labels)
            preds = torch.argmax(output[1], dim=1)
            correct += (preds == labels).sum().item()
            # Count real samples: the last batch may be smaller than the
            # loader's batch size, so len(loader) * batch_size overcounts.
            seen += labels.size(0)
    if seen == 0:
        return 0.0
    return correct / seen

def save(path, epoch, model_state_dict, optimizer_state_dict, loss):
    """Write a training checkpoint to ``path`` and report it.

    The checkpoint is a dict with the keys 'epoch', 'model_state_dict',
    'optimizer_state_dict' and 'best_loss'; ``loss`` is stored under
    'best_loss'.
    """
    checkpoint = dict(
        epoch=epoch,
        model_state_dict=model_state_dict,
        optimizer_state_dict=optimizer_state_dict,
        best_loss=loss,
    )
    torch.save(checkpoint, path)

    print("| Model saved.")
    print_line()

def load(path=None):
    """Load a checkpoint previously written with ``save``.

    Args:
        path: checkpoint file to read. When omitted, ``SAVED_MODEL_PATH`` is
            used if that exact file exists; otherwise the file named
            ``SAVED_MODEL_PATH + '_<epoch>.pth'`` with the highest epoch
            number is used, which is the naming scheme of the training loop.

    Returns:
        Whatever object was stored in the checkpoint file.

    Raises:
        FileNotFoundError: if no path is given and no checkpoint is found.
    """
    import glob
    import re

    if path is None:
        if os.path.exists(SAVED_MODEL_PATH):
            path = SAVED_MODEL_PATH
        else:
            numbered = re.compile(re.escape(SAVED_MODEL_PATH) + r'_(\d+)\.pth$')
            found = []
            for candidate in glob.glob(glob.escape(SAVED_MODEL_PATH) + '_*.pth'):
                match = numbered.search(candidate)
                if match:
                    found.append((int(match.group(1)), candidate))
            if not found:
                raise FileNotFoundError(
                    "no checkpoint found at {0!r} or matching {0!r}_<epoch>.pth".format(
                        SAVED_MODEL_PATH)
                )
            # Highest epoch number wins.
            path = max(found)[1]

    # NOTE: torch.load unpickles the file; only load checkpoints you trust.
    return torch.load(path)

def print_line():
    """Print a 60-character row of dashes used to separate log sections."""
    dash_count = 60
    rule = ''.join(['-'] * dash_count)
    print(rule)

In [None]:
import time

In [None]:
# Measure validation accuracy of the untrained classification head as a baseline.
start = time.time()
# Best validation accuracy seen so far; the training loop updates it.
highest_accuracy = 0

accuracy = evaluate(model, valid_loader)
print_line()
# evaluate() returns a fraction in [0, 1]; scale it so the '%' suffix is correct.
print('| Before training | accuracy on valid set: {:5.2f}%'.format(accuracy * 100))
print_line()

print('===================RUNNING TIME===========================')
print(time.time()-start)

In [None]:
# Number of epochs for the training loop; overrides any EPOCHS value set earlier.
EPOCHS = 16

In [None]:
# Train for EPOCHS epochs, evaluating after each one and checkpointing
# whenever validation accuracy improves. The loop variable must stay named
# `epoch`: train() reads it as a global for its log line.
for epoch in range(1, EPOCHS + 1):

    start = time.time()
    train()
    print('===================RUNNING TIME 1===========================')
    print(time.time()-start)

    start = time.time()
    accuracy = evaluate(model, valid_loader)
    print('===================RUNNING TIME 2===========================')
    print(time.time()-start)

    print_line()
    # evaluate() returns a fraction in [0, 1]; scale it so the '%' suffix is correct.
    print('| end of epoch {:3d} | accuracy on valid set: {:5.2f}%'.format(
        epoch,
        accuracy * 100)
    )
    print_line()

    if accuracy > highest_accuracy:
        highest_accuracy = accuracy
        # One file per improving epoch: <SAVED_MODEL_PATH>_<epoch>.pth
        save(
            SAVED_MODEL_PATH+'_'+str(epoch)+'.pth',
            epoch,
            model.state_dict(),
            optimizer.state_dict(),
            highest_accuracy
        )

In [None]:
# Upload the fine-tuned model and its tokenizer to the Hugging Face Hub.
# NOTE(review): `model` holds the weights from the last epoch trained, which is
# not necessarily the best checkpoint written during training — confirm this
# is the version that should be published.
# NOTE(review): no repository name is passed; verify that the installed
# transformers version accepts push_to_hub() without one.
model.push_to_hub()
tokenizer.push_to_hub()