In [None]:
import copy
import json
import math
import os
import sys

import numpy as np
import pandas as pd
import spacy
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

In [None]:
import os

# Print the full path of every file found under the Kaggle input directory.
for folder, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

In [None]:
!pip install --quiet SentencePiece
!pip install --quiet transformers[torch]==4.3

In [None]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print('Using device:', device)

In [None]:


# Count the question/answer entries stored in the small SQuAD-style dev file.
with open("indonesian_datasets/question-answering/squad/data/tar/dev-v2.0_small.json") as f:
    content = json.load(f)

# One entry per element of every paragraph's 'qas' list.
num = sum(
    len(paragraph['qas'])
    for article in content['data']
    for paragraph in article['paragraphs']
)
print(num)

## **Preprocess dataset**

In [None]:
!pip install --quiet datasets'
from datasets import load_dataset
dataset = load_dataset("indonlu", "facqa", split='train')
val_data = load_dataset("indonlu", "facqa", split='validation')
test_data = load_dataset("indonlu", "facqa", split='test')

arr = []
for i in dataset:
  answer_idx = (np.array(i['seq_label'])!=0).nonzero()[0]
  ans = i['passage'][answer_idx[0]:answer_idx[-1]+1]
  arr.append(['<answer> '+' '.join(ans) +' <context> ' + ' '.join(i['passage']), ' '.join(i['question'])])
train_df = pd.DataFrame(arr, columns=['context','question'])

arr_val = []
for i in val_data:
    answer_idx = (np.array(i['seq_label'])!=0).nonzero()[0]
    ans = i['passage'][answer_idx[0]:answer_idx[-1]+1]
    arr_val.append(['<answer> '+' '.join(ans) +' <context> ' + ' '.join(i['passage']), ' '.join(i['question'])])
val_df = pd.DataFrame(arr_val, columns=['context','question'])

## **Load dataset**

In [None]:
# Flatten the nested SQuAD v2.0 training file into one record per question
# and write the result to hf_train-v2.0.json.
with open("indonesian_datasets/question-answering/squad/data/tar/train-v2.0.json") as f:
    content = json.load(f)

hf_data = []
for article in content["data"]:
    article_title = article["title"]
    for para in article["paragraphs"]:
        passage = para["context"]
        for qa in para["qas"]:
            # Questions flagged as impossible carry their candidates under
            # "plausible_answers" instead of "answers".
            picked = qa["plausible_answers"] if qa["is_impossible"] else qa["answers"]
            hf_data.append({
                "id":  qa["id"],
                "title": article_title,
                "context": passage,
                "question": qa["question"],
                "answers": {
                    "answer_start": [item["answer_start"] for item in picked],
                    "text": [item["text"] for item in picked],
                },
            })

with open("hf_train-v2.0.json", "w") as f:
    json.dump({"data": hf_data}, f)

In [None]:
# Load the flattened training file
# (original note, translated: "successfully preprocessed from hasil.json").
with open("/kaggle/input/squadindo/hf_train-v2.0.json") as f:
    content_2 = json.load(f)

df = pd.DataFrame(content_2['data'])
# Keep the first listed answer text of every record as its own column.
list_answer = df['answers'].map(lambda entry: entry['text'][0]).tolist()
df['answer'] = list_answer

In [None]:
# Build the model input "<answer> ANSWER <context> PASSAGE" and keep only the
# (input, target question) pair for training.
df1 = pd.DataFrame({
    'context': '<answer> ' + df['answer'] + ' <context> ' + df['context'],
    'question': df['question'],
})

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 10% of the first 100,000 examples for validation.
# The two-column frame is split in one call so 'context' and 'question' stay
# aligned without assigning a column onto the split output afterwards (that
# assignment is what raised pandas' SettingWithCopyWarning). A fixed seed makes
# the split identical on every Restart & Run All.
SPLIT_SEED = 42
X_train, X_test = train_test_split(
    df1[['context', 'question']][:100000],
    test_size = 0.1,
    shuffle=True,
    random_state=SPLIT_SEED,
)
# Kept for backward compatibility: one-column target frames matching the split.
y_train, y_test = X_train[['question']], X_test[['question']]

In [None]:
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, T5ForConditionalGeneration, T5Config

# Checkpoint name used for the tokenizer here and for the model weights later.
PRETRAINED_MODEL = 'Wikidepia/IndoT5-small'

tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_MODEL)
# Register the two marker tokens that delimit the answer and the passage in the model input.
tokenizer.add_special_tokens(dict(additional_special_tokens=['<answer>', '<context>']))

In [None]:
SEQ_LENGTH = 512

class FacQADataset(Dataset):
  """Dataset of (model input, target question) pairs read from a DataFrame.

  Each item tokenizes the row's 'context' column (the model input) and its
  'question' column (the target) with the module-level ``tokenizer``, padding
  or truncating both to ``SEQ_LENGTH`` tokens.

  Args:
    data: pandas DataFrame with 'context' and 'question' columns.
  """
  def __init__(self, data):
    self.data = data

  def __len__(self):
    return len(self.data)

  def _tokenize(self, text):
    """Tokenize ``text`` with padding/truncation to ``SEQ_LENGTH`` tokens."""
    # padding='max_length' replaces the deprecated pad_to_max_length=True.
    return tokenizer(text,
            padding='max_length',
            max_length=SEQ_LENGTH,
            truncation=True,
            return_tensors="pt")

  def __getitem__(self, idx):
    """Return ``(encoded_text, encoded_label)`` for row ``idx``, moved to ``device``."""
    if torch.is_tensor(idx):
      idx = idx.tolist()
    row = self.data.iloc[idx]

    encoded_text = self._tokenize(row['context'])
    # Drop only the leading axis (presumably the size-1 batch axis added by
    # return_tensors="pt"); squeeze(0) never touches the sequence axis.
    encoded_text['input_ids'] = torch.squeeze(encoded_text['input_ids'], 0)
    encoded_text['attention_mask'] = torch.squeeze(encoded_text['attention_mask'], 0)

    encoded_label = self._tokenize(row['question'])
    # Only the label ids are consumed by the training code in this notebook,
    # so the label attention mask is left exactly as the tokenizer returned it.
    encoded_label['input_ids'] = torch.squeeze(encoded_label['input_ids'], 0)

    # NOTE(review): moving tensors to the device here presumably only works with
    # the default single-process DataLoader; confirm before raising num_workers.
    return encoded_text.to(device), encoded_label.to(device)

In [None]:
# Wrap the split frames in datasets and batch them.
train_data = FacQADataset(X_train)
# shuffle=True reshuffles the training examples at the start of every epoch;
# without it each epoch replayed the batches in exactly the same order.
train_set = DataLoader(train_data, batch_size = 4, shuffle=True)
# Note: this rebinds `val_data`, which an earlier cell used for the FacQA validation split.
val_data = FacQADataset(X_test)
val_set = DataLoader(val_data, batch_size = 2)

## **Retrain (Fine-tuning)**

In [None]:
LR = 0.005
EPOCHS = 7
LOG_INTERVAL = 13031

# from_pretrained is a classmethod. Calling it on T5ForConditionalGeneration(config)
# first built a randomly initialised model that was thrown away, and the
# hand-made config (with decoder_start_token_id) never reached the loaded model.
# Passing the setting as a keyword applies it to the checkpoint's own config.
model = T5ForConditionalGeneration.from_pretrained(
    PRETRAINED_MODEL,
    decoder_start_token_id=tokenizer.pad_token_id,
)
# Keep the `config` name available, now bound to the config the model really uses.
config = model.config
# The tokenizer gained extra special tokens, so the embedding matrix must grow to match.
model.resize_token_embeddings(len(tokenizer))
# model.to(device)
optimizer = torch.optim.SGD(model.parameters(), lr = LR)

In [None]:
# Filename prefix for checkpoints; the training loop appends "_<epoch>.pth".
SAVED_MODEL_PATH = "qg_pretrained_t5_small_model_trained"

def train(epoch, best_val_loss):
    """Run one training epoch over the module-level ``train_set``.

    Uses the module-level ``model``, ``optimizer``, ``LOG_INTERVAL`` and
    ``mask_label_padding``. Every ``LOG_INTERVAL`` batches the mean loss of
    that window is printed.

    Args:
        epoch: epoch number, used only in the progress message.
        best_val_loss: accepted for call compatibility; not used here.
    """
    model.train()
    total_loss = 0.
    for batch_index, batch in enumerate(train_set):
        data, target = batch
        optimizer.zero_grad()
        masked_labels = mask_label_padding(target['input_ids'])
        output = model(
            input_ids=data['input_ids'],
            attention_mask=data['attention_mask'],
            labels=masked_labels
        )
        # The first element of the model output is the loss when labels are given.
        loss = output[0]
        loss.backward()
        # Clip the gradient norm to 0.5 before the optimizer step.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
        optimizer.step()

        total_loss += loss.item()
        # Count completed batches (1-based) so every logging window holds exactly
        # LOG_INTERVAL losses; the previous check on the 0-based index put
        # LOG_INTERVAL + 1 losses into the first window while dividing by LOG_INTERVAL.
        batches_done = batch_index + 1
        if batches_done % LOG_INTERVAL == 0:
            cur_loss = total_loss / LOG_INTERVAL
            print('| epoch {:3d} | ' 
                  '{:5d}/{:5d} batches | '
                  'loss {:5.2f}'.format(
                    epoch, 
                    batches_done, len(train_set), 
                    cur_loss))
            total_loss = 0

def evaluate(eval_model, data_loader):
    """Return the mean per-batch loss of ``eval_model`` over ``data_loader``.

    The model is switched to eval mode and gradients are disabled while the
    batches are processed. Each batch is an ``(inputs, targets)`` pair; padding
    positions in the target ids are masked with ``mask_label_padding``.

    Args:
        eval_model: model called with ``input_ids``, ``attention_mask`` and
            ``labels``; the first element of its output is taken as the loss.
        data_loader: sized iterable of ``(inputs, targets)`` batches.

    Returns:
        Sum of the batch losses divided by ``len(data_loader)``.
    """
    eval_model.eval()
    loss_sum = 0.

    with torch.no_grad():
        for inputs, targets in data_loader:
            labels = mask_label_padding(targets['input_ids'])
            result = eval_model(
                input_ids=inputs['input_ids'],
                attention_mask=inputs['attention_mask'],
                labels=labels,
            )
            loss_sum += result[0].item()

    return loss_sum / len(data_loader)

def mask_label_padding(labels):
    """Return a copy of ``labels`` with padding positions replaced by -100.

    Positions equal to ``tokenizer.pad_token_id`` are set to -100, the label
    value that the Hugging Face T5 loss ignores. The input tensor is left
    untouched; the previous version overwrote it in place, silently changing
    the caller's batch.

    Args:
        labels: integer tensor of target token ids.

    Returns:
        A new tensor shaped like ``labels`` with padding ids masked.
    """
    MASK_ID = -100
    return labels.masked_fill(labels == tokenizer.pad_token_id, MASK_ID)

def save(path, epoch, model_state_dict, optimizer_state_dict, loss):
    """Write a training checkpoint to ``path`` with ``torch.save``.

    The checkpoint is a dict with the keys 'epoch', 'model_state_dict',
    'optimizer_state_dict' and 'best_loss' (holding ``loss``).
    """
    checkpoint = dict(
        epoch=epoch,
        model_state_dict=model_state_dict,
        optimizer_state_dict=optimizer_state_dict,
        best_loss=loss,
    )
    torch.save(checkpoint, path)

def load(path, map_location=None):
    """Load a checkpoint written by ``save``.

    Args:
        path: checkpoint file to read.
        map_location: optional device remapping forwarded to ``torch.load``
            (for example 'cpu' to open a GPU-trained checkpoint on a machine
            without CUDA). ``None`` keeps the devices stored in the file.

    Returns:
        The object stored in the file.
    """
    # SECURITY: torch.load unpickles the file; only load checkpoints you trust.
    return torch.load(path, map_location=map_location)

def print_line():
    """Print a horizontal rule made of 60 dashes."""
    LINE_WIDTH = 60
    rule = ''.center(LINE_WIDTH, '-')
    print(rule)

In [None]:
import time

In [None]:
# Original note: beware when running this cell (the reason is not stated).
# Asks PyTorch to release the cached GPU memory it is not currently using.
torch.cuda.empty_cache()

In [None]:
# Move the model to the selected device and switch it to training mode.
model.to(device).train()

In [None]:
# Baseline: measure the validation loss before any fine-tuning, and time it.
start = time.time()

# Trackers consumed by the training loop.
best_model = None
best_val_loss = float("inf")

val_loss = evaluate(model, val_set)
print_line()
baseline_report = '| Before training | valid loss {:5.2f}'.format(val_loss)
print(baseline_report)

elapsed = time.time() - start
print(elapsed)

In [None]:
# Number of batches in the training loader.
len(train_set)

In [None]:
# Number of batches in the validation loader.
len(val_set)

In [None]:
import time
# Overrides the EPOCHS = 7 set in the hyperparameter cell; the training loop runs this many epochs.
EPOCHS = 32

In [None]:
# Fine-tune for EPOCHS epochs. A checkpoint is written whenever the validation
# loss improves, and always after the final epoch.
print_line()

for epoch in range(1, EPOCHS + 1):
    start = time.time()
    train(epoch=epoch, best_val_loss=best_val_loss)
    val_loss = evaluate(model, val_set)
    print_line()
    print('| end of epoch {:3d} | valid loss {:5.2f}'.format(
        epoch,
        val_loss)
    )
    print_line()

    improved = val_loss < best_val_loss
    if improved:
        # Only a real improvement may update the best loss; previously the final
        # epoch overwrote it even when its loss was worse.
        best_val_loss = val_loss
        # Snapshot the weights. `best_model = model` only aliased the live model,
        # so it always followed the most recent epoch instead of the best one.
        best_model = copy.deepcopy(model)

    if improved or epoch == EPOCHS:
        # 'best_loss' in the checkpoint is the best validation loss seen so far,
        # which on a non-improving final epoch differs from that epoch's own loss.
        save(
             SAVED_MODEL_PATH+'_'+str(epoch)+'.pth',
             epoch, 
             model.state_dict(), 
             optimizer.state_dict(), 
             best_val_loss
        )
        print("| Model saved.")
        print_line()
    print('=====time=======')
    print(time.time()-start)

In [None]:
# Upload the fine-tuned model and its tokenizer via their push_to_hub() methods.
# NOTE(review): both calls pass no repository name, and transformers is pinned
# to 4.3 earlier in this notebook; confirm that release supports this call form.
model.push_to_hub()
tokenizer.push_to_hub()