In [1]:
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
import sys
sys.path.append(os.path.join(os.getcwd(), '..'))

import pandas as pd
import numpy as np
from transformers import BertTokenizer, BertModel, AutoModel
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader
import torch
from torch.optim import AdamW
# 
from src.data.dataset import EssayDataset
from src.data.longDataset import LongEssayDataset
from src.models.bert_regression import BertRegressionModel
from src.models.hierarchicalBert import HierarchicalBert
from datasets import load_dataset
# 
from sklearn.model_selection import train_test_split
from sklearn.metrics import cohen_kappa_score

In [4]:
# Load the essay-scoring dataset; the path is relative to the notebook's working directory.
df = pd.read_csv("../data/aes_dataset.csv")
# Preview the first rows (as the cell's last expression this is what the cell displays).
df.head()

Unnamed: 0,question,reference_answer,answer,score,dataset,max_length1,normalized_score,normalized_score2
0,Jelaskan kegunaan karbohidrat untuk tubuh kita.,Fungsi karbohidrat adalah sebagai pemasok ener...,"sumber tenaga, pemanis alami, menjaga sistem i...",27.0,analisis_essay,65,0.27,27
1,Jelaskan kegunaan karbohidrat untuk tubuh kita.,Fungsi karbohidrat adalah sebagai pemasok ener...,"sebagai sumber energi, pemanis alami, menjaga ...",21.0,analisis_essay,66,0.21,21
2,Jelaskan kegunaan karbohidrat untuk tubuh kita.,Fungsi karbohidrat adalah sebagai pemasok ener...,1. Sebagai energi. 2. Sebagai memperlancaar pe...,42.0,analisis_essay,76,0.42,42
3,Jelaskan kegunaan karbohidrat untuk tubuh kita.,Fungsi karbohidrat adalah sebagai pemasok ener...,"untuk membuat kenyang, agar tidak lapar, agar ...",18.0,analisis_essay,67,0.18,18
4,Jelaskan kegunaan karbohidrat untuk tubuh kita.,Fungsi karbohidrat adalah sebagai pemasok ener...,Karbohidrat mempunyai peran penting untuk pros...,82.0,analisis_essay,105,0.82,82


In [3]:
# Split every source dataset on its own so that each one contributes to the
# train / validation / test partitions in the same proportions.
subset_dataset = df['dataset'].unique()
splits = {}

for subset in subset_dataset:
    subset_df = df[df['dataset'] == subset]

    # split dataset (70:20:10)
    # Step 1: keep 70% for training and hold out the remaining 30%.
    train, temp = train_test_split(subset_df, test_size=0.3, random_state=42)
    # Step 2: one third of the hold-out (10% of the subset) becomes the test
    # set and the other two thirds (20%) the validation set. Using
    # test_size=0.3 here would give a 70:21:9 split instead of 70:20:10.
    valid, test = train_test_split(temp, test_size=1 / 3, random_state=42)

    splits[subset] = {
        'train': train,
        'valid': valid,
        'test': test,
    }

In [4]:
# Stitch the per-subset partitions back together: for each partition name,
# concatenate that partition of every subset in the order of `subset_dataset`.
train_dataset, valid_dataset, test_dataset = (
    pd.concat([splits[subset][part] for subset in subset_dataset])
    for part in ('train', 'valid', 'test')
)

# Training

In [5]:
# # filter dataset by category
# selected_category = "analisis_essay"
# train_dataset = train_dataset[train_dataset['dataset'] == selected_category]
# valid_dataset = valid_dataset[valid_dataset['dataset'] == selected_category]
# test_dataset = test_dataset[test_dataset['dataset'] == selected_category]

In [5]:
# load indobert tokenizer
# NOTE(review): loading this checkpoint through BertTokenizer emits a warning that the
# checkpoint's own tokenizer class is 'AlbertTokenizer' and that tokenization may be
# unexpected — confirm BertTokenizer is the intended class for this checkpoint.
tokenizer = BertTokenizer.from_pretrained("indobenchmark/indobert-lite-base-p2")

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'AlbertTokenizer'. 
The class this function is called from is 'BertTokenizer'.


In [6]:
# Wrap each partition in a LongEssayDataset, built in train -> valid -> test order.
# Per the original note, 512 is the max length and 128 the overlap
# (the constructor's signature is not visible here — TODO confirm argument meaning).
train_data, valid_data, test_data = (
    LongEssayDataset(frame, tokenizer, 512, 128)
    for frame in (train_dataset, valid_dataset, test_dataset)
)

In [7]:
# Wrap the datasets in DataLoaders that yield batches of 4 samples.
def transpose_samples(samples):
    """Regroup a list of per-sample tuples into a list of per-field tuples.

    Used as the DataLoader ``collate_fn`` so that a batch can be unpacked as
    ``batch, targets`` without the default collation stacking the fields.
    """
    return list(zip(*samples))


# Only the training loader is shuffled: `train_dataset` is a concatenation of
# the per-subset splits, so without shuffling every subset would be presented
# as one contiguous run of batches. The seeded generator keeps the batch order
# reproducible across runs.
train_dataloader = DataLoader(
    train_data,
    batch_size=4,
    shuffle=True,
    generator=torch.Generator().manual_seed(42),
    collate_fn=transpose_samples,
)
# Evaluation loaders keep the dataset order.
valid_dataloader = DataLoader(valid_data, batch_size=4, collate_fn=transpose_samples)
test_dataloader = DataLoader(test_data, batch_size=4, collate_fn=transpose_samples)

In [8]:
# Set up everything the training loop needs: device, model, optimizer, loss
# function (MSE) and the list that collects per-epoch metrics.

# Use the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Build the model and move it to the selected device.
model = HierarchicalBert("indobenchmark/indobert-lite-base-p2").to(device)

# Number of passes over the training data.
epochs = 1

criterion = torch.nn.MSELoss()
optimizer = AdamW(model.parameters(), lr=1e-5)

# One dict of metrics is appended per epoch by the training loop.
results = list()

In [None]:
def calculate_qwk(all_targets, all_predictions, min_score=0, max_score=100):
    """Return the quadratic-weighted Cohen's kappa between targets and predictions.

    Every value is first rounded with the built-in ``round`` and then clamped
    into ``[min_score, max_score]`` so both sequences become discrete labels.

    Args:
        all_targets: iterable of reference scores.
        all_predictions: iterable of predicted scores.
        min_score: lowest label allowed after clamping.
        max_score: highest label allowed after clamping.

    Returns:
        The value of ``cohen_kappa_score(..., weights="quadratic")`` computed
        on the rounded, clamped labels.
    """
    def _to_label(value):
        # Round first, then pull out-of-range results back to the nearest bound.
        label = round(value)
        if label > max_score:
            label = max_score
        if label < min_score:
            label = min_score
        return label

    predicted_labels = list(map(_to_label, all_predictions))
    target_labels = list(map(_to_label, all_targets))
    return cohen_kappa_score(target_labels, predicted_labels, weights="quadratic")

In [None]:
# training process
# Each epoch runs one optimisation pass over the training loader followed by a
# gradient-free pass over the validation loader, and records MSE loss and QWK
# for both in `results`.
for epoch in range(epochs):
    model.train()
    train_mse_loss = 0
    all_predictions_train = []
    all_targets_train = []
    for batch, targets in train_dataloader:
        optimizer.zero_grad()
        # The collate function yields the targets as a tuple; stack them into
        # a single tensor on the training device.
        targets = torch.stack(targets).to(device)
        # forward pass
        # assumes the model returns shape (batch, 1) — TODO confirm
        predictions = model(batch).squeeze(1)
        # compute loss
        loss = criterion(predictions, targets)
        train_mse_loss += loss.item()
        # backward pass
        loss.backward()
        optimizer.step()

        # QWK Calculation
        # In train mode `predictions` is attached to the autograd graph, and
        # calling .numpy() on such a tensor raises a RuntimeError, so detach
        # it before converting.
        all_predictions_train.extend(predictions.detach().cpu().numpy())
        all_targets_train.extend(targets.cpu().numpy())

    # Mean of the per-batch losses over the epoch.
    avg_train_loss = train_mse_loss / len(train_dataloader)
    qwk_train = calculate_qwk(all_targets_train, all_predictions_train)
    print(f"Epoch {epoch + 1}/{epochs}, Train Loss: {avg_train_loss:.4f}, Train QWK: {qwk_train:.4f}")

    # Validation step
    model.eval()
    valid_mse_loss = 0
    all_predictions_valid = []
    all_targets_valid = []
    # No gradients are tracked here, so the tensors can be converted directly.
    with torch.no_grad():
        for batch, targets in valid_dataloader:
            targets = torch.stack(targets).to(device)

            predictions = model(batch).squeeze(1)
            loss = criterion(predictions, targets)
            valid_mse_loss += loss.item()

            # QWK Calculation
            all_predictions_valid.extend(predictions.cpu().numpy())
            all_targets_valid.extend(targets.cpu().numpy())

    avg_valid_loss = valid_mse_loss / len(valid_dataloader)
    qwk_valid = calculate_qwk(all_targets_valid, all_predictions_valid)
    print(f"Epoch {epoch + 1}/{epochs}, Validation Loss: {avg_valid_loss:.4f}, Validation QWK: {qwk_valid:.4f}")

    # Keep this epoch's metrics for later inspection.
    results.append({
        "epoch": epoch + 1,
        "avg_train_loss": avg_train_loss,
        "train_qwk": qwk_train,
        "avg_valid_loss": avg_valid_loss,
        "valid_qwk": qwk_valid
    })

## Testing

In [None]:
# Final evaluation on the test loader: accumulate the per-batch MSE loss and
# gather every prediction/target pair so QWK can be computed over the full set.
model.eval()
test_mse_loss = 0
all_predictions_test, all_targets_test = [], []

with torch.no_grad():
    for batch, targets in test_dataloader:
        # Stack the tuple of targets into one tensor on the model's device.
        targets = torch.stack(targets).to(device)

        # Score the batch and add its loss to the running total.
        predictions = model(batch).squeeze(1)
        loss = criterion(predictions, targets)
        test_mse_loss += loss.item()

        # Collect the per-sample values for the QWK computation below.
        all_predictions_test += list(predictions.cpu().numpy())
        all_targets_test += list(targets.cpu().numpy())

# Mean of the per-batch losses, and QWK over all collected samples.
avg_test_loss = test_mse_loss / len(test_dataloader)
qwk_test = calculate_qwk(all_targets_test, all_predictions_test)

print(f"Test Loss: {avg_test_loss:.4f}, Test QWK: {qwk_test:.4f}")

# Single-element lists, keyed by metric name.
test_results = dict(test_loss=[avg_test_loss], test_qwk=[qwk_test])