In [1]:
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

import pandas as pd
import numpy as np
from transformers import BertTokenizer, BertModel, AutoModel
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader
import torch
from torch.optim import AdamW
# 
from dataset import EssayDataset
from longDataset import LongEssayDataset
from bert_regression import BertRegressionModel
from hierarchicalBert import HierarchicalBert
from datasets import load_dataset
# 
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv("dataset/aes_dataset.csv")
df.head()

Unnamed: 0,question,reference_answer,answer,score,dataset,max_length1,normalized_score,normalized_score2
0,Jelaskan kegunaan karbohidrat untuk tubuh kita.,Fungsi karbohidrat adalah sebagai pemasok ener...,"sumber tenaga, pemanis alami, menjaga sistem i...",27.0,analisis_essay,65,0.27,27
1,Jelaskan kegunaan karbohidrat untuk tubuh kita.,Fungsi karbohidrat adalah sebagai pemasok ener...,"sebagai sumber energi, pemanis alami, menjaga ...",21.0,analisis_essay,66,0.21,21
2,Jelaskan kegunaan karbohidrat untuk tubuh kita.,Fungsi karbohidrat adalah sebagai pemasok ener...,1. Sebagai energi. 2. Sebagai memperlancaar pe...,42.0,analisis_essay,76,0.42,42
3,Jelaskan kegunaan karbohidrat untuk tubuh kita.,Fungsi karbohidrat adalah sebagai pemasok ener...,"untuk membuat kenyang, agar tidak lapar, agar ...",18.0,analisis_essay,67,0.18,18
4,Jelaskan kegunaan karbohidrat untuk tubuh kita.,Fungsi karbohidrat adalah sebagai pemasok ener...,Karbohidrat mempunyai peran penting untuk pros...,82.0,analisis_essay,105,0.82,82


In [3]:
subset_dataset = df['dataset'].unique()
splits = {}

for subset in subset_dataset:
    subset_df = df[df['dataset'] == subset]

    # split dataset (70:20:10)
    train, temp = train_test_split(subset_df, test_size=0.3, random_state=42)
    valid, test = train_test_split(temp, test_size=0.3, random_state=42)

    splits[subset] = {
        'train': train,
        'valid': valid,
        'test': test,
    }

In [4]:
train_dataset = pd.concat([splits[subset]['train'] for subset in subset_dataset])
valid_dataset = pd.concat([splits[subset]['valid'] for subset in subset_dataset])
test_dataset = pd.concat([splits[subset]['test'] for subset in subset_dataset])

# Training

In [5]:
# # filter dataset by category
# selected_category = "analisis_essay"
# train_dataset = train_dataset[train_dataset['dataset'] == selected_category]
# valid_dataset = valid_dataset[valid_dataset['dataset'] == selected_category]
# test_dataset = test_dataset[test_dataset['dataset'] == selected_category]

In [5]:
# load indobert tokenizer
tokenizer = BertTokenizer.from_pretrained("indobenchmark/indobert-lite-base-p2")

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'AlbertTokenizer'. 
The class this function is called from is 'BertTokenizer'.


In [6]:
# apply dataset with overlapping 128 and max length 512
train_data = LongEssayDataset(train_dataset, tokenizer, 512, 128)
valid_data = LongEssayDataset(valid_dataset, tokenizer, 512, 128)
test_data = LongEssayDataset(test_dataset, tokenizer, 512, 128)

In [7]:
# # load dataset to dataloader for creating batch size 4
train_dataloader = DataLoader(train_data, batch_size=4, collate_fn=lambda x: list(zip(*x)))
valid_dataloader = DataLoader(valid_data, batch_size=4, collate_fn=lambda x: list(zip(*x)))
test_dataloader = DataLoader(test_data, batch_size=4, collate_fn=lambda x: list(zip(*x)))

In [8]:
# load model, optimizer, and loss function (MSE)
model = HierarchicalBert("indobenchmark/indobert-lite-base-p2")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
epochs = 1

optimizer = AdamW(model.parameters(), lr=1e-5)
criterion = torch.nn.MSELoss()

In [9]:
# training process
for epoch in range(epochs):
    model.train()
    total_train_loss = 0
    for batch, targets in train_dataloader:
        optimizer.zero_grad()
        # 
        targets = torch.stack(targets).to(device)
        # forward pass
        predictions = model(batch).squeeze(1)
        # compute loss
        loss = criterion(predictions, targets)
        total_train_loss += loss.item()
        # backward pass
        loss.backward()
        optimizer.step()

    avg_train_loss = total_train_loss / len(train_dataloader)
    print(f"Epoch {epoch + 1}/{epochs}, Train Loss: {avg_train_loss:.4f}")

    # Validation step
    model.eval()
    total_valid_loss = 0
    with torch.no_grad():
        for batch, targets in valid_dataloader:
            targets = torch.stack(targets).to(device)
            
            predictions = model(batch).squeeze(1)
            loss = criterion(predictions, targets)
            total_valid_loss += loss.item()

    avg_valid_loss = total_valid_loss / len(valid_dataloader)
    print(f"Epoch {epoch + 1}/{epochs}, Validation Loss: {avg_valid_loss:.4f}")

  attention_output = torch.nn.functional.scaled_dot_product_attention(


Epoch 1/1, Train Loss: 1269.8804
Epoch 1/1, Validation Loss: 1197.6855


## Testing

In [None]:
# Testing phase
model.eval()
total_test_loss = 0

with torch.no_grad():
    for batch, targets in test_dataloader:
        targets = torch.stack(targets).to(device)
        
        predictions = model(batch).squeeze(1)
        loss = criterion(predictions, targets)
        total_test_loss += loss.item()

avg_test_loss = total_test_loss / len(test_dataloader)
print(f"Test Loss: {avg_test_loss:.4f}")