In [1]:
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

import pandas as pd
import numpy as np
from transformers import BertTokenizer, BertModel, AutoModel
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader
import torch
from torch.optim import AdamW
# 
from dataset import EssayDataset
# from bert_regression import BertRegressionModel

  from .autonotebook import tqdm as notebook_tqdm


### Merge dataset

In [2]:
# Disabled one-off step: builds dataset/aes_dataset.csv by concatenating the six
# source CSVs and tagging every row with the name of the file it came from.
# Left commented out; the later cells read the merged CSV directly.
# df1 = pd.read_csv("dataset/analisis_essay.csv")
# df1['dataset'] = 'analisis_essay'
# df2 = pd.read_csv("dataset/asap.csv")
# df2['dataset'] = 'asap'
# df3 = pd.read_csv("dataset/cunlp.csv")
# df3['dataset'] = 'cunlp'
# df4 = pd.read_csv("dataset/sag.csv")
# df4['dataset'] = 'sag'
# df5 = pd.read_csv("dataset/sci.csv")
# df5['dataset'] = 'sci'
# df6 = pd.read_csv("dataset/stita.csv")
# df6['dataset'] = 'stita'

# # Add an all-null 'question' column to the dataframes that do not have one,
# # so every frame shares the same columns before concatenation.
# for df in [df2, df3, df6]:
#     df['question'] = pd.NA

# df = pd.concat([df1, df2, df3, df4, df5, df6], ignore_index=True)
# df.shape

# df.to_csv("dataset/aes_dataset.csv", index=False)

### Preprocessing

In [3]:
# Load the combined essay-scoring CSV (the file the disabled merge cell writes)
# and preview its first rows.
df = pd.read_csv("dataset/aes_dataset.csv")
df.head()

Unnamed: 0,question,reference_answer,answer,score,dataset
0,Jelaskan kegunaan karbohidrat untuk tubuh kita.,Fungsi karbohidrat adalah sebagai pemasok ener...,"sumber tenaga, pemanis alami, menjaga sistem i...",27.0,analisis_essay
1,Jelaskan kegunaan karbohidrat untuk tubuh kita.,Fungsi karbohidrat adalah sebagai pemasok ener...,"sebagai sumber energi, pemanis alami, menjaga ...",21.0,analisis_essay
2,Jelaskan kegunaan karbohidrat untuk tubuh kita.,Fungsi karbohidrat adalah sebagai pemasok ener...,1. Sebagai energi. 2. Sebagai memperlancaar pe...,42.0,analisis_essay
3,Jelaskan kegunaan karbohidrat untuk tubuh kita.,Fungsi karbohidrat adalah sebagai pemasok ener...,"untuk membuat kenyang, agar tidak lapar, agar ...",18.0,analisis_essay
4,Jelaskan kegunaan karbohidrat untuk tubuh kita.,Fungsi karbohidrat adalah sebagai pemasok ener...,Karbohidrat mempunyai peran penting untuk pros...,82.0,analisis_essay


In [4]:
# Summarise column dtypes and non-null counts of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22406 entries, 0 to 22405
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   question          4859 non-null   object 
 1   reference_answer  22406 non-null  object 
 2   answer            22406 non-null  object 
 3   score             22406 non-null  float64
 4   dataset           22406 non-null  object 
dtypes: float64(1), object(4)
memory usage: 875.4+ KB


In [5]:
# Number of rows contributed by each source dataset.
df['dataset'].value_counts()

dataset
asap              17043
sag                2558
analisis_essay     2162
stita               333
cunlp               171
sci                 139
Name: count, dtype: int64

In [6]:
# Replace missing questions (NaN) with Python None.
# NOTE(review): presumably EssayDataset checks `question is None` to skip the
# question segment — confirm against dataset.py.
df['question'] = df['question'].replace({np.nan: None})

### Analysis of the Maximum Sub-token Length

In [7]:
# Re-inspect the frame after the NaN -> None replacement on 'question'.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22406 entries, 0 to 22405
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   question          4859 non-null   object 
 1   reference_answer  22406 non-null  object 
 2   answer            22406 non-null  object 
 3   score             22406 non-null  float64
 4   dataset           22406 non-null  object 
dtypes: float64(1), object(4)
memory usage: 875.4+ KB


In [8]:
# Disabled: the same token-length analysis as the next cell, but using the
# 'bert-base-uncased' tokenizer and counting rows across every source dataset.
# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# dataset = EssayDataset(df, tokenizer, 512)

# df['max_length'] = df.index.map(dataset.get_max_length)
# len(df[(df['max_length'] > 510)])

In [9]:
# Tokenizer for the IndoBERT-lite checkpoint, wrapped in an EssayDataset so the
# per-row token length can be computed.
tokenizer1 = BertTokenizer.from_pretrained("indobenchmark/indobert-lite-base-p2")
dataset = EssayDataset(df, tokenizer1, 512)

# Store the value returned by get_max_length for every row index.
df['max_length1'] = df.index.map(dataset.get_max_length)

# How many rows outside 'analisis_essay' exceed 510 tokens?
len(df.query("dataset != 'analisis_essay' and max_length1 > 510"))

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'AlbertTokenizer'. 
The class this function is called from is 'BertTokenizer'.


308

In [10]:
# Keep only the rows that originate from the 'sag' source dataset.
df_sag = df.loc[df['dataset'].eq('sag')]
df_sag.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2558 entries, 19376 to 21933
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   question          2558 non-null   object 
 1   reference_answer  2558 non-null   object 
 2   answer            2558 non-null   object 
 3   score             2558 non-null   float64
 4   dataset           2558 non-null   object 
 5   max_length1       2558 non-null   int64  
dtypes: float64(1), int64(1), object(4)
memory usage: 139.9+ KB


In [11]:
# Take the first 12 'sag' rows as a tiny smoke-test sample.
df_test = df_sag.iloc[:12]
df_test.info()

<class 'pandas.core.frame.DataFrame'>
Index: 12 entries, 19376 to 19387
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   question          12 non-null     object 
 1   reference_answer  12 non-null     object 
 2   answer            12 non-null     object 
 3   score             12 non-null     float64
 4   dataset           12 non-null     object 
 5   max_length1       12 non-null     int64  
dtypes: float64(1), int64(1), object(4)
memory usage: 672.0+ bytes


In [12]:
# Wrap the 12-row sample in an EssayDataset.
# NOTE(review): 512 is presumably the padded sequence length (batches later
# print as [4, 512]) — confirm against dataset.py.
test_dataset = EssayDataset(df_test, tokenizer1, 512)

In [13]:
# Inspect the first encoded example to sanity-check the tokenisation.
test_dataset[0]

{'input_ids': tensor([    2, 29791, 29950, 12444,   374,  1002, 25788,  1116,   253, 18048,
          5707,  3265,   986,    48,  7853,  1358, 16497, 29955,     3, 10948,
          4012, 24045,     7, 29950,  1226,  3284, 14657, 29835,  1002, 17568,
          5122,    58,  1116,  4051,  2754, 29840,  1116,  1002, 29339, 12832,
          3389, 10521, 29948,     3, 16608, 24045,     7, 29950,  7460, 15536,
          7853, 29840,  5811, 11977,    48,  1002, 18048,  5707,  3265,   986,
          1226,  6587,   574, 29835,  8666,  1002,   986,   374,  6517, 21848,
           591, 29948,   253, 18048,  5707,  3265,  2481, 23039,  1432, 25508,
          1226,  7586,   253,  9105,  8666,  1002,  3389,  4442,  1432,   598,
           106,  2910, 29849,   986, 11633, 29948,     3,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,   

In [14]:
# Serve the sample in batches of 4, reshuffled on every pass over the data.
dataloader = DataLoader(test_dataset, batch_size=4, shuffle=True)

In [49]:
import torch
import torch.nn as nn
from transformers import BertModel, AutoModel

class BertRegressionModel(nn.Module):
    """Pretrained transformer encoder topped with a single-output linear head.

    The hidden state at sequence position 0 of the encoder's last layer is fed
    through a linear layer that produces one regression value per sequence.

    Args:
        bert_model_name: Checkpoint name or path handed to
            ``AutoModel.from_pretrained``.
    """

    def __init__(self, bert_model_name='bert-base-uncased'):
        super().__init__()
        # Load the pretrained encoder.
        self.bert = AutoModel.from_pretrained(bert_model_name)
        # Regression head: hidden_size -> 1.
        self.regression_layer = nn.Linear(self.bert.config.hidden_size, 1)

    def forward(self, input_ids, attention_mask=None, token_type_ids=None):
        """Return one score per input sequence.

        Args:
            input_ids: Token ids passed to the encoder.
            attention_mask: Optional attention mask passed to the encoder.
            token_type_ids: Optional segment ids. Forwarded to the encoder only
                when given, so encoders whose ``forward`` has no such argument
                keep working when it is omitted.

        Returns:
            Tensor with a trailing dimension of size 1 (one value per sequence).
        """
        encoder_inputs = {"input_ids": input_ids, "attention_mask": attention_mask}
        # Previously this argument was accepted but silently dropped.
        if token_type_ids is not None:
            encoder_inputs["token_type_ids"] = token_type_ids

        outputs = self.bert(**encoder_inputs)
        # Hidden state of the first token of every sequence.
        cls_embedding = outputs.last_hidden_state[:, 0, :]
        score = self.regression_layer(cls_embedding)
        return score

In [50]:
# Build the regression model on top of IndoBERT-lite and keep it on the CPU.
model = BertRegressionModel("indobenchmark/indobert-lite-base-p2").to("cpu")

In [51]:
# Mean-squared-error objective for the score regression.
criterion = torch.nn.MSELoss()
# AdamW over every model parameter.
optimizer = AdamW(params=model.parameters(), lr=1e-5)
# Switch to training mode; as the last expression this also displays the model.
model.train()

BertRegressionModel(
  (bert): AlbertModel(
    (embeddings): AlbertEmbeddings(
      (word_embeddings): Embedding(30000, 128, padding_idx=0)
      (position_embeddings): Embedding(512, 128)
      (token_type_embeddings): Embedding(2, 128)
      (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0, inplace=False)
    )
    (encoder): AlbertTransformer(
      (embedding_hidden_mapping_in): Linear(in_features=128, out_features=768, bias=True)
      (albert_layer_groups): ModuleList(
        (0): AlbertLayerGroup(
          (albert_layers): ModuleList(
            (0): AlbertLayer(
              (full_layer_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
              (attention): AlbertSdpaAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True

In [53]:
# One-epoch smoke-test training loop over the 12-row sample.
#
# `os` is already imported and CUDA_LAUNCH_BLOCKING already set in the first
# cell (before torch is imported), so both are not repeated here.
#
# NOTE(review): the recorded run ended in "CUDA error: device-side assert
# triggered" although every tensor here is on the CPU. That points at a CUDA
# context left broken by an earlier execution in the same kernel rather than at
# this loop — restart the kernel and run all cells in order. TODO confirm.

# Follow the model's device instead of hard-coding it in three places.
device = next(model.parameters()).device

for epoch in range(1):
    for batch in dataloader:
        optimizer.zero_grad()

        # Move the batch to the device the model lives on.
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        scores = batch['scores'].to(device)

        # Debugging checks
        print(f"Input IDs shape: {input_ids.shape}, Scores shape: {scores.shape}")
        assert not torch.isnan(scores).any(), "Scores contain NaN!"
        assert not torch.isinf(scores).any(), "Scores contain Inf!"

        # Forward pass. Drop only the trailing size-1 dimension: a bare
        # .squeeze() would also collapse the batch dimension when a batch
        # holds a single example, producing a 0-d tensor that no longer
        # matches `scores`.
        predictions = model(input_ids=input_ids, attention_mask=attention_mask).squeeze(-1)
        print(f"Predictions shape: {predictions.shape}, dtype: {predictions.dtype}")

        # Ensure shapes and types match
        assert predictions.size() == scores.size(), f"Shape mismatch: {predictions.size()} vs {scores.size()}"
        assert predictions.dtype == scores.dtype, f"Dtype mismatch: {predictions.dtype} vs {scores.dtype}"

        # Compute loss
        loss = criterion(predictions, scores)
        print(f"Loss before backward: {loss.item()}")

        # Backward pass
        loss.backward()
        optimizer.step()

        # `loss` is not recomputed after the update, so this repeats the
        # pre-step value; it only signals that the step completed.
        print(f"Loss after step: {loss.item()}")


Input IDs shape: torch.Size([4, 512]), Scores shape: torch.Size([4])
Predictions shape: torch.Size([4]), dtype: torch.float32
Loss before backward: 16.550809860229492


RuntimeError: CUDA error: device-side assert triggered
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
