In [1]:
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

import pandas as pd
import numpy as np
from transformers import BertTokenizer, BertModel, AutoModel
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader
import torch
from torch.optim import AdamW
# 
from dataset import EssayDataset
from bert_regression import BertRegressionModel

  from .autonotebook import tqdm as notebook_tqdm


### Merge dataset

In [2]:
# df1 = pd.read_csv("dataset/analisis_essay.csv")
# df1['dataset'] = 'analisis_essay'
# df2 = pd.read_csv("dataset/asap.csv")
# df2['dataset'] = 'asap'
# df3 = pd.read_csv("dataset/cunlp.csv")
# df3['dataset'] = 'cunlp'
# df4 = pd.read_csv("dataset/sag.csv")
# df4['dataset'] = 'sag'
# df5 = pd.read_csv("dataset/sci.csv")
# df5['dataset'] = 'sci'
# df6 = pd.read_csv("dataset/stita.csv")
# df6['dataset'] = 'stita'

# # add null value to all dataframe that doesn't have question column
# for df in [df2, df3, df6]:
#     df['question'] = pd.NA

# df = pd.concat([df1, df2, df3, df4, df5, df6], ignore_index=True)
# df.shape

# df.to_csv("dataset/aes_dataset.csv", index=False)

### Preprocessing

In [3]:
df = pd.read_csv("dataset/aes_dataset.csv")
df.head()

Unnamed: 0,question,reference_answer,answer,score,dataset
0,Jelaskan kegunaan karbohidrat untuk tubuh kita.,Fungsi karbohidrat adalah sebagai pemasok ener...,"sumber tenaga, pemanis alami, menjaga sistem i...",27.0,analisis_essay
1,Jelaskan kegunaan karbohidrat untuk tubuh kita.,Fungsi karbohidrat adalah sebagai pemasok ener...,"sebagai sumber energi, pemanis alami, menjaga ...",21.0,analisis_essay
2,Jelaskan kegunaan karbohidrat untuk tubuh kita.,Fungsi karbohidrat adalah sebagai pemasok ener...,1. Sebagai energi. 2. Sebagai memperlancaar pe...,42.0,analisis_essay
3,Jelaskan kegunaan karbohidrat untuk tubuh kita.,Fungsi karbohidrat adalah sebagai pemasok ener...,"untuk membuat kenyang, agar tidak lapar, agar ...",18.0,analisis_essay
4,Jelaskan kegunaan karbohidrat untuk tubuh kita.,Fungsi karbohidrat adalah sebagai pemasok ener...,Karbohidrat mempunyai peran penting untuk pros...,82.0,analisis_essay


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22406 entries, 0 to 22405
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   question          4859 non-null   object 
 1   reference_answer  22406 non-null  object 
 2   answer            22406 non-null  object 
 3   score             22406 non-null  float64
 4   dataset           22406 non-null  object 
dtypes: float64(1), object(4)
memory usage: 875.4+ KB


In [5]:
df['dataset'].value_counts()

dataset
asap              17043
sag                2558
analisis_essay     2162
stita               333
cunlp               171
sci                 139
Name: count, dtype: int64

In [6]:
# convert NaN to None
df['question'] = df['question'].replace({np.nan: None})

### Analysis Max Length of the Sub Token

In [7]:
# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# dataset = EssayDataset(df, tokenizer, 512)

# df['max_length'] = df.index.map(dataset.get_max_length)
# len(df[(df['max_length'] > 510)])

In [8]:
tokenizer1 = BertTokenizer.from_pretrained("indobenchmark/indobert-lite-base-p2")
dataset = EssayDataset(df, tokenizer1, 512)

df['max_length1'] = df.index.map(dataset.get_max_length)
len(df[(df['dataset'] != 'analisis_essay')&(df['max_length1'] > 510)])

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'AlbertTokenizer'. 
The class this function is called from is 'BertTokenizer'.


308

## Normalize Score

In [9]:
def normalize_score(score, min_score, max_score):
    return (score - min_score) / (max_score - min_score)

df['normalized_score'] = df.apply(lambda x: normalize_score(x['score'], df['score'].min(), df['score'].max()), axis=1)

In [10]:
df['normalized_score'].describe()

count    22406.000000
mean         0.064189
std          0.169824
min          0.000000
25%          0.000000
50%          0.010000
75%          0.020000
max          1.000000
Name: normalized_score, dtype: float64

## Training

In [11]:
df_sag = df[df['dataset'] == 'sag']
df_sag.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2558 entries, 19376 to 21933
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   question          2558 non-null   object 
 1   reference_answer  2558 non-null   object 
 2   answer            2558 non-null   object 
 3   score             2558 non-null   float64
 4   dataset           2558 non-null   object 
 5   max_length1       2558 non-null   int64  
 6   normalized_score  2558 non-null   float64
dtypes: float64(2), int64(1), object(4)
memory usage: 159.9+ KB


In [12]:
df_test = df_sag[:12]
df_test.info()

<class 'pandas.core.frame.DataFrame'>
Index: 12 entries, 19376 to 19387
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   question          12 non-null     object 
 1   reference_answer  12 non-null     object 
 2   answer            12 non-null     object 
 3   score             12 non-null     float64
 4   dataset           12 non-null     object 
 5   max_length1       12 non-null     int64  
 6   normalized_score  12 non-null     float64
dtypes: float64(2), int64(1), object(4)
memory usage: 768.0+ bytes


In [13]:
test_dataset = EssayDataset(df_test, tokenizer1, 512)

In [14]:
test_dataset[0]

{'input_ids': tensor([    2, 29791, 29950, 12444,   374,  1002, 25788,  1116,   253, 18048,
          5707,  3265,   986,    48,  7853,  1358, 16497, 29955, 10948,  4012,
         24045,     7, 29950,  1226,  3284, 14657, 29835,  1002, 17568,  5122,
            58,  1116,  4051,  2754, 29840,  1116,  1002, 29339, 12832,  3389,
         10521, 29948,     3, 16608, 24045,     7, 29950,  7460, 15536,  7853,
         29840,  5811, 11977,    48,  1002, 18048,  5707,  3265,   986,  1226,
          6587,   574, 29835,  8666,  1002,   986,   374,  6517, 21848,   591,
         29948,   253, 18048,  5707,  3265,  2481, 23039,  1432, 25508,  1226,
          7586,   253,  9105,  8666,  1002,  3389,  4442,  1432,   598,   106,
          2910, 29849,   986, 11633, 29948,     3,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,   

In [15]:
dataloader = DataLoader(test_dataset, batch_size=4, shuffle=True)

In [16]:
model = BertRegressionModel("indobenchmark/indobert-lite-base-p2")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

In [17]:
optimizer = AdamW(model.parameters(), lr=1e-5)
criterion = torch.nn.MSELoss()
model.train()

BertRegressionModel(
  (bert): AlbertModel(
    (embeddings): AlbertEmbeddings(
      (word_embeddings): Embedding(30000, 128, padding_idx=0)
      (position_embeddings): Embedding(512, 128)
      (token_type_embeddings): Embedding(2, 128)
      (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0, inplace=False)
    )
    (encoder): AlbertTransformer(
      (embedding_hidden_mapping_in): Linear(in_features=128, out_features=768, bias=True)
      (albert_layer_groups): ModuleList(
        (0): AlbertLayerGroup(
          (albert_layers): ModuleList(
            (0): AlbertLayer(
              (full_layer_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
              (attention): AlbertSdpaAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True

In [18]:
for epoch in range(1):
    for batch in dataloader:
        optimizer.zero_grad()

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        scores = batch['scores'].to(device)

        # Forward pass
        predictions = model(input_ids=input_ids, attention_mask=attention_mask).squeeze()

        # Compute loss
        loss = criterion(predictions, scores)
        print("loss", loss.item())
        # Backward pass
        loss.backward()
        optimizer.step()


  attention_output = torch.nn.functional.scaled_dot_product_attention(


loss 0.06480002403259277
loss 0.13452069461345673
loss 0.06163423880934715
