In [1]:
import pandas as pd
from src.training.bert_pipeline import TrainingBertPipeline
import logging
import torch
import os

In [2]:
df = pd.read_csv("data/aes_dataset_5k.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5363 entries, 0 to 5362
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   question           4859 non-null   object 
 1   reference_answer   5363 non-null   object 
 2   answer             5363 non-null   object 
 3   score              5363 non-null   float64
 4   dataset            5363 non-null   object 
 5   normalized_score   5363 non-null   float64
 6   normalized_score2  5363 non-null   int64  
 7   bert_length        5363 non-null   int64  
 8   indobert_length    5363 non-null   int64  
 9   albert_length      5363 non-null   int64  
 10  longformer_length  5363 non-null   int64  
 11  multibert_length   5363 non-null   int64  
 12  indoalbert_length  5363 non-null   int64  
dtypes: float64(2), int64(7), object(4)
memory usage: 544.8+ KB


In [3]:
# Check if the first file exists
df_result = None
if os.path.exists("experiments/results/results.csv"):
    df_result = pd.read_csv("experiments/results/results.csv")
    print(df_result['config_id'].iloc[-1])
else:
    print("File 'results.csv' does not exist.")

df_result1 = None
# Check if the second file exists
if os.path.exists("experiments/results/results_epoch.csv"):
    df_result1 = pd.read_csv("experiments/results/results_epoch.csv")
    print(min(df_result1['valid_qwk']))
else:
    print("File 'results_epoch.csv' does not exist.")

3
7.659122644421146e-05


In [4]:
results = []
results_epoch = []

In [5]:
batch_sizes = [4]
overlappings = [64]
epochs_list = [1]
learning_rates = [1e-5]
idx = (df_result['config_id'].iloc[-1] + 1) if df_result is not None and not df_result.empty else 0  # index untuk setiap kombinasi
best_valid_qwk = min(df_result1['valid_qwk']) if df_result1 is not None and not df_result1.empty else float("-inf")
ROOT_DIR = os.getcwd()

In [6]:
# model = [
#     ("bert_length", "bert-base-uncased"),
#     ("indobert_length", "indobenchmark/indobert-base-p1"),
#     ("albert_length", "albert-base-v1"),
#     ("indoalbert_length", "indobenchmark/indobert-lite-base-p2"),
#     ("longformer_length", "allenai/longformer-base-4096"),
#     ("multibert_length", "google-bert/bert-base-multilingual-cased")
# ]

In [7]:
for batch_size in batch_sizes:
    for overlapping in overlappings:
        for num_epochs in epochs_list:
            for lr in learning_rates:
                config = {
                    "df": df,
                    "model_name": "bert-base-uncased",
                    "overlapping": overlapping,
                    "batch_size": batch_size,
                    "learning_rate": lr,
                    "epochs": num_epochs,
                    "config_id": idx,
                    "max_seq_len": 128,
                    "col_length": "bert_length",
                    "best_valid_qwk": best_valid_qwk
                }

                logging.info(
                    f"Running configuration: config_id={idx}, model_name={config['model_name']}, batch_size={batch_size}, "
                    f"max_seq_length={config['max_seq_len']}, overlapping={overlapping}, epochs={num_epochs}, learning_rate={lr}"
                )
                
                print(
                    f"\nRunning configuration: config_id={idx}, model_name={config['model_name']}, batch_size={batch_size}, "
                    f"max_seq_length={config['max_seq_len']}, overlapping={overlapping}, epochs={num_epochs}, learning_rate={lr}"
                )
                
                try:
                    pipeline = TrainingBertPipeline(config, results, results_epoch)
                    pipeline.run_training()

                    # Save results
                    # Dapatkan root project
                    results_path = os.path.join(ROOT_DIR, "experiments/results/results.csv")
                    results_epoch_path = os.path.join(ROOT_DIR, "experiments/results/results_epoch.csv")
                    TrainingBertPipeline.save_csv(results, results_path)
                    TrainingBertPipeline.save_csv(results_epoch, results_epoch_path)
                except Exception as e:
                    logging.error(f"Error in config_id={idx}: {str(e)}")
                    print(f"Error in config_id={idx}: {str(e)}")
                    torch.cuda.empty_cache()
                finally:
                    # Clear GPU memory after every configuration
                    del pipeline.model
                    del pipeline.tokenizer
                    del pipeline.optimizer
                    torch.cuda.empty_cache()

                idx += 1


Running configuration: config_id=4, model_name=bert-base-uncased, batch_size=4, max_seq_length=128, overlapping=64, epochs=1, learning_rate=1e-05
split dataset run...
create dataset run...
max len 128
max len 128
max len 128
create dataloader run...


  attn_output = torch.nn.functional.scaled_dot_product_attention(
Token indices sequence length is longer than the specified maximum sequence length for this model (756 > 512). Running this sequence through the model will result in indexing errors


Train Loss: 0.0542, Train QWK: 0.6149, Train Pearson: 0.7148
tensor([0.4930, 0.6546, 0.4237, 0.7839], device='cuda:0') tensor([49.3019, 65.4601, 42.3672, 78.3873], device='cuda:0')
tensor([0.7700, 0.4500, 0.2700, 0.7100], device='cuda:0') tensor([77.0000, 45.0000, 27.0000, 71.0000], device='cuda:0')
tensor([0.6095, 0.2917, 0.4809, 0.3991], device='cuda:0') tensor([60.9533, 29.1708, 48.0870, 39.9127], device='cuda:0')
tensor([0.8200, 0.3300, 0.4200, 0.2300], device='cuda:0') tensor([82., 33., 42., 23.], device='cuda:0')
tensor([0.3885, 0.5056, 0.4375, 0.5647], device='cuda:0') tensor([38.8501, 50.5600, 43.7519, 56.4709], device='cuda:0')
tensor([0.2300, 0.4500, 0.5400, 0.5200], device='cuda:0') tensor([23.0000, 45.0000, 54.0000, 52.0000], device='cuda:0')
tensor([0.1620, 0.2611, 0.4409, 1.0545], device='cuda:0') tensor([ 16.1975,  26.1072,  44.0879, 105.4475], device='cuda:0')
tensor([0.2000, 0.1400, 0.3000, 0.9100], device='cuda:0') tensor([20.0000, 14.0000, 30.0000, 91.0000], device='