In [1]:
import pandas as pd
# Load your custom dataset from a CSV file
def load_indo_dataset(filename):
    # Read the CSV file
    df = pd.read_csv(filename)
    print(df.head())
    # Extract columns 'answer', 'response', and 'label'
    # Normalize the label to [0, 1]
    data = [
        (row['answer'], row['response'], row['label'] / 5.0)
        for _, row in df.iterrows()
    ]

    return data

In [2]:
import pandas as pd

# Load the CSV file
file_path = '/kaggle/input/indo-datanew/indodataset.csv'
df = pd.read_csv(file_path)

# Check the first few rows
print(df.head())

                                              answer  \
0  ilmu pengetahuan yang mempelajari gejala alam ...   
1  ilmu pengetahuan yang mempelajari gejala alam ...   
2  ilmu pengetahuan yang mempelajari gejala alam ...   
3  ilmu pengetahuan yang mempelajari gejala alam ...   
4  ilmu pengetahuan yang mempelajari gejala alam ...   

                                            response  label  
0  Ilmu yang mempelajari tentang fenomena alam da...    2.5  
1  ilmu pengetahuan yang mempelajari gejala alam ...    4.0  
2  pelajaran yang mempelajari tentang suatu perhi...    1.0  
3  ilmu yang mempelajari tentang bumi, lingkungan...    2.0  
4  ilmu yang mengajarkan tentang fenomena yang ad...    1.0  


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1845 entries, 0 to 1844
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   answer    1845 non-null   object 
 1   response  1845 non-null   object 
 2   label     1845 non-null   float64
dtypes: float64(1), object(2)
memory usage: 43.4+ KB


In [4]:
import pandas as pd

# Load your dataset
file_path = '/kaggle/input/indo-datanew/indodataset.csv'
df = pd.read_csv(file_path)

# Find rows with missing values
missing_values = df[df.isnull().any(axis=1)]

# Display the rows and their indices
print("Rows with missing values:")
print(missing_values)

# Display the row numbers
print("\nIndices of rows with missing values:")
print(missing_values.index.tolist())


Rows with missing values:
Empty DataFrame
Columns: [answer, response, label]
Index: []

Indices of rows with missing values:
[]


In [5]:
# Check for missing or infinite values
print(df.isnull().sum())  # Ensure no missing values
print((df == float('inf')).sum())  # Check for infinite values
print((df == float('-inf')).sum())  # Check for negative infinite values

answer      0
response    0
label       0
dtype: int64
answer      0
response    0
label       0
dtype: int64
answer      0
response    0
label       0
dtype: int64


In [6]:
!pip install transformers sentence-transformers datasets



In [7]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import datetime
import time
import random
from transformers import BertTokenizer
from sentence_transformers import SentenceTransformer, models
from torch.utils.data import DataLoader
from tqdm import tqdm
from sklearn.metrics import mean_squared_error, mean_absolute_error
from scipy.stats import pearsonr

In [8]:
# Split dataset into train, validation, and test
def split_dataset(data, valid_percentage, test_percentage):
    length = len(data)
    np.random.shuffle(data)
    train = data[:int(length * (1 - valid_percentage - test_percentage))]
    valid = data[int(length * (1 - valid_percentage - test_percentage)):int(length * (1 - test_percentage))]
    test = data[int(length * (1 - test_percentage)):]
    return train, valid, test

In [9]:
data = load_indo_dataset(file_path)

# Split the dataset into train, validation, and test sets
train_data, val_data, test_data = split_dataset(data, valid_percentage=0.1, test_percentage=0.1)

# Check a sample from the loaded data
print(f"Number of samples: {len(data)}")
print(f"Sample from the dataset: {data[0]}")

                                              answer  \
0  ilmu pengetahuan yang mempelajari gejala alam ...   
1  ilmu pengetahuan yang mempelajari gejala alam ...   
2  ilmu pengetahuan yang mempelajari gejala alam ...   
3  ilmu pengetahuan yang mempelajari gejala alam ...   
4  ilmu pengetahuan yang mempelajari gejala alam ...   

                                            response  label  
0  Ilmu yang mempelajari tentang fenomena alam da...    2.5  
1  ilmu pengetahuan yang mempelajari gejala alam ...    4.0  
2  pelajaran yang mempelajari tentang suatu perhi...    1.0  
3  ilmu yang mempelajari tentang bumi, lingkungan...    2.0  
4  ilmu yang mengajarkan tentang fenomena yang ad...    1.0  
Number of samples: 1845
Sample from the dataset: ('mengamati (observasi), mengklasifikasi, mengukur, mengajukan pertanyaan, merumuskan hipotesis, merencanakan penelitian, menafsirkan data, Mengkomunikasikan', 'Fakta - Hukum - Model\nKonsep - Rumus\nPrinsip - Teori', 0.0)


In [10]:
import pandas as pd
import os

# Save datasets to CSV files
def save_splits_to_csv(train_data, val_data, test_data, output_dir):
    os.makedirs(output_dir, exist_ok=True)

    train_df = pd.DataFrame(train_data, columns=["response", "answer", "label"])
    val_df = pd.DataFrame(val_data, columns=["response", "answer", "label"])
    test_df = pd.DataFrame(test_data, columns=["response", "answer", "label"])

    train_df.to_csv(os.path.join(output_dir, "train_data.csv"), index=False)
    val_df.to_csv(os.path.join(output_dir, "val_data.csv"), index=False)
    test_df.to_csv(os.path.join(output_dir, "test_data.csv"), index=False)

    print(f"Data saved to {output_dir} successfully.")

# Load datasets from CSV files
def load_splits_from_csv(output_dir):
    train_df = pd.read_csv(os.path.join(output_dir, "train_data.csv"))
    val_df = pd.read_csv(os.path.join(output_dir, "val_data.csv"))
    test_df = pd.read_csv(os.path.join(output_dir, "test_data.csv"))

    # Convert dataframes back to lists of tuples
    train_data = list(train_df.itertuples(index=False, name=None))
    val_data = list(val_df.itertuples(index=False, name=None))
    test_data = list(test_df.itertuples(index=False, name=None))

    print(f"Data loaded from {output_dir} successfully.")
    return train_data, val_data, test_data

# Save splits to CSV
output_directory = "output_splits"
save_splits_to_csv(train_data, val_data, test_data, output_directory)

Data saved to output_splits successfully.


In [11]:
# Load splits from CSV
train_data, val_data, test_data = load_splits_from_csv(output_directory)

# Check the first few rows to ensure data integrity
print("Train Data (First 5 Rows):", train_data[:5])
print("Validation Data (First 5 Rows):", val_data[:5])
print("Test Data (First 5 Rows):", test_data[:5])

Data loaded from output_splits successfully.
Train Data (First 5 Rows): [('mengamati (observasi), mengklasifikasi, mengukur, mengajukan pertanyaan, merumuskan hipotesis, merencanakan penelitian, menafsirkan data, Mengkomunikasikan', 'Fakta - Hukum - Model\nKonsep - Rumus\nPrinsip - Teori', 0.0), ('penanda waktu, kesempatan, durasi, tempo, dan timing.', 'penanda waktu,kesempatan,durasi,tempo dan timing', 1.0), ('Riasan yang digunakan untuk mempertajam atau mempertegas garis wajah tanpa menghilangkan bentuk wajah asli penari.', 'riasan yang digunakan untuk mempertajam atau mempertegas garis wajah tanpa menghilangkan bentuk wajah asli penari', 1.0), ('Nilai moral, yaitu nilai yang berkaitan dengan baik buruknya tokoh.\nNilai sosial, yaitu nilai yang berkaitan dengan sikap tokoh dalam hubungannya dengan tokoh lain\nNilai religi, yaitu nilai yang berhubungan dengan hubungan manusia dengan Tuhan.\nNilai budaya, yaitu nilai yang berkaitan dengan kebiasaan masyarakat saat itu.\nNilai pendidika

In [12]:
# Highlighted: Use the IndoBERT tokenizer
tokenizer = BertTokenizer.from_pretrained('indobenchmark/indobert-base-p1')

tokenizer_config.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/229k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.53k [00:00<?, ?B/s]

In [13]:
def format_time(elapsed):
    # Round to the nearest second.
    elapsed_rounded = int(round((elapsed)))
    
    # Format as hh:mm:ss
    return str(datetime.timedelta(seconds=elapsed_rounded))

In [14]:
# Correct the CustomDataset __getitem__ method
class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, data):
        self.first_sentences = [pair[0] for pair in data]
        self.second_sentences = [pair[1] for pair in data]
        self.labels = [pair[2] for pair in data]

    def __len__(self):
        return len(self.first_sentences)

    def __getitem__(self, idx):
        texts = tokenizer(
            self.first_sentences[idx],
            self.second_sentences[idx],
            padding="max_length",
            max_length=128,
            truncation=True,
            return_tensors="pt"
        )
        label = torch.tensor(self.labels[idx], dtype=torch.float32)
        return {
            'input_ids': texts['input_ids'].squeeze(0),
            'attention_mask': texts['attention_mask'].squeeze(0),
        }, label

In [15]:
# Create DataLoader
batch_size = 8
train_ds = CustomDataset(train_data)
val_ds = CustomDataset(val_data)
test_ds = CustomDataset(test_data)

In [16]:
train_dataloader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
validation_dataloader = DataLoader(val_ds, batch_size=batch_size)
test_dataloader = DataLoader(test_ds, batch_size=batch_size)

In [17]:
# Enhanced Model with IndoBERT
class EnhancedBertModel(nn.Module):
    def __init__(self):
        super(EnhancedBertModel, self).__init__()
        # Highlighted: Use IndoBERT as the transformer
        self.bert = models.Transformer('indobenchmark/indobert-base-p1', max_seq_length=128)
        self.pooling_layer = models.Pooling(self.bert.get_word_embedding_dimension())

        # Freeze BERT layers
        for param in self.bert.parameters():
            param.requires_grad = False

        self.bi_lstm = nn.LSTM(
            input_size=self.bert.get_word_embedding_dimension(),
            hidden_size=64,
            num_layers=1,
            bidirectional=True,
            batch_first=True
        )

        self.fc_dropout = nn.Dropout(0.3)
        self.fc = nn.Linear(256, 1)

    def forward(self, input_data):
        bert_output = self.bert(input_data)
        sequence_output = bert_output['token_embeddings']

        lstm_output, _ = self.bi_lstm(sequence_output)

        avg_pool = torch.mean(lstm_output, dim=1)
        max_pool, _ = torch.max(lstm_output, dim=1)

        pooled_output = torch.cat((avg_pool, max_pool), dim=1)

        output = self.fc_dropout(pooled_output)
        output = self.fc(output)

        return output.squeeze(-1)

In [18]:
# Check for GPU availability
if torch.cuda.is_available():
    device = torch.device("cuda")
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 2 GPU(s) available.
We will use the GPU: Tesla T4


In [19]:
# Instantiate and move the model to device
model = EnhancedBertModel()
model.to(device)

pytorch_model.bin:   0%|          | 0.00/498M [00:00<?, ?B/s]

EnhancedBertModel(
  (bert): Transformer({'max_seq_length': 128, 'do_lower_case': False}) with Transformer model: BertModel 
  (pooling_layer): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (bi_lstm): LSTM(768, 64, batch_first=True, bidirectional=True)
  (fc_dropout): Dropout(p=0.3, inplace=False)
  (fc): Linear(in_features=256, out_features=1, bias=True)
)

In [20]:
# Loss function, optimizer, and scheduler
criterion = nn.MSELoss()
epochs = 8
optimizer = optim.Adam(model.parameters(), lr=1e-5)

In [21]:
# Training Loop
def train_model():
    training_stats = []
    total_t0 = time.time()

    for epoch_i in range(epochs):
        print(f"\n======== Epoch {epoch_i + 1} / {epochs} ========")
        print("Training...")

        t0 = time.time()
        total_train_loss = 0
        model.train()

        for batch in tqdm(train_dataloader):
            train_data, train_labels = batch
            train_data['input_ids'] = train_data['input_ids'].to(device)
            train_data['attention_mask'] = train_data['attention_mask'].to(device)
            train_labels = train_labels.to(device)

            optimizer.zero_grad()
            outputs = model({
                'input_ids': train_data['input_ids'],
                'attention_mask': train_data['attention_mask']
            })
            loss = criterion(outputs, train_labels)
            total_train_loss += loss.item()

            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()

        avg_train_loss = total_train_loss / len(train_dataloader)
        training_time = format_time(time.time() - t0)

        print(f"  Average training loss: {avg_train_loss:.5f}")
        print(f"  Training epoch took: {training_time}")

        # Validation
        print("Running Validation...")
        t0 = time.time()

        model.eval()
        total_val_loss = 0

        for batch in tqdm(validation_dataloader):
            val_data, val_labels = batch
            val_data['input_ids'] = val_data['input_ids'].to(device)
            val_data['attention_mask'] = val_data['attention_mask'].to(device)
            val_labels = val_labels.to(device)

            with torch.no_grad():
                outputs = model({
                    'input_ids': val_data['input_ids'],
                    'attention_mask': val_data['attention_mask']
                })
                loss = criterion(outputs, val_labels)
                total_val_loss += loss.item()

        avg_val_loss = total_val_loss / len(validation_dataloader)
        validation_time = format_time(time.time() - t0)

        print(f"  Validation Loss: {avg_val_loss:.5f}")
        print(f"  Validation took: {validation_time}")

        training_stats.append({
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Validation Loss': avg_val_loss,
            'Training Time': training_time,
            'Validation Time': validation_time
        })

    print("Training complete!")
    print(f"Total training took {format_time(time.time() - total_t0)}")
    return model, training_stats


In [22]:
# Train the model
model, training_stats = train_model()


Training...


  6%|▌         | 11/185 [00:01<00:15, 11.10it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 16%|█▌        | 29/185 [00:02<00:09, 15.64it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 37%|███▋      | 69/185 [00:05<00:07, 15.47it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 43%|████▎     | 79/185 [00:05<00:06, 15.39it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' trunca

  Average training loss: 0.35319
  Training epoch took: 0:00:12
Running Validation...


  9%|▊         | 2/23 [00:00<00:01, 17.17it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 78%|███████▊  | 18/23 [00:01<00:00, 17.27it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
100%|██████████| 23/23 [00:01<00:00, 17.05it/s]


  Validation Loss: 0.20221
  Validation took: 0:00:01

Training...


  2%|▏         | 4/185 [00:00<00:11, 16.24it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
  8%|▊         | 14/185 [00:00<00:10, 16.28it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 15%|█▌        | 28/185 [00:01<00:09, 16.22it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 28%|██▊       | 52/185 [00:03<00:08, 16.45it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncat

  Average training loss: 0.20063
  Training epoch took: 0:00:11
Running Validation...


  9%|▊         | 2/23 [00:00<00:01, 17.29it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 78%|███████▊  | 18/23 [00:01<00:00, 16.54it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
100%|██████████| 23/23 [00:01<00:00, 16.53it/s]


  Validation Loss: 0.15672
  Validation took: 0:00:01

Training...


  1%|          | 2/185 [00:00<00:11, 15.71it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
  4%|▍         | 8/185 [00:00<00:11, 15.97it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
  9%|▊         | 16/185 [00:01<00:10, 15.43it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 14%|█▍        | 26/185 [00:01<00:09, 15.99it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncati

  Average training loss: 0.17957
  Training epoch took: 0:00:12
Running Validation...


  9%|▊         | 2/23 [00:00<00:01, 17.22it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 78%|███████▊  | 18/23 [00:01<00:00, 16.56it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
100%|██████████| 23/23 [00:01<00:00, 16.56it/s]


  Validation Loss: 0.13044
  Validation took: 0:00:01

Training...


  8%|▊         | 14/185 [00:00<00:10, 15.84it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 15%|█▌        | 28/185 [00:01<00:09, 15.83it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 18%|█▊        | 34/185 [00:02<00:09, 15.55it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always 

  Average training loss: 0.15439
  Training epoch took: 0:00:12
Running Validation...


  9%|▊         | 2/23 [00:00<00:01, 17.34it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 78%|███████▊  | 18/23 [00:01<00:00, 16.49it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
100%|██████████| 23/23 [00:01<00:00, 16.59it/s]


  Validation Loss: 0.11334
  Validation took: 0:00:01

Training...


  4%|▍         | 8/185 [00:00<00:11, 15.68it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 15%|█▌        | 28/185 [00:01<00:10, 15.39it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 37%|███▋      | 68/185 [00:04<00:07, 15.60it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 42%|████▏     | 78/185 [00:05<00:06, 15.52it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncat

  Average training loss: 0.14860
  Training epoch took: 0:00:12
Running Validation...


  9%|▊         | 2/23 [00:00<00:01, 16.30it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 78%|███████▊  | 18/23 [00:01<00:00, 16.31it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
100%|██████████| 23/23 [00:01<00:00, 16.21it/s]


  Validation Loss: 0.10024
  Validation took: 0:00:01

Training...


  1%|          | 2/185 [00:00<00:12, 15.19it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
  2%|▏         | 4/185 [00:00<00:12, 14.95it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 21%|██        | 38/185 [00:02<00:09, 15.25it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 23%|██▎       | 42/185 [00:02<00:09, 15.15it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncati

  Average training loss: 0.13348
  Training epoch took: 0:00:12
Running Validation...


  9%|▊         | 2/23 [00:00<00:01, 16.03it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 78%|███████▊  | 18/23 [00:01<00:00, 15.69it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
100%|██████████| 23/23 [00:01<00:00, 15.79it/s]


  Validation Loss: 0.09057
  Validation took: 0:00:01

Training...


 10%|▉         | 18/185 [00:01<00:11, 14.98it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 13%|█▎        | 24/185 [00:01<00:10, 14.89it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 16%|█▌        | 30/185 [00:02<00:10, 15.04it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 44%|████▍     | 82/185 [00:05<00:06, 14.83it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' trunca

  Average training loss: 0.13154
  Training epoch took: 0:00:12
Running Validation...


  9%|▊         | 2/23 [00:00<00:01, 15.77it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 78%|███████▊  | 18/23 [00:01<00:00, 15.49it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
100%|██████████| 23/23 [00:01<00:00, 15.50it/s]


  Validation Loss: 0.08289
  Validation took: 0:00:01

Training...


  2%|▏         | 4/185 [00:00<00:12, 14.84it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
  5%|▌         | 10/185 [00:00<00:11, 14.63it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
  8%|▊         | 14/185 [00:00<00:11, 14.57it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
  9%|▊         | 16/185 [00:01<00:11, 14.60it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncat

  Average training loss: 0.12165
  Training epoch took: 0:00:13
Running Validation...


  9%|▊         | 2/23 [00:00<00:01, 15.73it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 78%|███████▊  | 18/23 [00:01<00:00, 15.25it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
100%|██████████| 23/23 [00:01<00:00, 15.15it/s]

  Validation Loss: 0.07630
  Validation took: 0:00:02
Training complete!
Total training took 0:01:48





In [23]:
# THIS PART OF CODE IS JUST ADDED
# Unfreeze BERT layers for fine-tuning
print("\nUnfreezing BERT layers for fine-tuning...")
for param in model.bert.parameters():
    param.requires_grad = True

# Reinitialize the optimizer for fine-tuning
optimizer = optim.Adam(model.parameters(), lr=1e-5)

# Retrain the entire model with unfrozen BERT
print("\nFine-tuning the entire model...")
model, training_stats = train_model()


Unfreezing BERT layers for fine-tuning...

Fine-tuning the entire model...

Training...


  1%|          | 1/185 [00:00<00:37,  4.86it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
  4%|▍         | 8/185 [00:01<00:38,  4.61it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
  7%|▋         | 13/185 [00:02<00:38,  4.52it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
  9%|▉         | 17/185 [00:03<00:37,  4.50it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncati

  Average training loss: 0.09396
  Training epoch took: 0:00:41
Running Validation...


  4%|▍         | 1/23 [00:00<00:02,  7.99it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 74%|███████▍  | 17/23 [00:01<00:00, 14.80it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
100%|██████████| 23/23 [00:01<00:00, 14.43it/s]


  Validation Loss: 0.03431
  Validation took: 0:00:02

Training...


 39%|███▉      | 73/185 [00:15<00:24,  4.62it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 42%|████▏     | 78/185 [00:16<00:23,  4.62it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 44%|████▍     | 81/185 [00:17<00:22,  4.60it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 59%|█████▉    | 109/185 [00:23<00:16,  4.59it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' trunc

  Average training loss: 0.06140
  Training epoch took: 0:00:40
Running Validation...


  4%|▍         | 1/23 [00:00<00:02,  7.64it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 74%|███████▍  | 17/23 [00:01<00:00, 14.75it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
100%|██████████| 23/23 [00:01<00:00, 14.37it/s]


  Validation Loss: 0.02784
  Validation took: 0:00:02

Training...


  5%|▌         | 10/185 [00:02<00:38,  4.54it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
  9%|▉         | 17/185 [00:03<00:37,  4.49it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 33%|███▎      | 61/185 [00:13<00:27,  4.50it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 65%|██████▌   | 121/185 [00:26<00:14,  4.51it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' trunc

  Average training loss: 0.04844
  Training epoch took: 0:00:41
Running Validation...


  4%|▍         | 1/23 [00:00<00:02,  7.51it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 74%|███████▍  | 17/23 [00:01<00:00, 14.85it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
100%|██████████| 23/23 [00:01<00:00, 14.30it/s]


  Validation Loss: 0.02342
  Validation took: 0:00:02

Training...


 13%|█▎        | 24/185 [00:05<00:35,  4.51it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 36%|███▌      | 67/185 [00:14<00:26,  4.49it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 47%|████▋     | 87/185 [00:19<00:21,  4.50it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 54%|█████▍    | 100/185 [00:22<00:18,  4.49it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' trunc

  Average training loss: 0.03610
  Training epoch took: 0:00:41
Running Validation...


  4%|▍         | 1/23 [00:00<00:02,  7.63it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 74%|███████▍  | 17/23 [00:01<00:00, 14.79it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
100%|██████████| 23/23 [00:01<00:00, 14.32it/s]


  Validation Loss: 0.02534
  Validation took: 0:00:02

Training...


  0%|          | 0/185 [00:00<?, ?it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
  3%|▎         | 5/185 [00:00<00:36,  4.89it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 16%|█▌        | 30/185 [00:06<00:34,  4.50it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 25%|██▍       | 46/185 [00:10<00:30,  4.50it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strat

  Average training loss: 0.03059
  Training epoch took: 0:00:41
Running Validation...


  4%|▍         | 1/23 [00:00<00:02,  7.61it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 74%|███████▍  | 17/23 [00:01<00:00, 14.79it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
100%|██████████| 23/23 [00:01<00:00, 14.33it/s]


  Validation Loss: 0.02216
  Validation took: 0:00:02

Training...


 10%|▉         | 18/185 [00:03<00:37,  4.49it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 17%|█▋        | 31/185 [00:06<00:34,  4.50it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 34%|███▎      | 62/185 [00:13<00:27,  4.52it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 34%|███▍      | 63/185 [00:13<00:27,  4.49it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' trunca

  Average training loss: 0.02852
  Training epoch took: 0:00:41
Running Validation...


  4%|▍         | 1/23 [00:00<00:02,  7.52it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 74%|███████▍  | 17/23 [00:01<00:00, 14.76it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
100%|██████████| 23/23 [00:01<00:00, 14.36it/s]


  Validation Loss: 0.01968
  Validation took: 0:00:02

Training...


 15%|█▍        | 27/185 [00:05<00:35,  4.50it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 16%|█▌        | 29/185 [00:06<00:34,  4.49it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 38%|███▊      | 71/185 [00:15<00:25,  4.51it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 44%|████▍     | 82/185 [00:18<00:22,  4.50it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' trunca

  Average training loss: 0.02455
  Training epoch took: 0:00:41
Running Validation...


  4%|▍         | 1/23 [00:00<00:02,  7.57it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 74%|███████▍  | 17/23 [00:01<00:00, 14.97it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
100%|██████████| 23/23 [00:01<00:00, 14.43it/s]


  Validation Loss: 0.02173
  Validation took: 0:00:02

Training...


 12%|█▏        | 23/185 [00:04<00:35,  4.50it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 13%|█▎        | 24/185 [00:05<00:35,  4.50it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 23%|██▎       | 43/185 [00:09<00:31,  4.49it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 24%|██▍       | 44/185 [00:09<00:31,  4.46it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' trunca

  Average training loss: 0.02093
  Training epoch took: 0:00:41
Running Validation...


  4%|▍         | 1/23 [00:00<00:02,  7.56it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 74%|███████▍  | 17/23 [00:01<00:00, 14.80it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
100%|██████████| 23/23 [00:01<00:00, 14.31it/s]

  Validation Loss: 0.02104
  Validation took: 0:00:02
Training complete!
Total training took 0:05:40





In [24]:
# Define custom evaluation functions
def mean_squared_error(y_true, y_pred):
    squared_errors = [(true - pred) ** 2 for true, pred in zip(y_true, y_pred)]
    return sum(squared_errors) / len(squared_errors)

def mean_absolute_error(y_true, y_pred):
    absolute_errors = [abs(true - pred) for true, pred in zip(y_true, y_pred)]
    return sum(absolute_errors) / len(absolute_errors)

def root_mean_squared_error(y_true, y_pred):
    mse = mean_squared_error(y_true, y_pred)
    return mse ** 0.5

def pearsonr(x, y):
    mean_x = sum(x) / len(x)
    mean_y = sum(y) / len(y)
    numerator = sum((xi - mean_x) * (yi - mean_y) for xi, yi in zip(x, y))
    denominator = ((sum((xi - mean_x) ** 2 for xi in x) * sum((yi - mean_y) ** 2 for yi in y)) ** 0.5)
    return numerator / denominator if denominator != 0 else 0.0

In [25]:
# Evaluate Model with custom functions
def evaluate_model(model, dataloader):
    model.eval()
    true_labels = []
    predicted_scores = []

    with torch.no_grad():
        for batch in dataloader:
            data, labels = batch
            data['input_ids'] = data['input_ids'].to(device)
            data['attention_mask'] = data['attention_mask'].to(device)
            predictions = model({
                'input_ids': data['input_ids'],
                'attention_mask': data['attention_mask']
            })

            true_labels.extend(labels.cpu().numpy())
            predicted_scores.extend(predictions.cpu().numpy())

    mse = mean_squared_error(true_labels, predicted_scores)
    mae = mean_absolute_error(true_labels, predicted_scores)
    rmse = root_mean_squared_error(true_labels, predicted_scores)
    pearson_corr = pearsonr(true_labels, predicted_scores)

    print(f"Mean Squared Error (MSE): {mse:.4f}")
    print(f"Mean Absolute Error (MAE): {mae:.4f}")
    print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")
    print(f"Pearson Correlation: {pearson_corr:.4f}")

    return mse, mae, rmse, pearson_corr

# Evaluate on validation set
evaluate_model(model, validation_dataloader)

# Optional: Evaluate on the test set if available
test_ds = CustomDataset(test_data)
test_dataloader = DataLoader(test_ds, batch_size=batch_size)
print("this is the evaluation on the test set:")
evaluate_model(model, test_dataloader)

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Mean Squared Error (MSE): 0.0210
Mean Absolute Error (MAE): 0.1058
Root Mean Squared Error (RMSE): 0.1450
Pearson Correlation: 0.9230
this is the evaluation on the test set:


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Mean Squared Error (MSE): 0.0278
Mean Absolute Error (MAE): 0.1119
Root Mean Squared Error (RMSE): 0.1668
Pearson Correlation: 0.9012


(0.02781942536563016,
 0.1118861418318104,
 0.16679156263321643,
 0.9011529553446912)

In [26]:
# Save the trained model to a .pt file
model_save_path = "/kaggle/working/indobert_similarity_model.pt"
torch.save(model.state_dict(), model_save_path)
print(f"Model saved to {model_save_path}")


Model saved to /kaggle/working/indobert_similarity_model.pt


In [27]:
# Reinitialize the model architecture
model = EnhancedBertModel()
model.to(device)

# Load the model state dictionary
model.load_state_dict(torch.load(model_save_path))
print("Model loaded successfully!")

# Set the model to evaluation mode if testing
model.eval()


  model.load_state_dict(torch.load(model_save_path))


Model loaded successfully!


EnhancedBertModel(
  (bert): Transformer({'max_seq_length': 128, 'do_lower_case': False}) with Transformer model: BertModel 
  (pooling_layer): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (bi_lstm): LSTM(768, 64, batch_first=True, bidirectional=True)
  (fc_dropout): Dropout(p=0.3, inplace=False)
  (fc): Linear(in_features=256, out_features=1, bias=True)
)

In [28]:
def evaluate_and_save_results(model, dataloader, csv_filename, original_data):
    model.eval()
    results = []  # To store all prediction results
    index = 0  # Track the original dataset index

    with torch.no_grad():
        for batch in tqdm(dataloader):
            data, labels = batch
            data['input_ids'] = data['input_ids'].to(device)
            data['attention_mask'] = data['attention_mask'].to(device)
            predictions = model({
                'input_ids': data['input_ids'],
                'attention_mask': data['attention_mask']
            }).cpu().numpy()

            batch_size = len(labels)
            for i in range(batch_size):
                # Get original sentences (response, answer)
                response, answer, true_label = original_data[index]
                index += 1  # Move to the next pair

                results.append({
                    'Response': response,
                    'Answer': answer,
                    'True Label': true_label,
                    'Predicted Score': predictions[i]
                })

    # Save results to CSV
    df = pd.DataFrame(results)
    df.to_csv(csv_filename, index=False)
    print(f"Results saved to {csv_filename}")
    
    return df

# Save validation results
validation_results_csv = "/kaggle/working/validation_results.csv"
df_validation = evaluate_and_save_results(model, validation_dataloader, validation_results_csv, val_data)

# Load test data for testing
test_ds = CustomDataset(test_data)
test_dataloader = DataLoader(test_ds, batch_size=batch_size)

# Save test results
test_results_csv = "/kaggle/working/test_results.csv"
df_test = evaluate_and_save_results(model, test_dataloader, test_results_csv, test_data)


  9%|▊         | 2/23 [00:00<00:01, 10.88it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 78%|███████▊  | 18/23 [00:01<00:00, 14.97it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
100%|██████████| 23/23 [00:01<00:00, 14.48it/s]


Results saved to /kaggle/working/validation_results.csv


 42%|████▏     | 10/24 [00:00<00:00, 15.14it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
100%|██████████| 24/24 [00:01<00:00, 15.59it/s]

Results saved to /kaggle/working/test_results.csv





In [29]:
def test_single_data_point(model, tokenizer, sentence1, sentence2):
    # Prepare input using the tokenizer
    encoded_input = tokenizer(
        sentence1,
        sentence2,
        padding="max_length",
        max_length=128,
        truncation=True,
        return_tensors="pt"
    ).to(device)

    # Set the model to evaluation mode
    model.eval()

    with torch.no_grad():
        # Perform inference
        predicted_score = model({
            'input_ids': encoded_input['input_ids'],
            'attention_mask': encoded_input['attention_mask']
        })

    # Since the model output is normalized to [0, 1], rescale it to [0, 5]
    predicted_score_rescaled = predicted_score.item() * 5.0

    print(f"Sentence 1: {sentence1}")
    print(f"Sentence 2: {sentence2}")
    print(f"Predicted Similarity Score: {predicted_score_rescaled:.4f}")

    return predicted_score_rescaled

In [30]:
# Example test data point from the test set
test_sentence1 = test_data[0][0]  # Replace with the first sentence from your test data
test_sentence2 = test_data[0][1]  # Replace with the second sentence from your test data
true_score = test_data[0][2] * 5.0  # Rescale the true label to the [0, 5] range for comparison

print(f"True Similarity Score: {true_score:.4f}")
predicted_score = test_single_data_point(model, tokenizer, test_sentence1, test_sentence2)

True Similarity Score: 2.5000
Sentence 1: Nilai moral, yaitu nilai yang berkaitan dengan baik buruknya tokoh.
Nilai sosial, yaitu nilai yang berkaitan dengan sikap tokoh dalam hubungannya dengan tokoh lain
Nilai religi, yaitu nilai yang berhubungan dengan hubungan manusia dengan Tuhan.
Nilai budaya, yaitu nilai yang berkaitan dengan kebiasaan masyarakat saat itu.
Nilai pendidikan, yaitu nilai yang berkaitan dengan upaya seseorang mencari ilmu pengetahuan.
Sentence 2: Nilai moral
Nilai Pendidikan
Nilai Sosial
Nilai Budaya
Nilai Religi
Predicted Similarity Score: 2.3664


In [31]:
# Example test data point from the test set
test_sentence1 = test_data[5][0]  # Replace with the first sentence from your test data
test_sentence2 = test_data[5][1]  # Replace with the second sentence from your test data
true_score = test_data[5][2] * 5.0  # Rescale the true label to the [0, 5] range for comparison

print(f"True Similarity Score: {true_score:.4f}")
predicted_score = test_single_data_point(model, tokenizer, test_sentence1, test_sentence2)

True Similarity Score: 5.0000
Sentence 1: tema tokoh penokohan alur latar sudut pandang amanat
Sentence 2: Tema
Tokoh
Penokohan
Alur
Latar
Sudut pandang
Amanat
Predicted Similarity Score: 4.7203


In [32]:
import csv

# Evaluate model on the unseen dataset and save results to CSV
def evaluate_and_save_results(model, dataloader, output_csv_path):
    model.eval()
    true_labels = []
    predicted_scores = []
    responses = []
    answers = []

    with torch.no_grad():
        for batch in dataloader:
            data, labels = batch
            data['input_ids'] = data['input_ids'].to(device)
            data['attention_mask'] = data['attention_mask'].to(device)
            predictions = model({
                'input_ids': data['input_ids'],
                'attention_mask': data['attention_mask']
            })

            true_labels.extend(labels.cpu().numpy())
            predicted_scores.extend(predictions.cpu().numpy())
            responses.extend(data['input_ids'].cpu().numpy())  # Add the actual `response`
            answers.extend(data['attention_mask'].cpu().numpy())  # Add the `answer`

    # Calculate metrics
    mse = mean_squared_error(true_labels, predicted_scores)
    mae = mean_absolute_error(true_labels, predicted_scores)
    rmse = root_mean_squared_error(true_labels, predicted_scores)
    pearson_corr = pearsonr(true_labels, predicted_scores)

    print(f"\nEvaluation on Unseen Dataset:")
    print(f"Mean Squared Error (MSE): {mse:.4f}")
    print(f"Mean Absolute Error (MAE): {mae:.4f}")
    print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")
    print(f"Pearson Correlation: {pearson_corr:.4f}")

    # Save results to CSV
    results = {
        'answer': [tokenizer.decode(answers[i]) for i in range(len(answers))],
        'response': [tokenizer.decode(responses[i]) for i in range(len(responses))],
        'label': true_labels,
        'predicted_label': predicted_scores,
    }

    df_results = pd.DataFrame(results)
    df_results.to_csv(output_csv_path, index=False)
    print(f"Results saved to {output_csv_path}")

# Load unseen dataset and create DataLoader
unseen_data = load_indo_dataset("/kaggle/input/testi-data/test-BuIng.csv")
unseen_dataset = CustomDataset(unseen_data)
unseen_dataloader = DataLoader(unseen_dataset, batch_size=batch_size)

# Evaluate and save to CSV
output_csv_path = "/kaggle/working/unseen_dataset_results.csv"
evaluate_and_save_results(model, unseen_dataloader, output_csv_path)


                                              answer  \
0  animasi adalah sebuah proses merekam dan memai...   
1  animasi adalah menghidupkan, yaitu usaha untuk...   
2  animasi adalah sebuah proses merekam dan memai...   
3  animasi adalah sebuah proses merekam dan memai...   
4  animasi adalah menghidupkan, yaitu usaha untuk...   

                                            response  label  
0  animasi komputer adalah pembuatan atau pemrose...    2.5  
1  animasi komputer merupakan sebuah bentuk seni ...    4.5  
2  animasi yang dibuat pada saat sekarang dan dib...    2.5  
3  sebuah animasi dimana animasi ini sebuah perge...    4.0  
4    proses menciptakan gerakan menggunakan komputer    5.0  


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Evaluation on Unseen Dataset:
Mean Squared Error (MSE): 0.2420
Mean Absolute Error (MAE): 0.4348
Root Mean Squared Error (RMSE): 0.4920
Pearson Correlation: 0.0140
Results saved to /kaggle/working/unseen_dataset_results.csv
