In [1]:
import pandas as pd
# Load your custom dataset from a CSV file
def load_indo_dataset(filename):
    # Read the CSV file
    df = pd.read_csv(filename)
    print(df.head())
    # Extract columns 'answer', 'response', and 'label'
    # Normalize the label to [0, 1]
    data = [
        (row['answer'], row['response'], row['label'] / 5.0)
        for _, row in df.iterrows()
    ]

    return data

In [None]:
import pandas as pd

# Load the CSV file
file_path = '/kaggle/input/indo-datanew/indodataset.csv'
df = pd.read_csv(file_path)

# Check the first few rows
print(df.head())

In [None]:
df.info()

In [None]:
import pandas as pd

# Load your dataset
file_path = '/kaggle/input/indo-datanew/indodataset.csv'
df = pd.read_csv(file_path)

# Find rows with missing values
missing_values = df[df.isnull().any(axis=1)]

# Display the rows and their indices
print("Rows with missing values:")
print(missing_values)

# Display the row numbers
print("\nIndices of rows with missing values:")
print(missing_values.index.tolist())


In [None]:
# Check for missing or infinite values
print(df.isnull().sum())  # Ensure no missing values
print((df == float('inf')).sum())  # Check for infinite values
print((df == float('-inf')).sum())  # Check for negative infinite values

In [2]:
!pip install transformers sentence-transformers datasets



In [3]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import datetime
import time
import random
from transformers import BertTokenizer
from sentence_transformers import SentenceTransformer, models
from torch.utils.data import DataLoader
from tqdm import tqdm
from sklearn.metrics import mean_squared_error, mean_absolute_error
from scipy.stats import pearsonr

In [4]:
# Split dataset into train, validation, and test
def split_dataset(data, valid_percentage, test_percentage):
    length = len(data)
    np.random.shuffle(data)
    train = data[:int(length * (1 - valid_percentage - test_percentage))]
    valid = data[int(length * (1 - valid_percentage - test_percentage)):int(length * (1 - test_percentage))]
    test = data[int(length * (1 - test_percentage)):]
    return train, valid, test

In [14]:
# Split the dataset into train, validation, and test sets
# train_data, val_data, test_data = split_dataset(data, valid_percentage=0.1, test_percentage=0.1)
# ====== Load Dataset ======
# Load train, validation, and test datasets from CSV files
train_file = "/kaggle/input/data-with-unseen/70-train_data (1).csv"
valid_file = "/kaggle/input/data-with-unseen/15-val_data (1).csv"
test_file = "/kaggle/input/data-with-unseen/15-test_data (1).csv"

# Read datasets
train_data = pd.read_csv(train_file).values
val_data = pd.read_csv(valid_file).values
test_data = pd.read_csv(test_file).values

In [10]:
# Highlighted: Use the IndoBERT tokenizer
tokenizer = BertTokenizer.from_pretrained('indobenchmark/indobert-base-p1')

tokenizer_config.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/229k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.53k [00:00<?, ?B/s]

In [11]:
def format_time(elapsed):
    # Round to the nearest second.
    elapsed_rounded = int(round((elapsed)))
    
    # Format as hh:mm:ss
    return str(datetime.timedelta(seconds=elapsed_rounded))

In [12]:
# Correct the CustomDataset __getitem__ method
class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, data):
        self.first_sentences = [pair[0] for pair in data]
        self.second_sentences = [pair[1] for pair in data]
        self.labels = [pair[2] for pair in data]

    def __len__(self):
        return len(self.first_sentences)

    def __getitem__(self, idx):
        texts = tokenizer(
            self.first_sentences[idx],
            self.second_sentences[idx],
            padding="max_length",
            max_length=128,
            truncation=True,
            return_tensors="pt"
        )
        label = torch.tensor(self.labels[idx], dtype=torch.float32)
        return {
            'input_ids': texts['input_ids'].squeeze(0),
            'attention_mask': texts['attention_mask'].squeeze(0),
        }, label

In [15]:
# Create DataLoader
batch_size = 8
train_ds = CustomDataset(train_data)
val_ds = CustomDataset(val_data)
test_ds = CustomDataset(test_data)

In [16]:
train_dataloader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
validation_dataloader = DataLoader(val_ds, batch_size=batch_size)
test_dataloader = DataLoader(test_ds, batch_size=batch_size)

In [17]:
# Enhanced Model with IndoBERT
class EnhancedBertModel(nn.Module):
    def __init__(self):
        super(EnhancedBertModel, self).__init__()
        # Highlighted: Use IndoBERT as the transformer
        self.bert = models.Transformer('indobenchmark/indobert-base-p1', max_seq_length=128)
        self.pooling_layer = models.Pooling(self.bert.get_word_embedding_dimension())

        # Freeze BERT layers
        for param in self.bert.parameters():
            param.requires_grad = False

        self.bi_lstm = nn.LSTM(
            input_size=self.bert.get_word_embedding_dimension(),
            hidden_size=64,
            num_layers=1,
            bidirectional=True,
            batch_first=True
        )

        self.fc_dropout = nn.Dropout(0.3)
        self.fc = nn.Linear(256, 1)

    def forward(self, input_data):
        bert_output = self.bert(input_data)
        sequence_output = bert_output['token_embeddings']

        lstm_output, _ = self.bi_lstm(sequence_output)

        avg_pool = torch.mean(lstm_output, dim=1)
        max_pool, _ = torch.max(lstm_output, dim=1)

        pooled_output = torch.cat((avg_pool, max_pool), dim=1)

        output = self.fc_dropout(pooled_output)
        output = self.fc(output)

        return output.squeeze(-1)

In [18]:
# Check for GPU availability
if torch.cuda.is_available():
    device = torch.device("cuda")
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 2 GPU(s) available.
We will use the GPU: Tesla T4


In [19]:
# Instantiate and move the model to device
model = EnhancedBertModel()
model.to(device)

pytorch_model.bin:   0%|          | 0.00/498M [00:00<?, ?B/s]

EnhancedBertModel(
  (bert): Transformer({'max_seq_length': 128, 'do_lower_case': False}) with Transformer model: BertModel 
  (pooling_layer): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (bi_lstm): LSTM(768, 64, batch_first=True, bidirectional=True)
  (fc_dropout): Dropout(p=0.3, inplace=False)
  (fc): Linear(in_features=256, out_features=1, bias=True)
)

In [20]:
# Loss function, optimizer, and scheduler
criterion = nn.MSELoss()
epochs = 8
optimizer = optim.Adam(model.parameters(), lr=1e-5)

In [21]:
# Training Loop
def train_model():
    training_stats = []
    total_t0 = time.time()

    for epoch_i in range(epochs):
        print(f"\n======== Epoch {epoch_i + 1} / {epochs} ========")
        print("Training...")

        t0 = time.time()
        total_train_loss = 0
        model.train()

        for batch in tqdm(train_dataloader):
            train_data, train_labels = batch
            train_data['input_ids'] = train_data['input_ids'].to(device)
            train_data['attention_mask'] = train_data['attention_mask'].to(device)
            train_labels = train_labels.to(device)

            optimizer.zero_grad()
            outputs = model({
                'input_ids': train_data['input_ids'],
                'attention_mask': train_data['attention_mask']
            })
            loss = criterion(outputs, train_labels)
            total_train_loss += loss.item()

            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()

        avg_train_loss = total_train_loss / len(train_dataloader)
        training_time = format_time(time.time() - t0)

        print(f"  Average training loss: {avg_train_loss:.5f}")
        print(f"  Training epoch took: {training_time}")

        # Validation
        print("Running Validation...")
        t0 = time.time()

        model.eval()
        total_val_loss = 0

        for batch in tqdm(validation_dataloader):
            val_data, val_labels = batch
            val_data['input_ids'] = val_data['input_ids'].to(device)
            val_data['attention_mask'] = val_data['attention_mask'].to(device)
            val_labels = val_labels.to(device)

            with torch.no_grad():
                outputs = model({
                    'input_ids': val_data['input_ids'],
                    'attention_mask': val_data['attention_mask']
                })
                loss = criterion(outputs, val_labels)
                total_val_loss += loss.item()

        avg_val_loss = total_val_loss / len(validation_dataloader)
        validation_time = format_time(time.time() - t0)

        print(f"  Validation Loss: {avg_val_loss:.5f}")
        print(f"  Validation took: {validation_time}")

        training_stats.append({
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Validation Loss': avg_val_loss,
            'Training Time': training_time,
            'Validation Time': validation_time
        })

    print("Training complete!")
    print(f"Total training took {format_time(time.time() - total_t0)}")
    return model, training_stats


In [22]:
# Train the model
model, training_stats = train_model()


Training...


  3%|▎         | 5/164 [00:02<00:49,  3.22it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
  5%|▌         | 9/164 [00:02<00:23,  6.69it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 12%|█▏        | 19/164 [00:03<00:10, 14.25it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 23%|██▎       | 37/164 [00:04<00:07, 17.21it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncati

  Average training loss: 0.19173
  Training epoch took: 0:00:11
Running Validation...


 28%|██▊       | 10/36 [00:00<00:01, 18.43it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 94%|█████████▍| 34/36 [00:01<00:00, 18.01it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
100%|██████████| 36/36 [00:01<00:00, 18.46it/s]


  Validation Loss: 0.13719
  Validation took: 0:00:02

Training...


  0%|          | 0/164 [00:00<?, ?it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 28%|██▊       | 46/164 [00:02<00:06, 17.02it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 34%|███▍      | 56/164 [00:03<00:06, 16.97it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 37%|███▋      | 60/164 [00:03<00:06, 16.88it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation stra

  Average training loss: 0.17498
  Training epoch took: 0:00:10
Running Validation...


 28%|██▊       | 10/36 [00:00<00:01, 17.91it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 94%|█████████▍| 34/36 [00:01<00:00, 17.57it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
100%|██████████| 36/36 [00:01<00:00, 18.06it/s]


  Validation Loss: 0.12124
  Validation took: 0:00:02

Training...


 15%|█▍        | 24/164 [00:01<00:08, 16.49it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 29%|██▉       | 48/164 [00:02<00:07, 16.46it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 30%|███       | 50/164 [00:03<00:06, 16.37it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 49%|████▉     | 80/164 [00:04<00:05, 16.48it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' trunca

  Average training loss: 0.15686
  Training epoch took: 0:00:10
Running Validation...


 28%|██▊       | 10/36 [00:00<00:01, 17.41it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 94%|█████████▍| 34/36 [00:01<00:00, 17.04it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
100%|██████████| 36/36 [00:02<00:00, 17.54it/s]


  Validation Loss: 0.11042
  Validation took: 0:00:02

Training...


  5%|▍         | 8/164 [00:00<00:09, 16.65it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
  9%|▊         | 14/164 [00:00<00:08, 16.76it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 10%|▉         | 16/164 [00:00<00:08, 16.62it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 26%|██▌       | 42/164 [00:02<00:07, 16.49it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncat

  Average training loss: 0.15094
  Training epoch took: 0:00:10
Running Validation...


 28%|██▊       | 10/36 [00:00<00:01, 16.82it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 94%|█████████▍| 34/36 [00:02<00:00, 16.57it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
100%|██████████| 36/36 [00:02<00:00, 17.22it/s]


  Validation Loss: 0.10229
  Validation took: 0:00:02

Training...


 20%|█▉        | 32/164 [00:01<00:08, 16.39it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 22%|██▏       | 36/164 [00:02<00:07, 16.35it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 28%|██▊       | 46/164 [00:02<00:07, 16.09it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 39%|███▉      | 64/164 [00:03<00:06, 16.31it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' trunca

  Average training loss: 0.14149
  Training epoch took: 0:00:10
Running Validation...


 28%|██▊       | 10/36 [00:00<00:01, 16.57it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 94%|█████████▍| 34/36 [00:02<00:00, 16.37it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
100%|██████████| 36/36 [00:02<00:00, 16.85it/s]


  Validation Loss: 0.09550
  Validation took: 0:00:02

Training...


  5%|▍         | 8/164 [00:00<00:09, 15.68it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
  6%|▌         | 10/164 [00:00<00:09, 15.84it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 21%|██        | 34/164 [00:02<00:08, 16.00it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always b

  Average training loss: 0.13697
  Training epoch took: 0:00:10
Running Validation...


 28%|██▊       | 10/36 [00:00<00:01, 16.62it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 94%|█████████▍| 34/36 [00:02<00:00, 16.43it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
100%|██████████| 36/36 [00:02<00:00, 16.86it/s]


  Validation Loss: 0.08877
  Validation took: 0:00:02

Training...


 15%|█▍        | 24/164 [00:01<00:09, 15.47it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 21%|██        | 34/164 [00:02<00:08, 15.38it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 29%|██▉       | 48/164 [00:03<00:07, 15.25it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always 

  Average training loss: 0.12961
  Training epoch took: 0:00:11
Running Validation...


 28%|██▊       | 10/36 [00:00<00:01, 16.22it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 94%|█████████▍| 34/36 [00:02<00:00, 15.78it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
100%|██████████| 36/36 [00:02<00:00, 16.35it/s]


  Validation Loss: 0.08350
  Validation took: 0:00:02

Training...


 21%|██        | 34/164 [00:02<00:08, 15.05it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 39%|███▉      | 64/164 [00:04<00:06, 14.94it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 41%|████▏     | 68/164 [00:04<00:06, 14.91it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 44%|████▍     | 72/164 [00:04<00:06, 14.96it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' trunca

  Average training loss: 0.13152
  Training epoch took: 0:00:11
Running Validation...


 28%|██▊       | 10/36 [00:00<00:01, 15.56it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 94%|█████████▍| 34/36 [00:02<00:00, 15.16it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
100%|██████████| 36/36 [00:02<00:00, 15.68it/s]

  Validation Loss: 0.08010
  Validation took: 0:00:02
Training complete!
Total training took 0:01:40





In [23]:
# Define custom evaluation functions
def mean_squared_error(y_true, y_pred):
    squared_errors = [(true - pred) ** 2 for true, pred in zip(y_true, y_pred)]
    return sum(squared_errors) / len(squared_errors)

def mean_absolute_error(y_true, y_pred):
    absolute_errors = [abs(true - pred) for true, pred in zip(y_true, y_pred)]
    return sum(absolute_errors) / len(absolute_errors)

def root_mean_squared_error(y_true, y_pred):
    mse = mean_squared_error(y_true, y_pred)
    return mse ** 0.5

def pearsonr(x, y):
    mean_x = sum(x) / len(x)
    mean_y = sum(y) / len(y)
    numerator = sum((xi - mean_x) * (yi - mean_y) for xi, yi in zip(x, y))
    denominator = ((sum((xi - mean_x) ** 2 for xi in x) * sum((yi - mean_y) ** 2 for yi in y)) ** 0.5)
    return numerator / denominator if denominator != 0 else 0.0

In [24]:
# Evaluate Model with custom functions
def evaluate_model(model, dataloader):
    model.eval()
    true_labels = []
    predicted_scores = []

    with torch.no_grad():
        for batch in dataloader:
            data, labels = batch
            data['input_ids'] = data['input_ids'].to(device)
            data['attention_mask'] = data['attention_mask'].to(device)
            predictions = model({
                'input_ids': data['input_ids'],
                'attention_mask': data['attention_mask']
            })

            true_labels.extend(labels.cpu().numpy())
            predicted_scores.extend(predictions.cpu().numpy())

    mse = mean_squared_error(true_labels, predicted_scores)
    mae = mean_absolute_error(true_labels, predicted_scores)
    rmse = root_mean_squared_error(true_labels, predicted_scores)
    pearson_corr = pearsonr(true_labels, predicted_scores)

    print(f"Mean Squared Error (MSE): {mse:.4f}")
    print(f"Mean Absolute Error (MAE): {mae:.4f}")
    print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")
    print(f"Pearson Correlation: {pearson_corr:.4f}")

    return mse, mae, rmse, pearson_corr

# Evaluate on validation set
evaluate_model(model, validation_dataloader)

# Optional: Evaluate on the test set if available
test_ds = CustomDataset(test_data)
test_dataloader = DataLoader(test_ds, batch_size=batch_size)
print("this is the evaluation on the test set:")
evaluate_model(model, test_dataloader)

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Mean Squared Error (MSE): 0.0810
Mean Absolute Error (MAE): 0.2371
Root Mean Squared Error (RMSE): 0.2846
Pearson Correlation: 0.6651
this is the evaluation on the test set:


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Mean Squared Error (MSE): 0.0791
Mean Absolute Error (MAE): 0.2390
Root Mean Squared Error (RMSE): 0.2813
Pearson Correlation: 0.7001


(0.07910466631199645,
 0.23903706051119647,
 0.2812555178338666,
 0.7001037946271274)

In [25]:
# Save the trained model to a .pt file
model_save_path = "/kaggle/working/indobert_similarity_model.pt"
torch.save(model.state_dict(), model_save_path)
print(f"Model saved to {model_save_path}")


Model saved to /kaggle/working/indobert_similarity_model.pt


In [26]:
# Reinitialize the model architecture
model = EnhancedBertModel()
model.to(device)

# Load the model state dictionary
model.load_state_dict(torch.load(model_save_path))
print("Model loaded successfully!")

# Set the model to evaluation mode if testing
model.eval()


  model.load_state_dict(torch.load(model_save_path))


Model loaded successfully!


EnhancedBertModel(
  (bert): Transformer({'max_seq_length': 128, 'do_lower_case': False}) with Transformer model: BertModel 
  (pooling_layer): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (bi_lstm): LSTM(768, 64, batch_first=True, bidirectional=True)
  (fc_dropout): Dropout(p=0.3, inplace=False)
  (fc): Linear(in_features=256, out_features=1, bias=True)
)

In [27]:
def evaluate_and_save_results(model, dataloader, csv_filename, original_data):
    model.eval()
    results = []  # To store all prediction results
    index = 0  # Track the original dataset index

    with torch.no_grad():
        for batch in tqdm(dataloader):
            data, labels = batch
            data['input_ids'] = data['input_ids'].to(device)
            data['attention_mask'] = data['attention_mask'].to(device)
            predictions = model({
                'input_ids': data['input_ids'],
                'attention_mask': data['attention_mask']
            }).cpu().numpy()

            batch_size = len(labels)
            for i in range(batch_size):
                # Get original sentences (response, answer)
                response, answer, true_label = original_data[index]
                index += 1  # Move to the next pair

                results.append({
                    'Response': response,
                    'Answer': answer,
                    'True Label': true_label,
                    'Predicted Score': predictions[i]
                })

    # Save results to CSV
    df = pd.DataFrame(results)
    df.to_csv(csv_filename, index=False)
    print(f"Results saved to {csv_filename}")
    
    return df

# Save validation results
validation_results_csv = "/kaggle/working/validation_results.csv"
df_validation = evaluate_and_save_results(model, validation_dataloader, validation_results_csv, val_data)

# Load test data for testing
test_ds = CustomDataset(test_data)
test_dataloader = DataLoader(test_ds, batch_size=batch_size)

# Save test results
test_results_csv = "/kaggle/working/test_results.csv"
df_test = evaluate_and_save_results(model, test_dataloader, test_results_csv, test_data)


 28%|██▊       | 10/36 [00:00<00:01, 15.07it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 94%|█████████▍| 34/36 [00:02<00:00, 15.10it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
100%|██████████| 36/36 [00:02<00:00, 15.34it/s]


Results saved to /kaggle/working/validation_results.csv


 61%|██████    | 22/36 [00:01<00:00, 14.60it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
100%|██████████| 36/36 [00:02<00:00, 14.85it/s]

Results saved to /kaggle/working/test_results.csv





In [None]:
def test_single_data_point(model, tokenizer, sentence1, sentence2):
    # Prepare input using the tokenizer
    encoded_input = tokenizer(
        sentence1,
        sentence2,
        padding="max_length",
        max_length=128,
        truncation=True,
        return_tensors="pt"
    ).to(device)

    # Set the model to evaluation mode
    model.eval()

    with torch.no_grad():
        # Perform inference
        predicted_score = model({
            'input_ids': encoded_input['input_ids'],
            'attention_mask': encoded_input['attention_mask']
        })

    # Since the model output is normalized to [0, 1], rescale it to [0, 5]
    predicted_score_rescaled = predicted_score.item() * 5.0

    print(f"Sentence 1: {sentence1}")
    print(f"Sentence 2: {sentence2}")
    print(f"Predicted Similarity Score: {predicted_score_rescaled:.4f}")

    return predicted_score_rescaled

In [None]:
# Example test data point from the test set
test_sentence1 = test_data[0][0]  # Replace with the first sentence from your test data
test_sentence2 = test_data[0][1]  # Replace with the second sentence from your test data
true_score = test_data[0][2] * 5.0  # Rescale the true label to the [0, 5] range for comparison

print(f"True Similarity Score: {true_score:.4f}")
predicted_score = test_single_data_point(model, tokenizer, test_sentence1, test_sentence2)

In [None]:
# Example test data point from the test set
test_sentence1 = test_data[5][0]  # Replace with the first sentence from your test data
test_sentence2 = test_data[5][1]  # Replace with the second sentence from your test data
true_score = test_data[5][2] * 5.0  # Rescale the true label to the [0, 5] range for comparison

print(f"True Similarity Score: {true_score:.4f}")
predicted_score = test_single_data_point(model, tokenizer, test_sentence1, test_sentence2)

In [28]:
import csv

# Evaluate model on the unseen dataset and save results to CSV
def evaluate_and_save_results(model, dataloader, output_csv_path):
    model.eval()
    true_labels = []
    predicted_scores = []
    responses = []
    answers = []

    with torch.no_grad():
        for batch in dataloader:
            data, labels = batch
            data['input_ids'] = data['input_ids'].to(device)
            data['attention_mask'] = data['attention_mask'].to(device)
            predictions = model({
                'input_ids': data['input_ids'],
                'attention_mask': data['attention_mask']
            })

            true_labels.extend(labels.cpu().numpy())
            predicted_scores.extend(predictions.cpu().numpy())
            responses.extend(data['input_ids'].cpu().numpy())  # Add the actual `response`
            answers.extend(data['attention_mask'].cpu().numpy())  # Add the `answer`

    # Calculate metrics
    mse = mean_squared_error(true_labels, predicted_scores)
    mae = mean_absolute_error(true_labels, predicted_scores)
    rmse = root_mean_squared_error(true_labels, predicted_scores)
    pearson_corr = pearsonr(true_labels, predicted_scores)

    print(f"\nEvaluation on Unseen Dataset:")
    print(f"Mean Squared Error (MSE): {mse:.4f}")
    print(f"Mean Absolute Error (MAE): {mae:.4f}")
    print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")
    print(f"Pearson Correlation: {pearson_corr:.4f}")

    # Save results to CSV
    results = {
        'answer': [tokenizer.decode(answers[i]) for i in range(len(answers))],
        'response': [tokenizer.decode(responses[i]) for i in range(len(responses))],
        'label': true_labels,
        'predicted_label': predicted_scores,
    }

    df_results = pd.DataFrame(results)
    df_results.to_csv(output_csv_path, index=False)
    print(f"Results saved to {output_csv_path}")

# Load unseen dataset and create DataLoader
unseen_data = load_indo_dataset("/kaggle/input/testi-data/test-BuIng.csv")
unseen_dataset = CustomDataset(unseen_data)
unseen_dataloader = DataLoader(unseen_dataset, batch_size=batch_size)

# Evaluate and save to CSV
output_csv_path = "/kaggle/working/unseen_dataset_results.csv"
evaluate_and_save_results(model, unseen_dataloader, output_csv_path)

                                              answer  \
0  animasi adalah sebuah proses merekam dan memai...   
1  animasi adalah menghidupkan, yaitu usaha untuk...   
2  animasi adalah sebuah proses merekam dan memai...   
3  animasi adalah sebuah proses merekam dan memai...   
4  animasi adalah menghidupkan, yaitu usaha untuk...   

                                            response  label  
0  animasi komputer adalah pembuatan atau pemrose...    2.5  
1  animasi komputer merupakan sebuah bentuk seni ...    4.5  
2  animasi yang dibuat pada saat sekarang dan dib...    2.5  
3  sebuah animasi dimana animasi ini sebuah perge...    4.0  
4    proses menciptakan gerakan menggunakan komputer    5.0  


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.



Evaluation on Unseen Dataset:
Mean Squared Error (MSE): 0.1031
Mean Absolute Error (MAE): 0.2725
Root Mean Squared Error (RMSE): 0.3211
Pearson Correlation: -0.4117
Results saved to /kaggle/working/unseen_dataset_results.csv
