In [None]:
%%capture
!pip install datasets transformers

In [None]:

import torch
import time
import datetime

import random
from random import seed

import numpy as np
from sklearn.metrics import accuracy_score
import torch.utils.data as Data

from datasets import load_dataset
from transformers import AutoTokenizer
def format_time(elapsed):
    """Takes a time in seconds and returns a string hh:mm:ss."""
    elapsed_rounded = int(round(elapsed))
    return str(datetime.timedelta(seconds=elapsed_rounded))


In [None]:

def get_device():
    if torch.backends.cuda.is_built():
        print("CUDA")
        device = torch.device("cuda")
    elif torch.backends.mps.is_built():
        print("mps")
        device = torch.device("mps")
    else:
        device = torch.device("cpu")
        raise Exception("GPU is not avalaible!")
    return device


In [None]:
from transformers import BertForSequenceClassification
from torch.nn import CrossEntropyLoss, MSELoss
import torch

class MyBertMaxPoolClassifier(BertForSequenceClassification):
    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
    ):
        outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=False,
        )

        last_hidden_state = outputs[0]  

        extended_attention_mask = attention_mask.unsqueeze(-1).bool()
        masked_hidden_state = last_hidden_state.masked_fill(~extended_attention_mask, float('-inf'))

        
        pooled_output, _ = torch.max(masked_hidden_state, dim=1)

        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)

        outputs = (logits,) + outputs[2:]

       
        if labels is not None:
            if self.num_labels == 1:
                loss_fct = MSELoss()
                loss = loss_fct(logits.view(-1), labels.view(-1))
            else:
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            outputs = (loss,) + outputs

        return outputs


In [None]:
from transformers.modeling_outputs import SequenceClassifierOutput
import torch
import torch.nn as nn

class AttentionPoolBertClassifier(BertForSequenceClassification):
    def __init__(self, config):
        super().__init__(config)
        
        self.attention_layer = nn.Linear(config.hidden_size, 1)

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None
    ):
        
        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict
        )

        
        last_hidden_state = outputs[0]

        
        
        attn_scores = self.attention_layer(last_hidden_state)
        attn_scores = attn_scores.masked_fill(attention_mask.unsqueeze(-1) == 0, -1e9)
        attn_weights = torch.softmax(attn_scores, dim=1)
        pooled_output = torch.sum(last_hidden_state * attn_weights, dim=1)
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)

        loss = None
        if labels is not None:
            if self.num_labels == 1:
                loss_fct = nn.MSELoss()
                loss = loss_fct(logits.view(-1), labels.view(-1))
            else:
                loss_fct = nn.CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states if output_hidden_states else None,
            attentions=outputs.attentions if output_attentions else None,
        )

In [None]:
def train_eval_loop(
    model, loader, optimizer, scheduler, device, n_epochs=2, seed_val=42
):
    random.seed(seed_val)
    np.random.seed(seed_val)
    torch.manual_seed(seed_val)
    torch.cuda.manual_seed_all(seed_val)

    loss_values = []
    t00 = time.time()

    for epoch_i in range(0, n_epochs):
        print("")
        print("======== Epoch {:} / {:} ========".format(epoch_i + 1, n_epochs))
        print("Training...")

        t0 = time.time()
        total_loss = 0
        model.train()

        for step, batch in enumerate(loader["train"]):
            b_input_ids, b_input_mask, b_token_type_ids, b_labels = tuple(t.to(device) for t in batch)
            model.zero_grad()

            outputs = model(
                input_ids=b_input_ids,
                attention_mask=b_input_mask,
                token_type_ids=b_token_type_ids,
                labels=b_labels
            )
            loss = outputs[0]

            total_loss += loss.item()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()

        avg_train_loss = total_loss / len(loader["train"])
        loss_values.append(avg_train_loss)

        print("\nAverage training loss: {0:.2f}".format(avg_train_loss))
        print("  Training epoch took: {:}".format(format_time(time.time() - t00)))

        print("\nRunning Validation...")
        t0 = time.time()
        model.eval()
        val_acc, nb_eval_steps = 0, 0

        for batch in loader["validation"]:
            b_input_ids, b_input_mask, b_token_type_ids, b_labels = tuple(t.to(device) for t in batch)

            with torch.no_grad():
                logits = model(
                    input_ids=b_input_ids,
                    attention_mask=b_input_mask,
                    token_type_ids=b_token_type_ids
                )[0]

            logits = logits.detach().cpu().numpy()
            logits = np.argmax(logits, axis=1).flatten()
            label_ids = b_labels.to("cpu").numpy()

            val_acc += accuracy_score(logits, label_ids)
            nb_eval_steps += 1

        val_acc = 100 * (val_acc / nb_eval_steps)
        print("  Validation ACC: {0:.2f}".format(val_acc))
        print("  Validation took: {:}".format(format_time(time.time() - t0)))

    return val_acc, loss_values


In [None]:
def init_loader(max_length=16, batch_size=32, test_size=0.2, random_state=2023):
    model_checkpoint = "bert-base-uncased"

    dataset = load_dataset("glue", "rte")


    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)

    df_s, x, y = {}, {}, {}
    input_ids, attention_mask = {}, {}
    token_type_ids = {}
    datasets, loader = {}, {}

    max_length = 128

    for split in ["train", "validation", "test"]:

        df_s[split] = dataset[split].to_pandas()

        premise = dataset[split]["sentence1"]
        hypothesis = dataset[split]["sentence2"]
        y[split] = dataset[split]["label"]

        x[split] = list(zip(premise, hypothesis))

        input= tokenizer(
            premise,
            hypothesis,
            max_length=max_length,
            truncation=True,
            padding="max_length",
            return_tensors="pt",
            return_token_type_ids=True,
        )

        input_ids[split], attention_mask[split],token_type_ids[split]  = input.input_ids, input.attention_mask,input.token_type_ids

        Data.TensorDataset

        datasets[split] = Data.TensorDataset(
            input_ids[split], attention_mask[split], token_type_ids[split], torch.LongTensor(y[split])
        )

        loader[split] = Data.DataLoader(
            datasets[split], batch_size=batch_size, shuffle=False
        )
    return loader, y

In [None]:
from transformers import get_linear_schedule_with_warmup
from transformers import BertForSequenceClassification


def init_objects(
    lr, n_epochs, max_length=16, batch_size=32, test_size=0.2, random_state=2023
):
    loader, _ = init_loader(max_length=max_length, batch_size=batch_size)

    model = AttentionPoolBertClassifier.from_pretrained("bert-base-uncased", num_labels=2)


    optimizer = torch.optim.AdamW(model.parameters(), lr=lr, eps=1e-8)

    total_steps = len(loader["train"]) * n_epochs

    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=total_steps
    )
    return model, loader, optimizer, scheduler


In [None]:

!pip install optuna
import torch
import optuna
from transformers import logging


Collecting optuna
  Downloading optuna-4.2.1-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.15.2-py3-none-any.whl.metadata (7.3 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Downloading optuna-4.2.1-py3-none-any.whl (383 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m383.6/383.6 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.15.2-py3-none-any.whl (231 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m231.9/231.9 kB[0m [31m21.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, alembic, optuna
Successfully installed alembic-1.15.2 colorlog-6.9.0 optuna-4.2.1


In [None]:
lr = 2e-5
n_epochs = 1
max_length = 16
batch_size = 32
test_size = 0.2
random_state = 2023
device = torch.device("cuda")

model, loader, optimizer, scheduler = init_objects(
    lr, n_epochs, max_length, batch_size, test_size, random_state
)
model.to(device)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/35.3k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/584k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/69.0k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/621k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2490 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/277 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3000 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of AttentionPoolBertClassifier were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['attention_layer.bias', 'attention_layer.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


AttentionPoolBertClassifier(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-1

In [None]:
_, _ = train_eval_loop(
    model, loader, optimizer, scheduler, device, n_epochs=n_epochs, seed_val=42
)


Training...

Average training loss: 0.69
  Training epoch took: 0:00:13

Running Validation...
  Validation ACC: 56.00
  Validation took: 0:00:00


In [None]:

param_dict = {
    "lr": [1e-6, 3e-5],
    "n_epochs": [2,3,4],
    "max_length": [16, 32, 64],
    "batch_size": [16, 32,64],
}


class BertObjective:
    def __init__(self, d, device):
        self.d = d
        self.device = device

    def __call__(self, trial: optuna.trial.Trial):
        self.lr = trial.suggest_float("lr", self.d["lr"][0], self.d["lr"][1], log=True)
        self.n_epochs = trial.suggest_categorical("n_epochs", self.d["n_epochs"])
        self.max_length = trial.suggest_categorical("max_length", self.d["max_length"])
        self.batch_size = trial.suggest_categorical("batch_size", self.d["batch_size"])

        model, loader, optimizer, scheduler = init_objects(
            self.lr, self.n_epochs, self.max_length, self.batch_size
        )
        model.to(self.device)
        val_acc, _ = train_eval_loop(
            model, loader, optimizer, scheduler, self.device, self.n_epochs
        )

        return val_acc


device = torch.device("cuda")
study = optuna.create_study(study_name="Stduy 0", direction="maximize")
study.optimize(BertObjective(param_dict, device), n_trials=24)


[I 2025-04-07 12:47:08,819] A new study created in memory with name: Stduy 0
Some weights of MyBertMaxPoolClassifier were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Training...

Average training loss: 0.72
  Training epoch took: 0:00:12

Running Validation...
  Validation ACC: 51.14
  Validation took: 0:00:00

Training...

Average training loss: 0.69
  Training epoch took: 0:00:24

Running Validation...


[I 2025-04-07 12:47:35,951] Trial 0 finished with value: 50.95899470899471 and parameters: {'lr': 4.75181067704919e-06, 'n_epochs': 2, 'max_length': 64, 'batch_size': 32}. Best is trial 0 with value: 50.95899470899471.


  Validation ACC: 50.96
  Validation took: 0:00:00


Some weights of MyBertMaxPoolClassifier were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Training...

Average training loss: 0.71
  Training epoch took: 0:00:13

Running Validation...
  Validation ACC: 53.06
  Validation took: 0:00:00

Training...

Average training loss: 0.69
  Training epoch took: 0:00:26

Running Validation...
  Validation ACC: 54.03
  Validation took: 0:00:00

Training...

Average training loss: 0.68
  Training epoch took: 0:00:40

Running Validation...


[I 2025-04-07 12:48:18,773] Trial 1 finished with value: 54.02777777777777 and parameters: {'lr': 3.881703153788711e-06, 'n_epochs': 3, 'max_length': 32, 'batch_size': 16}. Best is trial 1 with value: 54.02777777777777.


  Validation ACC: 54.03
  Validation took: 0:00:00


Some weights of MyBertMaxPoolClassifier were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Training...

Average training loss: 0.72
  Training epoch took: 0:00:13

Running Validation...
  Validation ACC: 54.72
  Validation took: 0:00:00

Training...

Average training loss: 0.65
  Training epoch took: 0:00:26

Running Validation...
  Validation ACC: 57.85
  Validation took: 0:00:00

Training...

Average training loss: 0.62
  Training epoch took: 0:00:40

Running Validation...


[I 2025-04-07 12:49:01,747] Trial 2 finished with value: 55.76388888888889 and parameters: {'lr': 1.2076887034636962e-05, 'n_epochs': 3, 'max_length': 16, 'batch_size': 16}. Best is trial 2 with value: 55.76388888888889.


  Validation ACC: 55.76
  Validation took: 0:00:00


Some weights of MyBertMaxPoolClassifier were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Training...

Average training loss: 0.80
  Training epoch took: 0:00:12

Running Validation...
  Validation ACC: 54.10
  Validation took: 0:00:00

Training...

Average training loss: 0.72
  Training epoch took: 0:00:24

Running Validation...


[I 2025-04-07 12:49:29,344] Trial 3 finished with value: 51.66997354497354 and parameters: {'lr': 1.4690620663130575e-06, 'n_epochs': 2, 'max_length': 32, 'batch_size': 32}. Best is trial 2 with value: 55.76388888888889.


  Validation ACC: 51.67
  Validation took: 0:00:00


Some weights of MyBertMaxPoolClassifier were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Training...

Average training loss: 0.71
  Training epoch took: 0:00:12

Running Validation...
  Validation ACC: 54.08
  Validation took: 0:00:00

Training...

Average training loss: 0.67
  Training epoch took: 0:00:24

Running Validation...
  Validation ACC: 55.84
  Validation took: 0:00:00

Training...

Average training loss: 0.66
  Training epoch took: 0:00:36

Running Validation...


[I 2025-04-07 12:50:08,480] Trial 4 finished with value: 53.720238095238095 and parameters: {'lr': 1.1616289310589247e-05, 'n_epochs': 3, 'max_length': 32, 'batch_size': 32}. Best is trial 2 with value: 55.76388888888889.


  Validation ACC: 53.72
  Validation took: 0:00:00


Some weights of MyBertMaxPoolClassifier were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Training...

Average training loss: 0.72
  Training epoch took: 0:00:12

Running Validation...
  Validation ACC: 55.51
  Validation took: 0:00:00

Training...

Average training loss: 0.68
  Training epoch took: 0:00:24

Running Validation...


[I 2025-04-07 12:50:35,604] Trial 5 finished with value: 56.71296296296296 and parameters: {'lr': 1.2225251522321694e-05, 'n_epochs': 2, 'max_length': 16, 'batch_size': 32}. Best is trial 5 with value: 56.71296296296296.


  Validation ACC: 56.71
  Validation took: 0:00:00


Some weights of MyBertMaxPoolClassifier were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Training...

Average training loss: 0.72
  Training epoch took: 0:00:13

Running Validation...
  Validation ACC: 50.28
  Validation took: 0:00:00

Training...

Average training loss: 0.70
  Training epoch took: 0:00:26

Running Validation...
  Validation ACC: 52.01
  Validation took: 0:00:00

Training...

Average training loss: 0.70
  Training epoch took: 0:00:40

Running Validation...
  Validation ACC: 51.94
  Validation took: 0:00:00

Training...

Average training loss: 0.70
  Training epoch took: 0:00:53

Running Validation...


[I 2025-04-07 12:51:33,814] Trial 6 finished with value: 52.29166666666667 and parameters: {'lr': 1.3624281853312044e-06, 'n_epochs': 4, 'max_length': 64, 'batch_size': 16}. Best is trial 5 with value: 56.71296296296296.


  Validation ACC: 52.29
  Validation took: 0:00:00


Some weights of MyBertMaxPoolClassifier were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Training...

Average training loss: 0.72
  Training epoch took: 0:00:13

Running Validation...
  Validation ACC: 53.19
  Validation took: 0:00:00

Training...

Average training loss: 0.71
  Training epoch took: 0:00:26

Running Validation...
  Validation ACC: 54.93
  Validation took: 0:00:00

Training...

Average training loss: 0.69
  Training epoch took: 0:00:40

Running Validation...
  Validation ACC: 55.28
  Validation took: 0:00:00

Training...

Average training loss: 0.68
  Training epoch took: 0:00:53

Running Validation...


[I 2025-04-07 12:52:31,034] Trial 7 finished with value: 54.58333333333333 and parameters: {'lr': 2.1022475445490807e-06, 'n_epochs': 4, 'max_length': 64, 'batch_size': 16}. Best is trial 5 with value: 56.71296296296296.


  Validation ACC: 54.58
  Validation took: 0:00:00


Some weights of MyBertMaxPoolClassifier were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Training...

Average training loss: 0.72
  Training epoch took: 0:00:11

Running Validation...
  Validation ACC: 50.74
  Validation took: 0:00:01

Training...

Average training loss: 0.70
  Training epoch took: 0:00:22

Running Validation...
  Validation ACC: 53.24
  Validation took: 0:00:01

Training...

Average training loss: 0.69
  Training epoch took: 0:00:33

Running Validation...


[I 2025-04-07 12:53:08,214] Trial 8 finished with value: 53.55654761904762 and parameters: {'lr': 5.491146747732536e-06, 'n_epochs': 3, 'max_length': 64, 'batch_size': 64}. Best is trial 5 with value: 56.71296296296296.


  Validation ACC: 53.56
  Validation took: 0:00:01


Some weights of MyBertMaxPoolClassifier were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Training...

Average training loss: 0.73
  Training epoch took: 0:00:11

Running Validation...
  Validation ACC: 56.46
  Validation took: 0:00:01

Training...

Average training loss: 0.66
  Training epoch took: 0:00:22

Running Validation...
  Validation ACC: 56.74
  Validation took: 0:00:01

Training...

Average training loss: 0.64
  Training epoch took: 0:00:33

Running Validation...
  Validation ACC: 59.24
  Validation took: 0:00:01

Training...

Average training loss: 0.62
  Training epoch took: 0:00:45

Running Validation...


[I 2025-04-07 12:53:56,270] Trial 9 finished with value: 58.31845238095238 and parameters: {'lr': 1.7559589889568154e-05, 'n_epochs': 4, 'max_length': 64, 'batch_size': 64}. Best is trial 9 with value: 58.31845238095238.


  Validation ACC: 58.32
  Validation took: 0:00:01


Some weights of MyBertMaxPoolClassifier were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Training...

Average training loss: 0.71
  Training epoch took: 0:00:11

Running Validation...
  Validation ACC: 56.40
  Validation took: 0:00:01

Training...

Average training loss: 0.64
  Training epoch took: 0:00:22

Running Validation...
  Validation ACC: 57.34
  Validation took: 0:00:01

Training...

Average training loss: 0.60
  Training epoch took: 0:00:33

Running Validation...
  Validation ACC: 55.46
  Validation took: 0:00:01

Training...

Average training loss: 0.57
  Training epoch took: 0:00:45

Running Validation...


[I 2025-04-07 12:54:44,362] Trial 10 finished with value: 56.08630952380953 and parameters: {'lr': 2.9336793455741386e-05, 'n_epochs': 4, 'max_length': 64, 'batch_size': 64}. Best is trial 9 with value: 58.31845238095238.


  Validation ACC: 56.09
  Validation took: 0:00:01


Some weights of MyBertMaxPoolClassifier were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Training...

Average training loss: 0.71
  Training epoch took: 0:00:11

Running Validation...
  Validation ACC: 54.17
  Validation took: 0:00:01

Training...

Average training loss: 0.67
  Training epoch took: 0:00:22

Running Validation...


[I 2025-04-07 12:55:12,947] Trial 11 finished with value: 54.80654761904762 and parameters: {'lr': 2.1308280054307903e-05, 'n_epochs': 2, 'max_length': 16, 'batch_size': 64}. Best is trial 9 with value: 58.31845238095238.


  Validation ACC: 54.81
  Validation took: 0:00:01


Some weights of MyBertMaxPoolClassifier were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Training...

Average training loss: 0.71
  Training epoch took: 0:00:11

Running Validation...
  Validation ACC: 53.60
  Validation took: 0:00:01

Training...

Average training loss: 0.68
  Training epoch took: 0:00:22

Running Validation...
  Validation ACC: 53.27
  Validation took: 0:00:01

Training...

Average training loss: 0.66
  Training epoch took: 0:00:33

Running Validation...
  Validation ACC: 53.88
  Validation took: 0:00:01

Training...

Average training loss: 0.65
  Training epoch took: 0:00:45

Running Validation...


[I 2025-04-07 12:56:06,606] Trial 12 finished with value: 54.211309523809526 and parameters: {'lr': 1.2571336212999336e-05, 'n_epochs': 4, 'max_length': 16, 'batch_size': 64}. Best is trial 9 with value: 58.31845238095238.


  Validation ACC: 54.21
  Validation took: 0:00:01


Some weights of MyBertMaxPoolClassifier were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Training...

Average training loss: 0.71
  Training epoch took: 0:00:12

Running Validation...
  Validation ACC: 55.62
  Validation took: 0:00:00

Training...

Average training loss: 0.69
  Training epoch took: 0:00:24

Running Validation...


[I 2025-04-07 12:56:41,702] Trial 13 finished with value: 56.66335978835979 and parameters: {'lr': 8.979223534017073e-06, 'n_epochs': 2, 'max_length': 16, 'batch_size': 32}. Best is trial 9 with value: 58.31845238095238.


  Validation ACC: 56.66
  Validation took: 0:00:00


Some weights of MyBertMaxPoolClassifier were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Training...

Average training loss: 0.71
  Training epoch took: 0:00:11

Running Validation...
  Validation ACC: 51.67
  Validation took: 0:00:01

Training...

Average training loss: 0.66
  Training epoch took: 0:00:22

Running Validation...
  Validation ACC: 55.46
  Validation took: 0:00:01

Training...

Average training loss: 0.63
  Training epoch took: 0:00:33

Running Validation...
  Validation ACC: 56.73
  Validation took: 0:00:01

Training...

Average training loss: 0.61
  Training epoch took: 0:00:45

Running Validation...


[I 2025-04-07 12:57:36,697] Trial 14 finished with value: 56.08630952380953 and parameters: {'lr': 1.9296623020494705e-05, 'n_epochs': 4, 'max_length': 16, 'batch_size': 64}. Best is trial 9 with value: 58.31845238095238.


  Validation ACC: 56.09
  Validation took: 0:00:01


Some weights of MyBertMaxPoolClassifier were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Training...

Average training loss: 0.71
  Training epoch took: 0:00:12

Running Validation...
  Validation ACC: 56.15
  Validation took: 0:00:00

Training...

Average training loss: 0.69
  Training epoch took: 0:00:24

Running Validation...


[I 2025-04-07 12:58:10,878] Trial 15 finished with value: 56.66335978835979 and parameters: {'lr': 7.416392316285699e-06, 'n_epochs': 2, 'max_length': 64, 'batch_size': 32}. Best is trial 9 with value: 58.31845238095238.


  Validation ACC: 56.66
  Validation took: 0:00:00


Some weights of MyBertMaxPoolClassifier were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Training...

Average training loss: 0.71
  Training epoch took: 0:00:11

Running Validation...
  Validation ACC: 53.90
  Validation took: 0:00:01

Training...

Average training loss: 0.67
  Training epoch took: 0:00:22

Running Validation...


[I 2025-04-07 12:58:47,792] Trial 16 finished with value: 52.61904761904762 and parameters: {'lr': 1.8235090772763022e-05, 'n_epochs': 2, 'max_length': 16, 'batch_size': 64}. Best is trial 9 with value: 58.31845238095238.


  Validation ACC: 52.62
  Validation took: 0:00:01


Some weights of MyBertMaxPoolClassifier were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Training...

Average training loss: 0.71
  Training epoch took: 0:00:12

Running Validation...
  Validation ACC: 54.78
  Validation took: 0:00:00

Training...

Average training loss: 0.62
  Training epoch took: 0:00:24

Running Validation...
  Validation ACC: 53.21
  Validation took: 0:00:00

Training...

Average training loss: 0.56
  Training epoch took: 0:00:36

Running Validation...
  Validation ACC: 56.88
  Validation took: 0:00:00

Training...

Average training loss: 0.50
  Training epoch took: 0:00:48

Running Validation...


[I 2025-04-07 12:59:45,590] Trial 17 finished with value: 57.04365079365079 and parameters: {'lr': 2.765584874583562e-05, 'n_epochs': 4, 'max_length': 16, 'batch_size': 32}. Best is trial 9 with value: 58.31845238095238.


  Validation ACC: 57.04
  Validation took: 0:00:00


Some weights of MyBertMaxPoolClassifier were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Training...

Average training loss: 0.70
  Training epoch took: 0:00:12

Running Validation...
  Validation ACC: 56.86
  Validation took: 0:00:00

Training...

Average training loss: 0.62
  Training epoch took: 0:00:24

Running Validation...
  Validation ACC: 62.24
  Validation took: 0:00:00

Training...

Average training loss: 0.55
  Training epoch took: 0:00:36

Running Validation...
  Validation ACC: 59.29
  Validation took: 0:00:00

Training...

Average training loss: 0.50
  Training epoch took: 0:00:48

Running Validation...


[I 2025-04-07 13:00:51,874] Trial 18 finished with value: 59.639550264550266 and parameters: {'lr': 2.933404523818007e-05, 'n_epochs': 4, 'max_length': 64, 'batch_size': 32}. Best is trial 18 with value: 59.639550264550266.


  Validation ACC: 59.64
  Validation took: 0:00:00


Some weights of MyBertMaxPoolClassifier were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Training...

Average training loss: 0.73
  Training epoch took: 0:00:11

Running Validation...
  Validation ACC: 51.06
  Validation took: 0:00:01

Training...

Average training loss: 0.70
  Training epoch took: 0:00:22

Running Validation...
  Validation ACC: 53.87
  Validation took: 0:00:01

Training...

Average training loss: 0.70
  Training epoch took: 0:00:33

Running Validation...
  Validation ACC: 54.18
  Validation took: 0:00:01

Training...

Average training loss: 0.70
  Training epoch took: 0:00:45

Running Validation...


[I 2025-04-07 13:01:56,371] Trial 19 finished with value: 54.80654761904762 and parameters: {'lr': 3.0249242274371735e-06, 'n_epochs': 4, 'max_length': 64, 'batch_size': 64}. Best is trial 18 with value: 59.639550264550266.


  Validation ACC: 54.81
  Validation took: 0:00:01


Some weights of MyBertMaxPoolClassifier were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Training...

Average training loss: 0.72
  Training epoch took: 0:00:11

Running Validation...
  Validation ACC: 53.85
  Validation took: 0:00:01

Training...

Average training loss: 0.67
  Training epoch took: 0:00:22

Running Validation...
  Validation ACC: 55.13
  Validation took: 0:00:01

Training...

Average training loss: 0.64
  Training epoch took: 0:00:33

Running Validation...
  Validation ACC: 56.38
  Validation took: 0:00:01

Training...

Average training loss: 0.63
  Training epoch took: 0:00:45

Running Validation...


[I 2025-04-07 13:02:53,735] Trial 20 finished with value: 55.75892857142857 and parameters: {'lr': 1.685196833452587e-05, 'n_epochs': 4, 'max_length': 64, 'batch_size': 64}. Best is trial 18 with value: 59.639550264550266.


  Validation ACC: 55.76
  Validation took: 0:00:01


Some weights of MyBertMaxPoolClassifier were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Training...

Average training loss: 0.71
  Training epoch took: 0:00:12

Running Validation...
  Validation ACC: 59.46
  Validation took: 0:00:00

Training...

Average training loss: 0.63
  Training epoch took: 0:00:24

Running Validation...
  Validation ACC: 59.09
  Validation took: 0:00:00

Training...

Average training loss: 0.56
  Training epoch took: 0:00:36

Running Validation...
  Validation ACC: 60.68
  Validation took: 0:00:00

Training...

Average training loss: 0.52
  Training epoch took: 0:00:48

Running Validation...


[I 2025-04-07 13:03:52,519] Trial 21 finished with value: 61.37566137566137 and parameters: {'lr': 2.4385374317676526e-05, 'n_epochs': 4, 'max_length': 64, 'batch_size': 32}. Best is trial 21 with value: 61.37566137566137.


  Validation ACC: 61.38
  Validation took: 0:00:00


Some weights of MyBertMaxPoolClassifier were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Training...

Average training loss: 0.70
  Training epoch took: 0:00:12

Running Validation...
  Validation ACC: 59.11
  Validation took: 0:00:00

Training...

Average training loss: 0.63
  Training epoch took: 0:00:24

Running Validation...
  Validation ACC: 59.62
  Validation took: 0:00:00

Training...

Average training loss: 0.56
  Training epoch took: 0:00:36

Running Validation...
  Validation ACC: 61.03
  Validation took: 0:00:00

Training...

Average training loss: 0.52
  Training epoch took: 0:00:48

Running Validation...


[I 2025-04-07 13:04:52,284] Trial 22 finished with value: 61.02843915343915 and parameters: {'lr': 2.486931777944464e-05, 'n_epochs': 4, 'max_length': 64, 'batch_size': 32}. Best is trial 21 with value: 61.37566137566137.


  Validation ACC: 61.03
  Validation took: 0:00:00


Some weights of MyBertMaxPoolClassifier were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Training...

Average training loss: 0.70
  Training epoch took: 0:00:12

Running Validation...
  Validation ACC: 56.68
  Validation took: 0:00:00

Training...

Average training loss: 0.62
  Training epoch took: 0:00:24

Running Validation...
  Validation ACC: 59.28
  Validation took: 0:00:00

Training...

Average training loss: 0.54
  Training epoch took: 0:00:36

Running Validation...
  Validation ACC: 60.52
  Validation took: 0:00:00

Training...

Average training loss: 0.48
  Training epoch took: 0:00:48

Running Validation...


[I 2025-04-07 13:05:49,593] Trial 23 finished with value: 60.16865079365079 and parameters: {'lr': 2.8641112391992403e-05, 'n_epochs': 4, 'max_length': 64, 'batch_size': 32}. Best is trial 21 with value: 61.37566137566137.


  Validation ACC: 60.17
  Validation took: 0:00:00


In [None]:

# Train again with best parameters
lr = study.best_params["lr"]
n_epochs = study.best_params["n_epochs"]
max_length = study.best_params["max_length"]
batch_size = study.best_params["batch_size"]

model, loader, optimizer, scheduler = init_objects(lr, n_epochs, max_length, batch_size)
model.to(device)
val_acc, _ = train_eval_loop(model, loader, optimizer, scheduler, device, n_epochs)
# Obtain Test Results
# %%


Some weights of MyBertMaxPoolClassifier were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Training...

Average training loss: 0.71
  Training epoch took: 0:00:12

Running Validation...
  Validation ACC: 59.46
  Validation took: 0:00:00

Training...

Average training loss: 0.63
  Training epoch took: 0:00:24

Running Validation...
  Validation ACC: 59.09
  Validation took: 0:00:00

Training...

Average training loss: 0.56
  Training epoch took: 0:00:36

Running Validation...
  Validation ACC: 60.68
  Validation took: 0:00:00

Training...

Average training loss: 0.52
  Training epoch took: 0:00:48

Running Validation...
  Validation ACC: 61.38
  Validation took: 0:00:00


In [None]:
import pandas as pd

best_params = {
    "Learning Rate": study.best_params["lr"],
    "Epochs": study.best_params["n_epochs"],
    "Max Length": study.best_params["max_length"],
    "Batch Size": study.best_params["batch_size"]
}

best_params_df = pd.DataFrame([best_params])
print(best_params_df.to_string(index=False))


 Learning Rate  Epochs  Max Length  Batch Size
      0.000024       4          64          32


In [None]:

param_dict = {
    "lr": [1e-6, 3e-5],
    "n_epochs": [2,3,4],
    "max_length": [16, 32, 64],
    "batch_size": [16, 32,64],
}


class BertObjective:
    def __init__(self, d, device):
        self.d = d
        self.device = device

    def __call__(self, trial: optuna.trial.Trial):
        self.lr = trial.suggest_float("lr", self.d["lr"][0], self.d["lr"][1], log=True)
        self.n_epochs = trial.suggest_categorical("n_epochs", self.d["n_epochs"])
        self.max_length = trial.suggest_categorical("max_length", self.d["max_length"])
        self.batch_size = trial.suggest_categorical("batch_size", self.d["batch_size"])

        model, loader, optimizer, scheduler = init_objects(
            self.lr, self.n_epochs, self.max_length, self.batch_size
        )
        model.to(self.device)
        val_acc, _ = train_eval_loop(
            model, loader, optimizer, scheduler, self.device, self.n_epochs
        )

        return val_acc


device = torch.device("cuda")
study = optuna.create_study(study_name="Stduy 0", direction="maximize")
study.optimize(BertObjective(param_dict, device), n_trials=24)


[I 2025-04-07 16:44:21,399] A new study created in memory with name: Stduy 0
Some weights of AttentionPoolBertClassifier were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['attention_layer.bias', 'attention_layer.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Training...

Average training loss: 0.69
  Training epoch took: 0:00:13

Running Validation...
  Validation ACC: 55.56
  Validation took: 0:00:00

Training...

Average training loss: 0.68
  Training epoch took: 0:00:26

Running Validation...
  Validation ACC: 57.64
  Validation took: 0:00:00

Training...

Average training loss: 0.66
  Training epoch took: 0:00:40

Running Validation...


[I 2025-04-07 16:45:03,966] Trial 0 finished with value: 56.87500000000001 and parameters: {'lr': 4.360915124784374e-06, 'n_epochs': 3, 'max_length': 64, 'batch_size': 16}. Best is trial 0 with value: 56.87500000000001.


  Validation ACC: 56.88
  Validation took: 0:00:00


Some weights of AttentionPoolBertClassifier were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['attention_layer.bias', 'attention_layer.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Training...

Average training loss: 0.70
  Training epoch took: 0:00:11

Running Validation...
  Validation ACC: 58.66
  Validation took: 0:00:01

Training...

Average training loss: 0.69
  Training epoch took: 0:00:22

Running Validation...
  Validation ACC: 62.43
  Validation took: 0:00:01

Training...

Average training loss: 0.69
  Training epoch took: 0:00:33

Running Validation...
  Validation ACC: 62.74
  Validation took: 0:00:01

Training...

Average training loss: 0.69
  Training epoch took: 0:00:44

Running Validation...


[I 2025-04-07 16:45:52,250] Trial 1 finished with value: 60.848214285714285 and parameters: {'lr': 2.1024437666652437e-06, 'n_epochs': 4, 'max_length': 16, 'batch_size': 64}. Best is trial 1 with value: 60.848214285714285.


  Validation ACC: 60.85
  Validation took: 0:00:01


Some weights of AttentionPoolBertClassifier were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['attention_layer.bias', 'attention_layer.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Training...

Average training loss: 0.69
  Training epoch took: 0:00:12

Running Validation...
  Validation ACC: 55.31
  Validation took: 0:00:00

Training...

Average training loss: 0.68
  Training epoch took: 0:00:24

Running Validation...
  Validation ACC: 54.27
  Validation took: 0:00:00

Training...

Average training loss: 0.68
  Training epoch took: 0:00:36

Running Validation...
  Validation ACC: 58.61
  Validation took: 0:00:00

Training...

Average training loss: 0.67
  Training epoch took: 0:00:48

Running Validation...


[I 2025-04-07 16:46:43,201] Trial 2 finished with value: 57.55621693121693 and parameters: {'lr': 3.3969354542575265e-06, 'n_epochs': 4, 'max_length': 16, 'batch_size': 32}. Best is trial 1 with value: 60.848214285714285.


  Validation ACC: 57.56
  Validation took: 0:00:00


Some weights of AttentionPoolBertClassifier were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['attention_layer.bias', 'attention_layer.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Training...

Average training loss: 0.69
  Training epoch took: 0:00:13

Running Validation...
  Validation ACC: 54.37
  Validation took: 0:00:00

Training...

Average training loss: 0.68
  Training epoch took: 0:00:26

Running Validation...
  Validation ACC: 57.15
  Validation took: 0:00:00

Training...

Average training loss: 0.67
  Training epoch took: 0:00:39

Running Validation...


[I 2025-04-07 16:47:25,728] Trial 3 finished with value: 58.12500000000001 and parameters: {'lr': 3.5798365989873304e-06, 'n_epochs': 3, 'max_length': 16, 'batch_size': 16}. Best is trial 1 with value: 60.848214285714285.


  Validation ACC: 58.13
  Validation took: 0:00:00


Some weights of AttentionPoolBertClassifier were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['attention_layer.bias', 'attention_layer.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Training...

Average training loss: 0.69
  Training epoch took: 0:00:12

Running Validation...
  Validation ACC: 55.32
  Validation took: 0:00:00

Training...

Average training loss: 0.67
  Training epoch took: 0:00:24

Running Validation...
  Validation ACC: 57.41
  Validation took: 0:00:00

Training...

Average training loss: 0.64
  Training epoch took: 0:00:36

Running Validation...
  Validation ACC: 56.86
  Validation took: 0:00:00

Training...

Average training loss: 0.62
  Training epoch took: 0:00:48

Running Validation...


[I 2025-04-07 16:48:16,257] Trial 4 finished with value: 57.20899470899471 and parameters: {'lr': 8.180286803974599e-06, 'n_epochs': 4, 'max_length': 16, 'batch_size': 32}. Best is trial 1 with value: 60.848214285714285.


  Validation ACC: 57.21
  Validation took: 0:00:00


Some weights of AttentionPoolBertClassifier were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['attention_layer.bias', 'attention_layer.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Training...

Average training loss: 0.70
  Training epoch took: 0:00:12

Running Validation...
  Validation ACC: 53.57
  Validation took: 0:00:00

Training...

Average training loss: 0.69
  Training epoch took: 0:00:24

Running Validation...
  Validation ACC: 55.31
  Validation took: 0:00:00

Training...

Average training loss: 0.68
  Training epoch took: 0:00:36

Running Validation...


[I 2025-04-07 16:48:55,748] Trial 5 finished with value: 57.20899470899471 and parameters: {'lr': 3.199977084716165e-06, 'n_epochs': 3, 'max_length': 16, 'batch_size': 32}. Best is trial 1 with value: 60.848214285714285.


  Validation ACC: 57.21
  Validation took: 0:00:00


Some weights of AttentionPoolBertClassifier were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['attention_layer.bias', 'attention_layer.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Training...

Average training loss: 0.69
  Training epoch took: 0:00:13

Running Validation...
  Validation ACC: 62.15
  Validation took: 0:00:00

Training...

Average training loss: 0.53
  Training epoch took: 0:00:26

Running Validation...


[I 2025-04-07 16:49:25,186] Trial 6 finished with value: 59.30555555555556 and parameters: {'lr': 2.8042621727559996e-05, 'n_epochs': 2, 'max_length': 32, 'batch_size': 16}. Best is trial 1 with value: 60.848214285714285.


  Validation ACC: 59.31
  Validation took: 0:00:00


Some weights of AttentionPoolBertClassifier were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['attention_layer.bias', 'attention_layer.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Training...

Average training loss: 0.69
  Training epoch took: 0:00:11

Running Validation...
  Validation ACC: 53.27
  Validation took: 0:00:01

Training...

Average training loss: 0.67
  Training epoch took: 0:00:22

Running Validation...
  Validation ACC: 54.20
  Validation took: 0:00:01

Training...

Average training loss: 0.66
  Training epoch took: 0:00:33

Running Validation...


[I 2025-04-07 16:50:02,218] Trial 7 finished with value: 54.50892857142857 and parameters: {'lr': 1.0172115185003336e-05, 'n_epochs': 3, 'max_length': 32, 'batch_size': 64}. Best is trial 1 with value: 60.848214285714285.


  Validation ACC: 54.51
  Validation took: 0:00:01


Some weights of AttentionPoolBertClassifier were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['attention_layer.bias', 'attention_layer.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Training...

Average training loss: 0.69
  Training epoch took: 0:00:12

Running Validation...
  Validation ACC: 56.73
  Validation took: 0:00:00

Training...

Average training loss: 0.69
  Training epoch took: 0:00:24

Running Validation...


[I 2025-04-07 16:50:29,634] Trial 8 finished with value: 58.63095238095239 and parameters: {'lr': 3.4438951349849567e-06, 'n_epochs': 2, 'max_length': 64, 'batch_size': 32}. Best is trial 1 with value: 60.848214285714285.


  Validation ACC: 58.63
  Validation took: 0:00:00


Some weights of AttentionPoolBertClassifier were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['attention_layer.bias', 'attention_layer.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Training...

Average training loss: 0.69
  Training epoch took: 0:00:13

Running Validation...
  Validation ACC: 56.18
  Validation took: 0:00:00

Training...

Average training loss: 0.66
  Training epoch took: 0:00:26

Running Validation...
  Validation ACC: 59.58
  Validation took: 0:00:00

Training...

Average training loss: 0.63
  Training epoch took: 0:00:40

Running Validation...


[I 2025-04-07 16:51:12,147] Trial 9 finished with value: 59.166666666666664 and parameters: {'lr': 7.395349036197772e-06, 'n_epochs': 3, 'max_length': 32, 'batch_size': 16}. Best is trial 1 with value: 60.848214285714285.


  Validation ACC: 59.17
  Validation took: 0:00:00


Some weights of AttentionPoolBertClassifier were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['attention_layer.bias', 'attention_layer.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Training...

Average training loss: 0.70
  Training epoch took: 0:00:11

Running Validation...
  Validation ACC: 61.47
  Validation took: 0:00:01

Training...

Average training loss: 0.69
  Training epoch took: 0:00:22

Running Validation...
  Validation ACC: 61.16
  Validation took: 0:00:01

Training...

Average training loss: 0.69
  Training epoch took: 0:00:33

Running Validation...
  Validation ACC: 60.85
  Validation took: 0:00:01

Training...

Average training loss: 0.69
  Training epoch took: 0:00:44

Running Validation...


[I 2025-04-07 16:52:00,176] Trial 10 finished with value: 61.78571428571429 and parameters: {'lr': 1.0466689418484104e-06, 'n_epochs': 4, 'max_length': 16, 'batch_size': 64}. Best is trial 10 with value: 61.78571428571429.


  Validation ACC: 61.79
  Validation took: 0:00:01


Some weights of AttentionPoolBertClassifier were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['attention_layer.bias', 'attention_layer.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Training...

Average training loss: 0.70
  Training epoch took: 0:00:11

Running Validation...
  Validation ACC: 54.90
  Validation took: 0:00:01

Training...

Average training loss: 0.69
  Training epoch took: 0:00:22

Running Validation...
  Validation ACC: 56.46
  Validation took: 0:00:01

Training...

Average training loss: 0.69
  Training epoch took: 0:00:33

Running Validation...
  Validation ACC: 56.46
  Validation took: 0:00:01

Training...

Average training loss: 0.69
  Training epoch took: 0:00:44

Running Validation...


[I 2025-04-07 16:52:48,175] Trial 11 finished with value: 56.458333333333336 and parameters: {'lr': 1.050026477213232e-06, 'n_epochs': 4, 'max_length': 16, 'batch_size': 64}. Best is trial 10 with value: 61.78571428571429.


  Validation ACC: 56.46
  Validation took: 0:00:01


Some weights of AttentionPoolBertClassifier were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['attention_layer.bias', 'attention_layer.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Training...

Average training loss: 0.70
  Training epoch took: 0:00:11

Running Validation...
  Validation ACC: 54.90
  Validation took: 0:00:01

Training...

Average training loss: 0.69
  Training epoch took: 0:00:22

Running Validation...
  Validation ACC: 56.15
  Validation took: 0:00:01

Training...

Average training loss: 0.69
  Training epoch took: 0:00:33

Running Validation...
  Validation ACC: 56.46
  Validation took: 0:00:01

Training...

Average training loss: 0.69
  Training epoch took: 0:00:44

Running Validation...


[I 2025-04-07 16:53:35,546] Trial 12 finished with value: 56.458333333333336 and parameters: {'lr': 1.023325802148113e-06, 'n_epochs': 4, 'max_length': 16, 'batch_size': 64}. Best is trial 10 with value: 61.78571428571429.


  Validation ACC: 56.46
  Validation took: 0:00:01


Some weights of AttentionPoolBertClassifier were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['attention_layer.bias', 'attention_layer.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Training...

Average training loss: 0.70
  Training epoch took: 0:00:11

Running Validation...
  Validation ACC: 57.08
  Validation took: 0:00:01

Training...

Average training loss: 0.69
  Training epoch took: 0:00:22

Running Validation...
  Validation ACC: 54.24
  Validation took: 0:00:01

Training...

Average training loss: 0.69
  Training epoch took: 0:00:33

Running Validation...
  Validation ACC: 55.49
  Validation took: 0:00:01

Training...

Average training loss: 0.69
  Training epoch took: 0:00:44

Running Validation...


[I 2025-04-07 16:54:22,835] Trial 13 finished with value: 55.803571428571416 and parameters: {'lr': 1.75503850612844e-06, 'n_epochs': 4, 'max_length': 16, 'batch_size': 64}. Best is trial 10 with value: 61.78571428571429.


  Validation ACC: 55.80
  Validation took: 0:00:01


Some weights of AttentionPoolBertClassifier were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['attention_layer.bias', 'attention_layer.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Training...

Average training loss: 0.70
  Training epoch took: 0:00:11

Running Validation...
  Validation ACC: 54.55
  Validation took: 0:00:01

Training...

Average training loss: 0.69
  Training epoch took: 0:00:22

Running Validation...
  Validation ACC: 55.18
  Validation took: 0:00:01

Training...

Average training loss: 0.69
  Training epoch took: 0:00:33

Running Validation...
  Validation ACC: 55.80
  Validation took: 0:00:01

Training...

Average training loss: 0.69
  Training epoch took: 0:00:44

Running Validation...


[I 2025-04-07 16:55:10,326] Trial 14 finished with value: 56.74107142857142 and parameters: {'lr': 1.960365261746454e-06, 'n_epochs': 4, 'max_length': 16, 'batch_size': 64}. Best is trial 10 with value: 61.78571428571429.


  Validation ACC: 56.74
  Validation took: 0:00:01


Some weights of AttentionPoolBertClassifier were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['attention_layer.bias', 'attention_layer.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Training...

Average training loss: 0.70
  Training epoch took: 0:00:11

Running Validation...
  Validation ACC: 56.44
  Validation took: 0:00:01

Training...

Average training loss: 0.69
  Training epoch took: 0:00:22

Running Validation...
  Validation ACC: 54.55
  Validation took: 0:00:01

Training...

Average training loss: 0.69
  Training epoch took: 0:00:33

Running Validation...
  Validation ACC: 55.80
  Validation took: 0:00:01

Training...

Average training loss: 0.69
  Training epoch took: 0:00:44

Running Validation...


[I 2025-04-07 16:55:57,499] Trial 15 finished with value: 55.49107142857142 and parameters: {'lr': 1.816215147285858e-06, 'n_epochs': 4, 'max_length': 64, 'batch_size': 64}. Best is trial 10 with value: 61.78571428571429.


  Validation ACC: 55.49
  Validation took: 0:00:01


Some weights of AttentionPoolBertClassifier were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['attention_layer.bias', 'attention_layer.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Training...

Average training loss: 0.70
  Training epoch took: 0:00:11

Running Validation...
  Validation ACC: 56.77
  Validation took: 0:00:01

Training...

Average training loss: 0.69
  Training epoch took: 0:00:22

Running Validation...


[I 2025-04-07 16:56:22,563] Trial 16 finished with value: 56.77083333333333 and parameters: {'lr': 1.8183923861854142e-06, 'n_epochs': 2, 'max_length': 16, 'batch_size': 64}. Best is trial 10 with value: 61.78571428571429.


  Validation ACC: 56.77
  Validation took: 0:00:01


Some weights of AttentionPoolBertClassifier were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['attention_layer.bias', 'attention_layer.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Training...

Average training loss: 0.70
  Training epoch took: 0:00:11

Running Validation...
  Validation ACC: 47.01
  Validation took: 0:00:01

Training...

Average training loss: 0.69
  Training epoch took: 0:00:22

Running Validation...
  Validation ACC: 49.52
  Validation took: 0:00:01

Training...

Average training loss: 0.69
  Training epoch took: 0:00:33

Running Validation...
  Validation ACC: 50.46
  Validation took: 0:00:01

Training...

Average training loss: 0.69
  Training epoch took: 0:00:44

Running Validation...


[I 2025-04-07 16:57:09,882] Trial 17 finished with value: 49.82142857142857 and parameters: {'lr': 1.3188883435219846e-06, 'n_epochs': 4, 'max_length': 16, 'batch_size': 64}. Best is trial 10 with value: 61.78571428571429.


  Validation ACC: 49.82
  Validation took: 0:00:01


Some weights of AttentionPoolBertClassifier were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['attention_layer.bias', 'attention_layer.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Training...

Average training loss: 0.70
  Training epoch took: 0:00:11

Running Validation...
  Validation ACC: 54.55
  Validation took: 0:00:01

Training...

Average training loss: 0.69
  Training epoch took: 0:00:22

Running Validation...
  Validation ACC: 55.49
  Validation took: 0:00:01

Training...

Average training loss: 0.69
  Training epoch took: 0:00:33

Running Validation...
  Validation ACC: 55.18
  Validation took: 0:00:01

Training...

Average training loss: 0.69
  Training epoch took: 0:00:44

Running Validation...


[I 2025-04-07 16:57:57,268] Trial 18 finished with value: 56.42857142857143 and parameters: {'lr': 2.4293234909440323e-06, 'n_epochs': 4, 'max_length': 32, 'batch_size': 64}. Best is trial 10 with value: 61.78571428571429.


  Validation ACC: 56.43
  Validation took: 0:00:01


Some weights of AttentionPoolBertClassifier were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['attention_layer.bias', 'attention_layer.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Training...

Average training loss: 0.69
  Training epoch took: 0:00:11

Running Validation...
  Validation ACC: 54.84
  Validation took: 0:00:01

Training...

Average training loss: 0.66
  Training epoch took: 0:00:22

Running Validation...
  Validation ACC: 58.01
  Validation took: 0:00:01

Training...

Average training loss: 0.63
  Training epoch took: 0:00:33

Running Validation...
  Validation ACC: 59.26
  Validation took: 0:00:01

Training...

Average training loss: 0.60
  Training epoch took: 0:00:44

Running Validation...


[I 2025-04-07 16:58:45,838] Trial 19 finished with value: 59.86607142857142 and parameters: {'lr': 1.3629947563516654e-05, 'n_epochs': 4, 'max_length': 64, 'batch_size': 64}. Best is trial 10 with value: 61.78571428571429.


  Validation ACC: 59.87
  Validation took: 0:00:01


Some weights of AttentionPoolBertClassifier were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['attention_layer.bias', 'attention_layer.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Training...

Average training loss: 0.70
  Training epoch took: 0:00:11

Running Validation...
  Validation ACC: 55.52
  Validation took: 0:00:01

Training...

Average training loss: 0.69
  Training epoch took: 0:00:22

Running Validation...


[I 2025-04-07 16:59:10,626] Trial 20 finished with value: 55.833333333333336 and parameters: {'lr': 1.320972614455104e-06, 'n_epochs': 2, 'max_length': 16, 'batch_size': 64}. Best is trial 10 with value: 61.78571428571429.


  Validation ACC: 55.83
  Validation took: 0:00:01


Some weights of AttentionPoolBertClassifier were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['attention_layer.bias', 'attention_layer.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Training...

Average training loss: 0.69
  Training epoch took: 0:00:11

Running Validation...
  Validation ACC: 54.21
  Validation took: 0:00:01

Training...

Average training loss: 0.65
  Training epoch took: 0:00:22

Running Validation...
  Validation ACC: 58.30
  Validation took: 0:00:01

Training...

Average training loss: 0.60
  Training epoch took: 0:00:33

Running Validation...
  Validation ACC: 58.62
  Validation took: 0:00:01

Training...

Average training loss: 0.56
  Training epoch took: 0:00:44

Running Validation...


[I 2025-04-07 16:59:58,467] Trial 21 finished with value: 61.75595238095239 and parameters: {'lr': 1.573383649780384e-05, 'n_epochs': 4, 'max_length': 64, 'batch_size': 64}. Best is trial 10 with value: 61.78571428571429.


  Validation ACC: 61.76
  Validation took: 0:00:01


Some weights of AttentionPoolBertClassifier were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['attention_layer.bias', 'attention_layer.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Training...

Average training loss: 0.69
  Training epoch took: 0:00:11

Running Validation...
  Validation ACC: 56.41
  Validation took: 0:00:01

Training...

Average training loss: 0.64
  Training epoch took: 0:00:22

Running Validation...
  Validation ACC: 59.88
  Validation took: 0:00:01

Training...

Average training loss: 0.56
  Training epoch took: 0:00:33

Running Validation...
  Validation ACC: 58.63
  Validation took: 0:00:01

Training...

Average training loss: 0.48
  Training epoch took: 0:00:44

Running Validation...


[I 2025-04-07 17:00:46,167] Trial 22 finished with value: 59.25595238095238 and parameters: {'lr': 2.070094313904429e-05, 'n_epochs': 4, 'max_length': 64, 'batch_size': 64}. Best is trial 10 with value: 61.78571428571429.


  Validation ACC: 59.26
  Validation took: 0:00:01


Some weights of AttentionPoolBertClassifier were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['attention_layer.bias', 'attention_layer.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Training...

Average training loss: 0.70
  Training epoch took: 0:00:11

Running Validation...
  Validation ACC: 54.55
  Validation took: 0:00:01

Training...

Average training loss: 0.68
  Training epoch took: 0:00:22

Running Validation...
  Validation ACC: 56.12
  Validation took: 0:00:01

Training...

Average training loss: 0.68
  Training epoch took: 0:00:33

Running Validation...
  Validation ACC: 57.34
  Validation took: 0:00:01

Training...

Average training loss: 0.67
  Training epoch took: 0:00:44

Running Validation...


[I 2025-04-07 17:01:33,764] Trial 23 finished with value: 56.08630952380953 and parameters: {'lr': 5.329121742275755e-06, 'n_epochs': 4, 'max_length': 64, 'batch_size': 64}. Best is trial 10 with value: 61.78571428571429.


  Validation ACC: 56.09
  Validation took: 0:00:01


In [None]:

# Train again with best parameters
lr = study.best_params["lr"]
n_epochs = study.best_params["n_epochs"]
max_length = study.best_params["max_length"]
batch_size = study.best_params["batch_size"]

model, loader, optimizer, scheduler = init_objects(lr, n_epochs, max_length, batch_size)
model.to(device)
val_acc, _ = train_eval_loop(model, loader, optimizer, scheduler, device, n_epochs)
# Obtain Test Results
# %%


Some weights of AttentionPoolBertClassifier were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['attention_layer.bias', 'attention_layer.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Training...

Average training loss: 0.70
  Training epoch took: 0:00:11

Running Validation...
  Validation ACC: 54.90
  Validation took: 0:00:01

Training...

Average training loss: 0.69
  Training epoch took: 0:00:22

Running Validation...
  Validation ACC: 56.15
  Validation took: 0:00:01

Training...

Average training loss: 0.69
  Training epoch took: 0:00:33

Running Validation...
  Validation ACC: 56.46
  Validation took: 0:00:01

Training...

Average training loss: 0.69
  Training epoch took: 0:00:44

Running Validation...
  Validation ACC: 56.46
  Validation took: 0:00:01


In [None]:
import pandas as pd

best_params = {
    "Learning Rate": study.best_params["lr"],
    "Epochs": study.best_params["n_epochs"],
    "Max Length": study.best_params["max_length"],
    "Batch Size": study.best_params["batch_size"]
}

best_params_df = pd.DataFrame([best_params])
print(best_params_df.to_string(index=False))


 Learning Rate  Epochs  Max Length  Batch Size
      0.000001       4          16          64
