In [1]:
import sys
import os

# Get the root directory
root_dir = os.path.abspath(os.path.join(os.getcwd(), ".."))

# Add the src directory to sys.path
src_dir = os.path.join(root_dir, "src")
if src_dir not in sys.path:
    sys.path.append(src_dir)

In [2]:
from dataloader import DataLoader
from trainer import WeightedTrainer, compute_metrics
from utils import *

from typing import List
import yaml

import optuna
from optuna import Trial

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    DataCollatorWithPadding,
)

from peft import (
    LoraConfig,
    TaskType,
    get_peft_model
)


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
config_file = "../batch_config.yaml"
configs = load_config(config_file)

In [4]:
tokenizer = AutoTokenizer.from_pretrained(configs['checkpoint'])
if configs['decoder_only']:
    tokenizer.pad_token_id = tokenizer.eos_token_id
    tokenizer.pad_token = tokenizer.eos_token


data_collator = DataCollatorWithPadding(tokenizer)
dataset = DataLoader().load('../data/', tokenizer)


Map:   0%|                                          | 0/42953 [00:00<?, ? examples/s]


Map:  16%|████▌                       | 7000/42953 [00:00<00:00, 46380.24 examples/s]


Map:  40%|██████████▋                | 17000/42953 [00:00<00:00, 71510.80 examples/s]


Map:  65%|█████████████████▌         | 28000/42953 [00:00<00:00, 82474.58 examples/s]


Map:  95%|█████████████████████████▊ | 41000/42953 [00:00<00:00, 94864.29 examples/s]


Map: 100%|███████████████████████████| 42953/42953 [00:00<00:00, 86797.71 examples/s]





Map:   0%|                                         | 0/227328 [00:00<?, ? examples/s]


Map:   1%|▎                          | 3000/227328 [00:00<00:07, 28408.67 examples/s]


Map:   6%|█▌                        | 14000/227328 [00:00<00:03, 69447.52 examples/s]


Map:  11%|██▊                       | 25000/227328 [00:00<00:02, 83466.89 examples/s]


Map:  15%|████                      | 35000/227328 [00:00<00:02, 86348.07 examples/s]


Map:  19%|█████                     | 44000/227328 [00:00<00:02, 71960.90 examples/s]


Map:  24%|██████▎                   | 55000/227328 [00:00<00:02, 68866.45 examples/s]


Map:  29%|███████▍                  | 65000/227328 [00:00<00:02, 71848.71 examples/s]


Map:  32%|████████▎                 | 73000/227328 [00:01<00:02, 64150.76 examples/s]


Map:  35%|█████████▏                | 80000/227328 [00:01<00:02, 63514.69 examples/s]


Map:  40%|██████████▍               | 91000/227328 [00:01<00:01, 74462.76 examples/s]


Map:  46%|███████████▍             | 104000/227328 [00:01<00:01, 74461.95 examples/s]


Map:  52%|████████████▉            | 118000/227328 [00:01<00:01, 85955.35 examples/s]


Map:  57%|██████████████▎          | 130000/227328 [00:01<00:01, 71461.60 examples/s]


Map:  63%|███████████████▊         | 144000/227328 [00:01<00:00, 84874.43 examples/s]


Map:  68%|█████████████████        | 155000/227328 [00:02<00:00, 76864.78 examples/s]


Map:  73%|██████████████████▎      | 166000/227328 [00:02<00:00, 72476.44 examples/s]


Map:  79%|███████████████████▋     | 179000/227328 [00:02<00:00, 83025.28 examples/s]


Map:  84%|████████████████████▉    | 190000/227328 [00:02<00:00, 87085.46 examples/s]


Map:  89%|██████████████████████▎  | 203000/227328 [00:02<00:00, 78225.60 examples/s]


Map:  95%|███████████████████████▋ | 215000/227328 [00:02<00:00, 76479.85 examples/s]


Map: 100%|█████████████████████████| 227328/227328 [00:03<00:00, 72471.98 examples/s]


Map: 100%|█████████████████████████| 227328/227328 [00:03<00:00, 74753.96 examples/s]





Map:   0%|                                          | 0/36438 [00:00<?, ? examples/s]


Map:  33%|████████▌                 | 12000/36438 [00:00<00:00, 107777.93 examples/s]


Map:  69%|██████████████████▌        | 25000/36438 [00:00<00:00, 91184.71 examples/s]


Map:  96%|█████████████████████████▉ | 35000/36438 [00:00<00:00, 83002.60 examples/s]


Map: 100%|███████████████████████████| 36438/36438 [00:00<00:00, 85511.74 examples/s]




In [5]:
def objective(trail: Trial, config: Dict):

    lr = trail.suggest_categorical("learning_rate", config['lr'])
    batch_size = trail.suggest_categorical("batch_size", config['batch_size'])

    out_dir = f"../{config['cache_dir']}/{config['model_name']}/lr_{trail.params['learning_rate']}_bsz_{trail.params['batch_size']}"

    train_args = TrainingArguments(
        output_dir=out_dir,
        eval_strategy='epoch',
        save_strategy='epoch',
        learning_rate=lr,
        weight_decay=0.001,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=config['epochs'],
        save_total_limit=1,
        load_best_model_at_end=True,
        metric_for_best_model='eval_F1',
        greater_is_better=True
    )

    peft_config = LoraConfig(
        task_type=TaskType.SEQ_CLS,
        inference_mode=False,
        r=3,
        lora_alpha=16,
        lora_dropout=0.1
    )

    model = AutoModelForSequenceClassification.from_pretrained(config['checkpoint'])
    model = get_peft_model(model, peft_config)
    if config['decoder_only']:
        model.config.pad_token_id = model.config.eos_token_id

    trainer = WeightedTrainer(
        model=model,
        args=train_args,
        train_dataset=dataset['train'],
        eval_dataset=dataset['dev'],
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics
    )

    trainer.train()

    trainer.save_model(out_dir)

    eval_result = trainer.evaluate()

    return eval_result['eval_loss']

In [6]:
study = optuna.create_study(direction="minimize")
study.optimize(lambda trial: objective(trial, configs), n_trials=16)

print("Best hyperparameters:", study.best_params)

[I 2025-02-22 00:26:37,967] A new study created in memory with name: no-name-8cfbb771-b084-4c77-847b-4420d82aea22


Some weights of Qwen2ForSequenceClassification were not initialized from the model checkpoint at Qwen/Qwen2.5-1.5B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  super().__init__(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Acc,Precision,F1
1,0.1025,0.192814,{'accuracy': 0.9347659037268785},{'precision': 0.7649164677804295},0.518923
2,0.085,0.239274,{'accuracy': 0.9400076842856359},{'precision': 0.7799688635184224},0.578968


[I 2025-02-22 00:37:18,058] Trial 0 finished with value: 0.23927371203899384 and parameters: {'learning_rate': 0.0003, 'batch_size': 64}. Best is trial 0 with value: 0.23927371203899384.


Some weights of Qwen2ForSequenceClassification were not initialized from the model checkpoint at Qwen/Qwen2.5-1.5B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  super().__init__(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Acc,Precision,F1
1,0.1738,0.271533,{'accuracy': 0.9230748120094407},{'precision': 0.71},0.357847
2,0.1419,0.313148,{'accuracy': 0.92417256710028},{'precision': 0.7127118644067797},0.378403


[I 2025-02-22 00:56:24,315] Trial 1 finished with value: 0.31314781308174133 and parameters: {'learning_rate': 0.001, 'batch_size': 16}. Best is trial 0 with value: 0.23927371203899384.


Some weights of Qwen2ForSequenceClassification were not initialized from the model checkpoint at Qwen/Qwen2.5-1.5B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  super().__init__(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Acc,Precision,F1
1,0.1014,0.202302,{'accuracy': 0.9385806026675448},{'precision': 0.7385044124477473},0.586932
2,0.0856,0.248294,{'accuracy': 0.9391294802129645},{'precision': 0.7650632911392405},0.576718


[I 2025-02-22 01:07:00,460] Trial 2 finished with value: 0.20230181515216827 and parameters: {'learning_rate': 0.0003, 'batch_size': 64}. Best is trial 2 with value: 0.20230181515216827.


Some weights of Qwen2ForSequenceClassification were not initialized from the model checkpoint at Qwen/Qwen2.5-1.5B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  super().__init__(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Acc,Precision,F1
1,0.1586,0.280023,{'accuracy': 0.939815577144739},{'precision': 0.7800417972831766},0.576559
2,0.119,0.33288,{'accuracy': 0.9381140567539382},{'precision': 0.7744565217391305},0.558276


[I 2025-02-22 01:39:24,451] Trial 3 finished with value: 0.2800227999687195 and parameters: {'learning_rate': 0.0001, 'batch_size': 8}. Best is trial 2 with value: 0.20230181515216827.


Some weights of Qwen2ForSequenceClassification were not initialized from the model checkpoint at Qwen/Qwen2.5-1.5B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  super().__init__(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Acc,Precision,F1
1,0.1286,0.222027,{'accuracy': 0.9374279598221636},{'precision': 0.8694673668417104},0.504132
2,0.0954,0.208937,{'accuracy': 0.9433283934354245},{'precision': 0.8128258602711157},0.601582


[I 2025-02-22 01:50:00,188] Trial 4 finished with value: 0.20893679559230804 and parameters: {'learning_rate': 0.001, 'batch_size': 64}. Best is trial 2 with value: 0.20230181515216827.


Some weights of Qwen2ForSequenceClassification were not initialized from the model checkpoint at Qwen/Qwen2.5-1.5B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  super().__init__(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Acc,Precision,F1
1,0.2089,0.281599,{'accuracy': 0.9282617048136561},{'precision': 0.7784431137724551},0.410465
2,0.1517,0.358058,{'accuracy': 0.932625281299742},{'precision': 0.7947598253275109},0.470791


[I 2025-02-22 02:22:26,830] Trial 5 finished with value: 0.35805845260620117 and parameters: {'learning_rate': 0.0005, 'batch_size': 8}. Best is trial 2 with value: 0.20230181515216827.


Some weights of Qwen2ForSequenceClassification were not initialized from the model checkpoint at Qwen/Qwen2.5-1.5B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  super().__init__(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Acc,Precision,F1
1,0.1636,0.256392,{'accuracy': 0.9302102200998957},{'precision': 0.7426075268817204},0.464969
2,0.1211,0.312582,{'accuracy': 0.932625281299742},{'precision': 0.753125},0.495375


[I 2025-02-22 02:41:32,931] Trial 6 finished with value: 0.3125820457935333 and parameters: {'learning_rate': 0.0005, 'batch_size': 16}. Best is trial 2 with value: 0.20230181515216827.


Some weights of Qwen2ForSequenceClassification were not initialized from the model checkpoint at Qwen/Qwen2.5-1.5B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  super().__init__(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Acc,Precision,F1
1,0.2321,0.273037,{'accuracy': 0.9193698885778583},{'precision': 0.693950177935943},0.28481
2,0.1562,0.382571,{'accuracy': 0.9215928426368077},{'precision': 0.7207792207792207},0.317976


[I 2025-02-22 03:13:53,437] Trial 7 finished with value: 0.382571280002594 and parameters: {'learning_rate': 0.001, 'batch_size': 8}. Best is trial 2 with value: 0.20230181515216827.


Some weights of Qwen2ForSequenceClassification were not initialized from the model checkpoint at Qwen/Qwen2.5-1.5B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  super().__init__(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Acc,Precision,F1
1,0.1014,0.202302,{'accuracy': 0.9385806026675448},{'precision': 0.7385044124477473},0.586932
2,0.0856,0.248294,{'accuracy': 0.9391294802129645},{'precision': 0.7650632911392405},0.576718


[I 2025-02-22 03:24:29,497] Trial 8 finished with value: 0.20230181515216827 and parameters: {'learning_rate': 0.0003, 'batch_size': 64}. Best is trial 2 with value: 0.20230181515216827.


Some weights of Qwen2ForSequenceClassification were not initialized from the model checkpoint at Qwen/Qwen2.5-1.5B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  super().__init__(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Acc,Precision,F1
1,0.2089,0.281599,{'accuracy': 0.9282617048136561},{'precision': 0.7784431137724551},0.410465
2,0.1517,0.358058,{'accuracy': 0.932625281299742},{'precision': 0.7947598253275109},0.470791


[I 2025-02-22 03:56:59,738] Trial 9 finished with value: 0.35805845260620117 and parameters: {'learning_rate': 0.0005, 'batch_size': 8}. Best is trial 2 with value: 0.20230181515216827.


Some weights of Qwen2ForSequenceClassification were not initialized from the model checkpoint at Qwen/Qwen2.5-1.5B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  super().__init__(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Acc,Precision,F1
1,0.1088,0.208127,{'accuracy': 0.9395136944947582},{'precision': 0.7532219570405728},0.588806
2,0.0855,0.264363,{'accuracy': 0.9364399802404083},{'precision': 0.7682306387789711},0.539928


[I 2025-02-22 04:10:01,259] Trial 10 finished with value: 0.20812666416168213 and parameters: {'learning_rate': 0.0003, 'batch_size': 32}. Best is trial 2 with value: 0.20230181515216827.


Some weights of Qwen2ForSequenceClassification were not initialized from the model checkpoint at Qwen/Qwen2.5-1.5B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  super().__init__(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Acc,Precision,F1
1,0.1014,0.202302,{'accuracy': 0.9385806026675448},{'precision': 0.7385044124477473},0.586932
2,0.0856,0.248294,{'accuracy': 0.9391294802129645},{'precision': 0.7650632911392405},0.576718


[I 2025-02-22 04:20:37,815] Trial 11 finished with value: 0.20230181515216827 and parameters: {'learning_rate': 0.0003, 'batch_size': 64}. Best is trial 2 with value: 0.20230181515216827.


Some weights of Qwen2ForSequenceClassification were not initialized from the model checkpoint at Qwen/Qwen2.5-1.5B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  super().__init__(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Acc,Precision,F1
1,0.1014,0.202302,{'accuracy': 0.9385806026675448},{'precision': 0.7385044124477473},0.586932
2,0.0856,0.248294,{'accuracy': 0.9391294802129645},{'precision': 0.7650632911392405},0.576718


[I 2025-02-22 04:31:14,282] Trial 12 finished with value: 0.20230181515216827 and parameters: {'learning_rate': 0.0003, 'batch_size': 64}. Best is trial 2 with value: 0.20230181515216827.


Some weights of Qwen2ForSequenceClassification were not initialized from the model checkpoint at Qwen/Qwen2.5-1.5B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  super().__init__(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Acc,Precision,F1
1,0.1014,0.202302,{'accuracy': 0.9385806026675448},{'precision': 0.7385044124477473},0.586932
2,0.0856,0.248294,{'accuracy': 0.9391294802129645},{'precision': 0.7650632911392405},0.576718


[I 2025-02-22 04:41:50,023] Trial 13 finished with value: 0.20230181515216827 and parameters: {'learning_rate': 0.0003, 'batch_size': 64}. Best is trial 2 with value: 0.20230181515216827.


Some weights of Qwen2ForSequenceClassification were not initialized from the model checkpoint at Qwen/Qwen2.5-1.5B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  super().__init__(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Acc,Precision,F1
1,0.1047,0.21897,{'accuracy': 0.938306163894835},{'precision': 0.7569479535118747},0.57132
2,0.0867,0.273796,{'accuracy': 0.9353696690268402},{'precision': 0.7508269018743109},0.536326


[I 2025-02-22 04:54:51,815] Trial 14 finished with value: 0.2189699113368988 and parameters: {'learning_rate': 0.0001, 'batch_size': 32}. Best is trial 2 with value: 0.20230181515216827.


Some weights of Qwen2ForSequenceClassification were not initialized from the model checkpoint at Qwen/Qwen2.5-1.5B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  super().__init__(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Acc,Precision,F1
1,0.1014,0.202302,{'accuracy': 0.9385806026675448},{'precision': 0.7385044124477473},0.586932
2,0.0856,0.248294,{'accuracy': 0.9391294802129645},{'precision': 0.7650632911392405},0.576718


[I 2025-02-22 05:05:27,761] Trial 15 finished with value: 0.20230181515216827 and parameters: {'learning_rate': 0.0003, 'batch_size': 64}. Best is trial 2 with value: 0.20230181515216827.


Best hyperparameters: {'learning_rate': 0.0003, 'batch_size': 64}
