In [1]:
import torch
import numpy as np
import matplotlib.pyplot as plt

In [2]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers import DataCollatorWithPadding, get_scheduler
from datasets import load_dataset
from tqdm.auto import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
adept_data_path = "../datasets/adept/train-dev-test-split"

In [4]:
adept_data_path

'../datasets/adept/train-dev-test-split'

In [5]:
train_split = "train.json"
validation_split = "val.json"
test_split = "test.json"

In [6]:
data_files = {
    "train": "{}/{}".format(adept_data_path, train_split), 
    "validation": "{}/{}".format(adept_data_path, validation_split), 
    "test": "{}/{}".format(adept_data_path, test_split),
}

In [7]:
adept_dataset = load_dataset("json", data_files=data_files)
adept_dataset

DatasetDict({
    train: Dataset({
        features: ['sentence2', 'label', 'idx', 'sentence1', 'modifier', 'noun'],
        num_rows: 12892
    })
    validation: Dataset({
        features: ['sentence2', 'label', 'idx', 'sentence1', 'modifier', 'noun'],
        num_rows: 1611
    })
    test: Dataset({
        features: ['sentence2', 'label', 'idx', 'sentence1', 'modifier', 'noun'],
        num_rows: 1612
    })
})

In [8]:
models_dict = {
    "BERT": "bert-base-uncased",
    "ROBERTA": "grammarly/detexd-roberta-base",
    "DEBERTA": "sileod/deberta-v3-base-tasksource-nli"
}

In [9]:
params_dict = {
    "learning_rate"
}

In [10]:
# tokenized_dataset = tokenized_dataset.remove_columns(['sentence1', 'sentence2', 'idx'])
# tokenized_dataset = tokenized_dataset.rename_column("label", "labels")
# tokenized_dataset = tokenized_dataset.with_format("torch")
# tokenized_dataset

In [16]:
import evaluate

In [17]:
roc_auc =  evaluate.load("roc_auc", "multiclass")

In [41]:
def compute_metrics(eval_preds):
    logits, labels = eval_preds
    logits_tensor = torch.from_numpy(logits)
    probabilities = torch.nn.functional.softmax(logits_tensor, dim=-1)
    # preds = np.argmax(logits, axis=-1)
    return roc_auc.compute(prediction_scores=probabilities, references=labels, multi_class='ovo', average="macro")

In [49]:
import optuna
from transformers import Trainer, TrainingArguments, AutoModelForSequenceClassification, DataCollatorWithPadding
from datasets import load_dataset
from sklearn.metrics import roc_auc_score


def tokenize_sentences_function(item):
    return tokenizer(item['sentence1'], item['sentence2'], truncation=True)

def objective(trial):
    # Define hyperparameters to tune
    hyperparams = {
        "learning_rate": trial.suggest_float("learning_rate", 1e-5, 1e-3, log=True),
        "num_train_epochs": trial.suggest_int("num_train_epochs", 1, 5),
        # add other hyperparameters to tune
    }

    # Choose a model architecture
    model_name = trial.suggest_categorical("model_name", ["bert-base-uncased", "roberta-base", "distilbert-base-cased", "microsoft/deberta-base"])
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=5, ignore_mismatched_sizes=True)

    # Define a data collator with dynamic padding using the chosen tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    tokenized_dataset = adept_dataset.map(lambda x:tokenizer(x['sentence2'], truncation=True))
    tokenized_dataset = tokenized_dataset.remove_columns(['sentence1', 'sentence2', 'idx', 'modifier', 'noun'])
    tokenized_dataset = tokenized_dataset.rename_column("label", "labels")
    tokenized_dataset = tokenized_dataset.with_format("torch")
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    # Define Trainer and TrainingArguments
    trainer = Trainer(
        model=model,
        args=TrainingArguments(
            output_dir=f"./output_{model_name}",
            per_device_train_batch_size=32,
            per_device_eval_batch_size=32,
            learning_rate=hyperparams["learning_rate"],
            num_train_epochs=hyperparams["num_train_epochs"],
            evaluation_strategy="epoch",
            # add other training arguments
        ),
        data_collator=data_collator,
        train_dataset=tokenized_dataset["train"],
        eval_dataset=tokenized_dataset["validation"],
        compute_metrics=compute_metrics
    )

    # Train the model
    trainer.train()

    # evaluate
    results  = trainer.evaluate()
    # print(results.keys())

    custom_metric = results["eval_roc_auc"]

    # Return the negative value of the custom metric for optimization
    return -custom_metric

In [50]:
# Perform hyperparameter and model architecture optimization
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=20)

# Get the best hyperparameters and model architecture
best_params = study.best_params
best_model_name = best_params["model_name"]
print("Best Model Architecture:", best_model_name)
print("Best Hyperparameters:", best_params)

[I 2024-01-13 17:50:52,903] A new study created in memory with name: no-name-071ce96e-1909-4f77-b118-df3f362a0b5a
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['pre_classifier.weight', 'classifier.bias', 'pre_classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Roc Auc
1,No log,0.864196,0.673319
2,0.910400,0.862119,0.692026


Checkpoint destination directory ./output_distilbert-base-cased/checkpoint-500 already exists and is non-empty.Saving will proceed but saved results may be invalid.


[I 2024-01-13 17:52:34,138] Trial 0 finished with value: -0.692026069842889 and parameters: {'learning_rate': 2.1328016892378287e-05, 'num_train_epochs': 2, 'model_name': 'distilbert-base-cased'}. Best is trial 0 with value: -0.692026069842889.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['pre_classifier.weight', 'classifier.bias', 'pre_classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Roc Auc
1,No log,0.888242,0.705026
2,0.892500,0.886631,0.709638
3,0.668300,1.092049,0.713487
4,0.407300,1.271343,0.703551
5,0.223400,1.46738,0.680002


Checkpoint destination directory ./output_distilbert-base-cased/checkpoint-500 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./output_distilbert-base-cased/checkpoint-1000 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./output_distilbert-base-cased/checkpoint-1500 already exists and is non-empty.Saving will proceed but saved results may be invalid.


[I 2024-01-13 17:56:43,776] Trial 1 finished with value: -0.6800016693580846 and parameters: {'learning_rate': 7.787998718535026e-05, 'num_train_epochs': 5, 'model_name': 'distilbert-base-cased'}. Best is trial 0 with value: -0.692026069842889.
Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-base and are newly initialized: ['classifier.bias', 'pooler.dense.bias', 'pooler.dense.weight', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You're using a DebertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Roc Auc
1,No log,0.845369,0.7106


[I 2024-01-13 17:58:38,002] Trial 2 finished with value: -0.7105996720057149 and parameters: {'learning_rate': 4.996019112313136e-05, 'num_train_epochs': 1, 'model_name': 'microsoft/deberta-base'}. Best is trial 2 with value: -0.7105996720057149.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['pre_classifier.weight', 'classifier.bias', 'pre_classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Roc Auc
1,No log,0.876918,0.689127


[I 2024-01-13 17:59:25,037] Trial 3 finished with value: -0.6891274850721383 and parameters: {'learning_rate': 3.5117115361256974e-05, 'num_train_epochs': 1, 'model_name': 'distilbert-base-cased'}. Best is trial 2 with value: -0.7105996720057149.
Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-base and are newly initialized: ['classifier.bias', 'pooler.dense.bias', 'pooler.dense.weight', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You're using a DebertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Roc Auc
1,No log,0.8482,0.737409
2,0.887100,0.846205,0.757281
3,0.684400,0.900934,0.75995


[I 2024-01-13 18:05:02,960] Trial 4 finished with value: -0.7599503215129537 and parameters: {'learning_rate': 3.660515504756857e-05, 'num_train_epochs': 3, 'model_name': 'microsoft/deberta-base'}. Best is trial 4 with value: -0.7599503215129537.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['pre_classifier.weight', 'classifier.bias', 'pre_classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Roc Auc
1,No log,0.878758,0.690077
2,0.879300,0.884519,0.698981
3,0.657500,0.995414,0.703555


Checkpoint destination directory ./output_distilbert-base-cased/checkpoint-500 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./output_distilbert-base-cased/checkpoint-1000 already exists and is non-empty.Saving will proceed but saved results may be invalid.


[I 2024-01-13 18:07:30,810] Trial 5 finished with value: -0.7035546928444087 and parameters: {'learning_rate': 6.977912397791627e-05, 'num_train_epochs': 3, 'model_name': 'distilbert-base-cased'}. Best is trial 4 with value: -0.7599503215129537.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 1612/1612 [00:00<00:00, 4357.97 examples/s]
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Roc Auc
1,No log,1.033417,0.517506
2,1.029500,1.031787,0.516036


[I 2024-01-13 18:10:18,874] Trial 6 finished with value: -0.5160361744976243 and parameters: {'learning_rate': 0.00020634093868572215, 'num_train_epochs': 2, 'model_name': 'bert-base-uncased'}. Best is trial 4 with value: -0.7599503215129537.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Roc Auc
1,No log,0.865409,0.675589
2,0.871800,0.848267,0.724039
3,0.636800,0.993124,0.712438


Checkpoint destination directory ./output_bert-base-uncased/checkpoint-500 already exists and is non-empty.Saving will proceed but saved results may be invalid.


[I 2024-01-13 18:14:23,614] Trial 7 finished with value: -0.7124381628847892 and parameters: {'learning_rate': 5.764319012629987e-05, 'num_train_epochs': 3, 'model_name': 'bert-base-uncased'}. Best is trial 4 with value: -0.7599503215129537.
Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-base and are newly initialized: ['classifier.bias', 'pooler.dense.bias', 'pooler.dense.weight', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You're using a DebertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Roc Auc
1,No log,1.030701,0.519057


[I 2024-01-13 18:16:18,580] Trial 8 finished with value: -0.5190568423996229 and parameters: {'learning_rate': 0.0005314502765440309, 'num_train_epochs': 1, 'model_name': 'microsoft/deberta-base'}. Best is trial 4 with value: -0.7599503215129537.
Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-base and are newly initialized: ['classifier.bias', 'pooler.dense.bias', 'pooler.dense.weight', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You're using a DebertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Roc Auc
1,No log,1.038419,0.478918
2,1.032200,1.030147,0.502858


Checkpoint destination directory ./output_microsoft/deberta-base/checkpoint-500 already exists and is non-empty.Saving will proceed but saved results may be invalid.


[I 2024-01-13 18:20:03,539] Trial 9 finished with value: -0.502858117000346 and parameters: {'learning_rate': 0.00010322417877086908, 'num_train_epochs': 2, 'model_name': 'microsoft/deberta-base'}. Best is trial 4 with value: -0.7599503215129537.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.out_proj.bias', 'classifier.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Roc Auc
1,No log,0.867007,0.695138
2,0.933800,0.839341,0.720548
3,0.793100,0.861399,0.733721
4,0.731700,0.843649,0.741644
5,0.686300,0.863715,0.745604


[I 2024-01-13 18:26:42,102] Trial 10 finished with value: -0.7456042442381093 and parameters: {'learning_rate': 1.1215900331523441e-05, 'num_train_epochs': 5, 'model_name': 'roberta-base'}. Best is trial 4 with value: -0.7599503215129537.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.out_proj.bias', 'classifier.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Roc Auc
1,No log,0.869944,0.688167
2,0.915700,0.848099,0.714709
3,0.776700,0.878328,0.72888
4,0.699700,0.867371,0.735913
5,0.646700,0.88924,0.742465


Checkpoint destination directory ./output_roberta-base/checkpoint-500 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./output_roberta-base/checkpoint-1000 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./output_roberta-base/checkpoint-1500 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./output_roberta-base/checkpoint-2000 already exists and is non-empty.Saving will proceed but saved results may be invalid.


[I 2024-01-13 18:33:31,290] Trial 11 finished with value: -0.7424653478058556 and parameters: {'learning_rate': 1.4658434307770551e-05, 'num_train_epochs': 5, 'model_name': 'roberta-base'}. Best is trial 4 with value: -0.7599503215129537.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.out_proj.bias', 'classifier.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Roc Auc
1,No log,0.880474,0.673487
2,0.932500,0.851996,0.711468
3,0.797100,0.855457,0.723108
4,0.744500,0.864177,0.727564


Checkpoint destination directory ./output_roberta-base/checkpoint-500 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./output_roberta-base/checkpoint-1000 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./output_roberta-base/checkpoint-1500 already exists and is non-empty.Saving will proceed but saved results may be invalid.


[I 2024-01-13 18:38:58,247] Trial 12 finished with value: -0.7275640350048825 and parameters: {'learning_rate': 1.0168420054914464e-05, 'num_train_epochs': 4, 'model_name': 'roberta-base'}. Best is trial 4 with value: -0.7599503215129537.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.out_proj.bias', 'classifier.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Roc Auc
1,No log,0.867848,0.709797
2,0.904200,0.85792,0.730256
3,0.750700,0.93383,0.745106
4,0.638900,0.947494,0.750414


Checkpoint destination directory ./output_roberta-base/checkpoint-500 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./output_roberta-base/checkpoint-1000 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./output_roberta-base/checkpoint-1500 already exists and is non-empty.Saving will proceed but saved results may be invalid.


[I 2024-01-13 18:44:26,926] Trial 13 finished with value: -0.7504144280912507 and parameters: {'learning_rate': 2.7137532356898816e-05, 'num_train_epochs': 4, 'model_name': 'roberta-base'}. Best is trial 4 with value: -0.7599503215129537.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.out_proj.bias', 'classifier.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Roc Auc
1,No log,0.885584,0.69057
2,0.929500,0.86724,0.727951
3,0.770200,0.925921,0.738426
4,0.662100,0.946681,0.74044


Checkpoint destination directory ./output_roberta-base/checkpoint-500 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./output_roberta-base/checkpoint-1000 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./output_roberta-base/checkpoint-1500 already exists and is non-empty.Saving will proceed but saved results may be invalid.


[I 2024-01-13 18:49:51,881] Trial 14 finished with value: -0.7404396527958552 and parameters: {'learning_rate': 2.6079432974776775e-05, 'num_train_epochs': 4, 'model_name': 'roberta-base'}. Best is trial 4 with value: -0.7599503215129537.
Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-base and are newly initialized: ['classifier.bias', 'pooler.dense.bias', 'pooler.dense.weight', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You're using a DebertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Roc Auc
1,No log,1.036903,0.516155
2,1.021700,1.027086,0.540292
3,1.012500,1.03169,0.528124
4,1.010100,1.028735,0.532288


Checkpoint destination directory ./output_microsoft/deberta-base/checkpoint-500 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./output_microsoft/deberta-base/checkpoint-1000 already exists and is non-empty.Saving will proceed but saved results may be invalid.


[I 2024-01-13 18:57:20,529] Trial 15 finished with value: -0.5322878467485027 and parameters: {'learning_rate': 0.00019240689051403906, 'num_train_epochs': 4, 'model_name': 'microsoft/deberta-base'}. Best is trial 4 with value: -0.7599503215129537.
Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-base and are newly initialized: ['classifier.bias', 'pooler.dense.bias', 'pooler.dense.weight', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You're using a DebertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Roc Auc
1,No log,0.843363,0.730905
2,0.902500,0.851829,0.745196
3,0.705100,0.891495,0.754671


Checkpoint destination directory ./output_microsoft/deberta-base/checkpoint-500 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./output_microsoft/deberta-base/checkpoint-1000 already exists and is non-empty.Saving will proceed but saved results may be invalid.


[I 2024-01-13 19:02:58,522] Trial 16 finished with value: -0.7546711875367471 and parameters: {'learning_rate': 2.9552575558641492e-05, 'num_train_epochs': 3, 'model_name': 'microsoft/deberta-base'}. Best is trial 4 with value: -0.7599503215129537.
Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-base and are newly initialized: ['classifier.bias', 'pooler.dense.bias', 'pooler.dense.weight', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You're using a DebertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Roc Auc
1,No log,1.035377,0.524798
2,1.026000,1.027741,0.537338
3,1.012800,1.02908,0.530874


Checkpoint destination directory ./output_microsoft/deberta-base/checkpoint-500 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./output_microsoft/deberta-base/checkpoint-1000 already exists and is non-empty.Saving will proceed but saved results may be invalid.


[I 2024-01-13 19:08:36,241] Trial 17 finished with value: -0.530874054012001 and parameters: {'learning_rate': 0.00017446605467910054, 'num_train_epochs': 3, 'model_name': 'microsoft/deberta-base'}. Best is trial 4 with value: -0.7599503215129537.
Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-base and are newly initialized: ['classifier.bias', 'pooler.dense.bias', 'pooler.dense.weight', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You're using a DebertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Roc Auc
1,No log,1.032278,0.492499
2,1.022000,1.030572,0.492022


Checkpoint destination directory ./output_microsoft/deberta-base/checkpoint-500 already exists and is non-empty.Saving will proceed but saved results may be invalid.


[I 2024-01-13 19:12:23,945] Trial 18 finished with value: -0.49202172380338993 and parameters: {'learning_rate': 0.000934428012268065, 'num_train_epochs': 2, 'model_name': 'microsoft/deberta-base'}. Best is trial 4 with value: -0.7599503215129537.
Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-base and are newly initialized: ['classifier.bias', 'pooler.dense.bias', 'pooler.dense.weight', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You're using a DebertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Roc Auc
1,No log,0.846536,0.735541
2,0.877200,0.83809,0.747011
3,0.690600,0.902258,0.743289


Checkpoint destination directory ./output_microsoft/deberta-base/checkpoint-500 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./output_microsoft/deberta-base/checkpoint-1000 already exists and is non-empty.Saving will proceed but saved results may be invalid.


[I 2024-01-13 19:18:26,935] Trial 19 finished with value: -0.7432886728267634 and parameters: {'learning_rate': 3.709945130637713e-05, 'num_train_epochs': 3, 'model_name': 'microsoft/deberta-base'}. Best is trial 4 with value: -0.7599503215129537.


Best Model Architecture: microsoft/deberta-base
Best Hyperparameters: {'learning_rate': 3.660515504756857e-05, 'num_train_epochs': 3, 'model_name': 'microsoft/deberta-base'}


In [55]:
study.

<optuna.study.study.Study at 0x7ff8b33b56f0>

In [54]:

# Fine-tune the best model with the best hyperparameters
final_model = AutoModelForSequenceClassification.from_pretrained(best_model_name, num_labels=5, ignore_mismatched_sizes=True)
final_tokenizer = AutoTokenizer.from_pretrained(best_model_name)
final_data_collator = DataCollatorWithPadding(tokenizer=final_tokenizer)
final_tokenized_dataset = adept_dataset.map(lambda x:final_tokenizer(x['sentence2'], truncation=True))
final_tokenized_dataset = final_tokenized_dataset.remove_columns(['sentence1', 'sentence2', 'idx', 'modifier', 'noun'])
final_tokenized_dataset = final_tokenized_dataset.rename_column("label", "labels")
final_tokenized_dataset = final_tokenized_dataset.with_format("torch")

final_trainer = Trainer(
    model=final_model,
    args=TrainingArguments(
        output_dir=f"./final_output_{best_model_name}",
        learning_rate=best_params["learning_rate"],
        num_train_epochs=best_params["num_train_epochs"],
        evaluation_strategy="epoch",
        # add other training arguments
    ),
    data_collator=final_data_collator,
    train_dataset=final_tokenized_dataset["train"],
    eval_dataset=final_tokenized_dataset["validation"],
    compute_metrics=compute_metrics
)

final_trainer.train()

Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-base and are newly initialized: ['classifier.bias', 'pooler.dense.bias', 'pooler.dense.weight', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [None]:
training_args = TrainingArguments(
    output_dir="./output",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=1e-5,
    num_train_epochs=3,
    save_steps=1000,  # adjust as needed
    save_total_limit=2,
    logging_steps=100,  # adjust as needed
)
