In [5]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [6]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [7]:
from transformers import set_seed
set_seed(1729) # TODO: check this does indeed do for PyTorch and NumPy etc.

In [8]:
!pip install seqeval

Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25ldone
[?25h  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16162 sha256=7ab69a3ad494e1b0c2360351713f03ad39c4dcbeca21dbf4ed3d3c0bb3830a10
  Stored in directory: /root/.cache/pip/wheels/1a/67/4a/ad4082dd7dfc30f2abfe4d80a2ed5926a506eb8a972b4767fa
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2


In [9]:
import torch

from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer, TrainingArguments
from datasets import load_dataset, load_metric, Dataset, DatasetDict
from seqeval.metrics import f1_score, precision_score, recall_score, classification_report

In [10]:
# Check CUDA stuff
print("CUDA available:", torch.cuda.is_available())
print("Current device index:", torch.cuda.current_device())
print("Device name:", torch.cuda.get_device_name(torch.cuda.current_device()))

CUDA available: True
Current device index: 0
Device name: Tesla P100-PCIE-16GB


In [11]:
raw_dataset = load_dataset("conll2003")

raw_dataset

Downloading builder script:   0%|          | 0.00/2.58k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.62k [00:00<?, ?B/s]

Downloading and preparing dataset conll2003/conll2003 (download: 959.94 KiB, generated: 9.78 MiB, post-processed: Unknown size, total: 10.72 MiB) to /root/.cache/huggingface/datasets/conll2003/conll2003/1.0.0/63f4ebd1bcb7148b1644497336fd74643d4ce70123334431a3c053b7ee4e96ee...


Downloading data:   0%|          | 0.00/983k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/14042 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3251 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3454 [00:00<?, ? examples/s]

Dataset conll2003 downloaded and prepared to /root/.cache/huggingface/datasets/conll2003/conll2003/1.0.0/63f4ebd1bcb7148b1644497336fd74643d4ce70123334431a3c053b7ee4e96ee. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 14042
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3251
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3454
    })
})

In [12]:
for k in ["train","validation","test"]:
    raw_dataset[k] = raw_dataset[k].remove_columns(["pos_tags", "chunk_tags"])

raw_dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 14042
    })
    validation: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 3251
    })
    test: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 3454
    })
})

In [13]:
raw_dataset["train"][0]

{'id': '0',
 'tokens': ['EU',
  'rejects',
  'German',
  'call',
  'to',
  'boycott',
  'British',
  'lamb',
  '.'],
 'ner_tags': [3, 0, 7, 0, 0, 0, 7, 0, 0]}

In [14]:
# check features if they are specified
raw_dataset["train"].features

{'id': Value(dtype='string', id=None),
 'tokens': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),
 'ner_tags': Sequence(feature=ClassLabel(num_classes=9, names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC'], id=None), length=-1, id=None)}

In [15]:
# assign the names of the possible labels/"ner_tags" as label_list
label_list = raw_dataset["train"].features["ner_tags"].feature.names
label_list

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']

In [16]:
pretrained_checkpoint = "distilbert-base-uncased"

tokenizer = AutoTokenizer.from_pretrained(pretrained_checkpoint)

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [17]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    labels = []
    for i, label in enumerate(examples[f"ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map tokens to their respective word.
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:  # Set the special tokens to -100.
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:  # Only label the first token of a given word.
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [18]:
tokenized_dataset = raw_dataset.map(tokenize_and_align_labels, batched=True)


print(tokenized_dataset)
print(tokenized_dataset["train"])
print(tokenized_dataset["train"][0])

  0%|          | 0/15 [00:00<?, ?ba/s]

  0%|          | 0/4 [00:00<?, ?ba/s]

  0%|          | 0/4 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'ner_tags', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 14042
    })
    validation: Dataset({
        features: ['id', 'tokens', 'ner_tags', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 3251
    })
    test: Dataset({
        features: ['id', 'tokens', 'ner_tags', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 3454
    })
})
Dataset({
    features: ['id', 'tokens', 'ner_tags', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 14042
})
{'id': '0', 'tokens': ['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.'], 'ner_tags': [3, 0, 7, 0, 0, 0, 7, 0, 0], 'input_ids': [101, 7327, 19164, 2446, 2655, 2000, 17757, 2329, 12559, 1012, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'labels': [-100, 3, 0, 7, 0, 0, 0, 7, 0, 0, -100]}


In [19]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [20]:
# TODO: !pip install transformers seqeval[gpu]
# check if need to install GPU version of seqeval
# found here: https://github.com/NielsRogge/Transformers-Tutorials/blob/master/BERT/Custom_Named_Entity_Recognition_with_BERT.ipynb

from datasets import load_metric
metric = load_metric("seqeval")

Downloading builder script:   0%|          | 0.00/2.47k [00:00<?, ?B/s]

In [21]:
def compute_metrics(eval_prediction_object):
    predictions, labels = eval_prediction_object
    predictions = np.argmax(predictions, axis=2)

    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    
    res = {
        "overall_precision": results["overall_precision"],
        "overall_recall": results["overall_recall"],
        "overall_f1": results["overall_f1"],
        "overall_accuracy": results["overall_accuracy"],
    }
    
    # ADD ENTITY LEVEL STUFF
    # YOU CAN FIND THIS IS THE results OBJECT ALSO
    for k in results.keys():
        if k not in res.keys():
            for poss_metric in results[k]:
                res[f"{k}_{poss_metric}"] = results[k][poss_metric]
    
    return res

In [22]:
print(label_list)

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']


In [23]:
id2label = {i:label for i, label in enumerate(label_list)}
label2id = {v:k for k,v in id2label.items()}

id2label

{0: 'O',
 1: 'B-PER',
 2: 'I-PER',
 3: 'B-ORG',
 4: 'I-ORG',
 5: 'B-LOC',
 6: 'I-LOC',
 7: 'B-MISC',
 8: 'I-MISC'}

In [24]:
label2id

{'O': 0,
 'B-PER': 1,
 'I-PER': 2,
 'B-ORG': 3,
 'I-ORG': 4,
 'B-LOC': 5,
 'I-LOC': 6,
 'B-MISC': 7,
 'I-MISC': 8}

In [25]:
num_labels = len(label2id.keys())
# ------- TRAINING ----------
model = AutoModelForTokenClassification.from_pretrained(
    pretrained_checkpoint,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id
)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [26]:
training_args = TrainingArguments(
    output_dir=f"finetuned-{pretrained_checkpoint}-conll2003-test-no-HP-search",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=True,
    report_to=["tensorboard"]
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

ALREADY_DONE_NON_OPTUNA_FINETUNING = True
if not ALREADY_DONE_NON_OPTUNA_FINETUNING:
    trainer.train()

In [27]:
# UPDATE:
# I did the training earlier, reloading notebook next day so will load from HF

from transformers import pipeline

sample_test_text_with_entities = "France decided to display the supermellifluostastic weather information for the entire European Union by the minister Frank Higgins, who called his friend on the phone later."

classifier = pipeline("token-classification", model="benjaminzwhite/finetuned-distilbert-base-uncased-conll2003-test-no-HP-search")

classifier(sample_test_text_with_entities)

config.json:   0%|          | 0.00/905 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/265M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.20k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

[{'entity': 'B-LOC',
  'score': 0.9962255,
  'index': 1,
  'word': 'france',
  'start': 0,
  'end': 6},
 {'entity': 'B-ORG',
  'score': 0.9752045,
  'index': 18,
  'word': 'european',
  'start': 87,
  'end': 95},
 {'entity': 'I-ORG',
  'score': 0.94780993,
  'index': 19,
  'word': 'union',
  'start': 96,
  'end': 101},
 {'entity': 'B-PER',
  'score': 0.99649554,
  'index': 23,
  'word': 'frank',
  'start': 118,
  'end': 123},
 {'entity': 'I-PER',
  'score': 0.9966157,
  'index': 24,
  'word': 'higgins',
  'start': 124,
  'end': 131}]

In [28]:
# CARE! SEE IN ABOVE, THE INDEX IS 1-BASED !!!!!!!!!!!!! FFS


# trying to replicate the above with PyTorch directly, starting
# from a string (after, will try convert a tokenized entry in the dataset already)

# CARE! NOT IN TUTORIAL - NEED TO MOVE THE model AND THE inputs TO SAME DEVICE
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
inputs = tokenizer(sample_test_text_with_entities, return_tensors="pt").to(device)

with torch.no_grad():
    logits = model(**inputs).logits
    
predictions = torch.argmax(logits, dim=2)
predicted_token_class = [model.config.id2label[t.item()] for t in predictions[0]]
predicted_token_class

['I-LOC',
 'B-MISC',
 'I-LOC',
 'I-LOC',
 'B-MISC',
 'I-LOC',
 'B-MISC',
 'B-MISC',
 'B-MISC',
 'B-MISC',
 'B-MISC',
 'I-LOC',
 'I-LOC',
 'I-MISC',
 'I-LOC',
 'B-MISC',
 'B-MISC',
 'B-MISC',
 'B-MISC',
 'B-MISC',
 'I-PER',
 'B-MISC',
 'B-MISC',
 'B-LOC',
 'O',
 'I-LOC',
 'I-PER',
 'O',
 'I-LOC',
 'O',
 'I-PER',
 'B-MISC',
 'B-MISC',
 'O',
 'I-ORG',
 'I-LOC']

In [29]:
print(len(sample_test_text_with_entities.split()))
print(len(predicted_token_class))
print(inputs)

print(tokenizer.convert_ids_to_tokens(inputs["input_ids"][0]))

26
36
{'input_ids': tensor([[  101,  2605,  2787,  2000,  4653,  1996,  3565, 10199,  3669, 10258,
         19098,  9153, 10074,  4633,  2592,  2005,  1996,  2972,  2647,  2586,
          2011,  1996,  2704,  3581, 13466,  1010,  2040,  2170,  2010,  2767,
          2006,  1996,  3042,  2101,  1012,   102]], device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], device='cuda:0')}
['[CLS]', 'france', 'decided', 'to', 'display', 'the', 'super', '##mel', '##li', '##fl', '##uo', '##sta', '##stic', 'weather', 'information', 'for', 'the', 'entire', 'european', 'union', 'by', 'the', 'minister', 'frank', 'higgins', ',', 'who', 'called', 'his', 'friend', 'on', 'the', 'phone', 'later', '.', '[SEP]']


In [30]:
print(inputs.word_ids())

[None, 0, 1, 2, 3, 4, 5, 5, 5, 5, 5, 5, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, None]


In [31]:
# UPDATE -- TODO THIS IS WHERE ERROR IS, IT'S BECAUSE EVEN AFTER YOU TRY TO REVERSE THE TOKENIZATION AND
# USE THE WORD_IDS, THERE ARE "NEW WORDS" COMPARED TO THE sentence.split() WORDS : NOTE FOR EXAMPLE
# BELOW YOU HAVE "," AS A WORD THAT DOES NOT APPEAR IN ....Frank Higgins, ...

"""
tokens_of_sample = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
words_of_sample = sample_test_text_with_entities.split()
print(words_of_sample, len(words_of_sample))

for wi, tok, ptc in zip(inputs.word_ids(), tokens_of_sample, predicted_token_class):
    if wi is not None:
        print(f"{tok} is a part of the word {words_of_sample[wi]} and is prediced to be of label type : {ptc}")

"""

'\ntokens_of_sample = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])\nwords_of_sample = sample_test_text_with_entities.split()\nprint(words_of_sample, len(words_of_sample))\n\nfor wi, tok, ptc in zip(inputs.word_ids(), tokens_of_sample, predicted_token_class):\n    if wi is not None:\n        print(f"{tok} is a part of the word {words_of_sample[wi]} and is prediced to be of label type : {ptc}")\n\n'

# Optuna stuff now

First, since restarted - want to evaluate the performance of non-Optuna model

In [32]:
# this is currently the PRETRAINED_CHECKPOINT, dont worry it's low - not the trained model on Conll2003
trainer.evaluate()

{'eval_loss': 2.074887752532959,
 'eval_overall_precision': 0.01961478857648882,
 'eval_overall_recall': 0.07455402221474251,
 'eval_overall_f1': 0.031058295649735338,
 'eval_overall_accuracy': 0.23778279662006932,
 'eval_LOC_precision': 0.010542457350967989,
 'eval_LOC_recall': 0.059880239520958084,
 'eval_LOC_f1': 0.01792844918914514,
 'eval_LOC_number': 1837,
 'eval_MISC_precision': 0.0296086508753862,
 'eval_MISC_recall': 0.12472885032537961,
 'eval_MISC_f1': 0.0478568456096546,
 'eval_MISC_number': 922,
 'eval_ORG_precision': 0.021662071681764473,
 'eval_ORG_recall': 0.08202833706189411,
 'eval_ORG_f1': 0.03427325128524692,
 'eval_ORG_number': 1341,
 'eval_PER_precision': 0.03386641580432737,
 'eval_PER_recall': 0.05863192182410423,
 'eval_PER_f1': 0.042933810375670844,
 'eval_PER_number': 1842,
 'eval_runtime': 5.6663,
 'eval_samples_per_second': 573.738,
 'eval_steps_per_second': 36.002}

In [33]:
#
#  NOW LOAD MY CHECKPOINT FROM HF: WANT TO CHECK ITS BASELINE PERFORMANCE BEFORE OPTUNA
#
ft_model = AutoModelForTokenClassification.from_pretrained("benjaminzwhite/finetuned-distilbert-base-uncased-conll2003-test-no-HP-search")

training_args = TrainingArguments(
    output_dir=f"finetuned-{pretrained_checkpoint}-conll2003-test-no-HP-search",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=True,
    report_to=["tensorboard"]
)

trainer = Trainer(
    model=ft_model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.evaluate()

{'eval_loss': 0.05074695497751236,
 'eval_overall_precision': 0.924126455906822,
 'eval_overall_recall': 0.9347021204981488,
 'eval_overall_f1': 0.929384203480589,
 'eval_overall_accuracy': 0.9860207935828044,
 'eval_LOC_precision': 0.94252261841405,
 'eval_LOC_recall': 0.9640718562874252,
 'eval_LOC_f1': 0.9531754574811626,
 'eval_LOC_number': 1837,
 'eval_MISC_precision': 0.8333333333333334,
 'eval_MISC_recall': 0.8297180043383948,
 'eval_MISC_f1': 0.8315217391304348,
 'eval_MISC_number': 922,
 'eval_ORG_precision': 0.8910963944076526,
 'eval_ORG_recall': 0.9030574198359433,
 'eval_ORG_f1': 0.8970370370370371,
 'eval_ORG_number': 1341,
 'eval_PER_precision': 0.9746494066882416,
 'eval_PER_recall': 0.9809989142236699,
 'eval_PER_f1': 0.9778138528138528,
 'eval_PER_number': 1842,
 'eval_runtime': 5.4391,
 'eval_samples_per_second': 597.71,
 'eval_steps_per_second': 37.506}

# So results with a non Optuna finetuning approach were

'eval_loss': 0.05074695497751236,

'eval_overall_precision': 0.924126455906822,

'eval_overall_recall': 0.9347021204981488,

'eval_overall_f1': 0.929384203480589,

'eval_overall_accuracy': 0.9860207935828044

In [41]:
import optuna

# model is distilbert-base-uncased again

def objective(trial): 
    # Define the hyperparameters to optimize 
    learning_rate = trial.suggest_float("learning_rate", 1e-5, 1e-2) 
    num_epochs = trial.suggest_int("num_epochs", 1, 3) # ONLY USE SMALL EPOCH NUMS AS PROOF OF PRINCIPLE
    batch_size = trial.suggest_int("batch_size", 8, 16)
 
    # Create and train the Transformer model 
    model = AutoModelForTokenClassification.from_pretrained(
        pretrained_checkpoint,
        num_labels=num_labels,
        id2label=id2label,
        label2id=label2id
    )
    
    training_args = TrainingArguments( 
        per_device_train_batch_size=batch_size, 
        evaluation_strategy="epoch", 
        learning_rate=learning_rate, 
        num_train_epochs=num_epochs,
        push_to_hub=False,
        report_to=["tensorboard"],
        output_dir="optuna_hf_test",
    )
    
    trainer = Trainer( 
        model=model, 
        args=training_args, 
        train_dataset=tokenized_dataset["train"], 
        eval_dataset=tokenized_dataset["validation"],
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    ) 
    trainer.train() 
 
    # Return the evaluation metric to optimize (e.g., validation accuracy) 
    results = trainer.evaluate()
    
    return results["eval_loss"] # MINIMIZE EVAL LOSS FOR NOW

In [42]:
study = optuna.create_study(direction="minimize") # MINIMIZE EVAL LOSS FOR NOW
study.optimize(objective, n_trials=3) # ONLY 3 TRIALS AS PROOF OF PRINCIPLE

best_params = study.best_params 
best_score = study.best_value 
print(f"Best Hyperparameters: {best_params}") 
print(f"Best Score: {best_score}") 

#print(study.best_value)
#print(study.best_params)
#print(study.best_trial)

[I 2024-02-18 12:19:36,019] A new study created in memory with name: no-name-92502674-9612-4b5d-ad40-057477221ef3
Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Overall Precision,Overall Recall,Overall F1,Overall Accuracy,Loc Precision,Loc Recall,Loc F1,Loc Number,Misc Precision,Misc Recall,Misc F1,Misc Number,Org Precision,Org Recall,Org F1,Org Number,Per Precision,Per Recall,Per F1,Per Number
1,0.804,0.938415,0.0,0.0,0.0,0.832503,0.0,0.0,0.0,1837,0.0,0.0,0.0,922,0.0,0.0,0.0,1341,0.0,0.0,0.0,1842
2,0.7912,0.91731,0.0,0.0,0.0,0.832503,0.0,0.0,0.0,1837,0.0,0.0,0.0,922,0.0,0.0,0.0,1341,0.0,0.0,0.0,1842


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


[I 2024-02-18 12:21:50,927] Trial 0 finished with value: 0.9173100590705872 and parameters: {'learning_rate': 0.004993193952365803, 'num_epochs': 2, 'batch_size': 10}. Best is trial 0 with value: 0.9173100590705872.
Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Overall Precision,Overall Recall,Overall F1,Overall Accuracy,Loc Precision,Loc Recall,Loc F1,Loc Number,Misc Precision,Misc Recall,Misc F1,Misc Number,Org Precision,Org Recall,Org F1,Org Number,Per Precision,Per Recall,Per F1,Per Number
1,0.8061,0.957918,0.0,0.0,0.0,0.832503,0.0,0.0,0.0,1837,0.0,0.0,0.0,922,0.0,0.0,0.0,1341,0.0,0.0,0.0,1842
2,0.7935,0.904781,0.0,0.0,0.0,0.832503,0.0,0.0,0.0,1837,0.0,0.0,0.0,922,0.0,0.0,0.0,1341,0.0,0.0,0.0,1842
3,0.7936,0.918522,0.0,0.0,0.0,0.832503,0.0,0.0,0.0,1837,0.0,0.0,0.0,922,0.0,0.0,0.0,1341,0.0,0.0,0.0,1842


Checkpoint destination directory optuna_hf_test/checkpoint-500 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory optuna_hf_test/checkpoint-1000 already exists and is non-empty.Saving will proceed but saved results may be invalid.
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
Checkpoint destination directory optuna_hf_test/checkpoint-1500 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory optuna_hf_test/checkpoint-2000 already exists and is non-empty.Saving will proceed but saved results may be invalid.
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
Checkpoint destination directory optuna_hf_test/checkpoint-2500 already exists and is non-empty.Saving will proceed but saved results may be invalid.
  _warn_prf(average, modifier, 

[I 2024-02-18 12:24:58,219] Trial 1 finished with value: 0.9185219407081604 and parameters: {'learning_rate': 0.006645118032574131, 'num_epochs': 3, 'batch_size': 13}. Best is trial 0 with value: 0.9173100590705872.
Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Overall Precision,Overall Recall,Overall F1,Overall Accuracy,Loc Precision,Loc Recall,Loc F1,Loc Number,Misc Precision,Misc Recall,Misc F1,Misc Number,Org Precision,Org Recall,Org F1,Org Number,Per Precision,Per Recall,Per F1,Per Number
1,0.8012,0.913099,0.0,0.0,0.0,0.832503,0.0,0.0,0.0,1837,0.0,0.0,0.0,922,0.0,0.0,0.0,1341,0.0,0.0,0.0,1842
2,0.7889,0.915947,0.0,0.0,0.0,0.832503,0.0,0.0,0.0,1837,0.0,0.0,0.0,922,0.0,0.0,0.0,1341,0.0,0.0,0.0,1842


Checkpoint destination directory optuna_hf_test/checkpoint-500 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory optuna_hf_test/checkpoint-1000 already exists and is non-empty.Saving will proceed but saved results may be invalid.
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
Checkpoint destination directory optuna_hf_test/checkpoint-1500 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory optuna_hf_test/checkpoint-2000 already exists and is non-empty.Saving will proceed but saved results may be invalid.
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


[I 2024-02-18 12:27:05,647] Trial 2 finished with value: 0.9159466624259949 and parameters: {'learning_rate': 0.00448456272278208, 'num_epochs': 2, 'batch_size': 13}. Best is trial 2 with value: 0.9159466624259949.


Best Hyperparameters: {'learning_rate': 0.00448456272278208, 'num_epochs': 2, 'batch_size': 13}
Best Score: 0.9159466624259949


In [43]:
# I THINK THE LEARNING RATE IS WAY TOO HIGH ABOVE

def objective(trial): 
    # Define the hyperparameters to optimize 
    learning_rate = trial.suggest_float("learning_rate", 5e-6, 2e-5) 
    num_epochs = trial.suggest_int("num_epochs", 1, 2) # ONLY USE SMALL EPOCH NUMS AS PROOF OF PRINCIPLE
    batch_size = trial.suggest_int("batch_size", 8, 16)
 
    # Create and train the Transformer model 
    model = AutoModelForTokenClassification.from_pretrained(
        pretrained_checkpoint,
        num_labels=num_labels,
        id2label=id2label,
        label2id=label2id
    )
    
    training_args = TrainingArguments( 
        per_device_train_batch_size=batch_size, 
        evaluation_strategy="epoch", 
        learning_rate=learning_rate, 
        num_train_epochs=num_epochs,
        push_to_hub=False,
        report_to=["tensorboard"],
        output_dir="optuna_hf_test2",
        save_total_limit=1,
    )
    
    trainer = Trainer( 
        model=model, 
        args=training_args, 
        train_dataset=tokenized_dataset["train"], 
        eval_dataset=tokenized_dataset["validation"],
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    ) 
    trainer.train() 
 
    # Return the evaluation metric to optimize (e.g., validation accuracy) 
    results = trainer.evaluate()
    
    return results["eval_loss"] # MINIMIZE EVAL LOSS FOR NOW

In [44]:
study = optuna.create_study(direction="minimize", study_name="tryagainwithsmallLR") # MINIMIZE EVAL LOSS FOR NOW
study.optimize(objective, n_trials=2) # ONLY 2 TRIALS AS PROOF OF PRINCIPLE

best_params = study.best_params 
best_score = study.best_value 
print(f"Best Hyperparameters: {best_params}") 
print(f"Best Score: {best_score}") 

[I 2024-02-18 12:33:07,936] A new study created in memory with name: tryagainwithsmallLR
Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Overall Precision,Overall Recall,Overall F1,Overall Accuracy,Loc Precision,Loc Recall,Loc F1,Loc Number,Misc Precision,Misc Recall,Misc F1,Misc Number,Org Precision,Org Recall,Org F1,Org Number,Per Precision,Per Recall,Per F1,Per Number
1,0.3538,0.08864,0.846439,0.884046,0.864834,0.975994,0.876151,0.931954,0.903192,1837,0.718294,0.694143,0.706012,922,0.747814,0.829232,0.786421,1341,0.954642,0.971227,0.962863,1842
2,0.0793,0.073706,0.87922,0.902895,0.8909,0.98016,0.896357,0.951007,0.922874,1837,0.772367,0.739696,0.755679,922,0.812233,0.851603,0.831452,1341,0.962446,0.973941,0.96816,1842


[I 2024-02-18 12:35:11,645] Trial 0 finished with value: 0.07370619475841522 and parameters: {'learning_rate': 6.83148537709498e-06, 'num_epochs': 2, 'batch_size': 16}. Best is trial 0 with value: 0.07370619475841522.
Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Overall Precision,Overall Recall,Overall F1,Overall Accuracy,Loc Precision,Loc Recall,Loc F1,Loc Number,Misc Precision,Misc Recall,Misc F1,Misc Number,Org Precision,Org Recall,Org F1,Org Number,Per Precision,Per Recall,Per F1,Per Number
1,0.1293,0.099629,0.834516,0.870751,0.852248,0.973911,0.868353,0.915623,0.891362,1837,0.703125,0.683297,0.693069,922,0.719788,0.808352,0.761503,1341,0.9554,0.965255,0.960302,1842


[I 2024-02-18 12:36:21,614] Trial 1 finished with value: 0.09962854534387589 and parameters: {'learning_rate': 6.905544432761504e-06, 'num_epochs': 1, 'batch_size': 11}. Best is trial 0 with value: 0.07370619475841522.


Best Hyperparameters: {'learning_rate': 6.83148537709498e-06, 'num_epochs': 2, 'batch_size': 16}
Best Score: 0.07370619475841522


# Visualization stuff for Optuna work O_o

Graphs etc

In [45]:
optuna.visualization.plot_optimization_history(study)

In [47]:
optuna.visualization.plot_param_importances(study)


In [48]:
# ok so code above seems to work, lets try with a few trials and see if we can actualy beat default params O_o


# I WILL KEEP FIXED EPOCH SIZE = 2 
# modify the learning rate and batch size

def objective(trial): 
    # Define the hyperparameters to optimize 
    learning_rate = trial.suggest_float("learning_rate", 5e-6, 5e-5) 
    #num_epochs = trial.suggest_int("num_epochs", 1, 2) # ONLY USE SMALL EPOCH NUMS AS PROOF OF PRINCIPLE
    num_epochs = 2 # FIXED, WHILE ON THIS TEST
    batch_size = trial.suggest_int("batch_size", 16, 32)
 
    # Create and train the Transformer model 
    model = AutoModelForTokenClassification.from_pretrained(
        pretrained_checkpoint,
        num_labels=num_labels,
        id2label=id2label,
        label2id=label2id
    )
    
    training_args = TrainingArguments( 
        per_device_train_batch_size=batch_size, 
        evaluation_strategy="epoch", 
        learning_rate=learning_rate, 
        num_train_epochs=num_epochs,
        push_to_hub=False,
        report_to=["tensorboard"],
        output_dir="optuna_hf_test3",
        save_total_limit=1,
    )
    
    trainer = Trainer( 
        model=model, 
        args=training_args, 
        train_dataset=tokenized_dataset["train"], 
        eval_dataset=tokenized_dataset["validation"],
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    ) 
    trainer.train() 
 
    # Return the evaluation metric to optimize (e.g., validation accuracy) 
    results = trainer.evaluate()
    
    return results["eval_loss"] # MINIMIZE EVAL LOSS FOR NOW

In [49]:
study = optuna.create_study(direction="minimize", study_name="hp_working") # MINIMIZE EVAL LOSS FOR NOW
study.optimize(objective, n_trials=5)

best_params = study.best_params 
best_score = study.best_value 
print(f"Best Hyperparameters: {best_params}") 
print(f"Best Score: {best_score}") 

[I 2024-02-18 12:59:23,378] A new study created in memory with name: hp_working
Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Overall Precision,Overall Recall,Overall F1,Overall Accuracy,Loc Precision,Loc Recall,Loc F1,Loc Number,Misc Precision,Misc Recall,Misc F1,Misc Number,Org Precision,Org Recall,Org F1,Org Number,Per Precision,Per Recall,Per F1,Per Number
1,0.1591,0.047806,0.920219,0.933692,0.926907,0.986449,0.94765,0.965705,0.956592,1837,0.818369,0.860087,0.83871,922,0.882353,0.872483,0.87739,1341,0.97261,0.98317,0.977862,1842
2,0.0335,0.046383,0.933378,0.943117,0.938222,0.987617,0.95182,0.967882,0.959784,1837,0.861736,0.872017,0.866846,922,0.898368,0.903057,0.900707,1341,0.97628,0.98317,0.979713,1842


[I 2024-02-18 13:01:19,872] Trial 0 finished with value: 0.04638304188847542 and parameters: {'learning_rate': 3.193047725884901e-05, 'batch_size': 17}. Best is trial 0 with value: 0.04638304188847542.
Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Overall Precision,Overall Recall,Overall F1,Overall Accuracy,Loc Precision,Loc Recall,Loc F1,Loc Number,Misc Precision,Misc Recall,Misc F1,Misc Number,Org Precision,Org Recall,Org F1,Org Number,Per Precision,Per Recall,Per F1,Per Number
1,0.1316,0.046159,0.925705,0.939414,0.932509,0.986897,0.957135,0.960261,0.958696,1837,0.843552,0.86551,0.85439,922,0.876633,0.90082,0.888562,1341,0.972625,0.983713,0.978138,1842
2,0.0413,0.045724,0.937615,0.945978,0.941778,0.988026,0.955843,0.966249,0.961018,1837,0.876078,0.881779,0.878919,922,0.90325,0.912006,0.907607,1341,0.975216,0.982628,0.978908,1842


[I 2024-02-18 13:03:11,995] Trial 1 finished with value: 0.045724399387836456 and parameters: {'learning_rate': 4.9894588609512605e-05, 'batch_size': 19}. Best is trial 1 with value: 0.045724399387836456.
Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Overall Precision,Overall Recall,Overall F1,Overall Accuracy,Loc Precision,Loc Recall,Loc F1,Loc Number,Misc Precision,Misc Recall,Misc F1,Misc Number,Org Precision,Org Recall,Org F1,Org Number,Per Precision,Per Recall,Per F1,Per Number
1,0.1393,0.046954,0.926461,0.939246,0.93281,0.98715,0.952228,0.965705,0.958919,1837,0.840415,0.87961,0.859565,922,0.886821,0.888143,0.887481,1341,0.974096,0.979913,0.976996,1842
2,0.0287,0.045867,0.935758,0.94379,0.939757,0.987715,0.949198,0.966249,0.957648,1837,0.882096,0.876356,0.879217,922,0.896831,0.907532,0.90215,1341,0.977297,0.981542,0.979415,1842


[I 2024-02-18 13:05:11,351] Trial 2 finished with value: 0.0458671934902668 and parameters: {'learning_rate': 4.367745178289785e-05, 'batch_size': 17}. Best is trial 1 with value: 0.045724399387836456.
Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Overall Precision,Overall Recall,Overall F1,Overall Accuracy,Loc Precision,Loc Recall,Loc F1,Loc Number,Misc Precision,Misc Recall,Misc F1,Misc Number,Org Precision,Org Recall,Org F1,Org Number,Per Precision,Per Recall,Per F1,Per Number
1,0.2523,0.083313,0.85332,0.890946,0.871727,0.976909,0.890843,0.937398,0.913528,1837,0.719637,0.687636,0.703272,922,0.757979,0.850112,0.801406,1341,0.95334,0.976113,0.964592,1842
2,0.0715,0.068197,0.885531,0.907439,0.896351,0.981309,0.897187,0.954818,0.925105,1837,0.787402,0.759219,0.773054,922,0.827561,0.855332,0.841217,1341,0.963421,0.972313,0.967847,1842


[I 2024-02-18 13:06:54,991] Trial 3 finished with value: 0.06819698214530945 and parameters: {'learning_rate': 1.1052205111266617e-05, 'batch_size': 28}. Best is trial 1 with value: 0.045724399387836456.
Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Overall Precision,Overall Recall,Overall F1,Overall Accuracy,Loc Precision,Loc Recall,Loc F1,Loc Number,Misc Precision,Misc Recall,Misc F1,Misc Number,Org Precision,Org Recall,Org F1,Org Number,Per Precision,Per Recall,Per F1,Per Number
1,0.1351,0.047304,0.918768,0.938405,0.928482,0.986644,0.947537,0.963527,0.955466,1837,0.83195,0.869848,0.850477,922,0.872449,0.892617,0.882418,1341,0.968901,0.980999,0.974912,1842
2,0.0377,0.046131,0.933012,0.944631,0.938786,0.98789,0.951337,0.968427,0.959806,1837,0.861555,0.87744,0.869425,922,0.900442,0.910515,0.905451,1341,0.974608,0.97937,0.976983,1842


[I 2024-02-18 13:08:42,506] Trial 4 finished with value: 0.046130843460559845 and parameters: {'learning_rate': 4.212474889884223e-05, 'batch_size': 23}. Best is trial 1 with value: 0.045724399387836456.


Best Hyperparameters: {'learning_rate': 4.9894588609512605e-05, 'batch_size': 19}
Best Score: 0.045724399387836456


In [50]:
optuna.visualization.plot_optimization_history(study)

In [51]:
optuna.visualization.plot_param_importances(study)

In [57]:
study.best_trial

FrozenTrial(number=1, state=TrialState.COMPLETE, values=[0.045724399387836456], datetime_start=datetime.datetime(2024, 2, 18, 13, 1, 19, 874142), datetime_complete=datetime.datetime(2024, 2, 18, 13, 3, 11, 995209), params={'learning_rate': 4.9894588609512605e-05, 'batch_size': 19}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'learning_rate': FloatDistribution(high=5e-05, log=False, low=5e-06, step=None), 'batch_size': IntDistribution(high=32, log=False, low=16, step=1)}, trial_id=1, value=None)

In [65]:
# Recap of Optuna vs no Optuna
# Earlier I had results from a finetuning using the tutorial settings:
no_optuna = {'eval_loss': 0.05074695497751236,
 'eval_overall_precision': 0.924126455906822,
 'eval_overall_recall': 0.9347021204981488,
 'eval_overall_f1': 0.929384203480589,
 'eval_overall_accuracy': 0.9860207935828044,
 'eval_LOC_precision': 0.94252261841405,
 'eval_LOC_recall': 0.9640718562874252,
 'eval_LOC_f1': 0.9531754574811626,
 'eval_LOC_number': 1837,
 'eval_MISC_precision': 0.8333333333333334,
 'eval_MISC_recall': 0.8297180043383948,
 'eval_MISC_f1': 0.8315217391304348,
 'eval_MISC_number': 922,
 'eval_ORG_precision': 0.8910963944076526,
 'eval_ORG_recall': 0.9030574198359433,
 'eval_ORG_f1': 0.8970370370370371,
 'eval_ORG_number': 1341,
 'eval_PER_precision': 0.9746494066882416,
 'eval_PER_recall': 0.9809989142236699,
 'eval_PER_f1': 0.9778138528138528,
 'eval_PER_number': 1842,
 'eval_runtime': 5.4391,
 'eval_samples_per_second': 597.71,
 'eval_steps_per_second': 37.506}

# HOW DO YOU GET THIS FROM THE OPTUNA STUDY!?!??! DOES IT STORE THEM SOMEWHERE FOR EACH TRIAL ??
# I CANT FIND IT ANYWHERE, read the docs but still nothing
# googled stuff like "how can i find all metrics from optuna trials" etc

# RIGHT NOW I WILL JUST RETRAIN AGAIN WITH THE BEST_PARAMS!?!?
study.best_params

{'learning_rate': 4.9894588609512605e-05, 'batch_size': 19}

In [60]:
optuna_LR = study.best_params["learning_rate"]
optuna_BS = study.best_params["batch_size"]

# retraining this time with best params from Optuna
training_args = TrainingArguments(
    output_dir=f"finetuned-{pretrained_checkpoint}-conll2003-test-after-Optuna-HP-search",
    learning_rate=optuna_LR,
    per_device_train_batch_size=optuna_BS,
    per_device_eval_batch_size=optuna_BS,
    num_train_epochs=2,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=True,
    report_to=["tensorboard"]
)

optuna_trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

optuna_trainer.train()

Epoch,Training Loss,Validation Loss,Overall Precision,Overall Recall,Overall F1,Overall Accuracy,Loc Precision,Loc Recall,Loc F1,Loc Number,Misc Precision,Misc Recall,Misc F1,Misc Number,Org Precision,Org Recall,Org F1,Org Number,Per Precision,Per Recall,Per F1,Per Number
1,0.1304,0.045152,0.925779,0.940424,0.933044,0.987053,0.955159,0.962439,0.958785,1837,0.837113,0.880694,0.858351,922,0.881965,0.897092,0.889464,1341,0.975149,0.979913,0.977525,1842
2,0.0415,0.045459,0.934579,0.942444,0.938495,0.987442,0.948827,0.968971,0.958793,1837,0.87149,0.875271,0.873377,922,0.895988,0.899329,0.897655,1341,0.979935,0.980999,0.980467,1842


TrainOutput(global_step=1480, training_loss=0.06643215450080665, metrics={'train_runtime': 107.5787, 'train_samples_per_second': 261.055, 'train_steps_per_second': 13.757, 'total_flos': 350249241934116.0, 'train_loss': 0.06643215450080665, 'epoch': 2.0})

In [61]:
optuna_trainer.evaluate()

{'eval_loss': 0.045152388513088226,
 'eval_overall_precision': 0.9257786613651425,
 'eval_overall_recall': 0.9404240996297543,
 'eval_overall_f1': 0.9330439138420437,
 'eval_overall_accuracy': 0.9870526848642965,
 'eval_LOC_precision': 0.9551593733117234,
 'eval_LOC_recall': 0.9624387588459444,
 'eval_LOC_f1': 0.9587852494577006,
 'eval_LOC_number': 1837,
 'eval_MISC_precision': 0.8371134020618557,
 'eval_MISC_recall': 0.8806941431670282,
 'eval_MISC_f1': 0.8583509513742071,
 'eval_MISC_number': 922,
 'eval_ORG_precision': 0.8819648093841642,
 'eval_ORG_recall': 0.8970917225950783,
 'eval_ORG_f1': 0.8894639556377079,
 'eval_ORG_number': 1341,
 'eval_PER_precision': 0.975148568341437,
 'eval_PER_recall': 0.9799131378935939,
 'eval_PER_f1': 0.9775250473869481,
 'eval_PER_number': 1842,
 'eval_runtime': 5.4228,
 'eval_samples_per_second': 599.504,
 'eval_steps_per_second': 31.718,
 'epoch': 2.0}

In [71]:
with_optuna = {'eval_loss': 0.045152388513088226,
 'eval_overall_precision': 0.9257786613651425,
 'eval_overall_recall': 0.9404240996297543,
 'eval_overall_f1': 0.9330439138420437,
 'eval_overall_accuracy': 0.9870526848642965,
 'eval_LOC_precision': 0.9551593733117234,
 'eval_LOC_recall': 0.9624387588459444,
 'eval_LOC_f1': 0.9587852494577006,
 'eval_LOC_number': 1837,
 'eval_MISC_precision': 0.8371134020618557,
 'eval_MISC_recall': 0.8806941431670282,
 'eval_MISC_f1': 0.8583509513742071,
 'eval_MISC_number': 922,
 'eval_ORG_precision': 0.8819648093841642,
 'eval_ORG_recall': 0.8970917225950783,
 'eval_ORG_f1': 0.8894639556377079,
 'eval_ORG_number': 1341,
 'eval_PER_precision': 0.975148568341437,
 'eval_PER_recall': 0.9799131378935939,
 'eval_PER_f1': 0.9775250473869481,
 'eval_PER_number': 1842,
 'eval_runtime': 5.4228,
 'eval_samples_per_second': 599.504,
 'eval_steps_per_second': 31.718,
 'epoch': 2.0}


res = pd.DataFrame([no_optuna, with_optuna]).T

res

Unnamed: 0,0,1
eval_loss,0.050747,0.045152
eval_overall_precision,0.924126,0.925779
eval_overall_recall,0.934702,0.940424
eval_overall_f1,0.929384,0.933044
eval_overall_accuracy,0.986021,0.987053
eval_LOC_precision,0.942523,0.955159
eval_LOC_recall,0.964072,0.962439
eval_LOC_f1,0.953175,0.958785
eval_LOC_number,1837.0,1837.0
eval_MISC_precision,0.833333,0.837113
