In [1]:
from transformers import AutoTokenizer, TrainingArguments, Trainer, AutoModelForTokenClassification
from pathlib import Path
import numpy as np
import torch
from tokenizers import AddedToken
from tqdm.notebook import tqdm
from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts
import pandas as pd
from datasets import Dataset

kaggle=False

path="/kaggle/input/pii-detection-removal-from-educational-data" if kaggle else "data"
train_path = path + "/train.json"
test_path = path + "/test.json"

mixtral_path="data/mpware_mixtral8x7b_v1.1.json" if not kaggle else "/kaggle/input/mixtral-8x7b-v11/mixtral8x7b_v1.1.json"

model_path = "/kaggle/input/huggingfacedebertav3variants/deberta-v3-base" if kaggle else "microsoft/deberta-v3-base"

if not kaggle: import neptune
if not kaggle: from seqeval.metrics import recall_score, precision_score, f1_score, accuracy_score

https://www.kaggle.com/datasets/mpware/pii-mixtral8x7b-generated-essays

In [2]:
cross_entropy_weight_multi = 400

CROSS_ENTROPY_WEIGHTS = [cross_entropy_weight_multi]*12
CROSS_ENTROPY_WEIGHTS.append(1)


# best PII-265

parameter= {
    "model": model_path,
    "max_length": 1024,
    "inference_max_length": 2000,
    "batch_size": 4,
    "inference_batch_size": 1,
    "lr": 5e-05,
    "lr_scale_unfreeze": 0.01,
    "filter_no_pii_percent_allow": 0.2,
    "notebook": "20_deberta base_1024len.ipynb",
    "CROSS_ENTROPY_WEIGHT_MULTI": cross_entropy_weight_multi,
    "epochs_before_unfreeze": 1,
    "epochs_after_unfreeze": 2,
    "train_test_split": 0.2,
    "num_proc": 16 if not kaggle else 2, 
    "freeze_embeddings": False,
    "freeze_layers": 6,
    "warumup_steps": 500,
    "weight_decay": 0.01,
    "logging_dir": './logs',
    "logging_steps": 10,
    "evaluation_strategy": "steps",
    "eval_steps": 400,
    "save_steps": 400,
    "save_total_limit": 3,
    "load_best_model_at_end": False,
    "metric_for_best_model": "f1",
    "greater_is_better": True,
    "overwrite_output_dir": True,
    "report_to": "none",
}

print(parameter["lr"]*parameter["lr_scale_unfreeze"])

5.000000000000001e-07


In [3]:
target = [
    'B-EMAIL', 'B-ID_NUM', 'B-NAME_STUDENT', 'B-PHONE_NUM', 
    'B-STREET_ADDRESS', 'B-URL_PERSONAL', 'B-USERNAME', 'I-ID_NUM', 
    'I-NAME_STUDENT', 'I-PHONE_NUM', 'I-STREET_ADDRESS', 'I-URL_PERSONAL'
]

In [4]:
from itertools import chain
import json

data = json.load(open(train_path))
all_labels = sorted(list(set(chain(*[x["labels"] for x in data]))))
label2id = {l: i for i,l in enumerate(all_labels)}
id2label = {v:k for k,v in label2id.items()}

In [5]:
import random

def tokenize(example, tokenizer, label2id, max_length, all_labels_list):
    text = []
    import numpy as np

    # these are at the character level
    labels = []
    targets = []

    for t, l, ws in zip(example["tokens"], example["labels"], example["trailing_whitespace"]):

        text.append(t)
        labels.extend([l]*len(t))
        
        if l in all_labels_list:
            targets.append(1)
        else:
            targets.append(0)
        # if there is trailing whitespace
        if ws:
            text.append(" ")
            labels.append("O")

    tokenized = tokenizer("".join(text), return_offsets_mapping=True, truncation=True, max_length=max_length, padding="max_length")
    
    target_num = sum(targets)
    labels = np.array(labels)

    text = "".join(text)
    token_labels = []

    for start_idx, end_idx in tokenized.offset_mapping:

        # CLS token
        if start_idx == 0 and end_idx == 0: 
            token_labels.append(label2id["O"])
            continue

        # case when token starts with whitespace
        if text[start_idx].isspace():
            start_idx += 1

        try:
            token_labels.append(label2id[labels[start_idx]])
        except:
            token_labels.append(label2id["O"])

    length = len(tokenized.input_ids)

    return {
        **tokenized,
        "labels": token_labels,
        "length": length,
        "target_num": target_num,
        "group": 1 if target_num>0 else 0
    }

# https://www.kaggle.com/competitions/pii-detection-removal-from-educational-data/discussion/468844
def filter_no_pii(example, percent_allow=parameter["filter_no_pii_percent_allow"]):
    # Return True if there is PII
    # Or 20% of the time if there isn't
    has_pii = set("O") != set(example["labels"])
    return has_pii or (random.random() < percent_allow)

In [6]:
tokenizer = AutoTokenizer.from_pretrained(parameter["model"])
tokenizer.add_tokens(AddedToken("\n", normalized=False)) 

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


1

In [7]:
len_data=len(data)
valid_idx = random.sample(range(len_data), int(len_data*parameter["train_test_split"]))
train_idx = list(set(range(len_data)) - set(valid_idx))

train_data = [data[i] for i in train_idx]
valid_data = [data[i] for i in valid_idx]




mixtral_data=json.load(open(mixtral_path))
dict_mixtral={
    "full_text": [x["full_text"] for x in mixtral_data],
    "document": [str(x["document"]) for x in mixtral_data],
    "tokens": [x["tokens"] for x in mixtral_data],
    "trailing_whitespace": [x["trailing_whitespace"] for x in mixtral_data],
    "labels": [x["labels"] for x in mixtral_data],
}
data = json.load(open(train_path))
dict_train={
    "full_text": [x["full_text"] for x in train_data],
    "document": [str(x["document"]) for x in train_data],
    "tokens": [x["tokens"] for x in train_data],
    "trailing_whitespace": [x["trailing_whitespace"] for x in train_data],
    "labels": [x["labels"] for x in train_data],
}

dict_valid={
    "full_text": [x["full_text"] for x in valid_data],
    "document": [str(x["document"]) for x in valid_data],
    "tokens": [x["tokens"] for x in valid_data],
    "trailing_whitespace": [x["trailing_whitespace"] for x in valid_data],
    "labels": [x["labels"] for x in valid_data],
}

full_data = {
    "full_text": dict_train["full_text"] + dict_mixtral["full_text"],
    "document": dict_train["document"] + dict_mixtral["document"],
    "tokens": dict_train["tokens"] + dict_mixtral["tokens"],
    "trailing_whitespace": dict_train["trailing_whitespace"] + dict_mixtral["trailing_whitespace"],
    "labels": dict_train["labels"] + dict_mixtral["labels"],
}

In [8]:
train_ds = Dataset.from_dict(full_data)

train_ds = train_ds.map(tokenize, fn_kwargs={"tokenizer": tokenizer, "label2id": label2id, "max_length": parameter["max_length"], "all_labels_list": target}, num_proc=parameter["num_proc"])
train_ds=train_ds.filter(filter_no_pii, num_proc=parameter["num_proc"])
valid_ds = Dataset.from_dict(dict_valid)
valid_ds = valid_ds.map(tokenize, fn_kwargs={"tokenizer": tokenizer, "label2id": label2id, "max_length": parameter["max_length"], "all_labels_list": target}, num_proc=parameter["num_proc"])


print(len(train_ds), len(valid_ds))


Map (num_proc=16):   0%|          | 0/8138 [00:00<?, ? examples/s]

Filter (num_proc=16):   0%|          | 0/8138 [00:00<?, ? examples/s]

Map (num_proc=16):   0%|          | 0/1361 [00:00<?, ? examples/s]

8138 1361


In [9]:
def tokenize_inference(example, tokenizer, max_length):
        text = []
        for t,  ws in zip(example["tokens"], example["trailing_whitespace"]):
            text.append(t)
            if ws:
                text.append(" ")
        tokenized = tokenizer("".join(text), return_offsets_mapping=True, truncation=True, max_length=max_length, padding="max_length")
        text = "".join(text)
        length = len(tokenized.input_ids)
        return {
            **tokenized,
            "length": length,
        }
        
class TestTokenizer():
    def __init__(self, tokenizer):
        self.tokenizer = tokenizer
    
    def preprocess(self, example):
        # Preprocess the tokens and labels by adding trailing whitespace and labels
        tokens = []
        tokens_without_ws = []
        token_map = [] # Use the index as labels
        index = 0
        for token, t_ws in zip(example["tokens"], example["trailing_whitespace"]):
            tokens_without_ws.append(token)
            tokens.append(token)
            token_map.extend([index] * len(token))
            # Added trailing whitespace and label if true and 
            if t_ws:
                tokens.append(" ")
                token_map.append(-1)
            index += 1
        return tokens, token_map, tokens_without_ws
    
    def tokenize(self, example):
        tokens, token_map, tokens_without_ws = self.preprocess(example)
        text = "".join(tokens)
        tokenized = self.tokenizer(text, return_offsets_mapping=True, padding="max_length",
                                   truncation=True, max_length=parameter["inference_max_length"])
        return {**tokenized, "token_map": token_map, "tokens": tokens, "tokens_without_ws": tokens_without_ws} 

class PiiDatasetInference(torch.utils.data.Dataset):
        def __init__(self, dataset, tokenizer):
            self.dataset = dataset
            self.tokenizer=TestTokenizer(tokenizer)
            
        def __getitem__(self, idx):
            vals=self.tokenizer.tokenize(self.dataset[idx])
            input_ids = torch.tensor(vals["input_ids"])
            attention_mask = torch.tensor(vals["attention_mask"])
            document_id = self.dataset[idx]["document"]
            return input_ids, attention_mask, document_id, vals
        
        def __len__(self):
            return len(self.dataset)

# Convert preds to a list of dictionaries
def to_test_submission(preds=None, dataset=None, document_ids=None, id2label=None):
    pairs = []
    row_id = 0
    results = []
    
    for i in range(len(preds)):
        input_ids, attention_mask, document_id, vals = dataset[i]
        token_map=vals["token_map"]
        offsets=vals["offset_mapping"]
        tokens=vals["tokens_without_ws"]
        #print("tokens", tokens)
        pred=preds[i]
        #print("original_text", original_text)
        #print("token_map", token_map)
        #print("offsets", offsets)   
        #print("pred", pred)


        for token_pred, input_id, (start_idx, end_idx) in zip(pred, input_ids, offsets):
            #print("\nnow doing ", start_idx,  end_idx, token_pred)
            if start_idx == 0 and end_idx == 0: # Skip 0 offset
                continue
            # Skip spaces 
            while start_idx < len(token_map):
                #print("loop, start_idx now", start_idx) 
                #print(" tokens[token_map[start_idx]]: ", tokens[token_map[start_idx]] if not tokens[token_map[start_idx]].isspace() else "WHITESPACE")          
                if token_map[start_idx] == -1: # Skip unknown tokens               
                    start_idx += 1
                elif tokens[token_map[start_idx]].isspace(): # Skip white space
                    start_idx += 1
                else:
                    break
            # Ensure start index < length
            if start_idx < len(token_map):
                token_id = token_map[start_idx]
                #print("token_id", token_id)
                #token_id= input_id.item()
                label_pred = id2label[token_pred.item()]
                #print("label_pred", label_pred)
                # ignore "O" and whitespace preds
                if label_pred != "O" and token_id != -1:
                    #print("is PII", token_id, label_pred)
                    token_str = tokens[token_id]
                    pair=(document_id, token_id)
                    if pair not in pairs:
                        results.append({
                            "row_id": row_id, 
                            "document": document_id,
                            "token": token_id, 
                            "label": label_pred,
                            "token_str": token_str
                        })
                        pairs.append(pair)
                        row_id += 1

    # Create a dataframe 
    return results

def create_submission(model, filename="submission.csv"):
    data = json.load(open(train_path))
    from itertools import chain
    all_labels = sorted(list(set(chain(*[x["labels"] for x in data]))))
    label2id = {l: i for i,l in enumerate(all_labels)}
    id2label = {v:k for k,v in label2id.items()}

    data=json.load(open(test_path))
    tokenizer = AutoTokenizer.from_pretrained(parameter["model"])
    my_dataset=PiiDatasetInference(data, tokenizer)
    loader=torch.utils.data.DataLoader(my_dataset, batch_size=1, shuffle=False)

    device=torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.eval()
    
    # stack all predictions into tensor
    all_preds = []

    for id, attention_mask, document_ids, vals in loader:
        id=id.to(device)
        attention_mask=attention_mask.to(device)
        preds=model(id, attention_mask).get('logits').argmax(dim=2)
        all_preds.append(preds)
        #for pred, id in zip(preds.flatten(), id.flatten()):
        #    if pred != 12:
                #print(f"Document: {document_id.item()} TOKEN:{tokenizer.decode(id)}  --- pred:{id2label[pred.item()]}")
        #        output[row_id]={"document":document_id.item(), "token":id.item(), "label":id2label[pred.item()]}
        #        row_id+=1
        #for pred, id in zip(preds.flatten(), id.flatten()):
        #    if pred != 12:
        #        print(f"TOKEN:{tokenizer.decode(id)}  --- pred:{id2label[pred.item()]}")
    
   
    all_preds = torch.cat(all_preds, dim=0)
    
    results = to_test_submission(preds=all_preds, dataset=my_dataset, document_ids=document_ids, id2label=id2label)
    if len(results) == 0:
        print("Error in create_submission(): No predictions made, probably because the model is not learning. Check the model and the data.")
        return
    df = pd.DataFrame(results)
    df=df[["row_id", "document", "token", "label"]]
    print(df)
    df.to_csv(filename, index=False)

#create_submission(MyModel(parameter['model'], len(label2id)).to(device), "submission_just_dumb.csv")
# create_submission(model, "submission.csv")
    




In [10]:
from transformers import DataCollatorForTokenClassification

collator = DataCollatorForTokenClassification(tokenizer, pad_to_multiple_of=16)

In [11]:
# using Trainer and TrainingArguments from transformers


def compute_metrics(p, all_labels):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (special tokens)
    true_predictions = [
        [all_labels[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [all_labels[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    
    recall = recall_score(true_labels, true_predictions)
    precision = precision_score(true_labels, true_predictions)
    f1_score = (1 + 5*5) * recall * precision / (5*5*precision + recall)
    
    results = {
        'recall': recall,
        'precision': precision,
        'f1': f1_score
    }
    return results

from functools import partial
from transformers import TrainerCallback
from transformers.trainer_callback import TrainerControl, TrainerState
from transformers.training_args import TrainingArguments

def get_trainer(model, train_dataloader, valid_dataloader, learnrate_multiplier=1.0):

    if not kaggle:
        from transformers.integrations import NeptuneCallback

        run = neptune.init_run(
            project="bernd.heidemann/PII",
            api_token="eyJhcGlfYWRkcmVzcyI6Imh0dHBzOi8vYXBwLm5lcHR1bmUuYWkiLCJhcGlfdXJsIjoiaHR0cHM6Ly9hcHAubmVwdHVuZS5haSIsImFwaV9rZXkiOiIzNjBlYzVkNi0zZTUwLTQ1ODYtODhlNC02NDUxNDg0MDdjNzUifQ==",
        )  # your credentials
        run["parameters"] = {
        **parameter
        }

        neptune_callback = NeptuneCallback(run=run, log_model_weights=False, log_parameters=False)

    training_args = TrainingArguments(
        output_dir='./results',          # output directory
        num_train_epochs=parameter["epochs_before_unfreeze"]+parameter["epochs_after_unfreeze"],
        per_device_train_batch_size=parameter["batch_size"],  # batch size per device during training
        per_device_eval_batch_size=parameter["inference_batch_size"],   # batch size for evaluation
        warmup_steps=parameter["warumup_steps"],                # number of warmup steps for learning rate scheduler
        weight_decay=parameter["weight_decay"],               # strength of weight decay
        logging_dir=parameter["logging_dir"],            # directory for storing logs
        logging_steps=parameter["logging_steps"],
        evaluation_strategy=parameter["evaluation_strategy"],
        eval_steps=parameter["eval_steps"],
        save_steps=parameter["save_steps"],
        save_total_limit=parameter["save_total_limit"],
        load_best_model_at_end=parameter["load_best_model_at_end"],
        metric_for_best_model="f1" if not kaggle else "eval_loss",
        greater_is_better=True if not kaggle else False,
        overwrite_output_dir=parameter["overwrite_output_dir"],
        report_to=parameter["report_to"],
        learning_rate=parameter["lr"]
    )

    class FreezingCallback(TrainerCallback):
        def on_epoch_begin(self, args, state, control, model, **kwargs):
            if state.epoch == parameter["epochs_before_unfreeze"]:
                # change learning rate
                optimizer= kwargs["optimizer"]

                for param_group in optimizer.param_groups:
                    param_group['lr'] = parameter["lr"]*parameter["lr_scale_unfreeze"]
                for param in model.base_model.parameters():
                    param.requires_grad = True
                
    class MyTrainer(Trainer):
        def __init__(self, model=None, args=None, train_dataset=None, eval_dataset=None, compute_metrics=None, callbacks=None):
            super().__init__(model=model, args=args, train_dataset=train_dataset, eval_dataset=eval_dataset, compute_metrics=compute_metrics, callbacks=callbacks)
            # Definieren Sie hier Ihre Gewichte für die Klassen, z.B. torch.tensor([1.0, 2.0, 0.5])
            self.weight = torch.tensor(CROSS_ENTROPY_WEIGHTS).to(device)
            self.loss_func=torch.nn.CrossEntropyLoss(ignore_index=-100, weight=torch.tensor(CROSS_ENTROPY_WEIGHTS, dtype=torch.float32).to(device))

        def compute_loss(self, model, inputs, return_outputs=False):
            labels = inputs.get("labels")
            outputs = model(**inputs)
            logits = outputs.get('logits')
            loss = self.loss_func(logits.view(-1, self.model.config.num_labels), labels.view(-1))
            return (loss, outputs) if return_outputs else loss
        
    trainer = MyTrainer(
        model=model,                         # the instantiated 🤗 Transformers model to be trained
        args=training_args,                  # training arguments, defined above
        train_dataset=train_dataloader,         # training dataset
        eval_dataset=valid_dataloader,             # evaluation dataset
        compute_metrics=partial(compute_metrics, all_labels=all_labels) if not kaggle else None,
        callbacks=[neptune_callback, FreezingCallback()] if not kaggle else [FreezingCallback()]
    )
    return trainer

In [12]:


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [13]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"



model = AutoModelForTokenClassification.from_pretrained(
    parameter["model"],
    num_labels=len(all_labels),
    id2label=id2label,
    label2id=label2id,
    ignore_mismatched_sizes=True
)

if parameter['freeze_embeddings']:
    for param in model.deberta.embeddings.parameters():
        param.requires_grad = False
        
if parameter['freeze_layers'] > 0:
    for layer in model.deberta.encoder.layer[:parameter['freeze_layers']]:
        for param in layer.parameters():
            param.requires_grad = False

#my_model=MyModel(parameter['model'], len(label2id))
# set torch seed
torch.manual_seed(189237)
trainer=get_trainer(model, train_ds, valid_ds)
trainer.train()

Some weights of DebertaV2ForTokenClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


https://app.neptune.ai/bernd.heidemann/PII/e/PII-266




  0%|          | 0/16280 [00:00<?, ?it/s]

{'loss': 3.1686, 'learning_rate': 1.0000000000000002e-06, 'epoch': 0.0}
{'loss': 3.0549, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.01}
{'loss': 2.9252, 'learning_rate': 3e-06, 'epoch': 0.01}
{'loss': 2.6947, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.02}
{'loss': 2.4117, 'learning_rate': 5e-06, 'epoch': 0.02}
{'loss': 2.2911, 'learning_rate': 6e-06, 'epoch': 0.03}
{'loss': 2.1482, 'learning_rate': 7.000000000000001e-06, 'epoch': 0.03}
{'loss': 1.621, 'learning_rate': 8.000000000000001e-06, 'epoch': 0.04}
{'loss': 1.7245, 'learning_rate': 9e-06, 'epoch': 0.04}
{'loss': 1.5878, 'learning_rate': 1e-05, 'epoch': 0.05}
{'loss': 0.9594, 'learning_rate': 1.1000000000000001e-05, 'epoch': 0.05}
{'loss': 0.9283, 'learning_rate': 1.2e-05, 'epoch': 0.06}
{'loss': 1.1425, 'learning_rate': 1.3000000000000001e-05, 'epoch': 0.06}
{'loss': 0.7517, 'learning_rate': 1.4000000000000001e-05, 'epoch': 0.07}
{'loss': 1.1812, 'learning_rate': 1.5e-05, 'epoch': 0.07}
{'loss': 0.895, 'learning

  0%|          | 0/1361 [00:00<?, ?it/s]

{'eval_loss': 0.017177848145365715, 'eval_recall': 0.8756641870350691, 'eval_precision': 0.39219419324131366, 'eval_f1': 0.8360259111839539, 'eval_runtime': 42.8958, 'eval_samples_per_second': 31.728, 'eval_steps_per_second': 31.728, 'epoch': 0.2}
{'loss': 0.0939, 'learning_rate': 4.1e-05, 'epoch': 0.2}
{'loss': 0.1047, 'learning_rate': 4.2e-05, 'epoch': 0.21}
{'loss': 0.1579, 'learning_rate': 4.3e-05, 'epoch': 0.21}
{'loss': 0.0586, 'learning_rate': 4.4000000000000006e-05, 'epoch': 0.22}
{'loss': 0.2452, 'learning_rate': 4.5e-05, 'epoch': 0.22}
{'loss': 0.1364, 'learning_rate': 4.600000000000001e-05, 'epoch': 0.23}
{'loss': 0.0505, 'learning_rate': 4.7e-05, 'epoch': 0.23}
{'loss': 0.1826, 'learning_rate': 4.8e-05, 'epoch': 0.24}
{'loss': 0.1755, 'learning_rate': 4.9e-05, 'epoch': 0.24}
{'loss': 0.0638, 'learning_rate': 5e-05, 'epoch': 0.25}
{'loss': 0.0362, 'learning_rate': 4.996831432192649e-05, 'epoch': 0.25}
{'loss': 0.1674, 'learning_rate': 4.993662864385298e-05, 'epoch': 0.26}
{'

  0%|          | 0/1361 [00:00<?, ?it/s]

{'eval_loss': 0.017025966197252274, 'eval_recall': 0.8597236981934112, 'eval_precision': 0.6152091254752852, 'eval_f1': 0.8467793880837359, 'eval_runtime': 40.782, 'eval_samples_per_second': 33.373, 'eval_steps_per_second': 33.373, 'epoch': 0.39}
{'loss': 0.0292, 'learning_rate': 4.901774397972117e-05, 'epoch': 0.4}
{'loss': 0.1757, 'learning_rate': 4.898605830164766e-05, 'epoch': 0.4}
{'loss': 0.0526, 'learning_rate': 4.8954372623574146e-05, 'epoch': 0.41}
{'loss': 0.0526, 'learning_rate': 4.8922686945500636e-05, 'epoch': 0.41}
{'loss': 0.0217, 'learning_rate': 4.8891001267427125e-05, 'epoch': 0.42}
{'loss': 0.0024, 'learning_rate': 4.8859315589353615e-05, 'epoch': 0.42}
{'loss': 0.1464, 'learning_rate': 4.8827629911280104e-05, 'epoch': 0.43}
{'loss': 0.0249, 'learning_rate': 4.8795944233206594e-05, 'epoch': 0.43}
{'loss': 0.0333, 'learning_rate': 4.876425855513308e-05, 'epoch': 0.44}
{'loss': 0.1024, 'learning_rate': 4.873257287705957e-05, 'epoch': 0.44}
{'loss': 0.0363, 'learning_ra

  0%|          | 0/1361 [00:00<?, ?it/s]

{'eval_loss': 0.012808284722268581, 'eval_recall': 0.924548352816153, 'eval_precision': 0.6118143459915611, 'eval_f1': 0.9067222511724855, 'eval_runtime': 40.712, 'eval_samples_per_second': 33.43, 'eval_steps_per_second': 33.43, 'epoch': 0.59}
{'loss': 0.0368, 'learning_rate': 4.775031685678074e-05, 'epoch': 0.59}
{'loss': 0.0917, 'learning_rate': 4.771863117870723e-05, 'epoch': 0.6}
{'loss': 0.1458, 'learning_rate': 4.7686945500633716e-05, 'epoch': 0.6}
{'loss': 0.1424, 'learning_rate': 4.7655259822560206e-05, 'epoch': 0.61}
{'loss': 0.0373, 'learning_rate': 4.7623574144486695e-05, 'epoch': 0.61}
{'loss': 0.1882, 'learning_rate': 4.7591888466413185e-05, 'epoch': 0.62}
{'loss': 0.0279, 'learning_rate': 4.7560202788339674e-05, 'epoch': 0.62}
{'loss': 0.061, 'learning_rate': 4.7528517110266163e-05, 'epoch': 0.63}
{'loss': 0.1488, 'learning_rate': 4.749683143219265e-05, 'epoch': 0.63}
{'loss': 0.1249, 'learning_rate': 4.746514575411914e-05, 'epoch': 0.64}
{'loss': 0.0085, 'learning_rate':

  0%|          | 0/1361 [00:00<?, ?it/s]

{'eval_loss': 0.016459699720144272, 'eval_recall': 0.8533475026567482, 'eval_precision': 0.6636363636363637, 'eval_f1': 0.8440671113806346, 'eval_runtime': 40.743, 'eval_samples_per_second': 33.404, 'eval_steps_per_second': 33.404, 'epoch': 0.79}
{'loss': 0.0022, 'learning_rate': 4.648288973384031e-05, 'epoch': 0.79}
{'loss': 0.3196, 'learning_rate': 4.64512040557668e-05, 'epoch': 0.8}
{'loss': 0.0058, 'learning_rate': 4.641951837769328e-05, 'epoch': 0.8}
{'loss': 0.0288, 'learning_rate': 4.6387832699619776e-05, 'epoch': 0.81}
{'loss': 0.027, 'learning_rate': 4.6356147021546265e-05, 'epoch': 0.81}
{'loss': 0.0471, 'learning_rate': 4.6324461343472755e-05, 'epoch': 0.82}
{'loss': 0.0789, 'learning_rate': 4.629277566539924e-05, 'epoch': 0.82}
{'loss': 0.0114, 'learning_rate': 4.626108998732573e-05, 'epoch': 0.83}
{'loss': 0.1661, 'learning_rate': 4.622940430925222e-05, 'epoch': 0.83}
{'loss': 0.0023, 'learning_rate': 4.619771863117871e-05, 'epoch': 0.84}
{'loss': 0.0603, 'learning_rate': 

  0%|          | 0/1361 [00:00<?, ?it/s]

{'eval_loss': 0.009423228912055492, 'eval_recall': 0.9511158342189161, 'eval_precision': 0.6719219219219219, 'eval_f1': 0.9361548054873878, 'eval_runtime': 40.7583, 'eval_samples_per_second': 33.392, 'eval_steps_per_second': 33.392, 'epoch': 0.98}
{'loss': 0.002, 'learning_rate': 4.521546261089988e-05, 'epoch': 0.99}
{'loss': 0.001, 'learning_rate': 4.518377693282637e-05, 'epoch': 0.99}
{'loss': 0.002, 'learning_rate': 4.5152091254752856e-05, 'epoch': 1.0}
{'loss': 0.0013, 'learning_rate': 4.512040557667934e-05, 'epoch': 1.0}
{'loss': 0.0044, 'learning_rate': 4.508871989860583e-05, 'epoch': 1.01}
{'loss': 0.0154, 'learning_rate': 4.5057034220532325e-05, 'epoch': 1.01}
{'loss': 0.0026, 'learning_rate': 4.5025348542458814e-05, 'epoch': 1.02}
{'loss': 0.0044, 'learning_rate': 4.49936628643853e-05, 'epoch': 1.02}
{'loss': 0.0018, 'learning_rate': 4.4961977186311786e-05, 'epoch': 1.03}
{'loss': 0.0175, 'learning_rate': 4.4930291508238275e-05, 'epoch': 1.03}
{'loss': 0.0073, 'learning_rate':

  0%|          | 0/1361 [00:00<?, ?it/s]

{'eval_loss': 0.016663121059536934, 'eval_recall': 0.9500531349628055, 'eval_precision': 0.6182572614107884, 'eval_f1': 0.9308397741380001, 'eval_runtime': 45.0143, 'eval_samples_per_second': 30.235, 'eval_steps_per_second': 30.235, 'epoch': 1.18}
{'loss': 0.0202, 'learning_rate': 4.394803548795944e-05, 'epoch': 1.18}
{'loss': 0.0005, 'learning_rate': 4.391634980988593e-05, 'epoch': 1.19}
{'loss': 0.0052, 'learning_rate': 4.3884664131812426e-05, 'epoch': 1.19}
{'loss': 0.0067, 'learning_rate': 4.3852978453738916e-05, 'epoch': 1.2}
{'loss': 0.0065, 'learning_rate': 4.38212927756654e-05, 'epoch': 1.2}
{'loss': 0.0558, 'learning_rate': 4.378960709759189e-05, 'epoch': 1.21}
{'loss': 0.0367, 'learning_rate': 4.375792141951838e-05, 'epoch': 1.21}
{'loss': 0.0411, 'learning_rate': 4.3726235741444873e-05, 'epoch': 1.22}
{'loss': 0.0245, 'learning_rate': 4.3694550063371356e-05, 'epoch': 1.22}
{'loss': 0.3568, 'learning_rate': 4.3662864385297845e-05, 'epoch': 1.23}
{'loss': 0.3242, 'learning_rat

  0%|          | 0/1361 [00:00<?, ?it/s]

{'eval_loss': 0.012846563011407852, 'eval_recall': 0.9298618490967057, 'eval_precision': 0.6097560975609756, 'eval_f1': 0.9114583333333335, 'eval_runtime': 43.6942, 'eval_samples_per_second': 31.148, 'eval_steps_per_second': 31.148, 'epoch': 1.38}
{'loss': 0.0038, 'learning_rate': 4.268060836501902e-05, 'epoch': 1.38}
{'loss': 0.0015, 'learning_rate': 4.26489226869455e-05, 'epoch': 1.39}
{'loss': 0.0016, 'learning_rate': 4.261723700887199e-05, 'epoch': 1.39}
{'loss': 0.0438, 'learning_rate': 4.258555133079848e-05, 'epoch': 1.4}
{'loss': 0.0014, 'learning_rate': 4.2553865652724975e-05, 'epoch': 1.4}
{'loss': 0.0681, 'learning_rate': 4.252217997465146e-05, 'epoch': 1.41}
{'loss': 0.0583, 'learning_rate': 4.249049429657795e-05, 'epoch': 1.41}
{'loss': 0.0025, 'learning_rate': 4.2458808618504437e-05, 'epoch': 1.42}
{'loss': 0.0035, 'learning_rate': 4.2427122940430926e-05, 'epoch': 1.42}
{'loss': 0.082, 'learning_rate': 4.2395437262357415e-05, 'epoch': 1.43}
{'loss': 0.0025, 'learning_rate'

  0%|          | 0/1361 [00:00<?, ?it/s]

{'eval_loss': 0.008063482120633125, 'eval_recall': 0.9744952178533475, 'eval_precision': 0.6221166892808684, 'eval_f1': 0.9537181487259488, 'eval_runtime': 41.7488, 'eval_samples_per_second': 32.6, 'eval_steps_per_second': 32.6, 'epoch': 1.57}
{'loss': 0.0301, 'learning_rate': 4.141318124207858e-05, 'epoch': 1.58}
{'loss': 0.0008, 'learning_rate': 4.138149556400508e-05, 'epoch': 1.58}
{'loss': 0.0036, 'learning_rate': 4.134980988593156e-05, 'epoch': 1.59}
{'loss': 0.0031, 'learning_rate': 4.131812420785805e-05, 'epoch': 1.59}
{'loss': 0.0014, 'learning_rate': 4.128643852978454e-05, 'epoch': 1.6}
{'loss': 0.0039, 'learning_rate': 4.125475285171103e-05, 'epoch': 1.6}
{'loss': 0.0169, 'learning_rate': 4.122306717363752e-05, 'epoch': 1.61}
{'loss': 0.0218, 'learning_rate': 4.1191381495564007e-05, 'epoch': 1.61}
{'loss': 0.0311, 'learning_rate': 4.1159695817490496e-05, 'epoch': 1.62}
{'loss': 0.1154, 'learning_rate': 4.1128010139416985e-05, 'epoch': 1.62}
{'loss': 0.019, 'learning_rate': 4.

  0%|          | 0/1361 [00:00<?, ?it/s]

{'eval_loss': 0.014723185449838638, 'eval_recall': 0.953241232731137, 'eval_precision': 0.6467195385724586, 'eval_f1': 0.9361753371868977, 'eval_runtime': 40.7887, 'eval_samples_per_second': 33.367, 'eval_steps_per_second': 33.367, 'epoch': 1.77}
{'loss': 0.0011, 'learning_rate': 4.014575411913815e-05, 'epoch': 1.77}
{'loss': 0.0007, 'learning_rate': 4.011406844106464e-05, 'epoch': 1.78}
{'loss': 0.0245, 'learning_rate': 4.008238276299113e-05, 'epoch': 1.78}
{'loss': 0.0222, 'learning_rate': 4.005069708491762e-05, 'epoch': 1.79}
{'loss': 0.2042, 'learning_rate': 4.001901140684411e-05, 'epoch': 1.79}
{'loss': 0.0237, 'learning_rate': 3.99873257287706e-05, 'epoch': 1.8}
{'loss': 0.0036, 'learning_rate': 3.995564005069709e-05, 'epoch': 1.8}
{'loss': 0.0043, 'learning_rate': 3.9923954372623577e-05, 'epoch': 1.81}
{'loss': 0.0032, 'learning_rate': 3.9892268694550066e-05, 'epoch': 1.81}
{'loss': 0.0021, 'learning_rate': 3.9860583016476555e-05, 'epoch': 1.82}
{'loss': 0.0044, 'learning_rate':

  0%|          | 0/1361 [00:00<?, ?it/s]

{'eval_loss': 0.00828137993812561, 'eval_recall': 0.9840595111583422, 'eval_precision': 0.653954802259887, 'eval_f1': 0.9653181508359729, 'eval_runtime': 40.7914, 'eval_samples_per_second': 33.365, 'eval_steps_per_second': 33.365, 'epoch': 1.97}
{'loss': 0.1983, 'learning_rate': 3.887832699619772e-05, 'epoch': 1.97}
{'loss': 0.001, 'learning_rate': 3.884664131812421e-05, 'epoch': 1.98}
{'loss': 0.0007, 'learning_rate': 3.88149556400507e-05, 'epoch': 1.98}
{'loss': 0.0132, 'learning_rate': 3.878326996197719e-05, 'epoch': 1.99}
{'loss': 0.0067, 'learning_rate': 3.875158428390367e-05, 'epoch': 1.99}
{'loss': 0.003, 'learning_rate': 3.871989860583017e-05, 'epoch': 2.0}
{'loss': 0.0077, 'learning_rate': 3.868821292775666e-05, 'epoch': 2.0}
{'loss': 0.0054, 'learning_rate': 3.8656527249683146e-05, 'epoch': 2.0}
{'loss': 0.0007, 'learning_rate': 3.862484157160963e-05, 'epoch': 2.01}
{'loss': 0.0013, 'learning_rate': 3.8593155893536125e-05, 'epoch': 2.01}
{'loss': 0.0019, 'learning_rate': 3.85

  0%|          | 0/1361 [00:00<?, ?it/s]

{'eval_loss': 0.013922950252890587, 'eval_recall': 0.9362380446333688, 'eval_precision': 0.6834755624515128, 'eval_f1': 0.9231079229467235, 'eval_runtime': 40.8005, 'eval_samples_per_second': 33.357, 'eval_steps_per_second': 33.357, 'epoch': 2.16}
{'loss': 0.0389, 'learning_rate': 3.761089987325729e-05, 'epoch': 2.17}
{'loss': 0.001, 'learning_rate': 3.757921419518377e-05, 'epoch': 2.17}
{'loss': 0.0019, 'learning_rate': 3.754752851711027e-05, 'epoch': 2.18}
{'loss': 0.0018, 'learning_rate': 3.751584283903676e-05, 'epoch': 2.18}
{'loss': 0.0822, 'learning_rate': 3.748415716096325e-05, 'epoch': 2.19}
{'loss': 0.0038, 'learning_rate': 3.745247148288973e-05, 'epoch': 2.19}
{'loss': 0.0011, 'learning_rate': 3.742078580481623e-05, 'epoch': 2.2}
{'loss': 0.0211, 'learning_rate': 3.7389100126742716e-05, 'epoch': 2.2}
{'loss': 0.0011, 'learning_rate': 3.7357414448669206e-05, 'epoch': 2.21}
{'loss': 0.0304, 'learning_rate': 3.732572877059569e-05, 'epoch': 2.21}
{'loss': 0.0238, 'learning_rate':

  0%|          | 0/1361 [00:00<?, ?it/s]

{'eval_loss': 0.01817256398499012, 'eval_recall': 0.9319872476089267, 'eval_precision': 0.6910953506698188, 'eval_f1': 0.919657981769783, 'eval_runtime': 40.8187, 'eval_samples_per_second': 33.343, 'eval_steps_per_second': 33.343, 'epoch': 2.36}
{'loss': 0.1329, 'learning_rate': 3.634347275031686e-05, 'epoch': 2.36}
{'loss': 0.008, 'learning_rate': 3.631178707224335e-05, 'epoch': 2.37}
{'loss': 0.0013, 'learning_rate': 3.628010139416983e-05, 'epoch': 2.37}
{'loss': 0.0003, 'learning_rate': 3.624841571609633e-05, 'epoch': 2.38}
{'loss': 0.0018, 'learning_rate': 3.621673003802282e-05, 'epoch': 2.38}
{'loss': 0.001, 'learning_rate': 3.618504435994931e-05, 'epoch': 2.39}
{'loss': 0.0033, 'learning_rate': 3.615335868187579e-05, 'epoch': 2.39}
{'loss': 0.0014, 'learning_rate': 3.612167300380228e-05, 'epoch': 2.4}
{'loss': 0.0003, 'learning_rate': 3.6089987325728776e-05, 'epoch': 2.4}
{'loss': 0.0011, 'learning_rate': 3.6058301647655265e-05, 'epoch': 2.41}
{'loss': 0.0004, 'learning_rate': 3.

  0%|          | 0/1361 [00:00<?, ?it/s]

{'eval_loss': 0.018895886838436127, 'eval_recall': 0.971307120085016, 'eval_precision': 0.7412814274128142, 'eval_f1': 0.9598513611761855, 'eval_runtime': 40.863, 'eval_samples_per_second': 33.306, 'eval_steps_per_second': 33.306, 'epoch': 2.56}
{'loss': 0.0001, 'learning_rate': 3.5076045627376424e-05, 'epoch': 2.56}
{'loss': 0.0001, 'learning_rate': 3.504435994930292e-05, 'epoch': 2.57}
{'loss': 0.0002, 'learning_rate': 3.501267427122941e-05, 'epoch': 2.57}
{'loss': 0.0012, 'learning_rate': 3.498098859315589e-05, 'epoch': 2.57}
{'loss': 0.0001, 'learning_rate': 3.494930291508238e-05, 'epoch': 2.58}
{'loss': 0.0003, 'learning_rate': 3.491761723700888e-05, 'epoch': 2.58}
{'loss': 0.0001, 'learning_rate': 3.488593155893537e-05, 'epoch': 2.59}
{'loss': 0.0001, 'learning_rate': 3.485424588086185e-05, 'epoch': 2.59}
{'loss': 0.0201, 'learning_rate': 3.482256020278834e-05, 'epoch': 2.6}
{'loss': 0.0002, 'learning_rate': 3.479087452471483e-05, 'epoch': 2.6}
{'loss': 0.0123, 'learning_rate': 3

  0%|          | 0/1361 [00:00<?, ?it/s]

{'eval_loss': 0.011199736967682838, 'eval_recall': 0.9574920297555791, 'eval_precision': 0.6620132255694342, 'eval_f1': 0.9413324760909749, 'eval_runtime': 40.8295, 'eval_samples_per_second': 33.334, 'eval_steps_per_second': 33.334, 'epoch': 2.75}
{'loss': 0.0002, 'learning_rate': 3.3808618504435994e-05, 'epoch': 2.76}
{'loss': 0.0015, 'learning_rate': 3.377693282636248e-05, 'epoch': 2.76}
{'loss': 0.0363, 'learning_rate': 3.374524714828898e-05, 'epoch': 2.77}
{'loss': 0.0421, 'learning_rate': 3.371356147021547e-05, 'epoch': 2.77}
{'loss': 0.0009, 'learning_rate': 3.368187579214195e-05, 'epoch': 2.78}
{'loss': 0.0003, 'learning_rate': 3.365019011406844e-05, 'epoch': 2.78}
{'loss': 0.0006, 'learning_rate': 3.361850443599493e-05, 'epoch': 2.79}
{'loss': 0.0004, 'learning_rate': 3.3586818757921426e-05, 'epoch': 2.79}
{'loss': 0.0286, 'learning_rate': 3.355513307984791e-05, 'epoch': 2.8}
{'loss': 0.0013, 'learning_rate': 3.35234474017744e-05, 'epoch': 2.8}
{'loss': 0.0031, 'learning_rate':

  0%|          | 0/1361 [00:00<?, ?it/s]

{'eval_loss': 0.01581403985619545, 'eval_recall': 0.9585547290116897, 'eval_precision': 0.7210231814548361, 'eval_f1': 0.9465611882466902, 'eval_runtime': 40.8659, 'eval_samples_per_second': 33.304, 'eval_steps_per_second': 33.304, 'epoch': 2.95}
{'loss': 0.0185, 'learning_rate': 3.254119138149557e-05, 'epoch': 2.95}
{'loss': 0.0001, 'learning_rate': 3.250950570342205e-05, 'epoch': 2.96}
{'loss': 0.0006, 'learning_rate': 3.247782002534854e-05, 'epoch': 2.96}
{'loss': 0.0003, 'learning_rate': 3.244613434727503e-05, 'epoch': 2.97}
{'loss': 0.0001, 'learning_rate': 3.241444866920153e-05, 'epoch': 2.97}
{'loss': 0.007, 'learning_rate': 3.238276299112801e-05, 'epoch': 2.98}
{'loss': 0.0031, 'learning_rate': 3.23510773130545e-05, 'epoch': 2.98}
{'loss': 0.0007, 'learning_rate': 3.231939163498099e-05, 'epoch': 2.99}
{'loss': 0.0003, 'learning_rate': 3.228770595690748e-05, 'epoch': 2.99}
{'loss': 0.0001, 'learning_rate': 3.225602027883397e-05, 'epoch': 3.0}
{'loss': 0.0044, 'learning_rate': 3.

  0%|          | 0/1361 [00:00<?, ?it/s]

{'eval_loss': 0.007795074488967657, 'eval_recall': 0.9553666312433581, 'eval_precision': 0.7454394693200663, 'eval_f1': 0.9451295944361328, 'eval_runtime': 40.8187, 'eval_samples_per_second': 33.343, 'eval_steps_per_second': 33.343, 'epoch': 3.14}
{'loss': 0.0002, 'learning_rate': 3.1273764258555134e-05, 'epoch': 3.15}
{'loss': 0.0002, 'learning_rate': 3.124207858048162e-05, 'epoch': 3.15}
{'loss': 0.0002, 'learning_rate': 3.121039290240811e-05, 'epoch': 3.16}
{'loss': 0.0002, 'learning_rate': 3.11787072243346e-05, 'epoch': 3.16}
{'loss': 0.0001, 'learning_rate': 3.114702154626109e-05, 'epoch': 3.17}
{'loss': 0.0005, 'learning_rate': 3.111533586818758e-05, 'epoch': 3.17}
{'loss': 0.0008, 'learning_rate': 3.108365019011407e-05, 'epoch': 3.18}
{'loss': 0.0007, 'learning_rate': 3.105196451204056e-05, 'epoch': 3.18}
{'loss': 0.0006, 'learning_rate': 3.102027883396705e-05, 'epoch': 3.19}
{'loss': 0.0004, 'learning_rate': 3.098859315589354e-05, 'epoch': 3.19}
{'loss': 0.0001, 'learning_rate'

  0%|          | 0/1361 [00:00<?, ?it/s]

{'eval_loss': 0.022507846355438232, 'eval_recall': 0.9479277364505845, 'eval_precision': 0.7420965058236273, 'eval_f1': 0.9379221094350305, 'eval_runtime': 40.8409, 'eval_samples_per_second': 33.324, 'eval_steps_per_second': 33.324, 'epoch': 3.34}
{'loss': 0.0, 'learning_rate': 3.0006337135614704e-05, 'epoch': 3.35}
{'loss': 0.0019, 'learning_rate': 2.9974651457541193e-05, 'epoch': 3.35}
{'loss': 0.0008, 'learning_rate': 2.994296577946768e-05, 'epoch': 3.36}
{'loss': 0.0, 'learning_rate': 2.9911280101394172e-05, 'epoch': 3.36}
{'loss': 0.0463, 'learning_rate': 2.987959442332066e-05, 'epoch': 3.37}
{'loss': 0.0044, 'learning_rate': 2.984790874524715e-05, 'epoch': 3.37}
{'loss': 0.0018, 'learning_rate': 2.9816223067173637e-05, 'epoch': 3.38}
{'loss': 0.0002, 'learning_rate': 2.9784537389100126e-05, 'epoch': 3.38}
{'loss': 0.1088, 'learning_rate': 2.975285171102662e-05, 'epoch': 3.39}
{'loss': 0.003, 'learning_rate': 2.972116603295311e-05, 'epoch': 3.39}
{'loss': 0.0007, 'learning_rate': 

  0%|          | 0/1361 [00:00<?, ?it/s]

{'eval_loss': 0.021322352811694145, 'eval_recall': 0.9373007438894793, 'eval_precision': 0.7368421052631579, 'eval_f1': 0.9275948547852115, 'eval_runtime': 40.8797, 'eval_samples_per_second': 33.293, 'eval_steps_per_second': 33.293, 'epoch': 3.54}
{'loss': 0.0034, 'learning_rate': 2.8738910012674274e-05, 'epoch': 3.54}
{'loss': 0.1722, 'learning_rate': 2.8707224334600763e-05, 'epoch': 3.55}
{'loss': 0.0177, 'learning_rate': 2.8675538656527252e-05, 'epoch': 3.55}
{'loss': 0.0004, 'learning_rate': 2.864385297845374e-05, 'epoch': 3.56}
{'loss': 0.0002, 'learning_rate': 2.8612167300380228e-05, 'epoch': 3.56}
{'loss': 0.001, 'learning_rate': 2.858048162230672e-05, 'epoch': 3.57}
{'loss': 0.0053, 'learning_rate': 2.854879594423321e-05, 'epoch': 3.57}
{'loss': 0.0002, 'learning_rate': 2.8517110266159696e-05, 'epoch': 3.58}
{'loss': 0.0001, 'learning_rate': 2.8485424588086185e-05, 'epoch': 3.58}
{'loss': 0.0012, 'learning_rate': 2.8453738910012678e-05, 'epoch': 3.59}
{'loss': 0.0008, 'learning

  0%|          | 0/1361 [00:00<?, ?it/s]

{'eval_loss': 0.024333620443940163, 'eval_recall': 0.9426142401700319, 'eval_precision': 0.7726480836236934, 'eval_f1': 0.9347059538767074, 'eval_runtime': 40.8387, 'eval_samples_per_second': 33.326, 'eval_steps_per_second': 33.326, 'epoch': 3.73}
{'loss': 0.0003, 'learning_rate': 2.747148288973384e-05, 'epoch': 3.74}
{'loss': 0.0051, 'learning_rate': 2.743979721166033e-05, 'epoch': 3.74}
{'loss': 0.0017, 'learning_rate': 2.7408111533586822e-05, 'epoch': 3.75}
{'loss': 0.0012, 'learning_rate': 2.7376425855513312e-05, 'epoch': 3.75}
{'loss': 0.0007, 'learning_rate': 2.7344740177439798e-05, 'epoch': 3.76}
{'loss': 0.0036, 'learning_rate': 2.7313054499366287e-05, 'epoch': 3.76}
{'loss': 0.0003, 'learning_rate': 2.7281368821292773e-05, 'epoch': 3.77}
{'loss': 0.0011, 'learning_rate': 2.724968314321927e-05, 'epoch': 3.77}
{'loss': 0.0, 'learning_rate': 2.7217997465145755e-05, 'epoch': 3.78}
{'loss': 0.0031, 'learning_rate': 2.7186311787072245e-05, 'epoch': 3.78}
{'loss': 0.0815, 'learning_r

  0%|          | 0/1361 [00:00<?, ?it/s]

{'eval_loss': 0.017891524359583855, 'eval_recall': 0.9628055260361318, 'eval_precision': 0.7587939698492462, 'eval_f1': 0.952951171163882, 'eval_runtime': 40.8769, 'eval_samples_per_second': 33.295, 'eval_steps_per_second': 33.295, 'epoch': 3.93}
{'loss': 0.0018, 'learning_rate': 2.620405576679341e-05, 'epoch': 3.94}
{'loss': 0.0, 'learning_rate': 2.61723700887199e-05, 'epoch': 3.94}
{'loss': 0.0002, 'learning_rate': 2.614068441064639e-05, 'epoch': 3.95}
{'loss': 0.0011, 'learning_rate': 2.6108998732572875e-05, 'epoch': 3.95}
{'loss': 0.0066, 'learning_rate': 2.6077313054499368e-05, 'epoch': 3.96}
{'loss': 0.2742, 'learning_rate': 2.6045627376425857e-05, 'epoch': 3.96}
{'loss': 0.0007, 'learning_rate': 2.6013941698352347e-05, 'epoch': 3.97}
{'loss': 0.0027, 'learning_rate': 2.5982256020278833e-05, 'epoch': 3.97}
{'loss': 0.0039, 'learning_rate': 2.5950570342205325e-05, 'epoch': 3.98}
{'loss': 0.0017, 'learning_rate': 2.5918884664131815e-05, 'epoch': 3.98}
{'loss': 0.0005, 'learning_rat

  0%|          | 0/1361 [00:00<?, ?it/s]

{'eval_loss': 0.01723979413509369, 'eval_recall': 0.9543039319872476, 'eval_precision': 0.7115689381933439, 'eval_f1': 0.9419453745915198, 'eval_runtime': 40.8897, 'eval_samples_per_second': 33.285, 'eval_steps_per_second': 33.285, 'epoch': 4.13}
{'loss': 0.0001, 'learning_rate': 2.493662864385298e-05, 'epoch': 4.13}
{'loss': 0.0001, 'learning_rate': 2.490494296577947e-05, 'epoch': 4.14}
{'loss': 0.0041, 'learning_rate': 2.487325728770596e-05, 'epoch': 4.14}
{'loss': 0.0005, 'learning_rate': 2.4841571609632448e-05, 'epoch': 4.15}
{'loss': 0.002, 'learning_rate': 2.4809885931558938e-05, 'epoch': 4.15}
{'loss': 0.0002, 'learning_rate': 2.4778200253485427e-05, 'epoch': 4.16}
{'loss': 0.0003, 'learning_rate': 2.4746514575411913e-05, 'epoch': 4.16}
{'loss': 0.0017, 'learning_rate': 2.4714828897338406e-05, 'epoch': 4.17}
{'loss': 0.0001, 'learning_rate': 2.4683143219264892e-05, 'epoch': 4.17}
{'loss': 0.0022, 'learning_rate': 2.4651457541191385e-05, 'epoch': 4.18}
{'loss': 0.003, 'learning_r

  0%|          | 0/1361 [00:00<?, ?it/s]

{'eval_loss': 0.008849642239511013, 'eval_recall': 0.9628055260361318, 'eval_precision': 0.7122641509433962, 'eval_f1': 0.949953623422188, 'eval_runtime': 40.9074, 'eval_samples_per_second': 33.27, 'eval_steps_per_second': 33.27, 'epoch': 4.32}
{'loss': 0.0005, 'learning_rate': 2.366920152091255e-05, 'epoch': 4.33}
{'loss': 0.0, 'learning_rate': 2.363751584283904e-05, 'epoch': 4.33}
{'loss': 0.0005, 'learning_rate': 2.360583016476553e-05, 'epoch': 4.34}
{'loss': 0.0008, 'learning_rate': 2.3574144486692015e-05, 'epoch': 4.34}
{'loss': 0.0992, 'learning_rate': 2.3542458808618508e-05, 'epoch': 4.35}
{'loss': 0.0008, 'learning_rate': 2.3510773130544994e-05, 'epoch': 4.35}
{'loss': 0.0108, 'learning_rate': 2.3479087452471487e-05, 'epoch': 4.36}
{'loss': 0.0001, 'learning_rate': 2.3447401774397973e-05, 'epoch': 4.36}
{'loss': 0.0, 'learning_rate': 2.3415716096324465e-05, 'epoch': 4.37}
{'loss': 0.0043, 'learning_rate': 2.338403041825095e-05, 'epoch': 4.37}
{'loss': 0.0001, 'learning_rate': 2

  0%|          | 0/1361 [00:00<?, ?it/s]

{'eval_loss': 0.0117284981533885, 'eval_recall': 0.9670563230605739, 'eval_precision': 0.7268370607028753, 'eval_f1': 0.9549178673770027, 'eval_runtime': 40.9268, 'eval_samples_per_second': 33.254, 'eval_steps_per_second': 33.254, 'epoch': 4.52}
{'loss': 0.0002, 'learning_rate': 2.2401774397972117e-05, 'epoch': 4.53}
{'loss': 0.0005, 'learning_rate': 2.2370088719898606e-05, 'epoch': 4.53}
{'loss': 0.0, 'learning_rate': 2.2338403041825095e-05, 'epoch': 4.54}
{'loss': 0.0006, 'learning_rate': 2.2306717363751585e-05, 'epoch': 4.54}
{'loss': 0.0045, 'learning_rate': 2.2275031685678074e-05, 'epoch': 4.55}
{'loss': 0.0003, 'learning_rate': 2.2243346007604564e-05, 'epoch': 4.55}
{'loss': 0.0101, 'learning_rate': 2.2211660329531053e-05, 'epoch': 4.56}
{'loss': 0.0, 'learning_rate': 2.2179974651457543e-05, 'epoch': 4.56}
{'loss': 0.0006, 'learning_rate': 2.2148288973384032e-05, 'epoch': 4.57}
{'loss': 0.0, 'learning_rate': 2.211660329531052e-05, 'epoch': 4.57}
{'loss': 0.0002, 'learning_rate': 

  0%|          | 0/1361 [00:00<?, ?it/s]

{'eval_loss': 0.02738422341644764, 'eval_recall': 0.9181721572794899, 'eval_precision': 0.8260038240917782, 'eval_f1': 0.9142485043343779, 'eval_runtime': 40.9157, 'eval_samples_per_second': 33.263, 'eval_steps_per_second': 33.263, 'epoch': 4.72}
{'loss': 0.0001, 'learning_rate': 2.1134347275031687e-05, 'epoch': 4.72}
{'loss': 0.0053, 'learning_rate': 2.1102661596958176e-05, 'epoch': 4.73}
{'loss': 0.0002, 'learning_rate': 2.1070975918884665e-05, 'epoch': 4.73}
{'loss': 0.0012, 'learning_rate': 2.1039290240811155e-05, 'epoch': 4.74}
{'loss': 0.0014, 'learning_rate': 2.1007604562737644e-05, 'epoch': 4.74}
{'loss': 0.0004, 'learning_rate': 2.0975918884664134e-05, 'epoch': 4.75}
{'loss': 0.0, 'learning_rate': 2.0944233206590623e-05, 'epoch': 4.75}
{'loss': 0.011, 'learning_rate': 2.0912547528517112e-05, 'epoch': 4.76}
{'loss': 0.0002, 'learning_rate': 2.0880861850443602e-05, 'epoch': 4.76}
{'loss': 0.0013, 'learning_rate': 2.0849176172370088e-05, 'epoch': 4.77}
{'loss': 0.0001, 'learning_

  0%|          | 0/1361 [00:00<?, ?it/s]

{'eval_loss': 0.019115926697850227, 'eval_recall': 0.944739638682253, 'eval_precision': 0.7553101104502974, 'eval_f1': 0.935713707392114, 'eval_runtime': 40.9481, 'eval_samples_per_second': 33.237, 'eval_steps_per_second': 33.237, 'epoch': 4.91}
{'loss': 0.0, 'learning_rate': 1.9866920152091257e-05, 'epoch': 4.92}
{'loss': 0.0018, 'learning_rate': 1.9835234474017746e-05, 'epoch': 4.92}
{'loss': 0.0001, 'learning_rate': 1.9803548795944235e-05, 'epoch': 4.93}
{'loss': 0.0004, 'learning_rate': 1.9771863117870725e-05, 'epoch': 4.93}
{'loss': 0.0223, 'learning_rate': 1.9740177439797214e-05, 'epoch': 4.94}
{'loss': 0.0, 'learning_rate': 1.9708491761723704e-05, 'epoch': 4.94}
{'loss': 0.001, 'learning_rate': 1.967680608365019e-05, 'epoch': 4.95}
{'loss': 0.0088, 'learning_rate': 1.9645120405576682e-05, 'epoch': 4.95}
{'loss': 0.0002, 'learning_rate': 1.961343472750317e-05, 'epoch': 4.96}
{'loss': 0.0, 'learning_rate': 1.958174904942966e-05, 'epoch': 4.96}
{'loss': 0.0002, 'learning_rate': 1.9

  0%|          | 0/1361 [00:00<?, ?it/s]

{'eval_loss': 0.028568949550390244, 'eval_recall': 0.9362380446333688, 'eval_precision': 0.7866071428571428, 'eval_f1': 0.9294380198823291, 'eval_runtime': 40.9374, 'eval_samples_per_second': 33.246, 'eval_steps_per_second': 33.246, 'epoch': 5.11}
{'loss': 0.0001, 'learning_rate': 1.8599493029150823e-05, 'epoch': 5.12}
{'loss': 0.0003, 'learning_rate': 1.8567807351077313e-05, 'epoch': 5.12}
{'loss': 0.0001, 'learning_rate': 1.8536121673003802e-05, 'epoch': 5.13}
{'loss': 0.0001, 'learning_rate': 1.850443599493029e-05, 'epoch': 5.13}
{'loss': 0.0, 'learning_rate': 1.847275031685678e-05, 'epoch': 5.14}
{'loss': 0.0008, 'learning_rate': 1.844106463878327e-05, 'epoch': 5.14}
{'loss': 0.0001, 'learning_rate': 1.840937896070976e-05, 'epoch': 5.14}
{'loss': 0.0011, 'learning_rate': 1.837769328263625e-05, 'epoch': 5.15}
{'loss': 0.0002, 'learning_rate': 1.834600760456274e-05, 'epoch': 5.15}
{'loss': 0.0, 'learning_rate': 1.8314321926489228e-05, 'epoch': 5.16}
{'loss': 0.0001, 'learning_rate': 

  0%|          | 0/1361 [00:00<?, ?it/s]

{'eval_loss': 0.026517555117607117, 'eval_recall': 0.9043570669500531, 'eval_precision': 0.8166986564299424, 'eval_f1': 0.9006390686693534, 'eval_runtime': 40.9824, 'eval_samples_per_second': 33.209, 'eval_steps_per_second': 33.209, 'epoch': 5.31}
{'loss': 0.0, 'learning_rate': 1.7332065906210393e-05, 'epoch': 5.31}
{'loss': 0.0004, 'learning_rate': 1.7300380228136882e-05, 'epoch': 5.32}
{'loss': 0.0, 'learning_rate': 1.7268694550063372e-05, 'epoch': 5.32}
{'loss': 0.0002, 'learning_rate': 1.723700887198986e-05, 'epoch': 5.33}
{'loss': 0.0, 'learning_rate': 1.720532319391635e-05, 'epoch': 5.33}
{'loss': 0.0905, 'learning_rate': 1.717363751584284e-05, 'epoch': 5.34}
{'loss': 0.0131, 'learning_rate': 1.714195183776933e-05, 'epoch': 5.34}
{'loss': 0.0001, 'learning_rate': 1.711026615969582e-05, 'epoch': 5.35}
{'loss': 0.0004, 'learning_rate': 1.707858048162231e-05, 'epoch': 5.35}
{'loss': 0.0001, 'learning_rate': 1.7046894803548798e-05, 'epoch': 5.36}
{'loss': 0.0182, 'learning_rate': 1.7

  0%|          | 0/1361 [00:00<?, ?it/s]

{'eval_loss': 0.022424673661589622, 'eval_recall': 0.9606801275239107, 'eval_precision': 0.7719897523484202, 'eval_f1': 0.9517330741820538, 'eval_runtime': 40.9745, 'eval_samples_per_second': 33.216, 'eval_steps_per_second': 33.216, 'epoch': 5.5}
{'loss': 0.0, 'learning_rate': 1.6064638783269963e-05, 'epoch': 5.51}
{'loss': 0.0173, 'learning_rate': 1.6032953105196452e-05, 'epoch': 5.51}
{'loss': 0.0001, 'learning_rate': 1.600126742712294e-05, 'epoch': 5.52}
{'loss': 0.0001, 'learning_rate': 1.596958174904943e-05, 'epoch': 5.52}
{'loss': 0.0004, 'learning_rate': 1.5937896070975917e-05, 'epoch': 5.53}
{'loss': 0.0021, 'learning_rate': 1.590621039290241e-05, 'epoch': 5.53}
{'loss': 0.0009, 'learning_rate': 1.5874524714828896e-05, 'epoch': 5.54}
{'loss': 0.0001, 'learning_rate': 1.584283903675539e-05, 'epoch': 5.54}
{'loss': 0.0, 'learning_rate': 1.5811153358681875e-05, 'epoch': 5.55}
{'loss': 0.0011, 'learning_rate': 1.5779467680608364e-05, 'epoch': 5.55}
{'loss': 0.0001, 'learning_rate':

  0%|          | 0/1361 [00:00<?, ?it/s]

{'eval_loss': 0.01637597754597664, 'eval_recall': 0.9638682252922423, 'eval_precision': 0.7846020761245674, 'eval_f1': 0.9554718204286698, 'eval_runtime': 40.9575, 'eval_samples_per_second': 33.23, 'eval_steps_per_second': 33.23, 'epoch': 5.7}
{'loss': 0.0015, 'learning_rate': 1.4797211660329533e-05, 'epoch': 5.71}
{'loss': 0.0, 'learning_rate': 1.476552598225602e-05, 'epoch': 5.71}
{'loss': 0.0, 'learning_rate': 1.4733840304182512e-05, 'epoch': 5.71}
{'loss': 0.0, 'learning_rate': 1.4702154626109e-05, 'epoch': 5.72}
{'loss': 0.0, 'learning_rate': 1.4670468948035487e-05, 'epoch': 5.72}
{'loss': 0.0001, 'learning_rate': 1.4638783269961978e-05, 'epoch': 5.73}
{'loss': 0.0001, 'learning_rate': 1.4607097591888466e-05, 'epoch': 5.73}
{'loss': 0.0003, 'learning_rate': 1.4575411913814957e-05, 'epoch': 5.74}
{'loss': 0.0, 'learning_rate': 1.4543726235741445e-05, 'epoch': 5.74}
{'loss': 0.0, 'learning_rate': 1.4512040557667936e-05, 'epoch': 5.75}
{'loss': 0.0001, 'learning_rate': 1.448035487959

  0%|          | 0/1361 [00:00<?, ?it/s]

{'eval_loss': 0.02516833320260048, 'eval_recall': 0.922422954303932, 'eval_precision': 0.8250950570342205, 'eval_f1': 0.9182569068641412, 'eval_runtime': 40.9304, 'eval_samples_per_second': 33.252, 'eval_steps_per_second': 33.252, 'epoch': 5.9}
{'loss': 0.0488, 'learning_rate': 1.3529784537389101e-05, 'epoch': 5.9}
{'loss': 0.0003, 'learning_rate': 1.3498098859315589e-05, 'epoch': 5.91}
{'loss': 0.0, 'learning_rate': 1.346641318124208e-05, 'epoch': 5.91}
{'loss': 0.0, 'learning_rate': 1.3434727503168568e-05, 'epoch': 5.92}
{'loss': 0.0, 'learning_rate': 1.3403041825095059e-05, 'epoch': 5.92}
{'loss': 0.0002, 'learning_rate': 1.3371356147021547e-05, 'epoch': 5.93}
{'loss': 0.0012, 'learning_rate': 1.3339670468948038e-05, 'epoch': 5.93}
{'loss': 0.0001, 'learning_rate': 1.3307984790874526e-05, 'epoch': 5.94}
{'loss': 0.0, 'learning_rate': 1.3276299112801013e-05, 'epoch': 5.94}
{'loss': 0.0, 'learning_rate': 1.3244613434727504e-05, 'epoch': 5.95}
{'loss': 0.0003, 'learning_rate': 1.321292

  0%|          | 0/1361 [00:00<?, ?it/s]

{'eval_loss': 0.02954103797674179, 'eval_recall': 0.8990435706695006, 'eval_precision': 0.8229571984435797, 'eval_f1': 0.8958579399666028, 'eval_runtime': 40.9905, 'eval_samples_per_second': 33.203, 'eval_steps_per_second': 33.203, 'epoch': 6.09}
{'loss': 0.0002, 'learning_rate': 1.226235741444867e-05, 'epoch': 6.1}
{'loss': 0.0177, 'learning_rate': 1.2230671736375159e-05, 'epoch': 6.1}
{'loss': 0.0001, 'learning_rate': 1.2198986058301648e-05, 'epoch': 6.11}
{'loss': 0.0, 'learning_rate': 1.2167300380228138e-05, 'epoch': 6.11}
{'loss': 0.0, 'learning_rate': 1.2135614702154627e-05, 'epoch': 6.12}
{'loss': 0.0002, 'learning_rate': 1.2103929024081117e-05, 'epoch': 6.12}
{'loss': 0.0005, 'learning_rate': 1.2072243346007606e-05, 'epoch': 6.13}
{'loss': 0.0, 'learning_rate': 1.2040557667934094e-05, 'epoch': 6.13}
{'loss': 0.0, 'learning_rate': 1.2008871989860583e-05, 'epoch': 6.14}
{'loss': 0.0075, 'learning_rate': 1.1977186311787073e-05, 'epoch': 6.14}
{'loss': 0.0001, 'learning_rate': 1.19

  0%|          | 0/1361 [00:00<?, ?it/s]

{'eval_loss': 0.026929756626486778, 'eval_recall': 0.9489904357066951, 'eval_precision': 0.7930728241563055, 'eval_f1': 0.9418684840371587, 'eval_runtime': 40.9925, 'eval_samples_per_second': 33.201, 'eval_steps_per_second': 33.201, 'epoch': 6.29}
{'loss': 0.0002, 'learning_rate': 1.099493029150824e-05, 'epoch': 6.29}
{'loss': 0.0, 'learning_rate': 1.0963244613434729e-05, 'epoch': 6.3}
{'loss': 0.0001, 'learning_rate': 1.0931558935361218e-05, 'epoch': 6.3}
{'loss': 0.0, 'learning_rate': 1.0899873257287708e-05, 'epoch': 6.31}
{'loss': 0.0, 'learning_rate': 1.0868187579214195e-05, 'epoch': 6.31}
{'loss': 0.0451, 'learning_rate': 1.0836501901140685e-05, 'epoch': 6.32}
{'loss': 0.0002, 'learning_rate': 1.0804816223067174e-05, 'epoch': 6.32}
{'loss': 0.0005, 'learning_rate': 1.0773130544993664e-05, 'epoch': 6.33}
{'loss': 0.0, 'learning_rate': 1.0741444866920153e-05, 'epoch': 6.33}
{'loss': 0.0, 'learning_rate': 1.0709759188846643e-05, 'epoch': 6.34}
{'loss': 0.0001, 'learning_rate': 1.0678

  0%|          | 0/1361 [00:00<?, ?it/s]

{'eval_loss': 0.006507540587335825, 'eval_recall': 0.9893730074388948, 'eval_precision': 0.72734375, 'eval_f1': 0.9758516428139485, 'eval_runtime': 40.9922, 'eval_samples_per_second': 33.201, 'eval_steps_per_second': 33.201, 'epoch': 6.49}
{'loss': 0.0, 'learning_rate': 9.727503168567808e-06, 'epoch': 6.49}
{'loss': 0.0002, 'learning_rate': 9.695817490494297e-06, 'epoch': 6.5}
{'loss': 0.0001, 'learning_rate': 9.664131812420787e-06, 'epoch': 6.5}
{'loss': 0.0002, 'learning_rate': 9.632446134347276e-06, 'epoch': 6.51}
{'loss': 0.0002, 'learning_rate': 9.600760456273765e-06, 'epoch': 6.51}
{'loss': 0.0006, 'learning_rate': 9.569074778200255e-06, 'epoch': 6.52}
{'loss': 0.0003, 'learning_rate': 9.537389100126744e-06, 'epoch': 6.52}
{'loss': 0.0, 'learning_rate': 9.505703422053232e-06, 'epoch': 6.53}
{'loss': 0.0001, 'learning_rate': 9.474017743979721e-06, 'epoch': 6.53}
{'loss': 0.0006, 'learning_rate': 9.442332065906211e-06, 'epoch': 6.54}
{'loss': 0.0001, 'learning_rate': 9.410646387832

  0%|          | 0/1361 [00:00<?, ?it/s]

{'eval_loss': 0.02890155278146267, 'eval_recall': 0.9479277364505845, 'eval_precision': 0.8109090909090909, 'eval_f1': 0.9418071065989847, 'eval_runtime': 41.0481, 'eval_samples_per_second': 33.156, 'eval_steps_per_second': 33.156, 'epoch': 6.68}
{'loss': 0.0, 'learning_rate': 8.460076045627378e-06, 'epoch': 6.69}
{'loss': 0.0001, 'learning_rate': 8.428390367553867e-06, 'epoch': 6.69}
{'loss': 0.0001, 'learning_rate': 8.396704689480357e-06, 'epoch': 6.7}
{'loss': 0.0, 'learning_rate': 8.365019011406844e-06, 'epoch': 6.7}
{'loss': 0.0026, 'learning_rate': 8.333333333333334e-06, 'epoch': 6.71}
{'loss': 0.0, 'learning_rate': 8.301647655259823e-06, 'epoch': 6.71}
{'loss': 0.0, 'learning_rate': 8.269961977186313e-06, 'epoch': 6.72}
{'loss': 0.0001, 'learning_rate': 8.238276299112802e-06, 'epoch': 6.72}
{'loss': 0.0002, 'learning_rate': 8.206590621039291e-06, 'epoch': 6.73}
{'loss': 0.0003, 'learning_rate': 8.17490494296578e-06, 'epoch': 6.73}
{'loss': 0.0, 'learning_rate': 8.14321926489227e

  0%|          | 0/1361 [00:00<?, ?it/s]

{'eval_loss': 0.017972953617572784, 'eval_recall': 0.9628055260361318, 'eval_precision': 0.7743589743589744, 'eval_f1': 0.953877303097793, 'eval_runtime': 40.9904, 'eval_samples_per_second': 33.203, 'eval_steps_per_second': 33.203, 'epoch': 6.88}
{'loss': 0.0, 'learning_rate': 7.192648922686945e-06, 'epoch': 6.88}
{'loss': 0.0, 'learning_rate': 7.160963244613435e-06, 'epoch': 6.89}
{'loss': 0.0006, 'learning_rate': 7.129277566539924e-06, 'epoch': 6.89}
{'loss': 0.0, 'learning_rate': 7.0975918884664134e-06, 'epoch': 6.9}
{'loss': 0.0, 'learning_rate': 7.065906210392903e-06, 'epoch': 6.9}
{'loss': 0.0001, 'learning_rate': 7.034220532319392e-06, 'epoch': 6.91}
{'loss': 0.0001, 'learning_rate': 7.002534854245882e-06, 'epoch': 6.91}
{'loss': 0.0, 'learning_rate': 6.97084917617237e-06, 'epoch': 6.92}
{'loss': 0.0001, 'learning_rate': 6.93916349809886e-06, 'epoch': 6.92}
{'loss': 0.0012, 'learning_rate': 6.907477820025349e-06, 'epoch': 6.93}
{'loss': 0.0001, 'learning_rate': 6.875792141951838

  0%|          | 0/1361 [00:00<?, ?it/s]

{'eval_loss': 0.02445222996175289, 'eval_recall': 0.9628055260361318, 'eval_precision': 0.7912663755458516, 'eval_f1': 0.9548439400081069, 'eval_runtime': 41.0525, 'eval_samples_per_second': 33.153, 'eval_steps_per_second': 33.153, 'epoch': 7.08}
{'loss': 0.0005, 'learning_rate': 5.925221799746515e-06, 'epoch': 7.08}
{'loss': 0.0, 'learning_rate': 5.893536121673004e-06, 'epoch': 7.09}
{'loss': 0.0, 'learning_rate': 5.861850443599493e-06, 'epoch': 7.09}
{'loss': 0.0, 'learning_rate': 5.8301647655259826e-06, 'epoch': 7.1}
{'loss': 0.0, 'learning_rate': 5.798479087452472e-06, 'epoch': 7.1}
{'loss': 0.0005, 'learning_rate': 5.7667934093789605e-06, 'epoch': 7.11}
{'loss': 0.0, 'learning_rate': 5.73510773130545e-06, 'epoch': 7.11}
{'loss': 0.0, 'learning_rate': 5.703422053231939e-06, 'epoch': 7.12}
{'loss': 0.0, 'learning_rate': 5.671736375158428e-06, 'epoch': 7.12}
{'loss': 0.0, 'learning_rate': 5.640050697084917e-06, 'epoch': 7.13}
{'loss': 0.0, 'learning_rate': 5.608365019011407e-06, 'epo

  0%|          | 0/1361 [00:00<?, ?it/s]

{'eval_loss': 0.03392292186617851, 'eval_recall': 0.9351753453772582, 'eval_precision': 0.8255159474671669, 'eval_f1': 0.9304216989955676, 'eval_runtime': 41.0274, 'eval_samples_per_second': 33.173, 'eval_steps_per_second': 33.173, 'epoch': 7.27}
{'loss': 0.0, 'learning_rate': 4.657794676806084e-06, 'epoch': 7.28}
{'loss': 0.0002, 'learning_rate': 4.626108998732573e-06, 'epoch': 7.28}
{'loss': 0.0004, 'learning_rate': 4.594423320659062e-06, 'epoch': 7.29}
{'loss': 0.0, 'learning_rate': 4.562737642585552e-06, 'epoch': 7.29}
{'loss': 0.0001, 'learning_rate': 4.531051964512041e-06, 'epoch': 7.3}
{'loss': 0.0, 'learning_rate': 4.49936628643853e-06, 'epoch': 7.3}
{'loss': 0.0, 'learning_rate': 4.467680608365019e-06, 'epoch': 7.31}
{'loss': 0.0, 'learning_rate': 4.4359949302915085e-06, 'epoch': 7.31}
{'loss': 0.0, 'learning_rate': 4.404309252217997e-06, 'epoch': 7.32}
{'loss': 0.0, 'learning_rate': 4.3726235741444865e-06, 'epoch': 7.32}
{'loss': 0.0001, 'learning_rate': 4.340937896070976e-06

  0%|          | 0/1361 [00:00<?, ?it/s]

{'eval_loss': 0.030466075986623764, 'eval_recall': 0.9351753453772582, 'eval_precision': 0.8333333333333334, 'eval_f1': 0.9308002115455026, 'eval_runtime': 41.0608, 'eval_samples_per_second': 33.146, 'eval_steps_per_second': 33.146, 'epoch': 7.47}
{'loss': 0.0001, 'learning_rate': 3.3903675538656534e-06, 'epoch': 7.47}
{'loss': 0.0838, 'learning_rate': 3.358681875792142e-06, 'epoch': 7.48}
{'loss': 0.0001, 'learning_rate': 3.3269961977186314e-06, 'epoch': 7.48}
{'loss': 0.0002, 'learning_rate': 3.295310519645121e-06, 'epoch': 7.49}
{'loss': 0.0002, 'learning_rate': 3.2636248415716094e-06, 'epoch': 7.49}
{'loss': 0.0, 'learning_rate': 3.2319391634980988e-06, 'epoch': 7.5}
{'loss': 0.0, 'learning_rate': 3.200253485424588e-06, 'epoch': 7.5}
{'loss': 0.0001, 'learning_rate': 3.1685678073510776e-06, 'epoch': 7.51}
{'loss': 0.0, 'learning_rate': 3.1368821292775666e-06, 'epoch': 7.51}
{'loss': 0.0, 'learning_rate': 3.105196451204056e-06, 'epoch': 7.52}
{'loss': 0.0004, 'learning_rate': 3.0735

  0%|          | 0/1361 [00:00<?, ?it/s]

{'eval_loss': 0.030603989958763123, 'eval_recall': 0.9341126461211477, 'eval_precision': 0.8363463368220743, 'eval_f1': 0.9299316406250001, 'eval_runtime': 41.0508, 'eval_samples_per_second': 33.154, 'eval_steps_per_second': 33.154, 'epoch': 7.67}
{'loss': 0.0, 'learning_rate': 2.122940430925222e-06, 'epoch': 7.67}
{'loss': 0.0, 'learning_rate': 2.091254752851711e-06, 'epoch': 7.68}
{'loss': 0.0001, 'learning_rate': 2.0595690747782005e-06, 'epoch': 7.68}
{'loss': 0.0, 'learning_rate': 2.0278833967046895e-06, 'epoch': 7.69}
{'loss': 0.0001, 'learning_rate': 1.996197718631179e-06, 'epoch': 7.69}
{'loss': 0.0, 'learning_rate': 1.964512040557668e-06, 'epoch': 7.7}
{'loss': 0.0002, 'learning_rate': 1.9328263624841573e-06, 'epoch': 7.7}
{'loss': 0.0009, 'learning_rate': 1.9011406844106463e-06, 'epoch': 7.71}
{'loss': 0.0, 'learning_rate': 1.8694550063371357e-06, 'epoch': 7.71}
{'loss': 0.0003, 'learning_rate': 1.837769328263625e-06, 'epoch': 7.71}
{'loss': 0.0, 'learning_rate': 1.80608365019

  0%|          | 0/1361 [00:00<?, ?it/s]

{'eval_loss': 0.02844315953552723, 'eval_recall': 0.9351753453772582, 'eval_precision': 0.8255159474671669, 'eval_f1': 0.9304216989955676, 'eval_runtime': 41.0685, 'eval_samples_per_second': 33.14, 'eval_steps_per_second': 33.14, 'epoch': 7.86}
{'loss': 0.0, 'learning_rate': 8.55513307984791e-07, 'epoch': 7.87}
{'loss': 0.0001, 'learning_rate': 8.238276299112802e-07, 'epoch': 7.87}
{'loss': 0.0001, 'learning_rate': 7.921419518377694e-07, 'epoch': 7.88}
{'loss': 0.0294, 'learning_rate': 7.604562737642586e-07, 'epoch': 7.88}
{'loss': 0.0, 'learning_rate': 7.287705956907478e-07, 'epoch': 7.89}
{'loss': 0.0, 'learning_rate': 6.97084917617237e-07, 'epoch': 7.89}
{'loss': 0.0112, 'learning_rate': 6.653992395437262e-07, 'epoch': 7.9}
{'loss': 0.0001, 'learning_rate': 6.337135614702154e-07, 'epoch': 7.9}
{'loss': 0.0001, 'learning_rate': 6.020278833967048e-07, 'epoch': 7.91}
{'loss': 0.0001, 'learning_rate': 5.70342205323194e-07, 'epoch': 7.91}
{'loss': 0.0, 'learning_rate': 5.386565272496832e

TrainOutput(global_step=16280, training_loss=0.03680243135802396, metrics={'train_runtime': 6670.4371, 'train_samples_per_second': 9.76, 'train_steps_per_second': 2.441, 'train_loss': 0.03680243135802396, 'epoch': 8.0})

In [14]:
create_submission(model, f"submission.csv")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


    row_id  document  token           label
0        0         7      9  B-NAME_STUDENT
1        1         7     10  I-NAME_STUDENT
2        2         7    482  B-NAME_STUDENT
3        3         7    483  I-NAME_STUDENT
4        4         7    741  B-NAME_STUDENT
5        5         7    742  I-NAME_STUDENT
6        6        10      0  B-NAME_STUDENT
7        7        10      1  I-NAME_STUDENT
8        8        10    464  B-NAME_STUDENT
9        9        10    465  I-NAME_STUDENT
10      10        16      4  B-NAME_STUDENT
11      11        16      5  I-NAME_STUDENT
12      12        20      5  B-NAME_STUDENT
13      13        20      6  I-NAME_STUDENT
14      14        20      8  I-NAME_STUDENT
15      15        56     12  B-NAME_STUDENT
16      16        56     13  I-NAME_STUDENT
17      17        86      6  B-NAME_STUDENT
18      18        86      7  I-NAME_STUDENT
19      19        93      0  B-NAME_STUDENT
20      20        93      1  I-NAME_STUDENT
21      21       104      7  B-N

In [None]:
# save trainer
trainer.save_model("final_model")