In [1]:
from transformers import AutoTokenizer, TrainingArguments, Trainer, AutoModelForTokenClassification
from pathlib import Path
import numpy as np
import torch
from tokenizers import AddedToken
from tqdm.notebook import tqdm
from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts
import pandas as pd
from datasets import Dataset

kaggle=False

path="/kaggle/input/pii-detection-removal-from-educational-data" if kaggle else "data"
train_path = path + "/train.json"
test_path = path + "/test.json"

mixtral_path="data/mpware_mixtral8x7b_v1.1.json" if not kaggle else "/kaggle/input/mixtral-8x7b-v11/mixtral8x7b_v1.1.json"

model_path = "/kaggle/input/huggingfacedebertav3variants/deberta-v3-base" if kaggle else "microsoft/deberta-v3-base"

if not kaggle: import neptune
if not kaggle: from seqeval.metrics import recall_score, precision_score, f1_score, accuracy_score

https://www.kaggle.com/datasets/mpware/pii-mixtral8x7b-generated-essays

In [2]:
cross_entropy_weight_multi = 400

CROSS_ENTROPY_WEIGHTS = [cross_entropy_weight_multi]*12
CROSS_ENTROPY_WEIGHTS.append(1)

parameter= {
    "model": model_path,
    "max_length": 1024,
    "inference_max_length": 2000,
    "batch_size": 4,
    "inference_batch_size": 1,
    "lr": 5e-05,
    "lr_scale_unfreeze": 0.1,
    "filter_no_pii_percent_allow": 0.2,
    "notebook": "20_deberta base_1024len.ipynb",
    "CROSS_ENTROPY_WEIGHT_MULTI": cross_entropy_weight_multi,
    "epochs_before_unfreeze": 2,
    "epochs_after_unfreeze": 6,
    "repeat_unfreeze_train_n_times": 2,
    "validate_every_n_epochs": 2,
    "train_test_split": 0.2,
    "num_proc": 16 if not kaggle else 2, 
    "freeze_embeddings": False,
    "freeze_layers": 6
}

print(parameter)

{'model': 'microsoft/deberta-v3-base', 'max_length': 1024, 'inference_max_length': 2000, 'batch_size': 4, 'inference_batch_size': 1, 'lr': 5e-05, 'lr_scale_unfreeze': 0.1, 'filter_no_pii_percent_allow': 0.2, 'notebook': '20_deberta base_1024len.ipynb', 'CROSS_ENTROPY_WEIGHT_MULTI': 400, 'epochs_before_unfreeze': 2, 'epochs_after_unfreeze': 6, 'repeat_unfreeze_train_n_times': 2, 'validate_every_n_epochs': 2, 'train_test_split': 0.2, 'num_proc': 16, 'freeze_embeddings': False, 'freeze_layers': 6}


In [3]:
target = [
    'B-EMAIL', 'B-ID_NUM', 'B-NAME_STUDENT', 'B-PHONE_NUM', 
    'B-STREET_ADDRESS', 'B-URL_PERSONAL', 'B-USERNAME', 'I-ID_NUM', 
    'I-NAME_STUDENT', 'I-PHONE_NUM', 'I-STREET_ADDRESS', 'I-URL_PERSONAL'
]

In [4]:
from itertools import chain
import json

data = json.load(open(train_path))
all_labels = sorted(list(set(chain(*[x["labels"] for x in data]))))
label2id = {l: i for i,l in enumerate(all_labels)}
id2label = {v:k for k,v in label2id.items()}

In [5]:
import random

def tokenize(example, tokenizer, label2id, max_length, all_labels_list):
    text = []
    import numpy as np

    # these are at the character level
    labels = []
    targets = []

    for t, l, ws in zip(example["tokens"], example["labels"], example["trailing_whitespace"]):

        text.append(t)
        labels.extend([l]*len(t))
        
        if l in all_labels_list:
            targets.append(1)
        else:
            targets.append(0)
        # if there is trailing whitespace
        if ws:
            text.append(" ")
            labels.append("O")

    tokenized = tokenizer("".join(text), return_offsets_mapping=True, truncation=True, max_length=max_length, padding="max_length")
    
    target_num = sum(targets)
    labels = np.array(labels)

    text = "".join(text)
    token_labels = []

    for start_idx, end_idx in tokenized.offset_mapping:

        # CLS token
        if start_idx == 0 and end_idx == 0: 
            token_labels.append(label2id["O"])
            continue

        # case when token starts with whitespace
        if text[start_idx].isspace():
            start_idx += 1

        try:
            token_labels.append(label2id[labels[start_idx]])
        except:
            token_labels.append(label2id["O"])

    length = len(tokenized.input_ids)

    return {
        **tokenized,
        "labels": token_labels,
        "length": length,
        "target_num": target_num,
        "group": 1 if target_num>0 else 0
    }

# https://www.kaggle.com/competitions/pii-detection-removal-from-educational-data/discussion/468844
def filter_no_pii(example, percent_allow=parameter["filter_no_pii_percent_allow"]):
    # Return True if there is PII
    # Or 20% of the time if there isn't
    has_pii = set("O") != set(example["labels"])
    return has_pii or (random.random() < percent_allow)

In [6]:
tokenizer = AutoTokenizer.from_pretrained(parameter["model"])
tokenizer.add_tokens(AddedToken("\n", normalized=False)) 

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


1

In [7]:
mixtral_data=json.load(open(mixtral_path))
dict_mixtral={
    "full_text": [x["full_text"] for x in mixtral_data],
    "document": [str(x["document"]) for x in mixtral_data],
    "tokens": [x["tokens"] for x in mixtral_data],
    "trailing_whitespace": [x["trailing_whitespace"] for x in mixtral_data],
    "labels": [x["labels"] for x in mixtral_data],
}
data = json.load(open(train_path))
dict_train={
    "full_text": [x["full_text"] for x in data],
    "document": [str(x["document"]) for x in data],
    "tokens": [x["tokens"] for x in data],
    "trailing_whitespace": [x["trailing_whitespace"] for x in data],
    "labels": [x["labels"] for x in data],
}

full_data = {
    "full_text": dict_train["full_text"] + dict_mixtral["full_text"],
    "document": dict_train["document"] + dict_mixtral["document"],
    "tokens": dict_train["tokens"] + dict_mixtral["tokens"],
    "trailing_whitespace": dict_train["trailing_whitespace"] + dict_mixtral["trailing_whitespace"],
    "labels": dict_train["labels"] + dict_mixtral["labels"],
}

In [8]:
ds = Dataset.from_dict(full_data)

ds = ds.map(tokenize, fn_kwargs={"tokenizer": tokenizer, "label2id": label2id, "max_length": parameter["max_length"], "all_labels_list": target}, num_proc=parameter["num_proc"])
ds=ds.filter(filter_no_pii, num_proc=parameter["num_proc"])


data_len=len(ds)
train_len=int(len(ds)*(1-parameter["train_test_split"]))
valid_len=len(ds)-train_len
train_data_idx=np.random.choice(data_len, train_len, replace=False)
valid_data_idx=np.array(list(set(range(data_len))-set(train_data_idx)))
print("train_len", train_len)
print("valid_len", valid_len)

# split ds in train and valid
train_ds=ds.select(train_data_idx)
valid_ds=ds.select(valid_data_idx)

Map (num_proc=16):   0%|          | 0/9499 [00:00<?, ? examples/s]

Filter (num_proc=16):   0%|          | 0/9499 [00:00<?, ? examples/s]

train_len 7599
valid_len 1900


In [20]:
def tokenize_inference(example, tokenizer, max_length):
        text = []
        for t,  ws in zip(example["tokens"], example["trailing_whitespace"]):
            text.append(t)
            if ws:
                text.append(" ")
        tokenized = tokenizer("".join(text), return_offsets_mapping=True, truncation=True, max_length=max_length, padding="max_length")
        text = "".join(text)
        length = len(tokenized.input_ids)
        return {
            **tokenized,
            "length": length,
        }
        
class TestTokenizer():
    def __init__(self, tokenizer):
        self.tokenizer = tokenizer
    
    def preprocess(self, example):
        # Preprocess the tokens and labels by adding trailing whitespace and labels
        tokens = []
        tokens_without_ws = []
        token_map = [] # Use the index as labels
        index = 0
        for token, t_ws in zip(example["tokens"], example["trailing_whitespace"]):
            tokens_without_ws.append(token)
            tokens.append(token)
            token_map.extend([index] * len(token))
            # Added trailing whitespace and label if true and 
            if t_ws:
                tokens.append(" ")
                token_map.append(-1)
            index += 1
        return tokens, token_map, tokens_without_ws
    
    def tokenize(self, example):
        tokens, token_map, tokens_without_ws = self.preprocess(example)
        text = "".join(tokens)
        tokenized = self.tokenizer(text, return_offsets_mapping=True, padding="max_length",
                                   truncation=True, max_length=parameter["inference_max_length"])
        return {**tokenized, "token_map": token_map, "tokens": tokens, "tokens_without_ws": tokens_without_ws} 

class PiiDatasetInference(torch.utils.data.Dataset):
        def __init__(self, dataset, tokenizer):
            self.dataset = dataset
            self.tokenizer=TestTokenizer(tokenizer)
            
        def __getitem__(self, idx):
            vals=self.tokenizer.tokenize(self.dataset[idx])
            input_ids = torch.tensor(vals["input_ids"])
            attention_mask = torch.tensor(vals["attention_mask"])
            document_id = self.dataset[idx]["document"]
            return input_ids, attention_mask, document_id, vals
        
        def __len__(self):
            return len(self.dataset)

# Convert preds to a list of dictionaries
def to_test_submission(preds=None, dataset=None, document_ids=None, id2label=None):
    pairs = []
    row_id = 0
    results = []
    
    for i in range(len(preds)):
        input_ids, attention_mask, document_id, vals = dataset[i]
        token_map=vals["token_map"]
        offsets=vals["offset_mapping"]
        tokens=vals["tokens_without_ws"]
        #print("tokens", tokens)
        pred=preds[i]
        #print("original_text", original_text)
        #print("token_map", token_map)
        #print("offsets", offsets)   
        #print("pred", pred)


        for token_pred, input_id, (start_idx, end_idx) in zip(pred, input_ids, offsets):
            #print("\nnow doing ", start_idx,  end_idx, token_pred)
            if start_idx == 0 and end_idx == 0: # Skip 0 offset
                continue
            # Skip spaces 
            while start_idx < len(token_map):
                #print("loop, start_idx now", start_idx) 
                #print(" tokens[token_map[start_idx]]: ", tokens[token_map[start_idx]] if not tokens[token_map[start_idx]].isspace() else "WHITESPACE")          
                if token_map[start_idx] == -1: # Skip unknown tokens               
                    start_idx += 1
                elif tokens[token_map[start_idx]].isspace(): # Skip white space
                    start_idx += 1
                else:
                    break
            # Ensure start index < length
            if start_idx < len(token_map):
                token_id = token_map[start_idx]
                #print("token_id", token_id)
                #token_id= input_id.item()
                label_pred = id2label[token_pred.item()]
                #print("label_pred", label_pred)
                # ignore "O" and whitespace preds
                if label_pred != "O" and token_id != -1:
                    #print("is PII", token_id, label_pred)
                    token_str = tokens[token_id]
                    pair=(document_id, token_id)
                    if pair not in pairs:
                        results.append({
                            "row_id": row_id, 
                            "document": document_id,
                            "token": token_id, 
                            "label": label_pred,
                            "token_str": token_str
                        })
                        pairs.append(pair)
                        row_id += 1

    # Create a dataframe 
    return results

def create_submission(model, filename="submission.csv"):
    data = json.load(open(train_path))
    from itertools import chain
    all_labels = sorted(list(set(chain(*[x["labels"] for x in data]))))
    label2id = {l: i for i,l in enumerate(all_labels)}
    id2label = {v:k for k,v in label2id.items()}

    data=json.load(open(test_path))
    tokenizer = AutoTokenizer.from_pretrained(parameter["model"])
    my_dataset=PiiDatasetInference(data, tokenizer)
    loader=torch.utils.data.DataLoader(my_dataset, batch_size=1, shuffle=False)

    device=torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.eval()
    
    # stack all predictions into tensor
    all_preds = []

    for id, attention_mask, document_ids, vals in loader:
        id=id.to(device)
        attention_mask=attention_mask.to(device)
        preds=model(id, attention_mask).get('logits').argmax(dim=2)
        all_preds.append(preds)
        #for pred, id in zip(preds.flatten(), id.flatten()):
        #    if pred != 12:
                #print(f"Document: {document_id.item()} TOKEN:{tokenizer.decode(id)}  --- pred:{id2label[pred.item()]}")
        #        output[row_id]={"document":document_id.item(), "token":id.item(), "label":id2label[pred.item()]}
        #        row_id+=1
        #for pred, id in zip(preds.flatten(), id.flatten()):
        #    if pred != 12:
        #        print(f"TOKEN:{tokenizer.decode(id)}  --- pred:{id2label[pred.item()]}")
    
   
    all_preds = torch.cat(all_preds, dim=0)
    
    results = to_test_submission(preds=all_preds, dataset=my_dataset, document_ids=document_ids, id2label=id2label)
    if len(results) == 0:
        print("Error in create_submission(): No predictions made, probably because the model is not learning. Check the model and the data.")
        return
    df = pd.DataFrame(results)
    df=df[["row_id", "document", "token", "label"]]
    print(df)
    df.to_csv(filename, index=False)

#create_submission(MyModel(parameter['model'], len(label2id)).to(device), "submission_just_dumb.csv")
# create_submission(model, "submission.csv")
    




In [21]:
from transformers import DataCollatorForTokenClassification

collator = DataCollatorForTokenClassification(tokenizer, pad_to_multiple_of=16)

In [22]:
# using Trainer and TrainingArguments from transformers


def compute_metrics(p, all_labels):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (special tokens)
    true_predictions = [
        [all_labels[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [all_labels[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    
    recall = recall_score(true_labels, true_predictions)
    precision = precision_score(true_labels, true_predictions)
    f1_score = (1 + 5*5) * recall * precision / (5*5*precision + recall)
    
    results = {
        'recall': recall,
        'precision': precision,
        'f1': f1_score
    }
    return results

from functools import partial

def get_trainer(model, train_dataloader, valid_dataloader, learnrate_multiplier=1.0):

    if not kaggle:
        from transformers.integrations import NeptuneCallback

        run = neptune.init_run(
            project="bernd.heidemann/PII",
            api_token="eyJhcGlfYWRkcmVzcyI6Imh0dHBzOi8vYXBwLm5lcHR1bmUuYWkiLCJhcGlfdXJsIjoiaHR0cHM6Ly9hcHAubmVwdHVuZS5haSIsImFwaV9rZXkiOiIzNjBlYzVkNi0zZTUwLTQ1ODYtODhlNC02NDUxNDg0MDdjNzUifQ==",
        )  # your credentials
        run["parameters"] = {
        **parameter
        }

        neptune_callback = NeptuneCallback(run=run, log_model_weights=False, log_parameters=False)

    training_args = TrainingArguments(
        output_dir='./results',          # output directory
        num_train_epochs=parameter["epochs_before_unfreeze"],              # total number of training epochs
        per_device_train_batch_size=parameter["batch_size"],  # batch size per device during training
        per_device_eval_batch_size=parameter["inference_batch_size"],   # batch size for evaluation
        warmup_steps=500,                # number of warmup steps for learning rate scheduler
        weight_decay=0.01,               # strength of weight decay
        logging_dir='./logs',            # directory for storing logs
        logging_steps=10,
        evaluation_strategy="steps",
        eval_steps=400,
        save_steps=400,
        save_total_limit=3,
        load_best_model_at_end=False,
        metric_for_best_model="f1" if not kaggle else "eval_loss",
        greater_is_better=True,
        overwrite_output_dir=True,
        report_to="none",
        learning_rate=parameter["lr"]*learnrate_multiplier
    )

    class MyTrainer(Trainer):
        def __init__(self, model=None, args=None, train_dataset=None, eval_dataset=None, compute_metrics=None, callbacks=None):
            super().__init__(model=model, args=args, train_dataset=train_dataset, eval_dataset=eval_dataset, compute_metrics=compute_metrics, callbacks=callbacks)
            # Definieren Sie hier Ihre Gewichte für die Klassen, z.B. torch.tensor([1.0, 2.0, 0.5])
            self.weight = torch.tensor(CROSS_ENTROPY_WEIGHTS).to(device)
            self.loss_func=torch.nn.CrossEntropyLoss(ignore_index=-100, weight=torch.tensor(CROSS_ENTROPY_WEIGHTS, dtype=torch.float32).to(device))

        def compute_loss(self, model, inputs, return_outputs=False):
            labels = inputs.get("labels")
            outputs = model(**inputs)
            logits = outputs.get('logits')
            loss = self.loss_func(logits.view(-1, self.model.config.num_labels), labels.view(-1))
            return (loss, outputs) if return_outputs else loss
        

    trainer = MyTrainer(
        model=model,                         # the instantiated 🤗 Transformers model to be trained
        args=training_args,                  # training arguments, defined above
        train_dataset=train_dataloader,         # training dataset
        eval_dataset=valid_dataloader,             # evaluation dataset
        compute_metrics=partial(compute_metrics, all_labels=all_labels) if not kaggle else None,
        callbacks=[neptune_callback] if not kaggle else None
    )
    return trainer

In [23]:
def unfreeze(model):
    for param in model.base_model.parameters():
        param.requires_grad = True

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [24]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"



model = AutoModelForTokenClassification.from_pretrained(
    parameter["model"],
    num_labels=len(all_labels),
    id2label=id2label,
    label2id=label2id,
    ignore_mismatched_sizes=True
)

if parameter['freeze_embeddings']:
    for param in model.deberta.embeddings.parameters():
        param.requires_grad = False
        
if parameter['freeze_layers'] > 0:
    for layer in model.deberta.encoder.layer[:parameter['freeze_layers']]:
        for param in layer.parameters():
            param.requires_grad = False

print(model.config)
#my_model=MyModel(parameter['model'], len(label2id))

trainer=get_trainer(model, train_ds, valid_ds)
#trainer.set_lr(0.1)
trainer.train()

Some weights of DebertaV2ForTokenClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-base",
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "B-EMAIL",
    "1": "B-ID_NUM",
    "2": "B-NAME_STUDENT",
    "3": "B-PHONE_NUM",
    "4": "B-STREET_ADDRESS",
    "5": "B-URL_PERSONAL",
    "6": "B-USERNAME",
    "7": "I-ID_NUM",
    "8": "I-NAME_STUDENT",
    "9": "I-PHONE_NUM",
    "10": "I-STREET_ADDRESS",
    "11": "I-URL_PERSONAL",
    "12": "O"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "B-EMAIL": 0,
    "B-ID_NUM": 1,
    "B-NAME_STUDENT": 2,
    "B-PHONE_NUM": 3,
    "B-STREET_ADDRESS": 4,
    "B-URL_PERSONAL": 5,
    "B-USERNAME": 6,
    "I-ID_NUM": 7,
    "I-NAME_STUDENT": 8,
    "I-PHONE_NUM": 9,
    "I-STREET_ADDRESS": 10,
    "I-URL_PERSONAL": 11,
    "O": 12
  },
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deber



  0%|          | 0/3800 [00:00<?, ?it/s]

{'loss': 2.6708, 'learning_rate': 1.0000000000000002e-06, 'epoch': 0.01}
{'loss': 2.6521, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.01}
{'loss': 2.5128, 'learning_rate': 3e-06, 'epoch': 0.02}
{'loss': 2.428, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.02}
{'loss': 2.3913, 'learning_rate': 5e-06, 'epoch': 0.03}
{'loss': 1.9391, 'learning_rate': 6e-06, 'epoch': 0.03}
{'loss': 1.8615, 'learning_rate': 7.000000000000001e-06, 'epoch': 0.04}
{'loss': 1.458, 'learning_rate': 8.000000000000001e-06, 'epoch': 0.04}
{'loss': 1.2186, 'learning_rate': 9e-06, 'epoch': 0.05}
{'loss': 1.1076, 'learning_rate': 1e-05, 'epoch': 0.05}
{'loss': 0.8905, 'learning_rate': 1.1000000000000001e-05, 'epoch': 0.06}
{'loss': 0.8465, 'learning_rate': 1.2e-05, 'epoch': 0.06}
{'loss': 1.3269, 'learning_rate': 1.3000000000000001e-05, 'epoch': 0.07}
{'loss': 0.8295, 'learning_rate': 1.4000000000000001e-05, 'epoch': 0.07}
{'loss': 0.7895, 'learning_rate': 1.5e-05, 'epoch': 0.08}
{'loss': 0.9105, 'learnin

  0%|          | 0/1900 [00:00<?, ?it/s]

{'eval_loss': 0.04443526640534401, 'eval_recall': 0.9085017155928327, 'eval_precision': 0.5760212714527435, 'eval_f1': 0.8887709433096167, 'eval_runtime': 59.7059, 'eval_samples_per_second': 31.823, 'eval_steps_per_second': 31.823, 'epoch': 0.21}
{'loss': 0.1398, 'learning_rate': 4.1e-05, 'epoch': 0.22}
{'loss': 0.1358, 'learning_rate': 4.2e-05, 'epoch': 0.22}
{'loss': 0.1983, 'learning_rate': 4.3e-05, 'epoch': 0.23}
{'loss': 0.1682, 'learning_rate': 4.4000000000000006e-05, 'epoch': 0.23}
{'loss': 0.0754, 'learning_rate': 4.5e-05, 'epoch': 0.24}
{'loss': 0.1455, 'learning_rate': 4.600000000000001e-05, 'epoch': 0.24}
{'loss': 0.2356, 'learning_rate': 4.7e-05, 'epoch': 0.25}
{'loss': 0.1719, 'learning_rate': 4.8e-05, 'epoch': 0.25}
{'loss': 0.1367, 'learning_rate': 4.9e-05, 'epoch': 0.26}
{'loss': 0.0431, 'learning_rate': 5e-05, 'epoch': 0.26}
{'loss': 0.2647, 'learning_rate': 4.984848484848485e-05, 'epoch': 0.27}
{'loss': 0.0537, 'learning_rate': 4.9696969696969694e-05, 'epoch': 0.27}
{

  0%|          | 0/1900 [00:00<?, ?it/s]

{'eval_loss': 0.011398006230592728, 'eval_recall': 0.98131910026687, 'eval_precision': 0.8414514547237659, 'eval_f1': 0.9750852347233148, 'eval_runtime': 63.7345, 'eval_samples_per_second': 29.811, 'eval_steps_per_second': 29.811, 'epoch': 0.42}
{'loss': 0.0043, 'learning_rate': 4.5303030303030304e-05, 'epoch': 0.43}
{'loss': 0.0218, 'learning_rate': 4.515151515151516e-05, 'epoch': 0.43}
{'loss': 0.0976, 'learning_rate': 4.5e-05, 'epoch': 0.44}
{'loss': 0.0687, 'learning_rate': 4.484848484848485e-05, 'epoch': 0.44}
{'loss': 0.0653, 'learning_rate': 4.46969696969697e-05, 'epoch': 0.45}
{'loss': 0.102, 'learning_rate': 4.454545454545455e-05, 'epoch': 0.45}
{'loss': 0.0707, 'learning_rate': 4.43939393939394e-05, 'epoch': 0.46}
{'loss': 0.0138, 'learning_rate': 4.4242424242424246e-05, 'epoch': 0.46}
{'loss': 0.0684, 'learning_rate': 4.409090909090909e-05, 'epoch': 0.47}
{'loss': 0.0806, 'learning_rate': 4.3939393939393944e-05, 'epoch': 0.47}
{'loss': 0.0305, 'learning_rate': 4.378787878787

  0%|          | 0/1900 [00:00<?, ?it/s]

{'eval_loss': 0.014275087043642998, 'eval_recall': 0.967848519506926, 'eval_precision': 0.8439716312056738, 'eval_f1': 0.9624153701840594, 'eval_runtime': 58.6956, 'eval_samples_per_second': 32.37, 'eval_steps_per_second': 32.37, 'epoch': 0.63}
{'loss': 0.0722, 'learning_rate': 3.924242424242424e-05, 'epoch': 0.64}
{'loss': 0.0507, 'learning_rate': 3.909090909090909e-05, 'epoch': 0.64}
{'loss': 0.0051, 'learning_rate': 3.8939393939393944e-05, 'epoch': 0.65}
{'loss': 0.0834, 'learning_rate': 3.878787878787879e-05, 'epoch': 0.65}
{'loss': 0.0637, 'learning_rate': 3.8636363636363636e-05, 'epoch': 0.66}
{'loss': 0.0132, 'learning_rate': 3.848484848484848e-05, 'epoch': 0.66}
{'loss': 0.0096, 'learning_rate': 3.8333333333333334e-05, 'epoch': 0.67}
{'loss': 0.0055, 'learning_rate': 3.818181818181819e-05, 'epoch': 0.67}
{'loss': 0.1119, 'learning_rate': 3.803030303030303e-05, 'epoch': 0.68}
{'loss': 0.0053, 'learning_rate': 3.787878787878788e-05, 'epoch': 0.68}
{'loss': 0.0969, 'learning_rate'

  0%|          | 0/1900 [00:00<?, ?it/s]

{'eval_loss': 0.020421413704752922, 'eval_recall': 0.9715338670733257, 'eval_precision': 0.8726172811322909, 'eval_f1': 0.9673165081806061, 'eval_runtime': 58.8324, 'eval_samples_per_second': 32.295, 'eval_steps_per_second': 32.295, 'epoch': 0.84}
{'loss': 0.0107, 'learning_rate': 3.318181818181819e-05, 'epoch': 0.85}
{'loss': 0.0183, 'learning_rate': 3.303030303030303e-05, 'epoch': 0.85}
{'loss': 0.0075, 'learning_rate': 3.287878787878788e-05, 'epoch': 0.86}
{'loss': 0.0239, 'learning_rate': 3.272727272727273e-05, 'epoch': 0.86}
{'loss': 0.0568, 'learning_rate': 3.257575757575758e-05, 'epoch': 0.87}
{'loss': 0.0106, 'learning_rate': 3.2424242424242423e-05, 'epoch': 0.87}
{'loss': 0.0784, 'learning_rate': 3.2272727272727276e-05, 'epoch': 0.88}
{'loss': 0.0337, 'learning_rate': 3.212121212121212e-05, 'epoch': 0.88}
{'loss': 0.0346, 'learning_rate': 3.1969696969696974e-05, 'epoch': 0.89}
{'loss': 0.0064, 'learning_rate': 3.181818181818182e-05, 'epoch': 0.89}
{'loss': 0.0153, 'learning_ra

  0%|          | 0/1900 [00:00<?, ?it/s]

{'eval_loss': 0.02042432501912117, 'eval_recall': 0.9692464099631465, 'eval_precision': 0.8905885100420364, 'eval_f1': 0.9659650541431832, 'eval_runtime': 58.798, 'eval_samples_per_second': 32.314, 'eval_steps_per_second': 32.314, 'epoch': 1.05}
{'loss': 0.0015, 'learning_rate': 2.7121212121212126e-05, 'epoch': 1.06}
{'loss': 0.0002, 'learning_rate': 2.696969696969697e-05, 'epoch': 1.06}
{'loss': 0.0008, 'learning_rate': 2.681818181818182e-05, 'epoch': 1.07}
{'loss': 0.0004, 'learning_rate': 2.6666666666666667e-05, 'epoch': 1.07}
{'loss': 0.0027, 'learning_rate': 2.6515151515151516e-05, 'epoch': 1.08}
{'loss': 0.0171, 'learning_rate': 2.636363636363636e-05, 'epoch': 1.08}
{'loss': 0.0086, 'learning_rate': 2.6212121212121214e-05, 'epoch': 1.09}
{'loss': 0.0393, 'learning_rate': 2.6060606060606063e-05, 'epoch': 1.09}
{'loss': 0.0165, 'learning_rate': 2.590909090909091e-05, 'epoch': 1.1}
{'loss': 0.0017, 'learning_rate': 2.575757575757576e-05, 'epoch': 1.11}
{'loss': 0.0044, 'learning_rat

  0%|          | 0/1900 [00:00<?, ?it/s]

{'eval_loss': 0.020251477137207985, 'eval_recall': 0.9749650527385945, 'eval_precision': 0.9482140650105055, 'eval_f1': 0.9739082884149677, 'eval_runtime': 58.9342, 'eval_samples_per_second': 32.239, 'eval_steps_per_second': 32.239, 'epoch': 1.26}
{'loss': 0.124, 'learning_rate': 2.106060606060606e-05, 'epoch': 1.27}
{'loss': 0.003, 'learning_rate': 2.090909090909091e-05, 'epoch': 1.27}
{'loss': 0.002, 'learning_rate': 2.075757575757576e-05, 'epoch': 1.28}
{'loss': 0.0216, 'learning_rate': 2.0606060606060608e-05, 'epoch': 1.28}
{'loss': 0.0081, 'learning_rate': 2.0454545454545457e-05, 'epoch': 1.29}
{'loss': 0.0007, 'learning_rate': 2.0303030303030303e-05, 'epoch': 1.29}
{'loss': 0.0956, 'learning_rate': 2.0151515151515152e-05, 'epoch': 1.3}
{'loss': 0.0007, 'learning_rate': 2e-05, 'epoch': 1.31}
{'loss': 0.0022, 'learning_rate': 1.984848484848485e-05, 'epoch': 1.31}
{'loss': 0.0014, 'learning_rate': 1.9696969696969697e-05, 'epoch': 1.32}
{'loss': 0.0131, 'learning_rate': 1.95454545454

  0%|          | 0/1900 [00:00<?, ?it/s]

{'eval_loss': 0.007594369817525148, 'eval_recall': 0.9960604905324691, 'eval_precision': 0.9080166821130676, 'eval_f1': 0.9923596468588849, 'eval_runtime': 59.0121, 'eval_samples_per_second': 32.197, 'eval_steps_per_second': 32.197, 'epoch': 1.47}
{'loss': 0.0024, 'learning_rate': 1.5e-05, 'epoch': 1.48}
{'loss': 0.0005, 'learning_rate': 1.484848484848485e-05, 'epoch': 1.48}
{'loss': 0.0017, 'learning_rate': 1.4696969696969697e-05, 'epoch': 1.49}
{'loss': 0.0009, 'learning_rate': 1.4545454545454545e-05, 'epoch': 1.49}
{'loss': 0.0012, 'learning_rate': 1.4393939393939396e-05, 'epoch': 1.5}
{'loss': 0.0002, 'learning_rate': 1.4242424242424243e-05, 'epoch': 1.51}
{'loss': 0.0008, 'learning_rate': 1.409090909090909e-05, 'epoch': 1.51}
{'loss': 0.0022, 'learning_rate': 1.3939393939393942e-05, 'epoch': 1.52}
{'loss': 0.0011, 'learning_rate': 1.3787878787878789e-05, 'epoch': 1.52}
{'loss': 0.0007, 'learning_rate': 1.3636363636363637e-05, 'epoch': 1.53}
{'loss': 0.0006, 'learning_rate': 1.3484

  0%|          | 0/1900 [00:00<?, ?it/s]

{'eval_loss': 0.01808127947151661, 'eval_recall': 0.9796670479095184, 'eval_precision': 0.9252280364858377, 'eval_f1': 0.9774550490839132, 'eval_runtime': 59.075, 'eval_samples_per_second': 32.162, 'eval_steps_per_second': 32.162, 'epoch': 1.68}
{'loss': 0.0067, 'learning_rate': 8.93939393939394e-06, 'epoch': 1.69}
{'loss': 0.0005, 'learning_rate': 8.787878787878788e-06, 'epoch': 1.69}
{'loss': 0.0006, 'learning_rate': 8.636363636363637e-06, 'epoch': 1.7}
{'loss': 0.0012, 'learning_rate': 8.484848484848486e-06, 'epoch': 1.71}
{'loss': 0.0004, 'learning_rate': 8.333333333333334e-06, 'epoch': 1.71}
{'loss': 0.0081, 'learning_rate': 8.181818181818183e-06, 'epoch': 1.72}
{'loss': 0.0044, 'learning_rate': 8.03030303030303e-06, 'epoch': 1.72}
{'loss': 0.0005, 'learning_rate': 7.878787878787878e-06, 'epoch': 1.73}
{'loss': 0.0436, 'learning_rate': 7.727272727272727e-06, 'epoch': 1.73}
{'loss': 0.0002, 'learning_rate': 7.5757575757575764e-06, 'epoch': 1.74}
{'loss': 0.0001, 'learning_rate': 7.

  0%|          | 0/1900 [00:00<?, ?it/s]

{'eval_loss': 0.009772734716534615, 'eval_recall': 0.9950438429279451, 'eval_precision': 0.9278350515463918, 'eval_f1': 0.9922793472539042, 'eval_runtime': 56.3969, 'eval_samples_per_second': 33.69, 'eval_steps_per_second': 33.69, 'epoch': 1.89}
{'loss': 0.0002, 'learning_rate': 2.8787878787878793e-06, 'epoch': 1.9}
{'loss': 0.0008, 'learning_rate': 2.7272727272727272e-06, 'epoch': 1.91}
{'loss': 0.0006, 'learning_rate': 2.575757575757576e-06, 'epoch': 1.91}
{'loss': 0.0004, 'learning_rate': 2.4242424242424244e-06, 'epoch': 1.92}
{'loss': 0.0074, 'learning_rate': 2.2727272727272728e-06, 'epoch': 1.92}
{'loss': 0.0011, 'learning_rate': 2.1212121212121216e-06, 'epoch': 1.93}
{'loss': 0.0057, 'learning_rate': 1.9696969696969695e-06, 'epoch': 1.93}
{'loss': 0.0055, 'learning_rate': 1.818181818181818e-06, 'epoch': 1.94}
{'loss': 0.008, 'learning_rate': 1.6666666666666667e-06, 'epoch': 1.94}
{'loss': 0.0003, 'learning_rate': 1.5151515151515152e-06, 'epoch': 1.95}
{'loss': 0.0668, 'learning_r

TrainOutput(global_step=3800, training_loss=0.12110708509250176, metrics={'train_runtime': 1753.8193, 'train_samples_per_second': 8.666, 'train_steps_per_second': 2.167, 'train_loss': 0.12110708509250176, 'epoch': 2.0})

In [25]:
create_submission(model, f"submission.csv")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


    row_id  document  token           label
0        0         7      9  B-NAME_STUDENT
1        1         7     10  I-NAME_STUDENT
2        2         7    482  B-NAME_STUDENT
3        3         7    483  I-NAME_STUDENT
4        4         7    741  B-NAME_STUDENT
5        5         7    742  I-NAME_STUDENT
6        6        10      0  B-NAME_STUDENT
7        7        10      1  I-NAME_STUDENT
8        8        10    464  B-NAME_STUDENT
9        9        10    465  I-NAME_STUDENT
10      10        16      4  B-NAME_STUDENT
11      11        16      5  I-NAME_STUDENT
12      12        20      5  B-NAME_STUDENT
13      13        20      6  I-NAME_STUDENT
14      14        20      8  I-NAME_STUDENT
15      15        20    328  B-NAME_STUDENT
16      16        20    330  B-NAME_STUDENT
17      17        56     12  B-NAME_STUDENT
18      18        56     13  I-NAME_STUDENT
19      19        86      6  B-NAME_STUDENT
20      20        86      7  I-NAME_STUDENT
21      21        93      0  B-N

In [26]:
unfreeze(model)
trainer=get_trainer(model, train_ds, valid_ds, learnrate_multiplier=parameter["lr_scale_unfreeze"])
trainer.train()

https://app.neptune.ai/bernd.heidemann/PII/e/PII-223




  0%|          | 0/3800 [00:00<?, ?it/s]

{'loss': 0.0004, 'learning_rate': 1.0000000000000001e-07, 'epoch': 0.01}
{'loss': 0.0009, 'learning_rate': 2.0000000000000002e-07, 'epoch': 0.01}
{'loss': 0.0003, 'learning_rate': 3.0000000000000004e-07, 'epoch': 0.02}
{'loss': 0.0018, 'learning_rate': 4.0000000000000003e-07, 'epoch': 0.02}
{'loss': 0.0232, 'learning_rate': 5.000000000000001e-07, 'epoch': 0.03}
{'loss': 0.0014, 'learning_rate': 6.000000000000001e-07, 'epoch': 0.03}
{'loss': 0.0007, 'learning_rate': 7.000000000000001e-07, 'epoch': 0.04}
{'loss': 0.0006, 'learning_rate': 8.000000000000001e-07, 'epoch': 0.04}
{'loss': 0.0002, 'learning_rate': 9.000000000000001e-07, 'epoch': 0.05}
{'loss': 0.0002, 'learning_rate': 1.0000000000000002e-06, 'epoch': 0.05}
{'loss': 0.0041, 'learning_rate': 1.1e-06, 'epoch': 0.06}
{'loss': 0.0014, 'learning_rate': 1.2000000000000002e-06, 'epoch': 0.06}
{'loss': 0.0708, 'learning_rate': 1.3e-06, 'epoch': 0.07}
{'loss': 0.0064, 'learning_rate': 1.4000000000000001e-06, 'epoch': 0.07}
{'loss': 0.00

  0%|          | 0/1900 [00:00<?, ?it/s]

{'eval_loss': 0.032189611345529556, 'eval_recall': 0.9785233193544288, 'eval_precision': 0.9595015576323987, 'eval_f1': 0.9777777777777777, 'eval_runtime': 55.827, 'eval_samples_per_second': 34.034, 'eval_steps_per_second': 34.034, 'epoch': 0.21}
{'loss': 0.0021, 'learning_rate': 4.1e-06, 'epoch': 0.22}
{'loss': 0.0002, 'learning_rate': 4.2000000000000004e-06, 'epoch': 0.22}
{'loss': 0.0015, 'learning_rate': 4.3e-06, 'epoch': 0.23}
{'loss': 0.0006, 'learning_rate': 4.4e-06, 'epoch': 0.23}
{'loss': 0.0042, 'learning_rate': 4.5e-06, 'epoch': 0.24}
{'loss': 0.0011, 'learning_rate': 4.600000000000001e-06, 'epoch': 0.24}
{'loss': 0.0018, 'learning_rate': 4.7e-06, 'epoch': 0.25}
{'loss': 0.0007, 'learning_rate': 4.800000000000001e-06, 'epoch': 0.25}
{'loss': 0.0004, 'learning_rate': 4.9000000000000005e-06, 'epoch': 0.26}
{'loss': 0.0001, 'learning_rate': 5e-06, 'epoch': 0.26}
{'loss': 0.0127, 'learning_rate': 4.984848484848485e-06, 'epoch': 0.27}
{'loss': 0.0001, 'learning_rate': 4.969696969

  0%|          | 0/1900 [00:00<?, ?it/s]

{'eval_loss': 0.019631382077932358, 'eval_recall': 0.9764900241453806, 'eval_precision': 0.9561971129915381, 'eval_f1': 0.9756936135299202, 'eval_runtime': 55.7665, 'eval_samples_per_second': 34.071, 'eval_steps_per_second': 34.071, 'epoch': 0.42}
{'loss': 0.0283, 'learning_rate': 4.53030303030303e-06, 'epoch': 0.43}
{'loss': 0.0059, 'learning_rate': 4.5151515151515155e-06, 'epoch': 0.43}
{'loss': 0.0011, 'learning_rate': 4.5e-06, 'epoch': 0.44}
{'loss': 0.0028, 'learning_rate': 4.4848484848484855e-06, 'epoch': 0.44}
{'loss': 0.0004, 'learning_rate': 4.46969696969697e-06, 'epoch': 0.45}
{'loss': 0.0002, 'learning_rate': 4.454545454545455e-06, 'epoch': 0.45}
{'loss': 0.0003, 'learning_rate': 4.43939393939394e-06, 'epoch': 0.46}
{'loss': 0.0001, 'learning_rate': 4.424242424242425e-06, 'epoch': 0.46}
{'loss': 0.0025, 'learning_rate': 4.409090909090909e-06, 'epoch': 0.47}
{'loss': 0.001, 'learning_rate': 4.393939393939394e-06, 'epoch': 0.47}
{'loss': 0.0002, 'learning_rate': 4.378787878787

  0%|          | 0/1900 [00:00<?, ?it/s]

{'eval_loss': 0.0054142167791724205, 'eval_recall': 0.9954250857796416, 'eval_precision': 0.9237028301886793, 'eval_f1': 0.9924611973392463, 'eval_runtime': 55.8742, 'eval_samples_per_second': 34.005, 'eval_steps_per_second': 34.005, 'epoch': 0.63}
{'loss': 0.0105, 'learning_rate': 3.9242424242424244e-06, 'epoch': 0.64}
{'loss': 0.0021, 'learning_rate': 3.90909090909091e-06, 'epoch': 0.64}
{'loss': 0.0001, 'learning_rate': 3.8939393939393944e-06, 'epoch': 0.65}
{'loss': 0.0042, 'learning_rate': 3.878787878787879e-06, 'epoch': 0.65}
{'loss': 0.0056, 'learning_rate': 3.863636363636364e-06, 'epoch': 0.66}
{'loss': 0.0005, 'learning_rate': 3.848484848484848e-06, 'epoch': 0.66}
{'loss': 0.0003, 'learning_rate': 3.833333333333334e-06, 'epoch': 0.67}
{'loss': 0.0009, 'learning_rate': 3.818181818181819e-06, 'epoch': 0.67}
{'loss': 0.0008, 'learning_rate': 3.803030303030303e-06, 'epoch': 0.68}
{'loss': 0.0011, 'learning_rate': 3.7878787878787882e-06, 'epoch': 0.68}
{'loss': 0.0002, 'learning_ra

  0%|          | 0/1900 [00:00<?, ?it/s]

{'eval_loss': 0.015537953935563564, 'eval_recall': 0.9894522811030626, 'eval_precision': 0.9553374233128834, 'eval_f1': 0.9880951799877975, 'eval_runtime': 55.8738, 'eval_samples_per_second': 34.005, 'eval_steps_per_second': 34.005, 'epoch': 0.84}
{'loss': 0.0009, 'learning_rate': 3.3181818181818188e-06, 'epoch': 0.85}
{'loss': 0.0007, 'learning_rate': 3.3030303030303033e-06, 'epoch': 0.85}
{'loss': 0.0001, 'learning_rate': 3.2878787878787883e-06, 'epoch': 0.86}
{'loss': 0.0003, 'learning_rate': 3.272727272727273e-06, 'epoch': 0.86}
{'loss': 0.0003, 'learning_rate': 3.257575757575758e-06, 'epoch': 0.87}
{'loss': 0.0001, 'learning_rate': 3.2424242424242425e-06, 'epoch': 0.87}
{'loss': 0.2165, 'learning_rate': 3.227272727272728e-06, 'epoch': 0.88}
{'loss': 0.0005, 'learning_rate': 3.2121212121212125e-06, 'epoch': 0.88}
{'loss': 0.0003, 'learning_rate': 3.196969696969697e-06, 'epoch': 0.89}
{'loss': 0.0547, 'learning_rate': 3.181818181818182e-06, 'epoch': 0.89}
{'loss': 0.0008, 'learning_

  0%|          | 0/1900 [00:00<?, ?it/s]

{'eval_loss': 0.011750098317861557, 'eval_recall': 0.9895793620536282, 'eval_precision': 0.9525382262996942, 'eval_f1': 0.9881015129331381, 'eval_runtime': 55.8907, 'eval_samples_per_second': 33.995, 'eval_steps_per_second': 33.995, 'epoch': 1.05}
{'loss': 0.0003, 'learning_rate': 2.7121212121212127e-06, 'epoch': 1.06}
{'loss': 0.0001, 'learning_rate': 2.6969696969696972e-06, 'epoch': 1.06}
{'loss': 0.0001, 'learning_rate': 2.6818181818181822e-06, 'epoch': 1.07}
{'loss': 0.0001, 'learning_rate': 2.666666666666667e-06, 'epoch': 1.07}
{'loss': 0.0004, 'learning_rate': 2.6515151515151514e-06, 'epoch': 1.08}
{'loss': 0.0002, 'learning_rate': 2.6363636363636364e-06, 'epoch': 1.08}
{'loss': 0.0004, 'learning_rate': 2.621212121212122e-06, 'epoch': 1.09}
{'loss': 0.0032, 'learning_rate': 2.6060606060606064e-06, 'epoch': 1.09}
{'loss': 0.0004, 'learning_rate': 2.590909090909091e-06, 'epoch': 1.1}
{'loss': 0.0007, 'learning_rate': 2.575757575757576e-06, 'epoch': 1.11}
{'loss': 0.0002, 'learning_

  0%|          | 0/1900 [00:00<?, ?it/s]

{'eval_loss': 0.021222904324531555, 'eval_recall': 0.9791587241072564, 'eval_precision': 0.9682080924855492, 'eval_f1': 0.9787329675644776, 'eval_runtime': 55.8967, 'eval_samples_per_second': 33.991, 'eval_steps_per_second': 33.991, 'epoch': 1.26}
{'loss': 0.0008, 'learning_rate': 2.106060606060606e-06, 'epoch': 1.27}
{'loss': 0.0006, 'learning_rate': 2.090909090909091e-06, 'epoch': 1.27}
{'loss': 0.0001, 'learning_rate': 2.075757575757576e-06, 'epoch': 1.28}
{'loss': 0.0002, 'learning_rate': 2.0606060606060607e-06, 'epoch': 1.28}
{'loss': 0.0002, 'learning_rate': 2.0454545454545457e-06, 'epoch': 1.29}
{'loss': 0.0003, 'learning_rate': 2.0303030303030303e-06, 'epoch': 1.29}
{'loss': 0.0008, 'learning_rate': 2.0151515151515153e-06, 'epoch': 1.3}
{'loss': 0.0001, 'learning_rate': 2.0000000000000003e-06, 'epoch': 1.31}
{'loss': 0.0043, 'learning_rate': 1.984848484848485e-06, 'epoch': 1.31}
{'loss': 0.0001, 'learning_rate': 1.96969696969697e-06, 'epoch': 1.32}
{'loss': 0.0002, 'learning_ra

  0%|          | 0/1900 [00:00<?, ?it/s]

{'eval_loss': 0.007906520739197731, 'eval_recall': 0.9949167619773795, 'eval_precision': 0.9388415877203502, 'eval_f1': 0.9926364452073498, 'eval_runtime': 55.8833, 'eval_samples_per_second': 33.999, 'eval_steps_per_second': 33.999, 'epoch': 1.47}
{'loss': 0.0002, 'learning_rate': 1.5e-06, 'epoch': 1.48}
{'loss': 0.0001, 'learning_rate': 1.484848484848485e-06, 'epoch': 1.48}
{'loss': 0.0012, 'learning_rate': 1.4696969696969698e-06, 'epoch': 1.49}
{'loss': 0.0003, 'learning_rate': 1.4545454545454546e-06, 'epoch': 1.49}
{'loss': 0.0006, 'learning_rate': 1.4393939393939396e-06, 'epoch': 1.5}
{'loss': 0.0001, 'learning_rate': 1.4242424242424244e-06, 'epoch': 1.51}
{'loss': 0.0004, 'learning_rate': 1.409090909090909e-06, 'epoch': 1.51}
{'loss': 0.0011, 'learning_rate': 1.3939393939393942e-06, 'epoch': 1.52}
{'loss': 0.0002, 'learning_rate': 1.3787878787878788e-06, 'epoch': 1.52}
{'loss': 0.0002, 'learning_rate': 1.3636363636363636e-06, 'epoch': 1.53}
{'loss': 0.0002, 'learning_rate': 1.3484

  0%|          | 0/1900 [00:00<?, ?it/s]

{'eval_loss': 0.010066197253763676, 'eval_recall': 0.9952980048290762, 'eval_precision': 0.9394266522729999, 'eval_f1': 0.9930264993026499, 'eval_runtime': 55.8208, 'eval_samples_per_second': 34.037, 'eval_steps_per_second': 34.037, 'epoch': 1.68}
{'loss': 0.0012, 'learning_rate': 8.93939393939394e-07, 'epoch': 1.69}
{'loss': 0.0003, 'learning_rate': 8.787878787878788e-07, 'epoch': 1.69}
{'loss': 0.0002, 'learning_rate': 8.636363636363637e-07, 'epoch': 1.7}
{'loss': 0.0001, 'learning_rate': 8.484848484848486e-07, 'epoch': 1.71}
{'loss': 0.0003, 'learning_rate': 8.333333333333333e-07, 'epoch': 1.71}
{'loss': 0.0001, 'learning_rate': 8.181818181818182e-07, 'epoch': 1.72}
{'loss': 0.0003, 'learning_rate': 8.030303030303031e-07, 'epoch': 1.72}
{'loss': 0.0001, 'learning_rate': 7.878787878787879e-07, 'epoch': 1.73}
{'loss': 0.0008, 'learning_rate': 7.727272727272727e-07, 'epoch': 1.73}
{'loss': 0.0001, 'learning_rate': 7.575757575757576e-07, 'epoch': 1.74}
{'loss': 0.0001, 'learning_rate': 

  0%|          | 0/1900 [00:00<?, ?it/s]

{'eval_loss': 0.009950129315257072, 'eval_recall': 0.9952980048290762, 'eval_precision': 0.9444109489931267, 'eval_f1': 0.9932396179847622, 'eval_runtime': 55.8415, 'eval_samples_per_second': 34.025, 'eval_steps_per_second': 34.025, 'epoch': 1.89}
{'loss': 0.0001, 'learning_rate': 2.878787878787879e-07, 'epoch': 1.9}
{'loss': 0.0003, 'learning_rate': 2.7272727272727274e-07, 'epoch': 1.91}
{'loss': 0.0002, 'learning_rate': 2.575757575757576e-07, 'epoch': 1.91}
{'loss': 0.0003, 'learning_rate': 2.4242424242424244e-07, 'epoch': 1.92}
{'loss': 0.0002, 'learning_rate': 2.2727272727272729e-07, 'epoch': 1.92}
{'loss': 0.0002, 'learning_rate': 2.1212121212121216e-07, 'epoch': 1.93}
{'loss': 0.0004, 'learning_rate': 1.9696969696969698e-07, 'epoch': 1.93}
{'loss': 0.0003, 'learning_rate': 1.8181818181818183e-07, 'epoch': 1.94}
{'loss': 0.0019, 'learning_rate': 1.6666666666666668e-07, 'epoch': 1.94}
{'loss': 0.0001, 'learning_rate': 1.5151515151515152e-07, 'epoch': 1.95}
{'loss': 0.0029, 'learnin

TrainOutput(global_step=3800, training_loss=0.0034090387485278036, metrics={'train_runtime': 1660.5459, 'train_samples_per_second': 9.152, 'train_steps_per_second': 2.288, 'train_loss': 0.0034090387485278036, 'epoch': 2.0})

In [27]:
create_submission(model, f"submission.csv")


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


    row_id  document  token           label
0        0         7      9  B-NAME_STUDENT
1        1         7     10  I-NAME_STUDENT
2        2         7    482  B-NAME_STUDENT
3        3         7    483  I-NAME_STUDENT
4        4         7    741  B-NAME_STUDENT
5        5         7    742  I-NAME_STUDENT
6        6        10      0  B-NAME_STUDENT
7        7        10      1  I-NAME_STUDENT
8        8        10    464  B-NAME_STUDENT
9        9        10    465  I-NAME_STUDENT
10      10        16      4  B-NAME_STUDENT
11      11        16      5  I-NAME_STUDENT
12      12        20      5  B-NAME_STUDENT
13      13        20      6  I-NAME_STUDENT
14      14        20      8  I-NAME_STUDENT
15      15        56     12  B-NAME_STUDENT
16      16        56     13  I-NAME_STUDENT
17      17        86      6  B-NAME_STUDENT
18      18        86      7  I-NAME_STUDENT
19      19        93      0  B-NAME_STUDENT
20      20        93      1  I-NAME_STUDENT
21      21       104      7  B-N