In [39]:
from transformers import AutoTokenizer, TrainingArguments, Trainer, AutoModelForTokenClassification
from pathlib import Path
import numpy as np
import torch
from tokenizers import AddedToken
from tqdm.notebook import tqdm
from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts
import pandas as pd
from datasets import Dataset

kaggle=False

path="/kaggle/input/pii-detection-removal-from-educational-data" if kaggle else "data"
train_path = path + "/train.json"
test_path = path + "/test.json"

mixtral_path="data/mpware_mixtral8x7b_v1.1.json" if not kaggle else "/kaggle/input/mixtral-8x7b-v11/mixtral8x7b_v1.1.json"

model_path = "/kaggle/input/huggingfacedebertav3variants/deberta-v3-base" if kaggle else "microsoft/deberta-v3-base"

if not kaggle: import neptune
if not kaggle: from seqeval.metrics import recall_score, precision_score, f1_score, accuracy_score

https://www.kaggle.com/datasets/mpware/pii-mixtral8x7b-generated-essays

In [40]:
cross_entropy_weight_multi = 400

CROSS_ENTROPY_WEIGHTS = [cross_entropy_weight_multi]*12
CROSS_ENTROPY_WEIGHTS.append(1)


# best PII-265

parameter= {
    "model": model_path,
    "max_length": 512,
    "stride": 128,
    "inference_max_length": 2500,
    "batch_size": 8,
    "inference_batch_size": 1,
    "lr": 5e-05,
    "lr_scale_unfreeze": 0.01,
    "filter_no_pii_percent_allow": 0.2,
    "notebook": "20_deberta base_1024len.ipynb",
    "CROSS_ENTROPY_WEIGHT_MULTI": cross_entropy_weight_multi,
    "epochs_before_unfreeze": 1,
    "epochs_after_unfreeze": 2,
    "train_test_split": 0.2,
    "num_proc": 16 if not kaggle else 2, 
    "freeze_embeddings": False,
    "freeze_layers": 6,
    "warumup_steps": 500,
    "weight_decay": 0.01,
    "logging_dir": './logs',
    "logging_steps": 10,
    "evaluation_strategy": "steps",
    "eval_steps": 400,
    "save_steps": 400,
    "save_total_limit": 3,
    "load_best_model_at_end": False,
    "metric_for_best_model": "f1",
    "greater_is_better": True,
    "overwrite_output_dir": True,
    "report_to": "none",
}

print(parameter["lr"]*parameter["lr_scale_unfreeze"])

5.000000000000001e-07


In [41]:
target = [
    'B-EMAIL', 'B-ID_NUM', 'B-NAME_STUDENT', 'B-PHONE_NUM', 
    'B-STREET_ADDRESS', 'B-URL_PERSONAL', 'B-USERNAME', 'I-ID_NUM', 
    'I-NAME_STUDENT', 'I-PHONE_NUM', 'I-STREET_ADDRESS', 'I-URL_PERSONAL'
]

In [42]:
from itertools import chain
import json

data = json.load(open(train_path))
all_labels = sorted(list(set(chain(*[x["labels"] for x in data]))))
label2id = {l: i for i,l in enumerate(all_labels)}
id2label = {v:k for k,v in label2id.items()}

In [43]:
import random

def tokenize(example, tokenizer, label2id, max_length, all_labels_list, stride):
    text = []
    import numpy as np

    # these are at the character level
    labels = []
    targets = []

    for t, l, ws in zip(example["tokens"], example["labels"], example["trailing_whitespace"]):

        text.append(t)
        labels.extend([l]*len(t))
        
        if l in all_labels_list:
            targets.append(1)
        else:
            targets.append(0)
        # if there is trailing whitespace
        if ws:
            text.append(" ")
            labels.append("O")

    tokenized = tokenizer(
        "".join(text),              
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True, 
        truncation=True, 
        max_length=max_length, padding="max_length")
    
    
    target_num = sum(targets)
    labels = np.array(labels)

    text = "".join(text)

    for i in range(len(tokenized.input_ids)):
        single_tokenized = {
            "input_ids":tokenized["input_ids"][i],
            "token_type_ids":tokenized["token_type_ids"][i],
            "attention_mask":tokenized["attention_mask"][i],
            "offset_mapping":tokenized["offset_mapping"][i],
            "overflow_to_sample_mapping":tokenized["overflow_to_sample_mapping"][i],
        }
        yield get_token_data_for_chunk(single_tokenized, text, label2id, labels, target_num)
   
            

def get_token_data_for_chunk(tokenized, text, label2id, labels, target_num):
    token_labels = []

    for start_idx, end_idx in tokenized["offset_mapping"]:

        # CLS token
        if start_idx == 0 and end_idx == 0: 
            token_labels.append(label2id["O"])
            continue

        # case when token starts with whitespace
        if text[start_idx].isspace():
            start_idx += 1

        try:
            token_labels.append(label2id[labels[start_idx]])
        except:
            token_labels.append(label2id["O"])

    length = len(tokenized["input_ids"])

    return {
        **tokenized,
        "labels": token_labels,
        "length": length,
        "target_num": target_num,
        "group": 1 if target_num>0 else 0
    }

# https://www.kaggle.com/competitions/pii-detection-removal-from-educational-data/discussion/468844
def filter_no_pii(example, percent_allow=parameter["filter_no_pii_percent_allow"]):
    # Return True if there is PII
    # Or 20% of the time if there isn't
    has_pii = set("O") != set(example["labels"])
    return has_pii or (random.random() < percent_allow)

In [44]:
tokenizer = AutoTokenizer.from_pretrained(parameter["model"])
tokenizer.add_tokens(AddedToken("\n", normalized=False)) 

data = json.load(open(train_path))
for tokenized in tokenize(data[0], tokenizer, label2id, 2000, all_labels, 128):
    print(tokenized)
    


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


{'input_ids': [1, 2169, 12103, 270, 3513, 28310, 4593, 271, 57498, 24360, 16789, 271, 1609, 30065, 12287, 662, 86260, 128001, 128001, 6738, 429, 1857, 128001, 128001, 279, 1637, 273, 380, 264, 408, 305, 6998, 1879, 308, 384, 390, 262, 6870, 265, 266, 663, 269, 262, 791, 2269, 260, 128001, 128001, 458, 1444, 269, 266, 791, 2269, 302, 1663, 264, 262, 3742, 265, 72791, 1398, 897, 260, 263, 72791, 1398, 736, 260, 287, 15724, 261, 10040, 268, 5152, 271, 92671, 2531, 280, 51388, 260, 3045, 294, 9110, 25247, 42255, 268, 1931, 280, 65426, 7933, 260, 285, 261, 262, 791, 2269, 287, 698, 59729, 6000, 285, 269, 266, 4981, 5190, 3395, 272, 3832, 262, 1008, 7392, 265, 262, 791, 263, 1279, 262, 1959, 280, 268, 1068, 264, 282, 1315, 260, 45110, 30097, 435, 128001, 128001, 329, 1637, 303, 386, 5228, 294, 128001, 128001, 1795, 325, 269, 3469, 264, 305, 263, 490, 298, 1449, 1318, 1146, 1578, 263, 295, 282, 619, 1126, 128001, 128001, 1795, 325, 269, 18440, 128001, 128001, 1795, 325, 1279, 51669, 263, 9563

In [45]:
def get_ds_dict_from_data(data):
    tokenized_list=[]
    for i, example in tqdm(enumerate(data), total=len(data)):
        for tokenized in tokenize(example, tokenizer, label2id, parameter["max_length"], all_labels, parameter["stride"]):
            tokenized_list.append(tokenized)
    ds_dict={}
    for k in tokenized_list[0].keys():
        ds_dict[k] = [x[k] for x in tokenized_list]
    return ds_dict

ds=get_ds_dict_from_data(data[:10])
print(len(ds["input_ids"]))

  0%|          | 0/10 [00:00<?, ?it/s]

23


In [46]:
len_data=len(data)
valid_idx = random.sample(range(len_data), int(len_data*parameter["train_test_split"]))
train_idx = list(set(range(len_data)) - set(valid_idx))

train_data = [data[i] for i in train_idx]
valid_data = [data[i] for i in valid_idx]




mixtral_data=json.load(open(mixtral_path))

train_data=train_data+mixtral_data


In [47]:
train_data_tokenized = get_ds_dict_from_data(train_data)
valid_data_tokenized = get_ds_dict_from_data(valid_data)


  0%|          | 0/8138 [00:00<?, ?it/s]

  0%|          | 0/1361 [00:00<?, ?it/s]

In [48]:
train_ds = Dataset.from_dict(train_data_tokenized)
train_ds=train_ds.filter(filter_no_pii, num_proc=parameter["num_proc"])
valid_ds = Dataset.from_dict(valid_data_tokenized)

print(len(train_ds), len(valid_ds))


Filter (num_proc=16):   0%|          | 0/17528 [00:00<?, ? examples/s]

17528 2829


In [49]:
def tokenize_inference(example, tokenizer, max_length):
        text = []
        for t,  ws in zip(example["tokens"], example["trailing_whitespace"]):
            text.append(t)
            if ws:
                text.append(" ")
        tokenized = tokenizer("".join(text), return_offsets_mapping=True, truncation=True, max_length=max_length, padding="max_length")
        text = "".join(text)
        length = len(tokenized.input_ids)
        return {
            **tokenized,
            "length": length,
        }
        
class TestTokenizer():
    def __init__(self, tokenizer):
        self.tokenizer = tokenizer
    
    def preprocess(self, example):
        # Preprocess the tokens and labels by adding trailing whitespace and labels
        tokens = []
        tokens_without_ws = []
        token_map = [] # Use the index as labels
        index = 0
        for token, t_ws in zip(example["tokens"], example["trailing_whitespace"]):
            tokens_without_ws.append(token)
            tokens.append(token)
            token_map.extend([index] * len(token))
            # Added trailing whitespace and label if true and 
            if t_ws:
                tokens.append(" ")
                token_map.append(-1)
            index += 1
        return tokens, token_map, tokens_without_ws
    
    def tokenize(self, example):
        tokens, token_map, tokens_without_ws = self.preprocess(example)
        text = "".join(tokens)
        tokenized = self.tokenizer(text, return_offsets_mapping=True, padding="max_length",
                                   truncation=True, max_length=parameter["inference_max_length"])
        return {**tokenized, "token_map": token_map, "tokens": tokens, "tokens_without_ws": tokens_without_ws} 

class PiiDatasetInference(torch.utils.data.Dataset):
        def __init__(self, dataset, tokenizer):
            self.dataset = dataset
            self.tokenizer=TestTokenizer(tokenizer)
            
        def __getitem__(self, idx):
            vals=self.tokenizer.tokenize(self.dataset[idx])
            input_ids = torch.tensor(vals["input_ids"])
            attention_mask = torch.tensor(vals["attention_mask"])
            document_id = self.dataset[idx]["document"]
            return input_ids, attention_mask, document_id, vals
        
        def __len__(self):
            return len(self.dataset)

# Convert preds to a list of dictionaries
def to_test_submission(preds=None, dataset=None, document_ids=None, id2label=None):
    pairs = []
    row_id = 0
    results = []
    
    for i in range(len(preds)):
        input_ids, attention_mask, document_id, vals = dataset[i]
        token_map=vals["token_map"]
        offsets=vals["offset_mapping"]
        tokens=vals["tokens_without_ws"]
        #print("tokens", tokens)
        pred=preds[i]
        #print("original_text", original_text)
        #print("token_map", token_map)
        #print("offsets", offsets)   
        #print("pred", pred)


        for token_pred, input_id, (start_idx, end_idx) in zip(pred, input_ids, offsets):
            #print("\nnow doing ", start_idx,  end_idx, token_pred)
            if start_idx == 0 and end_idx == 0: # Skip 0 offset
                continue
            # Skip spaces 
            while start_idx < len(token_map):
                #print("loop, start_idx now", start_idx) 
                #print(" tokens[token_map[start_idx]]: ", tokens[token_map[start_idx]] if not tokens[token_map[start_idx]].isspace() else "WHITESPACE")          
                if token_map[start_idx] == -1: # Skip unknown tokens               
                    start_idx += 1
                elif tokens[token_map[start_idx]].isspace(): # Skip white space
                    start_idx += 1
                else:
                    break
            # Ensure start index < length
            if start_idx < len(token_map):
                token_id = token_map[start_idx]
                #print("token_id", token_id)
                #token_id= input_id.item()
                label_pred = id2label[token_pred.item()]
                #print("label_pred", label_pred)
                # ignore "O" and whitespace preds
                if label_pred != "O" and token_id != -1:
                    #print("is PII", token_id, label_pred)
                    token_str = tokens[token_id]
                    pair=(document_id, token_id)
                    if pair not in pairs:
                        results.append({
                            "row_id": row_id, 
                            "document": document_id,
                            "token": token_id, 
                            "label": label_pred,
                            "token_str": token_str
                        })
                        pairs.append(pair)
                        row_id += 1

    # Create a dataframe 
    return results

def create_submission(model, filename="submission.csv"):
    data = json.load(open(train_path))
    from itertools import chain
    all_labels = sorted(list(set(chain(*[x["labels"] for x in data]))))
    label2id = {l: i for i,l in enumerate(all_labels)}
    id2label = {v:k for k,v in label2id.items()}

    data=json.load(open(test_path))
    tokenizer = AutoTokenizer.from_pretrained(parameter["model"])
    my_dataset=PiiDatasetInference(data, tokenizer)
    loader=torch.utils.data.DataLoader(my_dataset, batch_size=1, shuffle=False)

    device=torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.eval()
    
    # stack all predictions into tensor
    all_preds = []

    for id, attention_mask, document_ids, vals in loader:
        id=id.to(device)
        attention_mask=attention_mask.to(device)
        preds=model(id, attention_mask).get('logits').argmax(dim=2)
        all_preds.append(preds)
        #for pred, id in zip(preds.flatten(), id.flatten()):
        #    if pred != 12:
                #print(f"Document: {document_id.item()} TOKEN:{tokenizer.decode(id)}  --- pred:{id2label[pred.item()]}")
        #        output[row_id]={"document":document_id.item(), "token":id.item(), "label":id2label[pred.item()]}
        #        row_id+=1
        #for pred, id in zip(preds.flatten(), id.flatten()):
        #    if pred != 12:
        #        print(f"TOKEN:{tokenizer.decode(id)}  --- pred:{id2label[pred.item()]}")
    
   
    all_preds = torch.cat(all_preds, dim=0)
    
    results = to_test_submission(preds=all_preds, dataset=my_dataset, document_ids=document_ids, id2label=id2label)
    if len(results) == 0:
        print("Error in create_submission(): No predictions made, probably because the model is not learning. Check the model and the data.")
        return
    df = pd.DataFrame(results)
    df=df[["row_id", "document", "token", "label"]]
    print(df)
    df.to_csv(filename, index=False)

#create_submission(MyModel(parameter['model'], len(label2id)).to(device), "submission_just_dumb.csv")
# create_submission(model, "submission.csv")
    




In [50]:
from transformers import DataCollatorForTokenClassification

collator = DataCollatorForTokenClassification(tokenizer, pad_to_multiple_of=16)

In [51]:
# using Trainer and TrainingArguments from transformers


def compute_metrics(p, all_labels):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (special tokens)
    true_predictions = [
        [all_labels[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [all_labels[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    
    recall = recall_score(true_labels, true_predictions)
    precision = precision_score(true_labels, true_predictions)
    f1_score = (1 + 5*5) * recall * precision / (5*5*precision + recall)
    
    results = {
        'recall': recall,
        'precision': precision,
        'f1': f1_score
    }
    return results

from functools import partial
from transformers import TrainerCallback
from transformers.trainer_callback import TrainerControl, TrainerState
from transformers.training_args import TrainingArguments

def get_trainer(model, train_dataloader, valid_dataloader, learnrate_multiplier=1.0):

    if not kaggle:
        from transformers.integrations import NeptuneCallback

        run = neptune.init_run(
            project="bernd.heidemann/PII",
            api_token="eyJhcGlfYWRkcmVzcyI6Imh0dHBzOi8vYXBwLm5lcHR1bmUuYWkiLCJhcGlfdXJsIjoiaHR0cHM6Ly9hcHAubmVwdHVuZS5haSIsImFwaV9rZXkiOiIzNjBlYzVkNi0zZTUwLTQ1ODYtODhlNC02NDUxNDg0MDdjNzUifQ==",
        )  # your credentials
        run["parameters"] = {
        **parameter
        }

        neptune_callback = NeptuneCallback(run=run, log_model_weights=False, log_parameters=False)

    training_args = TrainingArguments(
        output_dir='./results',          # output directory
        num_train_epochs=parameter["epochs_before_unfreeze"]+parameter["epochs_after_unfreeze"],
        per_device_train_batch_size=parameter["batch_size"],  # batch size per device during training
        per_device_eval_batch_size=parameter["inference_batch_size"],   # batch size for evaluation
        warmup_steps=parameter["warumup_steps"],                # number of warmup steps for learning rate scheduler
        weight_decay=parameter["weight_decay"],               # strength of weight decay
        logging_dir=parameter["logging_dir"],            # directory for storing logs
        logging_steps=parameter["logging_steps"],
        evaluation_strategy=parameter["evaluation_strategy"],
        eval_steps=parameter["eval_steps"],
        save_steps=parameter["save_steps"],
        save_total_limit=parameter["save_total_limit"],
        load_best_model_at_end=parameter["load_best_model_at_end"],
        metric_for_best_model="f1" if not kaggle else "eval_loss",
        greater_is_better=True if not kaggle else False,
        overwrite_output_dir=parameter["overwrite_output_dir"],
        report_to=parameter["report_to"],
        learning_rate=parameter["lr"]
    )

    class FreezingCallback(TrainerCallback):
        def on_epoch_begin(self, args, state, control, model, **kwargs):
            if state.epoch == parameter["epochs_before_unfreeze"]:
                # change learning rate
                optimizer= kwargs["optimizer"]

                for param_group in optimizer.param_groups:
                    param_group['lr'] = parameter["lr"]*parameter["lr_scale_unfreeze"]
                for param in model.base_model.parameters():
                    param.requires_grad = True
                
    class MyTrainer(Trainer):
        def __init__(self, model=None, args=None, train_dataset=None, eval_dataset=None, compute_metrics=None, callbacks=None):
            super().__init__(model=model, args=args, train_dataset=train_dataset, eval_dataset=eval_dataset, compute_metrics=compute_metrics, callbacks=callbacks)
            # Definieren Sie hier Ihre Gewichte für die Klassen, z.B. torch.tensor([1.0, 2.0, 0.5])
            self.weight = torch.tensor(CROSS_ENTROPY_WEIGHTS).to(device)
            self.loss_func=torch.nn.CrossEntropyLoss(ignore_index=-100, weight=torch.tensor(CROSS_ENTROPY_WEIGHTS, dtype=torch.float32).to(device))

        def compute_loss(self, model, inputs, return_outputs=False):
            labels = inputs.get("labels")
            outputs = model(**inputs)
            logits = outputs.get('logits')
            loss = self.loss_func(logits.view(-1, self.model.config.num_labels), labels.view(-1))
            return (loss, outputs) if return_outputs else loss
        
    trainer = MyTrainer(
        model=model,                         # the instantiated 🤗 Transformers model to be trained
        args=training_args,                  # training arguments, defined above
        train_dataset=train_dataloader,         # training dataset
        eval_dataset=valid_dataloader,             # evaluation dataset
        compute_metrics=partial(compute_metrics, all_labels=all_labels) if not kaggle else None,
        callbacks=[neptune_callback, FreezingCallback()] if not kaggle else [FreezingCallback()]
    )
    return trainer

In [52]:


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [53]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"



model = AutoModelForTokenClassification.from_pretrained(
    parameter["model"],
    num_labels=len(all_labels),
    id2label=id2label,
    label2id=label2id,
    ignore_mismatched_sizes=True
)

if parameter['freeze_embeddings']:
    for param in model.deberta.embeddings.parameters():
        param.requires_grad = False
        
if parameter['freeze_layers'] > 0:
    for layer in model.deberta.encoder.layer[:parameter['freeze_layers']]:
        for param in layer.parameters():
            param.requires_grad = False

#my_model=MyModel(parameter['model'], len(label2id))
# set torch seed
torch.manual_seed(189237)
trainer=get_trainer(model, train_ds, valid_ds)
trainer.train()

Some weights of DebertaV2ForTokenClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


https://app.neptune.ai/bernd.heidemann/PII/e/PII-271


  0%|          | 0/6573 [00:00<?, ?it/s]

{'loss': 2.9511, 'learning_rate': 1.0000000000000002e-06, 'epoch': 0.0}
{'loss': 3.0291, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.01}
{'loss': 2.8849, 'learning_rate': 3e-06, 'epoch': 0.01}
{'loss': 2.7755, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.02}
{'loss': 2.328, 'learning_rate': 5e-06, 'epoch': 0.02}
{'loss': 2.1189, 'learning_rate': 6e-06, 'epoch': 0.03}
{'loss': 2.0533, 'learning_rate': 7.000000000000001e-06, 'epoch': 0.03}
{'loss': 1.6546, 'learning_rate': 8.000000000000001e-06, 'epoch': 0.04}
{'loss': 1.6616, 'learning_rate': 9e-06, 'epoch': 0.04}
{'loss': 1.0699, 'learning_rate': 1e-05, 'epoch': 0.05}
{'loss': 1.0664, 'learning_rate': 1.1000000000000001e-05, 'epoch': 0.05}
{'loss': 0.5222, 'learning_rate': 1.2e-05, 'epoch': 0.05}
{'loss': 1.1713, 'learning_rate': 1.3000000000000001e-05, 'epoch': 0.06}
{'loss': 0.7269, 'learning_rate': 1.4000000000000001e-05, 'epoch': 0.06}
{'loss': 0.7546, 'learning_rate': 1.5e-05, 'epoch': 0.07}
{'loss': 0.4735, 'learnin

  0%|          | 0/2829 [00:00<?, ?it/s]

{'eval_loss': 0.02020096406340599, 'eval_recall': 0.8482328482328483, 'eval_precision': 0.29715950473415875, 'eval_f1': 0.7917599641737573, 'eval_runtime': 38.5999, 'eval_samples_per_second': 73.29, 'eval_steps_per_second': 73.29, 'epoch': 0.18}
{'loss': 0.2963, 'learning_rate': 4.1e-05, 'epoch': 0.19}
{'loss': 0.2102, 'learning_rate': 4.2e-05, 'epoch': 0.19}
{'loss': 0.1455, 'learning_rate': 4.3e-05, 'epoch': 0.2}
{'loss': 0.0479, 'learning_rate': 4.4000000000000006e-05, 'epoch': 0.2}
{'loss': 0.1071, 'learning_rate': 4.5e-05, 'epoch': 0.21}
{'loss': 0.055, 'learning_rate': 4.600000000000001e-05, 'epoch': 0.21}
{'loss': 0.1106, 'learning_rate': 4.7e-05, 'epoch': 0.21}
{'loss': 0.0882, 'learning_rate': 4.8e-05, 'epoch': 0.22}
{'loss': 0.1225, 'learning_rate': 4.9e-05, 'epoch': 0.22}
{'loss': 0.1186, 'learning_rate': 5e-05, 'epoch': 0.23}
{'loss': 0.1582, 'learning_rate': 4.991766836818706e-05, 'epoch': 0.23}
{'loss': 0.0885, 'learning_rate': 4.9835336736374116e-05, 'epoch': 0.24}
{'los

  0%|          | 0/2829 [00:00<?, ?it/s]

{'eval_loss': 0.013400131836533546, 'eval_recall': 0.9272349272349273, 'eval_precision': 0.5357357357357357, 'eval_f1': 0.901886058720591, 'eval_runtime': 38.2334, 'eval_samples_per_second': 73.993, 'eval_steps_per_second': 73.993, 'epoch': 0.37}
{'loss': 0.056, 'learning_rate': 4.7447719413798785e-05, 'epoch': 0.37}
{'loss': 0.0172, 'learning_rate': 4.736538778198584e-05, 'epoch': 0.37}
{'loss': 0.0608, 'learning_rate': 4.72830561501729e-05, 'epoch': 0.38}
{'loss': 0.0919, 'learning_rate': 4.7200724518359955e-05, 'epoch': 0.38}
{'loss': 0.0205, 'learning_rate': 4.711839288654701e-05, 'epoch': 0.39}
{'loss': 0.0385, 'learning_rate': 4.703606125473407e-05, 'epoch': 0.39}
{'loss': 0.0023, 'learning_rate': 4.6953729622921125e-05, 'epoch': 0.4}
{'loss': 0.0567, 'learning_rate': 4.687139799110819e-05, 'epoch': 0.4}
{'loss': 0.056, 'learning_rate': 4.6789066359295245e-05, 'epoch': 0.41}
{'loss': 0.0112, 'learning_rate': 4.67067347274823e-05, 'epoch': 0.41}
{'loss': 0.0068, 'learning_rate': 4

  0%|          | 0/2829 [00:00<?, ?it/s]

{'eval_loss': 0.02234867587685585, 'eval_recall': 0.9074844074844075, 'eval_precision': 0.5262206148282098, 'eval_f1': 0.8828814811933564, 'eval_runtime': 38.0555, 'eval_samples_per_second': 74.339, 'eval_steps_per_second': 74.339, 'epoch': 0.55}
{'loss': 0.2566, 'learning_rate': 4.4154454141281084e-05, 'epoch': 0.55}
{'loss': 0.0224, 'learning_rate': 4.407212250946814e-05, 'epoch': 0.56}
{'loss': 0.0163, 'learning_rate': 4.39897908776552e-05, 'epoch': 0.56}
{'loss': 0.0137, 'learning_rate': 4.3907459245842254e-05, 'epoch': 0.57}
{'loss': 0.0077, 'learning_rate': 4.382512761402931e-05, 'epoch': 0.57}
{'loss': 0.0853, 'learning_rate': 4.374279598221637e-05, 'epoch': 0.58}
{'loss': 0.5094, 'learning_rate': 4.366046435040343e-05, 'epoch': 0.58}
{'loss': 0.0103, 'learning_rate': 4.357813271859049e-05, 'epoch': 0.58}
{'loss': 0.0139, 'learning_rate': 4.3495801086777543e-05, 'epoch': 0.59}
{'loss': 0.0901, 'learning_rate': 4.34134694549646e-05, 'epoch': 0.59}
{'loss': 0.0699, 'learning_rate'

  0%|          | 0/2829 [00:00<?, ?it/s]

{'eval_loss': 0.012712503783404827, 'eval_recall': 0.8814968814968815, 'eval_precision': 0.47032723239046037, 'eval_f1': 0.852821722817468, 'eval_runtime': 37.8275, 'eval_samples_per_second': 74.787, 'eval_steps_per_second': 74.787, 'epoch': 0.73}
{'loss': 0.0168, 'learning_rate': 4.0861188868763376e-05, 'epoch': 0.73}
{'loss': 0.0039, 'learning_rate': 4.077885723695044e-05, 'epoch': 0.74}
{'loss': 0.0031, 'learning_rate': 4.0696525605137496e-05, 'epoch': 0.74}
{'loss': 0.0149, 'learning_rate': 4.061419397332455e-05, 'epoch': 0.75}
{'loss': 0.0482, 'learning_rate': 4.053186234151161e-05, 'epoch': 0.75}
{'loss': 0.0049, 'learning_rate': 4.0449530709698666e-05, 'epoch': 0.76}
{'loss': 0.0094, 'learning_rate': 4.036719907788572e-05, 'epoch': 0.76}
{'loss': 0.0082, 'learning_rate': 4.0284867446072786e-05, 'epoch': 0.77}
{'loss': 0.0218, 'learning_rate': 4.020253581425984e-05, 'epoch': 0.77}
{'loss': 0.0106, 'learning_rate': 4.01202041824469e-05, 'epoch': 0.78}
{'loss': 0.0539, 'learning_ra

  0%|          | 0/2829 [00:00<?, ?it/s]

{'eval_loss': 0.013211013749241829, 'eval_recall': 0.8076923076923077, 'eval_precision': 0.5886363636363636, 'eval_f1': 0.7962948364209697, 'eval_runtime': 37.9864, 'eval_samples_per_second': 74.474, 'eval_steps_per_second': 74.474, 'epoch': 0.91}
{'loss': 0.0327, 'learning_rate': 3.756792359624568e-05, 'epoch': 0.92}
{'loss': 0.0029, 'learning_rate': 3.748559196443274e-05, 'epoch': 0.92}
{'loss': 0.07, 'learning_rate': 3.7403260332619795e-05, 'epoch': 0.93}
{'loss': 0.0109, 'learning_rate': 3.732092870080685e-05, 'epoch': 0.93}
{'loss': 0.022, 'learning_rate': 3.723859706899391e-05, 'epoch': 0.94}
{'loss': 0.0096, 'learning_rate': 3.7156265437180965e-05, 'epoch': 0.94}
{'loss': 0.0016, 'learning_rate': 3.707393380536802e-05, 'epoch': 0.94}
{'loss': 0.0232, 'learning_rate': 3.6991602173555085e-05, 'epoch': 0.95}
{'loss': 0.0046, 'learning_rate': 3.690927054174214e-05, 'epoch': 0.95}
{'loss': 0.0021, 'learning_rate': 3.68269389099292e-05, 'epoch': 0.96}
{'loss': 0.003, 'learning_rate': 

  0%|          | 0/2829 [00:00<?, ?it/s]

{'eval_loss': 0.019407890737056732, 'eval_recall': 0.7286902286902287, 'eval_precision': 0.6332429990966576, 'eval_f1': 0.7244902015343643, 'eval_runtime': 38.152, 'eval_samples_per_second': 74.151, 'eval_steps_per_second': 74.151, 'epoch': 1.1}
{'loss': 0.0215, 'learning_rate': 3.4274658323727974e-05, 'epoch': 1.1}
{'loss': 0.0037, 'learning_rate': 3.419232669191503e-05, 'epoch': 1.1}
{'loss': 0.0103, 'learning_rate': 3.410999506010209e-05, 'epoch': 1.11}
{'loss': 0.0013, 'learning_rate': 3.402766342828915e-05, 'epoch': 1.11}
{'loss': 0.1877, 'learning_rate': 3.394533179647621e-05, 'epoch': 1.12}
{'loss': 0.0008, 'learning_rate': 3.3863000164663263e-05, 'epoch': 1.12}
{'loss': 0.0221, 'learning_rate': 3.378066853285033e-05, 'epoch': 1.13}
{'loss': 0.004, 'learning_rate': 3.3698336901037383e-05, 'epoch': 1.13}
{'loss': 0.0017, 'learning_rate': 3.361600526922444e-05, 'epoch': 1.14}
{'loss': 0.0104, 'learning_rate': 3.35336736374115e-05, 'epoch': 1.14}
{'loss': 0.0701, 'learning_rate': 3

  0%|          | 0/2829 [00:00<?, ?it/s]

{'eval_loss': 0.010665029287338257, 'eval_recall': 0.9282744282744283, 'eval_precision': 0.6297602256699577, 'eval_f1': 0.911653840113083, 'eval_runtime': 37.5572, 'eval_samples_per_second': 75.325, 'eval_steps_per_second': 75.325, 'epoch': 1.28}
{'loss': 0.0008, 'learning_rate': 3.098139305121028e-05, 'epoch': 1.28}
{'loss': 0.0016, 'learning_rate': 3.0899061419397336e-05, 'epoch': 1.29}
{'loss': 0.0015, 'learning_rate': 3.081672978758439e-05, 'epoch': 1.29}
{'loss': 0.0008, 'learning_rate': 3.073439815577145e-05, 'epoch': 1.3}
{'loss': 0.0647, 'learning_rate': 3.0652066523958506e-05, 'epoch': 1.3}
{'loss': 0.0007, 'learning_rate': 3.056973489214556e-05, 'epoch': 1.31}
{'loss': 0.0005, 'learning_rate': 3.048740326033262e-05, 'epoch': 1.31}
{'loss': 0.0063, 'learning_rate': 3.0405071628519676e-05, 'epoch': 1.31}
{'loss': 0.0466, 'learning_rate': 3.0322739996706732e-05, 'epoch': 1.32}
{'loss': 0.0324, 'learning_rate': 3.0240408364893796e-05, 'epoch': 1.32}
{'loss': 0.0013, 'learning_rat

  0%|          | 0/2829 [00:00<?, ?it/s]

{'eval_loss': 0.010004960000514984, 'eval_recall': 0.9760914760914761, 'eval_precision': 0.6361788617886179, 'eval_f1': 0.9564365744730863, 'eval_runtime': 37.577, 'eval_samples_per_second': 75.285, 'eval_steps_per_second': 75.285, 'epoch': 1.46}
{'loss': 0.001, 'learning_rate': 2.7688127778692575e-05, 'epoch': 1.47}
{'loss': 0.0007, 'learning_rate': 2.760579614687963e-05, 'epoch': 1.47}
{'loss': 0.0009, 'learning_rate': 2.7523464515066688e-05, 'epoch': 1.47}
{'loss': 0.0005, 'learning_rate': 2.7441132883253745e-05, 'epoch': 1.48}
{'loss': 0.0015, 'learning_rate': 2.7358801251440808e-05, 'epoch': 1.48}
{'loss': 0.0159, 'learning_rate': 2.7276469619627865e-05, 'epoch': 1.49}
{'loss': 0.0007, 'learning_rate': 2.719413798781492e-05, 'epoch': 1.49}
{'loss': 0.0052, 'learning_rate': 2.7111806356001978e-05, 'epoch': 1.5}
{'loss': 0.0604, 'learning_rate': 2.7029474724189034e-05, 'epoch': 1.5}
{'loss': 0.0029, 'learning_rate': 2.694714309237609e-05, 'epoch': 1.51}
{'loss': 0.0018, 'learning_ra

  0%|          | 0/2829 [00:00<?, ?it/s]

{'eval_loss': 0.008598806336522102, 'eval_recall': 0.9781704781704782, 'eval_precision': 0.5940656565656566, 'eval_f1': 0.9544355153312006, 'eval_runtime': 37.9449, 'eval_samples_per_second': 74.555, 'eval_steps_per_second': 74.555, 'epoch': 1.64}
{'loss': 0.0025, 'learning_rate': 2.4394862506174873e-05, 'epoch': 1.65}
{'loss': 0.0529, 'learning_rate': 2.431253087436193e-05, 'epoch': 1.65}
{'loss': 0.0255, 'learning_rate': 2.4230199242548987e-05, 'epoch': 1.66}
{'loss': 0.002, 'learning_rate': 2.4147867610736047e-05, 'epoch': 1.66}
{'loss': 0.0025, 'learning_rate': 2.4065535978923103e-05, 'epoch': 1.67}
{'loss': 0.0026, 'learning_rate': 2.398320434711016e-05, 'epoch': 1.67}
{'loss': 0.0022, 'learning_rate': 2.3900872715297217e-05, 'epoch': 1.68}
{'loss': 0.0091, 'learning_rate': 2.3818541083484273e-05, 'epoch': 1.68}
{'loss': 0.0034, 'learning_rate': 2.3736209451671333e-05, 'epoch': 1.68}
{'loss': 0.0009, 'learning_rate': 2.365387781985839e-05, 'epoch': 1.69}
{'loss': 0.0021, 'learning

  0%|          | 0/2829 [00:00<?, ?it/s]

{'eval_loss': 0.013387829065322876, 'eval_recall': 0.9636174636174636, 'eval_precision': 0.5904458598726114, 'eval_f1': 0.9407494145199062, 'eval_runtime': 37.8364, 'eval_samples_per_second': 74.769, 'eval_steps_per_second': 74.769, 'epoch': 1.83}
{'loss': 0.003, 'learning_rate': 2.1101597233657172e-05, 'epoch': 1.83}
{'loss': 0.0108, 'learning_rate': 2.101926560184423e-05, 'epoch': 1.83}
{'loss': 0.0009, 'learning_rate': 2.0936933970031286e-05, 'epoch': 1.84}
{'loss': 0.0006, 'learning_rate': 2.0854602338218346e-05, 'epoch': 1.84}
{'loss': 0.0035, 'learning_rate': 2.0772270706405402e-05, 'epoch': 1.85}
{'loss': 0.0016, 'learning_rate': 2.068993907459246e-05, 'epoch': 1.85}
{'loss': 0.0074, 'learning_rate': 2.0607607442779515e-05, 'epoch': 1.86}
{'loss': 0.0012, 'learning_rate': 2.0525275810966575e-05, 'epoch': 1.86}
{'loss': 0.0018, 'learning_rate': 2.0442944179153632e-05, 'epoch': 1.87}
{'loss': 0.0006, 'learning_rate': 2.0360612547340692e-05, 'epoch': 1.87}
{'loss': 0.0005, 'learnin

  0%|          | 0/2829 [00:00<?, ?it/s]

{'eval_loss': 0.019537225365638733, 'eval_recall': 0.9386694386694386, 'eval_precision': 0.7043681747269891, 'eval_f1': 0.9268119374703933, 'eval_runtime': 37.6017, 'eval_samples_per_second': 75.236, 'eval_steps_per_second': 75.236, 'epoch': 2.01}
{'loss': 0.0013, 'learning_rate': 1.780833196113947e-05, 'epoch': 2.01}
{'loss': 0.0011, 'learning_rate': 1.7726000329326528e-05, 'epoch': 2.02}
{'loss': 0.0001, 'learning_rate': 1.7643668697513584e-05, 'epoch': 2.02}
{'loss': 0.0003, 'learning_rate': 1.756133706570064e-05, 'epoch': 2.03}
{'loss': 0.0007, 'learning_rate': 1.74790054338877e-05, 'epoch': 2.03}
{'loss': 0.0006, 'learning_rate': 1.7396673802074758e-05, 'epoch': 2.04}
{'loss': 0.0008, 'learning_rate': 1.7314342170261814e-05, 'epoch': 2.04}
{'loss': 0.0005, 'learning_rate': 1.7232010538448874e-05, 'epoch': 2.04}
{'loss': 0.0107, 'learning_rate': 1.714967890663593e-05, 'epoch': 2.05}
{'loss': 0.0024, 'learning_rate': 1.7067347274822988e-05, 'epoch': 2.05}
{'loss': 0.0025, 'learning_

  0%|          | 0/2829 [00:00<?, ?it/s]

{'eval_loss': 0.02663557231426239, 'eval_recall': 0.761954261954262, 'eval_precision': 0.7207472959685349, 'eval_f1': 0.760282443052619, 'eval_runtime': 37.7457, 'eval_samples_per_second': 74.949, 'eval_steps_per_second': 74.949, 'epoch': 2.19}
{'loss': 0.0003, 'learning_rate': 1.451506668862177e-05, 'epoch': 2.2}
{'loss': 0.3553, 'learning_rate': 1.4432735056808827e-05, 'epoch': 2.2}
{'loss': 0.0009, 'learning_rate': 1.4350403424995883e-05, 'epoch': 2.2}
{'loss': 0.0064, 'learning_rate': 1.4268071793182942e-05, 'epoch': 2.21}
{'loss': 0.0116, 'learning_rate': 1.4185740161369998e-05, 'epoch': 2.21}
{'loss': 0.0008, 'learning_rate': 1.4103408529557058e-05, 'epoch': 2.22}
{'loss': 0.0004, 'learning_rate': 1.4021076897744115e-05, 'epoch': 2.22}
{'loss': 0.0011, 'learning_rate': 1.3938745265931172e-05, 'epoch': 2.23}
{'loss': 0.0002, 'learning_rate': 1.3856413634118228e-05, 'epoch': 2.23}
{'loss': 0.0161, 'learning_rate': 1.3774082002305286e-05, 'epoch': 2.24}
{'loss': 0.0005, 'learning_ra

  0%|          | 0/2829 [00:00<?, ?it/s]

{'eval_loss': 0.01577676460146904, 'eval_recall': 0.8367983367983368, 'eval_precision': 0.6874466268146883, 'eval_f1': 0.8298640022203719, 'eval_runtime': 37.6374, 'eval_samples_per_second': 75.165, 'eval_steps_per_second': 75.165, 'epoch': 2.37}
{'loss': 0.0005, 'learning_rate': 1.1221801416104067e-05, 'epoch': 2.38}
{'loss': 0.0003, 'learning_rate': 1.1139469784291124e-05, 'epoch': 2.38}
{'loss': 0.0003, 'learning_rate': 1.1057138152478184e-05, 'epoch': 2.39}
{'loss': 0.0002, 'learning_rate': 1.097480652066524e-05, 'epoch': 2.39}
{'loss': 0.0003, 'learning_rate': 1.0892474888852297e-05, 'epoch': 2.4}
{'loss': 0.0676, 'learning_rate': 1.0810143257039355e-05, 'epoch': 2.4}
{'loss': 0.0002, 'learning_rate': 1.0727811625226412e-05, 'epoch': 2.41}
{'loss': 0.0513, 'learning_rate': 1.0645479993413469e-05, 'epoch': 2.41}
{'loss': 0.0009, 'learning_rate': 1.0563148361600527e-05, 'epoch': 2.41}
{'loss': 0.0025, 'learning_rate': 1.0480816729787585e-05, 'epoch': 2.42}
{'loss': 0.0003, 'learning

  0%|          | 0/2829 [00:00<?, ?it/s]

{'eval_loss': 0.011466246098279953, 'eval_recall': 0.9781704781704782, 'eval_precision': 0.6353814989871708, 'eval_f1': 0.9582860052485213, 'eval_runtime': 37.4538, 'eval_samples_per_second': 75.533, 'eval_steps_per_second': 75.533, 'epoch': 2.56}
{'loss': 0.0013, 'learning_rate': 7.928536143586366e-06, 'epoch': 2.56}
{'loss': 0.0005, 'learning_rate': 7.846204511773424e-06, 'epoch': 2.57}
{'loss': 0.0019, 'learning_rate': 7.763872879960481e-06, 'epoch': 2.57}
{'loss': 0.0006, 'learning_rate': 7.68154124814754e-06, 'epoch': 2.57}
{'loss': 0.0366, 'learning_rate': 7.599209616334596e-06, 'epoch': 2.58}
{'loss': 0.0005, 'learning_rate': 7.5168779845216534e-06, 'epoch': 2.58}
{'loss': 0.0001, 'learning_rate': 7.434546352708712e-06, 'epoch': 2.59}
{'loss': 0.0006, 'learning_rate': 7.352214720895768e-06, 'epoch': 2.59}
{'loss': 0.0061, 'learning_rate': 7.269883089082825e-06, 'epoch': 2.6}
{'loss': 0.02, 'learning_rate': 7.187551457269884e-06, 'epoch': 2.6}
{'loss': 0.0003, 'learning_rate': 7.

  0%|          | 0/2829 [00:00<?, ?it/s]

{'eval_loss': 0.010693891905248165, 'eval_recall': 0.9656964656964657, 'eval_precision': 0.6785975164353543, 'eval_f1': 0.9502340768716315, 'eval_runtime': 37.5196, 'eval_samples_per_second': 75.401, 'eval_steps_per_second': 75.401, 'epoch': 2.74}
{'loss': 0.0013, 'learning_rate': 4.635270871068665e-06, 'epoch': 2.74}
{'loss': 0.0001, 'learning_rate': 4.5529392392557216e-06, 'epoch': 2.75}
{'loss': 0.0035, 'learning_rate': 4.47060760744278e-06, 'epoch': 2.75}
{'loss': 0.0014, 'learning_rate': 4.388275975629837e-06, 'epoch': 2.76}
{'loss': 0.0012, 'learning_rate': 4.305944343816894e-06, 'epoch': 2.76}
{'loss': 0.0001, 'learning_rate': 4.223612712003952e-06, 'epoch': 2.77}
{'loss': 0.0012, 'learning_rate': 4.14128108019101e-06, 'epoch': 2.77}
{'loss': 0.0094, 'learning_rate': 4.058949448378066e-06, 'epoch': 2.77}
{'loss': 0.0004, 'learning_rate': 3.976617816565125e-06, 'epoch': 2.78}
{'loss': 0.0007, 'learning_rate': 3.894286184752182e-06, 'epoch': 2.78}
{'loss': 0.0013, 'learning_rate':

  0%|          | 0/2829 [00:00<?, ?it/s]

{'eval_loss': 0.011813093908131123, 'eval_recall': 0.9604989604989606, 'eval_precision': 0.6978851963746223, 'eval_f1': 0.9467959328446443, 'eval_runtime': 35.783, 'eval_samples_per_second': 79.06, 'eval_steps_per_second': 79.06, 'epoch': 2.92}
{'loss': 0.0002, 'learning_rate': 1.3420055985509634e-06, 'epoch': 2.93}
{'loss': 0.0003, 'learning_rate': 1.2596739667380209e-06, 'epoch': 2.93}
{'loss': 0.0002, 'learning_rate': 1.1773423349250783e-06, 'epoch': 2.93}
{'loss': 0.0004, 'learning_rate': 1.0950107031121356e-06, 'epoch': 2.94}
{'loss': 0.0057, 'learning_rate': 1.0126790712991933e-06, 'epoch': 2.94}
{'loss': 0.0002, 'learning_rate': 9.303474394862506e-07, 'epoch': 2.95}
{'loss': 0.0015, 'learning_rate': 8.480158076733082e-07, 'epoch': 2.95}
{'loss': 0.0004, 'learning_rate': 7.656841758603656e-07, 'epoch': 2.96}
{'loss': 0.0003, 'learning_rate': 6.83352544047423e-07, 'epoch': 2.96}
{'loss': 0.0001, 'learning_rate': 6.010209122344805e-07, 'epoch': 2.97}
{'loss': 0.0009, 'learning_rate

TrainOutput(global_step=6573, training_loss=0.07242961128891215, metrics={'train_runtime': 2096.6867, 'train_samples_per_second': 25.08, 'train_steps_per_second': 3.135, 'train_loss': 0.07242961128891215, 'epoch': 3.0})

In [54]:
create_submission(model, f"submission.csv")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


    row_id  document  token           label
0        0         7      9  B-NAME_STUDENT
1        1         7     10  I-NAME_STUDENT
2        2         7    482  B-NAME_STUDENT
3        3         7    483  I-NAME_STUDENT
4        4         7    741  B-NAME_STUDENT
5        5         7    742  I-NAME_STUDENT
6        6        10      0  B-NAME_STUDENT
7        7        10      1  I-NAME_STUDENT
8        8        10    464  B-NAME_STUDENT
9        9        10    465  I-NAME_STUDENT
10      10        16      4  B-NAME_STUDENT
11      11        16      5  I-NAME_STUDENT
12      12        86      6  B-NAME_STUDENT
13      13        86      7  I-NAME_STUDENT
14      14        93      0  B-NAME_STUDENT
15      15        93      1  I-NAME_STUDENT
16      16       104      7  B-NAME_STUDENT
17      17       104      8  B-NAME_STUDENT
18      18       104      9  I-NAME_STUDENT
19      19       112      5  B-NAME_STUDENT
20      20       112      6  I-NAME_STUDENT


In [55]:
# save trainer
trainer.save_model("final_model")

: 