In [1]:
from transformers import AutoTokenizer
from pathlib import Path
import numpy as np
import torch
from transformers import AutoModelForTokenClassification
from tokenizers import AddedToken
from tqdm.notebook import tqdm
from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts
import pandas as pd

# Switch between the Kaggle runtime layout (True) and a local checkout (False).
kaggle = False

# Directory that holds the competition JSON files, and the two files inside it.
if kaggle:
    path = "/kaggle/input/pii-detection-removal-from-educational-data"
else:
    path = "data"
train_path = f"{path}/train.json"
test_path = f"{path}/test.json"

# Model identifier handed to `from_pretrained`: a Kaggle input directory or a model name.
if kaggle:
    model_path = "/kaggle/input/huggingfacedebertav3variants/deberta-v3-large"
else:
    model_path = "microsoft/deberta-v3-large"


if not kaggle: import neptune


In [2]:
# Loss weight used for the first 12 class ids; the 13th (last) class id gets weight 1.
cross_entropy_weight_multi = 200

CROSS_ENTROPY_WEIGHTS = [cross_entropy_weight_multi for _ in range(12)] + [1]

# Run configuration shared by the cells below (also sent to the experiment tracker).
parameter = dict(
    model=model_path,
    max_length=512,
    batch_size=5,
    lr=1e-3,
    filter_no_pii_percent_allow=0.1,
    notebook="12_unfreeze.ipynb",
    CROSS_ENTROPY_WEIGHT_MULTI=cross_entropy_weight_multi,
)

print(parameter)

{'model': 'microsoft/deberta-v3-large', 'max_length': 512, 'batch_size': 5, 'lr': 0.001, 'filter_no_pii_percent_allow': 0.1, 'notebook': '12_unfreeze.ipynb', 'CROSS_ENTROPY_WEIGHT_MULTI': 200}


In [3]:
# Labels that `tokenize` counts as PII targets when it computes "target_num";
# any label not listed here counts as a non-target.
target = [
    # "B-" labels
    'B-EMAIL',
    'B-ID_NUM',
    'B-NAME_STUDENT',
    'B-PHONE_NUM',
    'B-STREET_ADDRESS',
    'B-URL_PERSONAL',
    'B-USERNAME',
    # "I-" labels
    'I-ID_NUM',
    'I-NAME_STUDENT',
    'I-PHONE_NUM',
    'I-STREET_ADDRESS',
    'I-URL_PERSONAL',
]

In [4]:
from itertools import chain
import json

# Load the training documents; the context manager closes the file handle again.
with open(train_path) as train_file:
    data = json.load(train_file)

# Label <-> id mapping: ids follow the sorted order of every label seen in the data.
all_labels = sorted(set(chain.from_iterable(doc["labels"] for doc in data)))
label2id = {label: idx for idx, label in enumerate(all_labels)}
id2label = {idx: label for label, idx in label2id.items()}

In [5]:
import random

def tokenize(example, tokenizer, label2id, max_length):
    """Tokenize one document and project its word-level labels onto model tokens.

    The text is rebuilt from ``example["tokens"]`` and
    ``example["trailing_whitespace"]``. Every character inherits the label of
    the source token it came from (the single space inserted after a token is
    labelled ``"O"``). Each tokenizer token then receives the label of the
    character at its start offset, after skipping one leading whitespace
    character.

    Args:
        example: mapping with the parallel sequences ``"tokens"``, ``"labels"``
            and ``"trailing_whitespace"``.
        tokenizer: callable invoked with ``return_offsets_mapping=True``,
            ``truncation=True`` and ``padding="max_length"``; its result must
            expose ``offset_mapping`` and ``input_ids`` and support ``**``
            unpacking.
        label2id: mapping from label string to class id; must contain ``"O"``.
        max_length: maximum sequence length passed to the tokenizer.

    Returns:
        dict with all tokenizer fields plus ``"labels"`` (one class id per
        token), ``"length"`` (number of input ids), ``"target_num"`` (number of
        source tokens whose label is in the module-level ``target`` list) and
        ``"group"`` (1 if ``target_num > 0`` else 0).
    """
    pieces = []
    char_labels = []  # one label per character of the rebuilt text
    target_num = 0

    for token, label, has_space in zip(example["tokens"], example["labels"], example["trailing_whitespace"]):
        pieces.append(token)
        char_labels.extend([label] * len(token))
        if label in target:
            target_num += 1
        if has_space:
            pieces.append(" ")
            char_labels.append("O")

    text = "".join(pieces)
    tokenized = tokenizer(text, return_offsets_mapping=True, truncation=True, max_length=max_length, padding="max_length")

    outside_id = label2id["O"]
    token_labels = []

    for start_idx, end_idx in tokenized.offset_mapping:
        # Tokens with a (0, 0) offset (e.g. CLS) carry no source text.
        if start_idx == 0 and end_idx == 0:
            token_labels.append(outside_id)
            continue

        # A token may start on a whitespace character; label it by the next one.
        if text[start_idx].isspace():
            start_idx += 1

        # Explicit fallbacks instead of a bare `except:` — an offset past the
        # end of the text, or a label missing from label2id, both map to "O".
        if start_idx < len(char_labels):
            token_labels.append(label2id.get(char_labels[start_idx], outside_id))
        else:
            token_labels.append(outside_id)

    return {
        **tokenized,
        "labels": token_labels,
        "length": len(tokenized.input_ids),
        "target_num": target_num,
        "group": 1 if target_num > 0 else 0
    }

# https://www.kaggle.com/competitions/pii-detection-removal-from-educational-data/discussion/468844
def filter_no_pii(example, percent_allow=parameter["filter_no_pii_percent_allow"]):
    """Decide whether a document is kept for training.

    Args:
        example: mapping whose ``"labels"`` entry is a sequence of label strings.
        percent_allow: probability (0..1) of keeping a document without PII;
            defaults to the configured ``filter_no_pii_percent_allow``.

    Returns:
        True when any label differs from ``"O"``; otherwise True with
        probability ``percent_allow``. A document with no labels at all is
        treated as containing no PII.
    """
    has_pii = any(label != "O" for label in example["labels"])
    return has_pii or (random.random() < percent_allow)

In [6]:
class PiiDataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes a document each time it is accessed."""

    def __init__(self, dataset, tokenizer, label2id, max_length):
        # Store the raw documents together with everything `tokenize` needs.
        self.dataset, self.tokenizer = dataset, tokenizer
        self.label2id, self.max_length = label2id, max_length

    def __len__(self):
        """Number of documents in the wrapped collection."""
        return len(self.dataset)

    def __getitem__(self, idx):
        """Return ``(input_ids, attention_mask, labels)`` tensors for document ``idx``."""
        encoded = tokenize(self.dataset[idx], self.tokenizer, self.label2id, self.max_length)
        return (
            torch.tensor(encoded["input_ids"]),
            torch.tensor(encoded["attention_mask"]),
            torch.tensor(encoded["labels"], dtype=torch.long),
        )
    
# Reload the training documents (closing the file afterwards) and wrap them,
# together with the pretrained tokenizer, in a dataset for the sanity checks below.
with open(train_path) as train_file:
    data = json.load(train_file)
tokenizer = AutoTokenizer.from_pretrained(parameter["model"])
my_dataset = PiiDataset(data, tokenizer, label2id, parameter["max_length"])

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [7]:
# Sanity check: draw one shuffled batch and print the shape of each tensor.
loader = torch.utils.data.DataLoader(my_dataset, batch_size=8, shuffle=True)

# `input_ids` rather than `id`, so the builtin id() is not shadowed at module level.
for input_ids, attention_mask, labels in loader:
    print(input_ids.shape)
    print(attention_mask.shape)
    print(labels.shape)
    break

torch.Size([8, 512])
torch.Size([8, 512])
torch.Size([8, 512])


In [8]:
# Use the first CUDA device when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
class MyModel(torch.nn.Module):
    """Token-classification wrapper that returns per-token class probabilities.

    Wraps ``AutoModelForTokenClassification`` and applies a softmax over the
    class dimension of its logits. On construction every parameter except the
    ``classifier`` head is frozen; ``unfreeze`` makes all of them trainable.
    """

    def __init__(self, model_name, num_labels, dropout_p=0.4):
        """Load the pretrained model and freeze everything but the classifier.

        Args:
            model_name: identifier or path passed to ``from_pretrained``.
            num_labels: number of output classes.
            dropout_p: kept for backward compatibility; not used by this class.
        """
        super().__init__()
        self.model = AutoModelForTokenClassification.from_pretrained(model_name, num_labels=num_labels)
        self.softmax = torch.nn.Softmax(dim=-1)
        self.freeze()

    def freeze(self):
        """Disable gradients everywhere except the classifier head, then print what still trains."""
        for param in self.model.parameters():
            param.requires_grad = False
        for param in self.model.classifier.parameters():
            param.requires_grad = True
        for name, param in self.model.named_parameters():
            if param.requires_grad:
                print("still learning", name, "parameter_size:", param.size())

    def unfreeze(self):
        """Enable gradients for every parameter of the wrapped model."""
        for param in self.model.parameters():
            param.requires_grad = True

    def forward(self, input_ids, attention_mask, labels=None):
        """Return softmax probabilities over the classes for every token.

        Args:
            input_ids: token ids passed to the wrapped model.
            attention_mask: attention mask passed to the wrapped model.
            labels: accepted for interface compatibility and ignored. Only the
                logits are used, so forwarding labels would merely make the
                wrapped model compute a loss that is thrown away.

        Returns:
            Tensor of the same shape as the logits, normalised over the last dimension.
        """
        logits = self.model(input_ids, attention_mask=attention_mask)['logits']
        return self.softmax(logits)

# Smoke test: build the model, push one batch through it and print the shapes.
model = MyModel(parameter['model'], len(label2id))

model = model.to(device)
# Only shapes are inspected, so skip autograd bookkeeping for this forward pass.
# `input_ids` (not `id`) keeps the builtin id() usable in later cells.
with torch.no_grad():
    for input_ids, attention_mask, labels in loader:
        print(input_ids.shape)
        print(attention_mask.shape)
        print(labels.shape)
        print(labels)
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)
        print(model(input_ids, attention_mask, labels).shape)
        break

#free gpu memory
del model
torch.cuda.empty_cache()

Some weights of DebertaV2ForTokenClassification were not initialized from the model checkpoint at microsoft/deberta-v3-large and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


still learning classifier.weight parameter_size: torch.Size([13, 1024])
still learning classifier.bias parameter_size: torch.Size([13])
torch.Size([8, 512])
torch.Size([8, 512])
torch.Size([8, 512])
tensor([[12, 12, 12,  ..., 12, 12, 12],
        [12, 12, 12,  ..., 12, 12, 12],
        [12, 12, 12,  ..., 12, 12, 12],
        ...,
        [12, 12, 12,  ..., 12, 12, 12],
        [12, 12, 12,  ..., 12, 12, 12],
        [12, 12, 12,  ..., 12, 12, 12]])
torch.Size([8, 512, 13])


In [9]:
class Learner():
    """Runs training and validation for a model that returns per-token class probabilities.

    The model is called as ``model(input_ids, attention_mask, labels)`` and must
    return a ``(batch, seq_len, num_classes)`` tensor with values in ``[0, 1]``
    (``f_beta_score_multiclass`` asserts that range). Unless the module-level
    ``kaggle`` flag is set, parameters and metrics are logged to a Neptune run.
    """

    def __init__(self, model, train_dataloader, valid_dataloader, parameter=None):
        """Move the model to the available device and set up loss and logging.

        Args:
            model: module returning per-token class probabilities.
            train_dataloader: iterable of ``(input_ids, attention_mask, labels)`` batches.
            valid_dataloader: same batch layout, or ``None`` to skip validation.
            parameter: optional mapping of run parameters logged to Neptune.
        """
        import os  # local import: only needed to read the Neptune token

        self.model = model
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model.to(self.device)
        # The model already applies a softmax, so the loss must be the negative
        # log-likelihood of log(probabilities). CrossEntropyLoss expects raw
        # logits and would apply a second softmax. The class weights live on
        # the same device as the model.
        self.loss_fn = torch.nn.NLLLoss(
            ignore_index=-100,
            weight=torch.tensor(CROSS_ENTROPY_WEIGHTS, dtype=torch.float32).to(self.device),
        )
        self.parameter = parameter

        if not kaggle:
            # Credentials come from the environment (NEPTUNE_API_TOKEN); never
            # commit the token to the notebook.
            self.run = neptune.init_run(
                project="bernd.heidemann/PII",
                api_token=os.environ.get("NEPTUNE_API_TOKEN"),
            )
            self.run["parameters"] = dict(self.parameter or {})

        self.train_dataloader = train_dataloader
        self.valid_dataloader = valid_dataloader
        self.non_pii_label = label2id["O"]

    def _compute_loss(self, probs, labels):
        """Weighted negative log-likelihood of ``labels`` under ``probs`` (batch, seq, classes)."""
        # Clamp before the log so a probability of exactly 0 does not produce -inf.
        log_probs = torch.log(probs.clamp_min(1e-12))
        # The loss expects the class dimension second: (batch, classes, seq).
        return self.loss_fn(log_probs.permute(0, 2, 1), labels)

    def fit(self, lr=0.1, epochs=10):
        """Train for ``epochs`` epochs with AdamW and cosine warm restarts.

        A new optimizer and scheduler are created on every call. Per-batch
        training metrics and per-epoch validation metrics are logged to Neptune
        when ``kaggle`` is false.

        Args:
            lr: initial learning rate; the scheduler anneals down to ``lr * 0.01``.
            epochs: number of passes over ``train_dataloader``.
        """
        optimizer = torch.optim.AdamW(self.model.parameters(), lr=lr)
        # Number of epochs before the first restart; the scheduler requires a
        # positive value, so guard against epochs < 3.
        T_0 = max(1, epochs // 3)
        T_mult = 2           # factor by which the restart period grows
        eta_min = lr * 0.01  # minimum learning rate

        scheduler = CosineAnnealingWarmRestarts(optimizer, T_0=T_0, T_mult=T_mult, eta_min=eta_min)
        bar = tqdm(total=len(self.train_dataloader) * epochs, desc="Training")
        bar.set_description("Epoch 0/{}".format(epochs))
        for epoch in range(epochs):
            self.model.train()
            for ids, att_mask, labels in self.train_dataloader:
                ids = ids.to(self.device)
                labels = labels.to(self.device)
                att_mask = att_mask.to(self.device)
                pred = self.model(ids, att_mask, labels)
                # Metrics are for logging only; keep them out of the autograd graph.
                with torch.no_grad():
                    metrics = self.f_beta_score_multiclass(labels, pred)
                loss = self._compute_loss(pred, labels)
                if not kaggle:
                    self.run["train_f_beta_score"].log(metrics["f_beta"])
                    self.run["train_precision"].log(metrics["precision"])
                    self.run["train_recall"].log(metrics["recall"])
                    self.run["train_true_positives"].log(metrics["true_positives"])
                    self.run["train_false_positives"].log(metrics["false_positives"])
                    self.run["train_false_negatives"].log(metrics["false_negatives"])
                    self.run["train_loss"].log(loss.item())
                loss.backward()
                optimizer.step()
                optimizer.zero_grad()
                bar.update(1)

            if not kaggle:
                self.run["learnrate"].log(optimizer.param_groups[0]["lr"])
            scheduler.step()
            self.model.eval()
            # Log the validation state of this epoch.
            if self.valid_dataloader is not None:
                metrics = self.get_accuracy()
                if not kaggle:
                    self.run["valid_accuracy"].log(metrics["accuracy"])
                    self.run["valid_loss"].log(metrics["loss"])
                    self.run["valid_f_beta_score"].log(metrics["f_beta_score"])
                    self.run["valid_precision"].log(metrics["precision"])
                    self.run["valid_recall"].log(metrics["recall"])
                    self.run["valid_true_positives"].log(metrics["true_positives"])
                    self.run["valid_false_positives"].log(metrics["false_positives"])
                    self.run["valid_false_negatives"].log(metrics["false_negatives"])
                    self.run["valid_pii_count"].log(metrics["pii_count"])
                bar.set_description("Epoch {}/{} validAccuracy: {:.2f} validLoss: {:.2f}".format(epoch+1, epochs, metrics["accuracy"], metrics["loss"]))
        bar.close()

    def get_accuracy(self):
        """Evaluate the model on ``valid_dataloader``.

        Returns:
            dict with ``accuracy`` (fraction of tokens whose arg-max class equals
            the label), ``loss`` (mean batch loss), the batch-averaged soft
            metrics ``f_beta_score``, ``precision``, ``recall``,
            ``true_positives``, ``false_positives``, ``false_negatives``, and
            ``pii_count`` (number of tokens predicted as anything but "O").
        """
        self.model.eval()
        correct = 0
        total = 0
        pii_count = 0
        losses = []
        batch_metrics = []
        with torch.no_grad():
            for ids, att_mask, labels in self.valid_dataloader:
                ids = ids.to(self.device)
                labels = labels.to(self.device)
                att_mask = att_mask.to(self.device)
                probs = self.model(ids, att_mask, labels)
                batch_metrics.append(self.f_beta_score_multiclass(labels, probs))
                losses.append(self._compute_loss(probs, labels).item())
                predicted = torch.argmax(probs, dim=-1)
                correct += torch.sum(predicted == labels).item()
                # Accuracy is per token, so the denominator counts tokens, not documents.
                total += labels.numel()
                pii_count += torch.sum(predicted != self.non_pii_label).item()

        def mean_of(key):
            # Mean of one metric over all validation batches.
            return np.mean([entry[key] for entry in batch_metrics])

        return {
            "accuracy": correct / total if total else 0.0,
            "loss": np.mean(losses),
            "f_beta_score": mean_of("f_beta"),
            "precision": mean_of("precision"),
            "recall": mean_of("recall"),
            "true_positives": mean_of("true_positives"),
            "false_positives": mean_of("false_positives"),
            "false_negatives": mean_of("false_negatives"),
            "pii_count": pii_count
        }

    def f_beta_score_multiclass(self, y_true, y_pred, beta=5, epsilon=1e-7):
        """Soft F-beta score computed from class probabilities.

        True positives, false positives and false negatives are sums of
        probability mass (not hard counts), accumulated over every position and
        every class of the batch.

        Args:
            y_true: integer class ids, shape ``(batch, seq_len)``.
            y_pred: probabilities in ``[0, 1]``, shape ``(batch, seq_len, num_classes)``.
            beta: weight of recall relative to precision.
            epsilon: small constant that avoids division by zero.

        Returns:
            dict of Python floats: ``f_beta``, ``precision``, ``recall``,
            ``true_positives``, ``false_positives``, ``false_negatives``.
        """
        # y_pred must hold values between 0 and 1.
        assert y_pred.min() >= 0
        assert y_pred.max() <= 1

        y_true_one_hot = torch.nn.functional.one_hot(y_true, num_classes=y_pred.shape[2])

        # True positives, false positives and false negatives.
        tp = torch.sum(y_true_one_hot * y_pred, dim=0)
        fp = torch.sum((1 - y_true_one_hot) * y_pred, dim=0)
        fn = torch.sum(y_true_one_hot * (1 - y_pred), dim=0)

        # Sum over all positions and classes.
        tp_sum = torch.sum(tp)
        fp_sum = torch.sum(fp)
        fn_sum = torch.sum(fn)

        # Precision and recall.
        precision = tp_sum / (tp_sum + fp_sum + epsilon)
        recall = tp_sum / (tp_sum + fn_sum + epsilon)
        # F-beta score.
        f_beta = (1 + beta**2) * (precision * recall) / ((beta**2 * precision) + recall + epsilon)

        return {
            "f_beta": f_beta.item(),
            "precision": precision.item(),
            "recall": recall.item(),
            "true_positives": tp_sum.item(),
            "false_positives": fp_sum.item(),
            "false_negatives": fn_sum.item()
        }

    

In [10]:
def tokenize_inference(example, tokenizer, max_length):
    """Rebuild a document's text and tokenize it without any label handling.

    Args:
        example: mapping with the parallel sequences ``"tokens"`` and
            ``"trailing_whitespace"``.
        tokenizer: callable invoked on the rebuilt text with offsets,
            truncation and ``padding="max_length"``.
        max_length: maximum sequence length passed to the tokenizer.

    Returns:
        dict with all tokenizer fields plus ``"length"`` (number of input ids).
    """
    # Each token is followed by one space when its trailing-whitespace flag is set.
    document = "".join(
        token + (" " if has_space else "")
        for token, has_space in zip(example["tokens"], example["trailing_whitespace"])
    )
    encoded = tokenizer(document, return_offsets_mapping=True, truncation=True, max_length=max_length, padding="max_length")
    return {**encoded, "length": len(encoded.input_ids)}
        
class TestTokenizer():
    """Tokenizes unlabeled documents and records which source token each character belongs to."""

    def __init__(self, tokenizer):
        self.tokenizer = tokenizer

    def preprocess(self, example):
        """Build the text pieces and the character-to-token index map.

        Returns:
            ``(tokens, token_map, tokens_without_ws)`` where ``tokens`` holds the
            source tokens with a ``" "`` entry after each token that has
            trailing whitespace, ``token_map`` has one entry per character
            (the source-token index, or -1 for an inserted space), and
            ``tokens_without_ws`` is the plain list of source tokens.
        """
        tokens_without_ws = []
        tokens = []
        token_map = []  # per character: index of the source token, -1 for inserted spaces
        pairs = zip(example["tokens"], example["trailing_whitespace"])
        for index, (token, t_ws) in enumerate(pairs):
            tokens_without_ws.append(token)
            tokens.append(token)
            token_map += [index] * len(token)
            if t_ws:
                tokens.append(" ")
                token_map.append(-1)
        return tokens, token_map, tokens_without_ws

    def tokenize(self, example):
        """Tokenize the rebuilt text and attach the bookkeeping from ``preprocess``."""
        tokens, token_map, tokens_without_ws = self.preprocess(example)
        encoded = self.tokenizer(
            "".join(tokens),
            return_offsets_mapping=True,
            padding="max_length",
            truncation=True,
            max_length=parameter["max_length"],
        )
        extras = {"token_map": token_map, "tokens": tokens, "tokens_without_ws": tokens_without_ws}
        return {**encoded, **extras}

class PiiDatasetInference(torch.utils.data.Dataset):
    """Map-style dataset over unlabeled documents, tokenized on access via ``TestTokenizer``."""

    def __init__(self, dataset, tokenizer):
        self.dataset = dataset
        self.tokenizer = TestTokenizer(tokenizer)

    def __len__(self):
        """Number of documents in the wrapped collection."""
        return len(self.dataset)

    def __getitem__(self, idx):
        """Return ``(input_ids, attention_mask, document_id, vals)`` for document ``idx``.

        ``vals`` is the full dict produced by ``TestTokenizer.tokenize``.
        """
        record = self.dataset[idx]
        vals = self.tokenizer.tokenize(record)
        return (
            torch.tensor(vals["input_ids"]),
            torch.tensor(vals["attention_mask"]),
            record["document"],
            vals,
        )

def inference(model):
    """Print every test token for which ``model`` predicts a label other than "O".

    The label mapping is rebuilt from the training file, the test documents are
    tokenized with a freshly loaded tokenizer, and predictions are the arg-max
    over the model's class dimension.

    Args:
        model: callable ``model(input_ids, attention_mask)`` returning a
            ``(batch, seq_len, num_classes)`` tensor; put into eval mode here.
    """
    from itertools import chain

    with open(train_path) as train_file:
        train_docs = json.load(train_file)
    all_labels = sorted(set(chain.from_iterable(doc["labels"] for doc in train_docs)))
    label2id = {label: idx for idx, label in enumerate(all_labels)}
    id2label = {idx: label for label, idx in label2id.items()}
    # Look the "O" id up instead of hard-coding 12.
    outside_id = label2id["O"]

    # NOTE(review): the training cell adds a "\n" token to its tokenizer; this
    # freshly loaded tokenizer does not — confirm whether inference should mirror that.
    tokenizer = AutoTokenizer.from_pretrained(parameter["model"])
    with open(test_path) as test_file:
        test_docs = json.load(test_file)
    my_dataset = PiiDatasetInference(test_docs, tokenizer)

    def collate(batch):
        # `vals` holds per-document lists of different lengths, which the
        # default collate function cannot batch; keep them as plain lists.
        ids, masks, doc_ids, vals = zip(*batch)
        return torch.stack(ids), torch.stack(masks), list(doc_ids), list(vals)

    loader = torch.utils.data.DataLoader(my_dataset, batch_size=parameter['batch_size'], collate_fn=collate)

    model.eval()
    with torch.no_grad():
        for input_ids, attention_mask, document_id, vals in loader:
            input_ids = input_ids.to(device)
            print(input_ids.shape)
            attention_mask = attention_mask.to(device)
            preds = model(input_ids, attention_mask).argmax(dim=2)

            for pred, token in zip(preds.flatten(), input_ids.flatten()):
                if pred != outside_id:
                    print(f"TOKEN:{tokenizer.decode(token)}  --- pred:{id2label[pred.item()]}")
            print("next")

# Convert preds to a list of dictionaries
def to_test_submission(preds=None, dataset=None, document_ids=None, id2label=None):
    """Map per-token class predictions back to source tokens of each document.

    For every model token the start offset is advanced past inserted spaces
    (``token_map == -1``) and whitespace-only source tokens; the prediction is
    then attributed to the source token found there. Predictions of ``"O"`` are
    dropped, and each (document, label, token index, token text) combination is
    reported once.

    Args:
        preds: indexable of per-document prediction sequences whose elements
            expose ``.item()`` (class ids); ``None`` yields an empty result.
        dataset: indexable returning ``(input_ids, attention_mask, document_id,
            vals)`` where ``vals`` has ``"token_map"``, ``"offset_mapping"``
            and ``"tokens_without_ws"``.
        document_ids: unused; kept for backward compatibility.
        id2label: mapping from class id to label string.

    Returns:
        list of dicts with keys ``row_id``, ``document``, ``token``, ``label``
        and ``token_str``; ``row_id`` counts up from 0 across all documents.
    """
    results = []
    if preds is None or dataset is None:
        return results

    # The document id is part of the key, so the same hit in two different
    # documents is kept for both. A set makes the membership test O(1).
    seen = set()
    row_id = 0

    for i in range(len(preds)):
        input_ids, attention_mask, document_id, vals = dataset[i]
        token_map = vals["token_map"]
        offsets = vals["offset_mapping"]
        tokens = vals["tokens_without_ws"]
        pred = preds[i]

        for token_pred, _input_id, (start_idx, end_idx) in zip(pred, input_ids, offsets):
            if start_idx == 0 and end_idx == 0:  # skip tokens with a (0, 0) offset
                continue
            # Advance to the first character that belongs to a non-whitespace source token.
            while start_idx < len(token_map):
                if token_map[start_idx] == -1:  # inserted space
                    start_idx += 1
                elif tokens[token_map[start_idx]].isspace():  # whitespace-only source token
                    start_idx += 1
                else:
                    break
            # Ran off the end of the text: nothing to attribute the prediction to.
            if start_idx >= len(token_map):
                continue

            # The loop above guarantees this entry is a real token index (not -1).
            token_id = token_map[start_idx]
            label_pred = id2label[token_pred.item()]
            if label_pred == "O":
                continue

            token_str = tokens[token_id]
            key = (document_id, label_pred, token_id, token_str)
            if key in seen:
                continue
            seen.add(key)
            results.append({
                "row_id": row_id,
                "document": document_id,
                "token": token_id,
                "label": label_pred,
                "token_str": token_str
            })
            row_id += 1

    return results

def create_submission(model, filename="submission.csv"):
    """Predict labels for the test documents and write them to a CSV file.

    The label mapping is rebuilt from the training file, every test document is
    run through ``model`` one at a time, the arg-max class per token is turned
    into rows by ``to_test_submission``, and the columns ``row_id``,
    ``document``, ``token`` and ``label`` are printed and saved.

    Args:
        model: callable ``model(input_ids, attention_mask)`` returning a
            ``(batch, seq_len, num_classes)`` tensor; put into eval mode here.
        filename: path of the CSV file to write.
    """
    from itertools import chain

    with open(train_path) as train_file:
        train_docs = json.load(train_file)
    all_labels = sorted(set(chain.from_iterable(doc["labels"] for doc in train_docs)))
    label2id = {label: idx for idx, label in enumerate(all_labels)}
    id2label = {idx: label for label, idx in label2id.items()}

    with open(test_path) as test_file:
        test_docs = json.load(test_file)
    # NOTE(review): the training cell adds a "\n" token to its tokenizer; this
    # freshly loaded tokenizer does not — confirm whether inference should mirror that.
    tokenizer = AutoTokenizer.from_pretrained(parameter["model"])
    my_dataset = PiiDatasetInference(test_docs, tokenizer)
    loader = torch.utils.data.DataLoader(my_dataset, batch_size=1, shuffle=False)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.eval()

    # One prediction tensor per document; no gradients are needed for inference.
    all_preds = []
    document_ids = None
    with torch.no_grad():
        for input_ids, attention_mask, document_ids, vals in loader:
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            all_preds.append(model(input_ids, attention_mask).argmax(dim=2))

    # torch.cat rejects an empty list; an empty test set simply yields no rows.
    if all_preds:
        all_preds = torch.cat(all_preds, dim=0)

    results = to_test_submission(preds=all_preds, dataset=my_dataset, document_ids=document_ids, id2label=id2label)

    # Passing `columns` keeps the header even when no PII was predicted, where
    # selecting the columns from an empty frame would raise a KeyError.
    df = pd.DataFrame(results, columns=["row_id", "document", "token", "label"])
    print(df)
    df.to_csv(filename, index=False)

# Baseline: write a submission from a freshly constructed MyModel, before any
# Learner training has run in this notebook.
create_submission(
    model=MyModel(model_name=parameter['model'], num_labels=len(label2id)).to(device),
    filename="submission_just_dumb.csv",
)
# Submission from the trained model (disabled):
#create_submission(learner.model, "submission.csv")


Some weights of DebertaV2ForTokenClassification were not initialized from the model checkpoint at microsoft/deberta-v3-large and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


still learning classifier.weight parameter_size: torch.Size([13, 1024])
still learning classifier.bias parameter_size: torch.Size([13])


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


TOKEN:[CLS]  --- pred:I-STREET_ADDRESS
TOKEN:Design  --- pred:B-NAME_STUDENT
TOKEN:N  --- pred:B-URL_PERSONAL
TOKEN:Challenge  --- pred:B-PHONE_NUM
TOKEN:The  --- pred:B-PHONE_NUM
TOKEN:I  --- pred:B-STREET_ADDRESS
TOKEN:use  --- pred:B-PHONE_NUM
TOKEN:to  --- pred:B-USERNAME
TOKEN:help  --- pred:B-PHONE_NUM
TOKEN:all  --- pred:B-STREET_ADDRESS
TOKEN:stakeholders  --- pred:I-STREET_ADDRESS
TOKEN:finding  --- pred:I-STREET_ADDRESS
TOKEN:their  --- pred:B-PHONE_NUM
TOKEN:through  --- pred:I-STREET_ADDRESS
TOKEN:complexity  --- pred:I-STREET_ADDRESS
TOKEN:a  --- pred:I-STREET_ADDRESS
TOKEN:.  --- pred:B-USERNAME
TOKEN:exactly  --- pred:B-USERNAME
TOKEN:is  --- pred:B-PHONE_NUM
TOKEN:a  --- pred:B-PHONE_NUM
TOKEN:?  --- pred:B-USERNAME
TOKEN:According  --- pred:B-USERNAME
TOKEN:to  --- pred:B-USERNAME
TOKEN:(  --- pred:B-PHONE_NUM
TOKEN:Des  --- pred:B-URL_PERSONAL
TOKEN:ine  --- pred:B-PHONE_NUM
TOKEN:-  --- pred:B-USERNAME
TOKEN:l  --- pred:B-NAME_STUDENT
TOKEN:intelligence  --- pred:B-P

In [11]:
# Seed both random generators used below, so that the PII filter
# (random.random) and the split (np.random.choice) are reproducible.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

with open(train_path) as train_file:
    data = json.load(train_file)

# Keep every document with PII and a random share of those without.
data_filterd = list(filter(filter_no_pii, data))
data_len = len(data_filterd)
train_len = int(data_len * 0.8)
valid_len = data_len - train_len

# 80/20 split: sample the training indices without replacement; the validation
# set is every remaining index, in ascending order.
train_data_idx = np.random.choice(data_len, train_len, replace=False)
valid_data_idx = np.array(sorted(set(range(data_len)) - set(train_data_idx.tolist())), dtype=int)
train_data = [data_filterd[i] for i in train_data_idx]
valid_data = [data_filterd[i] for i in valid_data_idx]

In [12]:

# Set TOKENIZERS_PARALLELISM=false in the environment before the tokenizer is created.
import os
os.environ.update(TOKENIZERS_PARALLELISM="false")

# Tokenizer used for training; the newline character is registered as an extra token.
tokenizer = AutoTokenizer.from_pretrained(parameter["model"])
tokenizer.add_tokens(AddedToken("\n", normalized=False))

# Datasets and loaders for both splits; only the training loader shuffles.
train_dataset = PiiDataset(
    dataset=train_data, tokenizer=tokenizer, label2id=label2id, max_length=parameter["max_length"]
)
valid_dataset = PiiDataset(
    dataset=valid_data, tokenizer=tokenizer, label2id=label2id, max_length=parameter["max_length"]
)
train_dataloader = torch.utils.data.DataLoader(
    dataset=train_dataset, batch_size=parameter['batch_size'], shuffle=True
)
valid_dataloader = torch.utils.data.DataLoader(
    dataset=valid_dataset, batch_size=parameter['batch_size'], shuffle=False
)

# First training stage: MyModel starts with only its classifier head trainable.
my_model = MyModel(model_name=parameter['model'], num_labels=len(label2id))

learner = Learner(
    model=my_model,
    train_dataloader=train_dataloader,
    valid_dataloader=valid_dataloader,
    parameter=parameter,
)
learner.fit(epochs=6, lr=parameter['lr'])

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Some weights of DebertaV2ForTokenClassification were not initialized from the model checkpoint at microsoft/deberta-v3-large and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


still learning classifier.weight parameter_size: torch.Size([13, 1024])
still learning classifier.bias parameter_size: torch.Size([13])




https://app.neptune.ai/bernd.heidemann/PII/e/PII-142


Training:   0%|          | 0/1440 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
# Second stage: make every parameter of the wrapped model trainable again
# (MyModel.unfreeze sets requires_grad=True on all of them).
learner.model.unfreeze()

In [None]:
# Two further training rounds of 12 epochs each, at 1% of the configured learning rate.
for i in range(2):
    learner.fit(lr=parameter['lr']*0.01, epochs=12)
    # After each round: write a submission file and save the model's state_dict.
    create_submission(learner.model, f"submission_{i}.csv")
    torch.save(learner.model.state_dict(), f"model_16_{i}.pth")


Training:   0%|          | 0/2892 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'offset_mapping', 'token_map', 'tokens'])
[0, 0, 0, 0, 0, 0, -1, 1, 1, 1, 1, 1, 1, 1, 1, -1, 2, 2, 2, -1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, -1, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 6, 6, 6, 6, 6, -1, 7, 7, 7, 7, 8, 9, 9, 9, 9, 9, 9, 9, 9, -1, 10, 10, 10, 10, 10, 11, 11, 12, 12, 12, 12, 12, 12, 12, 12, 12, -1, 13, -1, 14, 14, 14, 14, 14, 14, 14, 14, 14, 15, 15, 16, 16, 16, -1, 17, 17, 17, 17, -1, 18, -1, 19, 19, 19, -1, 20, 20, -1, 21, 21, 21, 21, -1, 22, 22, 22, -1, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, -1, 24, 24, 24, 24, 24, 24, 24, -1, 25, 25, 25, 25, 25, -1, 26, 26, 26, -1, 27, 27, 27, 27, 27, 27, 27, -1, 28, 28, 28, -1, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, -1, 30, 30, -1, 31, -1, 32, 32, 32, 32, 32, 32, 32, -1, 33, 33, -1, 34, 34, 34, -1, 35, 36, 36, 36, 36, -1, 37, 37, 37, 38, 39, 39, 40, 40, 40, 40, -1, 41, 41, 41, 41, 41, 41, 41, -1, 42, 42, -1, 43, -1, 44, 44, 44, 44, -1, 45, 45, 45, 46, -1, 47, 47, 47, 47, 47, 47, 

Training:   0%|          | 0/2892 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'offset_mapping', 'token_map', 'tokens'])
[0, 0, 0, 0, 0, 0, -1, 1, 1, 1, 1, 1, 1, 1, 1, -1, 2, 2, 2, -1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, -1, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 6, 6, 6, 6, 6, -1, 7, 7, 7, 7, 8, 9, 9, 9, 9, 9, 9, 9, 9, -1, 10, 10, 10, 10, 10, 11, 11, 12, 12, 12, 12, 12, 12, 12, 12, 12, -1, 13, -1, 14, 14, 14, 14, 14, 14, 14, 14, 14, 15, 15, 16, 16, 16, -1, 17, 17, 17, 17, -1, 18, -1, 19, 19, 19, -1, 20, 20, -1, 21, 21, 21, 21, -1, 22, 22, 22, -1, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, -1, 24, 24, 24, 24, 24, 24, 24, -1, 25, 25, 25, 25, 25, -1, 26, 26, 26, -1, 27, 27, 27, 27, 27, 27, 27, -1, 28, 28, 28, -1, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, -1, 30, 30, -1, 31, -1, 32, 32, 32, 32, 32, 32, 32, -1, 33, 33, -1, 34, 34, 34, -1, 35, 36, 36, 36, 36, -1, 37, 37, 37, 38, 39, 39, 40, 40, 40, 40, -1, 41, 41, 41, 41, 41, 41, 41, -1, 42, 42, -1, 43, -1, 44, 44, 44, 44, -1, 45, 45, 45, 46, -1, 47, 47, 47, 47, 47, 47, 

Training:   0%|          | 0/2892 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'offset_mapping', 'token_map', 'tokens'])
[0, 0, 0, 0, 0, 0, -1, 1, 1, 1, 1, 1, 1, 1, 1, -1, 2, 2, 2, -1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, -1, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 6, 6, 6, 6, 6, -1, 7, 7, 7, 7, 8, 9, 9, 9, 9, 9, 9, 9, 9, -1, 10, 10, 10, 10, 10, 11, 11, 12, 12, 12, 12, 12, 12, 12, 12, 12, -1, 13, -1, 14, 14, 14, 14, 14, 14, 14, 14, 14, 15, 15, 16, 16, 16, -1, 17, 17, 17, 17, -1, 18, -1, 19, 19, 19, -1, 20, 20, -1, 21, 21, 21, 21, -1, 22, 22, 22, -1, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, -1, 24, 24, 24, 24, 24, 24, 24, -1, 25, 25, 25, 25, 25, -1, 26, 26, 26, -1, 27, 27, 27, 27, 27, 27, 27, -1, 28, 28, 28, -1, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, -1, 30, 30, -1, 31, -1, 32, 32, 32, 32, 32, 32, 32, -1, 33, 33, -1, 34, 34, 34, -1, 35, 36, 36, 36, 36, -1, 37, 37, 37, 38, 39, 39, 40, 40, 40, 40, -1, 41, 41, 41, 41, 41, 41, 41, -1, 42, 42, -1, 43, -1, 44, 44, 44, 44, -1, 45, 45, 45, 46, -1, 47, 47, 47, 47, 47, 47, 