In [17]:
import torch
from transformers import AutoTokenizer
from transformers import AutoModelForTokenClassification
import json

parameter= {
    "model": "microsoft/deberta-v3-large",
    "model_path": "model_e2_nb_12_unfreeze.pt",
    "max_length": 512,
    "batch_size": 5,
    "lr": 1e-3,
    "filter_no_pii_percent_allow": 0.1,
    "notebook": "11_weighted loss.ipynb"
}

def tokenize(example, tokenizer, max_length):
    text = []
    for t, ws in zip(example["tokens"], example["trailing_whitespace"]):
        text.append(t)
        if ws:
            text.append(" ")
    tokenized = tokenizer("".join(text), return_offsets_mapping=True, truncation=True, max_length=max_length, padding="max_length")
    text = "".join(text)
    length = len(tokenized.input_ids)
    return {
        **tokenized,
        "length": length,
    }

class PiiDatasetInference(torch.utils.data.Dataset):
    def __init__(self, dataset, tokenizer, max_length):
        self.dataset = dataset
        self.tokenizer = tokenizer
        self.max_length = max_length
        
    def __getitem__(self, idx):
        vals=tokenize(self.dataset[idx], self.tokenizer, self.max_length)
        input_ids = torch.tensor(vals["input_ids"])
        attention_mask = torch.tensor(vals["attention_mask"])
        document_id = self.dataset[idx]["document"]
        return input_ids, attention_mask, document_id
    
    def __len__(self):
        return len(self.dataset)
    
class MyModel(torch.nn.Module):
    def __init__(self, model_name, num_labels, dropout_p=0.4):
        super().__init__()
        self.model = AutoModelForTokenClassification.from_pretrained(model_name, num_labels=num_labels)
        self.softmax=torch.nn.Softmax(dim=-1)
        
    def forward(self, input_ids, attention_mask):
        out=self.model(input_ids, attention_mask=attention_mask)['logits']
        out=self.softmax(out)
        return out

    
model=MyModel(parameter["model"], 13)
state_dict = torch.load(parameter['model_path'], map_location=torch.device('cpu'))
model.load_state_dict(state_dict, strict=False)

data = json.load(open("data/train.json"))
from itertools import chain
all_labels = sorted(list(set(chain(*[x["labels"] for x in data]))))
label2id = {l: i for i,l in enumerate(all_labels)}
id2label = {v:k for k,v in label2id.items()}

tokenizer = AutoTokenizer.from_pretrained(parameter["model"])
data = json.load(open("data/test.json"))
my_dataset=PiiDatasetInference(data, tokenizer, parameter["max_length"])
loader=torch.utils.data.DataLoader(my_dataset, batch_size=1, shuffle=True)

device=torch.device("cuda" if torch.cuda.is_available() else "cpu")
model=model.to(device)
model.eval()
output={}
row_id=0
for id, attention_mask, document_id in loader:
    id=id.to(device)
    attention_mask=attention_mask.to(device)
    preds=model(id, attention_mask).argmax(dim=2)
    for pred, id in zip(preds.flatten(), id.flatten()):
        if pred != 12:
            print(f"Document: {document_id.item()} TOKEN:{tokenizer.decode(id)}  --- pred:{id2label[pred.item()]}")
            output[row_id]={"document":document_id.item(), "token":id.item(), "label":id2label[pred.item()]}
            row_id+=1

output = {k: v for k, v in sorted(output.items(), key=lambda item: (item[1]['document'], item[0]))}

import pandas as pd
df=pd.DataFrame(output).T
df.reset_index(drop=True, inplace=True)
df['row_id'] = df.index
df = df[ ['row_id'] + [ col for col in df.columns if col != 'row_id' ] ]
df.to_csv("submission.csv", index=False)

Some weights of DebertaV2ForTokenClassification were not initialized from the model checkpoint at microsoft/deberta-v3-large and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Document: 16 TOKEN:Gilberto  --- pred:B-NAME_STUDENT
Document: 16 TOKEN:Gamb  --- pred:I-NAME_STUDENT
Document: 16 TOKEN:oa  --- pred:I-NAME_STUDENT
Document: 93 TOKEN:Silvia  --- pred:B-NAME_STUDENT
Document: 93 TOKEN:Villa  --- pred:I-NAME_STUDENT
Document: 93 TOKEN:lobo  --- pred:I-NAME_STUDENT
Document: 93 TOKEN:s  --- pred:I-NAME_STUDENT
Document: 86 TOKEN:Ela  --- pred:B-NAME_STUDENT
Document: 86 TOKEN:dio  --- pred:B-NAME_STUDENT
Document: 86 TOKEN:Amaya  --- pred:I-NAME_STUDENT
Document: 56 TOKEN:Nadine  --- pred:B-NAME_STUDENT
Document: 56 TOKEN:Born  --- pred:I-NAME_STUDENT
Document: 123 TOKEN:Stefano  --- pred:B-NAME_STUDENT
Document: 123 TOKEN:Lovato  --- pred:I-NAME_STUDENT
Document: 123 TOKEN:Gera  --- pred:B-NAME_STUDENT
Document: 123 TOKEN:shchenko  --- pred:B-NAME_STUDENT
Document: 123 TOKEN:Igor  --- pred:I-NAME_STUDENT
Document: 123 TOKEN:Alexander  --- pred:B-NAME_STUDENT
Document: 123 TOKEN:Sh  --- pred:I-NAME_STUDENT
Document: 123 TOKEN:m  --- pred:I-NAME_STUDENT


Unnamed: 0,document,token,label,row_id
0,7,1609,B-NAME_STUDENT,0
1,7,30065,B-NAME_STUDENT,1
2,7,12287,B-NAME_STUDENT,2
3,7,662,I-NAME_STUDENT,3
4,7,86260,I-NAME_STUDENT,4
5,7,1609,B-NAME_STUDENT,5
6,7,30065,B-NAME_STUDENT,6
7,7,12287,B-NAME_STUDENT,7
8,7,662,I-NAME_STUDENT,8
9,7,86260,I-NAME_STUDENT,9
