In [9]:
import torch
from transformers import AutoTokenizer
from transformers import AutoModelForTokenClassification
import json


# Inference configuration shared by the rest of this cell.
parameter = dict(
    # Hugging Face checkpoint name used for both the backbone and the tokenizer.
    model="microsoft/deberta-v3-base",
    # Sequence length every example is truncated/padded to.
    max_length=512,
    # NOTE(review): not read by the code visible in this cell (the loader
    # below uses batch_size=1) -- confirm whether it is still needed.
    batch_size=8,
    # Location of the fine-tuned weights loaded with torch.load.
    model_path="../PII Models/model_nb_04.pt",
)

def tokenize(example, tokenizer, max_length):
    """Rebuild the raw text of *example* and run it through *tokenizer*.

    Args:
        example: Mapping with parallel sequences ``"tokens"`` (strings) and
            ``"trailing_whitespace"`` (truthy when a space follows the token).
        tokenizer: Callable invoked as ``tokenizer(text, return_offsets_mapping=True,
            truncation=True, max_length=max_length, padding="max_length")``.
            Its result must be a mapping that also exposes ``input_ids`` as an
            attribute.
        max_length: Forwarded to the tokenizer as the truncation/padding length.

    Returns:
        A dict holding every entry of the tokenizer output plus ``"length"``,
        the number of entries in ``input_ids``.
    """
    # Join once: each token followed by a single space when flagged.
    text = "".join(
        token + (" " if has_space else "")
        for token, has_space in zip(example["tokens"], example["trailing_whitespace"])
    )
    tokenized = tokenizer(
        text,
        return_offsets_mapping=True,
        truncation=True,
        max_length=max_length,
        padding="max_length",
    )
    return {
        **tokenized,
        "length": len(tokenized.input_ids),
    }

class PiiDatasetInference(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes one raw example per lookup."""

    def __init__(self, dataset, tokenizer, max_length):
        """Keep references to the examples, the tokenizer and the length cap."""
        self.dataset, self.tokenizer, self.max_length = dataset, tokenizer, max_length

    def __len__(self):
        """Return the number of examples in the wrapped dataset."""
        return len(self.dataset)

    def __getitem__(self, idx):
        """Tokenize example *idx* and return ``(input_ids, attention_mask)`` tensors."""
        encoded = tokenize(self.dataset[idx], self.tokenizer, self.max_length)
        # Only these two fields are handed on; the rest of the encoding is dropped.
        return tuple(
            torch.tensor(encoded[field]) for field in ("input_ids", "attention_mask")
        )
    
class MyModel(torch.nn.Module):
    """Token-classification backbone whose logits are turned into probabilities."""

    def __init__(self, model_name, num_labels):
        """Load the pretrained token classifier *model_name* with *num_labels* classes."""
        super().__init__()
        self.model = AutoModelForTokenClassification.from_pretrained(
            model_name,
            num_labels=num_labels,
        )
        # Normalizes over the last axis of the logits.
        self.softmax = torch.nn.Softmax(dim=-1)

    def forward(self, input_ids, attention_mask):
        """Run the backbone and return the softmax of its ``'logits'`` output."""
        backbone_output = self.model(input_ids, attention_mask=attention_mask)
        logits = backbone_output['logits']
        return self.softmax(logits)
    
# Build the 13-class token classifier and restore the fine-tuned weights on CPU.
model = MyModel(parameter["model"], 13)
state_dict = torch.load(parameter['model_path'], map_location=torch.device('cpu'))
# strict=False tolerates key mismatches; report them so a partially loaded
# checkpoint does not go unnoticed.
load_result = model.load_state_dict(state_dict, strict=False)
if load_result.missing_keys or load_result.unexpected_keys:
    print(
        f"WARNING: checkpoint mismatch -- missing keys: {load_result.missing_keys}; "
        f"unexpected keys: {load_result.unexpected_keys}"
    )
# Switch to evaluation mode so dropout is disabled during inference.
model.eval()

from itertools import chain

# The label vocabulary is derived from the training set; ids follow sorted order.
with open("data/train.json", encoding="utf-8") as train_file:
    data = json.load(train_file)
all_labels = sorted(set(chain.from_iterable(x["labels"] for x in data)))
label2id = {l: i for i, l in enumerate(all_labels)}
id2label = {v: k for k, v in label2id.items()}

tokenizer = AutoTokenizer.from_pretrained(parameter["model"])
with open("data/test.json", encoding="utf-8") as test_file:
    data = json.load(test_file)
my_dataset = PiiDatasetInference(data, tokenizer, parameter["max_length"])
# No shuffling: inference output should follow the order of the test file.
loader = torch.utils.data.DataLoader(my_dataset, batch_size=1, shuffle=False)

# Gradients are not needed for prediction; skipping them saves memory and time.
with torch.no_grad():
    for input_ids, attention_mask in loader:
        preds = model(input_ids, attention_mask).argmax(dim=2)

        for pred, token_id in zip(preds.flatten(), input_ids.flatten()):
            # NOTE(review): 12 is presumably the id of the "O" (non-PII) label,
            # which would sort last among 13 labels -- confirm against all_labels.
            if pred != 12:
                print(f"TOKEN:{tokenizer.decode(token_id)}  --- pred:{id2label[pred.item()]}")
        print("next")
    
    

Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2ForTokenClassification: ['lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.LayerNorm.weight', 'mask_predictions.dense.weight', 'mask_predictions.dense.bias', 'mask_predictions.classifier.bias', 'lm_predictions.lm_head.dense.weight', 'deberta.embeddings.position_embeddings.weight', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.bias', 'mask_predictions.classifier.weight']
- This IS expected if you are initializing DebertaV2ForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2ForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).

next
next
next
next
next
TOKEN:wanted  --- pred:B-URL_PERSONAL
TOKEN:needed  --- pred:B-URL_PERSONAL
TOKEN:without  --- pred:I-PHONE_NUM
TOKEN:why  --- pred:I-PHONE_NUM
next
next
next
next
next
