In [73]:
# Checkpoint identifier passed to `from_pretrained` for both the tokenizer and the model.
TRAINING_MODEL_PATH = "microsoft/deberta-v3-base"


In [74]:

import json
import numpy as np

# Load the training documents. The context manager closes the file handle
# deterministically, and the explicit encoding avoids depending on the
# platform default when the essays contain non-ASCII characters.
with open("data/train.json", encoding="utf-8") as train_file:
    data = json.load(train_file)

# Corpus size and the fields available on each document.
print(len(data))
print(data[0].keys())

# Keep the first raw document around as a small inspection sample.
x = data[0]

print(x["tokens"][:10])
print(x["labels"][:10])
print(x["trailing_whitespace"][:10])

6807
dict_keys(['document', 'full_text', 'tokens', 'trailing_whitespace', 'labels'])
['Design', 'Thinking', 'for', 'innovation', 'reflexion', '-', 'Avril', '2021', '-', 'Nathalie']
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-NAME_STUDENT']
[True, True, True, True, False, False, True, False, False, True]


In [75]:

from itertools import chain

# Every distinct tag that occurs anywhere in the corpus, sorted so the
# tag -> integer id assignment is deterministic.
all_labels = sorted(set(chain.from_iterable(doc["labels"] for doc in data)))

# Forward (tag -> id) and reverse (id -> tag) lookup tables.
label2id = dict(zip(all_labels, range(len(all_labels))))
id2label = dict(enumerate(all_labels))

id2label

{0: 'B-EMAIL',
 1: 'B-ID_NUM',
 2: 'B-NAME_STUDENT',
 3: 'B-PHONE_NUM',
 4: 'B-STREET_ADDRESS',
 5: 'B-URL_PERSONAL',
 6: 'B-USERNAME',
 7: 'I-ID_NUM',
 8: 'I-NAME_STUDENT',
 9: 'I-PHONE_NUM',
 10: 'I-STREET_ADDRESS',
 11: 'I-URL_PERSONAL',
 12: 'O'}

In [76]:
from datasets import Dataset

# Dataset column name -> key in the raw JSON records. Only the tag list is
# renamed: it is stored as "provided_labels" in the Dataset.
_DATASET_COLUMNS = {
    "full_text": "full_text",
    "document": "document",
    "tokens": "tokens",
    "trailing_whitespace": "trailing_whitespace",
    "provided_labels": "labels",
}

ds = Dataset.from_dict(
    {column: [record[key] for record in data] for column, key in _DATASET_COLUMNS.items()}
)

In [77]:
# Passed as `max_length` to the tokenizer in `tokenize`, which truncates and pads to this length.
TRAINING_MAX_LENGTH = 512

In [78]:
import random


def tokenize(example, tokenizer, label2id, max_length):
    """Tokenize one document and project its word-level tags onto tokenizer tokens.

    The document text is rebuilt by concatenating ``example["tokens"]`` and
    inserting a single space after every token whose ``trailing_whitespace``
    flag is true. Each character inherits the tag of the word it belongs to;
    the inserted spaces are tagged ``"O"``.

    Args:
        example: mapping with ``"tokens"``, ``"provided_labels"`` and
            ``"trailing_whitespace"`` sequences (iterated in parallel).
        tokenizer: callable invoked with ``return_offsets_mapping=True``,
            ``truncation=True`` and ``padding="max_length"``; its result must
            expose ``offset_mapping`` and ``input_ids`` and be unpackable with ``**``.
        label2id: mapping from tag string to integer id; must contain ``"O"``
            whenever a (0, 0) offset is encountered.
        max_length: forwarded to the tokenizer as ``max_length``.

    Returns:
        A dict holding every key of the tokenizer output plus ``"labels"``
        (one integer id per offset entry) and ``"length"`` (``len(input_ids)``).

    Note:
        Every offset equal to (0, 0) is labelled ``"O"``. The original comment
        names the CLS token; presumably other special and padding tokens report
        the same offset and are therefore labelled ``"O"`` too, not -100.
    """
    pieces = []
    char_tags = []
    words = zip(example["tokens"], example["provided_labels"], example["trailing_whitespace"])
    for word, tag, has_space in words:
        pieces.append(word)
        char_tags += [tag] * len(word)
        if has_space:
            pieces.append(" ")
            char_tags.append("O")

    full_text = "".join(pieces)
    tokenized = tokenizer(
        full_text,
        return_offsets_mapping=True,
        max_length=max_length,
        truncation=True,
        padding="max_length",
    )
    char_tags = np.array(char_tags)

    def tag_id_for(start, end):
        # Offsets of (0, 0) carry no text span; label them as outside.
        if start == 0 and end == 0:
            return label2id["O"]
        # A token may begin with the whitespace preceding the word; skip it so
        # the token takes the tag of the word's first character.
        if full_text[start].isspace():
            start += 1
        # Clamp to the last character when the skip ran past the end of the text.
        return label2id[char_tags[min(start, len(char_tags) - 1)]]

    token_labels = [tag_id_for(start, end) for start, end in tokenized.offset_mapping]

    return {
        **tokenized,
        "labels": token_labels,
        "length": len(tokenized.input_ids),
    }

# https://www.kaggle.com/competitions/pii-detection-removal-from-educational-data/discussion/468844
def filter_no_pii(example, percent_allow=0.2):
    """Decide whether a document is kept when down-sampling PII-free documents.

    Args:
        example: mapping whose ``"provided_labels"`` entry is the sequence of
            word-level tags for the document.
        percent_allow: probability of keeping a document that contains no PII.

    Returns:
        True if any tag differs from ``"O"``; otherwise True with probability
        ``percent_allow`` (drawn from the module-level ``random`` generator).
    """
    # Explicit per-tag comparison: a document with no tags at all is treated
    # as PII-free (and sampled) instead of being kept unconditionally, and the
    # check no longer relies on set("O") happening to split into {"O"}.
    has_pii = any(label != "O" for label in example["provided_labels"])
    return has_pii or (random.random() < percent_allow)
    
def compute_metrics(p, metric, all_labels):
    """Turn raw model scores into a flat dict of metric values.

    Args:
        p: pair ``(predictions, labels)``. ``predictions`` is arg-maxed over
            axis 2 (assumed to be the class axis of a batch x sequence x class
            array - TODO confirm against the caller); ``labels`` holds integer
            tag ids with -100 marking positions to ignore.
        metric: object whose ``compute(predictions=..., references=...)``
            returns a dict, possibly containing nested dicts.
        all_labels: sequence mapping integer id -> tag string.

    Returns:
        The metric result with each nested dict flattened into
        ``"<outer>_<inner>"`` keys.
    """
    scores, gold_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    true_predictions = []
    true_labels = []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        kept_predictions = []
        kept_gold = []
        for predicted_id, gold_id in zip(predicted_row, gold_row):
            # -100 marks positions that must not be scored.
            if gold_id == -100:
                continue
            kept_predictions.append(all_labels[predicted_id])
            kept_gold.append(all_labels[gold_id])
        true_predictions.append(kept_predictions)
        true_labels.append(kept_gold)

    results = metric.compute(predictions=true_predictions, references=true_labels)

    # Flatten one level of nesting so every value sits under a single string key.
    final_results = {}
    for key, value in results.items():
        if not isinstance(value, dict):
            final_results[key] = value
            continue
        for sub_key, sub_value in value.items():
            final_results[f"{key}_{sub_key}"] = sub_value
    return final_results


In [79]:
import torch
import json

class PiiDataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes a document each time it is accessed.

    Wraps an indexable collection of documents; ``__getitem__`` runs
    ``tokenize`` on the selected document and returns the result as tensors.
    """

    def __init__(self, dataset, tokenizer, label2id, max_length):
        # Everything `tokenize` needs is stored and forwarded unchanged.
        self.dataset, self.tokenizer = dataset, tokenizer
        self.label2id, self.max_length = label2id, max_length

    def __len__(self):
        """Number of documents in the wrapped collection."""
        return len(self.dataset)

    def __getitem__(self, idx):
        """Return ``(input_ids, attention_mask, labels)`` tensors for document ``idx``."""
        encoded = tokenize(self.dataset[idx], self.tokenizer, self.label2id, self.max_length)
        return (
            torch.tensor(encoded["input_ids"]),
            torch.tensor(encoded["attention_mask"]),
            torch.tensor(encoded["labels"], dtype=torch.long),
        )
    

# Read the raw training documents. The context manager closes the file
# handle, and the explicit encoding avoids depending on the platform default.
with open("data/train.json", encoding="utf-8") as train_file:
    data = json.load(train_file)

# Column-wise view of the documents; the tag list is stored as "provided_labels".
ds = Dataset.from_dict({
    "full_text": [x["full_text"] for x in data],
    "document": [x["document"] for x in data],
    "tokens": [x["tokens"] for x in data],
    "trailing_whitespace": [x["trailing_whitespace"] for x in data],
    "provided_labels": [x["labels"] for x in data],
})

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(TRAINING_MODEL_PATH)

# Tokenization happens lazily, one document per __getitem__ call.
my_dataset=PiiDataset(ds, tokenizer, label2id, TRAINING_MAX_LENGTH)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [80]:

# Smoke test: draw a single shuffled batch and print the shape of each tensor.
loader = torch.utils.data.DataLoader(my_dataset, shuffle=True, batch_size=8)

for id, attention_mask, labels in loader:
    for batch_tensor in (id, attention_mask, labels):
        print(batch_tensor.shape)
    break

torch.Size([8, 512])
torch.Size([8, 512])
torch.Size([8, 512])


In [81]:
# Display the checkpoint identifier currently configured.
TRAINING_MODEL_PATH

'microsoft/deberta-v3-base'

In [82]:
from transformers import AutoModelForTokenClassification

# Use the first CUDA device when one is available, otherwise the CPU.
device=torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
class MyModel(torch.nn.Module):
    """Thin wrapper around a pretrained token-classification model that returns only logits.

    Args:
        model_name: checkpoint identifier passed to ``from_pretrained``.
        num_labels: number of output classes of the classification head.
    """

    def __init__(self, model_name, num_labels):
        super().__init__()
        self.model = AutoModelForTokenClassification.from_pretrained(model_name, num_labels=num_labels)

    def forward(self, input_ids, attention_mask, labels=None):
        """Return the per-token class scores.

        ``labels`` is accepted for backward compatibility with existing callers
        but is not forwarded: the wrapped model would only use it to compute a
        loss that this wrapper discards. Making it optional also allows
        inference without ground truth.
        """
        return self.model(input_ids, attention_mask=attention_mask)['logits']



# Smoke test: push one batch through a freshly loaded model and check the output shape.
model = MyModel(TRAINING_MODEL_PATH, len(label2id))
model= model.to(device)
# Only the output shape is inspected, so skip autograd bookkeeping; this keeps
# the activations of the forward pass from being retained on the device.
with torch.no_grad():
    for id, attention_mask, labels in loader:
        print(id.shape)
        print(attention_mask.shape)
        print(labels.shape)
        print(labels)
        id = id.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)
        print(model(id, attention_mask, labels).shape)
        break

#free gpu memory
del model
torch.cuda.empty_cache()



Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2ForTokenClassification: ['lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.classifier.bias', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.bias', 'deberta.embeddings.position_embeddings.weight', 'mask_predictions.dense.bias', 'mask_predictions.dense.weight', 'mask_predictions.LayerNorm.bias', 'mask_predictions.classifier.weight', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.LayerNorm.weight']
- This IS expected if you are initializing DebertaV2ForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2ForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a Be

torch.Size([8, 512])
torch.Size([8, 512])
torch.Size([8, 512])
tensor([[12, 12, 12,  ..., 12, 12, 12],
        [12, 12, 12,  ..., 12, 12, 12],
        [12, 12, 12,  ..., 12, 12, 12],
        ...,
        [12, 12, 12,  ..., 12, 12, 12],
        [12, 12, 12,  ..., 12, 12, 12],
        [12, 12, 12,  ..., 12, 12, 12]])
torch.Size([8, 512, 13])


In [87]:

import neptune
from tqdm.notebook import tqdm

class Learner():
    """Minimal train/validate loop for a token-classification model, logged to Neptune.

    Args:
        model: module called as ``model(ids, att_mask, labels)`` that returns
            class scores laid out as (batch, sequence, classes); they are
            permuted to (batch, classes, sequence) before the loss.
        train_dataloader: iterable of ``(ids, att_mask, labels)`` batches.
        valid_dataloader: same layout, or None to skip validation.
        batch_size: only recorded in the logged run parameters.
    """

    def __init__(self, model, train_dataloader, valid_dataloader, batch_size=32):
        self.model=model
        # Positions labelled -100 do not contribute to the loss.
        self.loss_fn=torch.nn.CrossEntropyLoss(ignore_index=-100)
        self.device=torch.device("cuda" if torch.cuda.is_available() else "cpu")

        self.model.to(self.device)
        # SECURITY: the API token is intentionally not embedded in the notebook.
        # Without an explicit `api_token`, Neptune reads the NEPTUNE_API_TOKEN
        # environment variable. The token that used to be hardcoded here has
        # been exposed and should be revoked.
        self.run = neptune.init_run(
            project="bernd.heidemann/PII",
        )
        self.batch_size=batch_size
        self.train_dataloader = train_dataloader
        self.valid_dataloader = valid_dataloader

    def fit(self, lr=0.001, epochs=10):
        """Train for ``epochs`` epochs with AdamW and per-epoch cosine annealing.

        Logs the loss of every training step and, when a validation loader is
        present, the validation accuracy and loss after every epoch.
        """
        self.run["parameters"] = {
            "lr": lr,
            "epochs": epochs,
            "batch_size": self.batch_size,
            "model": TRAINING_MODEL_PATH,
            "loss": "CrossEntropyLoss",
            "MAX_LENGTH": TRAINING_MAX_LENGTH
        }
        optimizer=torch.optim.AdamW(self.model.parameters(), lr=lr)
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=epochs)
        bar = tqdm(total=len(self.train_dataloader) * epochs, desc="Training")
        bar.set_description("Epoch 0/{}".format(epochs))

        try:
            for epoch in range(epochs):
                self.model.train()
                for ids, att_mask, labels in self.train_dataloader:
                    ids=ids.to(self.device)
                    labels=labels.to(self.device)
                    att_mask=att_mask.to(self.device)
                    pred=self.model(ids, att_mask, labels)
                    # CrossEntropyLoss expects [batch_size, num_classes, sequence_length]
                    pred = pred.permute(0, 2, 1)
                    loss=self.loss_fn(pred, labels)
                    self.run["train_loss"].log(loss.item())
                    loss.backward()
                    optimizer.step()
                    optimizer.zero_grad()
                    bar.update(1)
                # The schedule advances once per epoch (T_max is the epoch count).
                scheduler.step()
                self.model.eval()
                if self.valid_dataloader is not None:
                    metrics=self.get_accuracy()
                    self.run["valid_accuracy"].log(metrics["accuracy"])
                    self.run["valid_loss"].log(metrics["loss"])
                    bar.set_description("Epoch {}/{} validAccuracy: {:.2f} validLoss: {:.2f}".format(epoch+1, epochs, metrics["accuracy"], metrics["loss"]))
        finally:
            # Release the progress bar even when training is interrupted.
            bar.close()

    def get_accuracy(self):
        """Evaluate on the validation loader.

        Returns:
            dict with ``"accuracy"``, the fraction of scored token positions
            predicted correctly (positions equal to the loss's ``ignore_index``
            are not scored), and ``"loss"``, the mean per-batch loss.
        """
        self.model.eval()
        ignore_index = self.loss_fn.ignore_index
        correct = 0
        scored_total = 0
        losses = []
        with torch.no_grad():
            for ids, att_mask, labels in self.valid_dataloader:
                ids=ids.to(self.device)
                labels=labels.to(self.device)
                att_mask=att_mask.to(self.device)
                pred = self.model(ids, att_mask, labels).permute(0, 2, 1)
                losses.append(self.loss_fn(pred, labels).item())
                pred = torch.argmax(pred, dim=1)
                scored = labels != ignore_index
                correct += torch.sum((pred == labels) & scored).item()
                scored_total += torch.sum(scored).item()
        # Normalise by the number of scored tokens. Dividing a token-level
        # count by the number of documents produced values far above 1.
        return {
            "accuracy": correct / scored_total if scored_total else 0.0,
            "loss": np.mean(losses) if losses else float("nan"),
        }
    

In [88]:
# Read the raw training documents. The context manager closes the file
# handle, and the explicit encoding avoids depending on the platform default.
with open("data/train.json", encoding="utf-8") as train_file:
    data = json.load(train_file)

data_len=len(data)

# 80 % of the documents for training, the remainder for validation.
train_len=int(data_len*0.8)
valid_len=data_len-train_len

# Seeded shuffle so the same documents land in each split on every run.
SPLIT_SEED = 42
shuffled_idx = np.random.default_rng(SPLIT_SEED).permutation(data_len)
train_data_idx = shuffled_idx[:train_len]
valid_data_idx = shuffled_idx[train_len:]

train_data=[data[i] for i in train_data_idx]
valid_data=[data[i] for i in valid_data_idx]


def _records_to_dataset(records):
    """Build a column-wise Dataset from raw records, storing the tags as "provided_labels"."""
    return Dataset.from_dict({
        "full_text": [x["full_text"] for x in records],
        "document": [x["document"] for x in records],
        "tokens": [x["tokens"] for x in records],
        "trailing_whitespace": [x["trailing_whitespace"] for x in records],
        "provided_labels": [x["labels"] for x in records],
    })


train_ds = _records_to_dataset(train_data)
valid_ds = _records_to_dataset(valid_data)


print("len train ds", len(train_ds))
print("len valid ds", len(valid_ds))

len train ds 5445
len valid ds 1362


In [89]:
# Number of distinct tags; this value is passed as `num_labels` when the model is built.
len(label2id)

13

In [90]:

# Batch size used by both data loaders and recorded in the logged run parameters.
BATCH_SIZE=8

# Set TOKENIZERS_PARALLELISM=false in this process's environment.
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"


# Wrap both splits; each document is tokenized when it is indexed.
train_dataset = PiiDataset(train_ds, tokenizer, label2id, TRAINING_MAX_LENGTH)
valid_dataset = PiiDataset(valid_ds, tokenizer, label2id, TRAINING_MAX_LENGTH)
# Only the training loader shuffles; validation keeps the dataset order.
train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
valid_dataloader = torch.utils.data.DataLoader(valid_dataset, batch_size=BATCH_SIZE, shuffle=False)
# Fresh model with one output class per tag.
my_model=MyModel(TRAINING_MODEL_PATH, len(label2id))

# Constructing the Learner moves the model to its device and opens a Neptune run.
learner=Learner(my_model, train_dataloader, valid_dataloader, batch_size=BATCH_SIZE)
# Train for 2 epochs with a learning rate of 1e-4.
learner.fit(lr=0.0001, epochs=2)

Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2ForTokenClassification: ['lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.classifier.bias', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.bias', 'deberta.embeddings.position_embeddings.weight', 'mask_predictions.dense.bias', 'mask_predictions.dense.weight', 'mask_predictions.LayerNorm.bias', 'mask_predictions.classifier.weight', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.LayerNorm.weight']
- This IS expected if you are initializing DebertaV2ForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2ForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a Be

https://app.neptune.ai/bernd.heidemann/PII/e/PII-8


Training:   0%|          | 0/1362 [00:00<?, ?it/s]

pred.shape torch.Size([8, 512, 13])
labels.shape torch.Size([8, 512])
pred.shape torch.Size([8, 512, 13])
labels.shape torch.Size([8, 512])
pred.shape torch.Size([8, 512, 13])
labels.shape torch.Size([8, 512])


KeyboardInterrupt: 

In [None]:
# Compare how often each class id is predicted vs. how often it occurs in the
# ground truth, on the first validation batch only.
# Inference only: switch to eval mode (an interrupted `fit` leaves the model in
# train mode) and skip autograd bookkeeping.
learner.model.eval()
with torch.no_grad():
    for id, attention_mask, labels in valid_dataloader:
        # Use the learner's own device so inputs always sit next to the model.
        id=id.to(learner.device)
        attention_mask=attention_mask.to(learner.device)
        labels=labels.to(learner.device)
        preds=learner.model(id, attention_mask, labels).argmax(dim=2)
        # print frequency of labels
        print("predictions:" , torch.bincount(preds.flatten()))
        print("ground truth: ", torch.bincount(labels.flatten()))
        break

In [None]:
# Rebind `device`: CUDA when available, otherwise CPU (an earlier cell used "cuda:0").
device=torch.device("cuda" if torch.cuda.is_available() else "cpu")

import os
import json
import argparse
from itertools import chain
from functools import partial

from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer, TrainingArguments, DataCollatorForTokenClassification
from tokenizers import AddedToken
import evaluate
from datasets import Dataset
import numpy as np

# Set before the multi-process map below.
# NOTE(review): presumably this avoids the tokenizers parallelism/fork warning - confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"


# The documents contain many newlines, so register "\n" as an explicit token.
# NOTE(review): this mutates the shared `tokenizer` in place and grows its
# vocabulary; any model used with it presumably needs its embedding matrix
# resized, and datasets built around this tokenizer earlier will tokenize
# differently afterwards - confirm.
tokenizer.add_tokens(AddedToken("\n", normalized=False))

# Apply `tokenize` to every document (num_proc=12); the keys it returns become columns.
# NOTE(review): `ds` is rebound, so re-running this cell maps/filters the already processed dataset.
ds = ds.map(lambda x: tokenize(x, tokenizer, label2id, TRAINING_MAX_LENGTH), num_proc=12)
# Keep the documents selected by `filter_no_pii`; it draws from `random`, so
# the surviving PII-free subset can differ between runs.
ds = ds.filter(
    filter_no_pii,
    num_proc=12,
)

In [None]:
# Display the first raw training document (bound to `x` in the data-loading cell).
# NOTE(review): the rendered output contains the full essay text, including any PII - clear it before sharing.
x

In [None]:

# `x` is a raw JSON record whose tags live under "labels", whereas `tokenize`
# reads "provided_labels". Add that key so the call does not raise KeyError.
sample = {**x, "provided_labels": x["labels"]}


sample_tokenized=tokenize(sample, tokenizer, label2id, max_length=TRAINING_MAX_LENGTH)

# Decode the ids back to text so the token stream can be compared with the
# original document and with the per-token labels.

text = tokenizer.decode(sample_tokenized["input_ids"])

print("original: [CLS] " + sample["full_text"].replace("\n", " "))
print("tokenize: " + text)
print("labels: " + " ".join([id2label[label_id] for label_id in sample_tokenized["labels"]]))