In [1]:
import datasets

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from datasets import load_dataset
dataset = load_dataset("ai4privacy/pii-masking-300k")

In [3]:
dataset

DatasetDict({
    train: Dataset({
        features: ['source_text', 'target_text', 'privacy_mask', 'span_labels', 'mbert_text_tokens', 'mbert_bio_labels', 'id', 'language', 'set'],
        num_rows: 177677
    })
    validation: Dataset({
        features: ['source_text', 'target_text', 'privacy_mask', 'span_labels', 'mbert_text_tokens', 'mbert_bio_labels', 'id', 'language', 'set'],
        num_rows: 47728
    })
})

In [4]:
input_labels = dataset['train'][:29908]['mbert_text_tokens']
target_labels = dataset['train'][:29908]['mbert_bio_labels']
document_ids = dataset['train'][:29908]['id']
full_texts = dataset['train'][:29908]['source_text']
privacy_mask = dataset['train'][:29908]['privacy_mask']

train_dataset = [{'id':document_ids[i][:-1], 'ner_tags': target_labels[i], 'tokens': input_labels[i], 'full_text': full_texts[i].replace("\\n"," "), 'privacy_mask': privacy_mask[i]} for i in range(29908)]


In [137]:
all_labels = ['B-STREET',
 'B-CITY',
 'I-DATE',
 'B-PASS',
 'I-CITY',
 'B-TIME',
 'B-EMAIL',
 'I-DRIVERLICENSE',
 'I-POSTCODE',
 'I-BOD',
 'B-USERNAME',
 'B-BOD',
 'B-COUNTRY',
 'B-SECADDRESS',
 'I-GIVENNAME2',
 'B-IDCARD',
 'I-SOCIALNUMBER',
 'I-PASSPORT',
 'B-IP',
 'O',
 'B-LASTNAME2',
 'B-TEL',
 'B-SOCIALNUMBER',
 'I-TIME',
 'B-BUILDING',
 'B-LASTNAME1',
 'B-PASSPORT',
 'I-TITLE',
 'I-SEX',
 'I-STREET',
 'B-STATE',
 'I-STATE',
 'B-TITLE',
 'I-GIVENNAME1',
 'B-DATE',
 'B-GEOCOORD',
 'I-IDCARD',
 'I-TEL',
 'B-GIVENNAME2',
 'B-POSTCODE',
 'I-LASTNAME2',
 'B-DRIVERLICENSE',
 'I-LASTNAME3',
 'I-GEOCOORD',
 'I-COUNTRY',
 'I-EMAIL',
 'I-PASS',
 'B-SEX',
 'B-LASTNAME3',
 'I-USERNAME',
 'I-BUILDING',
 'I-IP',
 'B-GIVENNAME1',
 'I-LASTNAME1',
 'I-SECADDRESS',
 'B-CARDISSUER',
 'I-CARDISSUER']
id2label = {i: l for i, l in enumerate(all_labels)}
label2id = {v: k for k, v in id2label.items()}
target = [l for l in all_labels if l != "O"]

In [138]:
from transformers import AutoTokenizer

deberta_tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-v2-xlarge")
mbert_tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-multilingual-cased")



In [139]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["full_text"], truncation=True, is_split_into_words=False)
    print(len(tokenized_inputs['input_ids']))
    print(examples['ner_tags'])
    print(tokenizer.convert_ids_to_tokens(tokenized_inputs['input_ids']))
    labels = []
    previous_word_idx = None
    word_ids = tokenized_inputs.word_ids()
    print(len(word_ids))
    print(word_ids)
    for i, label in enumerate(word_ids):
        word_idx = label
        if word_idx is None:
            labels.append(-100)
        elif word_idx != previous_word_idx:  
            labels.append(label2id[examples['ner_tags'][label]])
        else:
            labels.append(-100)
        previous_word_idx = word_idx

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [150]:
def find_sub_list(sl,l):
    sll=len(sl)
    for i in range(len(l)-sll):
        if l[i:i+sll] == sl:
            return i, i+sll

In [153]:
find_sub_list([1,2], [1,1,3,1,2,3])

(3, 5)

In [158]:
tokenized_train_dataset = []
from ast import literal_eval
for item in train_dataset: 
    privacy_mask = item['privacy_mask']
    #getting tokenized text
    tokenized_inputs = deberta_tokenizer(item["full_text"], truncation=True, is_split_into_words=False)
    #print(tokenized_inputs)
    #tokenizing mask labels to hopefully replace in tokenized_inputs
    tags = ['O' for i in range(len(tokenized_inputs['input_ids']))]
    append = True
    for mask in privacy_mask:
        #print(item['full_text'][mask[0]:mask[1]])
        tokenized_mask = deberta_tokenizer(item['full_text'][mask['start']:mask['end']], truncation=True, is_split_into_words=False)
        mask_ids = tokenized_mask['input_ids'][1:-1]
        #print(mask_ids)
        match_indices = find_sub_list(mask_ids, tokenized_inputs['input_ids'])
        if match_indices == None:
            append = False
            # print(item['full_text'][mask['start']:mask['end']])
            # print(mask['value'])
            # print(deberta_tokenizer.convert_ids_to_tokens(tokenized_inputs['input_ids']))
            # print(deberta_tokenizer.convert_ids_to_tokens(mask_ids))
            break
            
        #print(tokenized_inputs['input_ids'][match_indices[0]:match_indices[1]])
        #print(match_indices)
        tags[match_indices[0]] = 'B-' + mask['label']
        #print(len(tags[match_indices[0]+1:match_indices[1]]) == len(range(len(mask_ids)-1)))
        tags[match_indices[0]+1:match_indices[1]] = ['I-' + mask['label'] for i in range(len(mask_ids)-1)]
    if append:
        tokenized_train_dataset.append({
            'tokenized_input_ids': tokenized_inputs['input_ids'],
            'tokenized_labels': [-100] + [label2id[i] for i in tags[1:-1]] + [-100]
        })

In [142]:
tokenized_train_dataset = []
from ast import literal_eval
for item in train_dataset: 
    span_labels = literal_eval(item['span_labels'])
    #getting tokenized text
    tokenized_inputs = deberta_tokenizer(item["full_text"], truncation=True, is_split_into_words=False)
    #print(tokenized_inputs)
    #tokenizing span labels to hopefully replace in tokenized_inputs
    tags = ['O' for i in range(len(tokenized_inputs['input_ids']))]
    for span in span_labels:
        #print(item['full_text'][span[0]:span[1]])
        tokenized_span = deberta_tokenizer(item['full_text'][span[0]:span[1]], truncation=True, is_split_into_words=False)
        span_ids = tokenized_span['input_ids'][1:-1]
        #print(span_ids)
        match_indices = find_sub_list(span_ids, tokenized_inputs['input_ids'])
        print(match_indices)
        if match_indices == None:
            print(item['full_text'][span[0]:span[1]])
            print(deberta_tokenizer.convert_ids_to_tokens(tokenized_inputs['input_ids']))
            print(deberta_tokenizer.convert_ids_to_tokens(span_ids))
            
        #print(tokenized_inputs['input_ids'][match_indices[0]:match_indices[1]])
        #print(match_indices)
        tags[match_indices[0]] = 'B-' + span[2]
        #print(len(tags[match_indices[0]+1:match_indices[1]]) == len(range(len(span_ids)-1)))
        tags[match_indices[0]+1:match_indices[1]] = ['I-' + span[2] for i in range(len(span_ids)-1)]

    tokenized_train_dataset.append({
        'tokenized_input_ids': tokenized_inputs['input_ids'],
        'tokenized_labels': [label2id[i] for i in tags]
    })

(102, 111)
(98, 101)
(86, 95)
(82, 85)
(74, 79)
(72, 73)
(65, 69)
(62, 64)
(53, 59)
(23, 25)
(17, 20)
(13, 16)
(8, 10)
(4, 7)
(83, 84)
None
oo.com    Social 
['[CLS]', '▁Subject', ':', '▁Admission', '▁Notification', '▁-', '▁Great', '▁Britain', '▁University', '▁Dear', '▁Applicants', ',', '▁We', '▁are', '▁thrilled', '▁to', '▁inform', '▁you', '▁about', '▁the', '▁status', '▁of', '▁your', '▁admission', '▁to', '▁Great', '▁Britain', '▁University', '.', '▁Please', '▁read', '▁the', '▁details', '▁below', '▁for', '▁the', '▁automated', '▁notification', '.', '▁Date', '▁of', '▁Notification', ':', '▁5', ':24', 'am', '▁on', '▁August', '▁5', 'th', ',', '▁20', '57', '▁**', 'App', 'lic', 'ant', '▁Details', '**', '▁1.', '▁Applicant', ':', '▁Ball', 'oi', '▁Eck', 'rich', '▁Email', ':', '▁b', 'ball', 'oi', '@', 'yahoo', '.', 'com', '▁Social', '▁Number', ':', '▁996', '▁07', '6', '▁6', '460', '▁ID', '[SEP]']
['▁oo', '.', 'com', '▁Social']


TypeError: 'NoneType' object is not subscriptable

In [31]:
import evaluate
import numpy as np

seqeval = evaluate.load("seqeval")

In [32]:
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_predictions = [
        [all_labels[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [all_labels[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = seqeval.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

In [6]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer

model = AutoModelForTokenClassification.from_pretrained(
    "microsoft/deberta-v2-xlarge", num_labels=len(all_labels), id2label=id2label, label2id=label2id
)

NameError: name 'all_labels' is not defined