In [1]:
import datasets
import spacy
from spacy.training import offsets_to_biluo_tags

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# spaCy pipeline for "en_core_web_sm"; `nlp` is used further down to split each
# document into word-level tokens before sub-word tokenization.
nlp = spacy.load("en_core_web_sm")

In [3]:
from datasets import load_dataset
# Load the ai4privacy/pii-masking-300k dataset; the displayed result below shows
# a DatasetDict with `train` and `validation` splits.
dataset = load_dataset("ai4privacy/pii-masking-300k")

In [4]:
# Show the available splits together with their columns and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['source_text', 'target_text', 'privacy_mask', 'span_labels', 'mbert_text_tokens', 'mbert_bio_labels', 'id', 'language', 'set'],
        num_rows: 177677
    })
    validation: Dataset({
        features: ['source_text', 'target_text', 'privacy_mask', 'span_labels', 'mbert_text_tokens', 'mbert_bio_labels', 'id', 'language', 'set'],
        num_rows: 47728
    })
})

In [27]:
# Check the language of validation row 7946; the same index is used as the
# cut-off when the validation split is sliced in the following cell.
dataset['validation'][7946]['language']

'Dutch'

In [28]:
from ast import literal_eval

# Number of leading validation rows to keep.
# NOTE(review): row 7946 is reported as 'Dutch' by the preceding cell, so this
# presumably selects an English-only prefix of the split -- confirm that the
# split really is ordered by language.
NUM_TEST_ROWS = 7946

# Slice the split once and reuse the result; slicing per column would
# materialise the same rows three times.
validation_head = dataset['validation'][:NUM_TEST_ROWS]
document_ids = validation_head['id']
full_texts = validation_head['source_text']
span_labels = validation_head['span_labels']

# Each `span_labels` entry is parsed with `literal_eval` into a Python object.
# The last character of every id is dropped -- presumably a per-language
# suffix; TODO confirm against the dataset card.
test_dataset = [
    {'id': doc_id[:-1], 'full_text': text, 'span_label': literal_eval(spans)}
    for doc_id, text, spans in zip(document_ids, full_texts, span_labels)
]


In [29]:
# Tag vocabulary for token classification. The position of each tag in this
# list is its integer id, so the order must not be changed.
all_labels = [
    "B-STREET",
    "B-CITY",
    "I-DATE",
    "B-PASS",
    "I-CITY",
    "B-TIME",
    "B-EMAIL",
    "I-DRIVERLICENSE",
    "I-POSTCODE",
    "I-BOD",
    "B-USERNAME",
    "B-BOD",
    "B-COUNTRY",
    "B-SECADDRESS",
    "B-IDCARD",
    "I-SOCIALNUMBER",
    "I-PASSPORT",
    "B-IP",
    "O",
    "B-TEL",
    "B-SOCIALNUMBER",
    "I-TIME",
    "B-BUILDING",
    "B-PASSPORT",
    "I-TITLE",
    "I-SEX",
    "I-STREET",
    "B-STATE",
    "I-STATE",
    "B-TITLE",
    "B-DATE",
    "B-GEOCOORD",
    "I-IDCARD",
    "I-TEL",
    "B-POSTCODE",
    "B-DRIVERLICENSE",
    "I-GEOCOORD",
    "I-COUNTRY",
    "I-EMAIL",
    "I-PASS",
    "B-SEX",
    "I-USERNAME",
    "I-BUILDING",
    "I-IP",
    "I-SECADDRESS",
    "B-CARDISSUER",
    "I-CARDISSUER",
]

# id -> tag and tag -> id lookups derived from the list order above.
id2label = dict(enumerate(all_labels))
label2id = {label: idx for idx, label in id2label.items()}

# Every tag except the "outside" tag.
target = [label for label in all_labels if label != "O"]

In [30]:
# Entity types with the two-character "B-"/"I-" prefix removed. "O" has no
# prefix, so it is excluded; slicing it would put an empty string in the set.
unique_labels = {label[2:] for label in all_labels if label != "O"}

In [31]:
from transformers import AutoTokenizer

# NOTE(review): `deberta_tokenizer` is not referenced by any cell visible in this part of the notebook.
deberta_tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-v2-xlarge")
# NOTE(review): despite its name, `mbert_tokenizer` loads the "distilbert-base-uncased" checkpoint.
mbert_tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

In [32]:
# Size of the tag vocabulary (all B-/I- tags plus "O").
len(all_labels)

47

In [33]:
def biluo_to_bio(tags):
    """Convert a sequence of BILUO tags to the BIO scheme.

    ``U-X`` becomes ``B-X`` and ``L-X`` becomes ``I-X``; every other tag
    (``B-X``, ``I-X``, ``O``, ``-``) is returned unchanged.

    Only the leading two-character prefix is rewritten, so entity names that
    happen to contain ``U-`` or ``L-`` (e.g. ``U-MENU-ITEM``) are preserved.

    Args:
        tags: Iterable of BILUO tag strings.

    Returns:
        A new list of BIO tag strings, in the same order as ``tags``.
    """
    prefix_map = {"U-": "B-", "L-": "I-"}
    # Unknown prefixes map to themselves, so prefix + remainder rebuilds the tag.
    return [prefix_map.get(tag[:2], tag[:2]) + tag[2:] for tag in tags]

def create_bio_labels(text, spans):
    """Tokenize ``text`` and align BIO label ids with the sub-word tokens.

    The text is split into words with the spaCy pipeline ``nlp``, the entity
    spans are projected onto those words as BILUO tags and converted to BIO,
    and the words are then sub-word tokenized with ``mbert_tokenizer``.

    Args:
        text: Raw document text.
        spans: Character-offset entity annotations, passed unchanged to
            ``offsets_to_biluo_tags``.

    Returns:
        A ``(tokens, input_ids, label_ids)`` tuple of equal-length sequences.
        ``label_ids`` holds the ``label2id`` id for the first sub-word of each
        word and ``-100`` for special tokens and remaining sub-words.

    Raises:
        ValueError: If a span does not align with spaCy token boundaries
            (spaCy marks such tokens with ``'-'``).
        KeyError: If a resulting tag is not present in ``label2id``.
    """
    doc = nlp(text)
    spacy_tokens = [token.text for token in doc]
    spacy_tags = offsets_to_biluo_tags(doc, spans)
    if '-' in spacy_tags:
        raise ValueError("entity spans do not align with spaCy token boundaries")
    bio_tags = biluo_to_bio(spacy_tags)

    tokenized_input = mbert_tokenizer(spacy_tokens, is_split_into_words=True, truncation=True)
    tokens = mbert_tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])
    word_ids = tokenized_input.word_ids()

    tokenized_bio = []
    # Initialised up front so the comparison below is defined even when the
    # first position maps to a word (i.e. no leading special token).
    previous_word_idx = None
    for word_idx in word_ids:
        if word_idx is None or word_idx == previous_word_idx:
            # Special token, or a continuation sub-word of an already labelled word.
            tokenized_bio.append(-100)
        else:
            tag = bio_tags[word_idx]
            # Drop a trailing 1/2/3 so numbered variants of a label share one id.
            if tag[-1] in ('1', '2', '3'):
                tag = tag[:-1]
            tokenized_bio.append(label2id[tag])
        previous_word_idx = word_idx
    return tokens, tokenized_input["input_ids"], tokenized_bio

In [34]:
# NOTE(review): despite the "train" in its name, this list is built from `test_dataset`.
tokenized_train_dataset = []
# (id, error) pairs for examples that could not be converted, so failures
# stay inspectable instead of disappearing silently.
skipped_examples = []
for example in test_dataset:
    try:
        tokens, token_ids, tokenized_bio = create_bio_labels(example['full_text'], example['span_label'])
    except Exception as err:
        # `Exception` rather than a bare `except:` so KeyboardInterrupt and
        # SystemExit still stop the loop.
        skipped_examples.append((example['id'], repr(err)))
        continue
    tokenized_train_dataset.append({
        'id': example['id'],
        'tokens': tokens,
        'token_ids': token_ids,
        'bio_labels': tokenized_bio,
        'source_text': example['full_text']
    })

<financial_..." with entities "[[447, 449, 'COUNTRY'], [392, 409, 'TEL'], [345, 3...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.
            </personal_info>
            <addres..." with entities "[[424, 437, 'IP'], [312, 324, 'SECADDRESS'], [262,...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.
            <..." with entities "[[382, 390, 'TIME'], [250, 257, 'LASTNAME3'], [192...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.
<curriculum..." with entities "[[368, 386, 'DRIVERLICENSE'], [313, 325, 'SOCIALNU...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during

In [35]:
# Number of examples that were converted successfully; examples whose
# conversion raised are skipped by the loop above.
len(tokenized_train_dataset)


4811

In [36]:
import json

# Serialise the processed examples directly into the output file.
with open("distilbert_test.json", "w") as out_file:
    json.dump(tokenized_train_dataset, out_file)

In [26]:
# Inspect the first processed example.
# NOTE(review): this cell's execution count (26) is lower than that of the
# cells above it, so the displayed output may come from an earlier run.
tokenized_train_dataset[0]

{'id': '40767',
 'tokens': ['[CLS]',
  'Sub',
  '##ject',
  ':',
  'Group',
  'Mess',
  '##aging',
  'for',
  'Ad',
  '##mission',
  '##s',
  'Process',
  'Good',
  'morning',
  ',',
  'everyone',
  ',',
  'I',
  'hope',
  'this',
  'message',
  'finds',
  'you',
  'well',
  '.',
  'As',
  'we',
  'continue',
  'our',
  'admission',
  '##s',
  'processes',
  ',',
  'I',
  'would',
  'like',
  'to',
  'update',
  'you',
  'on',
  'the',
  'latest',
  'developments',
  'and',
  'key',
  'information',
  '.',
  'Please',
  'find',
  'below',
  'the',
  'time',
  '##line',
  'for',
  'our',
  'upcoming',
  'meetings',
  ':',
  '-',
  'w',
  '##yn',
  '##q',
  '##vr',
  '##h',
  '##0',
  '##53',
  '-',
  'Meeting',
  'at',
  '10',
  ':',
  '20',
  '##am',
  '-',
  'luka',
  '.',
  'bu',
  '##rg',
  '-',
  'Meeting',
  'at',
  '21',
  '-',
  'q',
  '##ahi',
  '##l',
  '.',
  'wit',
  '##tau',
  '##er',
  '-',
  'Meeting',
  'at',
  'quarter',
  'past',
  '13',
  '-',
  'gh',
  '##ola',
  '##