In [2]:
import uuid
from functools import partial

import evaluate
import numpy as np
from datasets import load_dataset
from transformers import (
    AutoModelForTokenClassification,
    AutoTokenizer,
    DataCollatorForTokenClassification,
    Trainer,
    TrainingArguments,
)


def is_subword(text, tokenized, tokenizer, index):
    word = tokenizer.convert_ids_to_tokens(tokenized["input_ids"][index])
    start_ind, end_ind = tokenized["offset_mapping"][index]
    word_ref = text[start_ind:end_ind]
    is_subword = len(word) != len(word_ref)
    return is_subword


def tokenize(example, labels2int, tokenizer, iob=True, ignore_subwords=True):

    text, labels = example["source_text"], example["privacy_mask"]

    i = 0
    token_labels = []

    tokenized = tokenizer(text, return_offsets_mapping=True, return_special_tokens_mask=True)
    start_token_to_label = {
        tokenized.char_to_token(label["start"]): (label["start"], label["end"], label["label"]) for label in labels
    }
    while i < len(tokenized["input_ids"]):
        if tokenized["special_tokens_mask"][i] == 1:
            token_labels.append(-100)
            i += 1
        elif i not in start_token_to_label:
            if ignore_subwords and is_subword(text, tokenized, tokenizer, i):
                token_labels.append(-100)
            else:
                token_labels.append(labels2int["O"])
            i += 1
        else:
            start, end, label = start_token_to_label[i]
            start_token = tokenized.char_to_token(start)
            assert start_token == i
            j = start_token
            while j < (len(tokenized["input_ids"]) - 1) and tokenized.token_to_chars(j).start < end:
                if j == start_token:
                    if iob:
                        token_labels.append(labels2int["B-" + label])
                    else:
                        token_labels.append(labels2int[label])
                elif ignore_subwords and is_subword(text, tokenized, tokenizer, j):
                    token_labels.append(-100)
                else:
                    if iob:
                        token_labels.append(labels2int["I-" + label])
                    else:
                        token_labels.append(labels2int[label])

                j += 1
            i = j
    tokenized["labels"] = token_labels
    return tokenized


def compute_metrics(eval_pred, label_list, seqeval_metric):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (special tokens)
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = seqeval_metric.compute(predictions=true_predictions, references=true_labels)
    results_flat = {f"{k}_f1": v["f1"] for k, v in results.items() if isinstance(v, dict)}
    results_flat.update(
        {
            "precision": results["overall_precision"],
            "recall": results["overall_recall"],
            "f1": results["overall_f1"],
            "accuracy": results["overall_accuracy"],
        }
    )
    return results_flat


  from .autonotebook import tqdm as notebook_tqdm
2024-09-24 23:30:58.778620: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-09-24 23:30:58.793632: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-09-24 23:30:58.798229: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-09-24 23:30:58.810080: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
label2id =  {
    "0": "O",
    "1": "B-CITY",
    "2": "I-CITY",
    "3": "B-FIRSTNAME",
    "4": "I-FIRSTNAME",
    "5": "B-USERNAME",
    "6": "I-USERNAME",
    "7": "B-JOBTYPE",
    "8": "B-PREFIX",
    "9": "I-PREFIX",
    "10": "B-LASTNAME",
    "11": "B-EMAIL",
    "12": "I-EMAIL",
    "13": "B-NEARBYGPSCOORDINATE",
    "14": "I-NEARBYGPSCOORDINATE",
    "15": "B-ACCOUNTNUMBER",
    "16": "I-ACCOUNTNUMBER",
    "17": "B-ACCOUNTNAME",
    "18": "I-ACCOUNTNAME",
    "19": "B-MIDDLENAME",
    "20": "I-MIDDLENAME",
    "21": "B-COUNTY",
    "22": "I-COUNTY",
    "23": "B-AGE",
    "24": "B-CREDITCARDCVV",
    "25": "B-DOB",
    "26": "I-DOB",
    "27": "B-MASKEDNUMBER",
    "28": "I-MASKEDNUMBER",
    "29": "B-PASSWORD",
    "30": "I-PASSWORD",
    "31": "B-SEX",
    "32": "B-STATE",
    "33": "B-COMPANYNAME",
    "34": "I-COMPANYNAME",
    "35": "B-PHONEIMEI",
    "36": "I-PHONEIMEI",
    "37": "B-STREET",
    "38": "I-STREET",
    "39": "B-SSN",
    "40": "I-SSN",
    "41": "B-IPV4",
    "42": "I-IPV4",
    "43": "B-USERAGENT",
    "44": "I-USERAGENT",
    "45": "B-MAC",
    "46": "I-MAC",
    "47": "B-PIN",
    "48": "I-PIN",
    "49": "B-IP",
    "50": "I-IP",
    "51": "B-URL",
    "52": "I-URL",
    "53": "B-CURRENCYSYMBOL",
    "54": "B-DATE",
    "55": "I-DATE",
    "56": "B-TIME",
    "57": "I-TIME",
    "58": "B-VEHICLEVRM",
    "59": "I-VEHICLEVRM",
    "60": "I-AMOUNT",
    "61": "B-ETHEREUMADDRESS",
    "62": "I-ETHEREUMADDRESS",
    "63": "B-BITCOINADDRESS",
    "64": "I-BITCOINADDRESS",
    "65": "B-LITECOINADDRESS",
    "66": "I-LITECOINADDRESS",
    "67": "I-JOBTYPE",
    "68": "B-CREDITCARDNUMBER",
    "69": "I-CREDITCARDNUMBER",
    "70": "B-IPV6",
    "71": "I-IPV6",
    "72": "I-LASTNAME",
    "73": "B-PHONENUMBER",
    "74": "I-PHONENUMBER",
    "75": "B-CREDITCARDISSUER",
    "76": "I-CREDITCARDISSUER",
    "77": "B-SECONDARYADDRESS",
    "78": "I-SECONDARYADDRESS",
    "79": "B-ZIPCODE",
    "80": "I-ZIPCODE",
    "81": "B-VEHICLEVIN",
    "82": "I-VEHICLEVIN",
    "83": "I-AGE",
    "84": "B-GENDER",
    "85": "I-GENDER",
    "86": "B-ORDINALDIRECTION",
    "87": "B-JOBAREA",
    "88": "B-HEIGHT",
    "89": "I-HEIGHT",
    "90": "B-JOBTITLE",
    "91": "I-JOBTITLE",
    "92": "B-BUILDINGNUMBER",
    "93": "I-BUILDINGNUMBER",
    "94": "B-AMOUNT",
    "95": "I-STATE",
    "96": "I-CURRENCYSYMBOL",
    "97": "B-IBAN",
    "98": "I-IBAN",
    "99": "B-BIC",
    "100": "I-BIC",
    "101": "B-EYECOLOR",
    "102": "B-CURRENCYNAME",
    "103": "I-CURRENCYNAME",
    "104": "B-CURRENCY",
    "105": "I-CURRENCY",
    "106": "B-CURRENCYCODE",
    "107": "I-CURRENCYCODE",
    "108": "I-JOBAREA",
    "109": "I-EYECOLOR",
    "110": "I-CREDITCARDCVV",
    "111" : "I-ORDINALDIRECTION"
  }

In [4]:
labels = list(label2id.values())
label2id = {k: v for v, k in enumerate(labels)}
#Invert
id2label = {v: k for k, v in label2id.items()}

In [12]:


pretrained_name = "microsoft/deberta-v3-small"
tokenizer = AutoTokenizer.from_pretrained(pretrained_name)
model = AutoModelForTokenClassification.from_pretrained(pretrained_name, num_labels=len(labels), id2label=id2label)

ds = load_dataset("ai4privacy/pii-masking-200k")
# ds = ds.filter(lambda x: x["language"] == "English", num_proc=4)
ds = ds.map(
    partial(tokenize, labels2int=label2id, tokenizer=tokenizer, iob=True, ignore_subwords=True),
    batched=False,
    remove_columns=[
        "source_text",
        "target_text",
        "privacy_mask",
        "span_labels",
        "mbert_text_tokens",
        "mbert_bio_labels",
        "id",
        "language",
        "set",
    ],
    num_proc=8,
).remove_columns(["offset_mapping"])

training_arguments = TrainingArguments(
    output_dir="output_DeBERTa_v3_small",
    max_steps=40000,
    eval_steps=5000,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    overwrite_output_dir=True,
    warmup_ratio=0.2,
    learning_rate=2e-5,
    weight_decay=0.01,
    save_strategy="steps",
    eval_strategy="steps",
    load_best_model_at_end=True,
    save_total_limit=1,
    save_steps=5000,
    lr_scheduler_type="cosine",
    warmup_steps=3000,
    metric_for_best_model="f1",
    greater_is_better=True,
    torch_compile=False,

)
test_size = 0.1
ds["train"], ds["test"] = ds["train"].train_test_split(test_size=test_size, seed=42)

trainer = Trainer(
    model,
    training_arguments,
    train_dataset=ds["train"],
    eval_dataset= ds["test"],
    data_collator=DataCollatorForTokenClassification(tokenizer),
    tokenizer=tokenizer,
    compute_metrics=partial(compute_metrics, label_list=labels, seqeval_metric=evaluate.load("seqeval")),
)


Some weights of DebertaV2ForTokenClassification were not initialized from the model checkpoint at microsoft/deberta-v3-small and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
max_steps is given, it will override any value given in num_train_epochs


In [13]:
trainer.train()


AttributeError: 'list' object has no attribute 'keys'

{'eval_loss': 0.06687254458665848, 'eval_BOD_f1': 0.9671848013816926, 'eval_BUILDING_f1': 0.9815005138746146, 'eval_CARDISSUER_f1': 0.0, 'eval_CITY_f1': 0.9705593719332678, 'eval_COUNTRY_f1': 0.9685138539042821, 'eval_DATE_f1': 0.9226327944572749, 'eval_DRIVERLICENSE_f1': 0.9569736021617129, 'eval_EMAIL_f1': 0.9851541682527598, 'eval_GEOCOORD_f1': 0.9677419354838711, 'eval_GIVENNAME1_f1': 0.8613455510007235, 'eval_GIVENNAME2_f1': 0.7895716945996275, 'eval_IDCARD_f1': 0.939365496527126, 'eval_IP_f1': 0.9852330410706046, 'eval_LASTNAME1_f1': 0.8338430173292558, 'eval_LASTNAME2_f1': 0.7272727272727272, 'eval_LASTNAME3_f1': 0.6918918918918918, 'eval_PASS_f1': 0.933046683046683, 'eval_PASSPORT_f1': 0.9550030819806863, 'eval_POSTCODE_f1': 0.9765013054830287, 'eval_SECADDRESS_f1': 0.9680789320951828, 'eval_SEX_f1': 0.9722222222222222, 'eval_SOCIALNUMBER_f1': 0.9524744697564809, 'eval_STATE_f1': 0.9787023278850916, 'eval_STREET_f1': 0.9688796680497925, 'eval_TEL_f1': 0.970946113732307, 'eval_TIME_f1': 0.9695451549110086, 'eval_TITLE_f1': 0.9568607068607069, 'eval_USERNAME_f1': 0.9333333333333333, 'eval_precision': 0.9410922874603543, 'eval_recall': 0.952028798372266, 'eval_f1': 0.9465289529478127, 'eval_accuracy': 0.992088277633207, 'eval_runtime': 28.7216, 'eval_samples_per_second': 276.656, 'eval_steps_per_second': 8.669, 'epoch': 42.78}
