### The first stage of training 
Initial training of the PII token-classification model on pseudo-labeled data.

In [2]:
from datasets import load_dataset, load_metric, Dataset, DatasetDict, load_from_disk
from huggingface_hub import notebook_login
import json
import numpy as np
import pandas as pd

In [3]:
# Authenticate with the Hugging Face Hub; the datasets loaded later in this
# notebook are requested with `use_auth_token=True`.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [4]:
# Pretrained checkpoint that is fine-tuned in this notebook.
model_checkpoint = "microsoft/deberta-v3-base"
# Per-device batch size, used for both training and evaluation.
# NOTE(review): the saved training log in this notebook reports a batch size of 16,
# so it appears to come from a run with a different value — confirm before relying on it.
batch_size = 24

In [5]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import DebertaV2TokenizerFast

from utils import LABEL2ID, ID2LABEL


# Load the pretrained encoder with a freshly initialised token-classification head
# sized to the project's label set, plus the matching fast tokenizer.
model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, num_labels=len(ID2LABEL))
tokenizer = DebertaV2TokenizerFast.from_pretrained(model_checkpoint, add_prefix_space=True)

# Attach human-readable label names to the config so they are saved with every checkpoint.
# `id2label` must be keyed by *int*: transformers only converts string keys to ints when a
# config is (re)loaded, so string keys on the live config break integer lookups such as
# `model.config.id2label[predicted_id]`.
model.config.id2label = {label_id: label for label_id, label in enumerate(ID2LABEL)}
model.config.label2id = LABEL2ID

# Cap the tokenizer's advertised maximum sequence length at 512 tokens.
tokenizer.model_max_length = 512

  _seqeval_metric = load_metric("seqeval")
Using the latest cached version of the module from /home/monty/.cache/huggingface/modules/datasets_modules/metrics/seqeval/c8563af43bdce095d0f9e8b8b79c9c96d5ea5499b3bf66f90301c9cb82910f11 (last modified on Thu Feb 16 17:58:29 2023) since it couldn't be found locally at seqeval, or remotely on the Hugging Face Hub.
Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2ForTokenClassification: ['deberta.embeddings.position_embeddings.weight', 'lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.LayerNorm.weight', 'mask_predictions.LayerNorm.bias', 'mask_predictions.classifier.bias', 'lm_predictions.lm_head.bias', 'mask_predictions.classifier.weight', 'mask_predictions.dense.weight', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.dense.bias', 'lm_predictions.lm_head.LayerNorm.bias']
- This IS expected if you are initializing DebertaV2ForToke

In [10]:
# Pseudo-labeled Python files used for this first training stage. `use_auth_token=True`
# makes the request with the Hub token stored by the login step.
train_dataset = load_dataset("bigcode/pseudo-labeled-python-data-pii-detection-filtered", use_auth_token=True)['train']
# Dataset used as the dev set (its only split is named 'train').
# The repo id must be exactly `namespace/name`; a trailing slash is not a valid Hub repo id.
dev_dataset = load_dataset("bigcode/pii-for-code-v2", use_auth_token=True)['train']

In [11]:
from utils import label_tokenized

def tokenize_and_label(entry, tokenizer=tokenizer):
    """Tokenize one sample's source text and attach token-level labels.

    The text in ``entry['content']`` is encoded without special tokens and with
    character offsets requested; the tokenizer output is merged into ``entry``
    in place, and the updated entry is handed to ``label_tokenized``.

    Args:
        entry: a single dataset row (mapping) with at least a ``content`` field.
        tokenizer: tokenizer to use; defaults to the notebook-level tokenizer
            that exists when this function is defined.

    Returns:
        Whatever ``label_tokenized`` returns for the updated entry.
    """
    encoding = tokenizer.encode_plus(
        entry['content'],
        return_offsets_mapping=True,
        add_special_tokens=False,
    )
    entry.update(encoding)
    labeled_entry = label_tokenized(entry)
    return labeled_entry

# Decode the JSON-encoded `pii` column, then tokenize and label every dev sample.
dev_dataset = (
    dev_dataset
    .map(lambda sample: {'pii': json.loads(sample['pii'])})
    .map(tokenize_and_label)
)
dev_dataset

  0%|          | 0/400 [00:00<?, ?ex/s]

  0%|          | 0/400 [00:00<?, ?ex/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (8742 > 512). Running this sequence through the model will result in indexing errors


Dataset({
    features: ['content', 'language', 'license', 'path', 'annotation_id', 'pii', 'id', 'fold', 'input_ids', 'token_type_ids', 'attention_mask', 'offset_mapping', 'labels'],
    num_rows: 400
})

In [12]:
from utils import chunk_dataset

# Decode the JSON-encoded `pii` column, then tokenize and label every training sample
# (the tokenization step is spread over 8 worker processes).
train_dataset = (
    train_dataset
    .map(lambda sample: {'pii': json.loads(sample['pii'])})
    .map(tokenize_and_label, num_proc=8)
)
# Re-split the tokenized samples with the project's chunking helper.
train_dataset = chunk_dataset(train_dataset, tokenizer)

  0%|          | 0/17678 [00:00<?, ?ex/s]

         

#0:   0%|          | 0/2210 [00:00<?, ?ex/s]

 

#1:   0%|          | 0/2210 [00:00<?, ?ex/s]

 

#2:   0%|          | 0/2210 [00:00<?, ?ex/s]

 

#3:   0%|          | 0/2210 [00:00<?, ?ex/s]

 

#4:   0%|          | 0/2210 [00:00<?, ?ex/s]

 

#5:   0%|          | 0/2210 [00:00<?, ?ex/s]

 

#6:   0%|          | 0/2209 [00:00<?, ?ex/s]

 

#7:   0%|          | 0/2209 [00:00<?, ?ex/s]

TypeError: chunk_dataset() got an unexpected keyword argument 'load_from_cache_file'

In [14]:
# Inspect the chunked training set (columns and number of rows).
train_dataset

Dataset({
    features: ['input_ids', 'attention_mask', 'labels', 'id', 'chunk_id'],
    num_rows: 121080
})

In [15]:
# Bundle the splits used by the Trainer. Both `validation` and `test` are chunked from
# the same dev set; `test` is chunked with `overlap_freq=2`, which yields more chunks
# (presumably overlapping ones — confirm against `chunk_dataset`).
ner_dataset = DatasetDict({
    'train': train_dataset,
    'validation': chunk_dataset(dev_dataset, tokenizer),
    'test': chunk_dataset(dev_dataset, tokenizer, overlap_freq=2),
})
ner_dataset

  0%|          | 0/400 [00:00<?, ?it/s]

  0%|          | 0/400 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels', 'id', 'chunk_id'],
        num_rows: 121080
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels', 'id', 'chunk_id'],
        num_rows: 2040
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels', 'id', 'chunk_id'],
        num_rows: 3853
    })
})

## Training

In [18]:
from transformers import DataCollatorForTokenClassification, EarlyStoppingCallback

# Pads inputs and labels of each batch to a common length.
data_collator = DataCollatorForTokenClassification(tokenizer)

# Output directory is named after the checkpoint without its organisation prefix.
model_name = model_checkpoint.rsplit("/", 1)[-1]

args = TrainingArguments(
    output_dir=f"{model_name}-pretrained",
    overwrite_output_dir=True,
    # Optimisation
    num_train_epochs=1,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # Evaluate and checkpoint on the same 300-step cadence so that the best
    # checkpoint (by F1) can be restored when training finishes.
    evaluation_strategy="steps",
    eval_steps=300,
    save_strategy="steps",
    save_steps=300,
    save_total_limit=30,
    metric_for_best_model="f1",
    load_best_model_at_end=True,
    # Logging / publishing
    logging_steps=10,
    push_to_hub=False,
)

In [19]:
from utils import compute_metrics

# Wire the model, data, metrics and early stopping together.
# NOTE(review): with a patience of 30 evaluations and one evaluation every 300 steps,
# early stopping looks unlikely to trigger within a single epoch — confirm this is intended.
trainer = Trainer(
    model=model,
    args=args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    train_dataset=ner_dataset["train"],
    eval_dataset=ner_dataset["validation"],
    compute_metrics=compute_metrics,
    callbacks=[
        EarlyStoppingCallback(
            early_stopping_patience=30,
            early_stopping_threshold=1e-3,
        ),
    ],
)

In [22]:
# Run fine-tuning; evaluation and checkpointing happen every 300 steps as configured in `args`.
trainer.train()

The following columns in the training set don't have a corresponding argument in `DebertaV2ForTokenClassification.forward` and have been ignored: id, chunk_id. If id, chunk_id are not expected by `DebertaV2ForTokenClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 121080
  Num Epochs = 1
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 7568


Step,Training Loss,Validation Loss,Avg.precision,Precision,Recall,F1,Ambiguous,Email,Ip Address,Key,Name,Password,Username
300,0.0499,0.024034,0.649894,0.302181,0.400413,0.34443,0.0,0.434783,0.0,0.090909,0.612245,0.0,0.0
600,0.0661,0.025051,0.727751,0.434845,0.623323,0.512299,0.0,0.933638,0.423077,0.052083,0.758221,0.0,0.402277
900,0.0417,0.054753,0.607098,0.200584,0.637771,0.305185,0.0,0.314797,0.351779,0.035714,0.771481,0.116129,0.402662
1200,0.0215,0.019985,0.79268,0.528986,0.678019,0.594301,0.0,0.795322,0.555556,0.184211,0.75,0.543689,0.394973
1500,0.0237,0.040654,0.711795,0.467731,0.695562,0.559336,0.0,0.79922,0.553846,0.104784,0.761726,0.5625,0.5
1800,0.0172,0.029019,0.695333,0.376392,0.697626,0.488969,0.0,0.811133,0.586572,0.032138,0.780399,0.6,0.503093
2100,0.0494,0.037411,0.718493,0.468475,0.713106,0.565466,0.0,0.771536,0.684615,0.076923,0.760417,0.580153,0.45173
2400,0.017,0.032984,0.776229,0.426748,0.724458,0.537108,0.0,0.850716,0.395745,0.067901,0.798493,0.537143,0.4219
2700,0.0348,0.034663,0.728199,0.515337,0.693498,0.591289,0.0,0.819802,0.676923,0.114286,0.76779,0.589928,0.417423
3000,0.0305,0.03871,0.673239,0.553344,0.717234,0.624719,0.0,0.804642,0.752137,0.092527,0.800731,0.57931,0.503018


The following columns in the evaluation set don't have a corresponding argument in `DebertaV2ForTokenClassification.forward` and have been ignored: id, chunk_id. If id, chunk_id are not expected by `DebertaV2ForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2040
  Batch size = 16
  _warn_prf(average, modifier, msg_start, len(result))
Saving model checkpoint to /data3/monty/deberta-v3-base-pretrained/checkpoint-300
Configuration saved in /data3/monty/deberta-v3-base-pretrained/checkpoint-300/config.json
Model weights saved in /data3/monty/deberta-v3-base-pretrained/checkpoint-300/pytorch_model.bin
tokenizer config file saved in /data3/monty/deberta-v3-base-pretrained/checkpoint-300/tokenizer_config.json
Special tokens file saved in /data3/monty/deberta-v3-base-pretrained/checkpoint-300/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `DebertaV2ForTokenClassificat

TrainOutput(global_step=7568, training_loss=0.026313426741940336, metrics={'train_runtime': 7125.7811, 'train_samples_per_second': 16.992, 'train_steps_per_second': 1.062, 'total_flos': 3.176570305184112e+16, 'train_loss': 0.026313426741940336, 'epoch': 1.0})

In [None]:
from utils.chunking import compose_chunk_predictions_with_samples

# Predict on the `test` chunks, then merge the chunk-level predictions back onto the
# un-chunked dev samples (chunks are matched to samples through their `id`).
pred = trainer.predict(ner_dataset['test'])
# `dev_dataset` is already a single `Dataset` (the split was selected when it was loaded),
# so it is passed as-is: indexing it with ['dev'] would be a column lookup and raise.
# NOTE(review): the result presumably carries a `pred` column next to `labels`, which the
# confusion-matrix cell reads — confirm against `compose_chunk_predictions_with_samples`.
dev_dataset = compose_chunk_predictions_with_samples(dev_dataset, pred, ner_dataset['test']['id'], tokenizer)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import itertools
import pandas as pd
import numpy as np
from sklearn.metrics import confusion_matrix

# Flatten the per-sample label sequences and prediction scores of the dev set into
# token-level arrays; the predicted class is the arg-max over the last axis.
true_labels = np.array(list(itertools.chain.from_iterable(dev_dataset['labels'])))
pred_labels = np.argmax(list(itertools.chain.from_iterable(dev_dataset['pred'])), axis=-1)

# Row-normalised confusion matrix: each row (actual label) sums to 1.
data = confusion_matrix(
    true_labels,
    pred_labels,
    labels=range(len(ID2LABEL)),
    normalize='true',
)
df_cm = pd.DataFrame(data, index=ID2LABEL, columns=ID2LABEL)
df_cm.index.name = 'Actual'
df_cm.columns.name = 'Predicted'

# Draw the matrix as an annotated heatmap on an explicitly created axes.
f, ax = plt.subplots(figsize=(15, 15))
cmap = sns.cubehelix_palette(light=1, as_cmap=True)
sns.heatmap(
    df_cm,
    ax=ax,
    cmap=cmap,
    annot=True,
    annot_kws={'size': 10},
    fmt='.1%',
    square=True,
    cbar=False,
)
ax.set_title('Actuals vs Predicted')
plt.show()