In [1]:
import sys, os
import json
import numpy as np
import pandas as pd
from functools import partial
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification
import evaluate
from datasets import Dataset
from sklearn.model_selection import train_test_split
import torch
from seqeval.metrics import recall_score, precision_score
from seqeval.metrics import classification_report
from seqeval.metrics import f1_score

In [2]:
# !pip install evaluate

In [3]:
# Input files, given as paths relative to the working directory.
file_train = 'train.json'
file_test = 'test.json'

# Maximum sequence length; passed as `max_length` to the tokenizer calls below.
MAXLEN = 512

In [4]:
# Read the training documents (tokens plus per-token labels) into a DataFrame.
df = pd.read_json(file_train)

In [5]:
# Read the test documents through the configured path constant instead of
# repeating the file name as a literal.
df_test = pd.read_json(file_test)

In [6]:
# Preview the first rows to check the columns that were loaded.
df.head()

Unnamed: 0,document,full_text,tokens,trailing_whitespace,labels
0,7,Design Thinking for innovation reflexion-Avril...,"[Design, Thinking, for, innovation, reflexion,...","[True, True, True, True, False, False, True, F...","[O, O, O, O, O, O, O, O, O, B-NAME_STUDENT, I-..."
1,10,Diego Estrada\n\nDesign Thinking Assignment\n\...,"[Diego, Estrada, \n\n, Design, Thinking, Assig...","[True, False, False, True, True, False, False,...","[B-NAME_STUDENT, I-NAME_STUDENT, O, O, O, O, O..."
2,16,Reporting process\n\nby Gilberto Gamboa\n\nCha...,"[Reporting, process, \n\n, by, Gilberto, Gambo...","[True, False, False, True, True, False, False,...","[O, O, O, O, B-NAME_STUDENT, I-NAME_STUDENT, O..."
3,20,Design Thinking for Innovation\n\nSindy Samaca...,"[Design, Thinking, for, Innovation, \n\n, Sind...","[True, True, True, False, False, True, False, ...","[O, O, O, O, O, B-NAME_STUDENT, I-NAME_STUDENT..."
4,56,Assignment: Visualization Reflection Submitt...,"[Assignment, :, , Visualization, , Reflecti...","[False, False, False, False, False, False, Fal...","[O, O, O, O, O, O, O, O, O, O, O, O, B-NAME_ST..."


### Preprocessing

**Step 1**: Convert the `labels` column from tag strings into integer class ids.

In [7]:
# Tag vocabulary: 'O' followed by a B-/I- pair per PII entity type.
# The order is significant: each B- tag has an odd id and its matching I- tag
# is the next id, which the label-alignment helper relies on.
label_list = [
    'O',
    'B-NAME_STUDENT', 'I-NAME_STUDENT',
    'B-EMAIL', 'I-EMAIL',
    'B-USERNAME', 'I-USERNAME',
    'B-ID_NUM', 'I-ID_NUM',
    'B-PHONE_NUM', 'I-PHONE_NUM',
    'B-URL_PERSONAL', 'I-URL_PERSONAL',
    'B-STREET_ADDRESS', 'I-STREET_ADDRESS',
]

# Lookup tables in both directions between tag string and integer id.
id2label = dict(enumerate(label_list))
label2id = {name: idx for idx, name in id2label.items()}

In [8]:
def create_mapped_labels(labels):
    """Translate a sequence of tag strings into their integer ids.

    Args:
        labels: Sequence of tag strings for one document.

    Returns:
        A plain list with one entry per input tag, looked up in the
        module-level `label2id` through `Series.map` (per pandas semantics a
        tag missing from the mapping comes back as NaN rather than raising).
    """
    return pd.Series(labels, name='mapped_labels').map(label2id).tolist()

In [9]:
# Integer-encoded copy of `labels` (one id per token) -- friendlier for classifiers.
df['labels_cat'] = df['labels'].apply(create_mapped_labels)

In [10]:
# Downsample because there are too many non-PII examples: keep only documents
# that contain at least one tag other than 'O'.
# The mask is named `has_pii` so the builtin `filter` is not shadowed for the
# rest of the kernel session.
has_pii = df['labels'].apply(lambda tags: any(tag != 'O' for tag in tags))
downsampled_df = df[has_pii]

# Hold out 10% for validation; the fixed seed keeps the split reproducible.
train, valid = train_test_split(downsampled_df, test_size=0.1, shuffle=True, random_state=22124)

def create_dataset(df):
    """Build a `Dataset` from the columns of `df` used by the pipeline.

    Args:
        df: DataFrame holding the `document`, `full_text`, `tokens`,
            `trailing_whitespace`, `labels` and `labels_cat` columns.

    Returns:
        A `Dataset` with those six columns, each copied into a plain list.
    """
    columns = (
        'document',
        'full_text',
        'tokens',
        'trailing_whitespace',
        'labels',
        'labels_cat',
    )
    return Dataset.from_dict({name: list(df[name]) for name in columns})


# Wrap both splits so they can be tokenized with `Dataset.map`.
train_ds, valid_ds = create_dataset(train), create_dataset(valid)

#### Some preprocessing helper functions

In [11]:
def align_labels_with_tokens(labels, word_ids):
    """Expand word-level label ids to one label per tokenizer position.

    Args:
        labels: Integer label id for each word, indexed by word position.
        word_ids: For each tokenizer position, the index of the word it
            belongs to, or None for positions that belong to no word.

    Returns:
        A list as long as `word_ids`. Positions with no word get -100, the
        first position of a word gets that word's label, and later positions
        of the same word get the label with odd ids bumped by one (B-XXX
        becomes I-XXX under this notebook's id layout).
    """
    aligned = []
    previous_word = None
    for word_id in word_ids:
        if word_id is None:
            # Special token: ignored via the -100 sentinel.
            previous_word = None
            aligned.append(-100)
            continue

        word_label = labels[word_id]
        if word_id != previous_word:
            # First piece of a new word keeps the word's own label.
            previous_word = word_id
        elif word_label % 2 == 1:
            # Continuation piece of a word tagged B-XXX: switch to I-XXX.
            word_label += 1
        aligned.append(word_label)

    return aligned


def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split documents and attach aligned labels.

    The tokenizer can split one word into several pieces and adds special
    tokens, so the word-level BIO ids in `examples["labels_cat"]` are
    re-aligned per position: continuation pieces are tagged I-, and special
    tokens such as [CLS] and [SEP] receive the sentinel value.

    Args:
        examples: Batch with a "tokens" column (lists of words) and a
            "labels_cat" column (lists of integer label ids).

    Returns:
        The tokenizer output with an added "labels" entry holding one aligned
        label list per example.
    """
    encoded = tokenizer(
        examples["tokens"],
        is_split_into_words=True,
        truncation=True,
        max_length=MAXLEN,
        padding=True,
    )
    encoded["labels"] = [
        align_labels_with_tokens(word_labels, encoded.word_ids(idx))
        for idx, word_labels in enumerate(examples["labels_cat"])
    ]
    return encoded

## DistilBert LLM

In [12]:
# MODEL = 'microsoft/deberta-v3-small'
# tokenizer = AutoTokenizer.from_pretrained(MODEL)

In [13]:
# Checkpoint to fine-tune; the tokenizer is loaded from the same identifier.
MODEL = 'distilbert/distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(MODEL)

In [14]:
# Tokenize both splits in batches; each gains the aligned `labels` column.
tokenized_train = train_ds.map(tokenize_and_align_labels, batched=True)
tokenized_valid = valid_ds.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/850 [00:00<?, ? examples/s]

Map:   0%|          | 0/95 [00:00<?, ? examples/s]

In [15]:
# Batch collator handed to the Trainer; padded lengths are rounded up to a multiple of 16.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer, pad_to_multiple_of=16)

##### Define eval metrics before starting finetuning

In [16]:
def compute_metrics(p):
    """Compute precision, recall and F1 for one evaluation pass.

    Args:
        p: Pair `(predictions, labels)`. `predictions` is reduced with an
            argmax over axis 2 to one label id per position; `labels` holds
            the reference ids, with -100 marking positions to ignore.

    Returns:
        Dict with the keys 'precision', 'recall' and 'f1', computed by the
        seqeval scorers on the tag-string sequences.
    """
    scores, gold_ids = p
    pred_ids = np.argmax(scores, axis=2)

    true_predictions = []
    true_labels = []
    for pred_row, gold_row in zip(pred_ids, gold_ids):
        # Remove ignored index (special tokens) before mapping ids to tag strings.
        kept = [(pred, gold) for pred, gold in zip(pred_row, gold_row) if gold != -100]
        true_predictions.append([label_list[pred] for pred, _ in kept])
        true_labels.append([label_list[gold] for _, gold in kept])

    return {
        'precision': precision_score(true_labels, true_predictions),
        'recall': recall_score(true_labels, true_predictions),
        'f1': f1_score(true_labels, true_predictions),
    }

In [17]:
# Specify device: use CUDA when it is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [18]:
# Load the pretrained checkpoint with a token-classification head sized to the
# tag vocabulary, pass it both id<->label maps, and move it to the chosen device.
# The classifier weights are newly initialised (see the warning printed below),
# so the model must be fine-tuned before its predictions are meaningful.
model = AutoModelForTokenClassification.from_pretrained(
    MODEL,
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id,
    ignore_mismatched_sizes=True
).to(device)

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [19]:
training_args = TrainingArguments(
    output_dir='output',
    learning_rate=2e-5,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    num_train_epochs=10,
    weight_decay=0.01,
    # Evaluate and checkpoint on the same step schedule so that every
    # evaluated step has a checkpoint `load_best_model_at_end` can restore.
    evaluation_strategy='steps',
    save_strategy='steps',
    eval_steps=500,  # Adjust based on your dataset size
    save_steps=500,  # Keep aligned with eval_steps
    load_best_model_at_end=True,
    # Pick the best checkpoint by the 'f1' value returned from
    # `compute_metrics` instead of the default validation loss; on this task
    # the lowest-loss step is not the highest-F1 step.
    metric_for_best_model='f1',
    greater_is_better=True,
    report_to='none'
)

In [20]:
# Assemble the Trainer from the model, training configuration, tokenized
# splits, batch collator and metric function defined in the cells above.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_valid,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# Run fine-tuning; the returned TrainOutput is displayed as the cell result.
trainer.train()

Step,Training Loss,Validation Loss,Precision,Recall,F1
500,0.066,0.020062,0.730769,0.546763,0.625514
1000,0.0116,0.012434,0.729323,0.697842,0.713235
1500,0.0078,0.013597,0.721739,0.597122,0.653543
2000,0.004,0.010671,0.727273,0.690647,0.708487
2500,0.0032,0.009423,0.807407,0.784173,0.79562
3000,0.0017,0.010056,0.832061,0.784173,0.807407
3500,0.0017,0.011008,0.846154,0.791367,0.817844
4000,0.0005,0.010676,0.832168,0.856115,0.843972
4500,0.0005,0.011735,0.869565,0.863309,0.866426
5000,0.0003,0.01107,0.853846,0.798561,0.825279


TrainOutput(global_step=8500, training_loss=0.005804179456304102, metrics={'train_runtime': 396.2013, 'train_samples_per_second': 21.454, 'train_steps_per_second': 21.454, 'total_flos': 1110812391936000.0, 'train_loss': 0.005804179456304102, 'epoch': 10.0})

In [21]:
# Write the trainer's model to a local directory; the inference section reads from this path.
trainer.save_model('distilbert-finetuned-downsampled-512')

### Inference

In [29]:
# Load the saved model from your directory
# NOTE(review): `trainer` keeps the model object it was constructed with, so
# rebinding `model` here does not change what later `trainer.predict` calls
# run -- confirm which model inference is meant to use.
model_path = 'distilbert-finetuned-downsampled-512'  # Update with the correct path
model = AutoModelForTokenClassification.from_pretrained(model_path)

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_path)

In [30]:
def tokenize_test_data(example):
    """Tokenize a batch of pre-split test documents.

    Args:
        example: Batch with a "tokens" column holding lists of words.

    Returns:
        The tokenizer output for the batch, padded, truncated to MAXLEN and
        requested as PyTorch tensors.
    """
    return tokenizer(
        example["tokens"],
        is_split_into_words=True,
        truncation=True,
        max_length=MAXLEN,
        padding=True,
        return_tensors="pt",  # Ensure the output is PyTorch tensors
    )

# Tokenize test data: copy the unlabeled test columns into a Dataset, then
# run the tokenizer over it in batches.
tokenized_test = Dataset.from_dict({
    column: df_test[column]
    for column in ('document', 'full_text', 'tokens', 'trailing_whitespace')
}).map(tokenize_test_data, batched=True)

# Ensure tokenized_test is properly formatted
assert isinstance(tokenized_test, Dataset), "tokenized_test is not a Dataset object"


Map:   0%|          | 0/10 [00:00<?, ? examples/s]

In [31]:
# Make predictions
# NOTE(review): this runs the model held by `trainer`, not the `model` object
# reloaded from disk in the preceding cell.
predictions = trainer.predict(tokenized_test)

In [32]:
# Print predictions
# print(predictions)

In [33]:
# Reduce the per-class scores to the highest-scoring label id at each position (argmax over axis 2).
predicted_labels = np.argmax(predictions.predictions, axis=2)


In [34]:
# Build the prediction table: one row per original document token predicted as PII.
#
# The model scores sub-word pieces, so positions in `predicted_labels` also
# cover special tokens, continuation pieces and padding. `word_ids()` maps each
# position back to the index of its word in `df_test['tokens']`, and that word
# index is what the `token` column reports.
# NOTE: documents longer than MAXLEN sub-word tokens are truncated, so words
# past the cut-off receive no prediction.
records = []
for doc_pos, example_labels in enumerate(predicted_labels):
    # Positional access, so a non-default index on df_test cannot misalign rows.
    document_id = df_test['document'].iloc[doc_pos]

    # Re-encode this document on its own (no padding) to recover the
    # position -> word mapping, with the same truncation used for prediction.
    word_ids = tokenizer(
        list(df_test['tokens'].iloc[doc_pos]),
        is_split_into_words=True,
        truncation=True,
        max_length=MAXLEN,
    ).word_ids()

    previous_word = None
    # zip stops at the unpadded length, which drops trailing padding positions.
    for word_id, label_id in zip(word_ids, example_labels):
        is_first_piece = word_id is not None and word_id != previous_word
        previous_word = word_id
        if not is_first_piece:
            # Special token or continuation piece: the word's first piece decides its label.
            continue

        label = label_list[label_id]
        # Only include positive PII label values (excluding O)
        if label != 'O':
            records.append({
                'row_id': len(records),
                'document': document_id,
                'token': word_id,
                'label': label,
            })

# Create the frame in one step instead of growing it row by row with `.loc`.
predicted_df = pd.DataFrame(records, columns=['row_id', 'document', 'token', 'label'])

# Save predictions to file
predicted_df.to_csv('distil-bert-predicted_labels.csv', index=False)

In [40]:
# df_test