## 📔 Inference Notebook 
soon™️

## 🏟️ Credits
- @Valentin Werner (https://www.kaggle.com/code/valentinwerner/915-deberta3base-training?scriptVersionId=161278765)
- @Nicholas Broad (https://www.kaggle.com/datasets/nbroad/pii-dd-mistral-generated)
- @Joseph Josia (https://www.kaggle.com/code/takanashihumbert/piidd-deberta-model-starter-training)

## 📥 Imports

In [None]:
!pip install evaluate seqeval -q

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import random, os
import torch
import itertools
import gc

from typing import Any
from transformers import AutoTokenizer
from datasets import Dataset
from sklearn.model_selection import train_test_split
from seqeval.metrics import recall_score, precision_score
from transformers import (
    AutoModelForTokenClassification,
    TrainingArguments,
    Trainer,
    DataCollatorForTokenClassification,
)

## 📝 Config
- 1024 max length has been working well for me
- For each experiment, the SEED value was always equal to 42

In [None]:
def seed_everything(seed: int) -> None:
    """Seed every random number generator used in this notebook.

    The seed is exported as the ``PL_GLOBAL_SEED`` environment variable and
    then applied, in order, to Python's ``random``, NumPy, PyTorch (CPU) and
    PyTorch (all CUDA devices).

    Args:
        seed: Seed value shared by all generators.
    """
    os.environ.update(PL_GLOBAL_SEED=str(seed))

    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

In [None]:
# Kaggle input directories shared by several of the paths below.
_DEBERTA_DIR = "/kaggle/input/huggingfacedebertav3variants"
_COMPETITION_DIR = "/kaggle/input/pii-detection-removal-from-educational-data"

# Checkpoint directory per DeBERTa-v3 size; insertion order is the training order.
MODEL_PATHS = {
    size: f"{_DEBERTA_DIR}/deberta-v3-{size}" for size in ("small", "base")
}

# Per model size: [per-device train batch size, gradient accumulation steps].
BATCH_SIZE = {
    "base": [4, 2],
    "small": [8, 1],
}

# Competition files.
TRAIN_JSON = f"{_COMPETITION_DIR}/train.json"
TEST_JSON = f"{_COMPETITION_DIR}/test.json"
SAMPLE_CSV = f"{_COMPETITION_DIR}/sample_submission.csv"

# External, generated dataset (see Credits).
NICHOLAS_JSON = "/kaggle/input/pii-dd-mistral-generated/mixtral-8x7b-v1.json"

# Maximum number of tokens passed to the tokenizer, and the global RNG seed.
MAX_LENGTH = 1024
SEED = 42

In [None]:
seed_everything(SEED)

## 📊 (Simple) EDA

In [None]:
df = pd.read_json(TRAIN_JSON)
df.head(10)

In [None]:
nicholas_df = pd.read_json(NICHOLAS_JSON)
nicholas_df.head(10)

In [None]:
df.isnull().sum()

In [None]:
nicholas_df.isnull().sum()

In [None]:
df["labels"].explode().unique()

In [None]:
df["labels"].explode().value_counts()

In [None]:
nicholas_df["labels"].explode().value_counts()

In [None]:
# Histogram (with KDE overlay) of the string length of each document's full text.
fig, ax = plt.subplots(figsize=(10, 6))
document_lengths = df["full_text"].str.len()
sns.histplot(document_lengths, kde=True, ax=ax)
ax.set_title("Distribution of Document Lengths")
ax.set_xlabel("Length of Text")
ax.set_ylabel("Number of Documents")
plt.show()

In [None]:
# Same document-length histogram, for the external Nicholas dataset.
fig, ax = plt.subplots(figsize=(10, 6))
nicholas_document_lengths = nicholas_df["full_text"].str.len()
sns.histplot(nicholas_document_lengths, kde=True, ax=ax)
ax.set_title("Distribution of Document Lengths (Nicholas)")
ax.set_xlabel("Length of Text (Nicholas)")
ax.set_ylabel("Number of Documents (Nicholas)")
plt.show()

In [None]:
# Histogram (with KDE overlay) of the number of tokens per document.
fig, ax = plt.subplots(figsize=(10, 6))
tokens_per_document = df["tokens"].map(len)
sns.histplot(tokens_per_document, kde=True, ax=ax)
ax.set_title("Distribution of Token Lengths")
ax.set_xlabel("Number of Tokens")
ax.set_ylabel("Number of Documents")
plt.show()

In [None]:
# Histogram (with KDE overlay) of the number of tokens per document in the
# external Nicholas dataset. The title carries the "(Nicholas)" suffix like the
# other Nicholas plots so it cannot be confused with the competition-data plot.
plt.figure(figsize=(10, 6))
sns.histplot(nicholas_df["tokens"].apply(len), kde=True)
plt.title("Distribution of Token Lengths (Nicholas)")
plt.xlabel("Number of Tokens (Nicholas)")
plt.ylabel("Number of Documents (Nicholas)")
plt.show()

In [None]:
# Bar chart: how many documents have a given number of tokens that are
# followed by whitespace (x tick labels are suppressed).
fig, ax = plt.subplots(figsize=(10, 6))
trailing_whitespace_count = df["trailing_whitespace"].map(
    lambda flags: flags.count(True)
)
count_frequencies = trailing_whitespace_count.value_counts().sort_index()
count_frequencies.plot(kind="bar", xticks=[], ax=ax)
ax.set_title("Distribution of Trailing Whitespace Count")
ax.set_xlabel("Trailing Whitespace Count")
ax.set_ylabel("Frequency")
plt.show()

In [None]:
# Same trailing-whitespace bar chart, for the external Nicholas dataset.
fig, ax = plt.subplots(figsize=(10, 6))
trailing_whitespace_count = nicholas_df["trailing_whitespace"].map(
    lambda flags: flags.count(True)
)
count_frequencies = trailing_whitespace_count.value_counts().sort_index()
count_frequencies.plot(kind="bar", xticks=[], ax=ax)
ax.set_title("Distribution of Trailing Whitespace Count (Nicholas)")
ax.set_xlabel("Trailing Whitespace Count (Nicholas)")
ax.set_ylabel("Frequency (Nicholas)")
plt.show()

## ⬇️ Downsampling
- The dataset contains a large number of negative examples (documents whose labels are all "O")
- I intend to use only one-third of the available negative examples
- The Nicholas dataset will also be used

In [None]:
# Every distinct label that occurs in the competition data, in sorted order.
label_list = sorted({label for doc_labels in df.labels for label in doc_labels})
label_list

In [None]:
# Keep an untouched copy so the before/after shapes can be reported.
df_original = df.copy(deep=True)

# A document is "positive" when at least one of its labels is not "O".
has_pii = df["labels"].apply(
    lambda doc_labels: any(tag != "O" for tag in doc_labels)
)
negative = df[~has_pii]

# Keep the first third of the negative documents (in their current order)
# and drop the remaining two thirds.
n_negative_kept = negative.shape[0] // 3
df = df.drop(negative.index[n_negative_kept:])
print(f"From {df_original.shape} to {df.shape}")

In [None]:
# Append the external Nicholas dataset to the downsampled competition data.
# ignore_index=True gives the combined frame a fresh 0..n-1 index; without it
# the two frames' original index labels are kept and can repeat, which makes
# label-based row selection on df_final ambiguous.
df_final = pd.concat([df, nicholas_df], ignore_index=True)
print(f"From {df.shape} to {df_final.shape}")

## 🏷️ Label Mapping

In [None]:
# Label name -> integer id, following the order of label_list.
label2id = dict(zip(label_list, range(len(label_list))))
# Inverse mapping: integer id -> label name.
id2label = dict(zip(label2id.values(), label2id.keys()))
label2id

In [None]:
id2label

In [None]:
df_final["labels"]

## 📚 Creating Dataset
- I want to train on all data, so I am not going to use an eval set

In [None]:
# All of df_final is used for training. deep=False requests a shallow copy,
# i.e. a new DataFrame object without duplicating the underlying data.
train = df_final.copy(deep=False)
# Hold-out split, disabled because no validation set is used in this run:
# train, val = train_test_split(df_final, test_size=0.15, random_state=SEED, shuffle=True)
# print(f"Train size: {train.shape} | Validation size: {val.shape}")

In [None]:
def create_dataset(df: pd.DataFrame) -> Dataset:
    """Build a ``Dataset`` from the columns of *df* that the pipeline uses.

    Args:
        df: Frame with ``document``, ``full_text``, ``tokens``,
            ``trailing_whitespace`` and ``labels`` columns.

    Returns:
        A ``Dataset`` holding those five columns; ``document`` is cast to
        ``str``, the other columns are passed through unchanged.
    """
    passthrough = ("full_text", "tokens", "trailing_whitespace", "labels")
    columns = {"document": df["document"].astype(str)}
    for column_name in passthrough:
        columns[column_name] = df[column_name]
    return Dataset.from_dict(columns)

In [None]:
train_dataset = create_dataset(train)
# val_dataset = create_dataset(val)

In [None]:
print(train_dataset)

## ♟️ Data Tokenization Function

In [None]:
def tokenize(dataset: Dataset, tokenizer: AutoTokenizer) -> dict[str, Any]:
    """Tokenize one example and project its word-level labels onto model tokens.

    Despite the parameter name, ``dataset`` is consumed as a single example:
    its ``"tokens"``, ``"trailing_whitespace"`` and ``"labels"`` entries are
    zipped element by element.

    The text is rebuilt by concatenating the tokens, inserting one space after
    every token whose trailing-whitespace flag is set. A per-character label
    list is built alongside it (inserted spaces are labelled ``"O"``). Each
    model token then receives the label of the first character of its offset
    span, after skipping at most one leading whitespace character.

    Args:
        dataset: Mapping with parallel ``"tokens"``, ``"trailing_whitespace"``
            and ``"labels"`` sequences.
        tokenizer: Callable tokenizer that accepts ``truncation``,
            ``max_length`` and ``return_offsets_mapping`` and whose result
            exposes ``offset_mapping`` and ``input_ids``.

    Returns:
        The tokenizer output extended with ``"labels"`` (one label id per
        token) and ``"length"`` (number of input ids).
    """
    pieces, char_labels = [], []

    for token, ws, label in zip(
        dataset["tokens"], dataset["trailing_whitespace"], dataset["labels"]
    ):
        pieces.append(token)
        char_labels.extend([label] * len(token))

        if ws:
            pieces.append(" ")
            char_labels.append("O")

    # Join once and reuse for both tokenization and offset lookups.
    text = "".join(pieces)

    tokenized_input = tokenizer(
        text,
        truncation=True,
        max_length=MAX_LENGTH,
        return_offsets_mapping=True,
    )

    outside_id = label2id["O"]
    token_labels = []

    for start_idx, end_idx in tokenized_input.offset_mapping:
        # A (0, 0) span is treated as a token without source text.
        if start_idx + end_idx == 0:
            token_labels.append(outside_id)
            continue

        # The token's span may begin with whitespace; label it by the
        # character that follows.
        if start_idx < len(text) and text[start_idx].isspace():
            start_idx += 1

        # A whitespace-only token at the very end of the text would otherwise
        # index past the last character; it carries no entity, so label it "O".
        if start_idx >= len(char_labels):
            token_labels.append(outside_id)
            continue

        token_labels.append(label2id[char_labels[start_idx]])

    length = len(tokenized_input.input_ids)

    return {**tokenized_input, "labels": token_labels, "length": length}

## 🧮 Competition Metrics
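- We are not using the normal F1 score: the metric computed below is F-beta with beta = 5, which weights recall much more heavily than precision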
- There are plenty of discussions already explaining this (https://www.kaggle.com/competitions/pii-detection-removal-from-educational-data/discussion/470024)

In [None]:
def compute_metrics(p) -> dict[str, float]:
    """Compute precision, recall and the F-beta (beta = 5) score.

    Args:
        p: A ``(predictions, labels)`` pair. ``predictions`` is arg-maxed
            over axis 2 to obtain one label id per position (presumably
            shaped batch x sequence x label; confirm against the caller).
            Positions whose label id is ``-100`` are ignored.

    Returns:
        A dict with ``"precision"``, ``"recall"`` and ``"f1"``. The ``"f1"``
        entry holds the F5 score; the key name is kept because the training
        arguments refer to the metric as ``"f1"``. The score is ``0.0`` when
        precision and recall are both zero.
    """
    scores, labels = p
    predictions = np.argmax(scores, axis=2)

    true_predictions, true_labels = [], []
    for prediction, label in zip(predictions, labels):
        # Keep only the positions that carry a real label.
        kept = [
            (pred_id, label_id)
            for pred_id, label_id in zip(prediction, label)
            if label_id != -100
        ]
        true_predictions.append([label_list[pred_id] for pred_id, _ in kept])
        true_labels.append([label_list[label_id] for _, label_id in kept])

    recall = recall_score(y_true=true_labels, y_pred=true_predictions)
    precision = precision_score(y_true=true_labels, y_pred=true_predictions)

    beta_squared = 5 * 5
    denominator = beta_squared * precision + recall
    # Guard the degenerate case (no correct predictions at all) instead of
    # dividing by zero.
    if denominator > 0:
        f5_score = ((1 + beta_squared) * (precision * recall)) / denominator
    else:
        f5_score = 0.0

    return {
        "precision": precision,
        "recall": recall,
        "f1": f5_score,
    }

## 💾 Training and Saving Models
- I decided to train both a small and a base model
- The "for" loop was used because of the desire to avoid running the notebook twice
- The values are not really tuned, I did some naive experiments (such as manually changing the values)

In [None]:
# Fine-tune one model per entry of MODEL_PATHS, saving each model together
# with its tokenizer, then release the memory before the next iteration.
for name, path in MODEL_PATHS.items():
    print(f"Fine-tuning deberta-v3-{name}")
    tokenizer = AutoTokenizer.from_pretrained(path)

    # tokenize() is applied example by example, in 3 worker processes.
    tokenized_train = train_dataset.map(
        tokenize, fn_kwargs={"tokenizer": tokenizer}, num_proc=3
    )
    # tokenized_val = val_dataset.map(tokenize, fn_kwargs={"tokenizer": tokenizer}, num_proc=3)

    print(tokenized_train)

    data_collator = DataCollatorForTokenClassification(
        tokenizer=tokenizer, pad_to_multiple_of=16
    )

    model = AutoModelForTokenClassification.from_pretrained(
        path,
        num_labels=len(label_list),
        id2label=id2label,
        label2id=label2id,
        ignore_mismatched_sizes=True,
    )

    # BATCH_SIZE stores [per-device batch size, gradient accumulation steps].
    per_device_batch_size, accumulation_steps = BATCH_SIZE[name]

    # Evaluation is disabled (do_eval=False, no eval dataset); the
    # commented-out options are the ones to restore when a validation set
    # is used again.
    training_args = TrainingArguments(
        do_eval=False,
        output_dir="output",
        weight_decay=0.01,
        fp16=True,
        learning_rate=2e-5,
        per_device_train_batch_size=per_device_batch_size,
        gradient_accumulation_steps=accumulation_steps,
        # per_device_eval_batch_size=1,
        num_train_epochs=3,
        metric_for_best_model="f1",
        warmup_ratio=0.1,
        lr_scheduler_type="cosine",
        greater_is_better=True,
        # load_best_model_at_end=True,
        push_to_hub=False,
        report_to="none",
        save_total_limit=1,
        overwrite_output_dir=True,
        # evaluation_strategy="epoch",
        logging_strategy="steps",
        # save_strategy="epoch",
        logging_steps=20,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train,
        # eval_dataset=tokenized_val,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    trainer.train()

    # Model and tokenizer are saved side by side in the same directory.
    save_dir = f"deberta-{name}-v3-{MAX_LENGTH}-downsample-external-data"
    trainer.save_model(save_dir)
    tokenizer.save_pretrained(save_dir)

    # Drop the references, collect garbage first (so tensors kept alive only
    # by reference cycles are actually freed), and only then return the
    # cached CUDA memory.
    del model, trainer, tokenizer
    gc.collect()
    torch.cuda.empty_cache()