In [None]:
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer, TrainingArguments, DataCollatorForTokenClassification
from datasets import Dataset
import json
import re
from scipy.special import softmax
from spacy.lang.en import English
import os
import regex

In [None]:
# Load the competition test set. The context manager closes the file handle as
# soon as the JSON has been parsed (the previous bare `open(...)` leaked it).
with open(
    "/kaggle/input/pii-detection-removal-from-educational-data/test.json",
    encoding="utf-8",
) as f:
    data = json.load(f)

# Pivot the list of per-document records into the column-oriented layout that
# `Dataset.from_dict` takes: one list per field, aligned by document position.
ds = Dataset.from_dict({
    field: [record[field] for record in data]
    for field in ("full_text", "document", "tokens", "trailing_whitespace")
})

In [None]:
def tokenize(example, tokenizer):
    """Rebuild a document's text from its word tokens and run the tokenizer on it.

    Args:
        example: mapping with "tokens" (list of word strings) and
            "trailing_whitespace" (parallel list of flags saying whether a
            single space follows each word).
        tokenizer: callable invoked as
            ``tokenizer(text, return_offsets_mapping=True, truncation=True,
            max_length=3500)`` and returning a mapping.

    Returns:
        A dict holding every key/value produced by ``tokenizer`` plus
        "token_map": a list with one entry per character of the rebuilt text,
        giving the index of the word that character belongs to, or -1 for a
        space that was inserted between words.
    """
    pieces = []
    token_map = []

    words_and_flags = zip(example["tokens"], example["trailing_whitespace"])
    for word_idx, (word, has_space) in enumerate(words_and_flags):
        pieces.append(word)
        # One map entry per character of the word.
        token_map += [word_idx] * len(word)

        if has_space:
            pieces.append(" ")
            token_map.append(-1)

    encoded = tokenizer(
        "".join(pieces),
        return_offsets_mapping=True,
        truncation=True,
        max_length=3500,
    )

    return {**encoded, "token_map": token_map}

def find_span(target: list[str], document: list[str]) -> list[list[int]]:
    """Locate every non-overlapping occurrence of `target` inside `document`.

    Args:
        target: the token sequence to look for.
        document: the token sequence to search.

    Returns:
        A list of spans, scanning left to right. Each span is the list of
        consecutive `document` indices covered by one occurrence of `target`.
        An empty `target` (or a `document` shorter than `target`) yields [].

    The search compares a full window at every start position, so a failed
    partial match never hides a real match that begins inside it (for example
    ["a", "b"] is found in ["a", "a", "b"]).
    """
    target = list(target)
    width = len(target)
    if width == 0:
        return []

    spans = []
    start = 0
    last_start = len(document) - width
    while start <= last_start:
        if list(document[start:start + width]) == target:
            spans.append(list(range(start, start + width)))
            # Jump past the match so reported spans never overlap.
            start += width
        else:
            start += 1

    return spans

In [None]:
model_paths = [
    '/kaggle/input/deberta-large-lora',
    '/kaggle/input/deberta-large2/deberta-large',
    '/kaggle/input/deberta-base/deberta-base_2dataset_5e'
]
tokenizer = AutoTokenizer.from_pretrained(model_paths[0])

# Tokenize the dataset once, in a single process, with the first model's tokenizer.
# NOTE(review): every model below is fed these same input ids, which presumes
# all three checkpoints share one vocabulary - confirm.
ds = ds.map(tokenize, fn_kwargs={"tokenizer": tokenizer}, num_proc = 1)


ensemble_preds = []
model_weights = [6, 2.5, 1.5]
# zip() would silently drop a model if the two lists ever got out of step.
assert len(model_paths) == len(model_weights), "one weight per model is required"
# Normalise by the real sum of the weights so the ensemble output is a proper
# probability distribution (each row sums to 1). The previous hard-coded 5 did
# not match the weights above (sum 10), which doubled every "probability" and
# skewed any downstream comparison against a probability threshold.
total_weight = sum(model_weights)

for model_path, w in zip(model_paths, model_weights):
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    collator = DataCollatorForTokenClassification(tokenizer, pad_to_multiple_of = 16)
    model = AutoModelForTokenClassification.from_pretrained(model_path)
    args = TrainingArguments(
        "/kaggle/working/output",
        report_to="none",
        do_train = False,
        fp16 = True,
        per_device_eval_batch_size=1
    )

    trainer = Trainer(
        model=model,
        args=args,
        data_collator=collator,
        tokenizer=tokenizer,
    )

    # Convert logits to per-token class probabilities, then scale by the
    # model's ensemble weight.
    predictions = trainer.predict(ds).predictions
    weighted_predictions = softmax(predictions, axis = -1) * w
    ensemble_preds.append(weighted_predictions)

# Weighted mean of the per-model probabilities.
weighted_average_predictions = np.sum(ensemble_preds, axis=0) / total_weight

In [None]:
# Read the label vocabulary from the first model's config. The context manager
# closes the file handle (the previous bare `open(...)` leaked it).
with open("/kaggle/input/deberta-large-lora/config.json", encoding="utf-8") as f:
    config = json.load(f)
id2label = config["id2label"]

# Plain argmax over all classes.
preds = weighted_average_predictions.argmax(-1)
# Best class among indices 0..11 only, and the score of class index 12.
# NOTE(review): this presumes index 12 is the "O" label and that it is the last
# class - confirm against `id2label`.
preds_no_O = weighted_average_predictions[:,:,:12].argmax(-1)
only_O_preds = weighted_average_predictions[:,:,12]
threshold = 0.97
# Where the class-12 score is below the threshold, take the best of the other
# classes; otherwise keep the plain argmax.
predictions_averaged = np.where(only_O_preds < threshold, preds_no_O , preds)

In [None]:
pairs = set()  # (document, token) pairs already emitted from model predictions; set gives O(1) membership tests
processed = []
emails = []
phone_nums = []
urls = []     # not filled in this cell
streets = []  # not filled in this cell

# Rule-based helpers used below. They were referenced but never defined in the
# notebook, so a fresh "Restart & Run All" failed with NameError.
nlp = English()
# Same pattern the later B-EMAIL sanity check in this notebook uses.
email_regex = re.compile(r'[\w.+-]+@[\w-]+\.[\w.-]+')
# NOTE(review): the original phone pattern is not present anywhere in the
# notebook; this is a reconstruction - confirm it is the one that was tuned.
# It has exactly one capture group, so `findall` returns plain strings.
phone_num_regex = re.compile(r"(\(\d{3}\)\d{3}\-\d{4}\w*|\d{3}\.\d{3}\.\d{4})\s")
phone_pairs = set()  # (document, token) pairs already emitted by the phone rule

# For each document: thresholded predictions, character->token map, sub-word
# offsets, word tokens, document id and raw text.
for p, token_map, offsets, tokens, doc, full_text in zip(
    predictions_averaged,  # was `preds_final`, a name that is never defined
    ds["token_map"],
    ds["offset_mapping"],
    ds["tokens"],
    ds["document"],
    ds["full_text"]
):

    # Iterate through each sub-word prediction and its character offsets
    for token_pred, (start_idx, end_idx) in zip(p, offsets):
        label_pred = id2label[str(token_pred)]  # Predicted label from token
        # Entries whose offsets are (0, 0) carry no text; skip them.
        if start_idx + end_idx == 0:
            continue
        # A sub-word may begin on an inserted space (-1); step onto the word.
        if token_map[start_idx] == -1:
            start_idx += 1
        # Skip over tokens that consist only of whitespace.
        while start_idx < len(token_map) and tokens[token_map[start_idx]].isspace():
            start_idx += 1
        if start_idx >= len(token_map):
            break
        token_id = token_map[start_idx]  # Token ID at start index
        # Emails and phone numbers are handled by the rules further down, so
        # the model's predictions for these labels are ignored here.
        if label_pred in ("O", "B-EMAIL", "B-PHONE_NUM", "I-PHONE_NUM") or token_id == -1:
            continue

        # Keep only the first prediction that lands on each word token.
        pair = (doc, token_id)
        if pair not in pairs:
            processed.append({"document": doc, "token": token_id, "label": label_pred, "token_str": tokens[token_id]})
            pairs.add(pair)

    # email: any word token that is, in full, an email address
    for token_idx, token in enumerate(tokens):
        if email_regex.fullmatch(token) is not None:
            emails.append(
                {"document": doc, "token": token_idx, "label": "B-EMAIL", "token_str": token}
            )

    # phone number: regex over the raw text, then map each hit back to tokens.
    # The span loop is nested inside the match loop so that EVERY match is
    # recorded (previously only the last match of a document was kept).
    for match in phone_num_regex.findall(full_text):
        target = [t.text for t in nlp.tokenizer(match)]
        for matched_span in find_span(target, tokens):
            for intermediate, token_idx in enumerate(matched_span):
                # The same number can be matched more than once; emit each
                # token a single time.
                if (doc, token_idx) in phone_pairs:
                    continue
                phone_pairs.add((doc, token_idx))
                prefix = "I" if intermediate else "B"
                phone_nums.append(
                    {"document": doc, "token": token_idx, "label": f"{prefix}-PHONE_NUM", "token_str": tokens[token_idx]}
                )

In [None]:
def _fix_bio_format(entries):
    """Rewrite label prefixes in place so each entity run is B- followed by I-.

    An entry becomes "I-<type>" when the entry just before it in `entries` has
    the same entity type, the same document and the immediately preceding token
    index; every other entry becomes "B-<type>". Only the two-character prefix
    of each label is changed.
    """
    for pos, entry in enumerate(entries):
        entity_type = entry['label'][2:]
        continues_previous = (
            pos > 0
            and entries[pos - 1]['label'][2:] == entity_type
            and entries[pos - 1]['token'] == entry['token'] - 1
            and entries[pos - 1]['document'] == entry['document']
        )
        entry['label'] = ('I-' if continues_previous else 'B-') + entity_type


def _should_drop(entry, is_lone_b):
    """Return True when a prediction looks implausible and should be discarded.

    Args:
        entry: dict with at least 'label' and 'token_str'.
        is_lone_b: True when the entry is a B- label that is NOT followed by an
            I- entry, i.e. a single-token entity.
    """
    label = entry['label']
    token_str = entry['token_str']

    # Checks that only apply to single-token entities.
    if is_lone_b:
        if label == 'B-ID_NUM' and (len(token_str) <= 3 or not re.search(r'\d{2}', token_str)):
            return True
        # There are cases where "." is labeled as NAME_STUDENT.
        if label == 'B-NAME_STUDENT' and len(token_str) <= 1:
            return True
        if label == 'B-PHONE_NUM' and len(token_str) <= 4:
            return True
        if label == 'B-EMAIL' and not re.search(r'[\w.+-]+@[\w-]+\.[\w.-]+', token_str):
            return True

    # Checks that apply regardless of what follows.
    if label == 'B-NAME_STUDENT':
        # One uppercase letter followed by lowercase letters (any script).
        return not regex.match(r'^\p{Lu}\p{Ll}+$', token_str)
    if label == 'I-NAME_STUDENT':
        # Same shape, additionally allowing "." (e.g. "Jr."). Unicode-aware to
        # agree with the B- check above; the previous ASCII-only [A-Z][a-z.]
        # discarded accented continuation names.
        return not regex.match(r'^\p{Lu}[\p{Ll}.]+$', token_str)
    if label == 'B-URL_PERSONAL':
        return len(token_str) <= 1
    return False


# Fix BIO Format
_fix_bio_format(processed)

# Drop implausible predictions. Each entry is judged against its neighbour in
# the list as it stood before filtering; labels are not modified during this
# pass, so a single sweep gives the same result as deleting in place.
kept = []
for pos, entry in enumerate(processed):
    followed_by_inside = (
        pos + 1 < len(processed) and processed[pos + 1]['label'].startswith('I-')
    )
    is_lone_b = entry['label'].startswith('B-') and not followed_by_inside
    if _should_drop(entry, is_lone_b):
        print(entry)  # show what is being discarded
    else:
        kept.append(entry)

delete_count = len(processed) - len(kept)
processed[:] = kept  # update in place so `processed` stays the same list object

# Fix BIO Format AGAIN: removals can leave an I- entry without its B- opener.
_fix_bio_format(processed)

In [None]:
# Combine model predictions with the rule-based detections. The columns are
# fixed up front so that an empty result still yields a frame with the expected
# columns (otherwise the column selection below raises KeyError).
df = pd.DataFrame(
    processed + phone_nums + emails + urls,
    columns=["document", "token", "label", "token_str"],
)

# The same (document, token, label) can be produced by more than one source
# list; keep a single row for each.
df = df.drop_duplicates(subset=["document", "token", "label"]).reset_index(drop=True)

# Assign each row a unique 'row_id'
df["row_id"] = list(range(len(df)))

# Write the submission file
df[["row_id", "document", "token", "label"]].to_csv("submission.csv", index=False)
df.head()

NOTE: The training notebooks are still the same as the previous milestone