In [1]:
import numpy as np 
import pandas as pd 
import re
import json
import os
import argparse
from itertools import chain
from spacy.lang.en import English
from pathlib import Path
from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer, TrainingArguments, DataCollatorForTokenClassification
from datasets import Dataset
import gc
import torch
from scipy.special import softmax

2024-04-22 19:50:17.046995: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-22 19:50:17.047167: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-22 19:50:17.181807: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
# Directory holding the input JSON files. The trailing slash matters: file names are
# appended to this value by plain string concatenation.
BASE_PATH = "/kaggle/input/pii-detection-removal-from-educational-data/"

In [3]:
# Read the training documents into a DataFrame and preview the first rows.
train_path = f"{BASE_PATH}train.json"
df = pd.read_json(train_path)
df.head()

Unnamed: 0,document,full_text,tokens,trailing_whitespace,labels
0,7,Design Thinking for innovation reflexion-Avril...,"[Design, Thinking, for, innovation, reflexion,...","[True, True, True, True, False, False, True, F...","[O, O, O, O, O, O, O, O, O, B-NAME_STUDENT, I-..."
1,10,Diego Estrada\n\nDesign Thinking Assignment\n\...,"[Diego, Estrada, \n\n, Design, Thinking, Assig...","[True, False, False, True, True, False, False,...","[B-NAME_STUDENT, I-NAME_STUDENT, O, O, O, O, O..."
2,16,Reporting process\n\nby Gilberto Gamboa\n\nCha...,"[Reporting, process, \n\n, by, Gilberto, Gambo...","[True, False, False, True, True, False, False,...","[O, O, O, O, B-NAME_STUDENT, I-NAME_STUDENT, O..."
3,20,Design Thinking for Innovation\n\nSindy Samaca...,"[Design, Thinking, for, Innovation, \n\n, Sind...","[True, True, True, False, False, True, False, ...","[O, O, O, O, O, B-NAME_STUDENT, I-NAME_STUDENT..."
4,56,Assignment: Visualization Reflection Submitt...,"[Assignment, :, , Visualization, , Reflecti...","[False, False, False, False, False, False, Fal...","[O, O, O, O, O, O, O, O, O, O, O, O, B-NAME_ST..."


In [4]:
# Forwarded to the tokenizer as ``max_length``; note that truncation is switched off
# in the call below.
INFERENCE_MAX_LENGTH = 3700


def tokenize(example, tokenizer):
    """Rebuild a document's text from its tokens and run the tokenizer on it.

    Args:
        example: Mapping with two parallel sequences: ``"tokens"`` (strings) and
            ``"trailing_whitespace"`` (truthy when a single space follows the token).
        tokenizer: Callable invoked as ``tokenizer(text, return_offsets_mapping=True,
            truncation=False, max_length=INFERENCE_MAX_LENGTH)``; it must return a
            mapping.

    Returns:
        dict: every key/value of the tokenizer output plus ``"token_map"``, a list
        with one entry per character of the rebuilt text. Each entry is the index of
        the token the character came from, or ``-1`` for an inserted space.
    """
    pieces = []
    char_to_token = []
    for position, (word, has_space) in enumerate(
        zip(example["tokens"], example["trailing_whitespace"])
    ):
        pieces.append(word)
        # One map entry per character of the token.
        char_to_token += [position] * len(word)
        if has_space:
            pieces.append(" ")
            char_to_token.append(-1)

    encoded = tokenizer(
        "".join(pieces),
        return_offsets_mapping=True,
        truncation=False,
        max_length=INFERENCE_MAX_LENGTH,
    )
    return {**encoded, "token_map": char_to_token}

In [5]:
test_path = BASE_PATH + "test.json"
# Read through a context manager so the file handle is closed as soon as parsing ends
# (the bare ``json.load(open(...))`` form leaves it open until garbage collection).
with open(test_path) as test_file:
    test_data = json.load(test_file)

# Column-oriented view of the test documents; only the fields used later are kept.
ds = Dataset.from_dict({
    "full_text": [x["full_text"] for x in test_data],
    "document": [x["document"] for x in test_data],
    "tokens": [x["tokens"] for x in test_data],
    "trailing_whitespace": [x["trailing_whitespace"] for x in test_data],
})

# Checkpoint directory -> ensemble weight. Insertion order matters: later cells refer to
# the models by their position in this dict.
model_paths = {'/kaggle/input/pii-deberta-models/cola-de-piiranha' : 1,
              '/kaggle/input/pii-deberta-models/cuerpo-de-piiranha' : 4, #3
              '/kaggle/input/pii-deberta-models/cabeza-de-piiranha' : 4, #4
              '/kaggle/input/pii-deberta-models/cabeza-del-piinguuino' : 6 #6
              }

# The dataset is tokenized once, with the first checkpoint's tokenizer.
# NOTE(review): this presumes every checkpoint shares the same vocabulary - confirm.
first_model_path = next(iter(model_paths))
tokenizer = AutoTokenizer.from_pretrained(first_model_path)
ds = ds.map(tokenize, fn_kwargs={"tokenizer": tokenizer}, num_proc = 2)
# Weights as a list so they can be looked up by model position.
weights = list(model_paths.values())

Map (num_proc=2):   0%|          | 0/10 [00:00<?, ? examples/s]

In [6]:
# NOTE(review): ``all_preds`` and ``total_weight`` are not referenced anywhere else in the
# visible notebook; they are kept in case other code relies on them.
all_preds = []
total_weight = sum(model_paths.values())
# Per-model predictions are written to disk so that only one model's outputs need to be
# held in memory at a time.
intermediate_dir = './preds_directory'
os.makedirs(intermediate_dir, exist_ok=True)
for idx, (model_path, weight) in enumerate(model_paths.items()):
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelForTokenClassification.from_pretrained(model_path)
    collator = DataCollatorForTokenClassification(tokenizer, pad_to_multiple_of=16)
    args = TrainingArguments(
        ".",
        per_device_eval_batch_size=1,
        report_to="none",
    )
    trainer = Trainer(
        model=model,
        args=args,
        data_collator=collator,
        tokenizer=tokenizer,
    )
    predictions = trainer.predict(ds).predictions
    # Convert logits to probabilities over the label axis and pre-multiply by the
    # model's ensemble weight; the division by the weight sum happens at aggregation.
    weighted_predictions = softmax(predictions, axis=-1) * weight
    np.save(os.path.join(intermediate_dir, f'model_{idx}.npy'), weighted_predictions)
    del model, trainer, tokenizer, predictions, weighted_predictions
    # Collect garbage first: tensors kept alive only by reference cycles are freed by
    # gc.collect(), and only then can empty_cache() hand their cached blocks back.
    gc.collect()
    torch.cuda.empty_cache()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [7]:
aggregated_predictions = None
# Two-level ensemble: models inside a group are combined by their weights, then the
# groups themselves are averaged with equal weight. Indices are positions in the saved
# ``model_{idx}.npy`` files (and in ``weights``).
model_group = [[0,1,2], [3]]
for grp in model_group:
    preds = None
    model_weights = 0
    for idx in grp:
        # Saved arrays are already multiplied by the model's weight.
        weighted_predictions = np.load(os.path.join(intermediate_dir, f'model_{idx}.npy'))
        preds = weighted_predictions if preds is None else preds + weighted_predictions
        model_weights += weights[idx]
    # Normalise by the group's weight sum to get the group's weighted mean probability.
    preds /= model_weights
    aggregated_predictions = preds if aggregated_predictions is None else preds + aggregated_predictions
weighted_average_predictions = aggregated_predictions / len(model_group)

# ``model_path`` is whatever the inference loop assigned last, so the label map is read
# from the final checkpoint. The file is opened in a ``with`` block so that the handle is
# closed deterministically.
with open(Path(model_path) / "config.json") as config_file:
    config = json.load(config_file)
id2label = config["id2label"]

preds = weighted_average_predictions.argmax(-1)
# NOTE(review): the slices below hard-code index 12 as the "O" class and assume it is the
# last label - confirm against ``id2label``.
preds_without_O = weighted_average_predictions[:,:,:12].argmax(-1)
O_preds = weighted_average_predictions[:,:,12] # 12th column
# Unless "O" is at least this probable, fall back to the best non-"O" label.
threshold = 0.975
preds_final = np.where(O_preds < threshold, preds_without_O , preds)
triplets = []
pairs = set()
processed = []

for p, token_map, offsets, tokens, doc in zip(preds_final, ds["token_map"], ds["offset_mapping"], ds["tokens"], ds["document"]):
    for token_pred, (start_idx, end_idx) in zip(p, offsets):
        # JSON object keys are strings, hence the str() conversion.
        label_pred = id2label[str(token_pred)]
        # Offsets of (0, 0) carry no text span (presumably special tokens) - skip them.
        if start_idx + end_idx == 0:
            continue
        # Step over an inserted whitespace character at the start of the sub-token.
        if token_map[start_idx] == -1:
            start_idx += 1
        # Advance past tokens that consist solely of whitespace.
        while start_idx < len(token_map) and tokens[token_map[start_idx]].isspace():
            start_idx += 1
        if start_idx >= len(token_map):
            break
        token_id = token_map[start_idx]
        # These labels are dropped from the model output; B-EMAIL and the PHONE_NUM
        # labels are produced by the regex-based cells of this notebook instead.
        if label_pred in ("O", "B-EMAIL", "B-PHONE_NUM", "I-PHONE_NUM") or token_id == -1:
            continue
        # Only the first prediction for each (document, token) pair is kept.
        pair = (doc, token_id)
        if pair not in pairs:
            processed.append({"document": doc, "token": token_id, "label": label_pred, "token_str": tokens[token_id]})
            pairs.add(pair)

In [8]:
# English spaCy language object; its tokenizer is used further down to split
# regex-matched phone numbers into tokens.
nlp = English()
def find_span(target: list[str], document: list[str]) -> list[list[int]]:
    """Locate every non-overlapping occurrence of ``target`` inside ``document``.

    Args:
        target: Token sequence to look for.
        document: Token sequence to search in.

    Returns:
        A list of spans, in document order. Each span is the list of consecutive
        ``document`` indices covered by one occurrence of ``target``. An empty
        ``target`` yields no spans.
    """
    needle = list(target)
    width = len(needle)
    # An empty target has no meaningful position (and used to raise IndexError).
    if width == 0:
        return []

    spans = []
    start = 0
    last_start = len(document) - width
    while start <= last_start:
        if list(document[start:start + width]) == needle:
            spans.append(list(range(start, start + width)))
            # Resume after the match so occurrences never overlap.
            start += width
        else:
            # Every position is tried as a potential start. A scan that resets on a
            # mismatch without retrying the failing token misses matches such as
            # ["a", "b"] inside ["a", "a", "b"].
            start += 1
    return spans

In [9]:
# Re-read the raw test documents; the ``with`` block closes the file handle right after
# parsing instead of leaving it to the garbage collector.
with open(test_path) as test_file:
    data = json.load(test_file)

# Matched against individual tokens with ``fullmatch``.
email_regex = re.compile(r'[\w.+-]+@[\w-]+\.[\w.-]+')
# Matches "(ddd)ddd-dddd" (optionally followed by word characters) or "ddd.ddd.dddd".
# NOTE(review): the trailing ``\s`` means a number at the very end of the text is not
# matched - confirm whether that is intended.
phone_num_regex = re.compile(r"(\(\d{3}\)\d{3}\-\d{4}\w*|\d{3}\.\d{3}\.\d{4})\s")
# Accumulators filled by the extraction loop.
emails = []
phone_nums = []

In [10]:
# Rule-based extraction of e-mail addresses and phone numbers from every test document.
for _data in data:
    # E-mails: a token counts only if the whole token matches the pattern.
    for token_idx, token in enumerate(_data["tokens"]):
        if email_regex.fullmatch(token) is not None:
            emails.append(
                {"document": _data["document"], "token": token_idx, "label": "B-EMAIL", "token_str": token}
            )

    # Phone numbers: search the raw text, then map each hit back onto token indices.
    matches = phone_num_regex.findall(_data["full_text"])
    # ``dict.fromkeys`` drops repeated match strings while keeping order; find_span
    # already returns every occurrence of a string, so repeats would only add
    # duplicate rows.
    for match in dict.fromkeys(matches):
        target = [t.text for t in nlp.tokenizer(match)]
        matched_spans = find_span(target, _data["tokens"])
        # This loop must sit inside the ``match`` loop: one level further out, only the
        # spans of the last match in each document would be recorded.
        for matched_span in matched_spans:
            for intermediate, token_idx in enumerate(matched_span):
                # First token of a span is tagged B-, the remaining ones I-.
                prefix = "I" if intermediate else "B"
                phone_nums.append(
                    {"document": _data["document"], "token": token_idx, "label": f"{prefix}-PHONE_NUM",
                     "token_str": _data["tokens"][token_idx]}
                )

In [11]:
# Combine model predictions with the regex-based ones. Declaring the columns explicitly
# keeps the frame well-formed when there are no predictions at all; without them an
# empty record list gives a column-less frame and the selection below raises KeyError.
df = pd.DataFrame(
    processed + phone_nums + emails,
    columns=["document", "token", "label", "token_str"],
)
# Sequential identifier, one per submitted row.
df["row_id"] = range(len(df))
display(df.head(100))

# ``token_str`` is kept only for inspection and is left out of the written file.
df[["row_id", "document", "token", "label"]].to_csv("submission.csv", index=False)

Unnamed: 0,document,token,label,token_str,row_id
0,7,9,B-NAME_STUDENT,Nathalie,0
1,7,10,I-NAME_STUDENT,Sylla,1
2,7,482,B-NAME_STUDENT,Nathalie,2
3,7,483,I-NAME_STUDENT,Sylla,3
4,7,741,B-NAME_STUDENT,Nathalie,4
5,7,742,I-NAME_STUDENT,Sylla,5
6,10,0,B-NAME_STUDENT,Diego,6
7,10,1,I-NAME_STUDENT,Estrada,7
8,10,464,B-NAME_STUDENT,Diego,8
9,10,465,I-NAME_STUDENT,Estrada,9
