In [1]:
## Dataset - llama3

In [2]:
import gc
gc.enable()

import os
import numpy as np  
import pandas as pd  
import torch
import json
import joblib
from torch import nn
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, AutoModel, AutoConfig
from tqdm import tqdm
from scipy.special import softmax
import datasets  ## Huggingface Datasets

from unidecode import unidecode

os.environ['CUDA_VISIBLE_DEVICES'] = "0,1"

%env TOKENIZERS_PARALLELISM=false


env: TOKENIZERS_PARALLELISM=false


In [3]:
from spacy.lang.en import English

en_tokenizer = English().tokenizer


def tokenize_with_spacy(text, tokenizer=en_tokenizer):
    """Split `text` into tokens and record which tokens are followed by whitespace.

    Args:
        text: string to tokenize.
        tokenizer: callable returning an iterable of token objects exposing
            `.text` and `.whitespace_`; defaults to the blank English tokenizer.

    Returns:
        dict with two parallel lists: "tokens" (token strings) and
        "trailing_whitespace" (booleans).
    """
    doc = tokenizer(text)
    return dict(
        tokens=list(tok.text for tok in doc),
        # `whitespace_` is a string; a non-empty one means trailing whitespace.
        trailing_whitespace=list(bool(tok.whitespace_) for tok in doc),
    )

In [4]:
class Config:
    """Settings read by the dataset, collator, data loader and model in this notebook."""

    # --- checkpoints -------------------------------------------------------
    # Backbone directory passed to AutoConfig / AutoModel.
    model = "/kaggle/input/huggingfacedebertav3variants/deberta-v3-large"
    # Tokenizer directory; the tokenizer is loaded once, when this class body runs.
    tokenizer_path = "/kaggle/input/pii-data-detection-deberta-v3-large/pytorch/5folds/1/tokenizer/"
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_path, use_fast=True)

    # --- model / sequence --------------------------------------------------
    num_labels = 13                  # width of the classification head
    max_len = 5000                   # truncation length used when tokenizing
    gradient_checkpointing = False

    # --- data loading ------------------------------------------------------
    batch_size = 4
    num_workers = 2


CFG = Config()

# Use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [5]:
# Directory holding the five fold checkpoints and the pickled model config.
_WEIGHTS_DIR = '/kaggle/input/pii-data-detection-deberta-v3-large/pytorch/5folds/1'

# One "best" checkpoint per cross-validation fold (fold0 .. fold4).
model_paths = [
    f'{_WEIGHTS_DIR}/microsoft-deberta-v3-large_fold{fold}_best.pth'
    for fold in range(5)
]
config_path = f'{_WEIGHTS_DIR}/config.pth'

In [6]:
# Read the competition test documents. The context manager guarantees the file
# handle is closed (the previous `json.load(open(...))` left it open).
with open('/kaggle/input/pii-detection-removal-from-educational-data/test.json', encoding='utf-8') as test_file:
    test = json.load(test_file)
sub = pd.read_csv('/kaggle/input/pii-detection-removal-from-educational-data/sample_submission.csv')

# NOTE(review): torch.load unpickles the file, which can execute arbitrary code;
# only point `config_path` at a trusted artifact.
config = torch.load(config_path)
# Label <-> id lookup tables stored on the saved model config.
id2label = config.id2label
label2id = config.label2id

In [7]:
## Create dataset, collator, prepare input
def prepare_input(example, tokenizer):
    """Rebuild a document's text from its word tokens and tokenize it for the model.

    Args:
        example: mapping with parallel sequences "tokens" (strings) and
            "trailing_whitespace" (truthy when a space follows the token).
        tokenizer: callable tokenizer accepting `return_offsets_mapping`,
            `truncation` and `max_length`; its result must expose `.input_ids`
            and be unpackable with `**`.

    Returns:
        dict containing every field of the tokenizer output plus
        "length" (number of input ids) and "spacy_char_map" (one entry per
        character of the rebuilt text: the index of the source token, or -1
        for an inserted whitespace character).
    """
    pieces = []
    token_map = []

    for idx, (t, ws) in enumerate(zip(example["tokens"], example["trailing_whitespace"])):
        # Transliterate to ASCII; the character map is built from the
        # transliterated token so it stays aligned with the rebuilt text.
        t = unidecode(t)
        pieces.append(t)
        token_map.extend([idx] * len(t))

        if ws:
            pieces.append(" ")
            token_map.append(-1)

    # Join once and reuse the string (it used to be joined twice, with the
    # second result never read).
    text = "".join(pieces)
    tokenized = tokenizer(text, return_offsets_mapping=True, truncation=True, max_length=CFG.max_len)

    return {
        **tokenized,
        "length": len(tokenized.input_ids),
        "spacy_char_map": token_map,
    }



class TestDataset(Dataset):
    """Dataset over the test DataFrame that tokenizes one document per item."""

    def __init__(self, cfg, df):
        """Keep `cfg` and pull the needed columns out of `df` as arrays."""
        self.cfg = cfg
        (
            self.document_id,
            self.full_texts,
            self.tokens,
            self.trailing_whitespace,
        ) = (df[column].values for column in ('document', 'full_text', 'tokens', 'trailing_whitespace'))

    def __len__(self):
        """Number of documents."""
        return len(self.document_id)

    def __getitem__(self, item):
        """Return the tokenized inputs for document `item`, tagged with its document id."""
        doc_id = self.document_id[item]
        example = dict(
            document_id=doc_id,
            full_text=self.full_texts[item],
            tokens=self.tokens[item],
            trailing_whitespace=self.trailing_whitespace[item],
        )

        inputs = prepare_input(example, tokenizer=self.cfg.tokenizer)
        inputs['document_id'] = doc_id
        return inputs
    
class Collate:
    """Collate tokenized samples into padded, rectangular batch tensors.

    Sequence fields ("input_ids", "attention_mask", "token_type_ids",
    "offset_mapping") are padded to the longest sample in the batch and
    converted to `torch.long` tensors. "document_id", "length" and
    "spacy_char_map" are passed through as plain Python lists.

    Padding follows `tokenizer.padding_side`: "right" appends padding, any
    other value prepends it. Previously only "right" was handled, so a
    left-padding tokenizer produced ragged lists and `torch.tensor` failed.

    NOTE(review): code that slices model outputs with `[:length]` assumes
    right padding; with left padding the real tokens sit at the end.
    """

    def __init__(self, cfg):
        self.tokenizer = cfg.tokenizer
        self.cfg = cfg

    def __call__(self, batch):
        """Build one batch dict from a list of per-sample dicts."""
        output = {
            key: [sample[key] for sample in batch]
            for key in (
                "input_ids",
                "attention_mask",
                "token_type_ids",
                "offset_mapping",
                "document_id",
                "length",
                "spacy_char_map",
            )
        }

        # Longest token sequence in this batch; every sequence field is padded to it.
        batch_max = max(len(ids) for ids in output["input_ids"])

        # Value used to fill each padded field.
        pad_values = {
            "input_ids": self.tokenizer.pad_token_id,
            "attention_mask": 0,
            "token_type_ids": 0,
            "offset_mapping": (0, 0),
        }
        pad_left = self.tokenizer.padding_side != "right"

        for key, pad_value in pad_values.items():
            padded = []
            for seq in output[key]:
                filler = (batch_max - len(seq)) * [pad_value]
                padded.append(filler + list(seq) if pad_left else list(seq) + filler)
            output[key] = torch.tensor(padded, dtype=torch.long)

        return output
    
    

In [8]:
# Build a DataFrame from the loaded test JSON; later cells read its
# 'document', 'full_text', 'tokens' and 'trailing_whitespace' columns.
test_df = pd.DataFrame(test)


def replace_space(lst):
    """Return a new list where every whitespace-only string is replaced by '[SPACE]'.

    Empty strings are kept as they are (`''.isspace()` is False).
    """
    replaced = []
    for item in lst:
        replaced.append('[SPACE]' if item.isspace() else item)
    return replaced
    
# Replace whitespace-only tokens with the '[SPACE]' placeholder.
test_df['tokens'] = test_df['tokens'].apply(replace_space)

# Remember the incoming document order so the submission can be restored to it.
document_order = test_df['document'].tolist()

# Put the longest documents first so batches hold similarly sized inputs
# (less padding, faster inference). The helper column is dropped afterwards.
test_df = (
    test_df
    .assign(tokenize_length=[len(CFG.tokenizer(text)['input_ids']) for text in test_df['full_text'].values])
    .sort_values('tokenize_length', ascending=False)
    .reset_index(drop=True)
    .drop(columns=['tokenize_length'])
)

test_dataset = TestDataset(CFG, test_df)
print(len(test_dataset))

# shuffle=False keeps loader order identical to the row order of `test_df`.
test_loader = DataLoader(
    test_dataset,
    batch_size=CFG.batch_size,
    shuffle=False,
    collate_fn=Collate(CFG),
    num_workers=CFG.num_workers,
    pin_memory=True,
    drop_last=False,
)


10


In [9]:
class CustomModel(nn.Module):
    """Token classifier: transformer backbone -> dropout -> (BiLSTM, BiGRU) -> linear head.

    The backbone output is fed to a 2-layer bidirectional LSTM and a 2-layer
    bidirectional GRU in parallel; their outputs are mixed with a fixed,
    non-trainable weight (`lstm_weights`, 0.5) and projected to
    `cfg.num_labels` logits per token.

    Args:
        cfg: settings object providing `model`, `tokenizer`, `num_labels`
            and `gradient_checkpointing`.
        config_path: optional path to a saved model config loaded with
            `torch.load`. When None, the config is built from `cfg.model`
            and the notebook-level `id2label` / `label2id` tables.
        pretrained: load backbone weights from `cfg.model` when True,
            otherwise build the backbone from the config only.
    """

    def __init__(self, cfg, config_path=None, pretrained=False):
        super().__init__()
        self.cfg = cfg
        if config_path is None:
            self.config = AutoConfig.from_pretrained(cfg.model, output_hidden_states=True)
            # The label count comes from `id2label`; this branch used to read
            # `all_labels`, which is not defined in the visible notebook and
            # raised NameError.
            self.config.num_labels = len(id2label)
            self.config.id2label = id2label
            self.config.label2id = label2id
        else:
            # NOTE(review): torch.load unpickles; only load trusted files.
            self.config = torch.load(config_path)

        if pretrained:
            self.model = AutoModel.from_pretrained(
                cfg.model, ignore_mismatched_sizes=True, config=self.config
            )
        else:
            self.model = AutoModel.from_config(self.config)

        # Use the `cfg` passed in rather than the global CFG, so the model
        # matches whatever settings object the caller supplied.
        self.model.resize_token_embeddings(len(cfg.tokenizer))
        self.dropout = nn.Dropout(self.config.hidden_dropout_prob)

        # Both recurrent heads share the same shape: bidirectional with
        # hidden_size // 2 units per direction.
        hidden_size = self.config.hidden_size
        rnn_kwargs = dict(
            num_layers=2,
            dropout=self.config.hidden_dropout_prob,
            batch_first=True,
            bidirectional=True,
        )
        self.bilstm = nn.LSTM(hidden_size, hidden_size // 2, **rnn_kwargs)
        self.gru = nn.GRU(hidden_size, hidden_size // 2, **rnn_kwargs)
        # Fixed mixing weight between the LSTM and GRU outputs (not trained).
        self.lstm_weights = nn.Parameter(torch.tensor(0.5), requires_grad=False)

        self.fc = nn.Linear(hidden_size, cfg.num_labels)

        self._init_weights(self.fc)

        if self.cfg.gradient_checkpointing:
            self.model.gradient_checkpointing_enable()

    def _init_weights(self, module):
        """Initialise a Linear, Embedding or LayerNorm module in place.

        Linear / Embedding weights are drawn from N(0, initializer_range);
        Linear biases and the embedding padding row are zeroed; LayerNorm is
        reset to weight 1 and bias 0. Other module types are left untouched.
        """
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def forward(self, input_ids, attention_mask, token_type_ids):
        """Return per-token logits for a batch of tokenized inputs."""
        output = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )
        # First element of the backbone output is the last hidden state.
        sequence_output = self.dropout(output[0])

        lstm_output, _ = self.bilstm(sequence_output)
        gru_output, _ = self.gru(sequence_output)

        # Weighted blend of the two recurrent heads.
        mixed_output = (
            self.lstm_weights * lstm_output + (1 - self.lstm_weights) * gru_output
        )

        return self.fc(mixed_output)
    

In [10]:
# ====================================================
# inference
# ====================================================
def inference_fn(test_loader, model, device):
    """Run `model` over every batch of `test_loader` and collect its outputs.

    The model is put in eval mode, moved to `device` and wrapped in
    `nn.DataParallel`. Batches are evaluated without gradients under fp16
    autocast.

    Returns:
        A flat list with one float32 array per sample (each batch output is
        split along its first axis), in loader order.
    """
    model.eval()
    model.to(device)
    model = nn.DataParallel(model)

    batch_outputs = []
    for inputs in tqdm(test_loader, total=len(test_loader)):
        # Only the tensors the model consumes are moved to the device.
        for key in ('input_ids', 'attention_mask', 'token_type_ids'):
            inputs[key] = inputs[key].to(device)

        with torch.no_grad(), torch.cuda.amp.autocast(enabled=True, dtype=torch.float16, cache_enabled=True):
            y_preds = model(inputs['input_ids'], inputs['attention_mask'], inputs['token_type_ids'])
        batch_outputs.append(y_preds.to('cpu').numpy().astype(np.float32))

    # Un-batch: one entry per sample.
    flatten_preds = []
    for batch_array in batch_outputs:
        flatten_preds.extend(batch_array)
    return flatten_preds

In [11]:
# predictions[k] holds the per-sample outputs of the k-th checkpoint.
predictions = []
for weights_path in model_paths:
    model = CustomModel(CFG, config_path=config_path, pretrained=False)
    # Load on CPU first; inference_fn moves the model to the target device.
    state = torch.load(weights_path, map_location=torch.device('cpu'))
    model.load_state_dict(state['model'])

    preds = inference_fn(test_loader, model, device)
    predictions.append(preds)

    # Release this checkpoint before loading the next one.
    del model, state, preds
    gc.collect()
    torch.cuda.empty_cache()

100%|██████████| 3/3 [00:06<00:00,  2.24s/it]
100%|██████████| 3/3 [00:02<00:00,  1.02it/s]
100%|██████████| 3/3 [00:02<00:00,  1.01it/s]
100%|██████████| 3/3 [00:02<00:00,  1.02it/s]
100%|██████████| 3/3 [00:02<00:00,  1.01it/s]


In [12]:
token_list = test_df['tokens'].tolist()
token_map_list, offset_mapping_list, document_id_list = [], [], []

# Un-padded token count of every sample, in loader order.
length_list = []
for batch in test_loader:
    length_list += batch["length"]


In [13]:
## Average the fold outputs per document, keeping only the un-padded positions
preds_arr = np.zeros((len(test_df), CFG.max_len, CFG.num_labels))

for j in range(len(test_df)):
    n_tokens = length_list[j]
    # Same document from every checkpoint, cut to its real token count.
    fold_outputs = [fold_preds[j][:n_tokens, :] for fold_preds in predictions]
    averaged_array = np.mean(fold_outputs, axis=0)

    # Positions past the document's length stay zero.
    preds_arr[j, :averaged_array.shape[0], :] = averaged_array

In [14]:
# Post-processing cut-off: when a token's probability for class index 12 is
# below this value, the best class among indices 0-11 is used instead of the
# overall argmax. Set to None to always use the plain argmax.
threshold = 0.85

In [15]:
# Turn the fold-averaged outputs into class probabilities per token position.
preds_arr_softmax = softmax(preds_arr, axis=-1)

# document id -> (n_word_tokens, num_labels) accumulated class probabilities
preds_dict = {}
# document id -> (n_word_tokens,) number of model tokens that contributed
pred_dict_tokens = {}

for _, r in test_df.iterrows():
    n_word_tokens = len(r['tokens'])
    preds_dict[r['document']] = np.zeros((n_word_tokens, CFG.num_labels))
    pred_dict_tokens[r['document']] = np.zeros(n_word_tokens)

# Project model-token probabilities back onto the original word tokens.
# `test_dataset` is iterated in the same row order that filled `preds_arr`.
for idx, d in enumerate(test_dataset):
    doc_id = d['document_id']
    model_offset = d['offset_mapping']
    spacy_char_map = d['spacy_char_map']

    for i, (start_idx, end_idx) in enumerate(model_offset):
        # Word tokens touched by this model token's character span;
        # -1 marks inserted whitespace and is ignored.
        spacy_token_index = sorted(set(spacy_char_map[start_idx:end_idx]) - {-1})

        for x in spacy_token_index:
            preds_dict[doc_id][x] += preds_arr_softmax[idx, i]
            pred_dict_tokens[doc_id][x] += 1

# Average the accumulated probabilities over the contributing model tokens.
# Word tokens with no contribution keep their all-zero row.
for k, v in preds_dict.items():
    counts = pred_dict_tokens[k]
    covered = counts > 0
    v[covered] /= counts[covered, None]


document, token, label, token_str = [], [], [], []

for _, r in test_df.iterrows():
    doc_id = r['document']
    doc_preds = preds_dict[doc_id]

    # Word tokens that received at least one model prediction. This uses the
    # contribution counts directly instead of testing a probability for
    # exact inequality with zero.
    pred_mask = pred_dict_tokens[doc_id] > 0

    preds_0_12 = doc_preds.argmax(-1)
    preds_without_O = doc_preds[:, :12].argmax(-1)
    # NOTE(review): index 12 is treated as the non-entity ('O') class here;
    # presumably this matches label2id['O'] - confirm.
    O_preds = doc_preds[:, 12]

    if threshold is None:
        preds_final = preds_0_12
    else:
        # Low-confidence 'O' tokens fall back to the best entity class.
        preds_final = np.where(O_preds < threshold, preds_without_O, preds_0_12)

    for i, t in enumerate(r['tokens']):
        if preds_final[i] != 12 and pred_mask[i]:
            document.append(doc_id)
            token.append(i)
            label.append(id2label[preds_final[i]])
            token_str.append(t)


In [16]:
# One row per predicted entity token.
df = pd.DataFrame(
    dict(document=document, token=token, label=label, token_str=token_str)
)

In [17]:
# Drop predictions whose token text is exactly 'Dr'.
df = df.loc[df['token_str'] != 'Dr'].reset_index(drop=True)

# Make 'document' an ordered categorical so sorting follows the original
# document order of the test file, then sort by token position within it.
df['document'] = pd.Categorical(df['document'], categories=document_order, ordered=True)
df = df.sort_values(['document', 'token']).reset_index(drop=True)
# Back to plain integer ids for the submission.
df['document'] = df['document'].astype(np.int64)

# Consecutive row ids 0..n-1 in the final order.
df["row_id"] = np.arange(len(df), dtype=np.int64)
df.head(100)

Unnamed: 0,document,token,label,token_str,row_id
0,7,9,B-NAME_STUDENT,Nathalie,0
1,7,10,I-NAME_STUDENT,Sylla,1
2,7,482,B-NAME_STUDENT,Nathalie,2
3,7,483,I-NAME_STUDENT,Sylla,3
4,7,741,B-NAME_STUDENT,Nathalie,4
5,7,742,I-NAME_STUDENT,Sylla,5
6,10,0,B-NAME_STUDENT,Diego,6
7,10,1,I-NAME_STUDENT,Estrada,7
8,10,464,B-NAME_STUDENT,Diego,8
9,10,465,I-NAME_STUDENT,Estrada,9


In [18]:
# Write only the submission columns, without the DataFrame index.
df.loc[:, ["row_id", "document", "token", "label"]].to_csv("submission.csv", index=False)