In [1]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd 
import os 
import json
import numpy as np
import torch 
import random 


import torch
from torch import nn
import numpy as np
import pandas as pd
from spacy.lang.en import English
from transformers.trainer import Trainer
from transformers.training_args import TrainingArguments
from transformers.trainer_utils import EvalPrediction
from transformers.data.data_collator import DataCollatorForTokenClassification
from datasets import Dataset, DatasetDict, concatenate_datasets
from transformers import AutoTokenizer, AutoModelForTokenClassification


def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    
SEED = 23
set_seed(SEED)

In [2]:
! ls

data  DebertaV2_Training.ipynb	DistilBert_Training.ipynb  test


In [3]:
class Config:
    data_dir = 'data/'
    extra_data_dir = 'data/'
    train_file_name = 'train.json'
    extra_file_name = 'mixtral-8x7b-v1.json'
    test_file_name = 'test.json'
    TEST_SAMPLES  = 500
    TRAINING_MODEL_PATH = "distilbert/distilbert-base-uncased"
    TRAINING_MAX_LENGTH = 512
    EVAL_MAX_LENGTH = 512
    FREEZE_EMBEDDINGS = True
    FREEZE_LAYERS = 20

In [4]:
cfg = Config()

In [6]:
train_data1 = json.load(open(os.path.join(cfg.data_dir, cfg.train_file_name)))
test_data = json.load(open(os.path.join(cfg.data_dir, cfg.test_file_name)))
train_data2 = json.load(open(os.path.join(cfg.extra_data_dir, cfg.extra_file_name)))

train_data1 = pd.DataFrame(train_data1)
test_data = pd.DataFrame(test_data)
train_data2 = pd.DataFrame(train_data2)

# Merge train and extra data 
train_data = pd.concat([train_data1, train_data2])

# Update document Id as integer
train_data['document'] = [ind+1 for ind in range(len(train_data))]

train_data = train_data.reset_index()

print(f"Train Data Size : {train_data.shape[0]}")
print(f"Test Data Size : {test_data.shape[0]}")

Train Data Size : 9162
Test Data Size : 10


In [7]:
def get_no_label_ids(data):
    document_ids = []
    for idx, row in data.iterrows():
        label_list = row['labels']
        
        if ['O'] * len(label_list) == label_list:
            document_ids.append(row['document'])
    return document_ids

In [8]:
no_label_doc_ids = get_no_label_ids(data = train_data)
with_label_doc_ids = train_data[~train_data.document.isin(no_label_doc_ids)].document.tolist()
print(f"Document with no PIL Labels : {len(no_label_doc_ids)}\nDocument with PIL Labels: {len(with_label_doc_ids)}")

Document with no PIL Labels : 5862
Document with PIL Labels: 3300


In [9]:
def sample_test_df(df, num_samples):
    test_df = df.sample(n=num_samples)
    train_df = df[~df.document.isin(test_df.document)]
    
    test_df = test_df.reset_index()
    train_df = train_df.reset_index()
    return train_df, test_df


def split_dataset(data, no_label_ids, with_label_ids, test_samples):
    data_no_labels = data[data.document.isin(no_label_ids)]
    data_with_labels = data[data.document.isin(with_label_ids)]
    
    # TEST_SAMPLES // 2 from data_no_labels
    # TEST_SAMPLES // 2 from data_with_labels
    no_label_train_df , no_label_test_df = sample_test_df(df=data_no_labels, 
                                                          num_samples=test_samples//2)
    with_label_train_df, with_label_test_df = sample_test_df(df=data_with_labels, 
                                                          num_samples=test_samples//2)
    
    train_df = pd.concat([with_label_train_df, no_label_train_df])
    test_df = pd.concat([with_label_test_df, no_label_test_df])    
    return train_df, test_df

In [10]:
train_df , eval_df = split_dataset(data=train_data,
                                   no_label_ids=no_label_doc_ids,
                                   with_label_ids=with_label_doc_ids,
                                   test_samples=cfg.TEST_SAMPLES
                                  )
print(f"Train Data Size : {train_df.shape[0]}")
print(f"Eval Data Size : {eval_df.shape[0]}")

Train Data Size : 8662
Eval Data Size : 500


In [11]:
all_labels = [
    'B-EMAIL', 'B-ID_NUM', 'B-NAME_STUDENT', 'B-PHONE_NUM', 'B-STREET_ADDRESS', 'B-URL_PERSONAL', 'B-USERNAME', 'I-ID_NUM', 'I-NAME_STUDENT', 'I-PHONE_NUM', 'I-STREET_ADDRESS', 'I-URL_PERSONAL', 'O'
]
id2label = {i: l for i, l in enumerate(all_labels)}
label2id = {v: k for k, v in id2label.items()}
target = [l for l in all_labels if l != "O"]

In [12]:
def tokenize(example, tokenizer, label2id):
    # Preprocess the tokens and labels by adding trailing whitespace and labels
    tokens = []
    labels = []
    for token, label, t_ws in zip(example["tokens"], 
                                  example["labels"],
                                  example["trailing_whitespace"]):
        tokens.append(token)
        labels.extend([label] * len(token))
        # Added trailing whitespace and label if true and 
        if t_ws:
            tokens.append(" ")
            labels.append("O")   
    text = "".join(tokens)
    # tokenization without truncation
    tokenized = tokenizer(text, return_offsets_mapping=True,
                          truncation=True, max_length=cfg.TRAINING_MAX_LENGTH)
    labels = np.array(labels)
    
    # Labels
    token_labels = []
    for start_idx, end_idx in tokenized.offset_mapping:
        # Added 'O' 
        if start_idx == 0 and end_idx == 0:
            token_labels.append(label2id["O"])
            continue
        
        # case when the text starts with whitespace
        if text[start_idx].isspace():
            start_idx += 1
        # Convert label to id (int)
        if start_idx< len(labels):
            label_id = label2id[labels[start_idx]]
            token_labels.append(label_id)
    return {**tokenized, "labels": token_labels, "length": len(tokenized.input_ids)}

In [13]:
from seqeval.metrics import recall_score, precision_score, f1_score
# Compute the model performance metrics using `seqeval`
def compute_metrics(preds):    
    try:
        print("Compute metrics")
        predictions, labels = preds
        predictions = np.argmax(predictions, axis=2)

        # Include prediction Remove ignored index (special tokens)
        true_preds = []
        true_labels = []
        for pred, label in zip(predictions, labels):
            true_preds.append([id2label[p] for p, l in zip(pred, label) if l != -100])
            true_labels.append([id2label[l] for p, l in zip(pred, label) if l != -100])
        # Compute recall, precision and f1 score
        recall = recall_score(true_labels, true_preds)
        precision = precision_score(true_labels, true_preds)
        # Use modified f1 score to measure the performance
        f1_score = (1 + 5*5) * recall * precision / (5*5*precision + recall)
        result = {'f1': f1_score,  
                  'recall': recall,
                  'precision': precision}
        print(f"result = {result}")
        del predictions, labels, true_preds, true_labels, preds
        return result
    except Exception as e:
        print(e)

In [14]:
tokenizer = AutoTokenizer.from_pretrained(cfg.TRAINING_MODEL_PATH)

In [15]:
required_keys = ['tokens', 'trailing_whitespace','labels']
train_dict = DatasetDict({key: train_df[key].tolist() for key in required_keys})
eval_dict = DatasetDict({key: eval_df[key].tolist() for key in required_keys})
train_ds = Dataset.from_dict(train_dict)
eval_ds = Dataset.from_dict(eval_dict)



max_length = cfg.TRAINING_MAX_LENGTH
# To Change 
train_ds_tokenised = train_ds.map(tokenize, 
                              fn_kwargs={"tokenizer": tokenizer, 
                                         "label2id": label2id},
                              num_proc=4)


eval_ds_tokenised = eval_ds.map(tokenize, 
                              fn_kwargs={"tokenizer": tokenizer, 
                                         "label2id": label2id},
                              num_proc=4)

Map (num_proc=4):   0%|          | 0/8662 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/500 [00:00<?, ? examples/s]

In [16]:
model = AutoModelForTokenClassification.from_pretrained(cfg.TRAINING_MODEL_PATH, 
                                                        num_labels=len(all_labels), 
                                                        id2label=id2label, 
                                                        label2id=label2id,
                                                        ignore_mismatched_sizes=True
                                                       )

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [17]:
data_collator=DataCollatorForTokenClassification(tokenizer, pad_to_multiple_of=16)

In [18]:
model

DistilBertForTokenClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
    

In [19]:
for param in model.distilbert.embeddings.parameters():
    param.requires_grad = cfg.FREEZE_EMBEDDINGS

# for layer in model.deberta.encoder.layer[:cfg.FREEZE_LAYERS]:
#     for param in layer.parameters():
#         param.requires_grad = True

In [20]:
import os

train_args = TrainingArguments(
    'test',
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=10,
    weight_decay=0.01,
    report_to="none"
)

trainer = Trainer(model=model, 
                  args=train_args, 
                  data_collator=data_collator,
                  train_dataset=train_ds_tokenised,
                  eval_dataset=eval_ds_tokenised,
                  tokenizer=tokenizer,
                  compute_metrics=compute_metrics,
                 )

In [21]:
trainer.train()

Epoch,Training Loss,Validation Loss,F1,Recall,Precision
1,No log,0.033993,0.76435,0.76218,0.822912
2,No log,0.008808,0.953198,0.953775,0.938979
3,No log,0.005418,0.967439,0.967039,0.977565
4,0.054700,0.003988,0.982851,0.98306,0.977667
5,0.054700,0.003481,0.987048,0.987262,0.981718
6,0.054700,0.003282,0.988887,0.989232,0.980349
7,0.054700,0.003858,0.989096,0.989626,0.976039
8,0.001800,0.003132,0.989623,0.989888,0.983046
9,0.001800,0.003022,0.990042,0.990282,0.984079
10,0.001800,0.003058,0.989896,0.990151,0.983564


Compute metrics
result = {'f1': 0.7643495350203616, 'recall': 0.7621799080761654, 'precision': 0.8229122359279739}
Compute metrics
result = {'f1': 0.953197718439251, 'recall': 0.9537754432042023, 'precision': 0.9389786683904331}
Compute metrics
result = {'f1': 0.9674394162944397, 'recall': 0.9670387393302692, 'precision': 0.9775653789990707}
Compute metrics
result = {'f1': 0.9828512563626082, 'recall': 0.9830597504924491, 'precision': 0.9776674937965261}
Compute metrics
result = {'f1': 0.9870476132765751, 'recall': 0.9872619829284307, 'precision': 0.9817184643510055}
Compute metrics
result = {'f1': 0.9888871497886992, 'recall': 0.9892317793827972, 'precision': 0.9803487766788131}
Compute metrics
result = {'f1': 0.9890961957838623, 'recall': 0.9896257386736704, 'precision': 0.9760393731381946}
Compute metrics
result = {'f1': 0.9896234656110037, 'recall': 0.9898883782009192, 'precision': 0.9830464267083986}
Compute metrics
result = {'f1': 0.9900423151112412, 'recall': 0.9902823374917925,

TrainOutput(global_step=1360, training_loss=0.02104302697321948, metrics={'train_runtime': 377.5736, 'train_samples_per_second': 229.412, 'train_steps_per_second': 3.602, 'total_flos': 1.13194224362496e+16, 'train_loss': 0.02104302697321948, 'epoch': 10.0})