In [35]:
import os, re, math, random, json, string

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display, HTML

import transformers
from transformers import BertForTokenClassification, BertTokenizerFast
from transformers import DataCollatorForTokenClassification

from transformers import Trainer, TrainingArguments
import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from datasets import load_dataset, ClassLabel, Sequence, load_metric

In [15]:
json_file_path = "../data/cuad-v1-annotated.json"

In [16]:
TRAIN = 1
TRAIN_SPLIT = 0.90
RANDOM_SEED = 42
BATCH_SIZES = 1
EPOCHS = 10

In [17]:
datasets = load_dataset('json', data_files=json_file_path, field='data')

# Create train and validation datasets
datasets = datasets['train'].train_test_split(test_size=1-TRAIN_SPLIT, seed=RANDOM_SEED)
print(datasets)

DatasetDict({
    train: Dataset({
        features: ['split_tokens', 'id', 'ner_tags'],
        num_rows: 282
    })
    test: Dataset({
        features: ['split_tokens', 'id', 'ner_tags'],
        num_rows: 32
    })
})


In [18]:
label_list=['B-AGMT_DATE','B-DOC_NAME','B-PARTY','I-AGMT_DATE','T-DOC_NAME','I-PARTY','0']

In [19]:
def show_elements(dataset, num_examples=1):
    assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."
    picks = []
    for _ in range(num_examples):
        pick = random.randint(0, len(dataset)-1)
        while pick in picks:
            pick = random.randint(0, len(dataset)-1)
        picks.append(pick)
    
    df = pd.DataFrame(dataset[picks])
    for column, typ in dataset.features.items():
        if isinstance(typ, ClassLabel):
            df[column] = df[column].transform(lambda i: typ.names[i])
        elif isinstance(typ, Sequence) and isinstance(typ.feature, ClassLabel):
            df[column] = df[column].transform(lambda x: [typ.feature.names[i] for i in x])
    display(HTML(df.to_html()))


In [20]:
class CustomDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

In [44]:
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
model = BertForTokenClassification.from_pretrained('bert-base-uncased', num_labels=len(label_list))


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [45]:
def encode_dataset(dataset):
    encodings = tokenizer(dataset['split_tokens'], is_split_into_words=True, return_offsets_mapping=True, padding=True, truncation=True, return_tensors="pt")
    labels = [label for label in dataset['ner_tags']]
    encodings.pop('offset_mapping')
    return encodings, labels

In [46]:
train_encodings, train_labels = encode_dataset(datasets['train'])
test_encodings, test_labels = encode_dataset(datasets['test'])

train_dataset = CustomDataset(train_encodings, train_labels)
test_dataset = CustomDataset(test_encodings, test_labels)

In [47]:
data_collator = DataCollatorForTokenClassification(tokenizer)


In [48]:
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=10,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    report_to=[], 
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=data_collator, 
)

trainer.train()

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
10,2.3304
20,2.1942
30,1.9201
40,1.5136
50,0.9857
60,0.5708
70,0.5239
80,0.4973
90,0.463
100,0.3989


TrainOutput(global_step=180, training_loss=0.7715027981334263, metrics={'train_runtime': 2783.2222, 'train_samples_per_second': 1.013, 'train_steps_per_second': 0.065, 'total_flos': 428893102907280.0, 'train_loss': 0.7715027981334263, 'epoch': 10.0})

In [56]:
from sklearn.metrics import accuracy_score
from torch.utils.data import DataLoader

test_dataloader = DataLoader(test_dataset, batch_size=32, collate_fn=data_collator)
model.eval()
true_labels = []
pred_labels = []

with torch.no_grad():
    for batch in test_dataloader:
        input_ids = batch['input_ids'].to(model.device)
        attention_mask = batch['attention_mask'].to(model.device)
        labels = batch['labels'].to(model.device)

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)

        predictions = predictions.detach().cpu().numpy()
        labels = labels.detach().cpu().numpy()

        true_labels.extend(labels.flatten())
        pred_labels.extend(predictions.flatten())
accuracy = accuracy_score(true_labels, pred_labels)
print("Test Accuracy:", accuracy)


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Test Accuracy: 0.6718146718146718
