# BERT Model
### Imports

In [1]:
import os, re, math, random, json, string, pickle

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display, HTML

import transformers
from transformers import BertForTokenClassification, BertTokenizerFast
from transformers import DataCollatorForTokenClassification

from transformers import Trainer, TrainingArguments
import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from datasets import load_dataset, ClassLabel, Sequence, load_metric

from sklearn.metrics import accuracy_score, f1_score, precision_score

### Constants

In [2]:
TRAIN = 1
TRAIN_SPLIT = 0.90
RANDOM_SEED = 42
BATCH_SIZES = 16
EPOCHS = 25

json_file_path = "../data/cuad-v1-annotated.json"

### Data Loading

In [3]:
datasets = load_dataset('json', data_files=json_file_path, field='data')

# Create train and validation datasets
datasets = datasets['train'].train_test_split(test_size=1-TRAIN_SPLIT, seed=RANDOM_SEED)
print(datasets)

DatasetDict({
    train: Dataset({
        features: ['ner_tags', 'split_tokens', 'id'],
        num_rows: 282
    })
    test: Dataset({
        features: ['ner_tags', 'split_tokens', 'id'],
        num_rows: 32
    })
})


### Helper Functions

In [4]:
label_list=['B-AGMT_DATE','B-DOC_NAME','B-PARTY','I-AGMT_DATE','T-DOC_NAME','I-PARTY','0']

In [5]:
def show_elements(dataset, num_examples=1):
    assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."
    picks = []
    for _ in range(num_examples):
        pick = random.randint(0, len(dataset)-1)
        while pick in picks:
            pick = random.randint(0, len(dataset)-1)
        picks.append(pick)
    
    df = pd.DataFrame(dataset[picks])
    for column, typ in dataset.features.items():
        if isinstance(typ, ClassLabel):
            df[column] = df[column].transform(lambda i: typ.names[i])
        elif isinstance(typ, Sequence) and isinstance(typ.feature, ClassLabel):
            df[column] = df[column].transform(lambda x: [typ.feature.names[i] for i in x])
    display(HTML(df.to_html()))


In [6]:
class CustomDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

### Tokenizer & Model 

In [7]:
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
model = BertForTokenClassification.from_pretrained('bert-base-uncased', num_labels=len(label_list))

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Data Encoding

In [8]:
def encode_dataset(dataset):
    encodings = tokenizer(dataset['split_tokens'], is_split_into_words=True, return_offsets_mapping=True, padding=True, truncation=True, return_tensors="pt")
    labels = [label for label in dataset['ner_tags']]
    encodings.pop('offset_mapping')
    return encodings, labels

In [9]:
train_encodings, train_labels = encode_dataset(datasets['train'])
test_encodings, test_labels = encode_dataset(datasets['test'])

train_dataset = CustomDataset(train_encodings, train_labels)
test_dataset = CustomDataset(test_encodings, test_labels)

### Data Batching

In [10]:
data_collator = DataCollatorForTokenClassification(tokenizer)

### Training

In [11]:
training_args = TrainingArguments(
    output_dir='./results/BERT',
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZES,
    per_device_eval_batch_size=BATCH_SIZES*4,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    report_to=[], 
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=data_collator, 
)

trainer.train()

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Step,Training Loss
10,1.9072
20,1.7804
30,1.5207
40,1.118
50,0.7059
60,0.5325
70,0.4988
80,0.4699
90,0.4359
100,0.3927


TrainOutput(global_step=450, training_loss=0.34638539883825514, metrics={'train_runtime': 174.6912, 'train_samples_per_second': 40.357, 'train_steps_per_second': 2.576, 'total_flos': 1072232757268200.0, 'train_loss': 0.34638539883825514, 'epoch': 25.0})

In [12]:
# Save Training Loss by Step
fp = open("training_log_BERT", 'wb') 
pickle.dump(trainer.state.log_history, fp)

## Evaluation:
### Test-set

In [13]:
test_dataloader = DataLoader(test_dataset, batch_size=32, collate_fn=data_collator)
model.eval()
true_labels = []
pred_labels = []

with torch.no_grad():
    for batch in test_dataloader:
        input_ids = batch['input_ids'].to(model.device)
        attention_mask = batch['attention_mask'].to(model.device)
        labels = batch['labels'].to(model.device)

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)

        predictions = predictions.detach().cpu().numpy()
        labels = labels.detach().cpu().numpy()

        true_labels.extend(labels.flatten())
        pred_labels.extend(predictions.flatten())
        
# Evaluation Metrics
accuracy_test = accuracy_score(true_labels, pred_labels)
f1_score_test = f1_score(true_labels, pred_labels,average=None)
precision_test = precision_score(true_labels, pred_labels,average=None)

# Print Metrics
print("Test Accuracy:", accuracy_test)
print("Test F1:")
print(np.round(f1_score_test,3))
print("Test Precision:")
print(np.round(precision_test,3))

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Test Accuracy: 0.6768822393822393
Test F1:
[0.    0.277 0.504 0.325 0.558 0.664 0.472 0.804]
Test Precision:
[0.    0.321 0.465 0.379 0.531 0.678 0.488 0.692]


  _warn_prf(average, modifier, msg_start, len(result))


### Train-set

In [14]:
train_dataloader = DataLoader(train_dataset, batch_size=32, collate_fn=data_collator)
model.eval()
true_labels = []
pred_labels = []

with torch.no_grad():
    for batch in train_dataloader:
        input_ids = batch['input_ids'].to(model.device)
        attention_mask = batch['attention_mask'].to(model.device)
        labels = batch['labels'].to(model.device)

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)

        predictions = predictions.detach().cpu().numpy()
        labels = labels.detach().cpu().numpy()

        true_labels.extend(labels.flatten())
        pred_labels.extend(predictions.flatten())
        
# Evaluation Metrics
accuracy_train = accuracy_score(true_labels, pred_labels)
f1_score_train = f1_score(true_labels, pred_labels, average=None)
precision_train = precision_score(true_labels, pred_labels, average=None)

# Print Metrics
print("Train Accuracy:", accuracy_train)
print("Train F1:")
print(np.round(f1_score_train,3))
print("Train Precision:")
print(np.round(precision_train,3))

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Train Accuracy: 0.6387262601742111
Train F1:
[0.    0.838 0.838 0.72  0.887 0.948 0.871 0.766]
Train Precision:
[0.    0.892 0.78  0.78  0.808 0.945 0.794 0.623]


  _warn_prf(average, modifier, msg_start, len(result))
