In [25]:
import torch
import json
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModel
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
import torch.nn as nn
from collections import Counter
from transformers import AutoTokenizer
from torch.utils.data import DataLoader
from torch.optim import AdamW
from tqdm.auto import tqdm
import torch
from torch.nn import CrossEntropyLoss
from sklearn.metrics import precision_recall_fscore_support
from torch.nn.functional import softmax
import numpy as np
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
import numpy as np
import torch


In [5]:
with open('../Data/biloc_tagged_clauses.json') as f:
    d = json.load(f)

In [6]:
data_files ='../Data/biloc_tagged_clauses.json'
datasets = load_dataset('json', data_files=data_files, field='data')
test_size=0.15
random_seed=42

datasets = datasets['train'].train_test_split(test_size=test_size, seed=random_seed)
print(datasets)

DatasetDict({
    train: Dataset({
        features: ['split_tokens', 'id', 'ner_tags'],
        num_rows: 401
    })
    test: Dataset({
        features: ['split_tokens', 'id', 'ner_tags'],
        num_rows: 71
    })
})


In [7]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

In [8]:
def tokenize_and_align_labels(tokenizer, examples):
    tokenized_inputs = tokenizer(examples["split_tokens"], truncation=True, padding="max_length", is_split_into_words=True, return_tensors="pt")
    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        label_ids = [-100 if word_id is None else label[word_id] for word_id in word_ids]
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs
def convert_to_tensors(batch):
    batch_tensors = {key: tensor(value) for key, value in batch.items()}

In [9]:
tokenized_datasets = datasets.map(lambda examples: tokenize_and_align_labels(tokenizer, examples), batched=True)

Map:   0%|          | 0/71 [00:00<?, ? examples/s]

In [10]:
tokenized_datasets.set_format(type='torch', columns=['input_ids', 'attention_mask', 'token_type_ids', 'labels'])

In [54]:
train_dataset = tokenized_datasets["train"]
test_dataset = tokenized_datasets["test"]

train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=2)


In [69]:
model_name = "bert-base-cased"  
num_labels = 165

bert_model = AutoModel.from_pretrained(model_name, num_labels=num_labels)


In [70]:
class CustomNERModel(nn.Module):
    def __init__(self, bert_model, num_labels):
        super(CustomNERModel, self).__init__()
        self.bert = bert_model  # The BERT model
        self.classifier = nn.Linear(bert_model.config.hidden_size, num_labels)  # Classifier

    def forward(self, input_ids, attention_mask=None, token_type_ids=None):
        outputs = self.bert(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        sequence_output = outputs.last_hidden_state
        logits = self.classifier(sequence_output)
        return logits


In [71]:
model = CustomNERModel(bert_model, num_labels)

In [72]:
optimizer = AdamW(model.parameters(), lr=5e-5)
loss_function = nn.CrossEntropyLoss()

In [73]:
num_epochs = 30
device = torch.device("cuda" if torch.cuda.is_available() else torch.device("cpu"))
model.to(device)

model.train()
for epoch in range(num_epochs):
    total_loss = 0
    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}")
    for batch in progress_bar:
        inputs = {k: v.to(device) for k, v in batch.items() if k != 'labels'}
        labels = batch['labels'].to(device)
        optimizer.zero_grad()
        outputs = model(**inputs)
        logits = outputs
        loss = loss_function(logits.view(-1, num_labels), labels.view(-1))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        progress_bar.set_postfix(loss=f"{loss.item():.4f}")

    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1}/{num_epochs}, Average Loss: {avg_loss:.4f}")

Epoch 1/30:   0%|          | 0/201 [00:00<?, ?it/s]

Epoch 1/30, Average Loss: 0.8600


Epoch 2/30:   0%|          | 0/201 [00:00<?, ?it/s]

Epoch 2/30, Average Loss: 0.5114


Epoch 3/30:   0%|          | 0/201 [00:00<?, ?it/s]

Epoch 3/30, Average Loss: 0.3591


Epoch 4/30:   0%|          | 0/201 [00:00<?, ?it/s]

Epoch 4/30, Average Loss: 0.2655


Epoch 5/30:   0%|          | 0/201 [00:00<?, ?it/s]

Epoch 5/30, Average Loss: 0.2140


Epoch 6/30:   0%|          | 0/201 [00:00<?, ?it/s]

Epoch 6/30, Average Loss: 0.1913


Epoch 7/30:   0%|          | 0/201 [00:00<?, ?it/s]

Epoch 7/30, Average Loss: 0.1531


Epoch 8/30:   0%|          | 0/201 [00:00<?, ?it/s]

Epoch 8/30, Average Loss: 0.1250


Epoch 9/30:   0%|          | 0/201 [00:00<?, ?it/s]

Epoch 9/30, Average Loss: 0.1052


Epoch 10/30:   0%|          | 0/201 [00:00<?, ?it/s]

Epoch 10/30, Average Loss: 0.0988


Epoch 11/30:   0%|          | 0/201 [00:00<?, ?it/s]

Epoch 11/30, Average Loss: 0.0855


Epoch 12/30:   0%|          | 0/201 [00:00<?, ?it/s]

Epoch 12/30, Average Loss: 0.0706


Epoch 13/30:   0%|          | 0/201 [00:00<?, ?it/s]

Epoch 13/30, Average Loss: 0.0580


Epoch 14/30:   0%|          | 0/201 [00:00<?, ?it/s]

Epoch 14/30, Average Loss: 0.0520


Epoch 15/30:   0%|          | 0/201 [00:00<?, ?it/s]

Epoch 15/30, Average Loss: 0.0501


Epoch 16/30:   0%|          | 0/201 [00:00<?, ?it/s]

Epoch 16/30, Average Loss: 0.0485


Epoch 17/30:   0%|          | 0/201 [00:00<?, ?it/s]

Epoch 17/30, Average Loss: 0.0570


Epoch 18/30:   0%|          | 0/201 [00:00<?, ?it/s]

Epoch 18/30, Average Loss: 0.0544


Epoch 19/30:   0%|          | 0/201 [00:00<?, ?it/s]

Epoch 19/30, Average Loss: 0.0426


Epoch 20/30:   0%|          | 0/201 [00:00<?, ?it/s]

Epoch 20/30, Average Loss: 0.0380


Epoch 21/30:   0%|          | 0/201 [00:00<?, ?it/s]

Epoch 21/30, Average Loss: 0.0465


Epoch 22/30:   0%|          | 0/201 [00:00<?, ?it/s]

Epoch 22/30, Average Loss: 0.0369


Epoch 23/30:   0%|          | 0/201 [00:00<?, ?it/s]

Epoch 23/30, Average Loss: 0.0399


Epoch 24/30:   0%|          | 0/201 [00:00<?, ?it/s]

Epoch 24/30, Average Loss: 0.0280


Epoch 25/30:   0%|          | 0/201 [00:00<?, ?it/s]

Epoch 25/30, Average Loss: 0.0257


Epoch 26/30:   0%|          | 0/201 [00:00<?, ?it/s]

Epoch 26/30, Average Loss: 0.0351


Epoch 27/30:   0%|          | 0/201 [00:00<?, ?it/s]

Epoch 27/30, Average Loss: 0.0220


Epoch 28/30:   0%|          | 0/201 [00:00<?, ?it/s]

Epoch 28/30, Average Loss: 0.0382


Epoch 29/30:   0%|          | 0/201 [00:00<?, ?it/s]

Epoch 29/30, Average Loss: 0.0315


Epoch 30/30:   0%|          | 0/201 [00:00<?, ?it/s]

Epoch 30/30, Average Loss: 0.0208


In [74]:
true_labels_list = []
pred_labels_list = []

with torch.no_grad(): 
    for batch in test_loader:
        inputs = {k: v.to(device) for k, v in batch.items() if k != 'labels'}
        labels = batch['labels'].to(device)
        outputs = model(**inputs)
        logits = outputs
        predictions = torch.argmax(logits, dim=-1)
        predictions = predictions.detach().cpu().numpy()
        labels = labels.detach().cpu().numpy()
        
        true_labels_list.append(labels)
        pred_labels_list.append(predictions)

true_labels_flat = np.concatenate(true_labels_list, axis=None)
pred_labels_flat = np.concatenate(pred_labels_list, axis=None)


In [75]:

mask = true_labels_flat != -100 
true_labels_filtered = true_labels_flat[mask]
pred_labels_filtered = pred_labels_flat[mask]

precision, recall, f1, _ = precision_recall_fscore_support(true_labels_filtered, pred_labels_filtered, average='weighted')
accuracy = accuracy_score(true_labels_filtered, pred_labels_filtered)

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")


Accuracy: 0.8948
Precision: 0.8850
Recall: 0.8948
F1 Score: 0.8865


  _warn_prf(average, modifier, msg_start, len(result))


In [76]:
total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
non_trainable_params = total_params - trainable_params

print(f"Total Parameters: {total_params}")
print(f"Trainable Parameters: {trainable_params}")
print(f"Non-trainable Parameters: {non_trainable_params}")


Total Parameters: 108437157
Trainable Parameters: 108437157
Non-trainable Parameters: 0
