In [1]:
import torch
import json
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModel
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
import torch.nn as nn
from collections import Counter
from transformers import AutoTokenizer
from torch.utils.data import DataLoader
from torch.optim import AdamW
from tqdm.auto import tqdm
import torch
from torch.nn import CrossEntropyLoss
from sklearn.metrics import precision_recall_fscore_support
from torch.nn.functional import softmax
import numpy as np
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
import numpy as np
import torch


In [2]:
data_files ='../Data/biloc_tagged_clauses.json'
datasets = load_dataset('json', data_files=data_files, field='data')
test_size=0.15
random_seed=42

datasets = datasets['train'].train_test_split(test_size=test_size, seed=random_seed)
print(datasets)

DatasetDict({
    train: Dataset({
        features: ['split_tokens', 'id', 'ner_tags'],
        num_rows: 401
    })
    test: Dataset({
        features: ['split_tokens', 'id', 'ner_tags'],
        num_rows: 71
    })
})


In [4]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

In [5]:
def tokenize_and_align_labels(tokenizer, examples):
    tokenized_inputs = tokenizer(examples["split_tokens"], truncation=True, padding="max_length", is_split_into_words=True, return_tensors="pt")
    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        label_ids = [-100 if word_id is None else label[word_id] for word_id in word_ids]
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs
def convert_to_tensors(batch):
    batch_tensors = {key: tensor(value) for key, value in batch.items()}

In [6]:
tokenized_datasets = datasets.map(lambda examples: tokenize_and_align_labels(tokenizer, examples), batched=True)

In [7]:
tokenized_datasets.set_format(type='torch', columns=['input_ids', 'attention_mask', 'token_type_ids', 'labels'])

In [8]:
train_dataset = tokenized_datasets["train"]
test_dataset = tokenized_datasets["test"]

train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=2)


In [16]:
model_name = 'bert-base-cased'
bert_model = AutoModel.from_pretrained(model_name)

In [17]:
num_labels = 165

In [71]:
class LoRAAttention(nn.Module):
    def __init__(self, bert_self_attention, rank):
        super(LoRAAttention, self).__init__()
        self.bert_self_attention = bert_self_attention
        
        self.rank = rank
        self.hidden_size = bert_self_attention.query.in_features
        self.A_q = nn.Parameter(torch.randn(self.hidden_size, self.rank))
        self.B_q = nn.Parameter(torch.randn(self.rank, self.hidden_size))

    def forward(self, hidden_states, attention_mask=None):
        Q = self.bert_self_attention.query
        Q.weight.data += torch.mm(self.A_q, self.B_q)
        outputs = self.bert_self_attention(hidden_states, attention_mask=attention_mask)
        return outputs


In [72]:
class CustomNERModel(nn.Module):
    def __init__(self, bert_model_name, num_labels, rank=64):
        super(CustomNERModel, self).__init__()
        self.bert = AutoModel.from_pretrained(bert_model_name)
        self.classifier = nn.Linear(self.bert.config.hidden_size, num_labels)
        for i, layer in enumerate(self.bert.encoder.layer):
            self.bert.encoder.layer[i].attention.self = LoRAAttention(layer.attention.self, rank=rank)

    def forward(self, input_ids, attention_mask=None, token_type_ids=None):
        outputs = self.bert(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        sequence_output = outputs.last_hidden_state
        logits = self.classifier(sequence_output)
        return logits

In [73]:
model = CustomNERModel(model_name, num_labels)

In [74]:
optimizer = AdamW(model.parameters(), lr=5e-5)
loss_function = nn.CrossEntropyLoss()

In [75]:
num_epochs = 30
device = torch.device("cuda" if torch.cuda.is_available() else torch.device("cpu"))
model.to(device)

model.train()
for epoch in range(num_epochs):
    total_loss = 0
    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}")
    for batch in progress_bar:
        inputs = {k: v.to(device) for k, v in batch.items() if k != 'labels'}
        labels = batch['labels'].to(device)
        optimizer.zero_grad()
        outputs = model(**inputs)
        logits = outputs
        loss = loss_function(logits.view(-1, num_labels), labels.view(-1))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        progress_bar.set_postfix(loss=f"{loss.item():.4f}")

    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1}/{num_epochs}, Average Loss: {avg_loss:.4f}")

Epoch 1/30:   0%|          | 0/201 [00:00<?, ?it/s]

TypeError: LoRAAttention.forward() takes from 2 to 3 positional arguments but 8 were given