In [1]:
import torch
import json
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModel
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
import torch.nn as nn
from collections import Counter
from transformers import AutoTokenizer
from torch.utils.data import DataLoader
from torch.optim import AdamW
from tqdm.auto import tqdm
from torch.nn import CrossEntropyLoss
from sklearn.metrics import precision_recall_fscore_support
from torch.nn.functional import softmax
import numpy as np
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from seqeval.metrics import precision_score, recall_score, f1_score, classification_report
from transformers import DataCollatorForTokenClassification
import numpy as np
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer
import evaluate
from transformers import TrainerCallback, AdamW, get_cosine_schedule_with_warmup

In [2]:
from datasets import load_dataset
from transformers import (
    AutoModelForTokenClassification,
    AutoTokenizer,
    DataCollatorForTokenClassification,
    TrainingArguments,
    Trainer,
)
from peft import get_peft_config, PeftModel, PeftConfig, get_peft_model, LoraConfig, TaskType
import evaluate
import torch
import numpy as np



In [3]:
# Split configuration: hold out 15% of the clauses, with a fixed seed so the
# train/test partition is the same on every run.
data_files = '../Data/biloc_tagged_clauses.json'
test_size = 0.15
random_seed = 42

# The records are read from the top-level "data" field of the JSON file.
# load_dataset exposes them as a single "train" split, which is then divided
# into the train/test DatasetDict used by the rest of the notebook.
datasets = load_dataset('json', data_files=data_files, field='data')['train'].train_test_split(
    test_size=test_size, seed=random_seed
)
print(datasets)

DatasetDict({
    train: Dataset({
        features: ['ner_tags', 'id', 'split_tokens'],
        num_rows: 401
    })
    test: Dataset({
        features: ['ner_tags', 'id', 'split_tokens'],
        num_rows: 71
    })
})


In [4]:
# Base checkpoint shared by the tokenizer and the token-classification model.
model_name = "bert-base-uncased"
# NOTE(review): the three values below are not referenced by the training cells
# further down; the TrainingArguments cell passes its own literals
# (learning_rate=5e-4, batch size 16, num_train_epochs=100).
lr = 1e-3
batch_size = 16
num_epochs = 10

In [5]:
# Load the seqeval metric through `evaluate`; compute_metrics calls its .compute().
seqeval = evaluate.load("seqeval")

In [6]:
# Read the label inventory that id2label / label2id are built from.
# JSON text is UTF-8 by specification, so the encoding is pinned explicitly
# instead of relying on the platform's locale-dependent default.
with open('../Data/feature_class_labels.json', 'r', encoding='utf-8') as f:
    label_list = json.load(f)

In [18]:
def compute_metrics(p):
    """Score token-level predictions with the seqeval metric.

    Args:
        p: A ``(predictions, labels)`` pair. ``predictions`` holds per-class
            scores and is reduced with an argmax over axis 2; ``labels`` holds
            the gold label ids, with ``-100`` marking positions to ignore.

    Returns:
        dict with the overall ``precision``, ``recall``, ``f1`` and
        ``accuracy`` reported by ``seqeval.compute``.
    """
    scores, gold_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    true_predictions = []
    true_labels = []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        kept_predictions = []
        kept_labels = []
        for predicted_id, gold_id in zip(predicted_row, gold_row):
            # -100 marks positions that carry no label (see
            # tokenize_and_align_labels); they are dropped from both sides.
            if gold_id == -100:
                continue
            kept_predictions.append(id2label[predicted_id])
            kept_labels.append(id2label[gold_id])
        true_predictions.append(kept_predictions)
        true_labels.append(kept_labels)

    # zero_division=0 is forwarded to the metric unchanged.
    results = seqeval.compute(predictions=true_predictions, references=true_labels, zero_division=0)

    precision = results["overall_precision"]
    recall = results["overall_recall"]
    f1 = results["overall_f1"]
    accuracy = results["overall_accuracy"]
    return {
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "accuracy": accuracy,
    }
    


In [19]:
# Tokenizer matching the checkpoint named by model_name.
# NOTE(review): add_prefix_space is usually associated with byte-level BPE
# tokenizers (e.g. RoBERTa) when feeding pre-split words; presumably it has no
# effect for bert-base-uncased — confirm before relying on it.
tokenizer = AutoTokenizer.from_pretrained(model_name, add_prefix_space=True)

In [20]:
def tokenize_and_align_labels(examples):
    """Tokenize pre-split words and align word-level tags to sub-word tokens.

    Args:
        examples: A batch with ``"split_tokens"`` (one list of words per
            example) and ``"ner_tags"`` (one list of label ids per example,
            indexed by word position).

    Returns:
        The tokenizer output for the batch with an added ``"labels"`` entry:
        one list per example holding the word's label on the first token of
        each word and ``-100`` on every other position.
    """
    tokenized_inputs = tokenizer(examples["split_tokens"], truncation=True, is_split_into_words=True)

    labels = []
    for i, word_labels in enumerate(examples["ner_tags"]):
        # word_ids maps each token to the index of the word it came from
        # (None for tokens that belong to no word, e.g. special tokens).
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None or word_idx == previous_word_idx:
                # Special tokens and continuation pieces of a word get -100,
                # the value compute_metrics filters out.
                label_ids.append(-100)
            else:
                # First token of a new word carries that word's label.
                label_ids.append(word_labels[word_idx])
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [21]:
# Apply tokenization + label alignment to every split. batched=True hands the
# function a batch of examples per call, which is why it indexes the batch via
# word_ids(batch_index=i).
tokenized_dataset = datasets.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/71 [00:00<?, ? examples/s]

In [38]:
# Label maps derived from the position of each label in label_list:
# id2label goes index -> label name, label2id is the inverse lookup.
id2label = dict(enumerate(label_list))

label2id = {name: index for index, name in enumerate(label_list)}

In [39]:
# Load the base checkpoint with a token-classification head.
# The head size is derived from the label map instead of a hard-coded count, so
# it cannot drift out of sync with the label list id2label was built from.
model = AutoModelForTokenClassification.from_pretrained(
    model_name, num_labels=len(id2label), id2label=id2label, label2id=label2id
)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [40]:
# Collator handed to the Trainer to assemble batches of tokenized examples.
# Presumably pads inputs and labels to a common length per batch — confirm
# against the transformers documentation.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [41]:
# LoRA adapter settings for the token-classification model (field meanings per
# the PEFT LoraConfig reference).
peft_config = LoraConfig(
    task_type=TaskType.TOKEN_CLS,
    inference_mode=False,  # the adapter is going to be trained
    r=16,                  # rank of the low-rank update matrices
    lora_alpha=16,         # scaling factor applied to the update
    lora_dropout=0.1,      # dropout on the LoRA layers
    bias="all",            # bias handling mode; see the PEFT docs for the caveats of "all"
)

In [42]:
# Wrap the base model with the LoRA adapters described by peft_config.
# NOTE: this rebinds `model`, so re-running this cell without first re-running
# the model-loading cell would wrap an already-wrapped model.
model = get_peft_model(model, peft_config)

In [43]:
# Settings for the Hugging Face Trainer run.
# NOTE(review): lr / batch_size / num_epochs from the configuration cell are
# not used here; the literals below are what the run actually receives.
training_args = TrainingArguments(
    output_dir="my_awesome_wnut_model",
    # --- optimisation ---
    num_train_epochs=100,
    learning_rate=5e-4,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # --- evaluation / checkpointing ---
    evaluation_strategy="epoch",
    save_strategy="no",
    load_best_model_at_end=False,
    push_to_hub=False,
    # --- logging ---
    logging_dir="./logs",
    logging_steps=10,
)

In [44]:
# Wire the model, tokenized splits, collator and metric function into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    # The held-out "test" split doubles as the evaluation set during training.
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Kept as the cell's last expression so the returned TrainOutput is displayed.
trainer.train()

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,1.292123,0.0,0.0,0.0,0.869968
2,3.355900,0.840726,0.0,0.0,0.0,0.869968
3,0.864100,0.825046,0.0,0.0,0.0,0.869968
4,0.809400,0.815332,0.0,0.0,0.0,0.869968
5,0.746000,0.795781,0.0,0.0,0.0,0.869968
6,0.743500,0.766719,0.0,0.0,0.0,0.869968
7,0.751600,0.722222,0.0,0.0,0.0,0.869968
8,0.648800,0.649634,0.0,0.0,0.0,0.870167
9,0.617800,0.599668,0.037037,0.009604,0.015253,0.873827
10,0.509500,0.531869,0.299546,0.237695,0.26506,0.881782


TrainOutput(global_step=900, training_loss=0.24910887817541758, metrics={'train_runtime': 606.5957, 'train_samples_per_second': 66.107, 'train_steps_per_second': 1.484, 'total_flos': 1.05817304659968e+16, 'train_loss': 0.24910887817541758, 'epoch': 100.0})

In [45]:
# Evaluate on the eval_dataset given to the Trainer (the "test" split); the
# returned metrics dict is displayed as the cell output.
trainer.evaluate()

{'eval_loss': 0.45355498790740967,
 'eval_precision': 0.48389830508474574,
 'eval_recall': 0.6854741896758704,
 'eval_f1': 0.5673124689518132,
 'eval_accuracy': 0.9068814638027048,
 'eval_runtime': 0.9781,
 'eval_samples_per_second': 72.59,
 'eval_steps_per_second': 2.045,
 'epoch': 100.0}

In [46]:
# Model size summary: parameters that receive gradients vs. frozen ones.
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
non_trainable_params = sum(p.numel() for p in model.parameters() if not p.requires_grad)
# Every parameter falls in exactly one of the two groups above.
total_params = trainable_params + non_trainable_params

print(f"Total Parameters: {total_params}")
print(f"Trainable Parameters: {trainable_params}")
print(f"Non-trainable Parameters: {non_trainable_params}")

Total Parameters: 109735242
Trainable Parameters: 818853
Non-trainable Parameters: 108916389
