In [1]:
from pathlib import Path
import numpy as np
import os,sys
import pandas as pd
import os
import skseq
from skseq.sequences.extended_feature import ExtendedFeatures
import utils
import skseq.sequences.structured_perceptron as spc

# Number of epochs handed to StructuredPerceptron.fit in the cells below.
N_EPOCHS = 15

# Make the parent of the working directory importable by putting it at the
# front of the module search path.
currentdir = Path.cwd()
parentdir = str(currentdir.parent)
sys.path.insert(0, parentdir)


# Structured Perceptron

We need one sequence per sentence: x holds the words of the sentence as strings, and a second vector holds the corresponding tags.

In [2]:
# Load the three CSV splits; utils.gen_set returns a pair that is unpacked as (X, y).
# NOTE(review): presumably X holds the words of each sentence and y the matching
# tags — confirm against utils.gen_set.
X_tr, y_tr = utils.gen_set("./data/train_data_ner.csv")
X_test, y_test = utils.gen_set("./data/test_data_ner.csv")
X_tiny, y_tiny = utils.gen_set("./data/tiny_test.csv")

To create a sequence list, first we need a dictionary for the words and another for the tags

In [3]:
# The dictionaries are built from the training split only.
# NOTE(review): rev_dict is presumably a reverse lookup for one of the mappings —
# confirm against utils.dictionary.
word_dict, tag_dict, rev_dict = utils.dictionary(X_tr, y_tr)

We use the class SequenceList from the skseq package used in the code provided in class. This package also includes a dictionary class called LabelDictionary, which defines some useful functions needed for creating the sequence, so we need to transform our dictionaries into LabelDictionary objects.

In [4]:
# Build the training sequence object from the two dictionaries and the training split.
seq = utils.get_seq(word_dict, tag_dict, X_tr, y_tr)

## Base Model

In [5]:
# Base feature set: skseq's IDFeatures mapper over the training sequences.
# NOTE(review): only `import skseq` is visible in this notebook; the id_feature
# submodule is presumably loaded as a side effect of the extended_feature import — confirm.
feature_mapper = skseq.sequences.id_feature.IDFeatures(seq)

In [6]:
# Build the mapper's features before training; fit() below is given feature_mapper.dataset.
feature_mapper.build_features()

In [7]:
sp = spc.StructuredPerceptron(word_dict, tag_dict, feature_mapper)
# Keep the attribute in step with the epoch count actually passed to fit() below;
# it used to be hard-coded to 5 while the run trained for N_EPOCHS epochs.
sp.num_epochs = N_EPOCHS
# NOTE(review): this decodes the corpus before any training has happened, and
# pred_train is not read again in the visible cells — confirm the call is needed.
pred_train = sp.viterbi_decode_corpus(seq)
sp.fit(feature_mapper.dataset, N_EPOCHS)

Epoch: 0 Accuracy: 0.893522
Epoch: 1 Accuracy: 0.931903
Epoch: 2 Accuracy: 0.941308
Epoch: 3 Accuracy: 0.946066
Epoch: 4 Accuracy: 0.949996
Epoch: 5 Accuracy: 0.952464
Epoch: 6 Accuracy: 0.954540
Epoch: 7 Accuracy: 0.956122
Epoch: 8 Accuracy: 0.957765
Epoch: 9 Accuracy: 0.957984
Epoch: 10 Accuracy: 0.959716
Epoch: 11 Accuracy: 0.959939
Epoch: 12 Accuracy: 0.960999
Epoch: 13 Accuracy: 0.961453
Epoch: 14 Accuracy: 0.962112


In [None]:
# Persist the fitted base model; "./fitted_models/sp_base_" is passed as the path argument.
# NOTE(review): presumably the ./fitted_models directory must already exist — confirm.
sp.save_model("./fitted_models/sp_base_")

## Extended Features

In [None]:
# Extended feature set: same training sequences, but mapped with ExtendedFeatures
# instead of the base IDFeatures mapper.
feature_mapper_extra = ExtendedFeatures(seq)
feature_mapper_extra.build_features()

In [None]:
# This rebinds `sp`, replacing the base-model perceptron created earlier in the notebook.
sp = spc.StructuredPerceptron(word_dict, tag_dict, feature_mapper_extra)
# Keep the attribute consistent with the N_EPOCHS value that the later fit() call
# receives; it used to be hard-coded to 5.
sp.num_epochs = N_EPOCHS

In [None]:
# Decode the training corpus with the not-yet-fitted model; the result is discarded.
# NOTE(review): it is not visible here why this call precedes fit() — confirm it is needed.
_ = sp.viterbi_decode_corpus(seq)

In [None]:
# Train on the extended-feature dataset for N_EPOCHS epochs (one accuracy line is printed per epoch).
sp.fit(feature_mapper_extra.dataset, N_EPOCHS)

Epoch: 0 Accuracy: 0.928059
Epoch: 1 Accuracy: 0.943572
Epoch: 2 Accuracy: 0.947625
Epoch: 3 Accuracy: 0.949844
Epoch: 4 Accuracy: 0.951875
Epoch: 5 Accuracy: 0.953769
Epoch: 6 Accuracy: 0.954850
Epoch: 7 Accuracy: 0.955497
Epoch: 8 Accuracy: 0.956710
Epoch: 9 Accuracy: 0.957138
Epoch: 10 Accuracy: 0.957752
Epoch: 11 Accuracy: 0.958760
Epoch: 12 Accuracy: 0.958912
Epoch: 13 Accuracy: 0.959638
Epoch: 14 Accuracy: 0.960259


In [None]:
# Persist the fitted extended-feature model; "./fitted_models/sp_ext_" is passed as the path argument.
# NOTE(review): presumably the ./fitted_models directory must already exist — confirm.
sp.save_model("./fitted_models/sp_ext_")

# Deep Learning

### BERT fine-tuning

In [None]:
!pip install torcheval peft

In [1]:
import torcheval.metrics
from utils import parse_dataset, Params, NERDataset, get_evaluation_pred_and_label

In [None]:
base_folder = './data/'
training = 'train_data_ner.csv'
test = 'test_data_ner.csv'
tiny = 'tiny_test.csv'

full_train, full_train_label = parse_dataset(base_folder + training)

# The first 90% of the training sentences are used for training, the rest for validation.
len_train = int(len(full_train) * 0.9)

train_sent, train_label = full_train[:len_train], full_train_label[:len_train]
# The validation slice starts exactly where the training slice ends. The previous
# `len_train + 1` start dropped the sentence at index len_train from both splits.
valid_sent, valid_label = full_train[len_train:], full_train_label[len_train:]
assert len(train_sent) + len(valid_sent) == len(full_train), "train/valid split lost sentences"

test_sent, test_label = parse_dataset(base_folder + test)
tiny_sent, tiny_label = parse_dataset(base_folder + tiny)

In [None]:
# Flatten the per-sentence tag lists of the train, test and tiny splits into one
# list, then display the distinct tags (the validation labels are not included).
organized_list = [
    tag
    for inner_list in train_label + test_label + tiny_label
    for tag in inner_list
]
set(organized_list)

In [None]:
# Hyper-parameters of the full fine-tuning run. They are wrapped in Params and
# read as attributes below (params.batch_size, params.lr, ...).
# NOTE(review): 'embeddings_dim' is not read anywhere in the visible cells.
PARAMS = dict(
    batch_size=64,
    epochs=10,
    lr=3e-5,
    shuffle=False,
    weight_decay=1e-4,
    embeddings_dim=1024,
)

params = Params(PARAMS)

# Tag -> integer class id; ids follow the order of the list (B-* tags, I-* tags, then 'O').
label2id = {
    tag: idx
    for idx, tag in enumerate([
        'B-art', 'B-eve', 'B-geo', 'B-gpe', 'B-nat', 'B-org', 'B-per', 'B-tim',
        'I-art', 'I-eve', 'I-geo', 'I-gpe', 'I-nat', 'I-org', 'I-per', 'I-tim',
        'O',
    ])
}

In [None]:
from tqdm import tqdm
import torch
import torch.utils.data as data


# Wrap each (sentences, labels) split in a NERDataset.
# NOTE(review): items are presumably (sentence, label-tensor) pairs, since collate_fn
# below unpacks every item into two values — confirm against utils.NERDataset.
train_dataset = NERDataset(train_sent, train_label)
valid_dataset = NERDataset(valid_sent, valid_label)
test_dataset = NERDataset(test_sent, test_label)
tiny_dataset = NERDataset(tiny_sent, tiny_label)

def collate_fn(list_items):
    """Regroup a batch of two-element items into two parallel lists.

    Args:
        list_items: iterable of items, each unpackable into exactly two values.

    Returns:
        A tuple ``(x, y)`` where ``x`` is the list of first elements and ``y``
        the list of second elements, both in input order. Nothing is padded or
        stacked; the elements are passed through unchanged.
    """
    pairs = [(first, second) for first, second in list_items]
    x = [pair[0] for pair in pairs]
    y = [pair[1] for pair in pairs]
    return x, y

# All four loaders share the same batching options; only the dataset differs.
# NOTE(review): PARAMS sets 'shuffle' to False, so the training loader is not shuffled either.
_loader_kwargs = dict(
    batch_size=params.batch_size,
    shuffle=params.shuffle,
    collate_fn=collate_fn,
)

train_loader = data.DataLoader(train_dataset, **_loader_kwargs)
valid_loader = data.DataLoader(valid_dataset, **_loader_kwargs)
test_loader = data.DataLoader(test_dataset, **_loader_kwargs)
tiny_loader = data.DataLoader(tiny_dataset, **_loader_kwargs)

In [None]:
from collections import defaultdict

# Count how many training tokens carry each label id.
# NOTE(review): label_count is not read again in the visible cells.
label_count = defaultdict(lambda: 0)
for sentence_tags in train_label:
    for tag in sentence_tags:
        label_id = label2id[tag]
        label_count[label_id] = label_count[label_id] + 1

In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline
from peft import get_peft_model, LoraConfig, TaskType
import torch
from torch.optim import AdamW
from tqdm import tqdm


# Pre-trained BERT with a fresh token-classification head, one output per entry of label2id.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModelForTokenClassification.from_pretrained("bert-base-uncased", num_labels=len(label2id), ignore_mismatched_sizes=True)

# Pass the configured weight decay explicitly. It was declared in PARAMS but never
# handed to the optimizer, so AdamW's own default was silently used instead.
optimizer = torch.optim.AdamW(model.parameters(), lr=params.lr, weight_decay=PARAMS['weight_decay'])

# NOTE(review): `weights` and `history` are not read in the visible cells; they are
# kept in case later cells rely on them.
weights = torch.ones(len(label2id))
loss_fn = torch.nn.CrossEntropyLoss()
f1_func = torcheval.metrics.functional.multiclass_f1_score
acc_func = torcheval.metrics.functional.multiclass_accuracy

history = {}

In [None]:
import torch

# Train on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"


model.to(device)
best_val_loss = float('inf')

for epoch in range(params.epochs):
    running_loss = 0.0
    # The validation phase below leaves the model in eval mode, so switch back every epoch.
    model.train()

    # Wrap the loader itself (not the enumerate object) so tqdm can show a total.
    for batch_idx, (sentences, target) in enumerate(tqdm(train_loader)):
        optimizer.zero_grad()

        sentences_splited_into_words = [sentence.split(" ") for sentence in sentences]
        tokens = tokenizer.batch_encode_plus(sentences_splited_into_words, padding=True, return_tensors='pt', truncation=True, is_split_into_words=True)
        tokens = tokens.to(device)
        outputs = model(**tokens)

        # The batch loss is the sum of the per-sentence cross-entropy values.
        loss = 0.0
        for bb in range(outputs.logits.shape[0]):
            ob = outputs.logits[bb]
            # word_ids(bb) has one entry per token position: None for special and
            # padding tokens, otherwise the index of the word the token belongs to.
            token_word_ids = tokens.word_ids(bb)
            # Rows of the logits are selected by TOKEN POSITION. Indexing them by word id
            # (as before) paired word k with the logits at position k, which is shifted
            # by the leading special token and by every word split into several sub-tokens.
            # NOTE(review): any inference/evaluation helper must select logits the same way.
            token_positions = torch.tensor(
                [pos for pos, word_id in enumerate(token_word_ids) if word_id is not None],
                dtype=torch.long, device=device)
            # Labels are selected by WORD ID, so every sub-token inherits its word's label.
            # Assumes target[bb] is a 1-D tensor of label ids, one per word — TODO confirm in NERDataset.
            words_ids = torch.tensor(
                [word_id for word_id in token_word_ids if word_id is not None],
                dtype=torch.long, device=device)
            predicted_class = ob[token_positions]
            real_class = target[bb].to(device)[words_ids]
            loss += loss_fn(predicted_class, real_class)

        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    print("    Train Loss: ", running_loss / len(train_loader))

    ######################
    model.eval()
    val_loss = 0.0
    val_acc = 0.0
    val_f1 = 0.0
    with torch.no_grad():
        for batch_idx, (val_sentences, val_target) in enumerate(tqdm(valid_loader)):
            val_sentences_splited_into_words = [sentence.split(" ") for sentence in val_sentences]
            val_tokens = tokenizer.batch_encode_plus(val_sentences_splited_into_words, padding=True, return_tensors='pt', truncation=True, is_split_into_words=True)
            val_tokens = val_tokens.to(device)
            val_outputs = model(**val_tokens)

            val_loss_batch = 0.0
            # Collect the logits/labels of every sentence so the metrics cover the whole
            # batch; previously each sentence overwrote the metric of the one before it,
            # so only the last sentence of a batch was ever scored.
            batch_logits = []
            batch_labels = []
            for bb in range(val_outputs.logits.shape[0]):
                ob = val_outputs.logits[bb]
                token_word_ids = val_tokens.word_ids(bb)
                # Same alignment as in training: logits by token position, labels by word id.
                token_positions = torch.tensor(
                    [pos for pos, word_id in enumerate(token_word_ids) if word_id is not None],
                    dtype=torch.long, device=device)
                words_ids = torch.tensor(
                    [word_id for word_id in token_word_ids if word_id is not None],
                    dtype=torch.long, device=device)
                predicted_class = ob[token_positions]
                real_class = val_target[bb].to(device)[words_ids]

                val_loss_batch = val_loss_batch + loss_fn(predicted_class, real_class)
                batch_logits.append(predicted_class)
                batch_labels.append(real_class)

            # Token-level metrics over all sentences of the batch.
            all_logits = torch.cat(batch_logits)
            all_labels = torch.cat(batch_labels)
            val_acc_batch = acc_func(all_logits, all_labels)
            val_f1_batch = f1_func(all_logits, all_labels)

            val_loss += val_loss_batch.item()
            val_acc += val_acc_batch.item()
            val_f1 += val_f1_batch.item()

    # Averages are taken over validation batches.
    avg_val_loss = val_loss / len(valid_loader)
    avg_val_acc = val_acc / len(valid_loader)
    avg_val_f1 = val_f1 / len(valid_loader)
    print(f"    Validation Loss: {avg_val_loss}, acc: {avg_val_acc}, f1: {avg_val_f1}")

    # Keep the weights of the epoch with the lowest validation loss.
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        torch.save(model.state_dict(), 'best_model.pt')
        print(f"    Best model saved with Validation Loss: {best_val_loss}")

### BERT + LoRA fine-tuning

In [None]:
import torcheval.metrics
from utils import parse_dataset, Params, NERDataset, get_evaluation_pred_and_label

In [None]:
base_folder = './data/'
training = 'train_data_ner.csv'
test = 'test_data_ner.csv'
tiny = 'tiny_test.csv'

full_train, full_train_label = parse_dataset(base_folder + training)

# The first 90% of the training sentences are used for training, the rest for validation.
len_train = int(len(full_train) * 0.9)

train_sent, train_label = full_train[:len_train], full_train_label[:len_train]
# The validation slice starts exactly where the training slice ends. The previous
# `len_train + 1` start dropped the sentence at index len_train from both splits.
valid_sent, valid_label = full_train[len_train:], full_train_label[len_train:]
assert len(train_sent) + len(valid_sent) == len(full_train), "train/valid split lost sentences"

test_sent, test_label = parse_dataset(base_folder + test)
tiny_sent, tiny_label = parse_dataset(base_folder + tiny)

In [None]:
# Hyper-parameters of the LoRA run. They are wrapped in Params and read as
# attributes below (params.batch_size, params.lr, ...).
# NOTE(review): 'embeddings_dim' is not read anywhere in the visible cells.
PARAMS = dict(
    batch_size=64,
    epochs=80,
    lr=5e-5,
    shuffle=False,
    weight_decay=1e-4,
    embeddings_dim=1024,
)

params = Params(PARAMS)

# Tag -> integer class id; ids follow the order of the list (B-* tags, I-* tags, then 'O').
label2id = {
    tag: idx
    for idx, tag in enumerate([
        'B-art', 'B-eve', 'B-geo', 'B-gpe', 'B-nat', 'B-org', 'B-per', 'B-tim',
        'I-art', 'I-eve', 'I-geo', 'I-gpe', 'I-nat', 'I-org', 'I-per', 'I-tim',
        'O',
    ])
}

In [None]:
from tqdm import tqdm
import torch
import torch.utils.data as data


# Wrap each (sentences, labels) split in a NERDataset.
# NOTE(review): items are presumably (sentence, label-tensor) pairs, since collate_fn
# below unpacks every item into two values — confirm against utils.NERDataset.
train_dataset = NERDataset(train_sent, train_label)
valid_dataset = NERDataset(valid_sent, valid_label)
test_dataset = NERDataset(test_sent, test_label)
tiny_dataset = NERDataset(tiny_sent, tiny_label)

# NOTE(review): this re-defines collate_fn from the BERT fine-tuning section with the
# same behavior; consider defining it once.
def collate_fn(list_items):
    """Regroup a batch of two-element items into two parallel lists.

    Args:
        list_items: iterable of items, each unpackable into exactly two values.

    Returns:
        A tuple ``(x, y)`` where ``x`` is the list of first elements and ``y``
        the list of second elements, both in input order. Nothing is padded or
        stacked; the elements are passed through unchanged.
    """
    pairs = [(first, second) for first, second in list_items]
    x = [pair[0] for pair in pairs]
    y = [pair[1] for pair in pairs]
    return x, y

# All four loaders share the same batching options; only the dataset differs.
# NOTE(review): PARAMS sets 'shuffle' to False, so the training loader is not shuffled either.
_loader_kwargs = dict(
    batch_size=params.batch_size,
    shuffle=params.shuffle,
    collate_fn=collate_fn,
)

train_loader = data.DataLoader(train_dataset, **_loader_kwargs)
valid_loader = data.DataLoader(valid_dataset, **_loader_kwargs)
test_loader = data.DataLoader(test_dataset, **_loader_kwargs)
tiny_loader = data.DataLoader(tiny_dataset, **_loader_kwargs)

In [None]:
from collections import defaultdict

# Count how many training tokens carry each label id.
# NOTE(review): label_count is not read again in the visible cells.
label_count = defaultdict(lambda: 0)
for sentence_tags in train_label:
    for tag in sentence_tags:
        label_id = label2id[tag]
        label_count[label_id] = label_count[label_id] + 1

In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline
from peft import get_peft_model, LoraConfig, TaskType
import torch
from torch.optim import AdamW
from tqdm import tqdm


# Pre-trained BERT with a fresh token-classification head, one output per entry of label2id.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModelForTokenClassification.from_pretrained("bert-base-uncased", num_labels=len(label2id), ignore_mismatched_sizes=True)

# LoRA adapter configuration for token classification (rank 8, alpha 32, dropout 0.1),
# created in training mode (inference_mode=False).
lora_config = LoraConfig(
    task_type=TaskType.TOKEN_CLS,
    inference_mode=False,
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
)

# Wrap the base model with the adapters; from here on `model` is the PEFT model.
model = get_peft_model(model, lora_config)

# Pass the configured weight decay explicitly. It was declared in PARAMS but never
# handed to the optimizer, so AdamW's own default was silently used instead.
optimizer = torch.optim.AdamW(model.parameters(), lr=params.lr, weight_decay=PARAMS['weight_decay'])

# NOTE(review): `weights` and `history` are not read in the visible cells; they are
# kept in case later cells rely on them.
weights = torch.ones(len(label2id))
loss_fn = torch.nn.CrossEntropyLoss()
f1_func = torcheval.metrics.functional.multiclass_f1_score
acc_func = torcheval.metrics.functional.multiclass_accuracy

history = {}

In [None]:
import torch

# Train on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"


model.to(device)
best_val_loss = float('inf')

for epoch in range(params.epochs):
    running_loss = 0.0
    # The validation phase below leaves the model in eval mode, so switch back every epoch.
    model.train()

    # Wrap the loader itself (not the enumerate object) so tqdm can show a total.
    for batch_idx, (sentences, target) in enumerate(tqdm(train_loader)):
        optimizer.zero_grad()

        sentences_splited_into_words = [sentence.split(" ") for sentence in sentences]
        tokens = tokenizer.batch_encode_plus(sentences_splited_into_words, padding=True, return_tensors='pt', truncation=True, is_split_into_words=True)
        tokens = tokens.to(device)
        outputs = model(**tokens)

        # The batch loss is the sum of the per-sentence cross-entropy values.
        loss = 0.0
        for bb in range(outputs.logits.shape[0]):
            ob = outputs.logits[bb]
            # word_ids(bb) has one entry per token position: None for special and
            # padding tokens, otherwise the index of the word the token belongs to.
            token_word_ids = tokens.word_ids(bb)
            # Rows of the logits are selected by TOKEN POSITION. Indexing them by word id
            # (as before) paired word k with the logits at position k, which is shifted
            # by the leading special token and by every word split into several sub-tokens.
            # NOTE(review): any inference/evaluation helper must select logits the same way.
            token_positions = torch.tensor(
                [pos for pos, word_id in enumerate(token_word_ids) if word_id is not None],
                dtype=torch.long, device=device)
            # Labels are selected by WORD ID, so every sub-token inherits its word's label.
            # Assumes target[bb] is a 1-D tensor of label ids, one per word — TODO confirm in NERDataset.
            words_ids = torch.tensor(
                [word_id for word_id in token_word_ids if word_id is not None],
                dtype=torch.long, device=device)
            predicted_class = ob[token_positions]
            real_class = target[bb].to(device)[words_ids]
            loss += loss_fn(predicted_class, real_class)

        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    print("    Train Loss: ", running_loss / len(train_loader))

    ######################
    model.eval()
    val_loss = 0.0
    val_acc = 0.0
    val_f1 = 0.0
    with torch.no_grad():
        for batch_idx, (val_sentences, val_target) in enumerate(tqdm(valid_loader)):
            val_sentences_splited_into_words = [sentence.split(" ") for sentence in val_sentences]
            val_tokens = tokenizer.batch_encode_plus(val_sentences_splited_into_words, padding=True, return_tensors='pt', truncation=True, is_split_into_words=True)
            val_tokens = val_tokens.to(device)
            val_outputs = model(**val_tokens)

            val_loss_batch = 0.0
            # Collect the logits/labels of every sentence so the metrics cover the whole
            # batch; previously each sentence overwrote the metric of the one before it,
            # so only the last sentence of a batch was ever scored.
            batch_logits = []
            batch_labels = []
            for bb in range(val_outputs.logits.shape[0]):
                ob = val_outputs.logits[bb]
                token_word_ids = val_tokens.word_ids(bb)
                # Same alignment as in training: logits by token position, labels by word id.
                token_positions = torch.tensor(
                    [pos for pos, word_id in enumerate(token_word_ids) if word_id is not None],
                    dtype=torch.long, device=device)
                words_ids = torch.tensor(
                    [word_id for word_id in token_word_ids if word_id is not None],
                    dtype=torch.long, device=device)
                predicted_class = ob[token_positions]
                real_class = val_target[bb].to(device)[words_ids]

                val_loss_batch = val_loss_batch + loss_fn(predicted_class, real_class)
                batch_logits.append(predicted_class)
                batch_labels.append(real_class)

            # Token-level metrics over all sentences of the batch.
            all_logits = torch.cat(batch_logits)
            all_labels = torch.cat(batch_labels)
            val_acc_batch = acc_func(all_logits, all_labels)
            val_f1_batch = f1_func(all_logits, all_labels)

            val_loss += val_loss_batch.item()
            val_acc += val_acc_batch.item()
            val_f1 += val_f1_batch.item()

    # Averages are taken over validation batches.
    avg_val_loss = val_loss / len(valid_loader)
    avg_val_acc = val_acc / len(valid_loader)
    avg_val_f1 = val_f1 / len(valid_loader)
    print(f"    Validation Loss: {avg_val_loss}, acc: {avg_val_acc}, f1: {avg_val_f1}")

    # Save the best model (lowest validation loss so far).
    # NOTE(review): the full fine-tuning section of this notebook writes to the same
    # 'best_model.pt', so running both sections overwrites the earlier checkpoint.
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        torch.save(model.state_dict(), 'best_model.pt')
        print(f"    Best model saved with Validation Loss: {best_val_loss}")