# Group 8 - main code

### Installations needed:  
torch, transformers, evaluate

In [None]:
#imports
import torch
from torch.utils.data import DataLoader, Dataset, ConcatDataset
from transformers import RobertaTokenizerFast, AutoModelForTokenClassification, AutoConfig, DataCollatorForTokenClassification, get_scheduler
from torch.cuda.amp import autocast, GradScaler
from torch.optim import AdamW
import torch.nn.functional as F
from tqdm import tqdm
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import numpy as np
import pandas as pd
from evaluate import load

## Baseline model

### INPUT:
 
#### English Web Treebank sets 
- en_ewt-ud-train.iob2
- en_ewt-ud-dev.iob2
- en_ewt-ud-test-masked.iob2

in the repository, you can find the files here:    
/NLP-spring-2025/datasets_orginal    
  
  
### OUTPUT:
#### Baseline model:
- baseline_model  
recommended path:  
  
/NLP-spring-2025/baseline

#### Prediction:  
- baseline_predictions.iob2  
  
recommended path:  
/NLP-spring-2025/baseline

In [None]:
# Hugging Face checkpoint used to initialise the baseline token classifier.
MODEL_NAME = "deepset/roberta-base-squad2"

# add_prefix_space=True is set because the datasets below feed pre-split words
# to the tokenizer (is_split_into_words=True).
tokenizer = RobertaTokenizerFast.from_pretrained(MODEL_NAME, add_prefix_space=True, use_fast=True)

def get_label_mappings(file_path):
    """Build label <-> index dictionaries from a tab-separated IOB2 file.

    Blank lines and lines starting with '#' are ignored. For every other
    line with more than two tab-separated columns, the third column is
    collected as a label.

    Args:
        file_path: Path to the IOB2 file (read as UTF-8).

    Returns:
        A ``(tag2idx, idx2tag)`` tuple. Labels are numbered from 0 in
        sorted order; ``idx2tag`` is the inverse of ``tag2idx``.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        stripped = (raw.strip() for raw in handle)
        rows = [row.split('\t') for row in stripped if row and not row.startswith('#')]

    ordered_labels = sorted({cols[2] for cols in rows if len(cols) > 2})
    tag2idx = dict(zip(ordered_labels, range(len(ordered_labels))))
    idx2tag = dict(enumerate(ordered_labels))
    return tag2idx, idx2tag

# ------------------------------------------------------------------------------------------------------


# Placeholder paths - replace with the real locations of the EWT files.
# Raw strings keep the backslashes literal: "\p" and "\e" are invalid escape
# sequences in a normal string literal and trigger a warning on current Python.
train_file = r"\path\en_ewt-ud-train.iob2"             #path to EWT training set
dev_file = r"\path\en_ewt-ud-dev.iob2"                 #path to EWT dev set
test_file = r"\path\en_ewt-ud-test-masked.iob2"        #path to EWT (masked) test set

# ------------------------------------------------------------------------------------------------------

# The label inventory is taken from the training file only.
tag2idx, idx2tag = get_label_mappings(train_file)

class NERDataset(Dataset):
    """Token-classification dataset read from a tab-separated IOB2 file.

    Attributes:
        sentences: list of word lists, one per sentence.
        labels: list of tag-string lists, parallel to ``sentences``.
        raw_data: list of the stripped, non-blank source lines of each
            sentence (comment lines included), used when writing predictions.
    """

    def __init__(self, file_path, tokenizer, tag2idx, max_len=128):
        self.sentences, self.labels, self.raw_data = self.load_data(file_path)
        self.tokenizer = tokenizer
        self.tag2idx = tag2idx
        self.max_len = max_len

    def load_data(self, file_path):
        """Parse ``file_path`` into ``(sentences, labels, raw_data)``.

        Sentences are separated by blank lines. Column 2 of a token line is
        the word and column 3 the tag; lines with fewer than three columns
        are kept in ``raw_data`` but contribute no token.
        """
        sentences, labels, raw_data = [], [], []
        words, tags, seen_lines = [], [], []

        with open(file_path, 'r', encoding='utf-8') as handle:
            for raw in handle:
                text = raw.strip()

                if not text:
                    # Blank line closes the current sentence (if it has tokens).
                    if words:
                        sentences.append(words)
                        labels.append(tags)
                        raw_data.append(seen_lines)
                    words, tags, seen_lines = [], [], []
                    continue

                seen_lines.append(text)
                if text.startswith('#'):
                    continue

                columns = text.split('\t')
                if len(columns) > 2:
                    words.append(columns[1])
                    tags.append(columns[2])

        # The file may not end with a blank line.
        if words:
            sentences.append(words)
            labels.append(tags)
            raw_data.append(seen_lines)

        return sentences, labels, raw_data

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        """Tokenize sentence ``idx`` and align its tags to sub-word tokens.

        Only the first sub-word of each word carries the word's label id;
        special tokens and continuation sub-words are set to -100.
        Raises ``KeyError`` if a tag is missing from ``tag2idx``.
        """
        words = self.sentences[idx]
        tag_ids = [self.tag2idx[name] for name in self.labels[idx]]

        encoded = self.tokenizer(
            words,
            is_split_into_words=True,
            truncation=True,
            max_length=self.max_len,
            padding=False
        )

        aligned = []
        previous = None
        for current in encoded.word_ids():
            if current is None or current == previous:
                aligned.append(-100)
            else:
                aligned.append(tag_ids[current])
            previous = current

        return {
            'input_ids': torch.tensor(encoded['input_ids']),
            'attention_mask': torch.tensor(encoded['attention_mask']),
            'labels': torch.tensor(aligned)
        }

# Datasets for the three EWT splits; all share the label mapping built from the training file.
train_dataset = NERDataset(train_file, tokenizer, tag2idx)
dev_dataset = NERDataset(dev_file, tokenizer, tag2idx)
test_dataset = NERDataset(test_file, tokenizer, tag2idx)

# Pads input_ids / attention_mask / labels to the longest sequence in each batch.
data_collator = DataCollatorForTokenClassification(tokenizer)

# NOTE(review): the dev split is concatenated into the training data, so the
# "validation" loss computed on dev_loader is measured on data the model has
# trained on. Confirm this is intended before using it for model selection.
full_train_dataset = ConcatDataset([train_dataset, dev_dataset])

config = AutoConfig.from_pretrained(
    MODEL_NAME,
    num_labels=len(tag2idx),
    id2label=idx2tag,
    label2id=tag2idx
)

model = AutoModelForTokenClassification.from_pretrained(
    MODEL_NAME,
    config=config
)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Hyperparameters
train_batch_size = 16
eval_batch_size = 32
learning_rate = 2e-5
weight_decay = 0.01
epochs = 5

optimizer = AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay)

train_loader = DataLoader(
    full_train_dataset, 
    batch_size=train_batch_size, 
    shuffle=True,
    collate_fn=data_collator
)

dev_loader = DataLoader(
    dev_dataset, 
    batch_size=eval_batch_size, 
    collate_fn=data_collator
)

test_loader = DataLoader(
    test_dataset, 
    batch_size=eval_batch_size, 
    collate_fn=data_collator
)

# Linear schedule with warm-up over the first 10% of all optimizer steps.
num_training_steps = len(train_loader) * epochs
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=int(0.1 * num_training_steps),
    num_training_steps=num_training_steps
)

# Loading the metric is best-effort: training can proceed without it.
# Catch Exception (not a bare except) so Ctrl-C / SystemExit still propagate,
# and report the real cause instead of guessing that a package is missing.
try:
    metric = load("seqeval")
except Exception as exc:
    metric = None
    print(f"Warning: could not load the 'seqeval' metric ({exc}). Will skip detailed metrics calculation.")

def pred2label(predictions, labels):
    """Convert per-token scores and gold label ids into tag-string sequences.

    Args:
        predictions: array of scores whose last axis is the label axis
            (argmax is taken over ``axis=-1``).
        labels: iterable of per-sequence label ids; positions equal to -100
            are skipped. Elements must support ``.item()``.

    Returns:
        ``(true_labels, pred_labels)`` - two parallel lists of tag-string
        lists, looked up through the module-level ``idx2tag``.
    """
    best_ids = np.argmax(predictions, axis=-1)

    true_labels, pred_labels = [], []
    for pred_row, gold_row in zip(best_ids, labels):
        # Keep only positions that carry a real label.
        kept = [(gold, pred_row[pos]) for pos, gold in enumerate(gold_row) if gold != -100]
        true_labels.append([idx2tag[gold.item()] for gold, _ in kept])
        pred_labels.append([idx2tag[pred] for _, pred in kept])

    return true_labels, pred_labels

def evaluate(model, dataloader, device):
    """Compute the mean loss and token-level accuracy of ``model`` on ``dataloader``.

    The function is self-contained: it does not depend on the module-level
    label mapping, and it keeps only the arg-max label id per token instead
    of the full logits.

    Args:
        model: callable taking the batch as keyword arguments and returning
            an object with ``.loss`` and ``.logits``; must provide ``eval()``.
        dataloader: sized iterable of dict batches of tensors; each batch
            must contain a ``"labels"`` entry.
        device: device the batch tensors are moved to.

    Returns:
        dict with ``"loss"`` (mean of the per-batch losses) and ``"accuracy"``
        (fraction of positions with label != -100 whose arg-max prediction
        equals the label).

    Raises:
        ValueError: if ``dataloader`` yields no batches.
    """
    model.eval()
    total_loss = 0.0
    pred_chunks = []
    label_chunks = []

    with torch.no_grad():
        for batch in dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}

            outputs = model(**batch)
            total_loss += outputs.loss.item()

            # Batches are padded to different lengths, so flatten each one
            # before concatenating across batches.
            pred_ids = outputs.logits.argmax(dim=-1)
            pred_chunks.append(pred_ids.detach().cpu().numpy().ravel())
            label_chunks.append(batch["labels"].detach().cpu().numpy().ravel())

    if not pred_chunks:
        raise ValueError("evaluate() received an empty dataloader")

    all_predictions = np.concatenate(pred_chunks)
    all_labels = np.concatenate(label_chunks)

    # -100 marks special tokens, continuation sub-words and padding.
    valid_indices = all_labels != -100
    accuracy = np.mean(all_predictions[valid_indices] == all_labels[valid_indices])

    return {
        "loss": total_loss / len(dataloader),
        "accuracy": accuracy
    }

# Mixed precision is only useful on CUDA. Enabling torch.cuda.amp on a CPU-only
# machine merely emits warnings and disables itself, so switch it on explicitly
# based on the device actually in use.
use_amp = device.type == "cuda"
scaler = GradScaler(enabled=use_amp)
best_loss = float('inf')
train_losses = []
val_losses = []

print(f"Starting training on {device}")
for epoch in range(epochs):
    model.train()
    total_loss = 0
    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}")
    
    for batch in progress_bar:
        batch = {k: v.to(device) for k, v in batch.items()}
        
        with autocast(enabled=use_amp):
            outputs = model(**batch)
            loss = outputs.loss
        
        # With AMP disabled the scaler is a pass-through around backward()/step().
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        lr_scheduler.step()
        optimizer.zero_grad()
        
        # Read the loss back once per step (each .item() forces a device sync).
        batch_loss = loss.item()
        total_loss += batch_loss
        progress_bar.set_postfix({"loss": batch_loss})
    
    avg_train_loss = total_loss / len(train_loader)
    train_losses.append(avg_train_loss)
    
    # evaluate() switches the model to eval mode; model.train() above restores it next epoch.
    eval_results = evaluate(model, dev_loader, device)
    val_losses.append(eval_results["loss"])
    
    print(f"Epoch {epoch+1}/{epochs} - Train loss: {avg_train_loss:.4f} - Val loss: {eval_results['loss']:.4f} - Val accuracy: {eval_results['accuracy']:.4f}")
    
    # Keep only the checkpoint with the lowest dev loss seen so far.
    if eval_results["loss"] < best_loss:
        best_loss = eval_results["loss"]
        model.save_pretrained("baseline_model")
        tokenizer.save_pretrained("baseline_model")
        print("Model saved!")

# Plot per-epoch training and validation loss and save the figure to disk.
epoch_axis = range(1, epochs+1)
plt.figure(figsize=(10, 6))
for series, series_label in ((train_losses, 'Training Loss'), (val_losses, 'Validation Loss')):
    plt.plot(epoch_axis, series, label=series_label)
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Training and Validation Loss')
plt.legend()
plt.savefig("training_curve.png")
plt.close()

def write_predictions(model, dataset, dataloader, output_path):
    """Predict a tag for every word of ``dataset`` and write an IOB2 file.

    Args:
        model: token-classification model returning ``.logits``.
        dataset: dataset exposing ``raw_data`` (the source lines per sentence).
        dataloader: loader over ``dataset`` in the same, unshuffled order;
            batches are moved to the module-level ``device``.
        output_path: destination file (UTF-8, overwritten).

    A prediction is kept only at positions whose label is not -100, i.e. the
    first sub-word of each word. Words that have no prediction (for example
    because the sentence was truncated) are written with the tag "O".
    """
    model.eval()
    sentence_tags = []

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Generating predictions"):
            batch = {key: tensor.to(device) for key, tensor in batch.items()}
            logits = model(**batch).logits

            pred_ids = logits.argmax(dim=-1).detach().cpu().numpy()
            gold_ids = batch["labels"].detach().cpu().numpy()

            for pred_row, gold_row in zip(pred_ids, gold_ids):
                sentence_tags.append(
                    [idx2tag[pred.item()] for pred, gold in zip(pred_row, gold_row) if gold != -100]
                )

    with open(output_path, "w", encoding="utf-8") as out:
        for sent_idx, lines in enumerate(dataset.raw_data):
            next_tag = 0
            for line in lines:
                # Comment lines are copied through unchanged.
                if line.startswith('#'):
                    out.write(f"{line}\n")
                    continue
                if not line:
                    continue

                fields = line.split('\t')
                if len(fields) <= 2:
                    continue

                tags = sentence_tags[sent_idx]
                if next_tag < len(tags):
                    chosen = tags[next_tag]
                    next_tag += 1
                else:
                    chosen = "O"
                out.write(f"{fields[0]}\t{fields[1]}\t{chosen}\t-\t-\n")

            # Blank line terminates the sentence.
            out.write("\n")

    print(f"Predictions saved to: {output_path}")

# Reload the checkpoint stored in "baseline_model" (written by the training loop
# whenever the validation loss improved) and use it to label the test set.
best_model = AutoModelForTokenClassification.from_pretrained("baseline_model")
best_model.to(device)
write_predictions(best_model, test_dataset, test_loader, "baseline_predictions.iob2")

print("Training and evaluation complete")

___________________________________________


For the next part you will use the DAPT NER model, i.e. the model obtained after domain-adaptive pretraining.  
Instructions for creating it are in the GitHub README.

# Fine-tuning the DAPT NER model

### INPUT:
 
 #### NER model with domain adaptive pretraining
- ner_DAPT_model 


#### English Web Treebank training set 
- en_ewt-ud-train.iob2  

in the repository, you can find the files here:    
/NLP-spring-2025/datasets_orginal    
  
  
#### training dataset for fine-tuning (one of):

- rap-hip-hop-manual-1000-train.iob2   
- pop-manual-1000-train.iob2   
- country-manual-1000-train.iob2   
- 3genres_.iob2   
 
in the repository, you can find the files here:  
/NLP-spring-2025/'GENRE'/datasets/manual  
  
 ### OUTPUT:
 #### Fine-tuned new model (one of):

- ner_DAPT_model_finetuned_on_3genres   
- ner_DAPT_model_finetuned_on_rap-hip-hop   
- ner_DAPT_model_finetuned_on_pop   
- ner_DAPT_model_finetuned_on_country   
  
recommended path:  
/NLP-spring-2025/'GENRE'

In [None]:
# ------------------------------------------------------------------------------------------------------
# Placeholder paths - replace "/path" with the real locations before running.
BASE_MODEL_PATH = "/path/ner_DAPT_model"                         # path to DAPT NER model 
NEW_TRAIN_FILE = "/path/pop-manual-1000-train.iob2"              # path to training dataset 
OUTPUT_MODEL_PATH = "/path/ner_DAPT_model_finetuned_on_pop"      # path to output model (DOMAIN SPECIFIC FINE-TUNING)
# ------------------------------------------------------------------------------------------------------

# Fine-tuning hyperparameters.
epochs = 5
train_batch_size = 16
learning_rate = 5e-6  
max_len = 128  # maximum number of sub-word tokens per sentence


# Load the tokenizer saved with the base model.
# add_prefix_space=True is set because the dataset feeds pre-split words
# to the tokenizer (is_split_into_words=True).
tokenizer = RobertaTokenizerFast.from_pretrained(BASE_MODEL_PATH, add_prefix_space=True, use_fast=True)

#reuse label mappings from EWT
def get_label_mappings(file_path):
    """Return ``(tag2idx, idx2tag)`` built from the tags found in an IOB2 file.

    Only non-blank, non-comment ('#') lines with more than two tab-separated
    columns are considered; their third column is the tag. Tags are numbered
    from 0 in sorted order.
    """
    seen = set()
    with open(file_path, 'r', encoding='utf-8') as src:
        for raw_line in src:
            entry = raw_line.strip()
            if not entry or entry.startswith('#'):
                continue
            fields = entry.split('\t')
            if len(fields) <= 2:
                continue
            seen.add(fields[2])

    tag2idx, idx2tag = {}, {}
    for position, tag in enumerate(sorted(seen)):
        tag2idx[tag] = position
        idx2tag[position] = tag
    return tag2idx, idx2tag

# ------------------------------------------------------------------------------------------------------
# The label inventory is rebuilt from the EWT training file, not from NEW_TRAIN_FILE
# (presumably so the indices match those the base model was trained with - confirm).
tag2idx, idx2tag = get_label_mappings("/path/en_ewt-ud-train.iob2")  #path to ewt training data
# ------------------------------------------------------------------------------------------------------

class NERDataset(Dataset):
    """Training dataset read from a tab-separated IOB2 file.

    Attributes:
        sentences: list of word lists, one per sentence.
        labels: list of tag-string lists, parallel to ``sentences``.
    """

    def __init__(self, file_path, tokenizer, tag2idx, max_len=128):
        self.tokenizer = tokenizer
        self.tag2idx = tag2idx
        self.max_len = max_len
        self.sentences, self.labels = self.load_data(file_path)

    def load_data(self, file_path):
        """Parse ``file_path`` into parallel lists of words and tags.

        Blank lines separate sentences, '#' lines are ignored, and token
        lines need more than two tab-separated columns (word = column 2,
        tag = column 3).
        """
        sentences, labels = [], []
        cur_words, cur_tags = [], []

        with open(file_path, 'r', encoding='utf-8') as handle:
            for text in map(str.strip, handle):
                if not text:
                    if cur_words:
                        sentences.append(cur_words)
                        labels.append(cur_tags)
                        cur_words, cur_tags = [], []
                    continue
                if text.startswith('#'):
                    continue
                cols = text.split('\t')
                if len(cols) > 2:
                    cur_words.append(cols[1])
                    cur_tags.append(cols[2])

        # Flush the last sentence when the file has no trailing blank line.
        if cur_words:
            sentences.append(cur_words)
            labels.append(cur_tags)
        return sentences, labels

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        """Return tensors for sentence ``idx`` with labels aligned to sub-words.

        The first sub-word of each word gets the word's label id; special
        tokens and later sub-words get -100. Raises ``KeyError`` for a tag
        that is not in ``tag2idx``.
        """
        words = self.sentences[idx]
        tag_ids = [self.tag2idx[name] for name in self.labels[idx]]

        encoded = self.tokenizer(words, is_split_into_words=True, truncation=True, max_length=self.max_len, padding=False)
        word_ids = encoded.word_ids()

        # Pair every position with the word id of the position before it.
        shifted = [None] + word_ids[:-1]
        aligned = [
            tag_ids[cur] if cur is not None and cur != prev else -100
            for cur, prev in zip(word_ids, shifted)
        ]

        return {
            'input_ids': torch.tensor(encoded['input_ids']),
            'attention_mask': torch.tensor(encoded['attention_mask']),
            'labels': torch.tensor(aligned)
        }

#load training dataset
#load training dataset (max_len is passed explicitly so the hyperparameter above takes effect)
lyrics_dataset = NERDataset(NEW_TRAIN_FILE, tokenizer, tag2idx, max_len=max_len)
lyrics_loader = DataLoader(lyrics_dataset, batch_size=train_batch_size, shuffle=True, collate_fn=DataCollatorForTokenClassification(tokenizer))

#load model
config = AutoConfig.from_pretrained(BASE_MODEL_PATH, num_labels=len(tag2idx), id2label=idx2tag, label2id=tag2idx)
model = AutoModelForTokenClassification.from_pretrained(BASE_MODEL_PATH, config=config)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

#optimizer and scheduler
optimizer = AdamW(model.parameters(), lr=learning_rate)
num_training_steps = len(lyrics_loader) * epochs
# num_warmup_steps is a step count, so pass an integer (10% of all steps),
# matching the baseline cell.
lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=int(0.1 * num_training_steps), num_training_steps=num_training_steps)

# Mixed precision only on CUDA; on CPU torch.cuda.amp would just warn and disable itself.
use_amp = device.type == "cuda"
scaler = GradScaler(enabled=use_amp)

#fine-tuning
print("Starting fine-tuning")
model.train()
train_losses = []

for epoch in range(epochs):
    total_loss = 0
    progress_bar = tqdm(lyrics_loader, desc=f"Epoch {epoch+1}/{epochs}")
    for batch in progress_bar:
        batch = {k: v.to(device) for k, v in batch.items()}
        with autocast(enabled=use_amp):
            outputs = model(**batch)
            loss = outputs.loss
        # With AMP disabled the scaler is a pass-through around backward()/step().
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        optimizer.zero_grad()
        lr_scheduler.step()

        total_loss += loss.item()
    avg_loss = total_loss / len(lyrics_loader)
    train_losses.append(avg_loss)
    print(f"Epoch {epoch+1} completed. Average loss: {avg_loss:.4f}")

#save the new fine-tuned model (weights after the last epoch; no validation is run in this cell)
os.makedirs(OUTPUT_MODEL_PATH, exist_ok=True)
model.save_pretrained(OUTPUT_MODEL_PATH)
tokenizer.save_pretrained(OUTPUT_MODEL_PATH)
print(f"New fine-tuned model saved to {OUTPUT_MODEL_PATH}")

# Generating pseudo-labels on unlabeled data using a previously fine-tuned model, as part of a self-training pipeline.

## CODE 1 (for a single genre)
### INPUT:
#### ner_DAPT_model fine-tuned  (one of):  

- ner_DAPT_model_finetuned_on_rap-hip-hop     
- ner_DAPT_model_finetuned_on_pop     
- ner_DAPT_model_finetuned_on_country   
  
if you follow the recommended paths:  
/NLP-spring-2025/'GENRE' 
    
#### dataset for generating more training datasets  (one of):

- rap-hip-hop_labeled_no_2000_all_O.iob2  
- pop_labeled_no_2000_all_O.iob2  
- country_labeled_no_2000_all_O.iob2  

in the repository, you can find the files here:    
/NLP-spring-2025/'GENRE'/datasets  
  
### OUTPUT:
#### new training dataset (one of):  
 

- rap-hip-hop_labeled_no_2000.iob2
- pop_labeled_no_2000.iob2
- country_labeled_no_2000.iob2

recommended path:  
/NLP-spring-2025/'GENRE'/datasets 
 
-----------------------------------------------------------------------------------------------------------------

## CODE 2 (3 genres combined) 
### INPUT:
#### ner_DAPT_model fine-tuned :

- ner_DAPT_model_finetuned_on_3genres 
  
if you follow the recommended paths:    
/NLP-spring-2025/3genres   
    
####  datasets for generating more training datasets (all): 

- rap-hip-hop_labeled_no_2000_all_O.iob2  
- pop_labeled_no_2000_all_O.iob2  
- country_labeled_no_2000_all_O.iob2  

in the repository, you can find the files here:      
/NLP-spring-2025/'GENRE'/datasets  
 
### OUTPUT:
#### new training dataset:

- merged_3genres_labeled.iob2
 
recommended path:    
/NLP-spring-2025/3genres/datasets  

In [None]:
#------------------------------------------------------------------------------------------
#
#
#                                            CODE 1 
#
#
#------------------------------------------------------------------------------------------

#------------------------------------------------------------------------------------------

MODEL_PATH = "/path/ner_DAPT_model_finetuned_on_pop"   #path to the fine-tuned model 

#------------------------------------------------------------------------------------------


# Load tokenizer and model from the same directory and switch to inference mode.
tokenizer = RobertaTokenizerFast.from_pretrained(MODEL_PATH, add_prefix_space=True, use_fast=True)
model = AutoModelForTokenClassification.from_pretrained(MODEL_PATH)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

#label mappings are read from the saved model config, so no EWT file is needed here
tag2idx = model.config.label2id
idx2tag = model.config.id2label


class NERDataset(Dataset):
    """Inference dataset read from a tab-separated IOB2 file.

    Attributes:
        sentences: list of word lists, one per sentence.
        labels: list of tag-string lists, parallel to ``sentences``.
        raw_data: stripped, non-blank source lines of each sentence
            (comment lines included), used when writing predictions.
    """

    def __init__(self, file_path, tokenizer, tag2idx, max_len=128):
        self.sentences, self.labels, self.raw_data = self.load_data(file_path)
        self.tokenizer = tokenizer
        self.tag2idx = tag2idx
        self.max_len = max_len

    def load_data(self, file_path):
        """Parse ``file_path`` into ``(sentences, labels, raw_data)``.

        Blank lines separate sentences. Token lines need more than two
        tab-separated columns (word = column 2, tag = column 3).
        """
        sentences, labels, raw_data = [], [], []
        tokens, tags, kept_lines = [], [], []

        with open(file_path, 'r', encoding='utf-8') as handle:
            for raw in handle:
                text = raw.strip()
                if text:
                    kept_lines.append(text)
                    if not text.startswith('#'):
                        cols = text.split('\t')
                        if len(cols) > 2:
                            tokens.append(cols[1])
                            tags.append(cols[2])
                    continue

                # Blank line: store the finished sentence and start a new one.
                if tokens:
                    sentences.append(tokens)
                    labels.append(tags)
                    raw_data.append(kept_lines)
                tokens, tags, kept_lines = [], [], []

        if tokens:
            sentences.append(tokens)
            labels.append(tags)
            raw_data.append(kept_lines)
        return sentences, labels, raw_data

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        """Return tensors for sentence ``idx`` with labels aligned to sub-words.

        Tags missing from ``tag2idx`` fall back to id 0. Only the first
        sub-word of each word keeps a label id; every other position is -100.
        """
        words = self.sentences[idx]
        tag_ids = [self.tag2idx.get(name, 0) for name in self.labels[idx]]
        encoded = self.tokenizer(
            words, is_split_into_words=True, truncation=True,
            max_length=self.max_len, padding=False
        )
        word_ids = encoded.word_ids()

        # Start from "ignore everywhere" and fill in first-sub-word positions only.
        aligned = [-100] * len(word_ids)
        last_seen = None
        for pos, wid in enumerate(word_ids):
            if wid is not None and wid != last_seen:
                aligned[pos] = tag_ids[wid]
            last_seen = wid

        return {
            'input_ids': torch.tensor(encoded['input_ids']),
            'attention_mask': torch.tensor(encoded['attention_mask']),
            'labels': torch.tensor(aligned)
        }

#prediction function
def write_predictions(model, dataset, dataloader, output_path):
    """Label every word of ``dataset`` with ``model`` and write an IOB2 file.

    Args:
        model: token-classification model returning ``.logits``.
        dataset: dataset exposing ``raw_data`` (source lines per sentence).
        dataloader: unshuffled loader over ``dataset``; batches are moved to
            the module-level ``device``.
        output_path: destination file (UTF-8, overwritten).

    Predictions are kept only where the batch label is not -100 (first
    sub-word of each word). Words without a prediction are written as "O".
    """
    model.eval()
    tags_per_sentence = []
    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Predicting"):
            batch = {name: value.to(device) for name, value in batch.items()}
            pred_ids = model(**batch).logits.argmax(dim=-1).cpu().numpy()
            gold_ids = batch["labels"].cpu().numpy()

            for pred_row, gold_row in zip(pred_ids, gold_ids):
                tags_per_sentence.append(
                    [idx2tag[p.item()] for p, g in zip(pred_row, gold_row) if g != -100]
                )

    with open(output_path, "w", encoding="utf-8") as out:
        for sent_idx, lines in enumerate(dataset.raw_data):
            word_pos = 0
            for line in lines:
                if line.startswith('#'):
                    # Comment lines are copied through unchanged.
                    out.write(f"{line}\n")
                    continue
                if not line:
                    continue
                cols = line.split('\t')
                if len(cols) <= 2:
                    continue
                sentence_tags = tags_per_sentence[sent_idx]
                if word_pos < len(sentence_tags):
                    tag = sentence_tags[word_pos]
                else:
                    tag = "O"
                out.write(f"{cols[0]}\t{cols[1]}\t{tag}\t-\t-\n")
                word_pos += 1
            out.write("\n")
    print(f"Predictions written to {output_path}")


    
#------------------------------------------------------------------------------------------

test_file = "/path/pop_labeled_no_2000_all_O.iob2"  #path to the unlabeled training file (input)
output_file = "/path/pop_labeled_no_2000.iob2"      #path to output file (new training file)

#------------------------------------------------------------------------------------------


# No shuffling: write_predictions matches predictions to dataset.raw_data by position.
test_dataset = NERDataset(test_file, tokenizer, tag2idx)
test_loader = DataLoader(test_dataset, batch_size=32, collate_fn=DataCollatorForTokenClassification(tokenizer))

write_predictions(model, test_dataset, test_loader, output_file)

In [None]:
#------------------------------------------------------------------------------------------
#
#
#                                            CODE 2 
#
#
#------------------------------------------------------------------------------------------

#------------------------------------------------------------------------------------------

# NOTE(review): the notes for this step name the model "ner_DAPT_model_finetuned_on_3genres";
# this directory name has no "DAPT" - confirm it points at the intended model.
MODEL_PATH = "/work/project/ner_model_finetuned_on_3genres"   #path to the fine-tuned model 

#------------------------------------------------------------------------------------------

# Load tokenizer and model from the same directory and switch to inference mode.
tokenizer = RobertaTokenizerFast.from_pretrained(MODEL_PATH, add_prefix_space=True, use_fast=True)
model = AutoModelForTokenClassification.from_pretrained(MODEL_PATH)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

#label mappings are read from the saved model config
tag2idx = model.config.label2id
idx2tag = model.config.id2label


class NERDataset(Dataset):
    """Inference dataset read from a tab-separated IOB2 file.

    Attributes:
        sentences: list of word lists, one per sentence.
        labels: list of tag-string lists, parallel to ``sentences``.
        raw_data: stripped, non-blank source lines of each sentence
            (comment lines included), used when writing predictions.
    """

    def __init__(self, file_path, tokenizer, tag2idx, max_len=128):
        self.sentences, self.labels, self.raw_data = self.load_data(file_path)
        self.tokenizer = tokenizer
        self.tag2idx = tag2idx
        self.max_len = max_len

    def load_data(self, file_path):
        """Parse ``file_path`` into ``(sentences, labels, raw_data)``.

        Blank lines separate sentences. Token lines need more than two
        tab-separated columns (word = column 2, tag = column 3).
        """
        sentences, labels, raw_data = [], [], []
        stores = (sentences, labels, raw_data)
        pending = ([], [], [])  # words, tags, raw lines of the sentence being read

        with open(file_path, 'r', encoding='utf-8') as handle:
            for raw in handle:
                text = raw.strip()
                if not text:
                    if pending[0]:
                        for store, part in zip(stores, pending):
                            store.append(part)
                    pending = ([], [], [])
                    continue

                pending[2].append(text)
                if text.startswith('#'):
                    continue
                cols = text.split('\t')
                if len(cols) > 2:
                    pending[0].append(cols[1])
                    pending[1].append(cols[2])

        # Last sentence when the file does not end with a blank line.
        if pending[0]:
            for store, part in zip(stores, pending):
                store.append(part)
        return sentences, labels, raw_data

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        """Return tensors for sentence ``idx`` with labels aligned to sub-words.

        Tags missing from ``tag2idx`` fall back to id 0. Only the first
        sub-word of each word keeps a label id; every other position is -100.
        """
        words = self.sentences[idx]
        tag_ids = [self.tag2idx.get(name, 0) for name in self.labels[idx]]
        encoded = self.tokenizer(
            words, is_split_into_words=True, truncation=True,
            max_length=self.max_len, padding=False
        )
        word_ids = encoded.word_ids()

        # A position starts a word when its word id differs from the previous one.
        aligned = [
            tag_ids[wid]
            if wid is not None and (pos == 0 or wid != word_ids[pos - 1])
            else -100
            for pos, wid in enumerate(word_ids)
        ]

        return {
            'input_ids': torch.tensor(encoded['input_ids']),
            'attention_mask': torch.tensor(encoded['attention_mask']),
            'labels': torch.tensor(aligned)
        }

#prediction function
def write_predictions(model, dataset, dataloader, output_path):
    """Label every word of ``dataset`` with ``model`` and write an IOB2 file.

    Args:
        model: token-classification model returning ``.logits``.
        dataset: dataset exposing ``raw_data`` (source lines per sentence).
        dataloader: unshuffled loader over ``dataset``; batches are moved to
            the module-level ``device``.
        output_path: destination file (UTF-8, overwritten).

    Predictions are kept only where the batch label is not -100 (first
    sub-word of each word). Words without a prediction are written as "O".
    """
    model.eval()
    collected = []
    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Predicting"):
            batch = {name: value.to(device) for name, value in batch.items()}
            outputs = model(**batch)
            best = outputs.logits.argmax(dim=-1).cpu().numpy()
            gold = batch["labels"].cpu().numpy()

            for row_idx, best_row in enumerate(best):
                gold_row = gold[row_idx]
                row_tags = []
                for col_idx, label_id in enumerate(best_row):
                    if gold_row[col_idx] == -100:
                        continue
                    row_tags.append(idx2tag[label_id.item()])
                collected.append(row_tags)

    with open(output_path, "w", encoding="utf-8") as sink:
        for sent_no, sentence_lines in enumerate(dataset.raw_data):
            used = 0
            for line in sentence_lines:
                if line.startswith('#'):
                    # Comment lines are copied through unchanged.
                    sink.write(f"{line}\n")
                    continue
                if not line:
                    continue
                cols = line.split('\t')
                if len(cols) <= 2:
                    continue
                available = collected[sent_no]
                tag = available[used] if used < len(available) else "O"
                sink.write(f"{cols[0]}\t{cols[1]}\t{tag}\t-\t-\n")
                used += 1
            sink.write("\n")
    print(f"Predictions written to {output_path}")


#------------------------------------------------------------------------------------------

# Placeholder paths to all unlabeled training files (one per genre).
# All entries are absolute; the country path previously lacked the leading "/".
test_files = [
    "/path/pop_labeled_no_2000_all_O.iob2",
    "/path/country_labeled_no_2000_all_O.iob2",
    "/path/rap-hip-hop/rap-hip-hop_labeled_no_2000_all_O.iob2"     #paths to all not labeled training files
]


output_file = "/path/merged_3genres_labeled.iob2"              #path to output file (new training file)

#------------------------------------------------------------------------------------------

# Loop-invariant setup: one collator for all files, model put in eval mode once.
collator = DataCollatorForTokenClassification(tokenizer)
model.eval()

# All genres are appended to a single output file, in the order listed above.
with open(output_file, "w", encoding="utf-8") as fout:
    for test_file in test_files:
        print(f" Processing {test_file}")

        #load test dataset (no shuffling: predictions are matched to raw_data by position)
        test_dataset = NERDataset(test_file, tokenizer, tag2idx)
        test_loader = DataLoader(test_dataset, batch_size=32, collate_fn=collator)

        #predict
        all_predictions = []
        with torch.no_grad():
            for batch in tqdm(test_loader, desc=f"Predicting {os.path.basename(test_file)}"):
                batch = {k: v.to(device) for k, v in batch.items()}
                outputs = model(**batch)
                predictions = outputs.logits.argmax(dim=-1).cpu().numpy()
                labels = batch["labels"].cpu().numpy()

                # Keep one tag per word: positions labelled -100 (special tokens,
                # continuation sub-words, padding) are skipped.
                for pred_row, label_row in zip(predictions, labels):
                    all_predictions.append(
                        [idx2tag[pred.item()] for pred, label in zip(pred_row, label_row) if label != -100]
                    )

        #write predictions
        for pred_idx, data in enumerate(test_dataset.raw_data):
            sent_counter = 0
            for line in data:
                if line.startswith('#'):
                    fout.write(f"{line}\n")
                    continue
                if line:
                    parts = line.split('\t')
                    if len(parts) > 2:
                        # Words beyond the available predictions (e.g. after truncation) get "O".
                        tag = all_predictions[pred_idx][sent_counter] if sent_counter < len(all_predictions[pred_idx]) else "O"
                        fout.write(f"{parts[0]}\t{parts[1]}\t{tag}\t-\t-\n")
                        sent_counter += 1
            fout.write("\n")

print(f" All predictions written to: {output_file}")

# Continuous Learning (Self-Training)

### INPUT:
#### ner_DAPT_model fine-tuned  (one of):      
  
- ner_DAPT_model_finetuned_on_rap-hip-hop     
- ner_DAPT_model_finetuned_on_pop     
- ner_DAPT_model_finetuned_on_country     
- ner_DAPT_model_finetuned_on_3genres  
  
if you follow the recommended paths:      
/NLP-spring-2025/'GENRE'

#### training dataset for continuous learning (one of):  

- rap-hip-hop_labeled_no_2000.iob2
- pop_labeled_no_2000.iob2
- country_labeled_no_2000.iob2
- merged_3genres_labeled.iob2  
  
if you follow the recommended paths:         
/NLP-spring-2025/'GENRE'/datasets 


#### English Web Treebank training set 
- en_ewt-ud-train.iob2  
  
in the repository, you can find the files here:  
/NLP-spring-2025/datasets_orginal   
  
### OUTPUT:
#### new ner DAPT model - after continuous learning (one of):  

- ner_DAPT_model_cont_on_rap-hip-hop
- ner_DAPT_model_cont_on_pop
- ner_DAPT_model_cont_on_country
- ner_DAPT_model_cont_on_3genres  
  
recommended path:    
/NLP-spring-2025/'GENRE' 

In [None]:
#------------------------------------------------------------------------------------------

# Placeholder paths - replace "/path" with the real locations before running.
BASE_MODEL_PATH = "/path/ner_DAPT_model_finetuned_on_pop"         # path to the fine-tuned DAPT NER model (starting point)
NEW_TRAIN_FILE = "/path/pop_labeled_no_2000.iob2"            #path to new training dataset
OUTPUT_MODEL_PATH = "/path/ner_DAPT_model_cont_on_pop"       #new model (CONTINUOUS LEARNING)

#------------------------------------------------------------------------------------------
# Hyperparameters for this training round.
epochs = 3
train_batch_size = 64
learning_rate = 5e-6  
max_len = 128  # maximum number of sub-word tokens per sentence



# Load the tokenizer saved with the starting model.
tokenizer = RobertaTokenizerFast.from_pretrained(BASE_MODEL_PATH, add_prefix_space=True, use_fast=True)

#reuse label mappings from EWT 
def get_label_mappings(file_path):
    """Collect the tag inventory of an IOB2 file and index it.

    Every non-empty, non-comment line is split on tabs; when it has more
    than two columns, the third column is taken as the tag. Tags are sorted
    alphabetically and numbered from 0.

    Args:
        file_path: path to a tab-separated IOB2 file (UTF-8).

    Returns:
        A ``(tag2idx, idx2tag)`` tuple: tag -> index and index -> tag.
    """
    seen_tags = set()
    with open(file_path, 'r', encoding='utf-8') as handle:
        for raw in handle:
            text = raw.strip()
            # Skip sentence separators and "# ..." metadata lines.
            if not text or text.startswith('#'):
                continue
            columns = text.split('\t')
            if len(columns) > 2:
                seen_tags.add(columns[2])

    ordered_tags = sorted(seen_tags)
    tag2idx = {tag: position for position, tag in enumerate(ordered_tags)}
    idx2tag = dict(enumerate(ordered_tags))
    return tag2idx, idx2tag

#------------------------------------------------------------------------------------------

# Label <-> index mappings are derived from the EWT training file, not from the
# new lyrics data (presumably so the ids line up with those the base model was
# trained with -- confirm against the fine-tuning cell).
tag2idx, idx2tag = get_label_mappings("/path/en_ewt-ud-train.iob2") #path to ewt training data

#------------------------------------------------------------------------------------------
# Dataset class
class NERDataset(Dataset):
    """Token-classification dataset backed by a tab-separated IOB2 file.

    Each item is one sentence, tokenized into sub-words, with the word-level
    tag id placed on the first sub-word of every word and -100 everywhere
    else (special tokens and continuation sub-words).
    """

    def __init__(self, file_path, tokenizer, tag2idx, max_len=128):
        """Read ``file_path`` eagerly and remember the tokenization settings.

        Args:
            file_path: path to the IOB2 file.
            tokenizer: callable used as ``tokenizer(words, is_split_into_words=True, ...)``
                whose result exposes ``word_ids()``, ``['input_ids']`` and
                ``['attention_mask']``.
            tag2idx: mapping from tag string to integer id.
            max_len: ``max_length`` passed to the tokenizer (with truncation).
        """
        self.sentences, self.labels = self.load_data(file_path)
        self.tokenizer = tokenizer
        self.tag2idx = tag2idx
        self.max_len = max_len

    def load_data(self, file_path):
        """Parse the file into parallel lists of word lists and tag lists.

        Blank lines end a sentence, lines starting with ``#`` are ignored and
        lines with fewer than three tab-separated columns are skipped.
        """
        all_words, all_tags = [], []
        cur_words, cur_tags = [], []
        with open(file_path, 'r', encoding='utf-8') as handle:
            for raw in handle:
                text = raw.strip()
                if not text:
                    # Sentence boundary: flush whatever has been collected.
                    if cur_words:
                        all_words.append(cur_words)
                        all_tags.append(cur_tags)
                        cur_words, cur_tags = [], []
                    continue
                if text.startswith('#'):
                    continue
                columns = text.split('\t')
                if len(columns) > 2:
                    cur_words.append(columns[1])
                    cur_tags.append(columns[2])
        # The file may not end with a blank line.
        if cur_words:
            all_words.append(cur_words)
            all_tags.append(cur_tags)
        return all_words, all_tags

    def __len__(self):
        """Number of sentences read from the file."""
        return len(self.sentences)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and aligned ``labels`` tensors.

        Raises:
            KeyError: if a tag of the sentence is missing from ``tag2idx``.
        """
        tokens = self.sentences[idx]
        label_ids = [self.tag2idx[name] for name in self.labels[idx]]

        encoded = self.tokenizer(
            tokens,
            is_split_into_words=True,
            truncation=True,
            max_length=self.max_len,
            padding=False,
        )

        aligned = []
        previous = None
        for current in encoded.word_ids():
            # Only the first sub-word of a real word carries the label.
            starts_word = current is not None and current != previous
            aligned.append(label_ids[current] if starts_word else -100)
            previous = current

        return {
            'input_ids': torch.tensor(encoded['input_ids']),
            'attention_mask': torch.tensor(encoded['attention_mask']),
            'labels': torch.tensor(aligned),
        }

#load new dataset
# max_len is passed explicitly so the hyper-parameter configured above is the
# one actually used for truncation (previously the class default was used).
lyrics_dataset = NERDataset(NEW_TRAIN_FILE, tokenizer, tag2idx, max_len=max_len)
# The collator pads every batch to its longest sentence (inputs and labels).
lyrics_loader = DataLoader(
    lyrics_dataset,
    batch_size=train_batch_size,
    shuffle=True,
    collate_fn=DataCollatorForTokenClassification(tokenizer),
)

#load model
# The label maps are written into the config so the saved model carries them.
config = AutoConfig.from_pretrained(BASE_MODEL_PATH, num_labels=len(tag2idx), id2label=idx2tag, label2id=tag2idx)
model = AutoModelForTokenClassification.from_pretrained(BASE_MODEL_PATH, config=config)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

#optimizer and scheduler
optimizer = AdamW(model.parameters(), lr=learning_rate)
num_training_steps = len(lyrics_loader) * epochs
# Linear warmup over the first 10% of steps; get_scheduler documents
# num_warmup_steps as an integer, so the fraction is truncated explicitly.
num_warmup_steps = int(0.1 * num_training_steps)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=num_warmup_steps,
    num_training_steps=num_training_steps,
)

# Gradient scaler for mixed-precision training (used together with autocast).
scaler = GradScaler()

#continuous learning
print("Starting continuous learning:")
model.train()
train_losses = []  # average training loss per epoch

for epoch in range(epochs):
    total_loss = 0
    progress_bar = tqdm(lyrics_loader, desc=f"Epoch {epoch+1}/{epochs}")
    for batch in progress_bar:
        batch = {k: v.to(device) for k, v in batch.items()}
        # Forward pass under autocast; the model returns the loss because the
        # batch contains "labels".
        with autocast():
            outputs = model(**batch)
            loss = outputs.loss
        # Scaled backward pass, optimizer step through the scaler, then reset
        # gradients and advance the learning-rate schedule once per batch.
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        optimizer.zero_grad()
        lr_scheduler.step()

        total_loss += loss.item()
    avg_loss = total_loss / len(lyrics_loader)
    train_losses.append(avg_loss)
    print(f"Epoch {epoch+1} completed. Average loss: {avg_loss:.4f}")

#save the new model after continuous learning
os.makedirs(OUTPUT_MODEL_PATH, exist_ok=True)
model.save_pretrained(OUTPUT_MODEL_PATH)
tokenizer.save_pretrained(OUTPUT_MODEL_PATH)
print(f"New model (after continuous learning) saved to {OUTPUT_MODEL_PATH}")

# Predictions

### INPUT:
#### ner_DAPT_model path (one of):  
- ner_DAPT_model_finetuned_on_rap-hip-hop     
- ner_DAPT_model_finetuned_on_pop     
- ner_DAPT_model_finetuned_on_country     
- ner_DAPT_model_finetuned_on_3genres

- ner_DAPT_model_cont_on_rap-hip-hop
- ner_DAPT_model_cont_on_pop
- ner_DAPT_model_cont_on_country
- ner_DAPT_model_cont_on_3genres  
    
if you follow the recommended paths:  
/NLP-spring-2025/'GENRE'  

#### training dataset for continuous learning (one of):

- rap-hip-hop_labeled_no_2000.iob2
- pop_labeled_no_2000.iob2
- country_labeled_no_2000.iob2
- merged_3genres_labeled.iob2  
  
if you follow the recommended paths:  
/NLP-spring-2025/'GENRE'/datasets    
  
### OUTPUT:
#### prediction file (one of):  
- predictions_rap-hip-hop.iob2
- predictions_pop.iob2
- predictions_country.iob2
- predictions_3genres.iob2

- predictions_continuous_learning_rap-hip-hop.iob2
- predictions_continuous_learning_pop.iob2
- predictions_continuous_learning_country.iob2
- predictions_continuous_learning_3genres.iob2

recommended path:  
/NLP-spring-2025/'GENRE'/predictions

In [None]:
#------------------------------------------------------------------------------------------

# Directory of the model to predict with (a fine-tuned or a continued model).
MODEL_PATH = "/path/ner_DAPT_model_finetuned_on_pop"

#------------------------------------------------------------------------------------------
# Tokenizer and model are both loaded from MODEL_PATH. add_prefix_space=True is
# required by RoBERTa tokenizers for pre-split (word-level) input.
tokenizer = RobertaTokenizerFast.from_pretrained(MODEL_PATH, add_prefix_space=True, use_fast=True)
model = AutoModelForTokenClassification.from_pretrained(MODEL_PATH)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()  # inference mode

#label mappings
# Taken from the saved model config rather than recomputed from a data file.
tag2idx = model.config.label2id
idx2tag = model.config.id2label

class NERDataset(Dataset):
    """IOB2 dataset for inference that also keeps the original file lines.

    Besides the word and tag lists, ``raw_data`` stores, per sentence, every
    stripped non-blank line (comments included) so predictions can later be
    written back in the same layout.
    """

    def __init__(self, file_path, tokenizer, tag2idx, max_len=128):
        """Read ``file_path`` eagerly and remember the tokenization settings.

        Args:
            file_path: path to the IOB2 file.
            tokenizer: callable used as ``tokenizer(words, is_split_into_words=True, ...)``
                whose result exposes ``word_ids()``, ``['input_ids']`` and
                ``['attention_mask']``.
            tag2idx: mapping from tag string to integer id.
            max_len: ``max_length`` passed to the tokenizer (with truncation).
        """
        self.sentences, self.labels, self.raw_data = self.load_data(file_path)
        self.tokenizer = tokenizer
        self.tag2idx = tag2idx
        self.max_len = max_len

    def load_data(self, file_path):
        """Parse the file into words, tags and raw lines, one entry per sentence.

        A blank line closes the current block. Blocks without any token line
        (for instance comment-only blocks) are dropped together with their
        raw lines.
        """
        all_words, all_tags, all_raw = [], [], []
        words, tags, raw_lines = [], [], []
        with open(file_path, 'r', encoding='utf-8') as handle:
            for raw in handle:
                text = raw.strip()
                if not text:
                    if words:
                        all_words.append(words)
                        all_tags.append(tags)
                        all_raw.append(raw_lines)
                    # Buffers are reset on every blank line, kept or not.
                    words, tags, raw_lines = [], [], []
                    continue
                raw_lines.append(text)
                if text.startswith('#'):
                    continue
                columns = text.split('\t')
                if len(columns) > 2:
                    words.append(columns[1])
                    tags.append(columns[2])
        # The file may not end with a blank line.
        if words:
            all_words.append(words)
            all_tags.append(tags)
            all_raw.append(raw_lines)
        return all_words, all_tags, all_raw

    def __len__(self):
        """Number of sentences read from the file."""
        return len(self.sentences)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and aligned ``labels`` tensors.

        Tags missing from ``tag2idx`` fall back to id 0. The labels act as a
        mask: -100 on special tokens and continuation sub-words, a tag id on
        the first sub-word of every word.
        """
        tokens = self.sentences[idx]
        label_ids = [self.tag2idx.get(name, 0) for name in self.labels[idx]]
        encoded = self.tokenizer(
            tokens,
            is_split_into_words=True,
            truncation=True,
            max_length=self.max_len,
            padding=False,
        )

        aligned = []
        previous = None
        for current in encoded.word_ids():
            starts_word = current is not None and current != previous
            aligned.append(label_ids[current] if starts_word else -100)
            previous = current

        return {
            'input_ids': torch.tensor(encoded['input_ids']),
            'attention_mask': torch.tensor(encoded['attention_mask']),
            'labels': torch.tensor(aligned),
        }

#prediction function
def write_predictions(model, dataset, dataloader, output_path):
    """Run the model over ``dataloader`` and write the tags in IOB2 layout.

    Relies on the module-level ``device`` and ``idx2tag``.

    Args:
        model: token-classification model; called as ``model(**batch)`` and
            expected to return an object with ``.logits``.
        dataset: dataset exposing ``raw_data`` (per-sentence lists of the
            original lines), in the same order the dataloader yields sentences.
        dataloader: iterable of batches (dicts of tensors including "labels").
        output_path: file to write; one ``id<TAB>word<TAB>tag<TAB>-<TAB>-``
            line per token, comment lines copied, a blank line after each
            sentence.
    """
    model.eval()
    sentence_tags = []
    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Predicting"):
            batch = {name: tensor.to(device) for name, tensor in batch.items()}
            logits = model(**batch).logits
            pred_ids = logits.argmax(dim=-1).cpu().numpy()
            label_mask = batch["labels"].cpu().numpy()

            # Keep one prediction per word: positions whose label is not -100.
            for pred_row, mask_row in zip(pred_ids, label_mask):
                sentence_tags.append([
                    idx2tag[pred.item()]
                    for pred, marker in zip(pred_row, mask_row)
                    if marker != -100
                ])

    with open(output_path, "w", encoding="utf-8") as f:
        for sent_idx, raw_lines in enumerate(dataset.raw_data):
            token_pos = 0
            for line in raw_lines:
                if line.startswith('#'):
                    f.write(f"{line}\n")
                    continue
                columns = line.split('\t')
                if len(columns) <= 2:
                    continue
                tags = sentence_tags[sent_idx]
                # Words without a prediction (e.g. cut off by truncation) get "O".
                tag = tags[token_pos] if token_pos < len(tags) else "O"
                f.write(f"{columns[0]}\t{columns[1]}\t{tag}\t-\t-\n")
                token_pos += 1
            f.write("\n")
    print(f"Predictions written to {output_path}")

#run prediction
# Input file to tag and the file the predictions are written to.
test_file = "/path/lyrics_test_no_labels.iob2"
output_file = "/path/predictions_pop.iob2"

test_dataset = NERDataset(test_file, tokenizer, tag2idx)
# No shuffling: write_predictions pairs predictions with test_dataset.raw_data
# by position, so the loader must yield sentences in dataset order.
test_loader = DataLoader(test_dataset, batch_size=32, collate_fn=DataCollatorForTokenClassification(tokenizer))

write_predictions(model, test_dataset, test_loader, output_file)

# Statistics
 #### Input:
 
prediction file (one of):  

   - predictions_rap-hip-hop.iob2

   - predictions_pop.iob2

   - predictions_country.iob2

   - predictions_3genres.iob2

   - predictions_continuous_learning_rap-hip-hop.iob2

   - predictions_continuous_learning_pop.iob2

   - predictions_continuous_learning_country.iob2

   - predictions_continuous_learning_3genres.iob2
   
if you follow the recommended paths:  
/NLP-spring-2025/'GENRE'/predictions  
  
and  
  
   - golden file ( lyrics_test.iob2 )
     
in the repository, you can find the files here:  
/NLP-spring-2025/test 




## Mistake breakdown:
#### correct   
Model correctly predicted the label (e.g. B-PER → B-PER)    

#### wrong_label    
Model predicted a named entity, but with the wrong type or boundary (e.g. B-LOC → B-ORG).     
Each such error is additionally counted per gold->predicted pair (e.g. `B-LOC->B-ORG`) in the printed breakdown.
  
#### spurious    
Model predicted an entity where the gold label was O. These are false positives — overpredictions.  

#### missed    
Model predicted O where the gold label was a named entity. These are false negatives — missed detections.  

In [None]:
def analyze_errors(gold_file, pred_file):
    """Print a token-level error breakdown of predictions against gold tags.

    Both files are tab-separated IOB2 files with the tag in the third column.
    Each token falls into one bucket: ``correct``, ``spurious`` (gold is O),
    ``missed`` (prediction is O) or ``wrong_label``; wrong labels are also
    counted per ``gold->pred`` pair. Buckets are printed by descending count.

    Args:
        gold_file: path to the gold-standard file.
        pred_file: path to the prediction file.

    Raises:
        AssertionError: if the files differ in sentence count or in the
            length of any sentence.
        IndexError: if a token line has fewer than three columns.
    """
    def _load_tags(path):
        # One list of tags per sentence; blank lines separate sentences.
        grouped, bucket = [], []
        with open(path, encoding="utf-8") as handle:
            for raw in handle:
                text = raw.strip()
                if text.startswith("#"):
                    continue
                if text:
                    bucket.append(text.split("\t")[2])  # third column
                elif bucket:
                    grouped.append(bucket)
                    bucket = []
        if bucket:
            grouped.append(bucket)
        return grouped

    gold_sents = _load_tags(gold_file)
    pred_sents = _load_tags(pred_file)

    assert len(gold_sents) == len(pred_sents), "Mismatch in number of sentences"

    tally = Counter()

    for gold_tags, pred_tags in zip(gold_sents, pred_sents):
        assert len(gold_tags) == len(pred_tags), "Mismatch in sentence lengths"
        for gold_tag, pred_tag in zip(gold_tags, pred_tags):
            if gold_tag == pred_tag:
                tally["correct"] += 1
                continue
            # From here on the two tags differ.
            if gold_tag == "O":
                tally["spurious"] += 1
            elif pred_tag == "O":
                tally["missed"] += 1
            else:
                tally[f"{gold_tag}->{pred_tag}"] += 1
                tally["wrong_label"] += 1

    print("Mistake breakdown:")
    for bucket_name, amount in tally.most_common():
        print(f"{bucket_name:15}: {amount}")

#------------------------------------------------------------------------------------------

# Gold file first, prediction file second; both paths are relative to the
# current working directory.
analyze_errors("lyrics_test.iob2", "predictions_pop.iob2")

#------------------------------------------------------------------------------------------

## Confusion matrix of predicted entity labels

- Rows = Gold (true) labels

- Columns = Predicted labels

Each cell counts how often a gold label was predicted as a certain label.

In [None]:
def read_labels(file_path):
    """Read the tag column of an IOB2 file, grouped by sentence.

    Blank lines separate sentences, lines starting with ``#`` are ignored and
    the tag is the third tab-separated column of every other line.

    Args:
        file_path: path to the tab-separated file (UTF-8).

    Returns:
        A list with one list of tag strings per non-empty sentence.

    Raises:
        IndexError: if a token line has fewer than three columns.
    """
    grouped = []
    bucket = []
    with open(file_path, encoding='utf-8') as handle:
        for raw in handle:
            text = raw.strip()
            if text.startswith("#"):
                continue
            if text:
                bucket.append(text.split("\t")[2])
            elif bucket:
                # Blank line after at least one token closes the sentence.
                grouped.append(bucket)
                bucket = []
    # The file may not end with a blank line.
    if bucket:
        grouped.append(bucket)
    return grouped

def create_confusion_matrix(gold_path, pred_path):
    """Build a token-level confusion matrix of gold versus predicted tags.

    Args:
        gold_path: path to the gold IOB2 file.
        pred_path: path to the prediction IOB2 file.

    Returns:
        A square ``pandas.DataFrame`` of counts whose index (gold tags) and
        columns (predicted tags) are the sorted union of all tags seen in
        either file.

    Raises:
        AssertionError: if the files differ in sentence count or in the
            length of any sentence.
    """
    gold_sentences = read_labels(gold_path)
    pred_sentences = read_labels(pred_path)

    assert len(gold_sentences) == len(pred_sentences), "Mismatch in number of sentences"

    # Count every (gold, predicted) pair once over the whole corpus.
    pair_counts = Counter()
    for gold_tags, pred_tags in zip(gold_sentences, pred_sentences):
        assert len(gold_tags) == len(pred_tags), "Mismatch in sentence lengths"
        pair_counts.update(zip(gold_tags, pred_tags))

    #get all unique labels
    label_names = sorted({tag for pair in pair_counts for tag in pair})

    #build confusion matrix as DataFrame
    confusion = pd.DataFrame(0, index=label_names, columns=label_names)
    for (gold_tag, pred_tag), amount in pair_counts.items():
        confusion.loc[gold_tag, pred_tag] = amount

    return confusion

def plot_confusion_matrix(confusion_df, normalize=False, figsize=(12, 10)):
    """Draw a confusion matrix as an annotated heatmap and show it.

    Args:
        confusion_df: square DataFrame of counts (rows gold, columns predicted).
        normalize: when true, each row is divided by its row sum and cells are
            annotated with two decimals instead of integers.
        figsize: size of the matplotlib figure.
    """
    plt.figure(figsize=figsize)

    matrix = confusion_df.copy()
    if normalize:
        row_totals = matrix.sum(axis=1)
        matrix = matrix.div(row_totals, axis=0)
        cell_format = ".2f"
        title_suffix = " (Normalized)"
    else:
        cell_format = "d"
        title_suffix = ""

    sns.heatmap(matrix, annot=True, fmt=cell_format, cmap="Blues")
    plt.title("NER Confusion Matrix" + title_suffix)
    plt.xlabel("Predicted Label")
    plt.ylabel("Gold Label")
    plt.tight_layout()
    plt.show()

#------------------------------------------------------------------------------------------

# Gold-standard file and the prediction file to compare against it; both
# paths are relative to the current working directory.
gold_file = "lyrics_test.iob2"
pred_file = "predictions_pop.iob2"

#------------------------------------------------------------------------------------------

# Build the count matrix, then plot raw counts (normalize defaults to False).
confusion = create_confusion_matrix(gold_file, pred_file)
plot_confusion_matrix(confusion)             
