In [1]:
import os
import json
import torch
import pandas as pd
import numpy as np
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, BertPreTrainedModel, RobertaPreTrainedModel, DebertaV2PreTrainedModel, AutoModel, AutoModelForSequenceClassification, BertConfig, DataCollatorWithPadding
from sklearn.metrics import accuracy_score, f1_score
import nltk
nltk.data.path.append("./nltk_data")
from nltk.corpus import wordnet as wn
from tqdm.notebook import tqdm
from nltk.wsd import lesk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [12]:
# Notebook-wide log/warning filters (module names below are matched by `warnings` as regexes).
import logging
import warnings
from transformers import logging as hf_logging

# Only let transformers report errors
hf_logging.set_verbosity_error()

# Hide sklearn UserWarnings (class-imbalance notices raised while scoring)
warnings.filterwarnings(action="ignore", category=UserWarning, module="sklearn.utils.multiclass")
warnings.filterwarnings(action="ignore", category=UserWarning, module="sklearn.metrics")

In [3]:
# Benchmark parquet files, in evaluation order (the combined "ALL" set comes last)
BENCHMARK_FILES = [
    f"{benchmark}.parquet"
    for benchmark in (
        "senseval2",
        "senseval3",
        "semeval2007",
        "semeval2013",
        "semeval2015",
        "ALL",
    )
]

In [4]:
# Device configuration: prefer the GPU when PyTorch can see one, otherwise use the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Running on: {device}")

Running on: cuda


> ##### For each of the trained models, I will write code to load the model and calculate Accuracy and F1 score metrics on all test datasets, taking into account the specifics of each model.

### BERT (WSDInferenceDataset class + evaluation function)

In [4]:
# --- Configuration ---
# Checkpoint directory and label-map file read by the BERT `load_resources()` below.
# NOTE: MODEL_PATH / LABEL_MAP_FILE are reassigned by later sections of this notebook,
# so re-run this cell before re-running the BERT cells.
MODEL_PATH = "./models/bert/bert_wsd_custom/"  
LABEL_MAP_FILE = "label_map.json"

In [5]:
# --- 1. Model Class Definition (BERT) ---
class BertForWSD(BertPreTrainedModel):
    """BERT encoder with a linear sense classifier applied to one token per example.

    The classifier reads the final hidden state at ``target_token_idx`` instead
    of a pooled sentence representation.
    """

    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.bert = AutoModel.from_config(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
        self.post_init()

    def forward(self, input_ids=None, attention_mask=None, target_token_idx=None, labels=None, **kwargs):
        """Classify the token at ``target_token_idx`` of every sequence in the batch.

        Returns:
            Dict with ``"logits"`` and ``"loss"``; the loss is cross-entropy
            against ``labels`` and is ``None`` when no labels are given.
        """
        # Drop `num_items_in_batch` if a caller passed it, so it is not forwarded to the encoder.
        kwargs.pop("num_items_in_batch", None)

        hidden_states = self.bert(
            input_ids=input_ids, attention_mask=attention_mask, **kwargs
        ).last_hidden_state

        # Pair every row index with that row's target position to pick one vector per example.
        row_ids = torch.arange(input_ids.shape[0], device=input_ids.device)
        token_states = hidden_states[row_ids, target_token_idx]

        logits = self.classifier(self.dropout(token_states))

        if labels is None:
            return {"loss": None, "logits": logits}

        criterion = nn.CrossEntropyLoss()
        return {
            "loss": criterion(logits.view(-1, self.num_labels), labels.view(-1)),
            "logits": logits,
        }


In [5]:
# --- 2. Inference Dataset ---
class WSDInferenceDataset(Dataset):
    """Turns benchmark rows into model inputs plus the index of the target token.

    Every row must provide ``sentence`` and ``char_start`` (the character
    offset compared against the tokenizer's offset mapping). The tokenizer is
    called with ``return_offsets_mapping=True`` and ``return_tensors="pt"``.
    """

    def __init__(self, dataframe, tokenizer, max_len=128):
        # A fresh 0..n-1 index keeps positional access aligned with DataLoader indices.
        self.data = dataframe.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    @staticmethod
    def _find_target_token(offsets, char_start):
        """Return the index of the first token that starts at or spans ``char_start``.

        Entries whose offsets are ``(0, 0)`` are skipped (presumably special and
        padding tokens). Falls back to index 0 when no token matches, e.g. when
        the target was cut off by truncation.
        """
        for token_idx, (tok_start, tok_end) in enumerate(offsets):
            if tok_start == 0 and tok_end == 0:
                continue
            if tok_start == char_start or tok_start < char_start < tok_end:
                return token_idx
        return 0

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``target_token_idx`` for row ``idx``."""
        row = self.data.iloc[idx]

        encoding = self.tokenizer(
            row['sentence'],
            truncation=True,
            max_length=self.max_len,
            padding="max_length",
            return_offsets_mapping=True,
            return_tensors="pt"
        )

        # Index the batch dimension explicitly: a bare squeeze() would also drop
        # the sequence dimension whenever it has length 1.
        offsets = encoding['offset_mapping'][0].tolist()
        target_token_idx = self._find_target_token(offsets, row['char_start'])

        return {
            'input_ids': encoding['input_ids'][0],
            'attention_mask': encoding['attention_mask'][0],
            'target_token_idx': torch.tensor(target_token_idx, dtype=torch.long)
        }

In [6]:
# --- 3. Helper Functions ---
def load_resources():
    """Return ``(tokenizer, id2label, model)`` for the BERT WSD checkpoint.

    The label map is read from ``LABEL_MAP_FILE``; tokenizer and weights come
    from ``MODEL_PATH``. The model is moved to ``device`` and put in eval mode.
    """
    print("Loading resources...")

    with open(LABEL_MAP_FILE, 'r') as label_file:
        label2id = json.load(label_file)
    # Invert name -> id into id -> name (e.g., 0 -> 'art.n.01')
    id2label = dict(zip(label2id.values(), label2id.keys()))

    tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)

    # The classifier head is sized from the number of entries in the label map
    model = BertForWSD.from_pretrained(MODEL_PATH, num_labels=len(label2id))
    model.to(device)
    model.eval()

    return tokenizer, id2label, model


In [7]:
def evaluate_dataset(file_path, model, tokenizer, id2label,
                     data_dir="./evaluation_data/parquet/", batch_size=32):
    """Predict one sense per row of a benchmark file and score the predictions.

    Args:
        file_path: Parquet file name, resolved relative to ``data_dir``.
        model: Called with ``input_ids``, ``attention_mask`` and
            ``target_token_idx``; must return a mapping with a ``'logits'`` entry.
        tokenizer: Tokenizer handed to ``WSDInferenceDataset``.
        id2label: Mapping from class id to label name; ids missing from it are
            reported as ``"UNK"``.
        data_dir: Directory that holds the benchmark parquet files.
        batch_size: Number of rows per inference batch.

    Returns:
        Dict with keys ``Dataset``, ``Accuracy``, ``Micro F1``, ``Macro F1`` and
        ``Samples``, or ``None`` (after printing a warning) if the file is missing.
    """
    file_path = os.path.join(data_dir, file_path)
    if not os.path.exists(file_path):
        print(f"Warning: File {file_path} not found.")
        return None

    df = pd.read_parquet(file_path)
    loader = DataLoader(WSDInferenceDataset(df, tokenizer), batch_size=batch_size, shuffle=False)

    all_preds_ids = []

    # Inference Loop
    with torch.no_grad():
        for batch in loader:
            outputs = model(
                input_ids=batch['input_ids'].to(device),
                attention_mask=batch['attention_mask'].to(device),
                target_token_idx=batch['target_token_idx'].to(device)
            )
            # tolist() yields plain Python ints, matching the keys of id2label
            all_preds_ids.extend(torch.argmax(outputs['logits'], dim=1).cpu().tolist())

    # --- Metrics Calculation ---
    y_true_for_sklearn = []
    y_pred_for_sklearn = []

    # Pull the gold column once instead of materialising a row per prediction
    gold_column = df['gold_synsets'].tolist()

    for pred_id, gold_labels in zip(all_preds_ids, gold_column):
        pred_label = id2label.get(pred_id, "UNK")

        # Normalise gold to a list so membership and [0] behave the same for
        # arrays, lists and single labels
        if isinstance(gold_labels, np.ndarray):
            gold_labels = gold_labels.tolist()
        elif not isinstance(gold_labels, (list, tuple)):
            gold_labels = [gold_labels]

        if pred_label in gold_labels:
            # Any gold sense counts as correct: record Expected == Predicted
            expected_label = pred_label
        else:
            # Otherwise score against the first gold sense so F1 can be computed
            expected_label = gold_labels[0] if len(gold_labels) > 0 else "UNK"

        y_true_for_sklearn.append(expected_label)
        y_pred_for_sklearn.append(pred_label)

    # Calculate Metrics
    acc = accuracy_score(y_true_for_sklearn, y_pred_for_sklearn)
    micro_f1 = f1_score(y_true_for_sklearn, y_pred_for_sklearn, average='micro')
    macro_f1 = f1_score(y_true_for_sklearn, y_pred_for_sklearn, average='macro')

    return {
        "Dataset": os.path.basename(file_path).replace('.parquet', ''),
        "Accuracy": acc,
        "Micro F1": micro_f1,
        "Macro F1": macro_f1,
        "Samples": len(df)
    }

In [19]:
# --- 4. Main Execution ---
tokenizer, id2label, model = load_resources()

results = []

print("\nStarting Benchmarking...")
print("-" * 60)

for filename in BENCHMARK_FILES:
    print(f"Processing {filename}...")
    metrics = evaluate_dataset(filename, model, tokenizer, id2label)
    if metrics:
        results.append(metrics)
        print(f"   -> Accuracy: {metrics['Accuracy']:.4f}")

print("-" * 60)

# Create Final DataFrame (one row per benchmark that was found on disk)
results_df = pd.DataFrame(results)

# Formatting for display
print("\n=== Final Benchmark Results ===")
print(results_df.to_string(index=False, float_format="%.4f"))

# Save to CSV; create the output directory first because to_csv() does not
# create missing parent directories
os.makedirs("./results", exist_ok=True)
results_df.to_csv("./results/bert_benchmark_results.csv", index=False)
print("\nResults saved to 'bert_benchmark_results.csv'")

Loading resources...

Starting Benchmarking...
------------------------------------------------------------
Processing senseval2.parquet...
   -> Accuracy: 0.6109
Processing senseval3.parquet...
   -> Accuracy: 0.6216
Processing semeval2007.parquet...
   -> Accuracy: 0.5978
Processing semeval2013.parquet...
   -> Accuracy: 0.5237
Processing semeval2015.parquet...
   -> Accuracy: 0.6057
Processing ALL.parquet...
   -> Accuracy: 0.5923
------------------------------------------------------------

=== Final Benchmark Results ===
    Dataset  Accuracy  Micro F1  Macro F1  Samples
  senseval2    0.6109    0.6109    0.3988     2282
  senseval3    0.6216    0.6216    0.3920     1850
semeval2007    0.5978    0.5978    0.4014      455
semeval2013    0.5237    0.5237    0.3075     1644
semeval2015    0.6057    0.6057    0.3794     1022
        ALL    0.5923    0.5923    0.3483     7253

Results saved to 'bert_benchmark_results.csv'


### RoBERTa

In [10]:
# --- Configuration ---
# Checkpoint directory and label-map file read by the RoBERTa `load_resources()` below.
# NOTE: this overwrites the MODEL_PATH set in the BERT section.
MODEL_PATH = "./models/roberta/roberta_wsd_custom/" 
LABEL_MAP_FILE = "label_map.json"

In [11]:
# --- 1. Model Class Definition (RoBERTa) ---
class RobertaForWSD(RobertaPreTrainedModel):
    """RoBERTa encoder topped with a linear classifier over a single target token."""

    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.roberta = AutoModel.from_config(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
        self.post_init()

    def forward(self, input_ids=None, attention_mask=None, target_token_idx=None, labels=None, **kwargs):
        """Score every label for the token at ``target_token_idx`` of each sequence.

        Returns:
            Dict with ``"logits"`` and ``"loss"`` (cross-entropy against
            ``labels``; ``None`` when ``labels`` is not provided).
        """
        # Remove `num_items_in_batch` if present so it is not passed on to the encoder.
        kwargs.pop("num_items_in_batch", None)

        # Encode the batch and keep the final-layer hidden states
        encoder_out = self.roberta(input_ids=input_ids, attention_mask=attention_mask, **kwargs)
        hidden_states = encoder_out.last_hidden_state

        # One (row, position) pair per example selects that example's target vector
        n_rows = input_ids.shape[0]
        rows = torch.arange(n_rows, device=input_ids.device)
        picked = hidden_states[rows, target_token_idx]

        # Dropout followed by the classification head
        logits = self.classifier(self.dropout(picked))

        if labels is None:
            return {"loss": None, "logits": logits}

        flat_logits = logits.view(-1, self.num_labels)
        flat_labels = labels.view(-1)
        return {
            "loss": nn.CrossEntropyLoss()(flat_logits, flat_labels),
            "logits": logits,
        }

In [13]:
# --- 2. Inference Dataset (Same as for BERT) ---
# --- 3. Helper Functions ---
def load_resources():
    """Return ``(tokenizer, id2label, model)`` for the RoBERTa WSD checkpoint.

    Reads ``LABEL_MAP_FILE`` and ``MODEL_PATH``; the tokenizer is created with
    ``add_prefix_space=True``. The model is moved to ``device`` in eval mode.
    """
    print("Loading resources for RoBERTa...")

    with open(LABEL_MAP_FILE, 'r') as label_file:
        label2id = json.load(label_file)
    # Reverse lookup: class id -> label name
    id2label = {class_id: label_name for label_name, class_id in label2id.items()}

    tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, add_prefix_space=True)

    # Classifier head size follows the number of labels in the map
    n_labels = len(label2id)
    model = RobertaForWSD.from_pretrained(MODEL_PATH, num_labels=n_labels)
    model.to(device)
    model.eval()

    return tokenizer, id2label, model

In [14]:
# --- 4. Main Execution ---
tokenizer, id2label, model = load_resources()

results = []

print("\nStarting Benchmarking...")
print("-" * 60)

for filename in BENCHMARK_FILES:
    print(f"Processing {filename}...")
    metrics = evaluate_dataset(filename, model, tokenizer, id2label)
    if metrics:
        results.append(metrics)
        print(f"   -> Accuracy: {metrics['Accuracy']:.4f}")

print("-" * 60)

# Create Final DataFrame (one row per benchmark that was found on disk)
results_df = pd.DataFrame(results)

# Formatting for display
print("\n=== Final Benchmark Results ===")
print(results_df.to_string(index=False, float_format="%.4f"))

# Save to CSV; create the output directory first because to_csv() does not
# create missing parent directories
os.makedirs("./results", exist_ok=True)
results_df.to_csv("./results/roberta_benchmark_results.csv", index=False)
print("\nResults saved to 'roberta_benchmark_results.csv'")

Loading resources for RoBERTa...

Starting Benchmarking...
------------------------------------------------------------
Processing senseval2.parquet...
   -> Accuracy: 0.6078
Processing senseval3.parquet...
   -> Accuracy: 0.6162
Processing semeval2007.parquet...
   -> Accuracy: 0.6000
Processing semeval2013.parquet...
   -> Accuracy: 0.5207
Processing semeval2015.parquet...
   -> Accuracy: 0.6057
Processing ALL.parquet...
   -> Accuracy: 0.5894
------------------------------------------------------------

=== Final Benchmark Results ===
    Dataset  Accuracy  Micro F1  Macro F1  Samples
  senseval2    0.6078    0.6078    0.3976     2282
  senseval3    0.6162    0.6162    0.3811     1850
semeval2007    0.6000    0.6000    0.4019      455
semeval2013    0.5207    0.5207    0.3149     1644
semeval2015    0.6057    0.6057    0.3774     1022
        ALL    0.5894    0.5894    0.3426     7253

Results saved to 'roberta_benchmark_results.csv'


### DeBERTa

In [8]:
# --- Configuration ---
# Checkpoint directory and label-map file read by the DeBERTa `load_resources()` below.
# NOTE: this overwrites the MODEL_PATH set in the previous section.
MODEL_PATH = "./models/deberta/deberta_wsd_custom/" 
LABEL_MAP_FILE = "label_map.json"

In [9]:
# --- 1. Model Class Definition (DeBERTa V3) ---
class DebertaV3ForWSD(DebertaV2PreTrainedModel):
    """DeBERTa encoder with a linear sense classifier over one target token per example."""

    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        # The backbone is built from the config via AutoModel
        self.deberta = AutoModel.from_config(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
        self.post_init()

    def forward(self, input_ids=None, attention_mask=None, target_token_idx=None, labels=None, **kwargs):
        """Classify the token at ``target_token_idx`` of each sequence.

        Returns:
            Dict with ``"loss"`` (``None`` unless ``labels`` is given) and
            ``"logits"``; the evaluation loop reads ``outputs['logits']``.
        """
        # Strip `num_items_in_batch` if supplied; the remaining kwargs go to the encoder.
        kwargs.pop("num_items_in_batch", None)

        hidden_states = self.deberta(
            input_ids=input_ids, attention_mask=attention_mask, **kwargs
        ).last_hidden_state

        # Select, for each row, the hidden state at that row's target position
        row_ids = torch.arange(input_ids.shape[0], device=input_ids.device)
        token_states = hidden_states[row_ids, target_token_idx]

        logits = self.classifier(self.dropout(token_states))

        if labels is not None:
            criterion = nn.CrossEntropyLoss()
            loss = criterion(logits.view(-1, self.num_labels), labels.view(-1))
        else:
            loss = None

        return {"loss": loss, "logits": logits}

In [10]:
# --- 2. Inference Dataset (The same one) ---
# --- 3. Helper Functions (Updated) ---
def load_resources():
    """Return ``(tokenizer, id2label, model)`` for the DeBERTa WSD checkpoint.

    Reads ``LABEL_MAP_FILE`` and ``MODEL_PATH``; the model is moved to
    ``device`` and switched to eval mode.
    """
    print("Loading resources for DeBERTa...")

    with open(LABEL_MAP_FILE, 'r') as label_file:
        label2id = json.load(label_file)
    # Swap keys and values: class id -> label name
    id2label = dict(zip(label2id.values(), label2id.keys()))

    tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)

    # One output unit per label in the map
    model = DebertaV3ForWSD.from_pretrained(MODEL_PATH, num_labels=len(label2id))
    model.to(device)
    model.eval()

    return tokenizer, id2label, model

In [15]:
# --- 4. Main Execution ---
tokenizer, id2label, model = load_resources()

results = []

print("\nStarting Benchmarking...")
print("-" * 60)

for filename in BENCHMARK_FILES:
    print(f"Processing {filename}...")
    metrics = evaluate_dataset(filename, model, tokenizer, id2label)
    if metrics:
        results.append(metrics)
        print(f"   -> Accuracy: {metrics['Accuracy']:.4f}")

print("-" * 60)

# Create Final DataFrame (one row per benchmark that was found on disk)
results_df = pd.DataFrame(results)

# Formatting for display
print("\n=== Final Benchmark Results ===")
print(results_df.to_string(index=False, float_format="%.4f"))

# Save to CSV; create the output directory first because to_csv() does not
# create missing parent directories
os.makedirs("./results", exist_ok=True)
results_df.to_csv("./results/deberta_benchmark_results.csv", index=False)
print("\nResults saved to 'deberta_benchmark_results.csv'")

Loading resources for DeBERTa...

Starting Benchmarking...
------------------------------------------------------------
Processing senseval2.parquet...
   -> Accuracy: 0.6074
Processing senseval3.parquet...
   -> Accuracy: 0.6168
Processing semeval2007.parquet...
   -> Accuracy: 0.6000
Processing semeval2013.parquet...
   -> Accuracy: 0.5182
Processing semeval2015.parquet...
   -> Accuracy: 0.5959
Processing ALL.parquet...
   -> Accuracy: 0.5875
------------------------------------------------------------

=== Final Benchmark Results ===
    Dataset  Accuracy  Micro F1  Macro F1  Samples
  senseval2    0.6074    0.6074    0.3902     2282
  senseval3    0.6168    0.6168    0.3845     1850
semeval2007    0.6000    0.6000    0.3941      455
semeval2013    0.5182    0.5182    0.3039     1644
semeval2015    0.5959    0.5959    0.3635     1022
        ALL    0.5875    0.5875    0.3367     7253

Results saved to 'deberta_benchmark_results.csv'


### DeBERTa - Logit Masking

In [16]:
# --- Configuration ---
# Checkpoint directory and label-map file read by the masked-DeBERTa `load_resources()` below.
# NOTE: this overwrites the MODEL_PATH set in the previous section.
MODEL_PATH = "./models/deberta_masked/deberta_wsd_masked/" 
LABEL_MAP_FILE = "label_map.json"

In [17]:
# --- Custom Dataset Class with Logit Masking ---
class WSDDataset(Dataset):
    """Benchmark rows -> model inputs, target token index and a per-row logit mask.

    Every row must provide ``sentence``, ``target_word`` and ``char_start``.
    The logit mask is additive: 0.0 for labels that WordNet lists as synsets of
    the target word, -1e4 for all others.
    """

    def __init__(self, dataframe, tokenizer, label2id, max_len=128):
        # A fresh 0..n-1 index keeps positional access aligned with DataLoader indices.
        self.data = dataframe.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.label2id = label2id
        self.num_labels = len(label2id)

    def __len__(self):
        return len(self.data)

    @staticmethod
    def _find_target_token(offsets, char_start):
        """Return the index of the first token that starts at or spans ``char_start``.

        Entries whose offsets are ``(0, 0)`` are skipped (presumably special and
        padding tokens). Falls back to index 0 when no token matches, e.g. when
        the target was cut off by truncation.
        """
        for token_idx, (tok_start, tok_end) in enumerate(offsets):
            if tok_start == 0 and tok_end == 0:
                continue
            if tok_start == char_start or tok_start < char_start < tok_end:
                return token_idx
        return 0

    def _build_logit_mask(self, target_word):
        """Return the additive mask over all labels for ``target_word``."""
        # Start with every label suppressed by a large negative value
        mask = torch.full((self.num_labels,), -1e4, dtype=torch.float32)

        # Candidates come purely from the target word; spaces become underscores
        # for the WordNet lookup
        lookup_word = target_word.replace(" ", "_")

        found_candidates = False
        for synset in wn.synsets(lookup_word):
            s_name = synset.name()
            if s_name in self.label2id:
                mask[self.label2id[s_name]] = 0.0
                found_candidates = True

        # Fallback: if no candidate is a known label, leave ALL labels unmasked
        if not found_candidates:
            mask = torch.zeros((self.num_labels,), dtype=torch.float32)
        return mask

    def __getitem__(self, idx):
        """Return inputs, ``target_token_idx`` and ``logit_mask`` for row ``idx``."""
        row = self.data.iloc[idx]

        encoding = self.tokenizer(
            row['sentence'],
            truncation=True,
            max_length=self.max_len,
            padding="max_length",
            return_offsets_mapping=True,
            return_tensors="pt"
        )

        # Index the batch dimension explicitly: a bare squeeze() would also drop
        # the sequence dimension whenever it has length 1.
        offsets = encoding['offset_mapping'][0].tolist()
        target_token_idx = self._find_target_token(offsets, row['char_start'])

        return {
            'input_ids': encoding['input_ids'][0],
            'attention_mask': encoding['attention_mask'][0],
            'target_token_idx': torch.tensor(target_token_idx, dtype=torch.long),
            'logit_mask': self._build_logit_mask(row['target_word'])
        }


In [18]:
# --- Custom Model Class ---
class DebertaV3ForWSD(DebertaV2PreTrainedModel):
    """DeBERTa target-token classifier with optional additive logit masking.

    This redefines the ``DebertaV3ForWSD`` class declared earlier in the
    notebook; the only addition is the ``logit_mask`` argument of ``forward``.
    """

    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.deberta = AutoModel.from_config(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
        self.post_init()

    def forward(self, input_ids=None, attention_mask=None, target_token_idx=None, labels=None, logit_mask=None, **kwargs):
        """Classify the token at ``target_token_idx``; add ``logit_mask`` to the logits if given.

        Returns:
            Dict with ``"loss"`` (``None`` unless ``labels`` is given) and the
            (possibly masked) ``"logits"``.
        """
        # Strip `num_items_in_batch` if supplied; the remaining kwargs go to the encoder.
        kwargs.pop("num_items_in_batch", None)

        hidden_states = self.deberta(
            input_ids=input_ids, attention_mask=attention_mask, **kwargs
        ).last_hidden_state

        # Select, for each row, the hidden state at that row's target position
        row_ids = torch.arange(input_ids.shape[0], device=input_ids.device)
        token_states = hidden_states[row_ids, target_token_idx]
        logits = self.classifier(self.dropout(token_states))

        # The mask is added before the loss, so the loss also sees masked logits
        if logit_mask is not None:
            logits = logits + logit_mask

        if labels is None:
            return {"loss": None, "logits": logits}

        criterion = nn.CrossEntropyLoss()
        return {
            "loss": criterion(logits.view(-1, self.num_labels), labels.view(-1)),
            "logits": logits,
        }


In [19]:
# --- 3. Helper Functions ---
def load_resources():
    """Return ``(tokenizer, id2label, model)`` for the logit-masked DeBERTa checkpoint.

    Reads ``LABEL_MAP_FILE`` and ``MODEL_PATH``; the model is moved to
    ``device`` and switched to eval mode.
    """
    print("Loading resources for DeBERTa Masked...")

    with open(LABEL_MAP_FILE, 'r') as label_file:
        label2id = json.load(label_file)
    # Reverse lookup: class id -> label name
    id2label = {class_id: label_name for label_name, class_id in label2id.items()}

    tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)

    n_labels = len(label2id)
    model = DebertaV3ForWSD.from_pretrained(MODEL_PATH, num_labels=n_labels)
    model.to(device)
    model.eval()

    return tokenizer, id2label, model


In [20]:
def evaluate_dataset(file_path, model, tokenizer, id2label,
                     data_dir="./evaluation_data/parquet/", batch_size=32):
    """Predict one sense per row using logit masking and score the predictions.

    NOTE: this redefines the ``evaluate_dataset`` declared earlier in the
    notebook; from this cell on, the name refers to the masked variant, which
    passes ``logit_mask`` to the model.

    Args:
        file_path: Parquet file name, resolved relative to ``data_dir``.
        model: Called with ``input_ids``, ``attention_mask``,
            ``target_token_idx`` and ``logit_mask``; must return a mapping with
            a ``'logits'`` entry.
        tokenizer: Tokenizer handed to ``WSDDataset``.
        id2label: Mapping from class id to label name; ids missing from it are
            reported as ``"UNK"``.
        data_dir: Directory that holds the benchmark parquet files.
        batch_size: Number of rows per inference batch.

    Returns:
        Dict with keys ``Dataset``, ``Accuracy``, ``Micro F1``, ``Macro F1`` and
        ``Samples``, or ``None`` (after printing a warning) if the file is missing.
    """
    file_path = os.path.join(data_dir, file_path)
    if not os.path.exists(file_path):
        print(f"Warning: File {file_path} not found.")
        return None

    df = pd.read_parquet(file_path)
    # The dataset needs name -> id to place the unmasked entries
    label2id = {v: k for k, v in id2label.items()}

    dataset = WSDDataset(df, tokenizer, label2id)
    loader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

    all_preds_ids = []

    # Inference Loop
    with torch.no_grad():
        for batch in loader:
            outputs = model(
                input_ids=batch['input_ids'].to(device),
                attention_mask=batch['attention_mask'].to(device),
                target_token_idx=batch['target_token_idx'].to(device),
                logit_mask=batch['logit_mask'].to(device)
            )
            # tolist() yields plain Python ints, matching the keys of id2label
            all_preds_ids.extend(torch.argmax(outputs['logits'], dim=1).cpu().tolist())

    # --- Metrics Calculation ---
    y_true_for_sklearn = []
    y_pred_for_sklearn = []

    # Pull the gold column once instead of materialising a row per prediction
    gold_column = df['gold_synsets'].tolist()

    for pred_id, gold_labels in zip(all_preds_ids, gold_column):
        pred_label = id2label.get(pred_id, "UNK")

        # Normalise gold to a list so membership and [0] behave the same for
        # arrays, lists and single labels
        if isinstance(gold_labels, np.ndarray):
            gold_labels = gold_labels.tolist()
        elif not isinstance(gold_labels, (list, tuple)):
            gold_labels = [gold_labels]

        if pred_label in gold_labels:
            # Any gold sense counts as correct: record Expected == Predicted
            expected_label = pred_label
        else:
            # Otherwise score against the first gold sense so F1 can be computed
            expected_label = gold_labels[0] if len(gold_labels) > 0 else "UNK"

        y_true_for_sklearn.append(expected_label)
        y_pred_for_sklearn.append(pred_label)

    acc = accuracy_score(y_true_for_sklearn, y_pred_for_sklearn)
    micro_f1 = f1_score(y_true_for_sklearn, y_pred_for_sklearn, average='micro')
    macro_f1 = f1_score(y_true_for_sklearn, y_pred_for_sklearn, average='macro')

    return {
        "Dataset": os.path.basename(file_path).replace('.parquet', ''),
        "Accuracy": acc,
        "Micro F1": micro_f1,
        "Macro F1": macro_f1,
        "Samples": len(df)
    }

In [21]:
# --- 4. Main Execution (logit-masked DeBERTa) ---
tokenizer, id2label, model = load_resources()

results = []
print("\nStarting Benchmarking with Logit Masking...")
print("-" * 60)

for filename in BENCHMARK_FILES:
    print(f"Processing {filename}...")
    metrics = evaluate_dataset(filename, model, tokenizer, id2label)
    if metrics:
        results.append(metrics)
        print(f"   -> Accuracy: {metrics['Accuracy']:.4f}")

print("-" * 60)

# Nothing is printed or written when no benchmark file was found
if results:
    results_df = pd.DataFrame(results)
    print("\n=== Final Benchmark Results ===")
    print(results_df.to_string(index=False, float_format="%.4f"))

    output_path = "./results/deberta_masked_benchmark_results.csv"
    # to_csv() does not create missing parent directories
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    results_df.to_csv(output_path, index=False)
    print(f"\nResults saved to '{output_path}'")

Loading resources for DeBERTa Masked...

Starting Benchmarking with Logit Masking...
------------------------------------------------------------
Processing senseval2.parquet...
   -> Accuracy: 0.6494
Processing senseval3.parquet...
   -> Accuracy: 0.6730
Processing semeval2007.parquet...
   -> Accuracy: 0.6264
Processing semeval2013.parquet...
   -> Accuracy: 0.6223
Processing semeval2015.parquet...
   -> Accuracy: 0.6614
Processing ALL.parquet...
   -> Accuracy: 0.6495
------------------------------------------------------------

=== Final Benchmark Results ===
    Dataset  Accuracy  Micro F1  Macro F1  Samples
  senseval2    0.6494    0.6494    0.4598     2282
  senseval3    0.6730    0.6730    0.4570     1850
semeval2007    0.6264    0.6264    0.4365      455
semeval2013    0.6223    0.6223    0.4177     1644
semeval2015    0.6614    0.6614    0.4304     1022
        ALL    0.6495    0.6495    0.4313     7253

Results saved to './results/deberta_masked_benchmark_results.csv'


### Gloss - DeBERTa

In [28]:
# --- Configuration ---
# Checkpoint directory for the gloss (sentence-pair) DeBERTa model; overwrites the
# MODEL_PATH set in the previous section.
MODEL_PATH = "./models/deberta_gloss/deberta_wsd_gloss/" 
# Number of (context, gloss) pairs per inference batch in evaluate_gloss_model()
BATCH_SIZE = 32

In [29]:
# --- 1. Dataset Class ---
class GlossInferenceDataset(Dataset):
    """Encodes (context, gloss) rows as unpadded sentence pairs.

    Padding is disabled here, so batching needs a collator that pads.
    """

    def __init__(self, dataframe, tokenizer, max_len=256):
        self.data = dataframe.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """Return the tokenizer fields for row ``idx`` (segment ids only if produced)."""
        record = self.data.iloc[idx]

        # The tokenizer receives the pair (context, gloss) as two text arguments
        encoded = self.tokenizer(
            record['context'],
            record['gloss'],
            truncation=True,
            max_length=self.max_len,
            padding=False,
        )

        features = {field: encoded[field] for field in ('input_ids', 'attention_mask')}

        # Segment ids are optional: forward them only when the tokenizer returned them
        if 'token_type_ids' in encoded:
            features['token_type_ids'] = encoded['token_type_ids']

        return features

In [30]:
# --- 2. Evaluation Function ---
def evaluate_gloss_model(original_filename, model, tokenizer, data_collator):
    """Score the gloss model on one benchmark.

    Pipeline:
    1. Load the candidate file ``evaluation_data/gloss/<name>_gloss.parquet``
       (one row per context/gloss candidate).
    2. Score every candidate with the probability of class 1.
    3. Keep the best-scoring candidate per ``instance_group_id``.
    4. Compare against ``gold_synsets`` of the ORIGINAL benchmark file, matching
       on its ``id`` column; instances without candidates are predicted "UNK".

    Args:
        original_filename: Benchmark parquet file name (e.g. ``"ALL.parquet"``).
        model: Sequence-pair classifier whose output exposes ``.logits`` with at
            least two classes.
        tokenizer: Tokenizer handed to ``GlossInferenceDataset``.
        data_collator: ``collate_fn`` for the DataLoader (must pad the batch).

    Returns:
        Dict with keys ``Dataset``, ``Accuracy``, ``Micro F1``, ``Macro F1`` and
        ``Samples``, or ``None`` (after printing a warning) if a file is missing.
    """
    # Paths
    original_path = os.path.join("evaluation_data/parquet", original_filename)
    gloss_filename = original_filename.replace(".parquet", "_gloss.parquet")
    gloss_path = os.path.join("evaluation_data/gloss", gloss_filename)

    if not os.path.exists(original_path):
        print(f"Warning: Original file {original_path} not found.")
        return None
    if not os.path.exists(gloss_path):
        print(f"Warning: Gloss file {gloss_path} not found.")
        return None

    # Load Dataframes
    df_original = pd.read_parquet(original_path)
    df_gloss = pd.read_parquet(gloss_path)

    loader = DataLoader(
        GlossInferenceDataset(df_gloss, tokenizer),
        batch_size=BATCH_SIZE,
        shuffle=False,
        collate_fn=data_collator
    )

    all_scores = []

    # --- Inference Loop ---
    # Only the score of the "Positive" class (label 1) is needed
    print(f"Running inference on {len(df_gloss)} pairs...")
    with torch.no_grad():
        for batch in tqdm(loader, desc="Inference"):
            # Use the notebook-wide `device`; no `DEVICE` name is defined in this notebook
            inputs = {
                'input_ids': batch['input_ids'].to(device),
                'attention_mask': batch['attention_mask'].to(device),
            }
            if 'token_type_ids' in batch:
                inputs['token_type_ids'] = batch['token_type_ids'].to(device)

            logits = model(**inputs).logits

            # Softmax over classes, then keep the probability of class 1
            probs = torch.softmax(logits, dim=1)
            all_scores.extend(probs[:, 1].cpu().numpy())

    # Attach the scores to the candidate rows (order preserved: shuffle=False)
    df_gloss['score'] = all_scores

    # --- Aggregation (Select Best Candidate) ---
    best_candidates_idx = df_gloss.groupby('instance_group_id')['score'].idxmax()
    best_predictions = df_gloss.loc[best_candidates_idx]
    prediction_map = dict(zip(best_predictions['instance_group_id'], best_predictions['candidate_synset']))

    # --- Final Scoring against Original Ground Truth ---
    y_true_for_sklearn = []
    y_pred_for_sklearn = []

    # Walk the ORIGINAL dataset so instances without candidates still count;
    # plain column iteration avoids building a Series per row as iterrows() does
    for inst_id, gold_synsets in zip(df_original['id'].tolist(), df_original['gold_synsets'].tolist()):
        # Ensure gold is a list
        if isinstance(gold_synsets, np.ndarray):
            gold_synsets = gold_synsets.tolist()
        elif not isinstance(gold_synsets, list):
            gold_synsets = [gold_synsets]

        pred_label = prediction_map.get(inst_id, "UNK")

        if pred_label in gold_synsets:
            # Any gold sense counts as correct: record Expected == Predicted
            expected_label = pred_label
        else:
            # For F1, align expected with the first valid gold sense
            expected_label = gold_synsets[0] if len(gold_synsets) > 0 else "UNK"

        y_true_for_sklearn.append(expected_label)
        y_pred_for_sklearn.append(pred_label)

    # Calculate Metrics
    acc = accuracy_score(y_true_for_sklearn, y_pred_for_sklearn)
    micro_f1 = f1_score(y_true_for_sklearn, y_pred_for_sklearn, average='micro')
    macro_f1 = f1_score(y_true_for_sklearn, y_pred_for_sklearn, average='macro')

    return {
        "Dataset": original_filename.replace('.parquet', ''),
        "Accuracy": acc,
        "Micro F1": micro_f1,
        "Macro F1": macro_f1,
        "Samples": len(df_original)
    }


In [31]:
def load_resources():
    """
    Build everything needed to run the gloss classifier.

    Loads the tokenizer and a two-label sequence-classification model from
    MODEL_PATH, registers the '[TGT]' marker as an additional special token
    when the tokenizer does not list it yet, moves the model to DEVICE and
    switches it to eval mode.

    Returns:
        (tokenizer, model, data_collator), where the collator is a
        DataCollatorWithPadding bound to the returned tokenizer.
    """
    print("Loading Gloss DeBERTa resources...")

    gloss_tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)

    # The target-word marker has to be a known special token before encoding.
    target_marker = '[TGT]'
    marker_registered = target_marker in gloss_tokenizer.additional_special_tokens
    if not marker_registered:
        gloss_tokenizer.add_special_tokens({'additional_special_tokens': [target_marker]})

    # num_labels=2: the model scores each (context, gloss) pair as one of two classes.
    gloss_model = AutoModelForSequenceClassification.from_pretrained(MODEL_PATH, num_labels=2)
    gloss_model.to(DEVICE)
    gloss_model.eval()

    padding_collator = DataCollatorWithPadding(tokenizer=gloss_tokenizer)
    return gloss_tokenizer, gloss_model, padding_collator

In [32]:
# Load the gloss model once and reuse it for every benchmark file.
tokenizer, model, data_collator = load_resources()

results = []

print("\nStarting Gloss DeBERTa Benchmarking...")
print("-" * 60)

for filename in BENCHMARK_FILES:
    print(f"Processing {filename}...")
    metrics = evaluate_gloss_model(filename, model, tokenizer, data_collator)
    # A falsy return means this file produced no metrics; keep it out of the table.
    if metrics:
        results.append(metrics)
        print(f"   -> Accuracy: {metrics['Accuracy']:.4f}")

print("-" * 60)

# One row per successfully evaluated dataset.
results_df = pd.DataFrame(results)

if results:
    print("\n=== Final Benchmark Results (Gloss DeBERTa) ===")
    print(results_df.to_string(index=False, float_format="%.4f"))

    # Save to CSV and report the path that was actually written.
    gloss_results_path = "./results/deberta_gloss_benchmark_results.csv"
    os.makedirs(os.path.dirname(gloss_results_path), exist_ok=True)
    results_df.to_csv(gloss_results_path, index=False)
    print(f"\nResults saved to '{gloss_results_path}'")
else:
    # Nothing was evaluated: do not write a column-less CSV that later
    # cells would then try to load as a results table.
    print("No results generated for Gloss DeBERTa.")

Loading Gloss DeBERTa resources...

Starting Gloss DeBERTa Benchmarking...
------------------------------------------------------------
Processing senseval2.parquet...
Running inference on 16736 pairs...


Inference:   0%|          | 0/523 [00:00<?, ?it/s]

   -> Accuracy: 0.7401
Processing senseval3.parquet...
Running inference on 16789 pairs...


Inference:   0%|          | 0/525 [00:00<?, ?it/s]

   -> Accuracy: 0.7103
Processing semeval2007.parquet...
Running inference on 4634 pairs...


Inference:   0%|          | 0/145 [00:00<?, ?it/s]

   -> Accuracy: 0.6857
Processing semeval2013.parquet...
Running inference on 11451 pairs...


Inference:   0%|          | 0/358 [00:00<?, ?it/s]

   -> Accuracy: 0.7749
Processing semeval2015.parquet...
Running inference on 7582 pairs...


Inference:   0%|          | 0/237 [00:00<?, ?it/s]

   -> Accuracy: 0.7808
Processing ALL.parquet...
Running inference on 57192 pairs...


Inference:   0%|          | 0/1788 [00:00<?, ?it/s]

   -> Accuracy: 0.7427
------------------------------------------------------------

=== Final Benchmark Results (Gloss DeBERTa) ===
    Dataset  Accuracy  Micro F1  Macro F1  Samples
  senseval2    0.7401    0.7401    0.5278     2282
  senseval3    0.7103    0.7103    0.5290     1850
semeval2007    0.6857    0.6857    0.5345      455
semeval2013    0.7749    0.7749    0.5969     1644
semeval2015    0.7808    0.7808    0.5963     1022
        ALL    0.7427    0.7427    0.5503     7253

Results saved to 'deberta_gloss_benchmark_results.csv'


### Lesk & Most Frequent Sense (MFS)

> I will write functions to obtain predictions from three baselines: MFS, MFS with automatic part-of-speech tagging, and the Lesk algorithm.

In [5]:
def get_wordnet_pos(treebank_tag):
    """
    Translate a Treebank-style POS tag into the matching WordNet POS constant.

    Tags beginning with 'J', 'V', 'N' or 'R' map to wn.ADJ, wn.VERB,
    wn.NOUN and wn.ADV respectively; every other tag yields None.
    """
    prefix_to_constant = (
        ('J', 'ADJ'),
        ('V', 'VERB'),
        ('N', 'NOUN'),
        ('R', 'ADV'),
    )
    for prefix, constant_name in prefix_to_constant:
        if treebank_tag.startswith(prefix):
            # Read from wn only once a prefix matched.
            return getattr(wn, constant_name)
    return None

In [6]:
def predict_mfs_pos(sentence, target_word):
    """
    Most-frequent-sense prediction that takes POS and lemma into account.

    The sentence is tokenized and POS-tagged; the tag of the first token that
    equals the target word (case-insensitively) selects the WordNet POS used
    for lemmatization and lookup. Lookups are relaxed step by step
    (lemma + POS, lemma only, raw target word) until one returns synsets.

    Returns:
        The name of the first synset found, or "UNK" if every lookup is empty.
    """
    tagged = nltk.pos_tag(word_tokenize(sentence))
    needle = target_word.lower()

    # Only the first occurrence of the target decides the POS.
    matching_tags = [tag for token, tag in tagged if token.lower() == needle]
    wn_pos = get_wordnet_pos(matching_tags[0]) if matching_tags else None

    if wn_pos:
        lemma = lemmatizer.lemmatize(needle, pos=wn_pos)
        candidates = wn.synsets(lemma, pos=wn_pos)
        if not candidates:
            # The POS-restricted lookup came back empty: retry with the lemma alone.
            candidates = wn.synsets(lemma)
    else:
        # No usable POS: lemmatize with the lemmatizer's default and search all POS.
        lemma = lemmatizer.lemmatize(needle)
        candidates = wn.synsets(lemma)

    if not candidates:
        # Last resort: query the word exactly as it was given.
        candidates = wn.synsets(target_word)

    return candidates[0].name() if candidates else "UNK"

In [7]:
def predict_mfs(target_word):
    """
    Most Frequent Sense (MFS) baseline.

    Looks the word up in WordNet with no POS restriction and returns the name
    of the first synset in the result, which this baseline treats as the most
    frequent sense. Returns "UNK" when WordNet has no synsets for the word.
    """
    candidates = wn.synsets(target_word)
    return candidates[0].name() if candidates else "UNK"

In [8]:
def predict_lesk(sentence, target_word):
    """
    Lesk (gloss overlap) baseline.

    Tokenizes the sentence, passes the tokens and the target word to
    nltk.wsd.lesk and returns the chosen synset's name, or "UNK" when
    lesk returns nothing.
    """
    chosen = lesk(word_tokenize(sentence), target_word)
    return chosen.name() if chosen else "UNK"

In [9]:
def calculate_metrics_for_baseline(df, method_name):
    """
    Calculates metrics for a specific baseline method (MFS, MFS_POS, or Lesk).
    Replicates the logic used in the transformer evaluation for consistency,
    including the normalisation of the gold labels to a plain list.

    Args:
        df: table-like object whose iterrows() yields rows exposing the
            'sentence', 'target_word' and 'gold_synsets' fields.
        method_name: "Lesk", "MFS" or "MFS_POS". Any other value predicts
            "UNK" for every row.

    Returns:
        Tuple (accuracy, micro_f1, macro_f1).
    """
    y_true_for_sklearn = []
    y_pred_for_sklearn = []

    for _, row in df.iterrows():
        sentence = row['sentence']
        target_word = row['target_word']
        gold_labels = row['gold_synsets']

        # Ensure gold is a list. Without this a scalar string gold label would
        # be substring-matched by `in`, and gold_labels[0] would be a single
        # character instead of a synset name.
        if isinstance(gold_labels, np.ndarray):
            gold_labels = gold_labels.tolist()
        elif isinstance(gold_labels, tuple):
            gold_labels = list(gold_labels)
        elif not isinstance(gold_labels, list):
            gold_labels = [gold_labels]

        # 1. Prediction step
        if method_name == "Lesk":
            pred_label = predict_lesk(sentence, target_word)
        elif method_name == "MFS":
            pred_label = predict_mfs(target_word)
        elif method_name == "MFS_POS":
            pred_label = predict_mfs_pos(sentence, target_word)
        else:
            pred_label = "UNK"

        # 2. Evaluation logic: a prediction is correct if it matches ANY gold
        # synset. For the sklearn metrics, a miss is scored against the first
        # gold synset ("UNK" when there is none).
        if pred_label in gold_labels:
            true_label = pred_label
        else:
            true_label = gold_labels[0] if len(gold_labels) > 0 else "UNK"

        y_true_for_sklearn.append(true_label)
        y_pred_for_sklearn.append(pred_label)

    # Calculate metrics
    acc = accuracy_score(y_true_for_sklearn, y_pred_for_sklearn)
    micro_f1 = f1_score(y_true_for_sklearn, y_pred_for_sklearn, average='micro')
    macro_f1 = f1_score(y_true_for_sklearn, y_pred_for_sklearn, average='macro')

    return acc, micro_f1, macro_f1

In [10]:
def run_benchmark_loop(method_name, file_list, output_filename,
                       data_dir="./evaluation_data/parquet/",
                       results_dir="./results/"):
    """
    Iterates through all files for a specific method, prints a table,
    and saves the results to a CSV file.

    Args:
        method_name: baseline identifier forwarded to
            calculate_metrics_for_baseline.
        file_list: parquet file names, resolved relative to data_dir.
        output_filename: name of the CSV written inside results_dir.
        data_dir: directory holding the benchmark parquet files.
        results_dir: directory for the output CSV; created if missing.

    Returns:
        DataFrame with one row per evaluated dataset, or None when no file
        could be evaluated (in which case nothing is written).
    """
    print(f"\nStarting Benchmarking for method: {method_name}")
    print("=" * 70)
    print(f"{'Dataset':<20} | {'Accuracy':<10} | {'Micro F1':<10} | {'Macro F1':<10}")
    print("-" * 70)

    results = []

    for filename in file_list:
        file_path = os.path.join(data_dir, filename)

        if not os.path.exists(file_path):
            print(f"Warning: File {file_path} not found.")
            continue

        # Load dataset
        df = pd.read_parquet(file_path)
        dataset_name = filename.replace('.parquet', '')

        # Calculate metrics
        acc, micro, macro = calculate_metrics_for_baseline(df, method_name)

        results.append({
            "Dataset": dataset_name,
            "Accuracy": acc,
            "Micro F1": micro,
            "Macro F1": macro,
            "Samples": len(df)
        })

        # Print row to console
        print(f"{dataset_name:<20} | {acc:.4f}     | {micro:.4f}     | {macro:.4f}")

    print("-" * 70)

    if not results:
        print(f"No results generated for {method_name}.")
        return None

    # Save results to CSV. The directory is created on demand so the baselines
    # can run on their own, before any other cell has created it.
    results_df = pd.DataFrame(results)
    os.makedirs(results_dir, exist_ok=True)
    save_path = os.path.join(results_dir, output_filename)
    results_df.to_csv(save_path, index=False)
    print(f"Results for {method_name} saved to '{save_path}'")
    return results_df

In [13]:
# Baseline 1 of 3: Lesk (gloss overlap) on every benchmark file.
lesk_df = run_benchmark_loop(
    "Lesk",
    BENCHMARK_FILES,
    "lesk_benchmark_results.csv",
)


Starting Benchmarking for method: Lesk
Dataset              | Accuracy   | Micro F1   | Macro F1  
----------------------------------------------------------------------
senseval2            | 0.3545     | 0.3545     | 0.1868
senseval3            | 0.3097     | 0.3097     | 0.1678
semeval2007          | 0.2110     | 0.2110     | 0.1319
semeval2013          | 0.3631     | 0.3631     | 0.2403
semeval2015          | 0.3542     | 0.3542     | 0.2009
ALL                  | 0.3360     | 0.3360     | 0.1992
----------------------------------------------------------------------
Results for Lesk saved to './results/lesk_benchmark_results.csv'


In [14]:
# Baseline 2 of 3: plain Most Frequent Sense (no POS information).
mfs_df = run_benchmark_loop(
    "MFS",
    BENCHMARK_FILES,
    "mfs_benchmark_results.csv",
)


Starting Benchmarking for method: MFS
Dataset              | Accuracy   | Micro F1   | Macro F1  
----------------------------------------------------------------------
senseval2            | 0.4939     | 0.4939     | 0.2824
senseval3            | 0.4708     | 0.4708     | 0.2874
semeval2007          | 0.3956     | 0.3956     | 0.2584
semeval2013          | 0.5335     | 0.5335     | 0.3919
semeval2015          | 0.4941     | 0.4941     | 0.2954
ALL                  | 0.4908     | 0.4908     | 0.2972
----------------------------------------------------------------------
Results for MFS saved to './results/mfs_benchmark_results.csv'


In [15]:
# Baseline 3 of 3: Most Frequent Sense restricted by the tagged POS and lemma.
mfs_pos_df = run_benchmark_loop(
    "MFS_POS",
    BENCHMARK_FILES,
    "mfs_pos_benchmark_results.csv",
)


Starting Benchmarking for method: MFS_POS
Dataset              | Accuracy   | Micro F1   | Macro F1  
----------------------------------------------------------------------
senseval2            | 0.6091     | 0.6091     | 0.3933
senseval3            | 0.5897     | 0.5897     | 0.3866
semeval2007          | 0.5099     | 0.5099     | 0.3651
semeval2013          | 0.5493     | 0.5493     | 0.4018
semeval2015          | 0.6067     | 0.6067     | 0.4097
ALL                  | 0.5840     | 0.5840     | 0.3766
----------------------------------------------------------------------
Results for MFS_POS saved to './results/mfs_pos_benchmark_results.csv'


### Final comparison

In [16]:
# Directory that holds the per-model benchmark CSVs.
RESULTS_DIR = "./results/"

# Result files to merge, derived from the model prefixes in load order.
files = [
    f"{prefix}_benchmark_results.csv"
    for prefix in (
        "bert",
        "lesk",
        "roberta",
        "deberta",
        "deberta_gloss",
        "mfs",
        "mfs_pos",
        "deberta_masked",
    )
]

In [18]:
# Collect one DataFrame per results file that is present on disk.
dfs = []

print("Loading files...")
for filename in files:
    file_path = os.path.join(RESULTS_DIR, filename)

    if not os.path.exists(file_path):
        print(f"Warning: File {filename} not found.")
        continue

    # e.g. "deberta_gloss_benchmark_results.csv" -> "DEBERTA GLOSS"
    model_name = filename.replace("_benchmark_results.csv", "").replace("_", " ").upper()
    df = pd.read_csv(file_path).assign(Model=model_name)
    dfs.append(df)
print("Done!")

Loading files...
Done!


In [19]:
# Stack the per-model tables into one frame with a fresh 0..n-1 index.
full_df = pd.concat(dfs, ignore_index=True)

# Put 'Model' first and keep only the expected columns that are actually present.
cols = [
    c
    for c in ('Model', 'Dataset', 'Accuracy', 'Micro F1', 'Macro F1', 'Samples')
    if c in full_df.columns
]
full_df = full_df.loc[:, cols]

full_df.to_csv("./results/combined_benchmark_results.csv", index=False)
print("\nSaved combined results to './results/combined_benchmark_results.csv'")


Saved combined results to './results/combined_benchmark_results.csv'


In [20]:
# Plain-text dump of the merged table, floats rendered with four decimals.
print(
    "\n=== All Results Combined ===",
    full_df.to_string(index=False, float_format="%.4f"),
    sep="\n",
)


=== All Results Combined ===
         Model     Dataset  Accuracy  Micro F1  Macro F1  Samples
          BERT   senseval2    0.6109    0.6109    0.3988     2282
          BERT   senseval3    0.6216    0.6216    0.3920     1850
          BERT semeval2007    0.5978    0.5978    0.4014      455
          BERT semeval2013    0.5237    0.5237    0.3075     1644
          BERT semeval2015    0.6057    0.6057    0.3794     1022
          BERT         ALL    0.5923    0.5923    0.3483     7253
          LESK   senseval2    0.3545    0.3545    0.1868     2282
          LESK   senseval3    0.3097    0.3097    0.1678     1850
          LESK semeval2007    0.2110    0.2110    0.1319      455
          LESK semeval2013    0.3631    0.3631    0.2403     1644
          LESK semeval2015    0.3542    0.3542    0.2009     1022
          LESK         ALL    0.3360    0.3360    0.1992     7253
       ROBERTA   senseval2    0.6078    0.6078    0.3976     2282
       ROBERTA   senseval3    0.6162    0.6162

In [22]:
print("\n=== Comparison on 'ALL' Dataset ===")
# Rows for the 'ALL' dataset only, ordered from lowest to highest accuracy.
all_dataset_df = (
    full_df.loc[full_df['Dataset'].eq('ALL')]
    .sort_values(by='Accuracy', ascending=True)
)
print(
    all_dataset_df.loc[:, ['Model', 'Accuracy', 'Macro F1']]
    .to_string(index=False, float_format="%.4f")
)


=== Comparison on 'ALL' Dataset ===
         Model  Accuracy  Macro F1
          LESK    0.3360    0.1992
           MFS    0.4908    0.2972
       MFS POS    0.5840    0.3766
       DEBERTA    0.5875    0.3367
       ROBERTA    0.5894    0.3426
          BERT    0.5923    0.3483
DEBERTA MASKED    0.6495    0.4313
 DEBERTA GLOSS    0.7427    0.5503
