## Unmasked

In [None]:
import os
import json
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import (
    AutoTokenizer,
    AutoModel,
    DebertaV2PreTrainedModel, 
    DebertaV2Config
)
from transformers.modeling_outputs import SequenceClassifierOutput
from tqdm.notebook import tqdm
import ast

In [59]:
# Restrict transformers' log output to ERROR level, which hides its warnings
import logging
from transformers import logging as hf_logging
hf_logging.set_verbosity_error()

In [60]:
# --- 1. Custom Dataset Class ---
class WSDDataset(Dataset):
    """Dataset that tokenizes a sentence and locates its target word.

    Every row of ``dataframe`` is read for two fields: ``sentence`` (the text
    to tokenize) and ``char_start`` (character offset where the target word
    begins). Each item is a dict with ``input_ids``, ``attention_mask`` and
    ``target_token_idx``.
    """

    def __init__(self, dataframe, tokenizer, max_len=128):
        # A fresh 0..n-1 index keeps the positional lookups in __getitem__ simple.
        self.data = dataframe.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def _find_target_token(self, offsets, char_start):
        """Return the index of the first token that starts at, or spans, ``char_start``.

        Returns 0 when no token matches (for instance when the target was
        cut off by truncation).
        """
        for position, (tok_start, tok_end) in enumerate(offsets):
            # (0, 0) offsets carry no text -- presumably special/padding tokens.
            if tok_start == 0 and tok_end == 0:
                continue
            begins_here = tok_start == char_start
            covers_start = tok_start < char_start < tok_end
            if begins_here or covers_start:
                return position
        return 0

    def __getitem__(self, idx):
        record = self.data.iloc[idx]
        sentence = record['sentence']
        char_start = record['char_start']

        # Offsets map every token back to its character span in the sentence.
        encoded = self.tokenizer(
            sentence,
            truncation=True,
            max_length=self.max_len,
            padding="max_length",
            return_offsets_mapping=True,
            return_tensors="pt",
        )

        token_offsets = encoded['offset_mapping'].squeeze().tolist()
        target_position = self._find_target_token(token_offsets, char_start)

        return {
            'input_ids': encoded['input_ids'].squeeze(),
            'attention_mask': encoded['attention_mask'].squeeze(),
            'target_token_idx': torch.tensor(target_position, dtype=torch.long),
        }

In [61]:
# --- 2. Custom Model Class ---
class DebertaV3ForWSD(DebertaV2PreTrainedModel):
    """Encoder plus a linear classifier applied to one target token per example."""

    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        # Submodule attribute names define the parameter names in the state dict,
        # so they are left exactly as they were.
        self.deberta = AutoModel.from_config(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
        self.post_init()

    def forward(self, input_ids=None, attention_mask=None, target_token_idx=None, labels=None, **kwargs):
        """Classify the token at ``target_token_idx`` of every sequence in the batch.

        Returns a ``SequenceClassifierOutput`` holding ``logits`` and, when
        ``labels`` is given, the cross-entropy ``loss`` (otherwise ``None``).
        """
        # Removed so that it is not forwarded to the encoder call below.
        kwargs.pop("num_items_in_batch", None)

        encoder_out = self.deberta(input_ids=input_ids, attention_mask=attention_mask, **kwargs)
        hidden_states = encoder_out.last_hidden_state

        # Pairwise indexing: example i contributes its hidden state at target_token_idx[i].
        row_ids = torch.arange(input_ids.shape[0], device=input_ids.device)
        target_repr = self.dropout(hidden_states[row_ids, target_token_idx])
        logits = self.classifier(target_repr)

        if labels is None:
            loss = None
        else:
            criterion = nn.CrossEntropyLoss()
            loss = criterion(logits.view(-1, self.num_labels), labels.view(-1))

        return SequenceClassifierOutput(loss=loss, logits=logits)

In [62]:
# --- 3. Setup Paths and Device ---
# Checkpoint directory, evaluation set and label map used by the unmasked run.
model_path = "./models/deberta/deberta_wsd_custom"
test_data_path = "./evaluation_data/parquet/ALL.parquet"
label_map_path = "./label_map.json"

# Run on the GPU when torch reports one, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Device: {device}")

Device: cuda


In [63]:
# --- 4. Load Resources ---

# Load Label Map (Label -> ID) and invert it (ID -> Label)
# Read the label map (label -> id) and derive the reverse lookup (id -> label).
with open(label_map_path, 'r', encoding='utf-8') as label_file:
    label2id = json.load(label_file)

id2label = dict(zip(label2id.values(), label2id.keys()))

# Tokenizer, config and weights are all loaded from the same checkpoint directory.
tokenizer = AutoTokenizer.from_pretrained(model_path)
config = DebertaV2Config.from_pretrained(model_path)

model = DebertaV3ForWSD.from_pretrained(model_path, config=config)
model.to(device)
model.eval()  # evaluation mode (e.g. dropout is inactive)

# Evaluation data; shuffle=False keeps batches in row order so that
# predictions can later be paired with the rows of df_test.
df_test = pd.read_parquet(test_data_path)
test_dataset = WSDDataset(df_test, tokenizer)
test_dataloader = DataLoader(test_dataset, batch_size=32, shuffle=False)

In [64]:
# --- 5. Inference Loop ---
all_preds = []

print("Running inference...")
with torch.no_grad():  # gradients are not needed for evaluation
    for batch in tqdm(test_dataloader):
        # Move the three model inputs of this batch to the target device.
        model_inputs = {
            key: batch[key].to(device)
            for key in ('input_ids', 'attention_mask', 'target_token_idx')
        }

        outputs = model(
            input_ids=model_inputs['input_ids'],
            attention_mask=model_inputs['attention_mask'],
            target_token_idx=model_inputs['target_token_idx'],
        )

        # Class id with the highest logit for every example in the batch.
        batch_preds = outputs.logits.argmax(dim=1).cpu().numpy()
        all_preds.extend(batch_preds)

Running inference...


  0%|          | 0/227 [00:00<?, ?it/s]

In [65]:
# --- 6. Process Results ---
results = []
correct_count = 0
total_count = len(df_test)

# Predictions were collected in row order, so they have to be paired with the
# rows by POSITION. iterrows() yields index *labels*, which only coincide with
# positions when df_test has a default 0..n-1 index.
assert len(all_preds) == total_count, (
    f"got {len(all_preds)} predictions for {total_count} test rows"
)

for position, (_, row) in enumerate(df_test.iterrows()):
    # Plain int so the dict lookup does not depend on the numpy scalar type.
    predicted_id = int(all_preds[position])

    # Convert the id back to its string label.
    predicted_label_str = id2label.get(predicted_id, "UNKNOWN")

    # A prediction counts as correct when it matches any of the gold synsets.
    gold_list = row['gold_synsets']
    is_correct = bool(predicted_label_str in gold_list)

    if is_correct:
        correct_count += 1

    # Result row: the original fields (without character offsets) plus the outcome.
    res_row = row.drop(['char_start', 'char_end']).to_dict()
    res_row['predicted_label'] = predicted_label_str
    res_row['is_correct'] = is_correct
    results.append(res_row)

# Calculate Accuracy
accuracy = correct_count / total_count if total_count > 0 else 0.0

# Print Accuracy
print(f"\nModel Accuracy: {accuracy:.4f} ({correct_count}/{total_count})")

# Create Final DataFrame; numpy arrays become plain lists so the CSV holds
# a parseable list representation.
final_df = pd.DataFrame(results)
final_df['gold_synsets'] = final_df['gold_synsets'].apply(lambda x: x.tolist() if isinstance(x, np.ndarray) else x)

# Make sure the output directory exists before writing.
os.makedirs("./results", exist_ok=True)
final_df.to_csv("./results/unmasked_results.csv", index=False)

# Display first few rows
final_df.head()


Model Accuracy: 0.5875 (4261/7253)


Unnamed: 0,id,sentence,target_word,gold_synsets,predicted_label,is_correct
0,senseval2.d000.s000.t000,The art of change-ringing is peculiar to the E...,art,[art.n.03],art.n.01,False
1,senseval2.d000.s000.t001,The art of change-ringing is peculiar to the E...,change-ringing,[change_ringing.n.01],change.n.03,False
2,senseval2.d000.s000.t002,The art of change-ringing is peculiar to the E...,peculiar,"[particular.s.01, peculiar.s.04]",particular.s.01,True
3,senseval2.d000.s000.t003,The art of change-ringing is peculiar to the E...,English,[english.n.02],english.n.01,False
4,senseval2.d000.s000.t004,The art of change-ringing is peculiar to the E...,most,[most.a.01],most.a.01,True


## Masked

In [66]:
import nltk
# Add the local ./nltk_data directory to NLTK's data search path
# (presumably where the WordNet corpus used below is stored -- verify).
nltk.data.path.append("./nltk_data")
from nltk.corpus import wordnet as wn

In [67]:
# --- 1. Custom Dataset Class ---
class WSDDataset(Dataset):
    """Dataset that tokenizes a sentence, locates its target word and builds a logit mask.

    Every row of ``dataframe`` is read for ``sentence``, ``target_word`` and
    ``char_start``. Each item is a dict with ``input_ids``, ``attention_mask``,
    ``target_token_idx`` and ``logit_mask``; the mask is an additive vector of
    length ``len(label2id)``.
    """

    def __init__(self, dataframe, tokenizer, label2id, max_len=128):
        self.data = dataframe.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.label2id = label2id
        self.num_labels = len(label2id)

    def __len__(self):
        return len(self.data)

    def _find_target_token(self, offsets, char_start):
        """Return the index of the first token that starts at, or spans, ``char_start``.

        Returns 0 when no token matches.
        """
        for position, (tok_start, tok_end) in enumerate(offsets):
            # (0, 0) offsets carry no text -- presumably special/padding tokens.
            if tok_start == 0 and tok_end == 0:
                continue
            if tok_start == char_start or tok_start < char_start < tok_end:
                return position
        return 0

    def _build_logit_mask(self, target_word):
        """Build the additive mask: 0.0 for candidate senses, -1e4 for all others.

        Candidates are the WordNet synsets of ``target_word`` whose names are
        keys of ``label2id``. When there are none, an all-zero mask is
        returned so that no label is suppressed.
        """
        lookup_word = target_word.replace(" ", "_")
        synset_names = (synset.name() for synset in wn.synsets(lookup_word))
        candidate_ids = [
            self.label2id[name] for name in synset_names if name in self.label2id
        ]

        if not candidate_ids:
            return torch.zeros((self.num_labels,), dtype=torch.float32)

        mask = torch.full((self.num_labels,), -1e4, dtype=torch.float32)
        for label_id in candidate_ids:
            mask[label_id] = 0.0
        return mask

    def __getitem__(self, idx):
        record = self.data.iloc[idx]
        sentence = record['sentence']
        target_word = record['target_word']
        char_start = record['char_start']

        # Offsets map every token back to its character span in the sentence.
        encoded = self.tokenizer(
            sentence,
            truncation=True,
            max_length=self.max_len,
            padding="max_length",
            return_offsets_mapping=True,
            return_tensors="pt",
        )

        token_offsets = encoded['offset_mapping'].squeeze().tolist()
        target_position = self._find_target_token(token_offsets, char_start)

        return {
            'input_ids': encoded['input_ids'].squeeze(),
            'attention_mask': encoded['attention_mask'].squeeze(),
            'target_token_idx': torch.tensor(target_position, dtype=torch.long),
            'logit_mask': self._build_logit_mask(target_word),
        }

In [68]:
# --- 2. Custom Model Class ---
class DebertaV3ForWSD(DebertaV2PreTrainedModel):
    """Encoder plus a linear classifier on one target token, with optional logit masking."""

    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        # Submodule attribute names define the parameter names in the state dict,
        # so they are left exactly as they were.
        self.deberta = AutoModel.from_config(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
        self.post_init()

    def forward(self, input_ids=None, attention_mask=None, target_token_idx=None, labels=None, logit_mask=None, **kwargs):
        """Classify the token at ``target_token_idx`` of every sequence in the batch.

        ``logit_mask``, when given, is added to the logits before the loss is
        computed and before they are returned. Returns a
        ``SequenceClassifierOutput`` with ``logits`` and, when ``labels`` is
        given, the cross-entropy ``loss`` (otherwise ``None``).
        """
        # Removed so that it is not forwarded to the encoder call below.
        kwargs.pop("num_items_in_batch", None)

        encoder_out = self.deberta(input_ids=input_ids, attention_mask=attention_mask, **kwargs)
        hidden_states = encoder_out.last_hidden_state

        # Pairwise indexing: example i contributes its hidden state at target_token_idx[i].
        row_ids = torch.arange(input_ids.shape[0], device=input_ids.device)
        target_repr = self.dropout(hidden_states[row_ids, target_token_idx])
        logits = self.classifier(target_repr)

        # Additive mask: large negative entries push non-candidate labels down.
        if logit_mask is not None:
            logits = logits + logit_mask

        if labels is None:
            loss = None
        else:
            criterion = nn.CrossEntropyLoss()
            loss = criterion(logits.view(-1, self.num_labels), labels.view(-1))

        return SequenceClassifierOutput(loss=loss, logits=logits)

In [69]:
# --- 3. Setup Paths and Device ---
# Checkpoint directory, evaluation set and label map used by the masked run.
model_path = "./models/deberta_masked/deberta_wsd_masked"
test_data_path = "./evaluation_data/parquet/ALL.parquet"
label_map_path = "./label_map.json"

# Run on the GPU when torch reports one, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Device: {device}")

Device: cuda


In [70]:
# --- 4. Load Resources ---
# Load Label Map
# Read the label map (label -> id) and derive the reverse lookup (id -> label).
with open(label_map_path, 'r', encoding='utf-8') as label_file:
    label2id = json.load(label_file)
id2label = dict(zip(label2id.values(), label2id.keys()))

# Tokenizer, config and weights are all loaded from the same checkpoint directory.
tokenizer = AutoTokenizer.from_pretrained(model_path)
config = DebertaV2Config.from_pretrained(model_path)

model = DebertaV3ForWSD.from_pretrained(model_path, config=config)
model.to(device)
model.eval()  # evaluation mode (e.g. dropout is inactive)

# Evaluation data; shuffle=False keeps batches in row order so that
# predictions can later be paired with the rows of df_test.
df_test = pd.read_parquet(test_data_path)
dataset = WSDDataset(df_test, tokenizer, label2id)
dataloader = DataLoader(dataset, batch_size=32, shuffle=False)

In [71]:
# --- 5. Inference Loop ---
all_preds = []
all_masks_str = []  # per example: list of candidate labels, or the fallback marker

print("Running inference with Logit Masking...")

with torch.no_grad():  # gradients are not needed for evaluation
    for batch in tqdm(dataloader):
        input_ids, attention_mask, target_token_idx, logit_mask = (
            batch[key].to(device)
            for key in ('input_ids', 'attention_mask', 'target_token_idx', 'logit_mask')
        )

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            target_token_idx=target_token_idx,
            logit_mask=logit_mask,
        )

        # Class id with the highest (masked) logit for every example.
        all_preds.extend(outputs.logits.argmax(dim=1).cpu().numpy())

        # Reconstruct, for the report, which labels each mask left active.
        # Mask entries are 0.0 (active) or -1e4 (suppressed); the -100
        # threshold separates the two.
        for mask_row in logit_mask.cpu().numpy():
            active_indices = np.flatnonzero(mask_row > -100.0)

            if len(active_indices) == len(mask_row):
                # Nothing suppressed: the dataset fell back to an all-zero mask.
                all_masks_str.append("ALL_LABELS (Fallback)")
            else:
                all_masks_str.append([id2label[label_id] for label_id in active_indices])

Running inference with Logit Masking...


  0%|          | 0/227 [00:00<?, ?it/s]

In [72]:
# --- 6. Build Results & Calculate Accuracy ---
results = []
correct_count = 0
total_count = len(df_test)

# Predictions and masks were collected in row order, so they have to be paired
# with the rows by POSITION. iterrows() yields index *labels*, which only
# coincide with positions when df_test has a default 0..n-1 index.
assert len(all_preds) == total_count, (
    f"got {len(all_preds)} predictions for {total_count} test rows"
)
assert len(all_masks_str) == total_count, (
    f"got {len(all_masks_str)} mask descriptions for {total_count} test rows"
)

for position, (_, row) in enumerate(df_test.iterrows()):
    # Plain int so the dict lookup does not depend on the numpy scalar type.
    pred_id = int(all_preds[position])
    pred_label = id2label.get(pred_id, "UNKNOWN")
    mask_used = all_masks_str[position]

    # A prediction counts as correct when it matches any of the gold synsets.
    gold_list = row['gold_synsets']
    is_correct = bool(pred_label in gold_list)

    if is_correct:
        correct_count += 1

    # Result row: the original fields (without character offsets) plus the outcome.
    res_row = row.drop(['char_start', 'char_end']).to_dict()
    res_row['predicted_label'] = pred_label
    res_row['is_correct'] = is_correct
    res_row['logit_mask_candidates'] = mask_used

    results.append(res_row)

# Calculate Accuracy
accuracy = correct_count / total_count if total_count > 0 else 0.0
print(f"\nModel Accuracy with Masking: {accuracy:.4f} ({correct_count}/{total_count})")

# Create Final DataFrame; numpy arrays become plain lists so the CSV holds
# a parseable list representation.
final_df = pd.DataFrame(results)
final_df['gold_synsets'] = final_df['gold_synsets'].apply(lambda x: x.tolist() if isinstance(x, np.ndarray) else x)

# Make sure the output directory exists before writing.
os.makedirs("./results", exist_ok=True)
final_df.to_csv("./results/masked_results.csv", index=False)

# Display result
final_df.head()


Model Accuracy with Masking: 0.6495 (4711/7253)


Unnamed: 0,id,sentence,target_word,gold_synsets,predicted_label,is_correct,logit_mask_candidates
0,senseval2.d000.s000.t000,The art of change-ringing is peculiar to the E...,art,[art.n.03],art.n.01,False,"[art.n.01, art.n.02, art.n.03, artwork.n.01]"
1,senseval2.d000.s000.t001,The art of change-ringing is peculiar to the E...,change-ringing,[change_ringing.n.01],pickup.n.01,False,ALL_LABELS (Fallback)
2,senseval2.d000.s000.t002,The art of change-ringing is peculiar to the E...,peculiar,"[particular.s.01, peculiar.s.04]",particular.s.01,True,"[curious.s.01, particular.s.01]"
3,senseval2.d000.s000.t003,The art of change-ringing is peculiar to the E...,English,[english.n.02],english.a.01,False,"[english.a.01, english.n.01, english.n.02, eng..."
4,senseval2.d000.s000.t004,The art of change-ringing is peculiar to the E...,most,[most.a.01],most.a.01,True,"[about.r.07, most.a.01, most.a.02, most.r.01, ..."


## Merged

In [16]:
# Reload the per-model result tables written by the "Unmasked" and "Masked" sections.
# List-valued columns come back from CSV as strings (e.g. "['art.n.03']").
df_unmasked = pd.read_csv("./results/unmasked_results.csv")
df_masked = pd.read_csv("./results/masked_results.csv")

In [17]:
# 1. Rename columns in the unmasked dataframe
# 1. Prefix the result columns of each run so both can live in one table.
unmasked_renames = {
    'predicted_label': 'unmasked_predicted_label',
    'is_correct': 'unmasked_is_correct',
}
masked_renames = {
    'predicted_label': 'masked_predicted_label',
    'is_correct': 'masked_is_correct',
    'logit_mask_candidates': 'masked_logit_candidates',
}
df_unmasked_ready = df_unmasked.rename(columns=unmasked_renames)
df_masked_ready = df_masked.rename(columns=masked_renames)

# 2. From the masked table only the join key and its result columns are needed;
#    'sentence', 'target_word', etc. already come from the unmasked table.
columns_to_merge = ['id'] + list(masked_renames.values())

# 3. Join on 'id', keeping only rows present in both tables, then drop the key.
merged_df = df_unmasked_ready.merge(
    df_masked_ready[columns_to_merge],
    on='id',
    how='inner',
)
merged_df = merged_df.drop(columns=["id"])

# Display results
print(f"Shape of merged dataframe: {merged_df.shape}")
merged_df.head()

Shape of merged dataframe: (7253, 8)


Unnamed: 0,sentence,target_word,gold_synsets,unmasked_predicted_label,unmasked_is_correct,masked_predicted_label,masked_is_correct,masked_logit_candidates
0,The art of change-ringing is peculiar to the E...,art,['art.n.03'],art.n.01,False,art.n.01,False,"['art.n.01', 'art.n.02', 'art.n.03', 'artwork...."
1,The art of change-ringing is peculiar to the E...,change-ringing,['change_ringing.n.01'],change.n.03,False,pickup.n.01,False,ALL_LABELS (Fallback)
2,The art of change-ringing is peculiar to the E...,peculiar,"['particular.s.01', 'peculiar.s.04']",particular.s.01,True,particular.s.01,True,"['curious.s.01', 'particular.s.01']"
3,The art of change-ringing is peculiar to the E...,English,['english.n.02'],english.n.01,False,english.a.01,False,"['english.a.01', 'english.n.01', 'english.n.02..."
4,The art of change-ringing is peculiar to the E...,most,['most.a.01'],most.a.01,True,most.a.01,True,"['about.r.07', 'most.a.01', 'most.a.02', 'most..."


## Comparison

> Compare the model with and without using the mask.

In [18]:
# 1. Define Boolean masks for the comparison cases
# 1. Per-row outcome flags for each model.
unmasked_right = merged_df['unmasked_is_correct'].eq(True)
unmasked_wrong = merged_df['unmasked_is_correct'].eq(False)
masked_right = merged_df['masked_is_correct'].eq(True)
masked_wrong = merged_df['masked_is_correct'].eq(False)

# Rows on which exactly one of the two models was correct.
unmasked_better_mask = unmasked_right & masked_wrong
masked_better_mask = masked_right & unmasked_wrong

# 2. Count the disagreements in each direction.
count_unmasked_better = unmasked_better_mask.sum()
count_masked_better = masked_better_mask.sum()
net_improvement = count_masked_better - count_unmasked_better

# 3. Summary.
print("--- Performance Comparison Summary ---")
print(f"Cases where Unmasked model won (Masking hurt):    {count_unmasked_better}")
print(f"Cases where Masked model won (Masking improved):  {count_masked_better}")
print(f"Net Improvement (Masked wins - Unmasked wins):    {net_improvement}")

--- Performance Comparison Summary ---
Cases where Unmasked model won (Masking hurt):    321
Cases where Masked model won (Masking improved):  771
Net Improvement (Masked wins - Unmasked wins):    450


In [19]:
# --- 4. Display Examples ---
# Header, then the first rows where only the unmasked model was correct.
separator = "="*80
print("\n" + separator)
print(f"EXAMPLES: Unmasked model Correct vs Masked model Incorrect (Count: {count_unmasked_better})")
print(separator)
df_unmasked_wins = merged_df.loc[unmasked_better_mask]
display(df_unmasked_wins.head(5))


EXAMPLES: Unmasked model Correct vs Masked model Incorrect (Count: 321)


Unnamed: 0,sentence,target_word,gold_synsets,unmasked_predicted_label,unmasked_is_correct,masked_predicted_label,masked_is_correct,masked_logit_candidates
75,"Now , only one local ringer remains : 64-year-...",Now,['nowadays.r.01'],nowadays.r.01,True,now.r.01,False,"['immediately.r.01', 'now.n.01', 'now.r.01', '..."
126,"History , after all , is not on his side .",History,['history.n.01'],history.n.01,True,history.n.03,False,"['history.n.01', 'history.n.02', 'history.n.03..."
141,According to a nationwide survey taken a year ...,ring,['ring.v.03'],ring.v.03,True,call.v.03,False,"['call.v.03', 'gang.n.01', 'hoop.n.02', 'resou..."
144,It is easy to see why the ancient art is on th...,ancient,['ancient.s.01'],ancient.s.01,True,ancient.s.02,False,"['ancient.s.01', 'ancient.s.02']"
149,The less complicated version of playing tunes ...,playing,['play.v.06'],play.v.06,True,play.v.03,False,"['act.v.03', 'act.v.05', 'act.v.10', 'acting.n..."


In [20]:
# Header, then the first rows where only the masked model was correct.
separator = "="*80
print("\n" + separator)
print(f"EXAMPLES: Masked model Correct vs Unmasked model Incorrect (Count: {count_masked_better})")
print(separator)
df_masked_wins = merged_df.loc[masked_better_mask]
display(df_masked_wins.head(5))


EXAMPLES: Masked model Correct vs Unmasked model Incorrect (Count: 771)


Unnamed: 0,sentence,target_word,gold_synsets,unmasked_predicted_label,unmasked_is_correct,masked_predicted_label,masked_is_correct,masked_logit_candidates
6,The art of change-ringing is peculiar to the E...,peculiarities,"['peculiarity.n.01', 'peculiarity.n.02']",curious.s.01,False,peculiarity.n.01,True,"['curio.n.01', 'peculiarity.n.01', 'peculiarit..."
17,"Of all scenes that evoke rural England , this ...",ancient,['ancient.s.02'],ancient.s.01,False,ancient.s.02,True,"['ancient.s.01', 'ancient.s.02']"
24,"Of all scenes that evoke rural England , this ...",cascading,['cascade.v.01'],pour.v.02,False,cascade.v.01,True,['cascade.v.01']
29,The parishioners of St. Michael and All Angels...,parishioners,['parishioner.n.01'],resident.n.01,False,parishioner.n.01,True,['parishioner.n.01']
42,"In the tower , five men and women pull rhythmi...",ropes,['rope.n.01'],wire.n.01,False,rope.n.01,True,['rope.n.01']


> Analysis of errors depending on the availability of the correct answer in the mask.

In [23]:
def check_gold_availability(row):
    """
    Checks if at least one gold synset exists in the masked candidates list.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) providing
            'masked_logit_candidates' and 'gold_synsets'. Either value may be
            a collection of labels or the string representation of one, as
            produced by a CSV round trip.

    Returns:
        bool: True when the two label collections share at least one element,
        or when the candidates are the "ALL_LABELS (Fallback)" marker (no
        label was masked out). False when they are disjoint or when either
        value is missing or cannot be interpreted as a collection of labels.
    """
    candidates = row['masked_logit_candidates']
    gold_synsets = row['gold_synsets']

    # --- Normalize Candidates (Mask) ---
    if isinstance(candidates, str):
        # Fallback marker: nothing was masked, so nothing can be missing.
        if "ALL_LABELS (Fallback)" in candidates:
            return True

        # Otherwise the string should be the repr of a list of labels.
        try:
            candidates = ast.literal_eval(candidates)
        except (ValueError, SyntaxError):
            return False

    # --- Normalize Gold Synsets ---
    if isinstance(gold_synsets, str):
        try:
            gold_synsets = ast.literal_eval(gold_synsets)
        except (ValueError, SyntaxError):
            # Not a Python literal: treat the text as a single bare label
            # instead of letting set() split it into characters.
            gold_synsets = [gold_synsets]

    # A lone string (e.g. a parsed quoted label) is one label, not a
    # sequence of characters.
    if isinstance(candidates, str):
        candidates = [candidates]
    if isinstance(gold_synsets, str):
        gold_synsets = [gold_synsets]

    # --- Check Intersection ---
    try:
        gold_set = set(gold_synsets)
        candidate_set = set(candidates)
    except TypeError:
        # None, NaN (an empty CSV cell) or another non-iterable/unhashable value.
        return False

    # Return True if there is ANY common element
    return not gold_set.isdisjoint(candidate_set)

In [24]:
# Evaluate check_gold_availability once per row (axis=1) and store the result
# as a new column; note that this modifies merged_df in place.
merged_df['gold_available_in_mask'] = merged_df.apply(check_gold_availability, axis=1)

In [30]:
# Mask validity statistics
gold_in_mask = merged_df['gold_available_in_mask']

# Rows whose gold label was NOT among the candidates
# (original note: missing in train or NLTK failure).
missing_gold_count = int((gold_in_mask == False).sum())

# Rows where a gold label WAS among the candidates but the masked model still failed.
masked_failed = merged_df['masked_is_correct'] == False
present_but_wrong_count = int(((gold_in_mask == True) & masked_failed).sum())

rule = "="*40
print(rule)
print(" COMPARISON REPORT ")
print(rule)
print(f"Gold label MISSING from candidates:  {missing_gold_count} (Impossible to solve)")
print(f"Gold label AVAILABLE but predicted wrong: {present_but_wrong_count}")

 COMPARISON REPORT 
Gold label MISSING from candidates:  460 (Impossible to solve)
Gold label AVAILABLE but predicted wrong: 2082


> Cases where the masked model was wrong even though the correct answer was in the mask, while the unmasked model answered correctly.

In [56]:
# Masked model wrong, unmasked model right, and a gold label was among the candidates.
masked_wrong = merged_df['masked_is_correct'].eq(False)
unmasked_right = merged_df['unmasked_is_correct'].eq(True)
gold_offered = merged_df['gold_available_in_mask'].eq(True)

mask = masked_wrong & unmasked_right & gold_offered
display(mask.sum())
merged_df[mask].head()

315

Unnamed: 0,sentence,target_word,gold_synsets,unmasked_predicted_label,unmasked_is_correct,masked_predicted_label,masked_is_correct,masked_logit_candidates,gold_available_in_mask
75,"Now , only one local ringer remains : 64-year-...",Now,['nowadays.r.01'],nowadays.r.01,True,now.r.01,False,"['immediately.r.01', 'now.n.01', 'now.r.01', '...",True
126,"History , after all , is not on his side .",History,['history.n.01'],history.n.01,True,history.n.03,False,"['history.n.01', 'history.n.02', 'history.n.03...",True
141,According to a nationwide survey taken a year ...,ring,['ring.v.03'],ring.v.03,True,call.v.03,False,"['call.v.03', 'gang.n.01', 'hoop.n.02', 'resou...",True
144,It is easy to see why the ancient art is on th...,ancient,['ancient.s.01'],ancient.s.01,True,ancient.s.02,False,"['ancient.s.01', 'ancient.s.02']",True
149,The less complicated version of playing tunes ...,playing,['play.v.06'],play.v.06,True,play.v.03,False,"['act.v.03', 'act.v.05', 'act.v.10', 'acting.n...",True


> There are 315 such cases out of the 321 in which the model without a mask outperformed the model with a mask, so in almost all of them the masked model's error cannot be attributed to the gold label being masked out. In general, the model with a mask is more effective because it restricts the prediction space and selects the class with the highest probability from the POSSIBLE ones. The model without a mask, however, learns under harder conditions, since it selects from ALL possible classes, which in some cases allows it to predict the correct class with high confidence.