In [26]:
import pandas as pd
import nltk
import os
nltk.data.path.append("./nltk_data") 
from nltk.corpus import wordnet as wn
from tqdm.notebook import tqdm
import numpy as np

In [27]:
def mark_target_word(text, char_start, char_end, marker_start="[TGT]", marker_end="[TGT]"):
    """
    Surround the span ``text[char_start:char_end]`` with marker strings.

    The markers are inserted verbatim; no whitespace is added around them,
    so pass markers that already contain spaces if spacing is wanted.

    Example (default markers):
        mark_target_word("The bank is closed.", 4, 8)
        -> "The [TGT]bank[TGT] is closed."

    Args:
        text: The full sentence.
        char_start: Index of the first character of the target span.
        char_end: Index one past the last character of the target span.
        marker_start: String inserted directly before the span.
        marker_end: String inserted directly after the span.

    Returns:
        The sentence with both markers inserted.
    """
    before_target = text[:char_start]
    target_span = text[char_start:char_end]
    after_target = text[char_end:]
    return before_target + marker_start + target_span + marker_end + after_target

> The function below converts the evaluation datasets to gloss format, similarly to the conversion of the training data. However, it keeps ALL instances for which WordNet candidates are found, regardless of whether a gold label is present among them. This prevents selection bias: the model must still try to predict the best gloss even if the true answer is missing from the candidates.

In [28]:
def prepare_eval_gloss_dataset(input_file, output_path, input_dir="parquet"):
    """
    Convert an evaluation dataset into gloss (context, gloss) pair format.

    Every input row is one target-word instance. For each WordNet synset
    returned for the target word, one output row is produced that holds the
    marked context sentence, the synset gloss and a binary label. Instances
    are kept even when none of their candidates is a gold synset; instances
    for which WordNet returns no candidates at all are skipped.

    Args:
        input_file: File name of the source parquet file inside ``input_dir``.
            The code reads the columns 'id', 'sentence', 'target_word',
            'gold_synsets', 'char_start' and 'char_end'.
        output_path: Destination path of the resulting parquet file.
        input_dir: Directory that contains ``input_file``.

    Returns:
        None. The result is written to ``output_path`` and summary statistics
        are printed. If the input file does not exist, a message is printed
        and nothing is written.
    """
    input_path = os.path.join(input_dir, input_file)
    # Check if file exists to avoid errors
    if not os.path.exists(input_path):
        print(f"File not found: {input_path}")
        return

    df = pd.read_parquet(input_path)

    # Fixed schema so that the output frame has these columns even when no
    # instance yields a candidate (otherwise gloss_df['label'] raises KeyError).
    output_columns = ['instance_group_id', 'context', 'gloss', 'candidate_synset', 'label']
    new_rows = []

    print(f"Processing {len(df)} rows from {input_path}...")

    # Counters for statistics
    total_instances = 0
    positive_found_count = 0

    for _, row in tqdm(df.iterrows(), total=len(df)):
        # Normalise gold_synsets to a container that supports `in`.
        gold_synsets = row['gold_synsets']
        if isinstance(gold_synsets, np.ndarray):
            gold_synsets = gold_synsets.tolist()
        elif not isinstance(gold_synsets, (list, tuple, set, frozenset)):
            gold_synsets = [gold_synsets]

        # 1. Create context with markers (space-padded so the markers stay
        #    separate from the neighbouring words).
        context_with_markers = mark_target_word(
            row['sentence'],
            row['char_start'],
            row['char_end'],
            marker_start=" [TGT] ",
            marker_end=" [TGT] "
        )

        # 2. Get all candidate synsets from WordNet. Multi-word targets are
        #    looked up with underscores; no part-of-speech filter is applied.
        lookup_word = row['target_word'].replace(" ", "_")
        candidates = wn.synsets(lookup_word)

        if not candidates:
            # Nothing to rank for this instance; it is not counted.
            continue

        # Use the existing unique ID from the dataset to group the candidates.
        instance_group_id = row['id']
        has_positive = False

        for synset in candidates:
            synset_name = synset.name()

            # 3. Fall back to the lemma names when the definition is empty.
            gloss = synset.definition()
            if not (gloss and gloss.strip()):
                gloss = ", ".join(lemma.name().replace("_", " ") for lemma in synset.lemmas())

            # 4. Determine label
            label = int(synset_name in gold_synsets)
            has_positive = has_positive or label == 1

            new_rows.append({
                'instance_group_id': instance_group_id,
                'context': context_with_markers,
                'gloss': gloss,
                'candidate_synset': synset_name,
                'label': label
            })

        # The instance is kept even if no candidate carries a gold label.
        total_instances += 1
        if has_positive:
            positive_found_count += 1

    gloss_df = pd.DataFrame(new_rows, columns=output_columns)
    # Keep an integer label column even for an empty result.
    gloss_df = gloss_df.astype({'label': 'int64'})

    print(f"Created {len(gloss_df)} gloss-context pairs.")
    print(f"Original instances processed (with candidates): {total_instances}")
    print(f"Instances containing at least one correct label: {positive_found_count}")
    print(f"Positive samples (rows with label=1): {gloss_df['label'].sum()}")

    gloss_df.to_parquet(output_path)
    print(f"Saved to {output_path}\n")

In [29]:
# Evaluation benchmarks to convert; each name maps to "<name>.parquet".
# "ALL" is presumably the concatenation of the five individual benchmarks
# (the row counts in the recorded output add up) - confirm against the data source.
datasets = [
    f"{benchmark}.parquet"
    for benchmark in (
        "senseval2",
        "senseval3",
        "semeval2007",
        "semeval2013",
        "semeval2015",
        "ALL",
    )
]

In [30]:
# Process datasets: convert every evaluation file and store the result
# as "<name>_gloss.parquet" inside the output directory.
output_dir = "gloss"
# exist_ok=True makes the cell safe to re-run and avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(output_dir, exist_ok=True)

for dataset_file in datasets:
    # Strip only the file extension, e.g. "senseval2.parquet" -> "senseval2_gloss.parquet".
    output_filename = os.path.splitext(dataset_file)[0] + "_gloss.parquet"
    output_path = os.path.join(output_dir, output_filename)
    prepare_eval_gloss_dataset(dataset_file, output_path)

Processing 2282 rows from parquet/senseval2.parquet...


  0%|          | 0/2282 [00:00<?, ?it/s]

Created 16736 gloss-context pairs.
Original instances processed (with candidates): 2257
Instances containing at least one correct label: 2249
Positive samples (rows with label=1): 2356
Saved to gloss/senseval2_gloss.parquet

Processing 1850 rows from parquet/senseval3.parquet...


  0%|          | 0/1850 [00:00<?, ?it/s]

Created 16789 gloss-context pairs.
Original instances processed (with candidates): 1798
Instances containing at least one correct label: 1785
Positive samples (rows with label=1): 1823
Saved to gloss/senseval3_gloss.parquet

Processing 455 rows from parquet/semeval2007.parquet...


  0%|          | 0/455 [00:00<?, ?it/s]

Created 4634 gloss-context pairs.
Original instances processed (with candidates): 445
Instances containing at least one correct label: 442
Positive samples (rows with label=1): 446
Saved to gloss/semeval2007_gloss.parquet

Processing 1644 rows from parquet/semeval2013.parquet...


  0%|          | 0/1644 [00:00<?, ?it/s]

Created 11451 gloss-context pairs.
Original instances processed (with candidates): 1643
Instances containing at least one correct label: 1643
Positive samples (rows with label=1): 1655
Saved to gloss/semeval2013_gloss.parquet

Processing 1022 rows from parquet/semeval2015.parquet...


  0%|          | 0/1022 [00:00<?, ?it/s]

Created 7582 gloss-context pairs.
Original instances processed (with candidates): 1011
Instances containing at least one correct label: 1011
Positive samples (rows with label=1): 1209
Saved to gloss/semeval2015_gloss.parquet

Processing 7253 rows from parquet/ALL.parquet...


  0%|          | 0/7253 [00:00<?, ?it/s]

Created 57192 gloss-context pairs.
Original instances processed (with candidates): 7154
Instances containing at least one correct label: 7130
Positive samples (rows with label=1): 7489
Saved to gloss/ALL_gloss.parquet



> Conversion check

In [31]:
# Reload the converted "ALL" dataset from the gloss output directory to inspect the result.
df = pd.read_parquet("gloss/ALL_gloss.parquet")

In [33]:
# Preview the first 10 rows of the converted dataset.
df.head(10)

Unnamed: 0,instance_group_id,context,gloss,candidate_synset,label
0,senseval2.d000.s000.t000,The [TGT] art [TGT] of change-ringing is pec...,the products of human creativity; works of art...,art.n.01,0
1,senseval2.d000.s000.t000,The [TGT] art [TGT] of change-ringing is pec...,the creation of beautiful or significant things,art.n.02,0
2,senseval2.d000.s000.t000,The [TGT] art [TGT] of change-ringing is pec...,a superior skill that you can learn by study a...,art.n.03,1
3,senseval2.d000.s000.t000,The [TGT] art [TGT] of change-ringing is pec...,photographs or other visual representations in...,artwork.n.01,0
4,senseval2.d000.s000.t002,The art of change-ringing is [TGT] peculiar [...,beyond or deviating from the usual or expected,curious.s.01,0
5,senseval2.d000.s000.t002,The art of change-ringing is [TGT] peculiar [...,unique or specific to a person or thing or cat...,particular.s.01,1
6,senseval2.d000.s000.t002,The art of change-ringing is [TGT] peculiar [...,markedly different from the usual; ; -Virginia...,peculiar.s.03,0
7,senseval2.d000.s000.t002,The art of change-ringing is [TGT] peculiar [...,characteristic of one only; distinctive or spe...,peculiar.s.04,1
8,senseval2.d000.s000.t003,The art of change-ringing is peculiar to the ...,an Indo-European language belonging to the Wes...,english.n.01,0
9,senseval2.d000.s000.t003,The art of change-ringing is peculiar to the ...,the people of England,english.n.02,1
