In [1]:
import pandas as pd
import nltk
nltk.data.path.append("./nltk_data")
from nltk.corpus import wordnet as wn
from tqdm import tqdm

In [2]:
def mark_target_word(text, char_start, char_end, marker_start="[TGT]", marker_end="[TGT]"):
    """
    Surround the span text[char_start:char_end] with marker strings.

    The markers are inserted verbatim: no whitespace is added around them,
    so pass padded markers (e.g. " [TGT] ") when spacing is wanted.

    Example (default markers, span 4-8):
        "The bank is closed."  ->  "The [TGT]bank[TGT] is closed."
    """
    before = text[:char_start]
    target = text[char_start:char_end]
    after = text[char_end:]
    return "".join((before, marker_start, target, marker_end, after))

In [3]:
def prepare_gloss_dataset(input_path, output_path):
    """
    Convert a word-sense dataset into a binary context/gloss pair dataset.

    Reads a parquet file with one row per annotated target word. For every
    WordNet synset returned for that word, one output row is produced that
    pairs the marked sentence with the synset's gloss. A pair is labelled 1
    when the synset name equals the input row's ``label`` and 0 otherwise.

    An instance is dropped entirely when:
      * WordNet returns no synsets for the target word, or
      * none of the returned synsets matches the row's label (the instance
        would otherwise contribute only negative pairs).
    Both counts are printed in the summary.

    Args:
        input_path: Parquet file with the columns ``sentence``,
            ``target_word``, ``label``, ``char_start``, ``char_end`` and
            ``sentence_id``.
        output_path: Destination parquet file. Its columns are
            ``instance_group_id``, ``context``, ``gloss``,
            ``candidate_synset`` and ``label``.

    Returns:
        None. The result is written to ``output_path`` and a summary is
        printed to stdout.
    """
    output_columns = ['instance_group_id', 'context', 'gloss', 'candidate_synset', 'label']

    df = pd.read_parquet(input_path)

    new_rows = []
    skipped_no_candidates = 0
    skipped_no_positive = 0

    print(f"Processing {len(df)} rows from {input_path}...")

    for _, row in tqdm(df.iterrows(), total=len(df)):
        true_label_name = row['label']  # e.g., 'group.n.01'

        # 1. Candidate synsets from WordNet. Multi-word expressions are
        #    looked up with underscores instead of spaces.
        lookup_word = row['target_word'].replace(" ", "_")
        candidates = wn.synsets(lookup_word)

        if not candidates:
            # Words unknown to WordNet are skipped.
            skipped_no_candidates += 1
            continue

        # 2. Context with the target span wrapped in padded [TGT] markers.
        context_with_markers = mark_target_word(
            row['sentence'],
            row['char_start'],
            row['char_end'],
            marker_start=" [TGT] ",
            marker_end=" [TGT] "
        )

        # Identifier shared by every pair generated from this instance
        # (sentence + position), so a later train/val split can keep all
        # pairs of one instance on the same side.
        instance_group_id = f"{row['sentence_id']}_{row['char_start']}"

        current_word_rows = []
        has_positive = False

        for synset in candidates:
            synset_name = synset.name()

            # 3. Gloss text; fall back to the lemma names when the
            #    definition is missing or blank.
            gloss = synset.definition()
            if not gloss or not gloss.strip():
                gloss = ", ".join(lemma.name().replace("_", " ") for lemma in synset.lemmas())

            # 4. Binary label: 1 for the annotated sense, 0 for every other candidate.
            label = 1 if synset_name == true_label_name else 0
            has_positive = has_positive or label == 1

            current_word_rows.append({
                'instance_group_id': instance_group_id,
                'context': context_with_markers,
                'gloss': gloss,
                'candidate_synset': synset_name,
                'label': label
            })

        if has_positive:
            new_rows.extend(current_word_rows)
        else:
            skipped_no_positive += 1

    # Explicit columns (and an integer label dtype) keep the schema stable even
    # when every instance was filtered out; without them an empty result has no
    # 'label' column and the summary below raises KeyError.
    gloss_df = pd.DataFrame(new_rows, columns=output_columns).astype({'label': 'int64'})

    print(f"Created {len(gloss_df)} gloss-context pairs.")
    print(f"Positive samples: {gloss_df['label'].sum()}")
    print(
        f"Skipped instances: {skipped_no_candidates} without WordNet candidates, "
        f"{skipped_no_positive} whose label is not among the candidates."
    )

    gloss_df.to_parquet(output_path)
    print(f"Saved to {output_path}")

In [4]:
# Convert semcor_train.parquet into context/gloss pairs and write them to semcor_gloss_train.parquet.
prepare_gloss_dataset("semcor_train.parquet", "semcor_gloss_train.parquet")

Processing 224515 rows from semcor_train.parquet...


100%|█████████████████████████████████████████████████████████████████████████| 224515/224515 [00:38<00:00, 5845.46it/s]


Created 1925220 gloss-context pairs.
Positive samples: 210495
Saved to semcor_gloss_train.parquet


In [5]:
# Reload the generated gloss dataset from disk for inspection.
df = pd.read_parquet("semcor_gloss_train.parquet")

In [11]:
# Preview the first rows: one row per (context, candidate synset) pair.
df.head()

Unnamed: 0,instance_group_id,context,gloss,candidate_synset,label
0,0_29,The Fulton County Grand Jury [TGT] said [TGT]...,express in words,state.v.01,1
1,0_29,The Fulton County Grand Jury [TGT] said [TGT]...,report or maintain,allege.v.01,0
2,0_29,The Fulton County Grand Jury [TGT] said [TGT]...,express a supposition,suppose.v.01,0
3,0_29,The Fulton County Grand Jury [TGT] said [TGT]...,have or contain a certain wording or form,read.v.02,0
4,0_29,The Fulton County Grand Jury [TGT] said [TGT]...,give instructions to or direct somebody to do ...,order.v.01,0
