In [2]:
import os
import json
import pandas as pd
import nltk
nltk.data.path.append("./nltk_data")
from nltk.corpus import wordnet as wn
from typing import List, Dict, Union, Tuple

> Conversion of the original sense notation format into synsets used in the training dataset

In [3]:
def sense_key_to_synset_name(sense_key_str: str) -> List[str]:
    """
    Converts a string of Sense Keys (separated by ';') into a list of Synset names.
    Example: "art%1:09:00::" -> ["art.n.03"]

    Args:
        sense_key_str: One or more sense keys joined by ';'. Whitespace around
            each key and empty segments (e.g. from a trailing ';') are ignored.

    Returns:
        Synset names in order of first appearance, without duplicates. Keys
        that cannot be resolved are skipped, so the list may be empty.
    """
    synset_names = []

    for raw_key in sense_key_str.split(';'):
        key = raw_key.strip()
        if not key:
            # Nothing to look up for an empty segment
            continue
        try:
            # Sense key -> lemma -> owning synset -> synset name
            synset_names.append(wn.lemma_from_key(key).synset().name())
        except Exception:
            # Best-effort conversion: a key that cannot be resolved is skipped
            # rather than aborting the whole record.
            continue

    # Several keys may map to the same synset. dict.fromkeys removes the
    # duplicates while keeping first-seen order, so the result is identical
    # from run to run (a set would make the order hash-dependent).
    return list(dict.fromkeys(synset_names))

In [4]:
# Sanity check: resolve the single sense key "art%1:09:00::" to its synset name.
# The call is the last expression so the notebook displays the returned list.
sense_key_str = "art%1:09:00::"
sense_key_to_synset_name(sense_key_str)

['art.n.03']

> Conversion of the original target word indication format to the start and end character indices used in the training dataset

In [5]:
def get_char_offsets(sentence: str, token_start: int, token_end: int) -> Tuple[int, int]:
    """
    Maps a token span onto character positions within the sentence.

    The sentence is split on single spaces; token_start is the index of the
    first token of the span and token_end is one past its last token.

    Returns a (start, end) pair of character indices such that
    sentence[start:end] covers the span. A component that is never reached
    while scanning is reported as -1.
    """
    span_start, span_end = -1, -1
    last_token = token_end - 1
    cursor = 0  # character position where the current token begins

    for position, word in enumerate(sentence.split(' ')):
        if position == token_start:
            span_start = cursor

        # Move to just past the current token
        cursor += len(word)

        if position == last_token:
            # End of the span: no need to scan the rest of the sentence
            span_end = cursor
            break

        # Step over the separating space
        cursor += 1

    return span_start, span_end


In [6]:
# Get indices for "old age": tokens 4 and 5 of the space-separated sentence.
# token_end is exclusive (the function stops at token index token_end - 1).
sentence = "Mr. Hammond worries that old age and the flightiness of youth will diminish the ranks of the East Anglian group that keeps the Aslacton bells pealing ."
token_start = 4
token_end = 6
get_char_offsets(sentence, token_start, token_end)

(25, 32)

> Function for converting the entire dataset

In [6]:
def process_benchmark_file(input_path: str, output_path: str, input_dir: str = "jsonl") -> None:
    """
    Reads the JSONL benchmark file, transforms data to the training format, and saves as Parquet.

    Each non-blank input line is parsed as a JSON object from which the keys
    'id', 'sentence', 'word', 'sense', 'start' and 'end' are read.

    Args:
        input_path: File name of the benchmark, resolved relative to input_dir.
        output_path: Destination Parquet file. Its parent directory is created
            if it does not exist yet.
        input_dir: Directory that holds the JSONL files (defaults to "jsonl").

    Output columns: id, sentence, target_word, char_start, char_end, gold_synsets.
    Records whose sense keys resolve to no synset are left out; their number is
    reported after the summary line.
    """
    columns = ['id', 'sentence', 'target_word', 'char_start', 'char_end', 'gold_synsets']

    input_path = os.path.join(input_dir, input_path)
    rows = []
    skipped_ids = []

    with open(input_path, 'r', encoding='utf-8') as f:
        for line in f:
            # Blank lines (e.g. a trailing newline at the end of the file) are
            # not records and would make json.loads fail.
            if not line.strip():
                continue
            record = json.loads(line)

            # Convert Sense Keys to Synset Names (Gold Labels).
            # Stored as a list because benchmarks can have multiple valid answers.
            gold_synsets = sense_key_to_synset_name(record['sense'])
            if not gold_synsets:
                # No usable gold label: drop the record, but remember it so
                # the loss is visible in the summary.
                skipped_ids.append(record.get('id'))
                continue

            # Translate the token span into character offsets
            sent_text = record['sentence']
            c_start, c_end = get_char_offsets(sent_text, record['start'], record['end'])

            rows.append({
                'id': record['id'],
                'sentence': sent_text,
                'target_word': record['word'],
                'char_start': c_start,
                'char_end': c_end,
                'gold_synsets': gold_synsets,  # List of valid strings, e.g. ['art.n.01']
            })

    # Passing the column list keeps the schema stable even when no record survives
    df = pd.DataFrame(rows, columns=columns)

    # Make sure the destination directory exists before writing
    out_dir = os.path.dirname(output_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    df.to_parquet(output_path, index=False)
    print(f"Processed {input_path}: {len(df)} records saved to {output_path}")
    if skipped_ids:
        print(f"  Skipped {len(skipped_ids)} records without a resolvable sense key (first id: {skipped_ids[0]})")

In [7]:
# Benchmark files to convert; process_benchmark_file resolves each name
# inside the "jsonl" directory.
files = ["ALL.jsonl", "semeval2007.jsonl", "semeval2013.jsonl", "semeval2015.jsonl", "senseval2.jsonl", "senseval3.jsonl"]

output_dir = "parquet"
# exist_ok=True keeps the cell re-runnable without a separate existence check
os.makedirs(output_dir, exist_ok=True)

for f in files:
    # Swap only the trailing extension; str.replace would rewrite every
    # ".jsonl" occurrence in the name.
    output_filename = os.path.splitext(f)[0] + ".parquet"
    output_path = os.path.join(output_dir, output_filename)
    process_benchmark_file(f, output_path)

Processed jsonl/ALL.jsonl: 7253 records saved to parquet/ALL.parquet
Processed jsonl/semeval2007.jsonl: 455 records saved to parquet/semeval2007.parquet
Processed jsonl/semeval2013.jsonl: 1644 records saved to parquet/semeval2013.parquet
Processed jsonl/semeval2015.jsonl: 1022 records saved to parquet/semeval2015.parquet
Processed jsonl/senseval2.jsonl: 2282 records saved to parquet/senseval2.parquet
Processed jsonl/senseval3.jsonl: 1850 records saved to parquet/senseval3.parquet


> Conversion check

In [8]:
# Reload the converted ALL file from disk so the check covers what was actually written.
df = pd.read_parquet("parquet/ALL.parquet")

In [9]:
# Preview the first rows of the reloaded frame.
df.head()

Unnamed: 0,id,sentence,target_word,char_start,char_end,gold_synsets
0,senseval2.d000.s000.t000,The art of change-ringing is peculiar to the E...,art,4,7,[art.n.03]
1,senseval2.d000.s000.t001,The art of change-ringing is peculiar to the E...,change-ringing,11,25,[change_ringing.n.01]
2,senseval2.d000.s000.t002,The art of change-ringing is peculiar to the E...,peculiar,29,37,"[peculiar.s.04, particular.s.01]"
3,senseval2.d000.s000.t003,The art of change-ringing is peculiar to the E...,English,45,52,[english.n.02]
4,senseval2.d000.s000.t004,The art of change-ringing is peculiar to the E...,most,66,70,[most.a.01]


In [10]:
# Look up one instance by id to inspect its character offsets and gold synsets.
df[df["id"] == "senseval2.d000.s010.t000"]

Unnamed: 0,id,sentence,target_word,char_start,char_end,gold_synsets
84,senseval2.d000.s010.t000,They belong to a group of 15 ringers -- includ...,belong to,5,14,[belong_to.v.01]


> The dataset contains gold_synsets lists and correctly stores cases with multiple answers, as well as correct indexes for targets containing multiple words.