# Word Sense Disambiguation

## Getting Started with NLTK

> I'll download the SemCor corpus and the other NLTK resources this notebook needs.

In [None]:
import ssl
import nltk

# Work around certificate-verification failures during nltk.download() by making
# unverified HTTPS contexts the default. NOTE: this disables certificate checks
# for the rest of the kernel session, so only keep it if the downloads fail without it.
_create_unverified_https_context = getattr(ssl, "_create_unverified_context", None)
if _create_unverified_https_context is not None:
    # Python builds lacking this attribute are simply left untouched
    ssl._create_default_https_context = _create_unverified_https_context

# Download the SemCor corpus, the WordNet lexicon, and the tokenizer/tagger resources.
# No download_dir is passed, so NLTK chooses the target directory itself.
for _nltk_package in (
    "semcor",
    "wordnet",
    "punkt",
    "punkt_tab",
    "averaged_perceptron_tagger",
    "averaged_perceptron_tagger_eng",
):
    nltk.download(_nltk_package)

# If you pass download_dir="./nltk_data" to nltk.download(), uncomment the next line
# so NLTK also searches that directory when loading the data.
# nltk.data.path.append("./nltk_data")

[nltk_data] Downloading package semcor to ./nltk_data...
[nltk_data]   Package semcor is already up-to-date!
[nltk_data] Downloading package wordnet to ./nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to ./nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to ./nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     ./nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     ./nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


In [None]:
from nltk.corpus import semcor
from nltk.corpus import wordnet as wn
import pandas as pd
import json
import os
from transformers import AutoTokenizer

> Get the first sentence from the SemCor corpus

In [3]:
# Show the first SemCor sentence twice: as its raw token list, then as plain text
sent = semcor.sents()[0]
print(sent, "\n")
print(" ".join(sent))

['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', 'Friday', 'an', 'investigation', 'of', 'Atlanta', "'s", 'recent', 'primary', 'election', 'produced', '``', 'no', 'evidence', "''", 'that', 'any', 'irregularities', 'took', 'place', '.'] 

The Fulton County Grand Jury said Friday an investigation of Atlanta 's recent primary election produced `` no evidence '' that any irregularities took place .


> Get the first semantically tagged sentence from the SemCor corpus

In [4]:
# tagged_sents(tag="sem") yields sentences as lists of chunks; tag="sem" requests
# the semantic (WordNet sense) annotation, so a chunk may carry a sense label.
# In the printed structure, 'NE' marks a Named Entity subtree.
tagged_sent = semcor.tagged_sents(tag="sem")[0]

# Print the full nested structure of the first tagged sentence
print(tagged_sent)

[['The'], Tree(Lemma('group.n.01.group'), [Tree('NE', ['Fulton', 'County', 'Grand', 'Jury'])]), Tree(Lemma('state.v.01.say'), ['said']), Tree(Lemma('friday.n.01.Friday'), ['Friday']), ['an'], Tree(Lemma('probe.n.01.investigation'), ['investigation']), ['of'], Tree(Lemma('atlanta.n.01.Atlanta'), ['Atlanta']), ["'s"], Tree(Lemma('late.s.02.recent'), ['recent']), Tree(Lemma('primary.n.01.primary_election'), ['primary', 'election']), Tree(Lemma('produce.v.04.produce'), ['produced']), ['``'], ['no'], Tree(Lemma('evidence.n.01.evidence'), ['evidence']), ["''"], ['that'], ['any'], Tree(Lemma('abnormality.n.04.irregularity'), ['irregularities']), Tree(Lemma('happen.v.01.take_place'), ['took', 'place']), ['.']]


In [5]:
# Print one chunk per line: untagged chunks print as plain lists (e.g. ['The']),
# sense-tagged chunks print as trees labelled with a Lemma.
for word in tagged_sent:
    print(word)

['The']
(Lemma('group.n.01.group') (NE Fulton County Grand Jury))
(Lemma('state.v.01.say') said)
(Lemma('friday.n.01.Friday') Friday)
['an']
(Lemma('probe.n.01.investigation') investigation)
['of']
(Lemma('atlanta.n.01.Atlanta') Atlanta)
["'s"]
(Lemma('late.s.02.recent') recent)
(Lemma('primary.n.01.primary_election') primary election)
(Lemma('produce.v.04.produce') produced)
['``']
['no']
(Lemma('evidence.n.01.evidence') evidence)
["''"]
['that']
['any']
(Lemma('abnormality.n.04.irregularity') irregularities)
(Lemma('happen.v.01.take_place') took place)
['.']


> **Explanation of structure**:
> 
> - `semcor` is a corpus of English sentences annotated with WordNet senses.
> - Each sentence is represented as a tree-like structure of words and lemmas.  
> - Example elements:  
>     - `['The']` - a plain unannotated word  
>     - `Tree(Lemma('group.n.01.group'), [...])` - one or more words linked to a WordNet sense (`group.n.01`)  
> - So each `tagged_sent` is a list of tokens, where some are annotated with `Lemma` objects that store:  
>     - the lemma (base form),  
>     - the WordNet sense ID (like `group.n.01`),  
>     - and the part of speech.  

> Look up the definition and usage examples of the synset `happen.v.01`

In [6]:
# Retrieve a specific WordNet synset (a set of cognitive synonyms)
# The format is "word.pos.number" - e.g., "group.n.01" means:
#   - word = "group"
#   - pos = "n" (noun)
#   - 01 = the first sense of the noun "group"
# Here "happen.v.01" follows the same pattern with pos = "v".
synset = wn.synset("happen.v.01")

# Print the human-readable definition (gloss) of this sense
print(synset.definition())

# Print example sentences showing how this sense is used (if there are any)
print(synset.examples())

come to pass
['What is happening?', 'The meeting took place off without an incidence', 'Nothing occurred that seemed important']


## Creating Dataset

> I'll write a function that builds a dataset of words and their senses from SemCor. Each row stores the sentence ID, the sentence itself, the target word, the character indices where the target starts and ends in the sentence, the WordNet label, and a generated integer ID for that label. The function also saves the resulting dataset in Parquet format and the dictionary of generated label IDs in JSON format.

In [5]:
def create_semcor_dataset_with_offsets(output_file="semcor_train.parquet", label_map_file="label_map.json"):
    """Build a word-sense dataset from the SemCor corpus and save it to disk.

    Each sentence is rebuilt by joining its chunks with single spaces while the
    character position of every chunk is tracked. Every sense-tagged chunk whose
    label exposes a ``synset()`` method becomes one row of the dataset.

    Args:
        output_file: Path of the Parquet file the dataset is written to.
        label_map_file: Path of the JSON file the label map is written to.

    Returns:
        A ``(df, label2id)`` tuple. ``df`` has the columns ``sentence_id``,
        ``sentence``, ``target_word``, ``char_start``, ``char_end`` (end is
        exclusive), ``label`` (synset name) and ``label_id``. ``label2id`` maps
        each synset name to its integer id, assigned in sorted-name order.
    """
    print("Creating dataset from SemCor corpus...")

    columns = ["sentence_id", "sentence", "target_word", "char_start", "char_end", "label"]
    dataset_rows = []
    seen_synsets = set()
    skipped_targets = 0

    for sent_idx, chunk_sent in enumerate(semcor.tagged_sents(tag='sem')):

        # The sentence text is built incrementally so offsets always refer to it
        full_sentence = ""
        targets_in_sentence = []
        current_char_pos = 0  # cursor: character index where the next chunk starts

        for chunk in chunk_sent:
            # 1. Extract chunk text (a Tree may span several words; leaves() flattens it)
            is_target = isinstance(chunk, nltk.Tree)
            if is_target:
                word_form = " ".join(chunk.leaves())
            else:
                # Untagged chunk is a plain list such as ['The']
                word_form = " ".join(chunk)

            # 2. Separate from the preceding text with one space
            if current_char_pos > 0:
                full_sentence += " "
                current_char_pos += 1

            # 3. Record the span of this chunk and append it
            start_idx = current_char_pos
            end_idx = start_idx + len(word_form)
            full_sentence += word_form
            current_char_pos = end_idx

            if not is_target:
                continue

            # 4. Some tree labels are plain strings rather than Lemma objects and
            #    have no synset() method; those targets are skipped and counted
            #    instead of being hidden behind a bare except.
            get_synset = getattr(chunk.label(), "synset", None)
            if not callable(get_synset):
                skipped_targets += 1
                continue

            synset_name = get_synset().name()
            seen_synsets.add(synset_name)
            targets_in_sentence.append((word_form, start_idx, end_idx, synset_name))

        # Rows are emitted only now because each one stores the complete sentence
        for target_word, char_start, char_end, label in targets_in_sentence:
            dataset_rows.append({
                "sentence_id": sent_idx,
                "sentence": full_sentence,
                "target_word": target_word,
                "char_start": char_start,
                "char_end": char_end,
                "label": label
            })

        if sent_idx % 2000 == 0 and sent_idx > 0:
            print(f"Processed {sent_idx} sentences...")

    print(f"Finished. Total examples: {len(dataset_rows)}")
    print(f"Skipped {skipped_targets} targets without a resolvable WordNet synset.")

    # Label map: deterministic ids from the sorted synset names
    label2id = {label: idx for idx, label in enumerate(sorted(seen_synsets))}

    # Explicit columns keep the frame well-formed even when no rows were produced
    df = pd.DataFrame(dataset_rows, columns=columns)
    df['label_id'] = df['label'].map(label2id)

    print(f"Saving to {output_file} ...")
    df.to_parquet(output_file, index=False)

    print(f"Saving label map to {label_map_file} ...")
    with open(label_map_file, 'w', encoding="utf-8") as f:
        json.dump(label2id, f)

    print("Done!")
    return df, label2id

In [6]:
# Build the dataset with the default paths: writes semcor_train.parquet and label_map.json
df, label_map = create_semcor_dataset_with_offsets()

Creating dataset from SemCor corpus...
Processed 2000 sentences...
Processed 4000 sentences...
Processed 6000 sentences...
Processed 8000 sentences...
Processed 10000 sentences...
Processed 12000 sentences...
Processed 14000 sentences...
Processed 16000 sentences...
Processed 18000 sentences...
Processed 20000 sentences...
Processed 22000 sentences...
Processed 24000 sentences...
Processed 26000 sentences...
Processed 28000 sentences...
Processed 30000 sentences...
Processed 32000 sentences...
Processed 34000 sentences...
Processed 36000 sentences...
Finished. Total examples: 224716
Saving to semcor_train.parquet ...
Saving label map to label_map.json ...
Done!


In [7]:
# Preview the first rows of the generated dataset
df.head()

Unnamed: 0,sentence_id,sentence,target_word,char_start,char_end,label,label_id
0,0,The Fulton County Grand Jury said Friday an in...,Fulton County Grand Jury,4,28,group.n.01,10742
1,0,The Fulton County Grand Jury said Friday an in...,said,29,33,state.v.01,22071
2,0,The Fulton County Grand Jury said Friday an in...,Friday,34,40,friday.n.01,9930
3,0,The Fulton County Grand Jury said Friday an in...,investigation,44,57,probe.n.01,18021
4,0,The Fulton County Grand Jury said Friday an in...,Atlanta,61,68,atlanta.n.01,1529


> I'll write a function to check and clean up the dataset, specifically to make sure the start and end character indices of each target word are correct. I'll check both that the indices match the words and that they line up with the tokens produced for the bert-base-uncased model, since this will affect how well the model can be trained later on.

In [8]:
def _find_token_index(offsets, char_pos):
    """Return the index of the token that starts at or covers ``char_pos``, or -1."""
    for i, (o_start, o_end) in enumerate(offsets):
        # Special tokens ([CLS], [SEP]) carry a (0, 0) offset
        if o_start == 0 and o_end == 0:
            continue
        # Strict match: the token starts exactly where the word starts
        if o_start == char_pos:
            return i
        # Fallback match: the token starts earlier but still covers the word start
        if o_start < char_pos and o_end > char_pos:
            return i
    return -1


def verify_and_clean_dataset(df, output_file="semcor_train.parquet", model_name="bert-base-uncased",
                             max_length=128, tokenizer=None):
    """Check target spans against the text and the tokenizer, dropping bad rows.

    A row is dropped when (1) ``sentence[char_start:char_end]`` differs from
    ``target_word``, (2) no token of the (truncated) encoding starts at or
    covers ``char_start``, or (3) the located token is not a prefix of the
    first word of the target (compared lower-cased, with ``##`` removed).

    Args:
        df: DataFrame with ``sentence``, ``target_word``, ``char_start`` and
            ``char_end`` columns. It is not modified.
        output_file: Parquet path the cleaned frame is written to when rows
            were dropped. Pass ``None`` to skip writing.
        model_name: Name passed to ``AutoTokenizer.from_pretrained`` when no
            ``tokenizer`` is supplied.
        max_length: Truncation length used when encoding each sentence.
        tokenizer: Optional ready-made tokenizer. It must be callable like the
            one loaded here (returning ``offset_mapping`` and ``input_ids``)
            and provide ``convert_ids_to_tokens``.

    Returns:
        The cleaned DataFrame with a fresh 0..n-1 index, or ``df`` itself when
        nothing had to be dropped.
    """
    print(f"--- Starting verification and cleaning for {model_name} ---")
    print(f"Initial dataset size: {len(df)}")

    if tokenizer is None:
        tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Counters for statistics
    success_count = 0
    data_integrity_error_count = 0
    content_mismatch_count = 0
    token_not_found_count = 0

    # Row POSITIONS (not index labels) of bad rows
    positions_to_drop = []

    total_rows = len(df)

    # A sentence appears once per target, so encode each distinct sentence only once
    encoding_cache = {}

    rows = zip(df['sentence'], df['target_word'], df['char_start'], df['char_end'])
    for pos, (sentence, target_word, c_start, c_end) in enumerate(rows):

        # Progress log (placed first so rows rejected early are still reported)
        if pos % 10000 == 0 and pos > 0:
            print(f"Processed {pos}/{total_rows} rows...")

        # 1. Integrity check: the stored span must reproduce the target word
        if sentence[c_start:c_end] != target_word:
            positions_to_drop.append(pos)
            data_integrity_error_count += 1
            continue

        # 2. Tokenization check, with truncation at max_length
        cached = encoding_cache.get(sentence)
        if cached is None:
            encoding = tokenizer(
                sentence,
                truncation=True,
                max_length=max_length,
                return_offsets_mapping=True,
                add_special_tokens=True
            )
            cached = (
                encoding['offset_mapping'],
                tokenizer.convert_ids_to_tokens(encoding['input_ids']),
            )
            encoding_cache[sentence] = cached
        offsets, tokens = cached

        found_token_idx = _find_token_index(offsets, c_start)

        # 3. Result evaluation
        if found_token_idx == -1:
            # Token not found (likely truncated due to max_length)
            positions_to_drop.append(pos)
            token_not_found_count += 1
            continue

        clean_tok = tokens[found_token_idx].replace("##", "").lower()
        # Multi-word expressions are compared on their first word only;
        # an empty target yields "" instead of raising IndexError
        target_parts = target_word.split()
        clean_tgt = target_parts[0].lower() if target_parts else ""

        if clean_tgt.startswith(clean_tok):
            success_count += 1
        else:
            # Token found, but it doesn't look like the target word
            positions_to_drop.append(pos)
            content_mismatch_count += 1

    # --- Summary ---
    print("-" * 30)
    print("Verification finished.")
    print(f"Success: {success_count}")
    print(f"Integrity Errors: {data_integrity_error_count}")
    print(f"Content Mismatch: {content_mismatch_count}")
    print(f"Token Not Found (Truncated): {token_not_found_count}")

    print(f"Total rows to drop: {len(positions_to_drop)}")

    # --- Cleaning ---
    if not positions_to_drop:
        print("No errors found. Dataset is already clean.")
        return df

    print(f"Dropping {len(positions_to_drop)} bad rows...")

    # Select the surviving rows by position so the result is correct whatever
    # index labels the incoming frame carries, then renumber them 0, 1, 2...
    drop_set = set(positions_to_drop)
    keep_positions = [i for i in range(total_rows) if i not in drop_set]
    df_clean = df.iloc[keep_positions].reset_index(drop=True)

    print(f"New dataset size: {len(df_clean)}")

    if output_file is not None:
        print(f"Overwriting {output_file}...")
        df_clean.to_parquet(output_file, index=False)
        print("Dataset successfully updated.")

    return df_clean

In [9]:
# Reload the saved dataset, then verify it. With its default output_file the cleaning
# step overwrites semcor_train.parquet in place whenever it drops rows.
df = pd.read_parquet("semcor_train.parquet")
df_clean = verify_and_clean_dataset(df)

--- Starting verification and cleaning for bert-base-uncased ---
Initial dataset size: 224716
Processed 10000/224716 rows...
Processed 20000/224716 rows...
Processed 30000/224716 rows...
Processed 40000/224716 rows...
Processed 50000/224716 rows...
Processed 60000/224716 rows...
Processed 70000/224716 rows...
Processed 80000/224716 rows...
Processed 90000/224716 rows...
Processed 100000/224716 rows...
Processed 110000/224716 rows...
Processed 120000/224716 rows...
Processed 130000/224716 rows...
Processed 140000/224716 rows...
Processed 150000/224716 rows...
Processed 160000/224716 rows...
Processed 170000/224716 rows...
Processed 180000/224716 rows...
Processed 190000/224716 rows...
Processed 200000/224716 rows...
Processed 210000/224716 rows...
Processed 220000/224716 rows...
------------------------------
Verification finished.
Success: 224515
Integrity Errors: 0
Content Mismatch: 0
Token Not Found (Truncated): 201
Total rows to drop: 201
Dropping 201 bad rows...
New dataset size: 2