 ## Homonym Generation using GPT

In [None]:
# [REDACTED] An OpenAI API key was pasted here in plain text. Revoke/rotate that key immediately and load credentials only from Colab secrets (userdata) or an environment variable.

In [None]:
# @title WSD Data Generator (Adversarial Cross-Combination Logic)
# @title 1. Setup and Imports
!pip install openai pydantic --quiet

import json
import os
from typing import List
from pydantic import BaseModel, Field
from google.colab import userdata, files
from openai import OpenAI

# --- AUTHENTICATION ---
# Copy the key from the Colab secret store into the process environment before
# the OpenAI client is constructed (the client is created without an explicit key).
try:
    os.environ["OPENAI_API_KEY"] = userdata.get('OPENAI_API_KEY')
except Exception as exc:
    # Deliberately best-effort: a missing/inaccessible secret must not abort the
    # cell, because the key may already be present in the environment.
    # `except Exception` (rather than a bare `except:`) lets KeyboardInterrupt
    # and SystemExit propagate.
    print("WARNING: 'OPENAI_API_KEY' not found in Colab secrets.")
    # Surface the real cause: the lookup can also fail for reasons other than
    # "not found" (e.g. access not granted, or a None value being assigned).
    print(f"         Reason: {type(exc).__name__}: {exc}")

client = OpenAI()
print("OpenAI Client Initialized.")

# --- 2. GENERATION SCHEMA (Internal) ---
# This schema forces GPT to generate the full "Cluster" of data points in one call:
# one shared precontext + pivot sentence, and for each of the two meanings a
# definition, exactly two disambiguating endings and one example sentence.
# NOTE: documented with `#` comments instead of a class docstring on purpose:
# Pydantic copies a model docstring into the JSON schema `description`, which
# would change the schema sent to the model as `response_format`.
class ScenarioCluster(BaseModel):
    homonym: str
    precontext: str = Field(..., description="The setup text (2-3 sentences) that is valid for BOTH meanings.")
    sentence: str = Field(..., description="The ambiguous pivot sentence containing the homonym.")

    # Meaning A
    judged_meaning_a: str = Field(..., description="The definition of the first meaning.")
    # `min_length`/`max_length` are the Pydantic v2 names for the list-size
    # constraints; the v1 names `min_items`/`max_items` emit PydanticDeprecatedSince20.
    endings_a: List[str] = Field(..., description="Generate exactly 2 different endings that resolve to Meaning A.", min_length=2, max_length=2)
    example_a: str = Field(..., description="A simple, unambiguous example sentence for Meaning A.")

    # Meaning B
    judged_meaning_b: str = Field(..., description="The definition of the second meaning.")
    endings_b: List[str] = Field(..., description="Generate exactly 2 different endings that resolve to Meaning B.", min_length=2, max_length=2)
    example_b: str = Field(..., description="A simple, unambiguous example sentence for Meaning B.")

# Top-level wrapper model: it is what generate_wsd_dataset passes as
# `response_format`, and only the first element of `clusters` is consumed there.
class GenerationResponse(BaseModel):
    clusters: List[ScenarioCluster]  # generated scenario clusters for the requested homonym

# --- 3. OUTPUT SCHEMA (Final List Item) ---
# This is the strict format for each entry in the final JSON list.
# Every field is a required string; instances are built from a ScenarioCluster
# and immediately converted to plain dicts with `model_dump()`.
class WSDItem(BaseModel):
    homonym: str  # the ambiguous word, copied from the cluster
    judged_meaning: str  # the definition this entry is paired with (meaning A or B)
    precontext: str  # shared setup text, copied from the cluster
    sentence: str  # shared ambiguous pivot sentence, copied from the cluster
    ending: str  # disambiguating ending; the empty string for the "blank" entries
    example_sentence: str  # example sentence belonging to `judged_meaning`

# --- 4. PIPELINE ---

def _build_wsd_prompt(word: str, meaning_a: str, meaning_b: str) -> str:
    """Return the user prompt asking for one scenario cluster for *word*.

    The prompt shows a worked two-entry example and then lists the two target
    meanings plus the generation requirements.
    """
    # NOTE(review): both example entries below carry the same 'ending' although
    # the lead-in says the ending changes - confirm whether that is intended.
    return f"""
See this example of how ambiguous sentences work (note how 'precontext' and 'sentence' stay the same, but the 'judged_meaning' and 'ending' change):

{{
    "1": {{
        "homonym": "bugs",
        "judged_meaning": "general term for any insect",
        "precontext": "Anna was having a tough week. Her room was a mess, and her computer kept crashing. Frustrated by everything going wrong, she called Jen.",
        "sentence": "She asked her friend to help her get rid of the bugs.",
        "ending": "They were crawling on the keyboard. Maybe that was the reason it didn't work.",
        "example_sentence": "The garden was full of bugs."
    }},
    "2": {{
        "homonym": "bugs",
        "judged_meaning": "a fault or defect in a computer program",
        "precontext": "Anna was having a tough week. Her room was a mess, and her computer kept crashing. Frustrated by everything going wrong, she called Jen.",
        "sentence": "She asked her friend to help her get rid of the bugs.",
        "ending": "They were crawling on the keyboard. Maybe that was the reason it didn't work.",
        "example_sentence": "There's a bug in the software."
    }}
}}

In a similar way, Generate a WSD Scenario Cluster for the homonym "{word}".

Target Meanings:
1. {meaning_a}
2. {meaning_b}

Requirements:
- Write ONE 'precontext' and ONE 'sentence' that should be ambiguously applicable and meaningful for BOTH meanings.
- The 'sentence' must be the ambiguous pivot.
- Provide 2 distinct 'endings' that resolve to Meaning 1.
- Provide 2 distinct 'endings' that resolve to Meaning 2.
- Provide a simple 'example_sentence' for each meaning.
"""


def _expand_cluster(cluster) -> List[dict]:
    """Expand one generated cluster into its 8 output entries (as plain dicts).

    Order of the returned entries:
      1. meaning A + primary ending A   (consistent)
      2. meaning A + primary ending B   (inconsistent / adversarial)
      3. meaning B + primary ending B   (consistent)
      4. meaning B + primary ending A   (inconsistent / adversarial)
      5. meaning A + blank ending
      6. meaning B + blank ending
      7. meaning A + secondary ending A (consistent)
      8. meaning B + secondary ending B (consistent)

    Raises IndexError if either endings list holds fewer than two items.
    """
    meaning_a, meaning_b = cluster.judged_meaning_a, cluster.judged_meaning_b
    example_a, example_b = cluster.example_a, cluster.example_b
    # Read every ending up front so that a short list fails before any entry
    # is produced (no half-expanded cluster ends up in the dataset).
    ending_a, ending_a_alt = cluster.endings_a[0], cluster.endings_a[1]
    ending_b, ending_b_alt = cluster.endings_b[0], cluster.endings_b[1]

    combinations = [
        (meaning_a, ending_a, example_a),
        (meaning_a, ending_b, example_a),
        (meaning_b, ending_b, example_b),
        (meaning_b, ending_a, example_b),
        (meaning_a, "", example_a),
        (meaning_b, "", example_b),
        (meaning_a, ending_a_alt, example_a),
        (meaning_b, ending_b_alt, example_b),
    ]
    return [
        WSDItem(
            homonym=cluster.homonym,
            judged_meaning=meaning,
            precontext=cluster.precontext,
            sentence=cluster.sentence,
            ending=ending,
            example_sentence=example,
        ).model_dump()
        for meaning, ending, example in combinations
    ]


def generate_wsd_dataset(homonyms_list: List[dict]):
    """Generate the adversarial WSD dataset for a list of homonym specs.

    Args:
        homonyms_list: dicts with the keys 'word', 'meaning_a' and 'meaning_b'.

    Returns:
        ``{"entries": [...]}`` where the list holds 8 entry dicts for every
        homonym that was generated successfully, in input order.

    The run is best-effort per homonym: a malformed spec or a failed
    generation is reported on stdout and skipped, never raised, so entries
    already collected for other words are not lost.
    """
    final_entries = []

    for item in homonyms_list:
        try:
            word = item['word']
            meaning_a = item['meaning_a']
            meaning_b = item['meaning_b']
        except (KeyError, TypeError) as e:
            # Same policy as for API failures below: report and move on.
            print(f"Skipping malformed homonym spec {item!r}: {type(e).__name__}: {e}")
            continue

        print(f"\n--- Generating cluster for: {word} ---")
        prompt = _build_wsd_prompt(word, meaning_a, meaning_b)

        try:
            completion = client.beta.chat.completions.parse(
                model="gpt-4o-2024-08-06",
                messages=[
                    {"role": "system", "content": "You are an expert NLP dataset generator. Generate a JSON object strictly following the ScenarioCluster schema."},
                    {"role": "user", "content": prompt},
                ],
                response_format=GenerationResponse,
                temperature=0.7
            )

            parsed = completion.choices[0].message.parsed
            # `parsed` can be missing and `clusters` can be empty; fail with a
            # readable message instead of an opaque AttributeError/IndexError.
            if parsed is None or not parsed.clusters:
                raise ValueError("the model returned no parsed scenario cluster")

            # Build all entries first, then add them in one step, so a failure
            # during expansion leaves `final_entries` untouched for this word.
            final_entries.extend(_expand_cluster(parsed.clusters[0]))

            print(f"   Successfully generated 8 entries (6 required + 2 secondary consistent) for '{word}'.")

        except Exception as e:
            # Top-level boundary for one homonym: keep going with the next word.
            print(f"Error generating {word}: {e}")

    return {"entries": final_entries}

# --- 5. EXECUTION ---

# Homonyms to generate, one dict per word with its two target senses.
# Written as compact (word, sense A, sense B) rows and expanded into the
# {"word", "meaning_a", "meaning_b"} dicts that the generator reads.
target_homonyms_list = [
    {"word": word, "meaning_a": sense_a, "meaning_b": sense_b}
    for word, sense_a, sense_b in (
        (
            "balance",
            "The difference between assets and liabilities (Finance)",
            "A state of equilibrium; stability (Physical/General)",
        ),
        (
            "patch",
            "A temporary code fix or software update (Technology)",
            "A small piece of material used to mend a hole (Physical/Layperson)",
        ),
        (
            "protocol",
            "A set of rules governing data transmission (Networking)",
            "A formal procedure or set of etiquette rules (Diplomacy/General)",
        ),
        (
            "grave",
            "An excavation site for burying a body (Literal)",
            "Serious, somber, or important (Metaphorical)",
        ),
        (
            "nail",
            "A small metal spike driven into wood (Literal)",
            "To execute a task perfectly (Idiomatic)",
        ),
        (
            "produce",
            "Fresh fruits and vegetables (Noun)",
            "To create or manufacture something (Verb)",
        ),
        (
            "address",
            "A location or coordinates (Noun)",
            "To speak to or deal with a problem (Verb)",
        ),
    )
]

# Run the generation pipeline over every target homonym.
dataset = generate_wsd_dataset(target_homonyms_list)

# --- 6. SAVE AND DOWNLOAD ---

# Serialise the dataset to a JSON file, report the total, then hand the file
# to the Colab download helper.
output_filename = 'wsd_generated_gpt_full_adversarial.json'
with open(output_filename, 'w') as out_file:
    out_file.write(json.dumps(dataset, indent=2))

print(f"\n✅ Final Count: {len(dataset['entries'])} entries generated (8 entries per word).")
files.download(output_filename)

OpenAI Client Initialized.

--- Generating cluster for: balance ---


/tmp/ipython-input-2746003631.py:313: PydanticDeprecatedSince20: `min_items` is deprecated and will be removed, use `min_length` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.12/migration/
  endings_a: List[str] = Field(..., description="Generate exactly 2 different endings that resolve to Meaning A.", min_items=2, max_items=2)
/tmp/ipython-input-2746003631.py:313: PydanticDeprecatedSince20: `max_items` is deprecated and will be removed, use `max_length` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.12/migration/
  endings_a: List[str] = Field(..., description="Generate exactly 2 different endings that resolve to Meaning A.", min_items=2, max_items=2)
/tmp/ipython-input-2746003631.py:318: PydanticDeprecatedSince20: `min_items` is deprecated and will be removed, use `min_length` instead. Deprecated in Pydantic V2.0 to be removed in

   Successfully generated 8 entries (6 required + 2 secondary consistent) for 'balance'.

--- Generating cluster for: patch ---
   Successfully generated 8 entries (6 required + 2 secondary consistent) for 'patch'.

--- Generating cluster for: protocol ---
   Successfully generated 8 entries (6 required + 2 secondary consistent) for 'protocol'.

--- Generating cluster for: grave ---
   Successfully generated 8 entries (6 required + 2 secondary consistent) for 'grave'.

--- Generating cluster for: nail ---
   Successfully generated 8 entries (6 required + 2 secondary consistent) for 'nail'.

--- Generating cluster for: produce ---
   Successfully generated 8 entries (6 required + 2 secondary consistent) for 'produce'.

--- Generating cluster for: address ---
   Successfully generated 8 entries (6 required + 2 secondary consistent) for 'address'.

✅ Final Count: 56 entries generated (8 entries per word).


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>