## Prompting gpt-3.5-turbo to get N adversarial negative examples for each NE in pileNER

In [1]:
import json
import sys
import os

from dotenv import load_dotenv
from openai import OpenAI
import tiktoken 

# my libraries
sys.path.append("/Users/andrew/ExpertAI/MSEQA_for_NER/src/MSEQA_4_NER")
from data_handlers import data_handler_pileNER

In [2]:
# Read variables from the local .env file, then report whether OPENAI_API_KEY
# is set (prints True/False, never the key itself).
load_dotenv('./.env')
api_key_is_set = os.environ.get('OPENAI_API_KEY') is not None
print(api_key_is_set)

True


In [3]:
client = OpenAI()

In [4]:
# gpt_model = "gpt-3.5-turbo-1106"
gpt_model = "gpt-3.5-turbo-0125" 

In [3]:
dataset_name = "pileNER"

# JSON file whose top-level keys are the NE type names.
definitions_path = f"../data_handlers/questions/{dataset_name}/all_423_NE_definitions.json"
with open(definitions_path, 'r') as file:
    guidelines_per_ne_type = json.load(file)

# Iterating a dict yields its keys, in file order.
ne_types_list = [ne_type for ne_type in guidelines_per_ne_type]

print("NE types:", len(ne_types_list), ne_types_list, sep="\n")

NE types:
423
['person', 'organization', 'location', 'concept', 'product', 'variable', 'date', 'medical condition', 'object', 'technology', 'chemical', 'software', 'event', 'number', 'disease', 'attribute', 'protein', 'group', 'material', 'measurement', 'function', 'nationality', 'country', 'class', 'process', 'title', 'animal', 'component', 'condition', 'substance', 'food', 'city', 'activity', 'type', 'company', 'time', 'method', 'property', 'organism', 'drug', 'medical procedure', 'treatment', 'profession', 'cell type', 'anatomical structure', 'job title', 'data', 'quantity', 'sports team', 'biological process', 'data type', 'programming language', 'occupation', 'file', 'body part', 'medical treatment', 'language', 'chemical compound', 'gene', 'state', 'law', 'action', 'website', 'library', 'facility', 'publication', 'tool', 'field of study', 'compound', 'document', 'organ', 'abbreviation', 'character', 'brand', 'device', 'operating system', 'service', 'technique', 'species', 'sympto

In [6]:
def num_tokens_from_messages(messages, model="gpt-3.5-turbo-1106"):
    """Count the prompt tokens a list of chat messages will consume.

    Args:
        messages: iterable of dicts; every value of every dict is encoded
            with the tokenizer, and a "name" key adds a per-name adjustment.
        model: model identifier used to choose the tokenizer and the
            per-message overhead.

    Returns:
        Total token count, including 3 tokens for the primed assistant reply.

    Raises:
        NotImplementedError: if `model` matches none of the known families.
    """
    try:
        encoding = tiktoken.encoding_for_model(model)
    except KeyError:
        print("Warning: model not found. Using cl100k_base encoding.")
        encoding = tiktoken.get_encoding("cl100k_base")

    pinned_snapshots = (
        "gpt-3.5-turbo-0613",
        "gpt-3.5-turbo-16k-0613",
        "gpt-4-0314",
        "gpt-4-32k-0314",
        "gpt-4-0613",
        "gpt-4-32k-0613",
    )
    if model in pinned_snapshots:
        tokens_per_message, tokens_per_name = 3, 1
    elif model == "gpt-3.5-turbo-0301":
        # every message follows <|start|>{role/name}\n{content}<|end|>\n;
        # if there's a name, the role is omitted (hence the -1)
        tokens_per_message, tokens_per_name = 4, -1
    elif "gpt-3.5-turbo" in model:
        # Any other gpt-3.5-turbo snapshot is counted as gpt-3.5-turbo-0613.
        return num_tokens_from_messages(messages, model="gpt-3.5-turbo-0613")
    elif "gpt-4" in model:
        print("Warning: gpt-4 may update over time. Returning num tokens assuming gpt-4-0613.")
        return num_tokens_from_messages(messages, model="gpt-4-0613")
    else:
        raise NotImplementedError(
            f"""num_tokens_from_messages() is not implemented for model {model}. See https://github.com/openai/openai-python/blob/main/chatml.md for information on how messages are converted to tokens."""
        )

    total_tokens = 0
    for message in messages:
        total_tokens += tokens_per_message
        total_tokens += sum(len(encoding.encode(value)) for value in message.values())
        if "name" in message:
            total_tokens += tokens_per_name
    # every reply is primed with <|start|>assistant<|message|>
    return total_tokens + 3

In [4]:
def fix_guidelines(gpt_definition):
    """Close a truncated JSON-object string so it can be parsed.

    Some stored answers end mid-string. This appends the missing closing
    double quote and/or closing brace. Trailing whitespace is stripped first,
    so an already complete object followed by a newline is returned as-is
    instead of getting a spurious `"}` appended.

    Args:
        gpt_definition: the raw answer text, expected to be a flat JSON
            object whose last value is a string.

    Returns:
        The stripped text, ending with `}`. It is not validated as JSON.
    """
    repaired = gpt_definition.rstrip()
    if repaired.endswith("}"):
        return repaired
    if not repaired.endswith("\""):
        # Cut off inside the last string value: close the string first.
        repaired += "\""
    return repaired + "}"

In [5]:
# Before/after view of fix_guidelines on the stored answer for 'measurement'.
measurement_gpt_answer = guidelines_per_ne_type['measurement']['gpt_answer']
print(measurement_gpt_answer)

gpt_definition_fixed = fix_guidelines(measurement_gpt_answer)
print(gpt_definition_fixed)

{"Definition": "'measurement' refers to numerical or quantifiable entities such as units of length, mass, time, or data metrics used in scientific, engineering, or quantitative contexts.", "Guidelines": "Avoid labeling abstract concepts that are not quantifiable, such as 'happiness' or 'risk'. Exercise caution with ambiguous terms like 'feet' (could refer to the body part or a unit of measurement). Be mindful of context to distinguish between literal and figurative use of measurements, e
{"Definition": "'measurement' refers to numerical or quantifiable entities such as units of length, mass, time, or data metrics used in scientific, engineering, or quantitative contexts.", "Guidelines": "Avoid labeling abstract concepts that are not quantifiable, such as 'happiness' or 'risk'. Exercise caution with ambiguous terms like 'feet' (could refer to the body part or a unit of measurement). Be mindful of context to distinguish between literal and figurative use of measurements, e"}


# PROMPT formulation

In [None]:
# Preview of the user prompt asking for adversarial *negative* sentences,
# built here for the 'date' NE type so the wording can be inspected.
n_sentences_to_generate = 3

ne_as_example = 'date'
# Repair the stored answer in case it was truncated before the closing quote/brace.
ne_guidelines_example = fix_guidelines(guidelines_per_ne_type[ne_as_example]['gpt_answer'])
# Part 1: the NE name and its guidelines, delimited by triple quotes.
example_prompt = f"You will be given a Named Entity and its associated annotation guidelines (delimited by triple quotes). \nCarefully read the Definition and Guidelines before answering to the instruction in the end. \n\"\"\"\nNamed Entity: \'{ne_as_example}\', {ne_guidelines_example}\n\"\"\"" 
# Part 2: the instruction, with the requested number of sentences stated twice.
example_prompt += f"\nInstruction: Leverage on the provided Definition and annotation Guidelines to generate {n_sentences_to_generate} adversarial sentences containing negative occurrences of this Named Entity (negative occurrences are what guidelines say not to label). While doing so, be extremely careful NOT to include any positive occurrences of this Named Entity (what would instead be labeled as this Named Entity). For each sentence provide an explanation. Ensure to output {n_sentences_to_generate} adversarial sentences.\n"

print(example_prompt)

In [None]:
# Hand-written negative examples for the 'date' NE type.
first_negative_example_answer = {
    "adversarial_sentence": "Jane recently completed her thesis on environmental sustainability, drawing from a plethora of cutting-edge research findings to present a comprehensive analysis of current challenges and potential solutions.",
    "explanation": "Non-specific time reference 'recently'",
}
second_negative_example_answer = {
    "adversarial_sentence": "May I kindly request your assistance in resolving this matter as swiftly as possible?\"",
    "explanation": "Ambiguous term 'May' used here as verb and not as month.",
}
third_negative_example_answer = {
    "adversarial_sentence": "In the dimly lit auditorium, the audience couldn't help but be captivated by the eerie presence of Wednesday Addams as she delivered her lines with a chilling calmness during the school play, reminding everyone of her unforgettable character from the Addams Family series.",
    "explanation": "Ambiguous term 'Wednesday' used here as person name and not as day of the week.",
}
# Serialize the list itself. Dumping an f-string of the dicts would produce a
# JSON *string* containing Python reprs (single quotes), not a JSON list.
example_answer = json.dumps([
    first_negative_example_answer,
    second_negative_example_answer,
    third_negative_example_answer,
])
print(example_answer)
    

In [None]:
def get_prompt_for_a_ne(named_entity, guidelines_per_ne_type, gpt_model_name, max_output_tokens, n_sentences_to_generate=3, max_input_tokens=1000):
    """Request adversarial negative sentences for one NE type, with a one-shot example on 'date'.

    Builds a four-message chat (system message, example request for 'date',
    hand-written example answer, real request for `named_entity`), checks the
    prompt size, and sends it through the module-level `client`.

    Note: this notebook defines `get_prompt_for_a_ne` in more than one cell;
    the definition executed last is the one in effect.

    Args:
        named_entity: NE type to generate sentences for; must be a key of
            `guidelines_per_ne_type`.
        guidelines_per_ne_type: mapping NE type -> dict with a 'gpt_answer'
            string holding that type's definition and guidelines. It must
            also contain the 'date' entry used for the example.
        gpt_model_name: model used both for token counting and the request.
        max_output_tokens: `max_tokens` passed to the completion call.
        n_sentences_to_generate: number of sentences requested in the prompt.
        max_input_tokens: prompt-size limit above which nothing is sent.

    Returns:
        dict with keys 'named_entity', 'prompt_length', 'output_length',
        'annotation_guidelines' (parsed JSON), 'negative_sentences' (raw
        model output string) and 'finish_reason'.

    Raises:
        KeyError: if `named_entity` or 'date' is missing from the mapping.
        ValueError: if the prompt exceeds `max_input_tokens`, or if the
            repaired guidelines are not valid JSON (json.JSONDecodeError).
    """
    # system message
    system_message = "You are a helpful NER data annotator designed to output JSON."

    # user prompt example on date
    ne_as_example = 'date'
    ne_guidelines_example = fix_guidelines(guidelines_per_ne_type[ne_as_example]['gpt_answer'])
    example_prompt = f"You will be given a Named Entity and its associated annotation guidelines (delimited by triple quotes). \nCarefully read the Definition and Guidelines before answering to the instruction in the end. \n\"\"\"\nNamed Entity: \'{ne_as_example}\', {ne_guidelines_example}\n\"\"\""
    example_prompt += f"\nInstruction: Leverage on the provided Definition and annotation Guidelines to generate {n_sentences_to_generate} adversarial sentences containing negative occurrences of this Named Entity (negative occurrences are what the guidelines say not to label). While doing so, be extremely careful NOT to include any positive occurrences of this Named Entity (what would instead be labeled as this Named Entity). For each sentence provide an explanation. Ensure to output {n_sentences_to_generate} adversarial sentences.\n"

    # example answer
    first_negative_example_answer = {"adversarial_sentence": "Jane recently completed her thesis on environmental sustainability, drawing from a plethora of cutting-edge research findings to present a comprehensive analysis of current challenges and potential solutions.", "explanation": "Non-specific time reference 'recently'"}
    second_negative_example_answer = {"adversarial_sentence": "May I kindly request your assistance in resolving this matter as swiftly as possible?\"", "explanation": "Ambiguous term 'May' used here as verb and not as month."}
    third_negative_example_answer = {"adversarial_sentence": "In the dimly lit auditorium, the audience couldn't help but be captivated by the eerie presence of Wednesday Addams as she delivered her lines with a chilling calmness during the school play, reminding everyone of her unforgettable character from the Addams Family series.", "explanation": "Ambiguous term 'Wednesday' used here as person name and not as day of the week."}
    # Serialize the list itself: dumping an f-string of the dicts yields a
    # JSON string of Python reprs, which is not a valid JSON example.
    example_answer = json.dumps([first_negative_example_answer, second_negative_example_answer, third_negative_example_answer])

    # real prompt
    ne_guidelines = fix_guidelines(guidelines_per_ne_type[named_entity]['gpt_answer'])
    # Parse now so malformed guidelines fail before an API call is made.
    annotation_guidelines = json.loads(ne_guidelines)
    real_prompt = f"Now do the same for \"\"\" Named Entity: \'{named_entity}\', {ne_guidelines} \"\"\"\n"

    # message to send
    # NOTE(review): the example answer is sent with role "system"; few-shot
    # answers are usually sent as "assistant" - confirm this is intended.
    messages_to_send = [
        {"role": "system", "content": system_message},
        {"role": "user", "content": example_prompt},
        {"role": "system", "content": example_answer},
        {"role": "user", "content": real_prompt},
    ]

    # Count tokens for the model actually being called, not a notebook global.
    n_input_tokens = num_tokens_from_messages(messages_to_send, gpt_model_name)

    for message in messages_to_send:
        print(message)

    if n_input_tokens > max_input_tokens:
        raise ValueError(f"Too many input tokens in messages_to_send: {n_input_tokens} > {max_input_tokens}, {messages_to_send}")

    print(f"Sending prompt for NE: {named_entity}...")

    completion = client.chat.completions.create(
        messages=messages_to_send,
        model=gpt_model_name,
        max_tokens=max_output_tokens,
        response_format={"type": "json_object"},
        temperature=0
    )

    choice = completion.choices[0]

    return {"named_entity": named_entity,
            "prompt_length": completion.usage.prompt_tokens,
            "output_length": completion.usage.completion_tokens,
            "annotation_guidelines": annotation_guidelines,
            "negative_sentences": choice.message.content,
            "finish_reason": choice.finish_reason
        }
    

In [None]:
# NO FAKE SYSTEM MESSAGE BUT DIRECT PROMPT

def get_prompt_for_a_ne(named_entity, guidelines_per_ne_type, gpt_model_name, max_output_tokens, n_sentences_to_generate=3, max_input_tokens=1000):
    """Request adversarial negative sentences for one NE type with a direct (zero-shot) prompt.

    Sends a system message plus one user prompt containing the NE type's
    definition/guidelines and the instruction, through the module-level
    `client`.

    Note: this notebook defines `get_prompt_for_a_ne` in more than one cell;
    the definition executed last is the one in effect.

    Args:
        named_entity: NE type to generate sentences for; must be a key of
            `guidelines_per_ne_type`.
        guidelines_per_ne_type: mapping NE type -> dict with a 'gpt_answer'
            string holding that type's definition and guidelines.
        gpt_model_name: model used both for token counting and the request.
        max_output_tokens: `max_tokens` passed to the completion call.
        n_sentences_to_generate: number of sentences requested in the prompt.
        max_input_tokens: prompt-size limit above which nothing is sent.

    Returns:
        dict with keys 'named_entity', 'prompt_length', 'output_length',
        'annotation_guidelines' (parsed JSON), 'negative_sentences' (raw
        model output string) and 'finish_reason'.

    Raises:
        KeyError: if `named_entity` is missing from the mapping.
        ValueError: if the prompt exceeds `max_input_tokens`, or if the
            repaired guidelines are not valid JSON (json.JSONDecodeError).
    """
    # system message
    system_message = "You are a helpful NER data annotator designed to output JSON."

    # user prompt: guidelines of the requested NE followed by the instruction
    ne_guidelines = fix_guidelines(guidelines_per_ne_type[named_entity]['gpt_answer'])
    # Parse now so malformed guidelines fail before an API call is made.
    annotation_guidelines = json.loads(ne_guidelines)
    example_prompt = f"You will be given a Named Entity and its associated annotation guidelines (delimited by triple quotes). \nCarefully read the Definition and Guidelines before answering to the instruction in the end. \n\"\"\"\nNamed Entity: \'{named_entity}\', {ne_guidelines}\n\"\"\""
    example_prompt += f"\nInstruction: Leverage on the provided Definition and annotation Guidelines to generate {n_sentences_to_generate} adversarial sentences containing negative occurrences of this Named Entity (negative occurrences are what the guidelines say not to label). While doing so, be extremely careful NOT to include any positive occurrences of this Named Entity (what would instead be labeled as this Named Entity). For each sentence provide a short explanation. Ensure to output {n_sentences_to_generate} adversarial sentences. The sentences should not be too short.\n"

    # message to send
    messages_to_send = [
        {"role": "system", "content": system_message},
        {"role": "user", "content": example_prompt},
    ]

    # Count tokens for the model actually being called, not a notebook global.
    n_input_tokens = num_tokens_from_messages(messages_to_send, gpt_model_name)

    for message in messages_to_send:
        print(message)

    if n_input_tokens > max_input_tokens:
        raise ValueError(f"Too many input tokens in messages_to_send: {n_input_tokens} > {max_input_tokens}, {messages_to_send}")

    print(f"Sending prompt for NE: {named_entity}...")

    completion = client.chat.completions.create(
        messages=messages_to_send,
        model=gpt_model_name,
        max_tokens=max_output_tokens,
        response_format={"type": "json_object"},
        temperature=0
    )

    choice = completion.choices[0]

    return {"named_entity": named_entity,
            "prompt_length": completion.usage.prompt_tokens,
            "output_length": completion.usage.completion_tokens,
            "annotation_guidelines": annotation_guidelines,
            "negative_sentences": choice.message.content,
            "finish_reason": choice.finish_reason
        }
    

In [None]:
response = get_prompt_for_a_ne('framework', guidelines_per_ne_type, gpt_model, max_output_tokens=1000, n_sentences_to_generate=3)

In [None]:
print(response)

In [None]:
print(response['negative_sentences'])

In [None]:
print(response['negative_sentences'])

In [None]:
print(response['negative_sentences'])

In [None]:
print(response['negative_sentences'])

In [None]:
print(response['negative_sentences'])

In [None]:
print(response['negative_sentences'])

In [None]:
print(response['negative_sentences'])

### Generate adversarial negative examples for each NE type

In [None]:
# Generate negative examples for the NE types from index 300 onwards and save
# them to disk. A failure on one NE type does not stop the run: the results
# collected so far are checkpointed and the loop moves on.
os.makedirs("./adversarial_examples", exist_ok=True)

negative_generated_examples = []
failed_named_entities = []
for ne in list(guidelines_per_ne_type.keys())[300:]:
    try:
        response = get_prompt_for_a_ne(ne, guidelines_per_ne_type, gpt_model, max_output_tokens=1000, n_sentences_to_generate=3)
        response['negative_sentences'] = json.loads(response['negative_sentences'])
        negative_generated_examples.append(response)
    except Exception as error:
        # `Exception` (not a bare except) so Ctrl-C / kernel interrupt still
        # stops the loop; the cause is reported instead of being hidden.
        failed_named_entities.append(ne)
        with open(f"./adversarial_examples/{dataset_name}_err.json", 'w') as f:
            json.dump(negative_generated_examples, f, indent=2)
        print(f"Something went wrong while processing NE: {ne} ({type(error).__name__}: {error})")

with open(f"./adversarial_examples/{dataset_name}_300_423.json", 'w') as f:
    json.dump(negative_generated_examples, f, indent=2)

if failed_named_entities:
    print(f"NE types to retry: {failed_named_entities}")

### END

In [None]:
with open('./adversarial_examples/ALL_pileNER_adv_examples_raw.json', 'r') as file:
    adversarial_examples_per_NE = json.load(file)

In [None]:
print(adversarial_examples_per_NE)
print(len(adversarial_examples_per_NE))

In [None]:
# NE types that have a definition but no entry among the loaded examples.
covered_ne = {sample['named_entity'] for sample in adversarial_examples_per_NE}
missing_ne = set(ne_types_list).difference(covered_ne)
print(missing_ne)

In [None]:
# Re-key the raw results by NE type. When the stored payload is a dict, its
# first value is taken as the list of sentences if that value is a list.
cleaned_adversarial_examples = {}
for sample in adversarial_examples_per_NE:
    named_entity = sample['named_entity']
    annotation_guidelines = sample['annotation_guidelines']
    negative_sentences = sample['negative_sentences']
    if isinstance(negative_sentences, dict):
        # next(..., None) instead of list(...)[0]: an empty dict must not
        # abort the whole clean-up with an IndexError.
        first_value = next(iter(negative_sentences.values()), None)
        if isinstance(first_value, list):
            negative_sentences = first_value
        else:
            # Payload kept as-is, but flagged so it can be fixed by hand.
            print(f"{named_entity}: could not unwrap a list of sentences, keeping raw payload")
    cleaned_adversarial_examples[named_entity] = {
        'named_entity': named_entity,
        'annotation_guidelines': annotation_guidelines,
        'negative_sentences': negative_sentences
    }


In [None]:
print(cleaned_adversarial_examples)

In [None]:
print(len(cleaned_adversarial_examples))

In [None]:
with open(f"./adversarial_examples/ALL_pileNER_adv_examples.json", 'w') as f:
    json.dump(cleaned_adversarial_examples, f, indent=2)

### Generating positive adversarial examples as well

In [14]:
# Preview of the user prompt asking for *positive* sentences, built here for
# the 'location' NE type so the wording can be inspected.
named_entity = 'location'
# Only referenced by the commented-out instruction below; the active
# instruction asks for "a few" sentences instead.
n_sentences_to_generate = 3
# Repair the stored answer in case it was truncated before the closing quote/brace.
ne_guidelines = fix_guidelines(guidelines_per_ne_type[named_entity]['gpt_answer'])
# Part 1: the NE name and its guidelines, delimited by triple quotes.
example_prompt = f"You will be given a Named Entity and its associated annotation guidelines (delimited by triple quotes). \nCarefully read the Definition and Guidelines before answering to the instruction in the end. \n\"\"\"\nNamed Entity: \'{named_entity}\', {ne_guidelines}\n\"\"\"" 
#example_prompt += f"\nInstruction: Leverage on the provided annotation Guidelines to generate {n_sentences_to_generate} sentences containing positive occurrences of this Named Entity (what should be labeled as this Named Entity). For Guidelines presenting ambiguous terms is very important that you generate a sentence for the term that should be labelled as this Named Entity (e.g. 'Apple' the company, and not the fruit, if the Named Entity is 'organization')."
# Part 2: instruction focused on the ambiguous terms named in the guidelines.
example_prompt += f"\nInstruction: For guidelines mentioning ambiguous terms, generate a few exemplary sentences in which the term usage fits the definition of the Named Entity. For example, if the Named Entity is 'organization' and the word is 'Apple', use 'Apple' in a sentence as a company name, not as the name of a fruit. Make sure that you create sentences in which the use of the term makes it belong to the Named Entity in question."
# Part 3: requested output (explanation plus list of positive occurrences per sentence).
example_prompt += f" \nFor each sentence, provide a very short explanation and a JSON list of the positive occurrences of the considered Named Entity in each sentence. Ensure to output a list of sentences. The sentences should not be too short.\n"

print(example_prompt)

You will be given a Named Entity and its associated annotation guidelines (delimited by triple quotes). 
Carefully read the Definition and Guidelines before answering to the instruction in the end. 
"""
Named Entity: 'location', {"Definition": "'location' denotes geographic entities, such as cities, countries, and landmarks, that represent specific places on Earth.", "Guidelines": "Do not label abstract references. Be cautious with ambiguous terms like 'Paris Hilton' (person with a location name) or 'Amazon' (both a company and a river)."}
"""
Instruction: For guidelines mentioning ambiguous terms, generate a few exemplary sentences in which the term usage fits the definition of the Named Entity. For example, if the Named Entity is 'organization' and the word is 'Apple', use 'Apple' in a sentence as a company name, not as the name of a fruit. Make sure that you create sentences in which the use of the term makes it belong to the Named Entity in question. 
For each sentence, provide a v

In [19]:
# NO FAKE SYSTEM MESSAGE BUT DIRECT PROMPT

def get_prompt_for_a_ne(named_entity, guidelines_per_ne_type, gpt_model_name, max_output_tokens, n_sentences_to_generate=3, max_input_tokens=1000):
    """Request positive example sentences for one NE type with a direct (zero-shot) prompt.

    Sends a system message plus one user prompt containing the NE type's
    definition/guidelines and an instruction focused on the ambiguous terms
    named in the guidelines, through the module-level `client`. Sampling
    uses temperature=1, so repeated calls can return different sentences.

    Note: this notebook defines `get_prompt_for_a_ne` in more than one cell;
    the definition executed last is the one in effect. This variant returns
    the model output under 'positive_sentences', not 'negative_sentences'.

    Args:
        named_entity: NE type to generate sentences for; must be a key of
            `guidelines_per_ne_type`.
        guidelines_per_ne_type: mapping NE type -> dict with a 'gpt_answer'
            string holding that type's definition and guidelines.
        gpt_model_name: model used both for token counting and the request.
        max_output_tokens: `max_tokens` passed to the completion call.
        n_sentences_to_generate: accepted for signature compatibility; the
            current prompt asks for "a few" sentences and does not use it.
        max_input_tokens: prompt-size limit above which nothing is sent.

    Returns:
        dict with keys 'named_entity', 'prompt_length', 'output_length',
        'annotation_guidelines' (parsed JSON), 'positive_sentences' (raw
        model output string) and 'finish_reason'.

    Raises:
        KeyError: if `named_entity` is missing from the mapping.
        ValueError: if the prompt exceeds `max_input_tokens`, or if the
            repaired guidelines are not valid JSON (json.JSONDecodeError).
    """
    # system message
    system_message = "You are a helpful NER data annotator designed to output JSON."

    # user prompt: guidelines of the requested NE followed by the instruction
    ne_guidelines = fix_guidelines(guidelines_per_ne_type[named_entity]['gpt_answer'])
    # Parse now so malformed guidelines fail before an API call is made.
    annotation_guidelines = json.loads(ne_guidelines)
    example_prompt = f"You will be given a Named Entity and its associated annotation guidelines (delimited by triple quotes). \nCarefully read the Definition and Guidelines before answering to the instruction in the end. \n\"\"\"\nNamed Entity: \'{named_entity}\', {ne_guidelines}\n\"\"\""
    # Earlier instruction drafts asked for exactly n_sentences_to_generate
    # positive sentences; this wording targets the ambiguous terms instead.
    example_prompt += f"\nInstruction: For guidelines mentioning ambiguous terms, generate a few exemplary sentences in which the term usage fits the definition of the Named Entity. For example, if the Named Entity is 'organization' and the word is 'Apple', use 'Apple' in a sentence as a company name, not as the name of a fruit. Make sure that you create sentences in which the use of the term makes it belong to the Named Entity in question."
    example_prompt += f" \nFor each sentence, provide a very short explanation and a JSON list of the positive occurrences of the considered Named Entity in each sentence. Ensure to output a list of sentences. The sentences should not be too short.\n"

    # message to send
    messages_to_send = [
        {"role": "system", "content": system_message},
        {"role": "user", "content": example_prompt},
    ]

    # Count tokens for the model actually being called, not a notebook global.
    n_input_tokens = num_tokens_from_messages(messages_to_send, gpt_model_name)

    for message in messages_to_send:
        print(message)

    if n_input_tokens > max_input_tokens:
        raise ValueError(f"Too many input tokens in messages_to_send: {n_input_tokens} > {max_input_tokens}, {messages_to_send}")

    print(f"Sending prompt for NE: {named_entity}...")

    completion = client.chat.completions.create(
        messages=messages_to_send,
        model=gpt_model_name,
        max_tokens=max_output_tokens,
        response_format={"type": "json_object"},
        temperature=1
    )

    choice = completion.choices[0]

    return {"named_entity": named_entity,
            "prompt_length": completion.usage.prompt_tokens,
            "output_length": completion.usage.completion_tokens,
            "annotation_guidelines": annotation_guidelines,
            "positive_sentences": choice.message.content,
            "finish_reason": choice.finish_reason
        }
    

In [40]:
response = get_prompt_for_a_ne('file type', guidelines_per_ne_type, gpt_model, max_output_tokens=500, n_sentences_to_generate=3)
print(response)

{'role': 'system', 'content': 'You are a helpful NER data annotator designed to output JSON.'}
{'role': 'user', 'content': 'You will be given a Named Entity and its associated annotation guidelines (delimited by triple quotes). \nCarefully read the Definition and Guidelines before answering to the instruction in the end. \n"""\nNamed Entity: \'file type\', {"Definition": "\'file type\' refers to the specific format or extension of a digital file, such as PDF, XML, or TXT, which indicates the type of data contained in the file.", "Guidelines": "Do not label general terms like \'file\' or \'document\'. Avoid labeling file types within abstract concepts, such as \'shared workbooks\', which refer to functionalities rather than file formats. Be careful with acronyms and ambiguous terms like \'PDF\' (could also mean \'Portable"}\n"""\nInstruction: For guidelines mentioning ambiguous terms, generate a few exemplary sentences in which the term usage fits the definition of the Named Entity. For

In [41]:
print(response['positive_sentences'])

{
  "sentences": [
    {
      "sentence": "I need to convert this text document into a PDF file type for easy sharing.",
      "explanation": "In this sentence, 'PDF file type' refers to the specific format or extension of a digital file, meeting the definition of the Named Entity 'file type'."
    },
    {
      "sentence": "The system only accepts images in JPG file type for uploads.",
      "explanation": "'JPG file type' here denotes the specific format of digital images, fitting the definition of 'file type'."
    }
  ]
}


In [31]:
response = get_prompt_for_a_ne('state', guidelines_per_ne_type, gpt_model, max_output_tokens=500, n_sentences_to_generate=3)
print(response)

{'role': 'system', 'content': 'You are a helpful NER data annotator designed to output JSON.'}
{'role': 'user', 'content': 'You will be given a Named Entity and its associated annotation guidelines (delimited by triple quotes). \nCarefully read the Definition and Guidelines before answering to the instruction in the end. \n"""\nNamed Entity: \'state\', {"Definition": "\'state\' refers to a defined territory within a country or nation, often with its own government and administrative boundaries.", "Guidelines": "Do not label non-political divisions like \'emotional state\' or \'solid state\'. Exercise caution with ambiguous terms such as \'Georgia\' (could refer to the state or the country) and use contextual clues to disambiguate."}\n"""\nInstruction: For guidelines mentioning ambiguous terms, generate a few exemplary sentences in which the term usage fits the definition of the Named Entity. For example, if the Named Entity is \'organization\' and the word is \'Apple\', use \'Apple\' i

In [32]:
print(response['positive_sentences'])

{
    "sentences": [
        {
            "sentence": "Washington is the capital city of a state that is known for its apples.",
            "explanation": "In this sentence, 'state' refers to a defined territory within a country with its own government and administrative boundaries, not an 'emotional state' or 'solid state'. The mention of apples indicates the use of 'state' in a political context.",
            "positive_occurrences": ["state"]
        },
        {
            "sentence": "I visited Texas last summer, which is the second most populous state in the United States.",
            "explanation": "Here, 'state' refers to a defined territory within a country with its own government and administrative boundaries, not an 'emotional state' or 'solid state'. The context of being a part of the United States disambiguates any other possible meanings of 'state'.",
            "positive_occurrences": ["state"]
        },
        {
            "sentence": "My friend plans to move t

In [14]:
print(response['negative_sentences'])

{
  "sentences": [
    {
      "sentence": "Apple announced a new product launch event next week.",
      "explanation": "In this sentence, 'Apple' refers to the company known for its technology products, so it should be labeled as an organization.",
      "positive_occurrences": ["Apple"]
    },
    {
      "sentence": "The Red Cross provided aid to the disaster-stricken area.",
      "explanation": "Here, 'Red Cross' specifically denotes the humanitarian organization, making it a clear instance of an organization.",
      "positive_occurrences": ["Red Cross"]
    },
    {
      "sentence": "Google is known for its innovative approach to technology.",
      "explanation": "'Google' is a well-known company in the tech industry, making it a suitable example of an organization.",
      "positive_occurrences": ["Google"]
    }
  ]
}


In [23]:
# Generate positive examples for the NE types from index 300 onwards and save
# them to disk. A failure on one NE type does not stop the run: the results
# collected so far are checkpointed and the loop moves on.
# NOTE(review): the checkpoint path is the same `_err.json` file the negative
# run writes to, so each run overwrites the other's checkpoint.
os.makedirs("./adversarial_examples", exist_ok=True)

positive_generated_examples = []
failed_named_entities = []
for ne in list(guidelines_per_ne_type.keys())[300:]:
    try:
        response = get_prompt_for_a_ne(ne, guidelines_per_ne_type, gpt_model, max_output_tokens=500, n_sentences_to_generate=3)
        response['positive_sentences'] = json.loads(response['positive_sentences'])
        positive_generated_examples.append(response)
    except Exception as error:
        # `Exception` (not a bare except) so Ctrl-C / kernel interrupt still
        # stops the loop; the cause is reported instead of being hidden.
        failed_named_entities.append(ne)
        with open(f"./adversarial_examples/{dataset_name}_err.json", 'w') as f:
            json.dump(positive_generated_examples, f, indent=2)
        print(f"Something went wrong while processing NE: {ne} ({type(error).__name__}: {error})")

with open(f"./adversarial_examples/{dataset_name}_POS_300_423.json", 'w') as f:
    json.dump(positive_generated_examples, f, indent=2)

if failed_named_entities:
    print(f"NE types to retry: {failed_named_entities}")

{'role': 'system', 'content': 'You are a helpful NER data annotator designed to output JSON.'}
{'role': 'user', 'content': 'You will be given a Named Entity and its associated annotation guidelines (delimited by triple quotes). \nCarefully read the Definition and Guidelines before answering to the instruction in the end. \n"""\nNamed Entity: \'role\', {"Definition": "\'role\' in the context of NER refers to a specific function, position, or responsibility within a system, organization, or context.", "Guidelines": "Do not label generic terms like \'worker\' or \'member\' unless they are part of a specific role (e.g., \'project manager\'). Be cautious with ambiguous terms that have multiple meanings in different contexts (e.g., \'driver\' as a job or a hardware component)."}\n"""\nInstruction: For guidelines mentioning ambiguous terms, generate a few exemplary sentences in which the term usage fits the definition of the Named Entity. For example, if the Named Entity is \'organization\' a

In [21]:
with open('./adversarial_examples/ALL_pileNER_positive_adversarial_examples_raw.json', 'r') as file:
    positive_adversarial_examples_per_NE = json.load(file)

In [22]:
print(positive_adversarial_examples_per_NE)
print(len(positive_adversarial_examples_per_NE))

423


In [23]:
# NE types that have a definition but no entry among the loaded positive examples.
ne_with_positive_examples = {sample['named_entity'] for sample in positive_adversarial_examples_per_NE}
missing_ne = set(ne_types_list).difference(ne_with_positive_examples)
print(missing_ne)

set()


In [31]:
# Normalize the raw positive examples into
#   {ne: {'named_entity', 'annotation_guidelines', 'positive_adv_sentences'}}
# where each sentence entry is {"sentence": str, "positive occurrences": list}.
# The model used many different field names, so fields are read by position:
# first value = sentence, last value = list of occurrences.
cleaned_adversarial_examples = {}
canonical_keys = ['sentence', 'explanation', 'occurrences']  # kept for reference; not used below
used_keys = {}  # every field name seen in the responses (dict used as an ordered set)
for sample in positive_adversarial_examples_per_NE:
    named_entity = sample['named_entity']
    annotation_guidelines = sample['annotation_guidelines']
    positive_adv_sentences = sample['positive_sentences']

    if isinstance(positive_adv_sentences, dict):
        wrapped_values = list(positive_adv_sentences.values())
        if not wrapped_values:
            # An empty object has nothing to unwrap; do not fall through to
            # a value left over from the previous NE type.
            print(f"{named_entity}: empty response object, no sentences kept")
            positive_adv_sentences = []
        elif isinstance(wrapped_values[0], list):
            positive_adv_sentences = wrapped_values[0]
        else:
            raise ValueError(f"{named_entity}: first value of the response object is not a list")
    if not isinstance(positive_adv_sentences, list):
        print(f"{named_entity}: response is not a list of examples, no sentences kept")
        positive_adv_sentences = []

    cleaned_positive_adv_sentences = []
    if len(positive_adv_sentences) < 2:
        print(f"{named_entity}: fewer than 2 sentences")
    for example in positive_adv_sentences:
        if not isinstance(example, dict) or not example:
            print(f"{named_entity}: skipping malformed example {example!r}")
            continue
        if len(example) != 3:
            print(f"{named_entity}: expected 3 fields, got {list(example)}")
        for key in example:
            used_keys[key] = 1

        example_values = list(example.values())
        sentence = example_values[0]
        positive_occurrences = example_values[-1]
        if not isinstance(sentence, str):
            print(f"{named_entity}: sentence is not a string")
        if not isinstance(positive_occurrences, list):
            # Typically an example with no occurrences field, where the last
            # value is the explanation text; never store that as occurrences.
            print(f"{named_entity}: no list of occurrences found, storing an empty list")
            positive_occurrences = []
        elif any(not isinstance(occurrence, str) for occurrence in positive_occurrences):
            print(f"{named_entity}: non-string occurrence in {positive_occurrences!r}")

        cleaned_positive_adv_sentences.append({
            "sentence": sentence,
            "positive occurrences": positive_occurrences
        })

    cleaned_adversarial_examples[named_entity] = {
        'named_entity': named_entity,
        'annotation_guidelines': annotation_guidelines,
        'positive_adv_sentences': cleaned_positive_adv_sentences
    }

print(used_keys)


{'sentence': 1, 'explanation': 1, 'positive_occurrences': 1, 'location_entities': 1, 'occurrences': 1, 'date_mentions': 1, 'nationality_mentions': 1, 'company_mentions': 1, 'method_occurrences': 1, 'organisms_occurrences': 1, 'positives': 1, 'quantity_positive_occurrences': 1, 'Sentence': 1, 'Explanation': 1, 'PositiveOccurrences': 1, 'occupations': 1, 'named_entity_occurrences': 1, 'action_occurrences': 1, 'character_occurrences': 1, 'category_occurrences': 1, 'Named_Entity_occurrences': 1, 'ethnicity_occurrences': 1, 'named_entities': 1, 'tag_occurrences': 1, 'ordinal_occurrences': 1, 'demonyms': 1, 'positive_mentions': 1, 'entity_occurrences': 1, 'keyword_occurrences': 1, 'direction': 1, 'code_occurrences': 1, 'html element_occurrences': 1, 'entities': 1, 'reference_occurrences': 1, 'population_occurrences': 1, 'source_occurrences': 1, 'entity_mentions': 1, 'instances': 1, 'role_occurrences': 1, 'commodity_occurrences': 1, 'pronouns': 1, 'norp_occurrences': 1, 'style_occurrences': 1

In [26]:
with open(f"./adversarial_examples/ALL_pileNER_positive_adversarial_examples.json", 'w') as f:
    json.dump(cleaned_adversarial_examples, f, indent=2)