## Prompting gpt-3.5-turbo to get a Definition for each NE type in cross-NER/BUSTER

In [1]:
import json
import sys
import os

from openai import OpenAI
from dotenv import load_dotenv
import tiktoken 

# my libraries
sys.path.append("/Users/andrew/ExpertAI/MSEQA_for_NER/src/MSEQA_4_NER")
from data_handlers import data_handler_cross_NER
from data_handlers import data_handler_BUSTER
from data_handlers import data_handler_MIT

In [2]:
# Load API key from .env
load_dotenv('./.env')
print(os.environ.get('OPENAI_API_KEY') is not None)

True


In [3]:
client = OpenAI()

In [4]:
gpt_model = "gpt-3.5-turbo-1106"

In [5]:
dataset_name = "restaurant"
with open(f"../data_handlers/questions/MIT/sentences_per_ne_type_{dataset_name}.json", 'r') as file:
        sentences_per_ne_type = json.load(file)

ne_types_list = list(sentences_per_ne_type.keys())

print("NE types:")
print(len(ne_types_list))
print(ne_types_list)

NE types:
8
['Amenity', 'Cuisine', 'Dish', 'Hours', 'Location', 'Price', 'Rating', 'Restaurant_Name']


In [7]:
def num_tokens_from_messages(messages, model="gpt-3.5-turbo-1106"):
    """Return the number of tokens used in the input prompt."""
    try:
        encoding = tiktoken.encoding_for_model(model)
    except KeyError:
        print("Warning: model not found. Using cl100k_base encoding.")
        encoding = tiktoken.get_encoding("cl100k_base")
    if model in {
        "gpt-3.5-turbo-0613",
        "gpt-3.5-turbo-16k-0613",
        "gpt-4-0314",
        "gpt-4-32k-0314",
        "gpt-4-0613",
        "gpt-4-32k-0613",
        }:
        tokens_per_message = 3
        tokens_per_name = 1
    elif model == "gpt-3.5-turbo-0301":
        tokens_per_message = 4  # every message follows <|start|>{role/name}\n{content}<|end|>\n
        tokens_per_name = -1  # if there's a name, the role is omitted
    elif "gpt-3.5-turbo" in model:
        # print("Warning: gpt-3.5-turbo may update over time. Returning num tokens assuming gpt-3.5-turbo-0613.")
        return num_tokens_from_messages(messages, model="gpt-3.5-turbo-0613")
    elif "gpt-4" in model:
        print("Warning: gpt-4 may update over time. Returning num tokens assuming gpt-4-0613.")
        return num_tokens_from_messages(messages, model="gpt-4-0613")
    else:
        raise NotImplementedError(
            f"""num_tokens_from_messages() is not implemented for model {model}. See https://github.com/openai/openai-python/blob/main/chatml.md for information on how messages are converted to tokens."""
        )
    num_tokens = 0
    for message in messages:
        num_tokens += tokens_per_message
        for key, value in message.items():
            num_tokens += len(encoding.encode(value))
            if key == "name":
                num_tokens += tokens_per_name
    num_tokens += 3  # every reply is primed with <|start|>assistant<|message|>
    return num_tokens

# PROMPT formulation

System message

In [None]:
system_message = "You are a helpful NER data annotator designed to output JSON."  # OpenAI documentations suggest to write also here to output JSON

creating 1st 'user_prompt' using 'location' as example 

In [None]:
ne_type = 'location'
#related_data = sentences_per_ne_type[ne_type]

#example_prompt = data_handler_cross_NER.generate_structured_prompt(related_data)

example_prompt = "Named Entity: 'location'. Examples: [{'sentence': 'He was awarded honorary degree from the University of Cambridge in Cambridge , UK , and the Copley Medal of the Royal Society in 1906 .', 'entities': ['Cambridge']}, {'sentence': 'According to onlookers, the communicant told the priest: \"You're in Ireland now\".', 'entities': ['Ireland']}, {'sentence': 'The Big Bend Country is part of the larger Columbia Country , which includes the Columbia Valley and upper Arrow Lakes .', 'entities': ['Arrow Lakes', 'Columbia Country', 'Big Bend Country', 'Columbia Valley']}]. Hints: You should not label 'here', 'there' and similar as 'location' entities.\nInstructions: 1. Provide a concise definition for the named entity 'location' in the context of NER. 2. Provide guidelines by specifying what entities should not be labeled as 'location' and include potential pitfalls to avoid. Go beyond generic terms and delve into nuanced scenarios. Be explicit about potential ambiguities and provide guidance on distinguishing 'location' from similar entities.\nOutput in JSON format: {\"Definition\": \"\", \"Guidelines\": \"\"}."

print("example_prompt:\n")
print(example_prompt)

What would look like a desired gpt's answer

In [None]:
example_answer = "{\"Definition\": \"\'location\' denotes geographic entities, such as cities, countries, and landmarks, that represent specific places on Earth.\", \"Guidelines\": \"Do not label abstract references. Be cautious with ambiguous terms like 'Paris Hilton' (person with a location name) or 'Amazon' (both a company and a river)." # Be cautious with terms that may refer to both people and places, like 'Jordan' (person or country).\"}"
print("example answer:")
print(example_answer)

The real prompt for the desired NE 

In [None]:
ne_type = 'Advisors.GENERIC_CONSULTING_COMPANY'  # pick any NE
ex_sentences = sentences_per_ne_type[ne_type]['sentences']
ex_sentences_json = []
for exsent in ex_sentences:
    ex_sentences_json.append({'sentence': exsent['sentence'], 'entities': exsent['target_words_in_it']})
real_prompt = f"Now do the same for the Named Entity: \'{sentences_per_ne_type[ne_type]['real_name']}\', Examples: {ex_sentences_json}"

print("real prompt: ")
print(real_prompt)

In [None]:
messages_to_send = [
    {
        "role": "system",
        "content": system_message
    },
    {
        "role": "user",
        "content": example_prompt
    },
    {
        "role": "system",
        "content": example_answer
    },
    {
        "role": "user",
        "content": real_prompt
    }
]

In [None]:
n_input_tokens = num_tokens_from_messages(messages_to_send, gpt_model)
print(f"The input prompt has length in tokens: {n_input_tokens}")

In [None]:
print(messages_to_send)

### Pack it in 1 function

In [8]:
def get_json_definition_for_ne(named_entity, sentences_per_ne_type, gpt_model_name, max_output_tokens):
    # system message
    system_message = "You are a helpful NER data annotator designed to output JSON."
    # user prompt example on location
    example_prompt = "Named Entity: 'location'. Examples: [{'sentence': 'He was awarded honorary degree from the University of Cambridge in Cambridge , UK , and the Copley Medal of the Royal Society in 1906 .', 'entities': ['Cambridge']}, {'sentence': 'According to onlookers, the communicant told the priest: \"You're in Ireland now\".', 'entities': ['Ireland']}, {'sentence': 'The Big Bend Country is part of the larger Columbia Country , which includes the Columbia Valley and upper Arrow Lakes .', 'entities': ['Arrow Lakes', 'Columbia Country', 'Big Bend Country', 'Columbia Valley']}]. Hints: You should not label 'here', 'there' and similar as 'location' entities.\nInstructions: 1. Provide a concise definition for the named entity 'location' in the context of NER. 2. Provide guidelines by specifying what entities should not be labeled as 'location' and include potential pitfalls to avoid. Go beyond generic terms and delve into nuanced scenarios. Be explicit about potential ambiguities and provide guidance on distinguishing 'location' from similar entities.\nOutput in JSON format: {\"Definition\": \"\", \"Guidelines\": \"\"}."
    # example answer
    example_answer = "{\"Definition\": \"\'location\' denotes geographic entities, such as cities, countries, and landmarks, that represent specific places on Earth.\", \"Guidelines\": \"Do not label abstract references. Be cautious with ambiguous terms like 'Paris Hilton' (person with a location name) or 'Amazon' (both a company and a river)."
    # real prompt
    exemplary_data = sentences_per_ne_type[named_entity]
    ex_sentences = exemplary_data['sentences']
    real_ne_name = exemplary_data['real_name']
    ex_sentences_json = []
    for exsent in ex_sentences:
        ex_sentences_json.append({'sentence': exsent['sentence'], 'entities': exsent['target_words_in_it']})
    real_prompt = f"Now do the same for the Named Entity: \'{real_ne_name}\', Examples: {ex_sentences_json}"
    if exemplary_data['Hints'] != "":
        real_prompt += f". Hints: {exemplary_data['Hints']}\n"

    # message to send
    messages_to_send = [
    {
        "role": "system",
        "content": system_message
    },
    {
        "role": "user",
        "content": example_prompt
    },
    {
        "role": "system",
        "content": example_answer
    },
    {
        "role": "user",
        "content": real_prompt
    }
    ]

    n_input_tokens = num_tokens_from_messages(messages_to_send, gpt_model)
    print(messages_to_send)

    #for message in messages_to_send:
    #print(message)
    
    if n_input_tokens > 1000:
        raise ValueError(f"Too many input tokens in messages_to_send: {len(messages_to_send)}, {messages_to_send}")

    print(f"Sending prompt for NE: {named_entity}...")
    
    completion = client.chat.completions.create(
        messages=messages_to_send,
        model=gpt_model_name,
        max_tokens=max_output_tokens,
        response_format={ "type": "json_object" }
    )

    choice = completion.choices[0]
    finish_reason = choice.finish_reason
    message = choice.message.content

    #if finish_reason == 'length' and message[-2:] != "\"}":
        #message += "\"}"

    return {"named_entity": named_entity,
            "real_name": real_ne_name,
            "sentences_as_example": ex_sentences_json,
            "prompt_length": completion.usage.prompt_tokens,
            "output_length": completion.usage.completion_tokens,
            "gpt_answer": message,
            "finish_reason": finish_reason
            }
    

### get all definitions

In [9]:
definitions = []
for ne, exemplary_data in sentences_per_ne_type.items():
    try:
        ne_definition = get_json_definition_for_ne(ne, sentences_per_ne_type, gpt_model, max_output_tokens=100)
        definitions.append(ne_definition)
    except:
        with open(f"./definitions/{dataset_name}_err.json", 'w') as f:
            json.dump(definitions, f, indent=2)
        print(f"Something went wrong while processing NE: {ne}")

with open(f"./definitions/{dataset_name}.json", 'w') as f:
    json.dump(definitions, f, indent=2)

[{'role': 'system', 'content': 'You are a helpful NER data annotator designed to output JSON.'}, {'role': 'user', 'content': 'Named Entity: \'location\'. Examples: [{\'sentence\': \'He was awarded honorary degree from the University of Cambridge in Cambridge , UK , and the Copley Medal of the Royal Society in 1906 .\', \'entities\': [\'Cambridge\']}, {\'sentence\': \'According to onlookers, the communicant told the priest: "You\'re in Ireland now".\', \'entities\': [\'Ireland\']}, {\'sentence\': \'The Big Bend Country is part of the larger Columbia Country , which includes the Columbia Valley and upper Arrow Lakes .\', \'entities\': [\'Arrow Lakes\', \'Columbia Country\', \'Big Bend Country\', \'Columbia Valley\']}]. Hints: You should not label \'here\', \'there\' and similar as \'location\' entities.\nInstructions: 1. Provide a concise definition for the named entity \'location\' in the context of NER. 2. Provide guidelines by specifying what entities should not be labeled as \'locati

### END