# Data Annotation

- Definitions & guidelines
- Tool Augmentation, Syntactic Prompting
- Negative sampling
- Dynamic few-shot choice
- Prompts from errors

## Querying

In [5]:
!pip install openai



In [6]:
from openai import OpenAI

# Module-level API client, built with no explicit arguments.
# NOTE(review): presumably picks up credentials from the environment — confirm.
client = OpenAI()

In [7]:
import json

def run_query(system_prompt, user_prompt, response_type="json_object"):
    """Send one system/user message pair to the chat model and return the reply.

    Args:
        system_prompt: Content of the "system" message.
        user_prompt: Content of the "user" message.
        response_type: Value used as the ``type`` of ``response_format``.

    Returns:
        The reply decoded with ``json.loads`` when ``response_type`` is
        ``"json_object"``; otherwise the raw message content.
    """
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]
    completion = client.chat.completions.create(
        model="gpt-4o",
        messages=messages,
        response_format={"type": response_type},
    )
    content = completion.choices[0].message.content

    # Only the JSON response type is decoded; any other type is passed through as-is.
    if response_type != "json_object":
        return content
    return json.loads(content)

## Definitions & Guidelines

In [1]:
import json

# Load the per-entity-type definitions. The encoding is spelled out so the
# file is decoded the same way regardless of the platform's default locale.
with open("../data/definitions.json", encoding="utf-8") as f:
    definitions = json.load(f)

# Drop the 'mode of transport' entry; the remaining keys are what the prompts
# use as the entity type set.
del definitions['mode of transport']
definitions

{'human-powered': 'Human powered transport, a form of sustainable transportation, is the transport of people and/or goods using human muscle-power, in the form of walking, running and swimming. Modern technology has allowed machines to enhance human power. Human-powered transport remains popular for reasons of cost-saving, leisure, physical exercise, and environmentalism; it is sometimes the only type available, especially in underdeveloped or inaccessible regions.Although humans are able to walk without infrastructure, the transport can be enhanced through the use of roads, especially when using the human power with vehicles, such as bicycles and inline skates. Human-powered vehicles have also been developed for difficult environments, such as snow and water, by watercraft rowing and skiing; even the air can be entered with human-powered aircraft.',
 'animal-powered': 'Animal-powered transport is the use of working animals for the transport of people and/or goods. Humans may use some 

In [2]:
# System message passed to run_query together with each user prompt.
system_prompt = "You are a helpful NER data annotator designed to output JSON."
# Template for the definition/guidelines request. It is filled with str.format
# using the fields `entity`, `definition` and `examples`; the doubled braces in
# the last line come out as literal braces after formatting.
dg_user_prompt_template = """
Named Entity: "{entity}". Definition: {definition}. Examples: {examples}.

Instructions: 
1. Provide a concise definition for the named entity "{entity}" in the context of NER.
2. Provide guidelines by specifying what entities should not be labelled as "{entity}" and include potential pitfalls to avoid. Go beyond generic terms and delve into nuanced scenarios. Be explicit about potential ambiguities and provide guidance on distinguishing "{entity}" from similar entities.

Output in JSON format: {{"Definition": "", "Guidelines": ""}}.
"""

In [8]:
# Prompt asking the model for a small vocabulary per entity type.
# The list of labels already renders with its own brackets, so the template
# does not wrap it in a second pair (which would show a nested "[[...]]").
# The last piece is a plain (non-f) string, so its braces are literal.
entity_vocab_user_prompt = (
    f"\nHere is an entity type set: {list(definitions)}.\n"
    + '\n'.join(
        f"The definition of {entity} is {definition};"
        for entity, definition in definitions.items()
    )
    + """
Please imagine a list of at least 3 diverse words for each entity type in the set in pop music domain.
Output in JSON format: {"Entity Type 1": ["word1", ...], "Entity Type 2": ["word1", ...], ...}.
"""
)

entity_vocab = run_query(system_prompt, entity_vocab_user_prompt)
entity_vocab

{'human-powered': ['dance', 'clap', 'stomp'],
 'animal-powered': ['horse', 'rodeo', 'cowboy'],
 'railways': ['train', 'track', 'whistle'],
 'roadways': ['car', 'highway', 'drive'],
 'water_transport': ['sail', 'wave', 'cruise'],
 'air_transport': ['fly', 'jet', 'sky']}

In [14]:
# Prompt asking the model for example sentences built from the vocabulary.
# The list of labels already renders with its own brackets, so the template
# does not wrap it in a second pair (which would show a nested "[[...]]").
# The last piece is a plain (non-f) string, so its braces are literal.
example_generation_user_prompt = (
    f"\nHere is an entity type set: {list(entity_vocab)}.\n"
    "Construct a sentence that contains entities from the following words:\n"
    + '\n'.join(
        f"{entity}: {', '.join(words)};"
        for entity, words in entity_vocab.items()
    )
    + """
Output the sentence along with the utilized entities. Please imagine at least 3 different sentences for every entity type.
Output in JSON format: {"Entity Type 1": [{"sentence": "", "entities": {"Entity Type 1": ["value", ...]}}, ...], ...}.
"""
)

entity_examples = run_query(system_prompt, example_generation_user_prompt)
entity_examples

{'human-powered': [{'sentence': 'During the festival, everyone gathered to dance and clap along to the rhythmic beats.',
   'entities': {'human-powered': ['dance', 'clap']}},
  {'sentence': 'As the music grew louder, the crowd began to stomp their feet in unison.',
   'entities': {'human-powered': ['stomp']}},
  {'sentence': 'The children decided to create a game where they stomp, clap, and dance at every corner.',
   'entities': {'human-powered': ['stomp', 'clap', 'dance']}}],
 'animal-powered': [{'sentence': 'The cowboy expertly rode his horse during the exciting rodeo event.',
   'entities': {'animal-powered': ['cowboy', 'horse', 'rodeo']}},
  {'sentence': 'As the rodeo continued, the cowboy maintained perfect balance on the wild horse.',
   'entities': {'animal-powered': ['rodeo', 'cowboy', 'horse']}},
  {'sentence': "To everyone's surprise, the new cowboy won the rodeo by taming the fastest horse.",
   'entities': {'animal-powered': ['cowboy', 'rodeo', 'horse']}}],
 'railways': [{

In [15]:
entity_metadata = {}

# One definition/guidelines query per entity type that has generated examples.
for entity, examples in entity_examples.items():
    definition, items = definitions[entity], entity_vocab[entity]

    dg_user_prompt = dg_user_prompt_template.format(
        entity=entity,
        definition=definition,
        examples=str(examples),
    )
    result = run_query(system_prompt, dg_user_prompt)

    # Keep the generated examples and vocabulary next to the model's
    # definition and guidelines for this entity type.
    entity_metadata[entity] = dict(
        examples=examples,
        items=items,
        definition=result['Definition'],
        guidelines=result['Guidelines'],
    )

In [16]:
entity_metadata['human-powered']

{'examples': [{'sentence': 'During the festival, everyone gathered to dance and clap along to the rhythmic beats.',
   'entities': {'human-powered': ['dance', 'clap']}},
  {'sentence': 'As the music grew louder, the crowd began to stomp their feet in unison.',
   'entities': {'human-powered': ['stomp']}},
  {'sentence': 'The children decided to create a game where they stomp, clap, and dance at every corner.',
   'entities': {'human-powered': ['stomp', 'clap', 'dance']}}],
 'items': ['dance', 'clap', 'stomp'],
 'definition': "In the context of NER, 'human-powered' refers to activities, modes of transportation, or processes that are operated or moved by human effort or muscle power without the use of external engines or motors. It includes activities such as walking, running, dancing, clapping, and using vehicles like bicycles, rowboats, or skis that are propelled by human force.",
 'guidelines': "1. Only label actions or activities where human muscle power is the primary source of oper

In [17]:
# Write the collected per-entity metadata to disk as indented JSON.
with open("../data/entity_metadata.json", mode="w") as f:
    json.dump(obj=entity_metadata, fp=f, indent=4)

## Prompt optimizations

In [70]:
import json

# Load the per-entity-type definitions. The encoding is spelled out so the
# file is decoded the same way regardless of the platform's default locale.
with open("../data/definitions.json", encoding="utf-8") as f:
    definitions = json.load(f)

# Drop the 'mode of transport' entry; the remaining keys are what the prompts
# use as the entity label set.
del definitions['mode of transport']
definitions

{'human-powered': 'Human powered transport, a form of sustainable transportation, is the transport of people and/or goods using human muscle-power, in the form of walking, running and swimming. Modern technology has allowed machines to enhance human power. Human-powered transport remains popular for reasons of cost-saving, leisure, physical exercise, and environmentalism; it is sometimes the only type available, especially in underdeveloped or inaccessible regions.Although humans are able to walk without infrastructure, the transport can be enhanced through the use of roads, especially when using the human power with vehicles, such as bicycles and inline skates. Human-powered vehicles have also been developed for difficult environments, such as snow and water, by watercraft rowing and skiing; even the air can be entered with human-powered aircraft.',
 'animal-powered': 'Animal-powered transport is the use of working animals for the transport of people and/or goods. Humans may use some 

In [65]:
!pip install stanza

Collecting stanza
  Downloading stanza-1.10.1-py3-none-any.whl.metadata (13 kB)
Collecting emoji (from stanza)
  Downloading emoji-2.14.1-py3-none-any.whl.metadata (5.7 kB)
Collecting protobuf>=3.15.0 (from stanza)
  Downloading protobuf-5.29.3-cp38-abi3-macosx_10_9_universal2.whl.metadata (592 bytes)
Collecting networkx (from stanza)
  Using cached networkx-3.4.2-py3-none-any.whl.metadata (6.3 kB)
Collecting torch>=1.3.0 (from stanza)
  Using cached torch-2.5.1-cp310-none-macosx_11_0_arm64.whl.metadata (28 kB)
Collecting filelock (from torch>=1.3.0->stanza)
  Downloading filelock-3.17.0-py3-none-any.whl.metadata (2.9 kB)
Collecting fsspec (from torch>=1.3.0->stanza)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Collecting sympy==1.13.1 (from torch>=1.3.0->stanza)
  Using cached sympy-1.13.1-py3-none-any.whl.metadata (12 kB)
Collecting mpmath<1.4,>=1.1.0 (from sympy==1.13.1->torch>=1.3.0->stanza)
  Using cached mpmath-1.3.0-py3-none-any.whl.metadata (8.6 kB)
Download

In [66]:
import stanza

# Download the English resources, then build a pipeline that requests only the
# 'tokenize' and 'pos' processors. `nlp` is the default engine of
# get_tool_augmentation.
stanza.download('en')
nlp = stanza.Pipeline('en', processors='tokenize,pos')

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

2025-01-23 19:52:54 INFO: Downloaded file to /Users/artempris/stanza_resources/resources.json
2025-01-23 19:52:54 INFO: Downloading default packages for language: en (English) ...


Downloading https://huggingface.co/stanfordnlp/stanza-en/resolve/v1.10.0/models/default.zip:   0%|          | …

2025-01-23 19:54:18 INFO: Downloaded file to /Users/artempris/stanza_resources/en/default.zip
2025-01-23 19:54:20 INFO: Finished downloading models and saved to /Users/artempris/stanza_resources
2025-01-23 19:54:20 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

2025-01-23 19:54:20 INFO: Downloaded file to /Users/artempris/stanza_resources/resources.json
2025-01-23 19:54:21 INFO: Loading these models for language: en (English):
| Processor | Package         |
-------------------------------
| tokenize  | combined        |
| mwt       | combined        |
| pos       | combined_charlm |

2025-01-23 19:54:21 INFO: Using device: cpu
2025-01-23 19:54:21 INFO: Loading: tokenize
2025-01-23 19:54:21 INFO: Loading: mwt
2025-01-23 19:54:21 INFO: Loading: pos
2025-01-23 19:54:22 INFO: Done loading processors!


word: As	POS: SCONJ
word: we	POS: PRON
word: planned	POS: VERB
word: to	POS: PART
word: jet	POS: VERB
word: set	POS: NOUN
word: ,	POS: PUNCT
word: the	POS: DET
word: thought	POS: NOUN
word: of	POS: SCONJ
word: following	POS: VERB
word: the	POS: DET
word: flight	POS: NOUN
word: path	POS: NOUN
word: excited	POS: VERB
word: us	POS: PRON
word: .	POS: PUNCT


In [68]:
def get_tool_augmentation(sentence, engine=nlp):
    """Render `sentence` as space-separated ``text/POS`` pairs.

    Args:
        sentence: Text handed to `engine`.
        engine: Callable whose result exposes ``sentences``, each holding
            ``words`` with ``text`` and ``pos`` attributes. Defaults to the
            module-level `nlp`.

    Returns:
        A single string with one ``text/pos`` entry per word, joined by spaces.
    """
    parsed = engine(sentence)
    return ' '.join(
        f"{token.text}/{token.pos}"
        for parsed_sentence in parsed.sentences
        for token in parsed_sentence.words
    )

In [69]:
# Sample sentence used to try out the POS rendering helper.
text = "As we planned to jet set, the thought of following the flight path excited us."
pos_text = get_tool_augmentation(text)

# Bare expression so the notebook displays the rendered string.
pos_text

'As/SCONJ we/PRON planned/VERB to/PART jet/VERB set/NOUN ,/PUNCT the/DET thought/NOUN of/SCONJ following/VERB the/DET flight/NOUN path/NOUN excited/VERB us/PRON ./PUNCT'

In [72]:
# Prompt that pairs the raw text with its POS rendering and asks about a
# single entity label at a time.
ta_sp_user_prompt_template = """
Given entity label set: {entity_set}.
Given the text and the corresponding Part-of-Speech tags, please recognize the named entities in the given text. Let's infer named entities step by step from the text based on the given Part-of-Speech tags.
Text: {input_text}
Part-of-Speech tags: {pos_text}
Question: What are the named entities labeled as "{entity}" in the text?
Answer:
"""

# Fill the template for the sample sentence and one label.
ta_sp_user_prompt = ta_sp_user_prompt_template.format(
    entity_set=str(list(definitions)),
    input_text=text,
    pos_text=get_tool_augmentation(text),
    entity="human-powered",
)

print(ta_sp_user_prompt)


Given entity label set: ['human-powered', 'animal-powered', 'railways', 'roadways', 'water_transport', 'air_transport'].
Given the text and the corresponding Part-of-Speech tags, please recognize the named entities in the given text. Let's infer named entities step by step from the text based on the given Part-of-Speech tags.
Text: As we planned to jet set, the thought of following the flight path excited us.
Part-of-Speech tags: As/SCONJ we/PRON planned/VERB to/PART jet/VERB set/NOUN ,/PUNCT the/DET thought/NOUN of/SCONJ following/VERB the/DET flight/NOUN path/NOUN excited/VERB us/PRON ./PUNCT
Question: What are the named entities labeled as "human-powered" in the text?
Answer:



## Negative sampling

In [75]:
entity_vocab

{'human-powered': ['dance moves', 'jump', 'stride'],
 'animal-powered': ['horseback ride',
  'elephants on parade',
  'reindeer sleigh'],
 'railways': ['train station sound', 'railroad tracks', 'subway vibes'],
 'roadways': ['car journey', 'highway drive', 'motorcycle ride'],
 'water_transport': ['sailing away', 'cruise rhythm', 'rowboat'],
 'air_transport': ['jet set', 'helicopter spree', 'flight path']}

In [76]:
entity_examples

{'human-powered': [{'sentence': 'The dance moves were as lively as a jump during a long stride.',
   'entities': {'human-powered': ['dance moves', 'jump', 'stride']}},
  {'sentence': 'Her jump during the performance was as smooth as a carefully executed stride.',
   'entities': {'human-powered': ['jump', 'stride']}},
  {'sentence': 'With each stride, he executed dance moves that left everyone in awe.',
   'entities': {'human-powered': ['stride', 'dance moves']}}],
 'animal-powered': [{'sentence': 'The horseback ride was scenic, reminiscent of elephants on parade.',
   'entities': {'animal-powered': ['horseback ride', 'elephants on parade']}},
  {'sentence': 'During the winter holidays, a reindeer sleigh is as enchanting as elephants on parade.',
   'entities': {'animal-powered': ['reindeer sleigh', 'elephants on parade']}},
  {'sentence': 'A horseback ride through the snowy woods felt as magical as a reindeer sleigh.',
   'entities': {'animal-powered': ['horseback ride', 'reindeer slei

In [None]:
import random

# Placeholders to fill in before running: `key` is looked up in entity_vocab,
# and `values` is iterated as items already stated to be of that type.
key = ...
values = ...

# Extra positive: an item from the same entity type that is not in `values`.
_value = random.choice(list(set(entity_vocab[key]) - set(values)))

# Negative: an item drawn from a different, randomly chosen entity type.
key_ = random.choice(list(set(entity_vocab.keys()) - {key}))
value_ = random.choice(entity_vocab[key_])

# Every statement goes on its own line. Joining all of them in one call keeps
# the extra positive and the negative from being glued onto the previous
# sentence without a separator.
ns_user_prompt = '\n'.join(
    [f"'{value}' is a '{key}'." for value in values]
    + [f"'{_value}' is a '{key}'.", f"'{value_}' is not a '{key}'."]
)

## Dynamic few-shot choice

## Guidelines update from errors