In [65]:
# set up prompts
SYSTEM_PROMPT = "You are a smart and intelligent Named Entity Recognition (NER) system using IOB1 tagging scheme. I will provide you the definition of the entities you need to extract, the sentence from where your extract the entities and the output format with examples."

USER_PROMPT_1 = "Are you clear about your role?"

ASSISTANT_PROMPT_1 = "Sure, I'm ready to help you with your NER task. Please provide me with the necessary information to get started."

GUIDELINES_PROMPT = (
    "Entity Definition:\n"
    "1. B-LOC/I-LOC: These tags are used for any geographic location, like cities, countries, continents, districts etc.\n"
    "2. B-ORG/I-ORG: These tags are used for organizations.\n"
    "3. B-PER/I-PER: These tags are for names of people.\n"
    "3. B-MISC/I-MISC: These tags are for words that are part of entities that do not fall under the other predefined categories (Location, Organization, Person).\n"
    "4. O: This tag is for words that do not belong to any of the entity categories mentioned above.\n"
    "\n"
    "Annotation Rules:\n"
    "Single-Word Entities: If an entity consists of only one word, it should be tagged with an 'I-' prefix.\n"
    "Multi-Word Entities: If an entity contains multiple words, the first word should be tagged with 'I-' and the subsequent words with 'I-' as well.\n"
    "Separate Entities: If two entities of the same type are adjacent to each other, the first word of the second entity should be tagged with 'B-' to indicate a new entity.\n"
    "Continuous Entities: Do not use a 'B-' tag. Instead, start multi-word entities with 'I-' and continue with 'I-' tags for all subsequent words that belong to the entity.\n"
    "'B-' prefix is only used to separate two adjacent entities of the same type.\n"
    "Entity Ambiguity: If a word could belong to more than one category, prioritize by context. If still unsure, maintain consistency with the rest of the document or seek clarification.\n"
    "Punctuation: Punctuation is typically tagged as 'O' unless part of a named entity.\n"
    "\n"
    "Output Format:\n"
    "[list of name entities corresponding to each word in the sentence]"
    "\n"
    "Output Format and Examples:\n"
    "\n"
    "1. Sentence: ['Swiss', 'Grand', 'Prix', 'World', 'Cup', 'cycling', 'race', 'on', 'Sunday', ':']\n"
    "Output: ['I-MISC', 'B-MISC', 'I-MISC', 'B-MISC', 'I-MISC', 'O', 'O', 'O', 'O', 'O']\n"
    "\n"
    "2. Sentence: ['After', 'the', 'defeat', 'of', 'the', 'resolution', ',', 'drafted', 'by', 'the', 'European', 'Union', 'and', 'the', 'United', 'States', ',', 'China', \''s\', 'Foreign', 'Ministry', 'thanked', '26', 'countries', 'for', 'backing', 'its', 'motion', 'for', '\"', 'no', 'action', '\"', 'on', 'the', 'document', '.']"
    "Output: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'I-ORG', 'I-ORG', 'O', 'O', 'I-LOC', 'I-LOC', 'O', 'I-LOC', 'O', 'I-ORG', 'I-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']\n"
    "\n"
    "I will provide you a list of lists of tokens. Please return a list of lists of labeled BIO tags, which should have the same length as input tokens, without unnecessary explanations.\n"
)

In [75]:
# GPT-4 setup
import openai

OPENAI_API_KEY = "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"
openai.api_key = OPENAI_API_KEY

models = openai.Model.list()
# for model in models['data']:
    # print(model['id'])  # This prints the ID of each model

def openai_chat_completion_response(final_prompt):
  response = openai.ChatCompletion.create(
              model="gpt-3.5-turbo",
              messages=[
                    {"role": "system", "content": SYSTEM_PROMPT},
                    {"role": "user", "content": USER_PROMPT_1},
                    {"role": "assistant", "content": ASSISTANT_PROMPT_1},
                    {"role": "user", "content": final_prompt}
                ]
            )

  return response['choices'][0]['message']['content'].strip(" \n")

In [76]:
from datasets import Dataset

# read data
def read_conll_file(file_path):
    with open(file_path, "r") as f:
        content = f.read().strip()
        sentences = content.split("\n\n")
        data = []
        for sentence in sentences:
            tokens = sentence.split("\n")
            token_data = []
            for token in tokens:
                token_data.append(token.split())
            data.append(token_data)
    return data

# prepare data
def convert_to_dataset(data):
    formatted_data = {"tokens": [], "ner_tags": [], "sentences": []}
    for sentence in data:
        tokens = [token_data[0] for token_data in sentence]
        ner_tags = [token_data[3] for token_data in sentence]
        sentence_str = " ".join(tokens)
        formatted_data["tokens"].append(tokens)
        formatted_data["ner_tags"].append(ner_tags)
        formatted_data["sentences"].append(sentence_str)
    return Dataset.from_dict(formatted_data)

test_data = read_conll_file("/mnt/lustre/yuxin/SC4002_G06/datasets/CoNLL2003/eng.testb")
test_dataset = convert_to_dataset(test_data)

In [20]:
test_dataset["sentences"][0:500]

['SOCCER - JAPAN GET LUCKY WIN , CHINA IN SURPRISE DEFEAT .',
 'Nadim Ladki',
 'AL-AIN , United Arab Emirates 1996-12-06',
 'Japan began the defence of their Asian Cup title with a lucky 2-1 win against Syria in a Group C championship match on Friday .',
 'But China saw their luck desert them in the second match of the group , crashing to a surprise 2-0 defeat to newcomers Uzbekistan .',
 'China controlled most of the match and saw several chances missed until the 78th minute when Uzbek striker Igor Shkvyrin took advantage of a misdirected defensive header to lob the ball over the advancing Chinese keeper and into an empty net .',
 'Oleg Shatskiku made sure of the win in injury time , hitting an unstoppable left foot shot from just outside the area .',
 'The former Soviet republic was playing in an Asian Cup finals tie for the first time .',
 'Despite winning the Asian Games title two years ago , Uzbekistan are in the finals as outsiders .',
 'Two goals from defensive errors in the las

In [77]:
prompt = GUIDELINES_PROMPT + str(test_dataset["tokens"][0:5])
replies = openai_chat_completion_response(prompt)

In [78]:
replies 

"[['SOCCER', '-', 'JAPAN', 'GET', 'LUCKY', 'WIN', ',', 'CHINA', 'IN', 'SURPRISE', 'DEFEAT', '.']]\nOutput: ['O', 'O', 'B-LOC', 'O', 'O', 'O', 'O', 'B-LOC', 'O', 'O', 'O', 'O']\n\n[['Nadim', 'Ladki']]\nOutput: ['B-PER', 'I-PER']\n\n[['AL-AIN', ',', 'United', 'Arab', 'Emirates', '1996-12-06']]\nOutput: ['B-LOC', 'O', 'B-LOC', 'I-LOC', 'I-LOC', 'O']\n\n[['Japan', 'began', 'the', 'defence', 'of', 'their', 'Asian', 'Cup', 'title', 'with', 'a', 'lucky', '2-1', 'win', 'against', 'Syria', 'in', 'a', 'Group', 'C', 'championship', 'match', 'on', 'Friday', '.']]\nOutput: ['B-LOC', 'O', 'O', 'O', 'O', 'O', 'I-MISC', 'I-MISC', 'I-MISC', 'O', 'O', 'O', 'O', 'O', 'O', 'B-LOC', 'O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'I-MISC', 'O', 'O']\n\n[['But', 'China', 'saw', 'their', 'luck', 'desert', 'them', 'in', 'the', 'second', 'match', 'of', 'the', 'group', ',', 'crashing', 'to', 'a', 'surprise', '2-0', 'defeat', 'to', 'newcomers', 'Uzbekistan', '.']]\nOutput: ['O', 'B-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'O

In [49]:
replies

"['I-MISC', 'O', 'B-LOC', 'O', 'O', 'O', 'O', 'B-LOC', 'O', 'O', 'O']\n['B-PER', 'I-PER']\n['B-LOC', 'O', 'B-LOC', 'I-LOC', 'I-LOC', 'O']\n['B-LOC', 'O', 'O', 'O', 'O', 'O', 'I-MISC', 'I-MISC', 'I-MISC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']\n['O', 'B-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-MISC', 'B-MISC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']"

In [51]:
new_string = replies.replace('\n', ',')
new_string 

"['I-MISC', 'O', 'B-LOC', 'O', 'O', 'O', 'O', 'B-LOC', 'O', 'O', 'O'],['B-PER', 'I-PER'],['B-LOC', 'O', 'B-LOC', 'I-LOC', 'I-LOC', 'O'],['B-LOC', 'O', 'O', 'O', 'O', 'O', 'I-MISC', 'I-MISC', 'I-MISC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'],['O', 'B-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-MISC', 'B-MISC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']"

In [54]:
list_of_lists = ast.literal_eval(new_string)
type(list_of_lists)

tuple

In [55]:
y_true = test_dataset["ner_tags"][0:5]
y_pred = list(list_of_lists)

In [62]:
# Two lists of lists
list1 = y_true
list2 = y_pred

# Function to pad or clip a list to match the length of another list
def pad_or_clip_to_match_length(source_list, target_length):
    if len(source_list) < target_length:
        return source_list + ['O'] * (target_length - len(source_list))
    elif len(source_list) > target_length:
        return source_list[:target_length]
    else:
        return source_list

# Ensure that list2 has sublists with the same length as list1
list2_adjusted = [pad_or_clip_to_match_length(sublist2, len(sublist1)) for sublist1, sublist2 in zip(list1, list2)]

# Now, list2 has sublists with the same length as list1
print(list1)
print(list2_adjusted)

[['O', 'O', 'I-LOC', 'O', 'O', 'O', 'O', 'I-PER', 'O', 'O', 'O', 'O'], ['I-PER', 'I-PER'], ['I-LOC', 'O', 'I-LOC', 'I-LOC', 'I-LOC', 'O'], ['I-LOC', 'O', 'O', 'O', 'O', 'O', 'I-MISC', 'I-MISC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'I-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], ['O', 'I-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'I-LOC', 'O']]
[['I-MISC', 'O', 'B-LOC', 'O', 'O', 'O', 'O', 'B-LOC', 'O', 'O', 'O', 'O'], ['B-PER', 'I-PER'], ['B-LOC', 'O', 'B-LOC', 'I-LOC', 'I-LOC', 'O'], ['B-LOC', 'O', 'O', 'O', 'O', 'O', 'I-MISC', 'I-MISC', 'I-MISC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-LOC', 'O', 'O', 'O', 'O', 'O'], ['O', 'B-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-MISC', 'B-MISC', 'O', 'O', 'O', 'O']]


In [63]:
from seqeval.metrics import f1_score
from seqeval.metrics import classification_report
from seqeval.scheme import IOB1

f1 = f1_score(list1, list2_adjusted)
# f1 = f1_score(y_true, y_pred)
print(f1)
# classification_report(y_true, y_pred)

0.5454545454545454


In [6]:
import ast
from tqdm import tqdm

y_true = []
y_pred = []
for i in tqdm(test_dataset):
    # Constructing the prompt
    prompt = GUIDELINES_PROMPT + str(test_dataset["sentences"]))
    
    try:
        # Fetching the response from your function
        replies = openai_chat_completion_response(prompt)
        
        # Attempt to find the starting index of the list in the string
        start_index = replies.find("['")
        end_index = replies.find("']")
        
        # Validate if both start and end indexes are found
        if start_index != -1 and end_index != -1 and (end_index > start_index):
            # Extract the substring that should contain the list
            pred_tag = replies[start_index:end_index+2]
            
            # Now attempt to evaluate this string as a list
            try:
                tag_list = ast.literal_eval(pred_tag)
            except SyntaxError as se:
                print(f"Syntax error when parsing: {pred_tag}")
                raise se

            # Ensure the tag list matches the length of the ground truth
            if len(tag_list) != len(i['ner_tags']):
                if len(tag_list) < len(i['ner_tags']):
                    # Pad the tag_list with 'O' to match the length
                    tag_list += ['O'] * (len(i['ner_tags']) - len(tag_list))
                else:
                    # Truncate the tag_list to match the length
                    tag_list = tag_list[:len(i['ner_tags'])]
        else:
            print(f"No valid list found in the string for input: {i['tokens']}")
            tag_list = ['O'] * len(i['ner_tags'])
        
        # Append the true and predicted tags to their respective lists
        y_true.append(i['ner_tags'])
        y_pred.append(tag_list)
    
    except Exception as e:
        print(f"An error occurred: {e}")
        tag_list = ['O'] * len(i['ner_tags'])
        y_true.append(i['ner_tags'])
        y_pred.append(tag_list)

  1%|▏                                             | 19/3684 [01:19<5:03:10,  4.96s/it]

No valid list found in the string for input: ["'"]


  1%|▍                                             | 34/3684 [02:12<3:17:21,  3.24s/it]

No valid list found in the string for input: ['I', 'think', 'now', 'is', 'the', 'right', 'time', 'for', 'him', 'to', 'return', '.', '"']


  1%|▍                                             | 40/3684 [02:35<4:09:27,  4.11s/it]

No valid list found in the string for input: ['Takuya', 'Takagi', 'headed', 'the', 'winner', 'in', 'the', '88th', 'minute', 'of', 'the', 'group', 'C', 'game', 'after', 'goalkeeper', 'Salem', 'Bitar', 'spoiled', 'a', 'mistake-free', 'display', 'by', 'allowing', 'the', 'ball', 'to', 'slip', 'under', 'his', 'body', '.']


  2%|█                                             | 83/3684 [12:06<8:45:26,  8.75s/it]


KeyboardInterrupt: 

In [70]:
prompt = GUIDELINES_PROMPT + str(test_dataset["tokens"][5:10])
replies = openai_chat_completion_response(prompt)
replies

"[\n   ['I-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'I-LOC', 'O', 'B-PER', 'I-PER', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-LOC', 'I-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'],\n   ['B-PER', 'I-PER', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'],\n   ['O', 'O', 'B-MISC', 'I-MISC', 'O', 'O', 'O', 'O', 'I-MISC', 'I-MISC', 'O', 'O', 'O', 'O', 'O', 'O', 'O'],\n   ['O', 'O', 'O', 'I-MISC', 'I-MISC', 'O', 'O', 'O', 'O', 'O', 'O', 'B-MISC', 'I-MISC', 'O', 'O', 'O', 'B-MISC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'],\n   ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'I-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-LOC', 'O']\n]"

In [71]:
new_string = replies.replace('\n', '')
list_of_lists = ast.literal_eval(new_string)

y_true = test_dataset["ner_tags"][0:5]
y_pred = list(list_of_lists)

list1 = y_true
list2 = y_pred

# Function to pad or clip a list to match the length of another list
def pad_or_clip_to_match_length(source_list, target_length):
    if len(source_list) < target_length:
        return source_list + ['O'] * (target_length - len(source_list))
    elif len(source_list) > target_length:
        return source_list[:target_length]
    else:
        return source_list

# Ensure that list2 has sublists with the same length as list1
list2_adjusted = [pad_or_clip_to_match_length(sublist2, len(sublist1)) for sublist1, sublist2 in zip(list1, list2)]

# Now, list2 has sublists with the same length as list1
print(list1)
print(list2_adjusted)

f1 = f1_score(list1, list2_adjusted)
# f1 = f1_score(y_true, y_pred)
print(f1)

[['O', 'O', 'I-LOC', 'O', 'O', 'O', 'O', 'I-PER', 'O', 'O', 'O', 'O'], ['I-PER', 'I-PER'], ['I-LOC', 'O', 'I-LOC', 'I-LOC', 'I-LOC', 'O'], ['I-LOC', 'O', 'O', 'O', 'O', 'O', 'I-MISC', 'I-MISC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'I-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], ['O', 'I-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'I-LOC', 'O']]
[['I-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], ['B-PER', 'I-PER'], ['O', 'O', 'B-MISC', 'I-MISC', 'O', 'O'], ['O', 'O', 'O', 'I-MISC', 'I-MISC', 'O', 'O', 'O', 'O', 'O', 'O', 'B-MISC', 'I-MISC', 'O', 'O', 'O', 'B-MISC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'I-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']]
0.11764705882352941


In [74]:
prompt = GUIDELINES_PROMPT + str(test_dataset["tokens"][10:15])
replies = openai_chat_completion_response(prompt)
replies

"[['Takuya', 'Takagi', 'scored', 'the', 'winner', 'in', 'the', '88th', 'minute', ',', 'rising', 'to', 'head', 'a', 'Hiroshige', 'Yanagimoto', 'cross', 'towards', 'the', 'Syrian', 'goal', 'which', 'goalkeeper', 'Salem', 'Bitar', 'appeared', 'to', 'have', 'covered', 'but', 'then', 'allowed', 'to', 'slip', 'into', 'the', 'net', '.'], ['It', 'was', 'the', 'second', 'costly', 'blunder', 'by', 'Syria', 'in', 'four', 'minutes', '.'], ['Defender', 'Hassan', 'Abbas', 'rose', 'to', 'intercept', 'a', ... 'rarely', 'breached', 'the', 'Syrian', 'defence', '.']]\n\n[['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'I-PER', 'I-PER', 'O', 'O', 'O', 'B-LOC', 'I-LOC', 'O', 'O', 'B-PER', 'I-PER', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-LOC', 'O', 'O... 'O', 'I-PER', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], ['B-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'B-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 

In [56]:
new_string = replies.replace('\n', '')
list_of_lists = ast.literal_eval(new_string)

y_true = test_dataset["ner_tags"][0:5]
y_pred = list(list_of_lists)

list1 = y_true
list2 = y_pred

# Function to pad or clip a list to match the length of another list
def pad_or_clip_to_match_length(source_list, target_length):
    if len(source_list) < target_length:
        return source_list + ['O'] * (target_length - len(source_list))
    elif len(source_list) > target_length:
        return source_list[:target_length]
    else:
        return source_list

# Ensure that list2 has sublists with the same length as list1
list2_adjusted = [pad_or_clip_to_match_length(sublist2, len(sublist1)) for sublist1, sublist2 in zip(list1, list2)]

# Now, list2 has sublists with the same length as list1
print(list1)
print(list2_adjusted)

f1 = f1_score(list1, list2_adjusted)
# f1 = f1_score(y_true, y_pred)
print(f1)

ValueError: Found input variables with inconsistent numbers of samples:
[12, 2, 6, 25, 25]
[11, 2, 6, 31, 36]

In [9]:
print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

         LOC       0.41      0.40      0.41        67
        MISC       0.06      0.05      0.05        21
         ORG       0.00      0.00      0.00         2
         PER       0.17      0.17      0.17       103
        TIME       0.00      0.00      0.00         0
     arcello       0.00      0.00      0.00         0
          as       0.00      0.00      0.00         0
     ecalled       0.00      0.00      0.00         0
       econd       0.00      0.00      0.00         0
    efensive       0.00      0.00      0.00         0
          he       0.00      0.00      0.00         0
      inutes       0.00      0.00      0.00         0
      lunder       0.00      0.00      0.00         0
           n       0.00      0.00      0.00         0
         our       0.00      0.00      0.00         0
           t       0.00      0.00      0.00         0
        taly       0.00      0.00      0.00         0
     uttitta       0.00    

  _warn_prf(average, modifier, msg_start, len(result))
