In [1]:
# set up prompts
SYSTEM_PROMPT = "You are a smart and intelligent Named Entity Recognition (NER) system using IOB1 tagging scheme. I will provide you the definition of the entities you need to extract, the sentence from where your extract the entities and the output format with examples."

USER_PROMPT_1 = "Are you clear about your role?"

ASSISTANT_PROMPT_1 = "Sure, I'm ready to help you with your NER task. Please provide me with the necessary information to get started."

GUIDELINES_PROMPT = (
    "Entity Definition:\n"
    "1. B-LOC/I-LOC: These tags are used for any geographic location, like cities, countries, continents, districts etc.\n"
    "2. B-ORG/I-ORG: These tags are used for organizations.\n"
    "3. B-PER/I-PER: These tags are for names of people.\n"
    "3. B-MISC/I-MISC: These tags are for words that are part of entities that do not fall under the other predefined categories (Location, Organization, Person).\n"
    "4. O: This tag is for words that do not belong to any of the entity categories mentioned above.\n"
    "\n"
    "Annotation Rules:\n"
    "Single-Word Entities: If an entity consists of only one word, it should be tagged with an 'I-' prefix.\n"
    "Multi-Word Entities: If an entity contains multiple words, the first word should be tagged with 'I-' and the subsequent words with 'I-' as well.\n"
    "Separate Entities: If two entities of the same type are adjacent to each other, the first word of the second entity should be tagged with 'B-' to indicate a new entity.\n"
    "Continuous Entities: Do not use a 'B-' tag. Instead, start multi-word entities with 'I-' and continue with 'I-' tags for all subsequent words that belong to the entity.\n"
    "'B-' prefix is only used to separate two adjacent entities of the same type.\n"
    "Entity Ambiguity: If a word could belong to more than one category, prioritize by context. If still unsure, maintain consistency with the rest of the document or seek clarification.\n"
    "Punctuation: Punctuation is typically tagged as 'O' unless part of a named entity.\n"
    "\n"
    "Output Format:\n"
    "[list of name entities corresponding to each word in the sentence]"
    "\n"
    "Output Format and Examples:\n"
    "\n"
    "1. Sentence: ['Swiss', 'Grand', 'Prix', 'World', 'Cup', 'cycling', 'race', 'on', 'Sunday', ':']\n"
    "Output: ['I-MISC', 'B-MISC', 'I-MISC', 'B-MISC', 'I-MISC', 'O', 'O', 'O', 'O', 'O']\n"
    "\n"
    "2. Sentence: ['After', 'the', 'defeat', 'of', 'the', 'resolution', ',', 'drafted', 'by', 'the', 'European', 'Union', 'and', 'the', 'United', 'States', ',', 'China', \''s\', 'Foreign', 'Ministry', 'thanked', '26', 'countries', 'for', 'backing', 'its', 'motion', 'for', '\"', 'no', 'action', '\"', 'on', 'the', 'document', '.']"
    "Output: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'I-ORG', 'I-ORG', 'O', 'O', 'I-LOC', 'I-LOC', 'O', 'I-LOC', 'O', 'I-ORG', 'I-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']\n"
    "\n"
    "I will provide you a list of lists of tokens. Please return a list of lists of labeled BIO tags, which should have the same length as input tokens, without unnecessary explanations.\n"
)

In [3]:
# GPT-4 setup
import openai

OPENAI_API_KEY = ""
openai.api_key = OPENAI_API_KEY

models = openai.Model.list()
# for model in models['data']:
#    print(model['id'])  # This prints the ID of each model

def openai_chat_completion_response(final_prompt):
  response = openai.ChatCompletion.create(
              model="gpt-4",
              messages=[
                    {"role": "system", "content": SYSTEM_PROMPT},
                    {"role": "user", "content": USER_PROMPT_1},
                    {"role": "assistant", "content": ASSISTANT_PROMPT_1},
                    {"role": "user", "content": final_prompt}
                ]
            )

  return response['choices'][0]['message']['content'].strip(" \n")

In [4]:
from datasets import Dataset

# read data
def read_conll_file(file_path):
    with open(file_path, "r") as f:
        content = f.read().strip()
        sentences = content.split("\n\n")
        data = []
        for sentence in sentences:
            tokens = sentence.split("\n")
            token_data = []
            for token in tokens:
                token_data.append(token.split())
            data.append(token_data)
    return data

# prepare data
def convert_to_dataset(data):
    formatted_data = {"tokens": [], "ner_tags": [], "sentences": []}
    for sentence in data:
        tokens = [token_data[0] for token_data in sentence]
        ner_tags = [token_data[3] for token_data in sentence]
        sentence_str = " ".join(tokens)
        formatted_data["tokens"].append(tokens)
        formatted_data["ner_tags"].append(ner_tags)
        formatted_data["sentences"].append(sentence_str)
    return Dataset.from_dict(formatted_data)

test_data = read_conll_file("/mnt/lustre/yuxin/SC4002_G06/datasets/CoNLL2003/eng.testb")
test_dataset = convert_to_dataset(test_data)

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
prompt = GUIDELINES_PROMPT + str(test_dataset["tokens"][0:5])
replies = openai_chat_completion_response(prompt)

In [6]:
replies

"[['O', 'O', 'B-LOC', 'O', 'O', 'O', 'O', 'B-LOC', 'O', 'O', 'O', 'O'], ['B-PER', 'I-PER'], ['B-LOC', 'O', 'B-LOC', 'I-LOC', 'I-LOC', 'O'], ['B-LOC', 'O', 'O', 'O', 'O', 'O', 'B-MISC', 'I-MISC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-LOC', 'O', 'O', 'B-MISC', 'I-MISC', 'O', 'O', 'O', 'O', 'O'], ['O', 'B-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-LOC', 'O']]"

In [9]:
import ast
from seqeval.metrics import f1_score
from seqeval.metrics import classification_report
from seqeval.scheme import IOB1

# Function to pad or clip a list to match the length of another list
def pad_or_clip_to_match_length(source_list, target_length):
    if len(source_list) < target_length:
        return source_list + ['O'] * (target_length - len(source_list))
    elif len(source_list) > target_length:
        return source_list[:target_length]
    else:
        return source_list

y_true = test_dataset["ner_tags"][0:5]
reply_list = list(ast.literal_eval(replies))

# Ensure that list2 has sublists with the same length as list1
y_pred = [pad_or_clip_to_match_length(sublist2, len(sublist1)) for sublist1, sublist2 in zip(y_true, reply_list)]

f1 = f1_score(y_true, y_pred)
print(f1)
print(classification_report(y_true, y_pred))

0.761904761904762
              precision    recall  f1-score   support

         LOC       0.75      0.86      0.80         7
        MISC       0.50      1.00      0.67         1
         PER       1.00      0.50      0.67         2

   micro avg       0.73      0.80      0.76        10
   macro avg       0.75      0.79      0.71        10
weighted avg       0.78      0.80      0.76        10



In [10]:
prompt = GUIDELINES_PROMPT + str(test_dataset["tokens"][100:110])
replies = openai_chat_completion_response(prompt)

In [11]:
replies

"[['O', 'O', 'O', 'O', 'I-LOC', 'O', 'I-LOC', 'I-LOC'],\n['O', 'O', 'O'],\n['I-LOC'],\n['I-PER', 'I-PER', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'],\n['I-PER', 'I-PER', 'O', 'I-PER', 'O', 'O', 'O', 'O', 'O', 'O'],\n['I-PER', 'I-PER', 'O', 'I-PER', 'O', 'I-PER', 'O'],\n['I-PER', 'I-PER', 'O', 'I-PER', 'O', 'I-PER', 'O'],\n['I-PER', 'I-PER', 'O', 'I-PER', 'O'],\n['I-PER', 'I-PER', 'O', 'I-PER', 'O'],\n['I-PER', 'I-PER', 'O', 'I-PER', 'O', 'I-PER', 'O']]"

In [12]:
import ast
from seqeval.metrics import f1_score
from seqeval.metrics import classification_report
from seqeval.scheme import IOB1

# Function to pad or clip a list to match the length of another list
def pad_or_clip_to_match_length(source_list, target_length):
    if len(source_list) < target_length:
        return source_list + ['O'] * (target_length - len(source_list))
    elif len(source_list) > target_length:
        return source_list[:target_length]
    else:
        return source_list

y_true = test_dataset["ner_tags"][100:110]
new_string = replies.replace('\n', '')
reply_list = list(ast.literal_eval(new_string))

# Ensure that list2 has sublists with the same length as list1
y_pred = [pad_or_clip_to_match_length(sublist2, len(sublist1)) for sublist1, sublist2 in zip(y_true, reply_list)]

f1 = f1_score(y_true, y_pred)
print(f1)
print(classification_report(y_true, y_pred))

1.0
              precision    recall  f1-score   support

         LOC       1.00      1.00      1.00         3
         PER       1.00      1.00      1.00        16

   micro avg       1.00      1.00      1.00        19
   macro avg       1.00      1.00      1.00        19
weighted avg       1.00      1.00      1.00        19



In [16]:
prompt = GUIDELINES_PROMPT + str(test_dataset["tokens"][200:220])
replies = openai_chat_completion_response(prompt)

In [17]:
replies

"[['B-PER', 'I-PER'], \n['B-LOC', 'O'], \n['O', 'I-PER', 'I-PER', 'O', 'O', 'I-ORG', 'O', 'O', 'O', 'O', 'O', 'I-ORG', 'O', 'I-LOC', 'O', 'O', 'O'],\n['I-PER', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'I-LOC', 'I-PER', 'I-PER', 'O'],\n['I-PER', 'I-PER', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'I-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'I-PER', 'I-PER', 'O'],\n['O', 'I-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'I-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'I-PER', 'I-PER', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'I-LOC', 'O', 'O'],\n['O', 'I-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'],\n['I-PER', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'I-ORG', 'O', 'I-PER', 'I-PER', 'O', 'O', 'O', 'O', 'I-LOC', 'O', 'O', 'O', 'O', 'O', 'B-MISC', 'I-MISC', 'O', 'I-LOC'

In [20]:
import ast
from seqeval.metrics import f1_score
from seqeval.metrics import classification_report
from seqeval.scheme import IOB1

# Function to pad or clip a list to match the length of another list
def pad_or_clip_to_match_length(source_list, target_length):
    if len(source_list) < target_length:
        return source_list + ['O'] * (target_length - len(source_list))
    elif len(source_list) > target_length:
        return source_list[:target_length]
    else:
        return source_list

y_true = test_dataset["ner_tags"][200:220]
new_string = replies.replace('\n', '')
reply_list = list(ast.literal_eval(new_string))

# Ensure that list2 has sublists with the same length as list1
y_pred = [pad_or_clip_to_match_length(sublist2, len(sublist1)) for sublist1, sublist2 in zip(y_true, reply_list)]

f1 = f1_score(y_true, y_pred)
print(f1)
print(classification_report(y_true, y_pred))

0.2909090909090909
              precision    recall  f1-score   support

         LOC       0.23      0.25      0.24        36
        MISC       0.00      0.00      0.00         7
         ORG       0.45      0.71      0.56         7
         PER       0.32      0.30      0.31        60

   micro avg       0.29      0.29      0.29       110
   macro avg       0.25      0.32      0.28       110
weighted avg       0.28      0.29      0.28       110



# Let's Go GPT-4!!!

In [37]:
import ast
from seqeval.metrics import f1_score
from seqeval.metrics import classification_report
from tqdm import tqdm

# Function to pad or clip a list to match the length of another list
def pad_or_clip_to_match_length(source_list, target_length):
    if len(source_list) < target_length:
        return source_list + ['O'] * (target_length - len(source_list))
    elif len(source_list) > target_length:
        return source_list[:target_length]
    else:
        return source_list

def clean_predictions(y_true, replies):
    new_string = replies.replace('\n', '')
    reply_list = list(ast.literal_eval(new_string))
    
    # Ensure that list2 has sublists with the same length as list1
    y_pred = [pad_or_clip_to_match_length(sublist2, len(sublist1)) for sublist1, sublist2 in zip(y_true, reply_list)]
    return y_pred

y_trues = []
y_preds = []
for i in tqdm(range(0, len(test_dataset), 10)):
    # Create the prompt using a subset of the test dataset
    prompt = GUIDELINES_PROMPT + str(test_dataset["tokens"][i:i+10])
    predictions = openai_chat_completion_response(prompt)
    
    y_true_batch = test_dataset["ner_tags"][i:i+10]
    y_pred_batch = clean_predictions(y_true_batch, predictions)

    batch_f1 = f1_score(y_true_batch, y_pred_batch)
    print(f"F1 score for batch {i // 10 + 1}: {batch_f1}")
    
    # Extend our lists of true and predicted labels
    y_trues.extend(y_true_batch)
    y_preds.extend(y_pred_batch)

# Once the loop is complete, we can calculate the F1
f1 = f1_score(y_trues, y_preds)
print(f"The overall F1 score is: {f1}")
print(classification_report(y_trues, y_preds))

  0%|▏                                                      | 1/369 [00:38<3:56:29, 38.56s/it]

F1 score for batch 1: 0.46511627906976744


  1%|▎                                                      | 2/369 [01:23<4:18:26, 42.25s/it]

F1 score for batch 2: 0.65


  1%|▍                                                      | 3/369 [02:04<4:13:39, 41.58s/it]

F1 score for batch 3: 0.6530612244897959


  1%|▌                                                      | 4/369 [02:55<4:36:06, 45.39s/it]

F1 score for batch 4: 0.619718309859155


  1%|▋                                                      | 5/369 [04:04<5:27:39, 54.01s/it]

F1 score for batch 5: 0.5714285714285713


  2%|▉                                                      | 6/369 [04:19<4:05:18, 40.55s/it]

F1 score for batch 6: 1.0


  2%|█                                                      | 7/369 [04:36<3:17:59, 32.81s/it]

F1 score for batch 7: 1.0


  2%|█▏                                                     | 8/369 [04:54<2:50:49, 28.39s/it]

F1 score for batch 8: 0.9142857142857143


  2%|█▎                                                     | 9/369 [05:12<2:29:29, 24.92s/it]

F1 score for batch 9: 0.9600000000000001


  2%|█▎                                                     | 9/369 [05:29<3:39:45, 36.63s/it]


SyntaxError: invalid syntax (<unknown>, line 1)