In [1]:
import re
import sys
sys.path.append('./readme2kg-exp/src/')
import os
import random
from collections import defaultdict
from termcolor import colored
from functools import partial, reduce
import operator as op
import hashlib
import multiprocessing as mp
import logging

from predictor import BasePredictor, LABELS
from webanno_tsv import webanno_tsv_read_file, Document, Annotation, Token
import utils
import cleaner

In [2]:
phase = '3shot'
model_name = 'Meta-Llama-3-8B-Instruct'

In [5]:
prompt_id = '3shot'
prompt_template_path = f'data/ner-202407/prompt-{prompt_id}.txt'
if os.path.isfile(prompt_template_path):
    with open(prompt_template_path, 'r') as fd:
        prompt_template = fd.read()
else:
    prompt_template = ''
print(prompt_template)

**Task:**
You are tasked with performing Fine-Grained Named Entity Recognition (NER) on the given a publication title. 
Follow the examples to identify and classify entities into their respective categories. 
Annotate the entities directly in the original text using XML-style tags (e.g <TAG>Entity</TAG>). 
Only return the annotated text in Markdown format—no explanations, introductions, or extra text. 
Thank You.

**Entity Classes:**
1. DATE-CREATION - Date of creation (e.g. "4. März", "19. Octobr.", ).
2. DATE-PUB - Date of publication (e.g. "19. Octobr.").
3. DATE-SUBJ - Date being described in the title.
4. GPE-AUT - Geopolitical entity responsible for authoring the title.
5. GPE-CREATION - Geopolitical entity where the title was created.
6. GPE-DES - Geopolitical entity mentioned as the designation of a person entity.
7. GPE-SUBJ - Geopolitcal entity as the subject of the title.
8. GPE-PUB - Geopolitical entity where the title was published.
9. LITWORK - title of the publication (e

In [6]:
sentence_text = "Christliche Leichenpredigt Vber den Text aus dem 2. Buch Samuelis am 10. Cap. v. 12. Sey getrost , und laß uns starck seyn für unser Volck , etc. : Bey volckreicher Leichenbestattung Des ... H. Stellan Otto von Mörners , Königl. Maj. und Cron Schweden ... Rittmeisters , Welcher den XXX. Octob. ... sein Leben ritterlich und seliglich beschlossen , und folgends den 11. Decemb. ... beygesetzet worden"
prompt = prompt_template.replace('{input_text}', sentence_text)
messages=[
            {"role": "system", "content": "You are a highly accurate Named Entity Recognition (NER) in the library domain."},
            {"role": "user", "content": prompt},
        ]
print(messages)

[{'role': 'system', 'content': 'You are a highly accurate Named Entity Recognition (NER) in the library domain.'}, {'role': 'user', 'content': '**Task:**\nYou are tasked with performing Fine-Grained Named Entity Recognition (NER) on the given a publication title. \nFollow the examples to identify and classify entities into their respective categories. \nAnnotate the entities directly in the original text using XML-style tags (e.g <TAG>Entity</TAG>). \nOnly return the annotated text in Markdown format—no explanations, introductions, or extra text. \nThank You.\n\n**Entity Classes:**\n1. DATE-CREATION - Date of creation (e.g. "4. März", "19. Octobr.", ).\n2. DATE-PUB - Date of publication (e.g. "19. Octobr.").\n3. DATE-SUBJ - Date being described in the title.\n4. GPE-AUT - Geopolitical entity responsible for authoring the title.\n5. GPE-CREATION - Geopolitical entity where the title was created.\n6. GPE-DES - Geopolitical entity mentioned as the designation of a person entity.\n7. GPE-S

# Load Llama model

In [7]:
import torch
torch.cuda.is_available()
device = "cuda:0" if torch.cuda.is_available() else "cpu"
device

'cuda:0'

In [8]:
from transformers import AutoTokenizer, AutoModelForCausalLM

model_id = "meta-llama/Meta-Llama-3-8B-Instruct"

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

model.generation_config.pad_token_id = tokenizer.pad_token_id

# original code
#prompt = prompt_template.replace('{input_text}', sentence.text)

messages = [
    {"role": "system", "content": "You are a highly accurate Named Entity Recognition (NER) in the library domain."},
    {"role": "user", "content": prompt},
]

input_ids = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    return_tensors="pt"
).to(model.device)

terminators = [
    tokenizer.eos_token_id,
    tokenizer.convert_tokens_to_ids("<|eot_id|>")
]
    
outputs = model.generate(
    input_ids,
    max_new_tokens=255,
    eos_token_id=terminators,
    do_sample=True,
    temperature=0.6,
    top_p=0.9,
    pad_token_id=tokenizer.eos_token_id
)
response = outputs[0][input_ids.shape[-1]:]
print(tokenizer.decode(response, skip_special_tokens=True))

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


<PERSON-AUT>H. Stellan Otto von Mörners</PERSON-AUT>, Königl. Maj. und Cron <GPE-DES>Schweden</GPE-DES>... Rittmeisters, Welcher den <DATE-SUBJ>XXX. Octob.</DATE-SUBJ>... sein Leben ritterlich und seliglich beschlossen, und folgends den <DATE-SUBJ>11. Decemb.</DATE-SUBJ>... beygesetzet worden


In [9]:
def do_prediction(title):
    prompt = prompt_template.replace('{input_text}', title)

    messages = [
        {"role": "system", "content": "You are a highly accurate Named Entity Recognition (NER) system in the library domain."},
        {"role": "user", "content": prompt},
    ]    
    
    input_ids = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt"
    ).to(model.device)
    
    terminators = [
        tokenizer.eos_token_id,
        tokenizer.convert_tokens_to_ids("<|eot_id|>")
    ]
        
    outputs = model.generate(
        input_ids,
        max_new_tokens=255,
        eos_token_id=terminators,
        do_sample=True,
        temperature=0.6,
        top_p=0.9,
        pad_token_id=tokenizer.eos_token_id
    )
    response = outputs[0][input_ids.shape[-1]:]
    result = tokenizer.decode(response, skip_special_tokens=True)

    return result        

In [10]:
result_list = []

In [16]:
from datetime import datetime
from rich.progress import track, Progress
from rich import print

with open('data/ner-202407/ddb-ner-dataset.json') as f:
    sentence_list = json.load(f)

now = datetime.now() # current date and time
timestamp = now.strftime("%Y%m%d%H%M%S")
foutname = f'data/ner-202407/results-{model_name}-{prompt_id}-{timestamp}.json' # NOTE: prefix zzz for directory sorting, non-sense

offset = 704

with Progress() as progress:

    task1 = progress.add_task("[blue] Prompt sample [%s]" % (len(sentence_list)), total=len(sentence_list))

    for elem in sentence_list:
        progress.update(task1, advance=1)
        sent_id = elem['sent_id']
        title = elem['title']
        annotation = elem['annotation']
    
        if offset >= sent_id:
            continue

        predicted = do_prediction(title)

        #for testing
        #print(predicted)
        #break
        
        result_list.append({\
            'sent_id': sent_id,\
            'title': title, \
            'gt': annotation, \
            'pred': predicted
        })

with open(foutname, 'w') as f:
    json.dump(result_list, f)

Output()

In [13]:
with open(foutname, 'w') as f:
    json.dump(result_list, f)

In [17]:
print(len(result_list))