In [1]:
import re
import sys
sys.path.append('./readme2kg-exp/src/')
import os
import random
from collections import defaultdict
from termcolor import colored
from functools import partial, reduce
import operator as op
import hashlib
import multiprocessing as mp
import logging
import json

from predictor import BasePredictor, LABELS
from webanno_tsv import webanno_tsv_read_file, Document, Annotation, Token
import utils
import cleaner

In [2]:
phase = '3shot'
model_name = 'Mistral-7B-Instruct-v0.3'

In [5]:
prompt_id = '3shot'

prompt_template_path = f'data/ner-202407/prompt-{prompt_id}.txt'
if os.path.isfile(prompt_template_path):
    with open(prompt_template_path, 'r') as fd:
        prompt_template = fd.read()
else:
    prompt_template = ''
print(prompt_template)

**Task:**
You are tasked with performing Fine-Grained Named Entity Recognition (NER) on the given a publication title. 
Follow the examples to identify and classify entities into their respective categories. 
Annotate the entities directly in the original text using XML-style tags (e.g <TAG>Entity</TAG>). 
Only return the annotated text in Markdown format—no explanations, introductions, or extra text. 
Thank You.

**Entity Classes:**
1. DATE-CREATION - Date of creation (e.g. "4. März", "19. Octobr.", ).
2. DATE-PUB - Date of publication (e.g. "19. Octobr.").
3. DATE-SUBJ - Date being described in the title.
4. GPE-AUT - Geopolitical entity responsible for authoring the title.
5. GPE-CREATION - Geopolitical entity where the title was created.
6. GPE-DES - Geopolitical entity mentioned as the designation of a person entity.
7. GPE-SUBJ - Geopolitcal entity as the subject of the title.
8. GPE-PUB - Geopolitical entity where the title was published.
9. LITWORK - title of the publication (e

In [6]:
sentence_text = "Christliche Leichenpredigt Vber den Text aus dem 2. Buch Samuelis am 10. Cap. v. 12. Sey getrost , und laß uns starck seyn für unser Volck , etc. : Bey volckreicher Leichenbestattung Des ... H. Stellan Otto von Mörners , Königl. Maj. und Cron Schweden ... Rittmeisters , Welcher den XXX. Octob. ... sein Leben ritterlich und seliglich beschlossen , und folgends den 11. Decemb. ... beygesetzet worden"
prompt = prompt_template.replace('{input_text}', sentence_text)
messages=[
            {"role": "system", "content": "You are a highly accurate Named Entity Recognition (NER) in the library domain."},
            {"role": "user", "content": prompt},
        ]
print(messages)

[{'role': 'system', 'content': 'You are a highly accurate Named Entity Recognition (NER) in the library domain.'}, {'role': 'user', 'content': '**Task:**\nYou are tasked with performing Fine-Grained Named Entity Recognition (NER) on the given a publication title. \nFollow the examples to identify and classify entities into their respective categories. \nAnnotate the entities directly in the original text using XML-style tags (e.g <TAG>Entity</TAG>). \nOnly return the annotated text in Markdown format—no explanations, introductions, or extra text. \nThank You.\n\n**Entity Classes:**\n1. DATE-CREATION - Date of creation (e.g. "4. März", "19. Octobr.", ).\n2. DATE-PUB - Date of publication (e.g. "19. Octobr.").\n3. DATE-SUBJ - Date being described in the title.\n4. GPE-AUT - Geopolitical entity responsible for authoring the title.\n5. GPE-CREATION - Geopolitical entity where the title was created.\n6. GPE-DES - Geopolitical entity mentioned as the designation of a person entity.\n7. GPE-S

# Load Mistral model

In [7]:
from huggingface_hub import snapshot_download
from pathlib import Path

mistral_models_path = Path.home().joinpath('mistral_models', '7B-Instruct-v0.3')
mistral_models_path.mkdir(parents=True, exist_ok=True)

snapshot_download(repo_id="mistralai/Mistral-7B-Instruct-v0.3", allow_patterns=["params.json", "consolidated.safetensors", "tokenizer.model.v3"], local_dir=mistral_models_path)

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

'/home/ann/mistral_models/7B-Instruct-v0.3'

In [8]:
from mistral_inference.transformer import Transformer
from mistral_inference.generate import generate

from mistral_common.tokens.tokenizers.mistral import MistralTokenizer
from mistral_common.protocol.instruct.messages import UserMessage
from mistral_common.protocol.instruct.request import ChatCompletionRequest


tokenizer = MistralTokenizer.from_file(f"{mistral_models_path}/tokenizer.model.v3")
model = Transformer.from_folder(mistral_models_path)

completion_request = ChatCompletionRequest(messages=messages)

tokens = tokenizer.encode_chat_completion(completion_request).tokens

out_tokens, _ = generate([tokens], model, max_tokens=1000, temperature=0.0, eos_id=tokenizer.instruct_tokenizer.tokenizer.eos_id)
result = tokenizer.instruct_tokenizer.tokenizer.decode(out_tokens)

print(result)

['<LITWORK>Christliche Leichenpredigt Vber den Text aus dem 2. Buch Samuelis am 10. Cap. v. 12. Sey getrost , und laß uns starck seyn für unser Volck , etc.</LITWORK> : Bey volckreicher Leichenbestattung Des ... <PERSON-SUBJ>H. Stellan Otto von Mörners</PERSON-SUBJ> , Königl. Maj. und Cron <GPE-DES>Schweden</GPE-DES> ... Rittmeisters , Welcher den XXX. <DATE-SUBJ>Octob.</DATE-SUBJ> ... sein Leben ritterlich und seliglich beschlossen , und folgends den <DATE-SUBJ>11. Decemb.</DATE-SUBJ> ... beygesetzet worden']


In [9]:
def do_prediction(title):
    #print(f"Process-{os.getpid()} processing {colored(sentence.text, 'red')} ...")
    prompt = prompt_template.replace('{input_text}', title)

    completion_request = ChatCompletionRequest(messages=[
        {"role": "system", "content": "You are a highly accurate Named Entity Recognition (NER) in the library domain."},
        {"role": "user", "content": prompt},
    ])
    tokens = tokenizer.encode_chat_completion(completion_request).tokens

    out_tokens, _ = generate([tokens], model, max_tokens=255, temperature=0.0, eos_id=tokenizer.instruct_tokenizer.tokenizer.eos_id)
    result = tokenizer.instruct_tokenizer.tokenizer.decode(out_tokens[0])
    
    return result

In [22]:
# DO NOT RUN THIS
with open ('data/ner-202407/results-Mistral-7B-Instruct-v0.3-3shot-20250417151610.json') as f:
    result_list = json.load(f)

In [17]:
result_list = []

In [24]:
from datetime import datetime
from rich.progress import track, Progress
from rich import print

with open('data/ner-202407/ddb-ner-dataset.json') as f:
    sentence_list = json.load(f)


now = datetime.now() # current date and time
timestamp = now.strftime("%Y%m%d%H%M%S")
foutname = f'data/ner-202407/results-{model_name}-{prompt_id}-{timestamp}.json' # NOTE: prefix zzz for directory sorting, non-sense

offset = 739
with Progress() as progress:

    task1 = progress.add_task("[blue] Prompt sample [%s]" % (len(sentence_list)), total=len(sentence_list))

    for elem in sentence_list:
        progress.update(task1, advance=1)
        sent_id = elem['sent_id']
        title = elem['title']
        annotation = elem['annotation']

        if offset >= sent_id:
            continue
        
        predicted = do_prediction(title)

        #for testing
        #print(predicted)
        #break
        
        result_list.append({\
            'sent_id': sent_id,\
            'title': title, \
            'gt': annotation, \
            'pred': predicted
        })

with open(foutname, 'w') as f:
    json.dump(result_list, f)

Output()

In [20]:
with open(foutname, 'w') as f:
    json.dump(result_list, f)

In [23]:
print(len(result_list))