## Evaluate model out of the box

In [1]:
# List the JSONL files available in the local data/ directory.
! ls data

my-conll2003-dataset-test.jsonl		my-conll2003-dataset-train_sample.jsonl
my-conll2003-dataset-test_sample.jsonl	my-conll2003-dataset-validation.jsonl
my-conll2003-dataset-train.jsonl


In [21]:
from datasets import load_dataset

# Read the 100-row test sample from its JSONL file. Loading a single file
# without naming a split places the rows under the "train" key.
json_datasets_reloaded = load_dataset(
    "json",
    data_files="data/my-conll2003-dataset-test_sample.jsonl",
)
# Short alias for the same DatasetDict object.
dataset = json_datasets_reloaded


In [22]:
# Show the loaded DatasetDict: available splits, column names and row count.
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags', 'ner_labels', 'sentence', 'entities'],
        num_rows: 100
    })
})

In [6]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import torch

In [7]:
# Hub id of the instruct checkpoint that is evaluated here.
model_id = "mistralai/Mistral-7B-Instruct-v0.2"

# Quantization settings handed to from_pretrained: load the weights in 4-bit
# NF4 with double quantization enabled, and run computations in bfloat16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

In [8]:
# Load the checkpoint with the 4-bit quantization config; device_map="auto"
# leaves the device placement of the weights to the library.
model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config, device_map="auto")

Loading checkpoint shards: 100%|██████████| 3/3 [00:09<00:00,  3.26s/it]


In [9]:
# Tokenizer used for evaluation prompts: it prepends the BOS token
# (add_bos_token=True); no padding or EOS token is requested here.
eval_tokenizer = AutoTokenizer.from_pretrained(
    model_id,
    add_bos_token=True,
)


In [11]:
def eval_formatting_func(entry):
    """Build the instruction prompt asking the model to extract entities.

    Args:
        entry: Mapping with a 'sentence' key holding the text to annotate.

    Returns:
        str: The prompt wrapped in ``[INST] ... [/INST]``. The BOS token is
        not part of the string; the tokenizer adds it.
    """
    sentence = entry['sentence']

    role = "[INST] You are an NLP expert tasked with Named Entity Extraction. "
    task = (
        "Identify entities of the type Person (PER), Organization (ORG), Location (LOC) "
        f"and Miscellaneous (MISC) in the following sentence: '{sentence}'\n"
    )
    answer_format = (
        "Your answer must be in the form of a dict "
        "{'PER':['person entity 1', 'person entity 2', '...'], 'ORG': [], 'LOC': [], 'MISC': []} \n"
    )
    reminder = "Take care, your answer is only valid if it follows the correct format! [/INST]"

    return "".join((role, task, answer_format, reminder))


In [24]:
# Build the prompt for one sample row (index 10) and print it for inspection.
eval_prompt = eval_formatting_func(dataset['train'][10])
print(eval_prompt)

[INST] You are an NLP expert tasked with Named Entity Extraction. Identify entities of the type Person (PER), Organization (ORG), Location (LOC) and Miscellaneous (MISC) in the following sentence: 'LONDON 1996-12-07'
Your answer must be in the form of a dict {'PER':['person entity 1', 'person entity 2', '...'], 'ORG': [], 'LOC': [], 'MISC': []} 
Take care, your answer is only valid if it follows the correct format! [/INST]


In [26]:
# Encode and decode the prompt to see the text the model receives,
# including any special tokens the tokenizer adds.
print(eval_tokenizer.decode(eval_tokenizer(eval_prompt)['input_ids']))

<s> [INST] You are an NLP expert tasked with Named Entity Extraction. Identify entities of the type Person (PER), Organization (ORG), Location (LOC) and Miscellaneous (MISC) in the following sentence: 'LONDON 1996-12-07'
Your answer must be in the form of a dict {'PER':['person entity 1', 'person entity 2', '...'], 'ORG': [], 'LOC': [], 'MISC': []} 
Take care, your answer is only valid if it follows the correct format! [/INST]


In [27]:
# Smoke test: generate an answer for the single prompt built above.
# Inputs are moved to the model's own device instead of a hardcoded "cuda",
# because the model was placed with device_map="auto".
model_input = eval_tokenizer(eval_prompt, return_tensors="pt").to(model.device)

model.eval()
with torch.no_grad():
    output_ids = model.generate(
        **model_input,
        max_new_tokens=256,
        repetition_penalty=1.15,
        # the EOS id is passed as the pad id for generation
        pad_token_id=eval_tokenizer.eos_token_id,
    )[0]
    # The decoded text holds prompt + completion; strip the prompt so only
    # the model's answer remains.
    response = eval_tokenizer.decode(output_ids, skip_special_tokens=True)
    response = response.replace(eval_prompt, "")
    print(response)

 Based on the given input, there does not seem to be any identifiable PER, ORG, LOC or MISC entities present in the sentence 'LONDON 1996-12-07'. The sentence consists solely of a location (LONDON) and a date (1996-12-07). Therefore, the output should be:
{'PER': [], 'ORG': [], 'LOC': ['LONDON'], 'MISC': []}


In [28]:
import ast
import re

def parse_response(sentence):
    """Extract the entity dictionary from a model response.

    Searches ``sentence`` for the first brace-delimited span that contains no
    nested braces and parses it as a Python literal.

    Args:
        sentence: Raw text generated by the model.

    Returns:
        dict: The parsed dictionary, with any of the keys 'PER', 'ORG', 'LOC'
        and 'MISC' that were missing added as empty lists. If no span is
        found, or the span is not a valid dict literal, a dictionary with
        all four keys mapped to empty lists is returned.
    """
    keys = ['PER', 'ORG', 'LOC', 'MISC']
    empty = {key: [] for key in keys}

    # Find the text between curly braces in the text that was passed in
    # (not in whatever global happens to be called `response`).
    match = re.search(r'\{[^{}]+\}', sentence)
    if match is None:
        return empty

    try:
        # literal_eval accepts only Python literals, so model-generated text
        # can never execute code the way eval() would allow.
        entities_dict = ast.literal_eval(match.group(0))
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # Not parseable as a literal: fall back to the empty result.
        return empty

    # A braces span can also be a set literal such as {'PER'}; only a dict
    # is a usable answer.
    if not isinstance(entities_dict, dict):
        return empty

    # Make sure every expected entity type is present.
    for key in keys:
        entities_dict.setdefault(key, [])
    return entities_dict

In [30]:
# Raw generated text after the prompt has been stripped.
response

" Based on the given input, there does not seem to be any identifiable PER, ORG, LOC or MISC entities present in the sentence 'LONDON 1996-12-07'. The sentence consists solely of a location (LONDON) and a date (1996-12-07). Therefore, the output should be:\n{'PER': [], 'ORG': [], 'LOC': ['LONDON'], 'MISC': []}"

In [31]:
# Parse the generated text into the entity dictionary and display it.
response_dict = parse_response(response)
response_dict

{'PER': [], 'ORG': [], 'LOC': ['LONDON'], 'MISC': []}

In [35]:
# Number of rows available for evaluation.
len(dataset['train'])

100

In [33]:
# Inspect one complete record, including its reference 'entities' field.
dataset['train'][25]

{'id': '2252',
 'tokens': ['Burmese',
  'students',
  'march',
  'out',
  'of',
  'campus',
  'again',
  '.'],
 'pos_tags': [21, 24, 41, 15, 15, 21, 30, 7],
 'chunk_tags': [11, 12, 21, 13, 13, 11, 3, 0],
 'ner_tags': [7, 0, 0, 0, 0, 0, 0, 0],
 'ner_labels': ['B-MISC', 'O', 'O', 'O', 'O', 'O', 'O', 'O'],
 'sentence': 'Burmese students march out of campus again .',
 'entities': {'LOC': [], 'MISC': ['Burmese'], 'ORG': [], 'PER': []}}

In [57]:
# Row indices produced by range(3, 10): 3 through 9 inclusive.
list(range(3,10))

[3, 4, 5, 6, 7, 8, 9]

In [58]:
from tqdm import tqdm
import jsonlines

# Define the file path for saving JSONL
output_file = "data/base_model_responses.jsonl"

# Inference only: switching to eval mode does not depend on the row, so do it once.
model.eval()

eval_rows = dataset['train']

# NOTE: records are appended to output_file, so running this cell again over
# the same index range writes duplicate rows. Adjust the range (or remove the
# file) before re-running.
for idx in tqdm(range(3,10)): #len(dataset['train']))):

    # Fetch the row once instead of re-indexing the dataset for every field.
    entry = eval_rows[idx]

    eval_prompt = eval_formatting_func(entry)

    ground_truth = entry['entities']

    # Move the inputs to the model's own device rather than a hardcoded "cuda".
    model_input = eval_tokenizer(eval_prompt, return_tensors="pt").to(model.device)

    with torch.no_grad():
        output_ids = model.generate(
            **model_input,
            max_new_tokens=256,
            repetition_penalty=1.15,
            pad_token_id=eval_tokenizer.eos_token_id,
        )[0]

    # The decoded text holds prompt + completion; keep only the completion.
    response = eval_tokenizer.decode(output_ids, skip_special_tokens=True)
    response = response.replace(eval_prompt, "")

    response_dict = parse_response(response)

    response_data = {
        'id': entry['id'],
        'sentence': entry['sentence'],
        'entities': ground_truth,
        'base_model': {
            'response': response,
            'response_dict': response_dict,
        },
    }

    # Save response data to JSONL file. The file is reopened for every row so
    # each record is written and closed before the next generation starts.
    with jsonlines.open(output_file, mode='a') as writer:
        writer.write(response_data)


100%|██████████| 7/7 [00:30<00:00,  4.33s/it]


In [59]:
responses= []
with jsonlines.open('data/base_model_responses.jsonl', mode='r') as reader:
    for item in reader:
        responses.append(item)

In [60]:
len(responses)

9

In [61]:
responses

[{'id': '409',
  'sentence': 'Hartford 4 BOSTON 2',
  'entities': {'LOC': [],
   'MISC': [],
   'ORG': ['Hartford', 'BOSTON'],
   'PER': []},
  'base_model': {'response': " Based on the given sentence 'Hartford 4 BOSTON 2', there are no explicit person or miscellaneous entities mentioned. Therefore, the output will only contain location and organization entities, which in this case are both cities.\n\n{'LOC': ['Hartford', 'Boston'], 'ORG': [], 'PER': [], 'MISC': []}",
   'response_dict': {'LOC': ['Hartford', 'Boston'],
    'ORG': [],
    'PER': [],
    'MISC': []}}},
 {'id': '125',
  'sentence': 'S. Doull c subs ( M. Wasim ) b Waqar 1',
  'entities': {'LOC': [],
   'MISC': [],
   'ORG': [],
   'PER': ['S. Doull', 'M. Wasim', 'Waqar']},
  'base_model': {'response': " Based on the given sentence, I cannot identify any clear-cut PER, ORG, LOC or MISC entities. The sentence appears to contain names and abbreviations but lacks sufficient context for accurate identification. Therefore, my an