In [None]:
from langchain.prompts.example_selector import SemanticSimilarityExampleSelector
from langchain.vectorstores import Chroma
from langchain_community.embeddings import OllamaEmbeddings
from langchain.prompts import FewShotPromptTemplate, PromptTemplate
from langchain_community.llms import Ollama
from langchain.output_parsers import StructuredOutputParser, ResponseSchema
from langchain_core.exceptions import OutputParserException
from langchain.chains import LLMChain
from collections import namedtuple, defaultdict
import pandas as pd
import numpy as np
import time
import json

### Params

In [None]:
# Ollama model tags. Alternatives noted by the author: llama 2, mistral, medllama2
model = "biomistral"  # passed to Ollama(model=...) when the LLM is created
embedding_model = "biomistral"  # passed to OllamaEmbeddings(model=...) for the example selector

# taxonomy: text file whose whole content is substituted for {taxonomy} in the prompt
taxonomy_file_name = "sirch_csv_1.txt" # "sirch_json_6m.txt"

# examples
use_example_selector = False  # True -> choose few-shot examples by semantic similarity
example_set_size = 2  # only referenced by the commented-out batching cell
example_data_file_name = "data.csv"  # CSV read with pandas into `df`
example_batch_size = 1  # only referenced by the commented-out batching cell
num_examples = 2  # `k` handed to the semantic-similarity example selector

# input/output
extract_index = 8  # NOTE(review): not referenced anywhere in the visible cells
prompt_index = 17  # index into the `prompts` list built in the prompts cell

In [None]:
# prompts
# Each variant is used as the prefix of the few-shot template. Placeholders:
#   {taxonomy}            - flattened taxonomy text
#   {text_extract}        - the medical text to annotate
#   {format_instructions} - output-parser instructions (prompt_4 onwards)
# Doubled braces ({{ }}) are literal braces escaped for template formatting.
prompt_0 = '''Assume the role of a medical expert. Using the following taxonomy: {taxonomy} of human factors containing factors, subfactors and subsubfactors, I want you to find a list of the most relevant text segments to annotate in the following medical case: {text_extract}. Then for relevant text segments in the medical case, find the most fitting labels using the taxonomy to give the output as list of human factors in the following format: ("text extract 1": ["factor1", "factor2", ...], ...) and nothing else.'''
# added \n (line breaks) and used gpt4 to reword the initial prompt
prompt_1 = '''Imagine you are a medical expert. Given the following taxonomy:\n {taxonomy}\n of human factors containing factors, subfactors, and subsubfactors, your task is to find relevant text segments in this medical case:\n {text_extract}\n. Then, for each relevant text segment, identify the most fitting labels using the taxonomy. Present the output as a list of human factors in the following format: ("text extract 1": ["factor1", "factor2", ...], ...)'''
# moved the taxonomy to the start of the prompt
prompt_2 = '''TAXONOMY: {taxonomy}\n\nImagine you are a medical expert. Using the taxonomy containing human factors containing factors, subfactors, and subsubfactors, your task is to find relevant text segments in the medical case:\n {text_extract}\n. Then, for each relevant text segment, identify the most fitting labels using the taxonomy. Present the output as a list of human factors in the following format: {{'TEXT_EXTRACT_1': ["FACTOR_1", "FACTOR_2", ...], ...}}  '''
# moved the medical extract to the start as well (directly after the taxonomy)
prompt_3 = '''TAXONOMY: {taxonomy}\n\nMEDICAL CASE: {text_extract}\n\nImagine you are a medical expert. Using the taxonomy containing human factors containing factors, subfactors, and subsubfactors, your task is to find relevant text segments in the medical case. Then, for each relevant text segment, identify the most fitting labels using the taxonomy. Present the output as a list of human factors in the following format: {{'TEXT_EXTRACT_1': ["FACTOR_1", "FACTOR_2", ...], ...}}  '''
# replaced the hand-written output format with the output parser's {format_instructions}
prompt_4 = '''TAXONOMY: {taxonomy}\n\nMEDICAL CASE: {text_extract}\n\nImagine you are a medical expert. Using the taxonomy containing human factors containing factors, your task is to find relevant text segments in the medical case. Then, for each relevant text segment, identify the most fitting labels using the taxonomy.\n\n{format_instructions}'''

# Rewordings of prompt_4 (presumably GPT-generated, judging by the names - confirm).
# gpt_prompt_5 has exactly the same text as prompt_4.
gpt_prompt_5 = '''TAXONOMY: {taxonomy}\n\nMEDICAL CASE: {text_extract}\n\nImagine you are a medical expert. Using the taxonomy containing human factors containing factors, your task is to find relevant text segments in the medical case. Then, for each relevant text segment, identify the most fitting labels using the taxonomy.\n\n{format_instructions}'''
gpt_prompt_6 = '''TAXONOMY: {taxonomy}\n\nMEDICAL SCENARIO: {text_extract}\n\nEnvision yourself as a healthcare specialist. Your task is to locate relevant text segments in the medical scenario using the given taxonomy. Subsequently, label each segment with the most suitable category from the taxonomy.\n\n{format_instructions}'''
gpt_prompt_7 = '''TAXONOMY: {taxonomy}\n\nMEDICAL TEXT: {text_extract}\n\nAssume the role of a medical expert. Your mission is to find and label relevant portions of the medical text using the taxonomy provided.\n\n{format_instructions}'''
gpt_prompt_8 = '''TAXONOMY: {taxonomy}\n\nCASE DESCRIPTION: {text_extract}\n\nImagine being a medical practitioner. Your objective is to identify relevant text segments in the case description using the taxonomy. Then, assign the most fitting labels from the taxonomy to each segment.\n\n{format_instructions}'''
gpt_prompt_9 = '''TAXONOMY: {taxonomy}\n\nMEDICAL REPORT: {text_extract}\n\nConsider yourself as a medical analyst. Your job is to find significant text segments in the medical report using the taxonomy. Then, label each segment with the most appropriate category from the taxonomy.\n\n{format_instructions}'''
# The next two variants contain no {text_extract}; the extract reaches the model
# only through the "Input: {text_extract}" suffix of the few-shot template.
gpt_prompt_10 = '''Consider yourself as a medical analyst. Your job is to find significant text segments in the medical report using the taxonomy and definitions. Then, label each segment with the most appropriate category from the taxonomy.\n\nTAXONOMY:\n\n{taxonomy}\n\n{format_instructions}'''
gpt_prompt_11 = '''Assume the role of a medical expert. Your mission is to find and label relevant portions of the medical text using the taxonomy provided.\n\nTAXONOMY:\n\n{taxonomy}\n\n{format_instructions}'''

# test prompts
# The same three building blocks (instruction, taxonomy, format instructions)
# in all six possible orders, to compare the effect of ordering.

inst = 'Annotate the text extract using the taxonomy.'

# instruction first
test_prompt12 = inst + '\n\nTAXONOMY: {taxonomy}\n\n{format_instructions}'
test_prompt13 = inst + '\n\n{format_instructions}\n\nTAXONOMY: {taxonomy}'

# instruction in the middle
test_prompt14 = 'TAXONOMY: {taxonomy}\n\n' + inst + '\n\n{format_instructions}'
test_prompt15 = '{format_instructions}\n\n' + inst + '\n\nTAXONOMY: {taxonomy}'

# instruction last
test_prompt16 = '{format_instructions}\n\nTAXONOMY: {taxonomy}\n\n' + inst 
test_prompt17 = 'TAXONOMY: {taxonomy}\n\n{format_instructions}\n\n' + inst

test_prompts = [test_prompt12, test_prompt13, test_prompt14, test_prompt15, test_prompt16, test_prompt17]
# Index map: 0-4 -> prompt_0..prompt_4, 5-11 -> gpt_prompt_5..gpt_prompt_11,
# 12-17 -> test_prompt12..test_prompt17 (the numeric suffix equals the list index).
prompts = [prompt_0, prompt_1, prompt_2, prompt_3, prompt_4, gpt_prompt_5, gpt_prompt_6, gpt_prompt_7, gpt_prompt_8, gpt_prompt_9, gpt_prompt_10, gpt_prompt_11] + test_prompts
# Raises IndexError if prompt_index is outside the list.
prompt = prompts[prompt_index]
prompt  # bare expression: shows the selected prompt as the notebook cell output

### Loading taxonomy

In [None]:
# Read the whole taxonomy file and flatten it onto one line: every newline
# becomes a single space, existing spaces are left untouched.
taxonomy = ""
with open(taxonomy_file_name) as taxonomy_file:
    taxonomy = " ".join(taxonomy_file.read().split("\n"))

### Loading example data

In [None]:
# Load the labelled example data; the next cell reads its "Code" and
# "GPT rephrased sentence 1" columns.
df = pd.read_csv(example_data_file_name)
df.head()  # notebook preview of the first rows

In [None]:
# Collapse the table to one row per sentence, collecting every "Code" value
# attached to that sentence into a list stored in a new "Labels" column.
sentence_column = "GPT rephrased sentence 1"
codes_per_sentence = df[["Code", sentence_column]].groupby(sentence_column)["Code"]
grouped = codes_per_sentence.apply(list).reset_index(name="Labels")
grouped.head()

### Combine examples into batches

Batching is needed because each row of the data maps a single sentence to a list of labels.

But there may be multiple sentences, each with their own list of labels.

This batched data will map a list of sentences to a list of labels for each sentence.

In [None]:
# batches = np.array_split(grouped, grouped.shape[0] / example_batch_size)

# examples = []
# for batch_index, batch in enumerate(batches[:example_set_size]):
#     inp = ""
#     out = ""    
#     for index, item in batch.iterrows():
#         inp += " " + item['GPT rephrased sentence 1']
#         label = ', '.join(f'"{i}"' for i in item['Labels'])
#         # out += ', "' + item['GPT rephrased sentence 1'] +  '''": [''' + label + ']'
#         out += ', "text_extract": "' + item['GPT rephrased sentence 1'] +  '''",\n\t"factors": [''' + label + ']'

#     # example = {"input": inp[1:], "output": "{{" + out[2:] + "}}"}
#     # format as json
#     example = {"input": inp[1:], "output": "```json\n{{\n\t" + out[2:] + "\n}}\n```"}
#     examples.append(example)


# # print(examples[:2])
# # print(examples[1]["output"])


# examples

### Creating model

In [None]:
# LLM handle for the Ollama model selected in the params cell; it is composed
# with the few-shot prompt further down to form the chain.
llm = Ollama(model=model) 

### Output parser for parsing JSON from model output

In [None]:
# Parser for the structured answer expected from the model. Each tuple below
# is (field name, declared type, description) of one field of that answer.
output_parser = StructuredOutputParser.from_response_schemas(
    [
        ResponseSchema(name=field_name, type=field_type, description=field_description)
        for field_name, field_type, field_description in (
            ("text_extract", "string", "Text extract from the medical case"),
            ("factors", "List[string]", "List of factors associated with the text extract"),
        )
    ]
)

# Instruction text substituted for {format_instructions} in the prompt.
format_instructions = output_parser.get_format_instructions()
format_instructions

### Create few shot template

In [None]:
# Hard-coded few-shot examples. Each "output" mimics the fenced JSON answer the
# output parser expects. Braces are doubled, presumably so that they survive
# the final formatting pass of the assembled few-shot template.
examples = [{'input': 'A MEOWS score was calculated on several occasions with an incomplete set of observation parameters being recorded.', 'output': '```json\n{{\n\t"text_extract": "A MEOWS score was calculated on several occasions with an incomplete set of observation parameters being recorded.",\n\t"factors": ["Assessment, investigation, testing, screening (e.g., holistic review)"]\n}}\n```'}, {'input': 'A combination of handwritten and electronic antenatal healthcare records were used and not all of the mother s risk factors were highlighted in her electronic healthcare records.', 'output': '```json\n{{\n\t"text_extract": "A combination of handwritten and electronic antenatal healthcare records were used and not all of the mother s risk factors were highlighted in her electronic healthcare records.",\n\t"factors": ["Risk assessment", "Documentation"]\n}}\n```'}]

# How a single example is rendered inside the prompt.
example_prompt = PromptTemplate(
    input_variables=["input", "output"],
    template="Example Input: {input}\nExample Output: {output}",
)

if use_example_selector:
    # Pick the `num_examples` examples closest to the input text.
    example_selector = SemanticSimilarityExampleSelector.from_examples(
        examples,
        OllamaEmbeddings(model=embedding_model),
        Chroma,
        k=num_examples,
    )
    similar_prompt = FewShotPromptTemplate(
        example_selector=example_selector,
        # Needed to render the selected examples; the original selector branch
        # left it out, so the template could not be built in this mode.
        example_prompt=example_prompt,
        prefix=prompt,
        suffix="Input: {text_extract}\nOutput:",
        # `taxonomy` is declared because every prompt prefix contains
        # {taxonomy} and every caller supplies it.
        input_variables=["text_extract", "taxonomy"],
        partial_variables={"format_instructions": format_instructions},
    )
else:
    # No selector: always include the full fixed example list.
    example_selector = None
    similar_prompt = FewShotPromptTemplate(
        examples=examples,
        example_prompt=example_prompt,
        prefix=prompt,
        suffix="Input: {text_extract}\nOutput:",
        input_variables=["text_extract", "taxonomy"],
        partial_variables={"format_instructions": format_instructions},
    )

# Preview of the fully rendered prompt with a placeholder input.
print(similar_prompt.format(text_extract="INPUT WOULD GO HERE", taxonomy=taxonomy))

In [None]:
# Module-level accumulators filled by run_chain():
#   results        - every raw model output, including unparseable ones
#   parsed_results - only the outputs the output parser accepted
results, parsed_results = [], []

# Aggregated metrics returned by run_chain() for a single text extract.
ChainOutput = namedtuple(
    "ChainOutput",
    [
        "avg_precision",
        "std_precision",
        "avg_recall",
        "std_recall",
        "runs_list",
        "bad_format_count",
    ],
)

def count_tp_fp_fn(output, expected):
    """Compare predicted factors with the expected labels.

    Both sides are treated as sets of distinct labels. The previous version
    counted every repeated prediction as an extra true/false positive while
    deriving false negatives from distinct hits, and it counted duplicated
    expected labels as missing. That made tp, fp and fn mutually inconsistent
    and distorted precision and recall.

    Args:
        output: Parsed model answer; ``output["factors"]`` holds the predicted
            labels. A bare string is treated as a single label (rather than
            being iterated character by character) and ``None`` as no labels.
        expected: Iterable of ground-truth labels; duplicates are ignored.

    Returns:
        ``(tp, fp, fn)`` where ``tp`` and ``fp`` are ``defaultdict(int)``
        mapping each distinct correctly / wrongly predicted label to ``1``,
        and ``fn`` is the number of distinct expected labels not predicted.

    Raises:
        KeyError: if ``output`` has no ``"factors"`` key.
        TypeError: if a predicted or expected label is unhashable.
    """
    predicted = output["factors"]
    if predicted is None:
        predicted = []
    elif isinstance(predicted, str):
        predicted = [predicted]

    expected_labels = set(expected)
    tp, fp = defaultdict(int), defaultdict(int)

    # dict.fromkeys drops repeated predictions while keeping first-seen order.
    for factor in dict.fromkeys(predicted):
        if factor in expected_labels:
            tp[factor] = 1
        else:
            fp[factor] = 1

    fn = len(expected_labels) - len(tp)
    return tp, fp, fn

def run_chain(chain, extract, expected, num_results, debug=False):
    """Query the chain repeatedly for one extract and summarise the scores.

    The chain is invoked ``num_results`` times with ``extract`` and the
    module-level ``taxonomy``. Every raw output is appended to the module-level
    ``results`` list. Outputs accepted by ``output_parser`` are also appended
    to ``parsed_results`` and scored against ``expected``; rejected outputs
    only increase the bad-format counter and do not enter the averages.

    Args:
        chain: Object exposing ``invoke(dict)`` that returns the model output.
        extract: Text to annotate.
        expected: Expected labels for ``extract`` (joined with spaces for the
            progress print, so the items must be strings).
        num_results: Number of inferences to run.
        debug: When truthy, print per-run and aggregate details.

    Returns:
        ``ChainOutput`` with mean/std precision and recall over the parseable
        runs (all ``0`` when none parsed), the per-run records, and the number
        of unparseable outputs.
    """
    print("Expected:", " ".join(expected))

    separator = "----------------------------------------------------------"
    malformed = 0
    run_records = []
    precisions = []
    recalls = []

    for _ in range(num_results):
        raw_output = chain.invoke({"text_extract": extract, "taxonomy": taxonomy})
        results.append(raw_output)

        # An unparseable answer is counted and skipped.
        try:
            parsed = output_parser.parse(raw_output)
        except (TypeError, OutputParserException):
            malformed += 1
            continue
        parsed_results.append(parsed)

        record = {
            "expected_factors": list(expected),
            "predicted_factors": parsed["factors"],
        }

        tp, fp, fn_val = count_tp_fp_fn(output=parsed, expected=expected)
        tp_val, fp_val = sum(tp.values()), sum(fp.values())

        # Precision and recall fall back to 0 when their denominator is 0.
        predicted_total = tp_val + fp_val
        relevant_total = tp_val + fn_val
        run_precision = tp_val / predicted_total if predicted_total != 0 else 0
        run_recall = tp_val / relevant_total if relevant_total != 0 else 0

        record.update(
            tp=tp_val,
            fp=fp_val,
            fn=fn_val,
            precision=run_precision,
            recall=run_recall,
        )

        precisions.append(run_precision)
        recalls.append(run_recall)
        run_records.append(record)

        if debug:
            print(separator)
            print("Run info:", record)

    # Aggregate over the parseable runs only; with none, every figure is 0.
    if precisions:
        precision_values = np.array(precisions)
        recall_values = np.array(recalls)
        avg_precision, std_precision = precision_values.mean(), precision_values.std()
        avg_recall, std_recall = recall_values.mean(), recall_values.std()
    else:
        avg_precision = std_precision = avg_recall = std_recall = 0

    if debug:
        print(separator)
        print("Precision for input:", avg_precision)
        print("STD Precision for input:", std_precision)
        print("Recall for input:", avg_recall)
        print("STD Recall for input:", std_recall)

    return ChainOutput(
        avg_precision=avg_precision,
        std_precision=std_precision,
        avg_recall=avg_recall,
        std_recall=std_recall,
        runs_list=run_records,
        bad_format_count=malformed,
    )


def save_json(f_name, dict):
    """Write ``dict`` as JSON to the file ``f_name``, replacing any old content.

    Args:
        f_name: Path of the file to create or overwrite.
        dict: JSON-serialisable object to store. The name shadows the builtin
            inside this function; it is kept for caller compatibility.
    """
    with open(f_name, mode="w") as handle:
        json.dump(dict, handle)


### Running the chain

Loop over the text extracts and, for each one, compute the average precision and recall (with standard deviations) of the predicted labels across repeated inferences.

In [None]:
# first two used as examples
# Skip the first `offset` rows of `grouped` so they are not evaluated.
offset = 2
input_extracts = grouped["GPT rephrased sentence 1"].tolist()[offset:]
expected_codes = grouped["Labels"].tolist()[offset:]
# Few-shot prompt piped into the LLM.
chain = similar_prompt | llm

# parameters
num_results = 10  # inferences per extract
num_of_extracts = 5  # number of extracts evaluated (taken from the front of the list)

# for printing and saving output as a json
# Falsy value: suppresses the verbose prints here and inside run_chain().
debug = 0

# to measure prompt's ability to instruct the llm to generate a json output
bad_format_count_total = 0

# for calculating avg and std across multiple text extracts
# One entry per evaluated extract (per-extract mean / std over its runs).
list_precision = []
list_precision_std = []
list_recall = []
list_recall_std = []

# sentence: parsed_output, expected, tp, fp, fn, precision, recall
# Keyed by extract text; holds the run records and aggregates for that extract.
debug_output = {}

start_time = time.time()

# The string below is a no-op expression statement used as a block comment:
# for each of the first num_of_extracts extracts, generate num_results outputs
# and record the mean / std of precision and recall.
''' 
    Calculate precision, recall and std for num_of_extracts each extract 
    for each extract, generate num_results outputs and keep track of run metrics
'''
for input_extract, expected in zip(input_extracts[:num_of_extracts], expected_codes[:num_of_extracts]):
    print("Input:", input_extract)
    chain_output = run_chain(chain=chain, extract=input_extract, expected=expected, num_results=num_results, debug=debug)
    print("Precision:", chain_output.avg_precision, "Recall:", chain_output.avg_recall, "\n")
    
    debug_output[input_extract] = {"runs": chain_output.runs_list, 
                                   "avg_precision": chain_output.avg_precision, 
                                   "std_precision": chain_output.std_precision, 
                                   "avg_recall": chain_output.avg_recall, 
                                   "std_recall": chain_output.std_recall,
                                   "bad_format_count": chain_output.bad_format_count}
    
    bad_format_count_total += chain_output.bad_format_count

    list_precision.append(chain_output.avg_precision)
    list_precision_std.append(chain_output.std_precision)
    list_recall.append(chain_output.avg_recall)
    list_recall_std.append(chain_output.std_recall)

end_time = time.time()

if debug:
    print("----------------------------------------------------------")
    print("Debug output\n", debug_output)
          
# Written on every run, regardless of `debug`.
save_json("debug.json", dict(debug_output))

# Array copies of the per-extract lists; the plotting cell reads these names.
list_precision_np = np.array(list_precision)
list_precision_std_np = np.array(list_precision_std)
list_recall_np = np.array(list_recall)
list_recall_std_np = np.array(list_recall_std)

# The "STD" figures printed here are the mean of the per-extract standard
# deviations, not a standard deviation across extracts.
print("Avg precision:", list_precision_np.mean(), 
      "| STD precision:", list_precision_std_np.mean(), 
      "| Avg recall:", list_recall_np.mean(), 
      "| STD recall:", list_recall_std_np.mean())
print("Time taken:", end_time - start_time, "seconds")
# NOTE(review): `results` is not reset in this cell, so re-running it without
# re-running the cell that defines `results` makes the denominator include
# outputs from earlier runs - confirm this is intended.
print("Badly formatted jsons:", bad_format_count_total, "/", len(results))

### Results plots

In [None]:
import matplotlib.pyplot as plt

# One figure with a 2x2 grid of histograms, one per per-extract metric array.
plt.figure(figsize=(10, 6))

# (data, bar colour, panel title, x-axis label), listed in grid order 1..4.
histogram_specs = [
    (list_precision_np, 'blue', 'Distribution of list_precision_np', 'Precision'),
    (list_precision_std_np, 'green', 'Distribution of list_precision_std_np', 'Precision STD'),
    (list_recall_np, 'red', 'Distribution of list_recall_np', 'Recall'),
    (list_recall_std_np, 'orange', 'Distribution of list_recall_std_np', 'Recall STD'),
]

for panel, (values, colour, panel_title, x_label) in enumerate(histogram_specs, start=1):
    plt.subplot(2, 2, panel)
    plt.hist(values, bins=10, color=colour)
    plt.title(panel_title)
    plt.xlabel(x_label)
    plt.ylabel('Frequency')

plt.tight_layout()
plt.show()


### All results parsed as JSONs

In [None]:
# Bare expression: displays every successfully parsed model output as the cell result.
parsed_results

In [None]:
from difflib import SequenceMatcher

# Mean pairwise textual similarity of the raw model outputs in `results`.
# Every ordered pair is compared, including each output with itself (ratio
# 1.0), so the value is biased upwards for small result sets.
total_ratio = 0
for i in results:
    for j in results:
        total_ratio += SequenceMatcher(None, i, j).ratio()

# Average similarity over all len(results)**2 ordered pairs. `results` holds
# the outputs of every inference for every evaluated extract, not just one
# batch of inferences. An empty `results` yields 0.0 instead of raising
# ZeroDivisionError.
total_ratio / (len(results) ** 2) if results else 0.0

In [None]:
# Persist every successfully parsed model output to output.json (overwrites the file).
save_json("output.json", parsed_results)