In [1]:
from langchain.prompts.example_selector import SemanticSimilarityExampleSelector
from langchain.vectorstores import Chroma
from langchain_community.embeddings import OllamaEmbeddings
from langchain.prompts import FewShotPromptTemplate, PromptTemplate
from langchain_community.llms import Ollama
from langchain.output_parsers import StructuredOutputParser, ResponseSchema
from langchain.chains import LLMChain
import pandas as pd
import numpy as np
import time

In [2]:
# model: llama 2,  mistral, medllama2
model = "mistral"
embedding_model = "mistral"

# taxonomy
taxonomy_file_name = "sirch_json_1.txt"

# examples
example_set_size = 10
example_data_file_name = "data.csv"
example_batch_size = 2
num_examples = 2

# input/output
extract_index = 0
prompt_index = 4

In [3]:
# prompts
prompt_0 = '''Assume the role of a medical expert. Using the following taxonomy: {taxonomy} of human factors containing factors, subfactors and subsubfactors, I want you to find a list of the most relevant text segments to annotate in the following medical case: {text_extract}. Then for relevant text segments in the medical case, find the most fitting labels using the taxonomy to give the output as list of human factors in the following format: ("text extract 1": ["factor1", "factor2", ...], ...) and nothing else.'''
# added \n (line breaks) and used gpt4 to reword initial prompt
prompt_1 = '''Imagine you are a medical expert. Given the following taxonomy:\n {taxonomy}\n of human factors containing factors, subfactors, and subsubfactors, your task is to find relevant text segments in this medical case:\n {text_extract}\n. Then, for each relevant text segment, identify the most fitting labels using the taxonomy. Present the output as a list of human factors in the following format: ("text extract 1": ["factor1", "factor2", ...], ...)'''
# moved taxonomy to start
prompt_2 = '''TAXONOMY: {taxonomy}\n\nImagine you are a medical expert. Using the taxonomy containing human factors containing factors, subfactors, and subsubfactors, your task is to find relevant text segments in the medical case:\n {text_extract}\n. Then, for each relevant text segment, identify the most fitting labels using the taxonomy. Present the output as a list of human factors in the following format: {{'TEXT_EXTRACT_1': ["FACTOR_1", "FACTOR_2", ...], ...}}  '''
# moved medical extract to start
prompt_3 = '''TAXONOMY: {taxonomy}\n\nMEDICAL CASE: {text_extract}\n\n Imagine you are a medical expert. Using the taxonomy containing human factors containing factors, subfactors, and subsubfactors, your task is to find relevant text segments in the medical case. Then, for each relevant text segment, identify the most fitting labels using the taxonomy. Present the output as a list of human factors in the following format: {{'TEXT_EXTRACT_1': ["FACTOR_1", "FACTOR_2", ...], ...}}  '''
# added output parser instead of manual format
prompt_4 = '''TAXONOMY: {taxonomy}\n\nMEDICAL CASE: {text_extract}\n\n Imagine you are a medical expert. Using the taxonomy containing human factors containing factors, subfactors, and subsubfactors, your task is to find relevant text segments in the medical case. Then, for each relevant text segment, identify the most fitting labels using the taxonomy. Present the output as a list of human factors in the following format: {{'TEXT_EXTRACT_1': ["FACTOR_1", "FACTOR_2", ...], ...}}  '''


prompts = [prompt_0, prompt_1, prompt_2, prompt_3, prompt_4]
prompt = prompts[prompt_index]

In [4]:
# medical extracts
extract_0 = "Time pressures in triage led to missed opportunities to identify mothers’ additional needs and risks, resulting in delays in recognizing abnormal fetal heart rate tracings and decisions for delivery."
extract_1 = "The IOL proforma was not fully completed and staff delivering care did not follow the correct procedure for undertaking observations on the Mother and Baby."

extracts = [extract_0, extract_1]
extract = extracts[extract_index]

In [5]:
df = pd.read_csv(example_data_file_name)
df.head()

Unnamed: 0,FileID,Code,GPT rephrased sentence 1,GPT rephrased sentence 2,GPT rephrased sentence 3,GPT rephrased sentence 4,GPT rephrased sentence 5
0,1,"Assessment, investigation, testing, screening ...",The Mother was diagnosed with hypertension at ...,The Mother was diagnosed with hypertension at ...,The Mother was diagnosed with hypertension at ...,The Mother was diagnosed with hypertension at ...,The Mother was diagnosed with hypertension at ...
1,2,Physical characteristics,The Mother was diagnosed with hypertension at ...,The Mother was diagnosed with hypertension at ...,The Mother was diagnosed with hypertension at ...,The Mother was diagnosed with hypertension at ...,The Mother was diagnosed with hypertension at ...
2,3,Guidance,The Mother was diagnosed with hypertension at ...,The Mother was diagnosed with hypertension at ...,The Mother was diagnosed with hypertension at ...,The Mother was diagnosed with hypertension at ...,The Mother was diagnosed with hypertension at ...
3,4,Escalation/referral factor (including fresh ey...,A senior obstetrician did not make the decisio...,A senior obstetrician did not make the decisio...,The senior obstetrician did not have time to m...,The senior obstetrician was too busy to make a...,The senior obstetrician was unable to make a d...
4,5,"Acuity (e.g., capacity of the maternity unit a...",A senior obstetrician did not make the decisio...,A senior obstetrician did not make the decisio...,The senior obstetrician did not have time to m...,The senior obstetrician was too busy to make a...,The senior obstetrician was unable to make a d...


In [6]:
grouped = df[["Code", "GPT rephrased sentence 1"]].groupby('GPT rephrased sentence 1')['Code'].apply(list).reset_index(name='Labels')
grouped.head()

Unnamed: 0,GPT rephrased sentence 1,Labels
0,A MEOWS score was calculated on several occasi...,"[Assessment, investigation, testing, screening..."
1,A combination of handwritten and electronic an...,"[Risk assessment, Documentation]"
2,A debrief did not take place because it was to...,[Communication]
3,A decision to rescue the intravenous cannula r...,[Functionality (including cleaning and PPE)]
4,A decision was made to transfer Abigail to the...,[Equipment failure]


In [7]:
taxonomy = ""
with open(taxonomy_file_name, 'r') as file:
    taxonomy = file.read().replace('\n', ' ')#.replace(" ", "")

In [8]:
batches = np.array_split(grouped, grouped.shape[0] / example_batch_size)

examples = []
for batch in batches[:example_set_size]:
    inp = ""
    out = ""
    for index, item in batch.iterrows():
        inp += " " + item['GPT rephrased sentence 1']
        label = ', '.join(f'"{i}"' for i in item['Labels'])
        out += ', "' + item['GPT rephrased sentence 1'] +  '''": [''' + label + ']'
    example = {"input": inp[1:], "output": "{{" + out[2:] + "}}"}
    examples.append(example)

# examples[:3]

  return bound(*args, **kwds)


In [9]:
llm = Ollama(model=model) 

example_prompt = PromptTemplate(
    input_variables=["input", "output"],
    # input_variables=["input", "taxonomy", "output"],
    template="Example Input: {input}\nExample Output: {output}",
    # template="Example Input: {input}\nExample Taxonomy: {taxonomy}\nExample Output: {output}",
)

In [10]:
example_selector = SemanticSimilarityExampleSelector.from_examples(
    examples, 
    OllamaEmbeddings(model=embedding_model), 
    Chroma, 
    k=num_examples
)

In [11]:
similar_prompt = FewShotPromptTemplate(
    example_selector=example_selector,
    example_prompt=example_prompt,
    prefix=prompt,
    suffix="Input: {text_extract}\nOutput:",
    input_variables=["text_extract"],
)

print(similar_prompt.format(text_extract=extract, taxonomy=taxonomy))

TAXONOMY: {"taxonomy":[{"title":"External Environment","factors":[{"title":"Policy factor"},{"title":"Societal factor"},{"title":"Economic factor"},{"title":"COVID"},{"title":"Geographical factor (e.g. Location of patient)"}]},{"title":"Internal Environment","factors":[{"title":"Physical layout and Environment"},{"title":"Acuity (e.g., capacity of the maternity unit as a whole)"},{"title":"Availability (e.g., operating theatres)"},{"title":"Time of day (e.g., night working or day of the week)"},{"title":"Policy"}]},{"title":"Organisation","factors":[{"title":"Team culture factor (e.g., patient safety culture)"},{"title":"Incentive factor (e.g., performance evaluation)"},{"title":"Teamworking"},{"title":"Communication factor","sub-factors":[{"title":"Between staff"},{"title":"Between staff and patient (verbal)"}]},{"title":"Documentation"},{"title":"Escalation/referral factor (including fresh eyes reviews)"},{"title":"National and/or local guidance"},{"title":"Language barrier"}]},{"tit

In [13]:
chain = similar_prompt | llm

start_time = time.time()
results = []
for x in range(10):
    output = chain.invoke({"text_extract": extract, "taxonomy": taxonomy})
    results.append(output)
end_time = time.time()

print("Time taken", end_time - start_time, "seconds")

[' {"Time pressures in triage led to missed opportunities to identify mothers’ additional needs and risks, resulting in delays in recognizing abnormal fetal heart rate tracings and decisions for delivery.": ["Policy (e.g., understaffing or inadequate resources)", "Assessment, investigation, testing, screening (e.g., holistic review)", "Risk assessment", "Situation awareness (e.g., loss of helicopter view)"]}',
 ' {"Time pressures in triage led to missed opportunities to identify mothers’ additional needs and risks,": ["Policy factor (e.g., inadequate resources or staffing)", "Assessment, investigation, testing, screening (e.g., failure to conduct a comprehensive assessment)"]}',
 ' {"Time pressures in triage led to missed opportunities to identify mothers’ additional needs and risks, resulting in delays in recognizing abnormal fetal heart rate tracings and decisions for delivery.": ["Policy (e.g., triage policy)", "Assessment, investigation, testing, screening (e.g., failure to perform

In [None]:
results

In [16]:
from langchain.output_parsers import StructuredOutputParser, ResponseSchema
# Parse the first item of results into JSON
parser = StructuredOutputParser()
json_output = parser.parse(results[0])

# Print the JSON output
print(json_output)

ValidationError: 1 validation error for StructuredOutputParser
response_schemas
  field required (type=value_error.missing)