### 4CE pipeline final

- This notebook:
1. Takes the content of each site's submitted documents
2. Builds a prompt of 14 questions and combines it with the text body of each note
3. Submits the above to a GPT-4 instance hosted on Azure
4. Compiles the results into a spreadsheet for each site
- The filetypes can be `.txt, .pdf`
- Each Azure deployment may have a different name for the inference model used
- The output spreadsheets contain the full details of the request and response for each note

- The `../config.ini` file must contain an `[OpenAI]` section with the endpoint settings (`api_key`, `api_type`, `api_base`, `api_version`)

In [None]:
import os 
import tiktoken

# Tokenizer used to estimate how many tokens each note will consume.
encoding = tiktoken.encoding_for_model("gpt-4")

path = "../../Data/" # folder containing the patient notes

# Entries of the data folder that are not site folders.
# Filtering (instead of list.remove) does not fail when an entry is absent.
excluded_entries = {'.DS_Store', 'Zips'}

# Keep only real directories; sorted so the site order is reproducible
# (os.listdir returns entries in arbitrary order).
supfolders = sorted(
    f for f in os.listdir(path)
    if f not in excluded_entries and os.path.isdir(os.path.join(path, f))
)

# One "<site>/<subfolder>" entry per site: the first subfolder whose name has no dot.
dirnames = []

for f in supfolders:
    site_subfolders = sorted(
        sf for sf in os.listdir(f"{path}/{f}")
        if '.' not in sf and os.path.isdir(os.path.join(path, f, sf))
    )
    if not site_subfolders:
        # A site without a notes subfolder is reported and skipped rather than
        # aborting the whole cell with an IndexError.
        print(f"skipping {f}: no notes subfolder found")
        continue
    dirnames.append(f + '/' + site_subfolders[0])

system_prompt = "You are a medical assistant."

In [None]:
# Sanity check: list the "<site>/<subfolder>" entries collected in `dirnames`.
print(dirnames)

In [None]:
from pypdf import PdfReader
import chardet
import re

def _read_note(note_path):
    """Return the text of a `.pdf` or `.txt` note.

    Returns None when the file is neither a `.pdf` nor a `.txt`, so the caller
    can report it as unsupported.
    """
    if note_path.endswith('.pdf'):
        reader = PdfReader(note_path)
        return ''.join(page.extract_text() for page in reader.pages)
    if note_path.endswith('.txt'):
        with open(note_path, 'rb') as f:
            raw = f.read()
        # chardet reports encoding None when it cannot decide (e.g. empty file);
        # decoding with None would raise a TypeError.
        detected = chardet.detect(raw)['encoding']
        if detected:
            return raw.decode(detected)
        return raw.decode('utf-8', errors='replace')
    return None


files = []   # one sorted list of note paths per site, in `dirnames` order
print("Name", "Length", "Tokens", "Words")
tokens = {}  # site -> token count of every non-empty note
bodies = {}  # site -> word count of every non-empty note
for dname in dirnames:
    site_token_counts = []
    site_word_counts = []
    full_path = path + dname
    files_per_site = []
    for filename in os.listdir(full_path):
        direct_path = os.path.join(full_path, filename)
        body = _read_note(direct_path)
        if body is None:
            # Unsupported file type: report it and move on. Only files that are
            # neither .pdf nor .txt end up here.
            print(direct_path)
            continue
        files_per_site.append(direct_path)
        if len(body) > 0:
            n_tokens = len(encoding.encode(body))
            n_words = len(re.findall(r'\w+', body))
            site_token_counts.append(n_tokens)
            site_word_counts.append(n_words)
            # One row per note, matching the header printed above.
            print(filename, len(body), n_tokens, n_words)
    tokens[dname] = site_token_counts
    bodies[dname] = site_word_counts
    files_per_site.sort()
    files.append(files_per_site)
    print("---------")

print(files)

print(tokens)

In [None]:
import statistics
import numpy as np

def _print_median_iqr(title, counts_per_site):
    """Print the median and interquartile range of the counts of every site.

    title: label printed above the per-site lines.
    counts_per_site: dict mapping a site name to a list of numeric counts.
    Sites with an empty list are reported as "no data", because
    statistics.median raises on empty input.
    """
    print(f"----- \n{title}")
    for site, data in counts_per_site.items():
        if not data:
            print(site, "no data")
            continue
        med = statistics.median(data)
        q3, q1 = np.percentile(data, [75, 25])
        iqr = q3 - q1
        print(site, f"{med}, {iqr}")


_print_median_iqr("Tokens", tokens)
_print_median_iqr("Words", bodies)

Once we find that the note lengths are comfortably under the token limit, we can go ahead and submit these as-is in the API request.

Initialization:

In [None]:
import openai
import configparser

# Endpoint settings are read from the [OpenAI] section of this INI file.
config_file = "../config.ini"

config = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list of
# files it did parse, so check the result to fail with a clear message.
if not config.read(config_file):
    raise FileNotFoundError(f"Could not read the configuration file {config_file}")
if "OpenAI" not in config:
    raise KeyError(f"{config_file} has no [OpenAI] section")

# Copy every required setting onto the openai module.
for setting in ("api_key", "api_type", "api_base", "api_version"):
    setattr(openai, setting, config["OpenAI"][setting])

print('Deployment IDs and capabilities:')
deployment_id = None
result = openai.Deployment.list()

for deployment in result.data:
    print(f"{deployment['id']}: {openai.Model.retrieve(deployment['model'])['capabilities']}")

# Name of the deployment to query; replace the placeholder before running the requests.
modelname = "<your_model_name>"

In [None]:
# Questions about study eligibility.
inclusion = [
    "What is the patient's BMI?",
    "Is the patient within the age range of the study (18 to 65)?",
    "Was the primary reason for this visit to treat the patient for COVID-19 disease?",
    (
        "Does the patient adhere to the inclusion criteria of the study? \n"
        " >>>>Inclusion criteria: Patients aged 18 to 65, both male and female, with obesity and history of or with active COVID.\n"
        " >>>>Exclusion criteria: Patients outside the age range, without obesity or COVID.\n"
        " If the answer is no, tell me which criteria it did not meet."
    ),
    "Was the main reason for hospitalization covid? This includes patients who developed complications due to covid.",
]

# Questions whose answer may be absent from the note.
novsunknown = [
    'Does the patient have covid on admission? If this is not available in the notes, return "not found in notes".',
    "Does the patient require immediate medical attention?",
    "Is the patient obese? which threshold/definition did you use to determine this?",
    "What is the patient's age?",
]

# Questions grouped under the name "hallucination".
hallucination = [
    "What is the patient's gender?",
    "Is this an admission note?",
    "Does the patient have diabetes?",
    (
        "Apply the prevailing USPSTF Guidelines for Prediabetes and Type 2 Diabetes to this patient. \n"
        "This guideline states: 'The USPSTF recommends screening for prediabetes and type 2 diabetes in adults aged 35 to 70 years who have overweight or obesity. Clinicians should offer or refer patients with prediabetes to effective preventive interventions.'\n"
        " Based on this guideline, should the clinician offer or refer this patient for preventive intervention? Provide reasons to support your answer. "
    ),
]

# Question about inconsistencies and typos in the note itself.
typos = [
    "Do you notice any inconsistencies or typos in the patient note? If you do, reprint them and specify of what kind they are.",
]


In [None]:
# Flatten the four question groups into the single ordered list used for numbering.
questions = [*inclusion, *novsunknown, *hallucination, *typos]

In [None]:
# Render the questions as a numbered list, one "<n>. <question>" line each.
qbody = "".join(
    f"{number}. {question}\n"
    for number, question in enumerate(questions, start=1)
)

print(qbody)

In [None]:
# Prompt template with two positional slots: the numbered questions, then the note text.
instructions = (
    "From the provided document below, answer the following questions: {}\n"
    "Return the list order number for each of your answers and make sure to briefly specify at the end whether you found the answer in the document or not.\n"
    " \n"
    "------\n"
    "{}\n"
    "------"
)


In [None]:
import pandas as pd
import time
from datetime import datetime
import re

# Columns of the per-site "GPT-4" sheet holding the full request/response details.
RESPONSE_COLUMNS = ["ID", "Document Size", "Input", "Output", "Tokens In", "Tokens Out", "Total Tokens", "Time"]

# Empty rows added under each note's answers for the reviewers to fill in.
FEEDBACK_ROWS = [
    "Is the information available in the note?",
    "Can the information be inferred?",
    "Do you agree with GPT's answer?",
    "What is your answer?",
    "Comments",
]


def _load_note_text(note_path):
    """Return the text of a note, extracting it page by page for `.pdf` files."""
    if note_path.endswith('.pdf'):
        reader = PdfReader(note_path)
        return ''.join(page.extract_text() + '\n' for page in reader.pages)
    with open(note_path, 'rb') as f:
        raw = f.read()
    # Detect the encoding instead of assuming latin-1, which garbles UTF-8 notes.
    # latin-1 remains the fallback because it can decode any byte sequence.
    detected = chardet.detect(raw)['encoding']
    try:
        return raw.decode(detected) if detected else raw.decode('latin-1')
    except (UnicodeDecodeError, LookupError):
        return raw.decode('latin-1')


def _split_answers(output, n_questions):
    """Split a numbered model response into one answer per question.

    Answer i is the text between the markers "<i>. " and "\\n<i+1>. ". Only the
    first occurrence of a marker is used, so text such as "BMI is 31. " inside
    an answer does not truncate it. A missing marker yields an empty answer
    instead of raising.
    """
    answers = []
    for i in range(1, n_questions + 1):
        # The first answer may start the response, so its marker has no leading newline.
        marker = f"{i}. " if i == 1 else f"\n{i}. "
        pieces = output.split(marker, 1)
        if len(pieces) < 2:
            answers.append("")
            continue
        answers.append(pieces[1].split(f"\n{i+1}. ", 1)[0])
    return answers


def _append_sheet(frame, workbook_path, sheet_name):
    """Add `frame` as a new sheet of the workbook, creating the file if needed."""
    try:
        with pd.ExcelWriter(workbook_path, mode='a', if_sheet_exists='new') as writer:
            frame.to_excel(writer, sheet_name=sheet_name)
    except FileNotFoundError:
        frame.to_excel(workbook_path, sheet_name=sheet_name)


for site in files:
    if not site:
        # No readable note for this site: nothing to submit and no site id to derive.
        print("skipping a site without notes")
        continue

    tstamp = datetime.now().strftime('%Y-%m-%d %H꞉%M꞉%S')
    site_id = site[0].replace(path, "").split("/")[0]
    workbook_path = f"{site_id}_results_{tstamp}.xlsx"

    # Submit every note (PDF and text alike) and collect one record per note.
    records = []
    for filename in site:
        start_time = time.time()
        note_content = _load_note_text(filename)
        note_id = os.path.basename(filename).replace(".pdf", "")
        print(note_id)
        if not note_content.strip():
            # Nothing to ask about (e.g. a PDF with no extractable text).
            print(f"skipping {note_id}: empty note")
            continue

        storage = {}
        storage["ID"] = note_id
        storage["Document Size"] = len(note_content)
        storage["Input"] = instructions.format(qbody, note_content)
        llm_call = openai.ChatCompletion.create(
            engine=modelname, # Model name specific to each Azure GPT-4 instance
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": storage["Input"]},
            ],
            temperature=0.2,
            top_p=0.95,
            frequency_penalty=0,
            presence_penalty=0,
            stop=None)
        storage["Output"] = llm_call.choices[0].message.content
        storage["Tokens In"] = llm_call.usage.prompt_tokens
        storage["Tokens Out"] = llm_call.usage.completion_tokens
        storage["Total Tokens"] = llm_call.usage.total_tokens
        storage["Time"] = time.time() - start_time
        records.append(storage)

    # Build the frame once instead of growing it row by row.
    full_responses = pd.DataFrame(records, columns=RESPONSE_COLUMNS)

    #write the excel

    # One sheet per note: the answers in question order plus empty feedback rows.
    for record in records:
        ordered_responses = _split_answers(record["Output"] or "", len(questions))
        if "" in ordered_responses:
            print(f"warning: could not locate every numbered answer for {record['ID']}")
        sheet_rows = [[record["ID"]] + ordered_responses]
        sheet_rows += [[label] + [""] * len(questions) for label in FEEDBACK_ROWS]
        indetails = pd.DataFrame(sheet_rows, columns=["ID"] + questions)
        _append_sheet(indetails.T, workbook_path, f"{record['ID']}")

    # Final sheet with the raw request/response details of the whole site.
    _append_sheet(full_responses, workbook_path, 'GPT-4')

    print(f"done {site_id}") 