## **READ BEFORE PROCEEDING**

- Inside the `input` and `output` directories, make sure you create a directory for each phenotype you plan on executing the ChatGPT code for
- You will need to upload the files generated by ATLAS into the appropriate directory and name them according to the filenames specified in the first code block for each phenotype of interest
- Create an OpenAI account, generate an OpenAI API key (you may need to deposit a small API balance to use the key), and paste the API key in the .env file located so that ChatGPT responses can be generated

In [67]:
import sys
!{sys.executable} -m pip install -q openai
!{sys.executable} -m pip install -q python-dotenv
!{sys.executable} -m pip install -q pandas


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [68]:
#Import necessary packages
from openai import OpenAI
from dotenv import load_dotenv
import os
import pandas as pd
import re

In [69]:
#Load API key from ENV file and get properties of interest
load_dotenv()
OPENAI_API_KEY = os.getenv("OPENAI_KEY")
#Instantiate OpenAI client
client = OpenAI(api_key = OPENAI_API_KEY)

In [70]:
def generate_prompt(disease, diseaseDescription, conceptName, domainDefinition, domain = "condition"):
    prompt = f"""You are a helpful medical expert. Your task is to assess whether an inputted {domain} is specific to an inputted malady. Specific means that if you have the {domain}, then you definitely have the malady, but if you have the malady, you may or may not have the {domain}.

A {domain} should be considered {domainDefinition}. 

A description of the malady is provided to assist you, but please use your extensive medical knowledge in addition to this description when performing the task at hand. Provide a “yes” or “no” answer, and explain your rationale for the answer you provide.

Here is the malady:
{disease}

Here is a brief description of the malady:
{diseaseDescription}

Here is the condition:
{conceptName}
"""
    return prompt

In [71]:
def generate_gpt_response(content:str, print_output=False)->str:
    
    completions = client.chat.completions.create(#A method that allows you to generate text-based chatbot responses using a pre-trained GPT language model.
        model="gpt-3.5-turbo", 
        temperature = 0, #controls the level of randomness or creativity in the generated text; . A higher temperature value will result in a more diverse and creative output, as it increases the probability of sampling lower probability tokens. 
#         max_tokens = 2000, #controls the maximum number of tokens (words or subwords) in the generated text.
#         stop = ['###'], #specifies a sequence of tokens that the GPT model should stop generating text when it encounters
        n = 1, #the number of possible chat completions or responses that the GPT model should generate in response to a given prompt
        messages=[
          {"role":"user", "content": content},
          ])

    # Displaying the output can be helpful if things go wrong
    if print_output:
        print(completions)

    # Return the first choice's text
    ##return completions.choices[0]['message']['content'] #I only want the first repsonses
    return completions.choices[0].message.content.strip()


In [72]:
def parse_output(response:str):
    outputDict = dict()
    if bool(re.match("yes", response, re.I)):
        outputDict["include"] = True
    else:
        outputDict["include"] = False
    outputDict["rationale"] = response
    return outputDict

In [None]:
#Formula to compute the sensitivity of proposed concept set
def compute_sens_and_spec(goldStandardPositives:list, goldStandardNegatives:list, proposedPositives:list, proposedNegatives:list)->float:
    #Convert string elements to numeric elements
    proposedPositives  = list(map(int, proposedPositives))
    proposedNegatives  = list(map(int, proposedNegatives))
    #Initialize variables
    truePositivesArr = sorted(set(proposedPositives).intersection(goldStandardPositives))
    truePositiveCount = len(truePositivesArr)
    falseNegativesArr = sorted(set(proposedNegatives).intersection(goldStandardPositives))
    falseNegativeCount = len(falseNegativesArr)
    trueNegativesArr = sorted(set(proposedNegatives).intersection(goldStandardNegatives))
    trueNegativeCount = len(trueNegativesArr)
    falsePositivesArr = sorted(set(proposedPositives).intersection(goldStandardNegatives))
    falsePositiveCount = len(falsePositivesArr)
    #Initialize dictionary to return      
    returnValuesDict = dict()
    returnValuesDict["sensitivity"] = truePositiveCount/(truePositiveCount + falseNegativeCount)
    returnValuesDict["truePositives"] = truePositivesArr
    returnValuesDict["falseNegatives"] = falseNegativesArr
    returnValuesDict["specificity"] = trueNegativeCount/(trueNegativeCount + falsePositiveCount)
    returnValuesDict["trueNegatives"] = trueNegativesArr
    returnValuesDict["falsePositives"] = falsePositivesArr
    return returnValuesDict

## Type 1 Diabetes

Initialize filepath variables to avoid hardcoding, variables related to the GPT prompt, and any DataFrames needed for storing results

In [61]:
#Define filepath variables
phoebePassOneFilepath = "input/Type1Diabetes/phoebe_concepts_pass_one.csv"
phoebePassTwoFilepath = "input/Type1Diabetes/phoebe_concepts_pass_two.csv"
gptAnswersPassOneFilepath = "output/Type1Diabetes/gpt_answers_pass_one.csv"
gptPositivesPassOneFilepath = "output/Type1Diabetes/gpt_positives_pass_one.txt"
gptNegativesPassOneFilepath = "output/Type1Diabetes/gpt_negatives_pass_one.txt"
gptAnswersPassTwoFilepath = "output/Type1Diabetes/gpt_answers_pass_two.csv"
gptPositivesPassTwoFilepath = "output/Type1Diabetes/gpt_positives_pass_two.txt"
gptNegativesPassTwoFilepath = "output/Type1Diabetes/gpt_negatives_pass_two.txt"
gptFinalFilepath = "output/Type1Diabetes/gpt_concepts_finals.csv"
goldStandardPositivesFilepath = "input/Type1Diabetes/gold_standard_positive_concepts.csv"
goldStandardNegativesFilepath = "input/Type1Diabetes/gold_standard_negative_concepts.csv"
#Define ChatGPT prompt information 
disease = "Type 1 Diabetes Mellitus"
diabetesDescription = "Type 1 Diabetes Mellitus (T1DM) is a chronic autoimmune disorder characterized by the destruction of insulin-producing beta cells in the pancreas, leading to absolute insulin deficiency. It is defined as a metabolic disorder characterized by hyperglycemia due to insulin deficiency resulting from autoimmune destruction of pancreatic beta cells. T1DM typically presents with polyuria, polydipsia, polyphagia, weight loss, and fatigue. Diagnosis is confirmed by elevated blood glucose levels, presence of autoantibodies against pancreatic beta cells, and often necessitates lifelong insulin replacement therapy. Management involves a multidisciplinary approach focusing on insulin therapy, dietary modifications, regular exercise, and monitoring blood glucose levels. Prognosis varies, with complications including diabetic ketoacidosis (DKA), cardiovascular disease, neuropathy, nephropathy, and retinopathy. Exclusions for T1DM include other types of diabetes such as type 2 diabetes mellitus and secondary causes of hyperglycemia (pregnancy, disorders of pancreas, alcohol dependency, etc.)."
domain = "condition"
conditionDefinition = "records of events of a person suggesting the presence of a disease or medical condition stated as a diagnosis, a sign, or a symptom, which is either observed by a provider or reported by the patient"
#Initialize dataframes
dfResponses = pd.DataFrame(columns = ["concept_name", "concept_id", "include", "rationale"])
dfConcepts = pd.read_csv(phoebePassOneFilepath)
#Load gold-standard concept sets
dfGoldStandardPositives = pd.read_csv(goldStandardPositivesFilepath)
goldStandardPositives = dfGoldStandardPositives["Id"].tolist()
dfGoldStandardNegatives = pd.read_csv(goldStandardNegativesFilepath)
goldStandardNegatives = dfGoldStandardNegatives["Id"].tolist()

### ChatGPT Pass One

In [62]:
for index, row in dfConcepts.iterrows():
    prompt = generate_prompt(conceptName = row["Name"], disease = disease, diseaseDescription = diabetesDescription, domain = "condition", domainDefinition = conditionDefinition)
    res = generate_gpt_response(prompt)
    answerComponents = parse_output(res)
    dfResponses.loc[len(dfResponses.index)] = [row["Name"], row["Id"], answerComponents["include"], answerComponents["rationale"]]
dfResponses.to_csv(gptAnswersPassOneFilepath, index = False)

In [63]:
gptPositivesMask = dfResponses["include"] == True 
gptNegativesMask = dfResponses["include"] == False
dfPositives = dfResponses[gptPositivesMask]
dfNegatives = dfResponses[gptNegativesMask]
positiveConcepts = dfPositives["concept_id"].tolist()
negativeConcepts = dfNegatives["concept_id"].tolist()
positiveConceptsAsStr = ""
negativeConceptsAsStr = ""
for el in positiveConcepts:
    positiveConceptsAsStr += (str(el) + ",")
for el in negativeConcepts:
    negativeConceptsAsStr += (str(el) + ",")
fp = open(gptPositivesPassOneFilepath, "x")
fp.write(positiveConceptsAsStr[:len(positiveConceptsAsStr)-1])
fp.close()
fp = open(gptNegativesPassOneFilepath, "x")
fp.write(negativeConceptsAsStr[:len(negativeConceptsAsStr)-1])
fp.close()

### ChatGPT Pass Two

In [32]:
dfResponses = pd.DataFrame(columns = ["concept_name", "concept_id", "include", "rationale"])
dfConcepts = pd.read_csv(phoebePassTwoFilepath)
for index, row in dfConcepts.iterrows():
    prompt = generate_prompt(conceptName = row["Name"], disease = disease, diseaseDescription = diabetesDescription, domain = "condition", domainDefinition = conditionDefinition)
    res = generate_gpt_response(prompt)
    answerComponents = parse_output(res)
    dfResponses.loc[len(dfResponses.index)] = [row["Name"], row["Id"], answerComponents["include"], answerComponents["rationale"]]
dfResponses.to_csv(gptAnswersPassTwoFilepath, index = False)

In [66]:
gptPositivesMask = dfResponses["include"] == True 
gptNegativesMask = dfResponses["include"] == False
dfPositives = dfResponses[gptPositivesMask]
dfNegatives = dfResponses[gptNegativesMask]
positiveConcepts = dfPositives["concept_id"].tolist()
negativeConcepts = dfNegatives["concept_id"].tolist()
positiveConceptsAsStr = ""
negativeConceptsAsStr = ""
for el in positiveConcepts:
    positiveConceptsAsStr += (str(el) + ",")
for el in negativeConcepts:
    negativeConceptsAsStr += (str(el) + ",")
fp = open(gptPositivesPassTwoFilepath, "x")
fp.write(positiveConceptsAsStr[:len(positiveConceptsAsStr)-1])
fp.close()
fp = open(gptNegativesPassTwoFilepath, "x")
fp.write(negativeConceptsAsStr[:len(negativeConceptsAsStr)-1])
fp.close()

In [40]:
#Combine the GPT-recommended concepts from pass one and pass two
fpOne = open(gptPositivesPassOneFilepath, "r")
fpTwo = open(gptPositivesPassTwoFilepath, "r")
positivesAsStr = fpOne.readline() + "," + fpTwo.readline()
fpOne.close()
fpTwo.close()
proposedPositiveConcepts = positivesAsStr.split(",")
fpOne = open(gptNegativesPassOneFilepath, "r")
fpTwo = open(gptNegativesPassTwoFilepath, "r")
negativesAsStr = fpOne.readline() + "," + fpTwo.readline()
fpOne.close()
fpTwo.close()
proposedNegativeConcepts = positivesAsStr.split(",")
#Compute sensitivity and specificity
t1dmRes = compute_sens_and_spec(goldStandardPositives=goldStandardPositives, goldStandardNegatives=goldStandardNegatives, proposedPositives=proposedPositiveConcepts, proposedNegatives=proposedNegativeConcepts)
print(f"Sensitivity: {t1dmRes["sensitivity"]}")
print(f"Sensitivity: {t1dmRes["specificity"]}")

#### Acute Myocardial Infarction

In [None]:
#Define filepath variables
phoebePassOneFilepath = "input/Type1Diabetes/phoebe_concepts_pass_one.csv"
phoebePassTwoFilepath = "input/Type1Diabetes/phoebe_concepts_pass_two.csv"
gptAnswersPassOneFilepath = "output/Type1Diabetes/gpt_concepts_pass_one.csv"
gptConceptsPassOneFilepath = "output/Type1Diabetes/gpt_concepts_pass_one.txt"
gptAnswersPassTwoFilepath = "output/Type1Diabetes/gpt_concepts_pass_two.csv"
gptConceptsPassTwoFilepath = "output/Type1Diabetes/gpt_concepts_pass_two.txt"
gptFinalFilepath = "output/Type1Diabetes/gpt_concepts_finals.csv"
goldStandardFilepath = "input/Type1Diabetes/gold_standard_concepts.csv"
#Define ChatGPT prompt information 
disease = "Type 1 Diabetes Mellitus"
diseaseDescription = "Type 1 Diabetes Mellitus (T1DM) is a chronic autoimmune disorder characterized by the destruction of insulin-producing beta cells in the pancreas, leading to absolute insulin deficiency. It is defined as a metabolic disorder characterized by hyperglycemia due to insulin deficiency resulting from autoimmune destruction of pancreatic beta cells. T1DM typically presents with polyuria, polydipsia, polyphagia, weight loss, and fatigue. Diagnosis is confirmed by elevated blood glucose levels, presence of autoantibodies against pancreatic beta cells, and often necessitates lifelong insulin replacement therapy. Management involves a multidisciplinary approach focusing on insulin therapy, dietary modifications, regular exercise, and monitoring blood glucose levels. Prognosis varies, with complications including diabetic ketoacidosis (DKA), cardiovascular disease, neuropathy, nephropathy, and retinopathy. Exclusions for T1DM include other types of diabetes such as type 2 diabetes mellitus and secondary causes of hyperglycemia (pregnancy, disorders of pancreas, alcohol dependency, etc.)."
domain = "condition"
conditionDefinition = "records of events of a person suggesting the presence of a disease or medical condition stated as a diagnosis, a sign, or a symptom, which is either observed by a provider or reported by the patient"
#Initialize dataframes
dfResponses = pd.DataFrame(columns = ["concept_name", "concept_id", "include", "rationale"])
dfConcepts = pd.read_csv(phoebePassOneFilepath)

In [41]:
dfResponses = pd.DataFrame(columns = ["concept", "include", "rationale"])
dfConcepts = pd.read_csv("input/AcuteMyocardialInfarction/firstPass.csv")
disease = "Acute Myocardial Infarction"
diabetesDescription = "Acute myocardial infarction (AMI), commonly known as a heart attack, is a life-threatening medical emergency characterized by the sudden occlusion of a coronary artery, resulting in ischemia and necrosis of cardiac tissue. It is defined as the abrupt interruption of blood flow to a portion of the myocardium, leading to myocardial cell death and subsequent release of cardiac biomarkers such as troponin. AMI typically presents with severe chest pain or pressure, often radiating to the left arm, jaw, or back, along with accompanying symptoms such as shortness of breath, diaphoresis, nausea, and vomiting. Diagnosis is confirmed by clinical history, electrocardiography (ECG) findings indicative of ST-segment elevation or new-onset Q waves, and elevated cardiac biomarkers. Treatment involves immediate reperfusion therapy to restore blood flow to the ischemic myocardium, utilizing thrombolytics or percutaneous coronary intervention (PCI). Additional therapies include antiplatelet agents, anticoagulants, beta-blockers, angiotensin-converting enzyme (ACE) inhibitors, and statins to prevent recurrent ischemic events and reduce mortality. Prognosis varies depending on the extent of myocardial damage, timely intervention, and the presence of comorbidities. Exclusions for AMI include other causes of acute chest pain such as unstable angina, aortic dissection, and pulmonary embolism."
domain = "condition"
conditionDefinition = "records of events of a person suggesting the presence of a disease or medical condition stated as a diagnosis, a sign, or a symptom, which is either observed by a provider or reported by the patient"

In [42]:
for index, row in dfConcepts.iterrows():
    prompt = generate_prompt(conceptName = row["Name"], disease = disease, diseaseDescription = diabetesDescription, domain = "condition", domainDefinition = conditionDefinition)
    res = generate_gpt_response(prompt)
    answerComponents = parse_output(res)
    dfResponses.loc[len(dfResponses.index)] = [row["Name"], answerComponents["include"], answerComponents["rationale"]]

In [43]:
dfResponses.to_csv("./output/AcuteMyocardialInfarction/gptFirstPass.csv", index = False)

In [44]:
dfGptRecs = pd.read_csv("./output/AcuteMyocardialInfarction/gptFirstPass.csv")
includeConceptsMask = dfGptRecs["include"] == True
dfInclude = dfGptRecs[includeConceptsMask]
includeConceptNames = dfInclude["concept"].tolist()

In [45]:
recommendedConceptsFilepath = "./input/AcuteMyocardialInfarction/firstPass.csv"
dfAtlasRecommended  = pd.read_csv(recommendedConceptsFilepath)
filterMask = dfAtlasRecommended["Name"].isin(includeConceptNames)
conceptsToInclude = dfAtlasRecommended[filterMask]["Id"].tolist()
conceptsAsStr = ""
for el in conceptsToInclude:
    conceptsAsStr += (str(el) + ",")

In [56]:
fp = open("./output/AcuteMyocardialInfarction/conceptIdsPassOne.txt", "x")
fp.write(conceptsAsStr)
fp.close()

'4119606,4119597,604179,4161570,36712983,4172822,4124686,4030582,4064348,4354249,4242670,45766165,43021857,4247796,4186397,4215140,40482638,45767560,4168972,766257,45767558,4170094,4206867,45772774,4064609,4207921,45767627,4194618,258449,4215259,4119950,4119951,4119949,4162038,4253065,618951,4161991,4155965,4155962,4155007,4179525,437894,4178622,4184762,37309630,4180609,4189939,4200113,4121468,4121467,4237062,35615053,4263712,4138833,4134723,4187067,4108217,4119462,609191,3189643,4209541,46273814,43020660,35610089,'

Second pass of ChatGPT

In [47]:
dfResponses = pd.DataFrame(columns = ["concept", "include", "rationale"])
dfConcepts = pd.read_csv("input/AcuteMyocardialInfarction/secondPass.csv")
disease = "Acute Myocardial Infarction"
diabetesDescription = "Acute myocardial infarction (AMI), commonly known as a heart attack, is a life-threatening medical emergency characterized by the sudden occlusion of a coronary artery, resulting in ischemia and necrosis of cardiac tissue. It is defined as the abrupt interruption of blood flow to a portion of the myocardium, leading to myocardial cell death and subsequent release of cardiac biomarkers such as troponin. AMI typically presents with severe chest pain or pressure, often radiating to the left arm, jaw, or back, along with accompanying symptoms such as shortness of breath, diaphoresis, nausea, and vomiting. Diagnosis is confirmed by clinical history, electrocardiography (ECG) findings indicative of ST-segment elevation or new-onset Q waves, and elevated cardiac biomarkers. Treatment involves immediate reperfusion therapy to restore blood flow to the ischemic myocardium, utilizing thrombolytics or percutaneous coronary intervention (PCI). Additional therapies include antiplatelet agents, anticoagulants, beta-blockers, angiotensin-converting enzyme (ACE) inhibitors, and statins to prevent recurrent ischemic events and reduce mortality. Prognosis varies depending on the extent of myocardial damage, timely intervention, and the presence of comorbidities. Exclusions for AMI include other causes of acute chest pain such as unstable angina, aortic dissection, and pulmonary embolism."
domain = "condition"
conditionDefinition = "records of events of a person suggesting the presence of a disease or medical condition stated as a diagnosis, a sign, or a symptom, which is either observed by a provider or reported by the patient"

In [48]:
for index, row in dfConcepts.iterrows():
    prompt = generate_prompt(conceptName = row["Name"], disease = disease, diseaseDescription = diabetesDescription, domain = "condition", domainDefinition = conditionDefinition)
    res = generate_gpt_response(prompt)
    answerComponents = parse_output(res)
    dfResponses.loc[len(dfResponses.index)] = [row["Name"], answerComponents["include"], answerComponents["rationale"]]

In [49]:
dfResponses.to_csv("./output/AcuteMyocardialInfarction/gptSecondPass.csv", index = False)

In [50]:
dfGptRecs = pd.read_csv("./output/AcuteMyocardialInfarction/gptSecondPass.csv")
includeConceptsMask = dfGptRecs["include"] == True
dfInclude = dfGptRecs[includeConceptsMask]
includeConceptNames = dfInclude["concept"].tolist()

In [51]:
recommendedConceptsFilepath = "./input/AcuteMyocardialInfarction/secondPass.csv"
dfAtlasRecommended  = pd.read_csv(recommendedConceptsFilepath)
filterMask = dfAtlasRecommended["Name"].isin(includeConceptNames)
conceptsToInclude = dfAtlasRecommended[filterMask]["Id"].tolist()
conceptsAsStr = ""
for el in conceptsToInclude:
    conceptsAsStr += (str(el) + ",")

In [57]:
fp = open("./output/AcuteMyocardialInfarction/conceptIdsPassTwo.txt", "x")
fp.write(conceptsAsStr)
fp.close()

In [14]:
#Compute sensitivity
dfGoldStandard = pd.read_csv("./input/AcuteMyocardialInfarction/goldStandard.csv")
goldStandardConcepts = dfGoldStandard["Id"].tolist()
fp = open("./output/AcuteMyocardialInfarction/conceptIdsPassTwo.txt", "r")
conceptsAsStr = fp.readline()
fp.close()
proposedConcepts = conceptsAsStr.split(",")

In [18]:
acuteMyocardialInfarctionDict = compute_sensitivity(goldStandardConcepts, proposedConcepts)
acuteMyocardialInfarctionDict["sensitivity"]

0.11965811965811966

## Rheumatoid Arthritis

In [100]:
#Define filepath variables
phoebePassOneFilepath = "input/RheumatoidArthritis/phoebe_concepts_pass_one.csv"
phoebePassTwoFilepath = "input/RheumatoidArthritis/phoebe_concepts_pass_two.csv"
gptAnswersPassOneFilepath = "output/RheumatoidArthritis/gpt_answers_pass_one.csv"
gptPositivesPassOneFilepath = "output/RheumatoidArthritis/gpt_positives_pass_one.txt"
gptNegativesPassOneFilepath = "output/RheumatoidArthritis/gpt_negatives_pass_one.txt"
gptAnswersPassTwoFilepath = "output/RheumatoidArthritis/gpt_answers_pass_two.csv"
gptPositivesPassTwoFilepath = "output/RheumatoidArthritis/gpt_positives_pass_two.txt"
gptNegativesPassTwoFilepath = "output/RheumatoidArthritis/gpt_negatives_pass_two.txt"
gptFinalFilepath = "output/RheumatoidArthritis/gpt_concepts_finals.csv"
goldStandardPositivesFilepath = "input/RheumatoidArthritis/gold_standard_positive_concepts.csv"
goldStandardNegativesFilepath = "input/RheumatoidArthritis/gold_standard_negative_concepts.csv"
#Define ChatGPT prompt information 
disease = "Rheumatoid Arthritis"
diseaseDescription = "Rheumatoid Arthritis (RA) is a chronic autoimmune inflammatory disorder primarily affecting the synovial joints. It is defined as a systemic autoimmune disease characterized by symmetric polyarthritis with joint swelling, tenderness, and destruction, leading to deformities and functional impairment. RA often presents with morning stiffness, joint pain, and swelling, particularly in the small joints of the hands and feet. Diagnosis is based on clinical criteria such as joint involvement, serological markers (e.g., rheumatoid factor, anti-cyclic citrullinated peptide antibodies), and imaging findings (e.g., joint erosions on X-ray). Treatment aims to control inflammation, relieve symptoms, and prevent joint damage, utilizing disease-modifying antirheumatic drugs (DMARDs), biologic agents, nonsteroidal anti-inflammatory drugs (NSAIDs), and glucocorticoids. Prognosis varies widely, with some patients achieving remission while others experience progressive joint damage and disability. Exclusions for RA include other forms of arthritis such as osteoarthritis and systemic lupus erythematosus (SLE)."
domain = "condition"
conditionDefinition = "records of events of a person suggesting the presence of a disease or medical condition stated as a diagnosis, a sign, or a symptom, which is either observed by a provider or reported by the patient"
#Initialize dataframes
dfResponses = pd.DataFrame(columns = ["concept_name", "concept_id", "include", "rationale"])
dfConcepts = pd.read_csv(phoebePassOneFilepath)
#Load gold-standard concept sets
dfGoldStandardPositives = pd.read_csv(goldStandardPositivesFilepath)
goldStandardPositives = dfGoldStandardPositives["concept_id"].tolist()
dfGoldStandardNegatives = pd.read_csv(goldStandardNegativesFilepath)
goldStandardNegatives = dfGoldStandardNegatives["concept_id"].tolist()

### ChatGPT Pass One

In [101]:
for index, row in dfConcepts.iterrows():
    prompt = generate_prompt(conceptName = row["Name"], disease = disease, diseaseDescription = diseaseDescription, domain = "condition", domainDefinition = conditionDefinition)
    res = generate_gpt_response(prompt)
    answerComponents = parse_output(res)
    dfResponses.loc[len(dfResponses.index)] = [row["Name"], row["Id"], answerComponents["include"], answerComponents["rationale"]]
dfResponses.to_csv(gptAnswersPassOneFilepath, index = False)

In [102]:
gptPositivesMask = dfResponses["include"] == True 
gptNegativesMask = dfResponses["include"] == False
dfPositives = dfResponses[gptPositivesMask]
dfNegatives = dfResponses[gptNegativesMask]
positiveConcepts = dfPositives["concept_id"].tolist()
negativeConcepts = dfNegatives["concept_id"].tolist()
positiveConceptsAsStr = ""
negativeConceptsAsStr = ""
for el in positiveConcepts:
    positiveConceptsAsStr += (str(el) + ",")
for el in negativeConcepts:
    negativeConceptsAsStr += (str(el) + ",")
fp = open(gptPositivesPassOneFilepath, "x")
fp.write(positiveConceptsAsStr[:len(positiveConceptsAsStr)-1])
fp.close()
fp = open(gptNegativesPassOneFilepath, "x")
fp.write(negativeConceptsAsStr[:len(negativeConceptsAsStr)-1])
fp.close()

### ChatGPT Pass Two

In [103]:
dfResponses = pd.DataFrame(columns = ["concept_name", "concept_id", "include", "rationale"])
dfConcepts = pd.read_csv(phoebePassTwoFilepath)
for index, row in dfConcepts.iterrows():
    prompt = generate_prompt(conceptName = row["Name"], disease = disease, diseaseDescription = diseaseDescription, domain = "condition", domainDefinition = conditionDefinition)
    res = generate_gpt_response(prompt)
    answerComponents = parse_output(res)
    dfResponses.loc[len(dfResponses.index)] = [row["Name"], row["Id"], answerComponents["include"], answerComponents["rationale"]]
dfResponses.to_csv(gptAnswersPassTwoFilepath, index = False)

In [None]:
gptPositivesMask = dfResponses["include"] == True 
gptNegativesMask = dfResponses["include"] == False
dfPositives = dfResponses[gptPositivesMask]
dfNegatives = dfResponses[gptNegativesMask]
positiveConcepts = dfPositives["concept_id"].tolist()
negativeConcepts = dfNegatives["concept_id"].tolist()
positiveConceptsAsStr = ""
negativeConceptsAsStr = ""
for el in positiveConcepts:
    positiveConceptsAsStr += (str(el) + ",")
for el in negativeConcepts:
    negativeConceptsAsStr += (str(el) + ",")
fp = open(gptPositivesPassTwoFilepath, "x")
fp.write(positiveConceptsAsStr[:len(positiveConceptsAsStr)-1])
fp.close()
fp = open(gptNegativesPassTwoFilepath, "x")
fp.write(negativeConceptsAsStr[:len(negativeConceptsAsStr)-1])
fp.close()

In [None]:
#Combine the GPT-recommended concepts from pass one and pass two
fpOne = open(gptPositivesPassOneFilepath, "r")
fpTwo = open(gptPositivesPassTwoFilepath, "r")
positivesAsStr = fpOne.readline() + "," + fpTwo.readline()
fpOne.close()
fpTwo.close()
proposedPositiveConcepts = positivesAsStr.split(",")
proposedPositiveConcepts  = list(map(int, proposedPositiveConcepts))
fpOne = open(gptNegativesPassOneFilepath, "r")
fpTwo = open(gptNegativesPassTwoFilepath, "r")
negativesAsStr = fpOne.readline() + "," + fpTwo.readline()
fpOne.close()
fpTwo.close()
proposedNegativeConcepts = negativesAsStr.split(",")
proposedNegativeConcepts  = list(map(int, proposedNegativeConcepts))
#Compute sensitivity and specificity
raRes = compute_sens_and_spec(goldStandardPositives=goldStandardPositives, goldStandardNegatives=goldStandardNegatives, proposedPositives=proposedPositiveConcepts, proposedNegatives=proposedNegativeConcepts)
print("Sensitivity: ", raRes["sensitivity"])
print("Specificity: ", raRes["specificity"])

## Pulmonary Hypertension

In [74]:
#Define filepath variables
phoebePassOneFilepath = "input/PulmonaryHypertension/phoebe_concepts_pass_one.csv"
phoebePassTwoFilepath = "input/PulmonaryHypertension/phoebe_concepts_pass_two.csv"
gptAnswersPassOneFilepath = "output/PulmonaryHypertension/gpt_answers_pass_one.csv"
gptPositivesPassOneFilepath = "output/PulmonaryHypertension/gpt_positives_pass_one.txt"
gptNegativesPassOneFilepath = "output/PulmonaryHypertension/gpt_negatives_pass_one.txt"
gptAnswersPassTwoFilepath = "output/PulmonaryHypertension/gpt_answers_pass_two.csv"
gptPositivesPassTwoFilepath = "output/PulmonaryHypertension/gpt_positives_pass_two.txt"
gptNegativesPassTwoFilepath = "output/PulmonaryHypertension/gpt_negatives_pass_two.txt"
gptFinalFilepath = "output/PulmonaryHypertension/gpt_concepts_finals.csv"
goldStandardPositivesFilepath = "input/PulmonaryHypertension/gold_standard_positive_concepts.csv"
goldStandardNegativesFilepath = "input/PulmonaryHypertension/gold_standard_negative_concepts.csv"
#Define ChatGPT prompt information 
disease = "Pulmonary Hypertension"
diseaseDescription = "Pulmonary Hypertension (PH) is a complex and progressive condition characterized by elevated blood pressure within the pulmonary vasculature, leading to right ventricular dysfunction and ultimately, heart failure. It is defined as a hemodynamic and pathophysiological state characterized by an increase in mean pulmonary arterial pressure (mPAP) greater than or equal to 20 mmHg at rest, as assessed by right heart catheterization. PH can result from various etiologies, including idiopathic, heritable, drug-induced, and associated with other conditions such as connective tissue diseases, congenital heart defects, or chronic lung diseases. Patients with PH may present with symptoms such as dyspnea, fatigue, chest pain, syncope, and signs of right heart failure. Diagnostic evaluation involves a thorough clinical assessment, echocardiography, pulmonary function tests, and right heart catheterization to confirm the diagnosis and assess disease severity. Treatment aims to improve symptoms, slow disease progression, and optimize hemodynamics, utilizing various classes of medications including vasodilators, endothelin receptor antagonists, phosphodiesterase-5 inhibitors, soluble guanylate cyclase stimulators, and prostacyclin analogs. Prognosis varies depending on the underlying cause and severity of PH, with early diagnosis and targeted therapy improving outcomes. Exclusions for PH include other causes of pulmonary arterial hypertension such as left heart disease, chronic thromboembolic disease, and pulmonary arterial hypertension associated with lung diseases and/or hypoxia."
domain = "condition"
conditionDefinition = "records of events of a person suggesting the presence of a disease or medical condition stated as a diagnosis, a sign, or a symptom, which is either observed by a provider or reported by the patient"
#Initialize dataframes
dfResponses = pd.DataFrame(columns = ["concept_name", "concept_id", "include", "rationale"])
dfConcepts = pd.read_csv(phoebePassOneFilepath)
#Load gold-standard concept sets
dfGoldStandardPositives = pd.read_csv(goldStandardPositivesFilepath)
goldStandardPositives = dfGoldStandardPositives["concept_id"].tolist()
dfGoldStandardNegatives = pd.read_csv(goldStandardNegativesFilepath)
goldStandardNegatives = dfGoldStandardNegatives["concept_id"].tolist()

### ChatGPT Pass One

In [75]:
for index, row in dfConcepts.iterrows():
    prompt = generate_prompt(conceptName = row["Name"], disease = disease, diseaseDescription = diseaseDescription, domain = "condition", domainDefinition = conditionDefinition)
    res = generate_gpt_response(prompt)
    answerComponents = parse_output(res)
    dfResponses.loc[len(dfResponses.index)] = [row["Name"], row["Id"], answerComponents["include"], answerComponents["rationale"]]
dfResponses.to_csv(gptAnswersPassOneFilepath, index = False)

In [76]:
gptPositivesMask = dfResponses["include"] == True 
gptNegativesMask = dfResponses["include"] == False
dfPositives = dfResponses[gptPositivesMask]
dfNegatives = dfResponses[gptNegativesMask]
positiveConcepts = dfPositives["concept_id"].tolist()
negativeConcepts = dfNegatives["concept_id"].tolist()
positiveConceptsAsStr = ""
negativeConceptsAsStr = ""
for el in positiveConcepts:
    positiveConceptsAsStr += (str(el) + ",")
for el in negativeConcepts:
    negativeConceptsAsStr += (str(el) + ",")
fp = open(gptPositivesPassOneFilepath, "x")
fp.write(positiveConceptsAsStr[:len(positiveConceptsAsStr)-1])
fp.close()
fp = open(gptNegativesPassOneFilepath, "x")
fp.write(negativeConceptsAsStr[:len(negativeConceptsAsStr)-1])
fp.close()

### ChatGPT Pass Two

In [78]:
dfResponses = pd.DataFrame(columns = ["concept_name", "concept_id", "include", "rationale"])
dfConcepts = pd.read_csv(phoebePassTwoFilepath)
for index, row in dfConcepts.iterrows():
    prompt = generate_prompt(conceptName = row["Name"], disease = disease, diseaseDescription = diseaseDescription, domain = "condition", domainDefinition = conditionDefinition)
    res = generate_gpt_response(prompt)
    answerComponents = parse_output(res)
    dfResponses.loc[len(dfResponses.index)] = [row["Name"], row["Id"], answerComponents["include"], answerComponents["rationale"]]
dfResponses.to_csv(gptAnswersPassTwoFilepath, index = False)

In [79]:
gptPositivesMask = dfResponses["include"] == True 
gptNegativesMask = dfResponses["include"] == False
dfPositives = dfResponses[gptPositivesMask]
dfNegatives = dfResponses[gptNegativesMask]
positiveConcepts = dfPositives["concept_id"].tolist()
negativeConcepts = dfNegatives["concept_id"].tolist()
positiveConceptsAsStr = ""
negativeConceptsAsStr = ""
for el in positiveConcepts:
    positiveConceptsAsStr += (str(el) + ",")
for el in negativeConcepts:
    negativeConceptsAsStr += (str(el) + ",")
fp = open(gptPositivesPassTwoFilepath, "x")
fp.write(positiveConceptsAsStr[:len(positiveConceptsAsStr)-1])
fp.close()
fp = open(gptNegativesPassTwoFilepath, "x")
fp.write(negativeConceptsAsStr[:len(negativeConceptsAsStr)-1])
fp.close()

In [98]:
#Combine the GPT-recommended concepts from pass one and pass two
fpOne = open(gptPositivesPassOneFilepath, "r")
fpTwo = open(gptPositivesPassTwoFilepath, "r")
positivesAsStr = fpOne.readline() + "," + fpTwo.readline()
fpOne.close()
fpTwo.close()
proposedPositiveConcepts = positivesAsStr.split(",")
proposedPositiveConcepts  = list(map(int, proposedPositiveConcepts))
fpOne = open(gptNegativesPassOneFilepath, "r")
fpTwo = open(gptNegativesPassTwoFilepath, "r")
negativesAsStr = fpOne.readline() + "," + fpTwo.readline()
fpOne.close()
fpTwo.close()
proposedNegativeConcepts = negativesAsStr.split(",")
proposedNegativeConcepts  = list(map(int, proposedNegativeConcepts))
#Compute sensitivity and specificity
phRes = compute_sens_and_spec(goldStandardPositives=goldStandardPositives, goldStandardNegatives=goldStandardNegatives, proposedPositives=proposedPositiveConcepts, proposedNegatives=proposedNegativeConcepts)
print("Sensitivity: ", phRes["sensitivity"])
print("Specificity: ", phRes["specificity"])

0
<class 'int'>


ZeroDivisionError: division by zero