In [1]:
import pandas as pd
import os
from datetime import datetime
from langchain.document_loaders import JSONLoader
from langchain.docstore.document import Document

In [2]:
root_dir = "sample"

# Collect the path of every file below root_dir (recursively).
# os.walk yields directories and files in an arbitrary, filesystem-dependent
# order, so the result is sorted: later cells index into this list by position
# (file_paths[1]) and would otherwise load a different file on another machine.
# Paths are built directly from the walked directory, so top-level files come
# out as "sample/<name>" rather than "sample/./<name>".
file_paths = sorted(
    os.path.join(dir_, file)
    for dir_, _, files in os.walk(root_dir)
    for file in files
)

In [None]:
# Read the second discovered file (position 1 in file_paths) as JSON.
sample_path = file_paths[1]
sample_df = pd.read_json(sample_path)

In [34]:
# Reference timestamp captured once when this cell runs; patient age is
# computed relative to it.
TODAY = datetime.now()

def create_patient_doc(df):
    """Describe the patient found in the first row of ``df`` as one Document.

    Args:
        df: DataFrame of flattened bundle entries (``pd.json_normalize``
            column names such as ``resource.name``). Only the first row is
            read. ``resource.maritalStatus.coding`` and the name's ``prefix``
            are optional; the other fields accessed below are required.

    Returns:
        A ``Document`` whose ``page_content`` is a short English summary
        (name, age, gender, race, ethnicity, marital status, city/state) and
        whose metadata holds the resource type under ``'type'``.

    Raises:
        IndexError: if ``df`` has no rows.
    """
    r = df.iloc[0]

    official_name = r['resource.name'][0]
    # Not every name carries a 'prefix' (title); include it only when present.
    name_parts = list(official_name.get('prefix') or [])[:1]
    # Trailing digits are stripped from given/family names
    # (presumably synthetic-data suffixes -- TODO confirm against the data source).
    name_parts.append(official_name['given'][0].rstrip('0123456789'))
    name_parts.append(official_name['family'].rstrip('0123456789'))
    name = ' '.join(name_parts)

    gender = r['resource.gender']
    age = str(int((TODAY - pd.to_datetime(r['resource.birthDate'])).days / 365.2425))

    # Extensions are read by position: 0 -> race, 1 -> ethnicity, 2 -> address.
    extensions = r['resource.extension']
    race = extensions[0]['valueCodeableConcept']['coding'][0]['display']
    ethnicity = extensions[1]['valueCodeableConcept']['coding'][0]['display']
    address = extensions[2]['valueAddress']['city'] + ' ' + extensions[2]['valueAddress']['state']

    marital_status_dict = {'M': 'Married', 'S': 'Single', 'D': 'Divorced', 'W': 'Widowed'}
    marital_coding = r.get('resource.maritalStatus.coding')
    if isinstance(marital_coding, list) and marital_coding:
        marital_code = marital_coding[0]['code']
        # Unmapped codes are emitted as-is instead of raising KeyError.
        marital_status = marital_status_dict.get(marital_code, marital_code)
    else:
        # Column absent or empty for this patient.
        marital_status = 'of unknown marital status'

    text = f"Patient's name is {name}. "
    text += f"Patient is {age} years old, {gender}, {race}, {ethnicity}, and {marital_status}. "
    text += f"Patient lives in {address}."

    metadata = {'type': r['resource.resourceType']}

    return Document(page_content=text, metadata=metadata)

def create_careplan_doc(df):
    """Build one Document per care-plan row in ``df``.

    Args:
        df: DataFrame of flattened CarePlan entries. ``resource.category``,
            ``resource.period.start`` and ``resource.resourceType`` are
            required; ``resource.period.end`` and ``resource.activity`` may be
            missing (column absent or NaN for a row).

    Returns:
        list of ``Document``; empty when ``df`` has no rows.
    """
    docs = []
    for _, r in df.iterrows():
        category = r['resource.category'][0]['coding'][0]
        care_plan = category['display']
        code = category['code']
        start = r['resource.period.start']
        # The end column does not exist when no plan in the frame has ended,
        # and is NaN for individual active plans; .get covers both.
        end = r.get('resource.period.end')

        activity_entries = r.get('resource.activity')
        if not isinstance(activity_entries, list):
            # NaN / missing: the plan lists no activities.
            activity_entries = []
        activities = ' and '.join(
            entry['detail']['code']['coding'][0]['display']
            for entry in activity_entries
        )

        if isinstance(end, str):
            text = f"Patient's care plan was {care_plan} with code {code} which lasted from {start} to {end}. "
        else:
            text = f"Patient's care plan was {care_plan} with code {code} which started on {start} and is still active. "
        if activities:
            text += f"The care plan involved {activities}."

        metadata = {'type': r['resource.resourceType']}
        docs.append(Document(page_content=text, metadata=metadata))

    return docs

def create_condition_doc(df):
    """Build one Document per condition row in ``df``.

    Args:
        df: DataFrame of flattened Condition entries. ``resource.code.coding``,
            ``resource.onsetDateTime`` and ``resource.resourceType`` are
            required; ``resource.abatementDateTime`` may be missing (column
            absent or NaN), which is reported as an ongoing condition.

    Returns:
        list of ``Document``; empty when ``df`` has no rows.
    """
    docs = []
    for _, r in df.iterrows():
        coding = r['resource.code.coding'][0]
        condition = coding['display']
        code = coding['code']
        # Timestamps are cut at 'T' so only the date part is reported.
        start = r['resource.onsetDateTime'].split('T')[0]
        # .get: the abatement column is absent when every condition is ongoing.
        end = r.get('resource.abatementDateTime')

        if isinstance(end, str):
            text = f"Patient experienced {condition} with code {code} which lasted from {start} to {end.split('T')[0]}. "
        else:
            text = f"Patient experienced {condition} with code {code} which started on {start} and is still ongoing. "

        metadata = {'type': r['resource.resourceType']}
        docs.append(Document(page_content=text, metadata=metadata))

    return docs

def create_diagnosticreport_doc(df, observations=None):
    """Build one Document per diagnostic-report row in ``df``.

    Each report's ``resource.result`` references are joined against the
    observations frame (``fullUrl`` == ``reference``) to spell out the
    individual readings.

    Args:
        df: DataFrame of flattened DiagnosticReport entries.
        observations: DataFrame of flattened Observation entries to resolve
            result references against. Defaults to the notebook-level
            ``observation_df`` (the behaviour before this parameter existed),
            which therefore must be defined when the argument is omitted.

    Returns:
        list of ``Document``; empty when ``df`` has no rows.
    """
    if observations is None:
        observations = observation_df
    # An empty observations frame has no 'fullUrl' column to join on.
    can_join = 'fullUrl' in observations.columns

    docs = []
    for _, r in df.iterrows():
        coding = r['resource.code.coding'][0]
        diagnostic = coding['display']
        code = coding['code']
        date = r['resource.effectiveDateTime'].split('T')[0]
        source = r['resource.performer'][0]['display']

        readings = []
        referenced = r.get('resource.result')
        if can_join and isinstance(referenced, list) and referenced:
            merge_df = observations.merge(pd.DataFrame(referenced), left_on='fullUrl', right_on='reference')
            for _, rr in merge_df.iterrows():
                value = rr.get('resource.valueQuantity.value')
                unit = rr.get('resource.valueQuantity.unit')
                if isinstance(unit, str):
                    readings.append(f"{rr['display']} reading of {value} {unit}")
                else:
                    # Observation without a simple quantity (unit is NaN/absent):
                    # name the result instead of failing on str + float.
                    readings.append(rr['display'])

        text = f"Patient received {diagnostic} with code {code} on {date} from {source}. "
        if readings:
            text += f"The results included a {' and '.join(readings)}."

        metadata = {'type': r['resource.resourceType']}
        docs.append(Document(page_content=text, metadata=metadata))

    return docs

def create_encounter_doc(df):
    """Build one Document per encounter row in ``df``.

    Args:
        df: DataFrame of flattened Encounter entries. ``resource.type``,
            ``resource.class.code``, ``resource.period.start`` /
            ``resource.period.end`` and ``resource.resourceType`` are required;
            ``resource.reason.coding`` may be missing (column absent or NaN).

    Returns:
        list of ``Document``; empty when ``df`` has no rows.
    """
    docs = []
    for _, r in df.iterrows():
        encounter_type = r['resource.type'][0]
        encounter = encounter_type['text']
        code = encounter_type['coding'][0]['code']
        class_code = r['resource.class.code']
        start = r['resource.period.start'].split('T')[0]
        end = r['resource.period.end'].split('T')[0]
        # .get: the reason column is absent when no encounter records a reason.
        reason = r.get('resource.reason.coding')

        text = f"Patient went in for {encounter} with code {code} and class code {class_code} from {start} to {end}"
        if isinstance(reason, list) and reason:
            text += f" because of {reason[0]['display']} with code {reason[0]['code']}"
        text += "."

        metadata = {'type': r['resource.resourceType']}
        docs.append(Document(page_content=text, metadata=metadata))

    return docs

def create_immunization_doc(df):
    """Turn every immunization row of ``df`` into a one-sentence Document.

    The sentence states whether the vaccine was received, depending on the
    truthiness of the row's ``resource.wasNotGiven`` value. Only the part of
    the vaccine display name before the first comma and the date part of
    ``resource.date`` are used.

    Returns a list of ``Document`` in row order.
    """
    def _row_to_doc(row):
        vaccine_coding = row['resource.vaccineCode.coding'][0]
        vaccine = vaccine_coding['display'].split(',')[0]
        code = vaccine_coding['code']
        date = row['resource.date'].split('T')[0]

        text = (
            f"Patient didn't receive {vaccine} vaccine with code {code} on {date}."
            if row['resource.wasNotGiven']
            else f"Patient received {vaccine} vaccine with code {code} on {date}."
        )
        return Document(page_content=text, metadata={'type': row['resource.resourceType']})

    return [_row_to_doc(row) for _, row in df.iterrows()]

def create_observation_doc(df):
    """Build one Document per observation row in ``df``.

    Three shapes of observation are handled explicitly:

    * a single quantity (``resource.valueQuantity.value`` is present),
    * a list of components in ``resource.component``, each with its own
      code and ``valueQuantity`` (every component is reported),
    * anything else, which is reported without a value.

    The shape is decided by inspecting the row rather than by catching
    exceptions: in a frame that mixes shapes the quantity columns exist for
    every row (holding NaN where not applicable), so a component-only row
    raises nothing and would otherwise be rendered as "nan nan".

    Args:
        df: DataFrame of flattened Observation entries.

    Returns:
        list of ``Document``; empty when ``df`` has no rows.
    """
    docs = []
    for _, r in df.iterrows():
        date = r['resource.effectiveDateTime'].split('T')[0]
        val = r.get('resource.valueQuantity.value')
        unit = r.get('resource.valueQuantity.unit')
        components = r.get('resource.component')

        if pd.notna(val):
            observation = r['resource.code.coding'][0]['display']
            measurement = f"{val} {unit}" if isinstance(unit, str) else f"{val}"
            text = f"Patient's {observation} was {measurement} on {date}."
        elif isinstance(components, list) and components:
            readings = [
                f"{c['code']['coding'][0]['display']} was {c['valueQuantity']['value']} {c['valueQuantity']['unit']}"
                for c in components
            ]
            # Date appended so this sentence matches the single-quantity form.
            text = "Patient's " + " and patient's ".join(readings) + f" on {date}."
        else:
            observation = r['resource.code.coding'][0]['display']
            text = f"Patient's {observation} was recorded on {date}."

        metadata = {'type': r['resource.resourceType']}
        docs.append(Document(page_content=text, metadata=metadata))

    return docs

def create_procedure_doc(df):
    """Build one Document per procedure row in ``df``.

    The reported date is the date part of ``resource.performedDateTime`` when
    that is a string; otherwise the date part of
    ``resource.performedPeriod.end`` is used.

    Args:
        df: DataFrame of flattened Procedure entries. ``resource.code.text``,
            ``resource.code.coding`` and ``resource.resourceType`` are
            required, plus one of the two date fields per row.

    Returns:
        list of ``Document``; empty when ``df`` has no rows.
    """
    docs = []
    for _, r in df.iterrows():
        procedure = r['resource.code.text']
        code = r['resource.code.coding'][0]['code']

        # .get: the performedDateTime column is absent when every procedure in
        # the frame is recorded as a period; NaN when only this row is.
        performed = r.get('resource.performedDateTime')
        if not isinstance(performed, str):
            performed = r['resource.performedPeriod.end']
        date = performed.split('T')[0]

        text = f"Patient went in for {procedure} with code {code} on {date}."

        metadata = {'type': r['resource.resourceType']}
        docs.append(Document(page_content=text, metadata=metadata))

    return docs

In [40]:
docs = []

# Group the raw bundle entries by resource type. Entries of any other type
# are ignored. Collecting plain lists and normalising once per type avoids
# re-copying a growing DataFrame with pd.concat on every iteration.
entries_by_type = {
    resource_type: []
    for resource_type in (
        'Patient', 'CarePlan', 'Condition', 'DiagnosticReport',
        'Encounter', 'Immunization', 'Observation', 'Procedure',
    )
}
for entry in sample_df['entry']:
    resource_type = entry['resource']['resourceType']
    if resource_type in entries_by_type:
        entries_by_type[resource_type].append(entry)

# One flattened frame per resource type (an empty frame when the bundle has
# no entry of that type). Columns use json_normalize's dotted names, e.g.
# 'resource.resourceType'.
frames_by_type = {
    resource_type: pd.json_normalize(entries) if entries else pd.DataFrame()
    for resource_type, entries in entries_by_type.items()
}

patient_df = frames_by_type['Patient']
careplan_df = frames_by_type['CarePlan']
condition_df = frames_by_type['Condition']
diagnosticreport_df = frames_by_type['DiagnosticReport']
encounter_df = frames_by_type['Encounter']
immunization_df = frames_by_type['Immunization']
# create_diagnosticreport_doc reads observation_df as a notebook-level name,
# so it must be assigned before the documents are built below.
observation_df = frames_by_type['Observation']
procedure_df = frames_by_type['Procedure']

# Build the documents once every entry has been grouped.
if not patient_df.empty:
    # create_patient_doc reads the first row and cannot handle an empty frame.
    docs.append(create_patient_doc(patient_df))
docs.extend(create_careplan_doc(careplan_df))
docs.extend(create_condition_doc(condition_df))
docs.extend(create_diagnosticreport_doc(diagnosticreport_df))
docs.extend(create_encounter_doc(encounter_df))
docs.extend(create_immunization_doc(immunization_df))
docs.extend(create_observation_doc(observation_df))
docs.extend(create_procedure_doc(procedure_df))

### General Approach

In [6]:
from langchain.vectorstores import FAISS
from langchain.embeddings import GPT4AllEmbeddings
from langchain.llms import GPT4All
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain import LLMChain

In [None]:
# Embed every document in `docs` with GPT4AllEmbeddings and index them in FAISS.
vectorstore = FAISS.from_documents(documents=docs, embedding=GPT4AllEmbeddings())

# Path to the local model weights. Set the GPT4ALL_MODEL_PATH environment
# variable to point at your own copy; the fallback is the path this notebook
# was originally written against.
model_path = os.environ.get(
    "GPT4ALL_MODEL_PATH",
    "C:\\Users\\aravp\\Downloads\\orca-mini-3b.ggmlv3.q4_0.bin",
)
# The stdout streaming callback is attached so output appears while the model runs.
# NOTE(review): backend="gptj" is passed for an orca-mini weights file --
# confirm this backend matches the model.
llm = GPT4All(model=model_path, backend="gptj", callbacks=[StreamingStdOutCallbackHandler()], verbose=True)

In [None]:
# Patient EHR-based (context-dependent) questioning.
# Example question: "What care plans did the patient follow?"
question = input("Ask a question about the patient: ")

# Prompt with two placeholders: {context} and {question}.
template = """Use the following pieces of context to answer the question at the end. 
You are speaking to the patient's doctor and you have the patient's consent to disclose their information.
Keep the answer as concise as possible but don't leave anything out. Answer in the form of a sentence. {context}
Question: {question}
Helpful Answer: """
QA_CHAIN_PROMPT = PromptTemplate.from_template(template)

# "stuff" chain over the FAISS retriever, using the custom prompt above.
qa_chain = RetrievalQA.from_chain_type(
    llm,
    chain_type="stuff",
    retriever=vectorstore.as_retriever(),
    chain_type_kwargs={"prompt": QA_CHAIN_PROMPT},
)

# Echo the question, then run the chain; the chain's return value is kept in `result`.
print(question)
result = qa_chain({"query": question})

In [None]:
# General medical questioning: the question goes straight to the LLM through a
# fixed prompt; no retriever is involved in this cell.
# Example question: "What is hypothermia?"
question2 = input("Ask a general medical question: ")

template2 = """You are a medical assistant tasked with aiding a doctor. 
Based on your knowledge of medicine, answer the question as concisely as possible with only relevant info. 
Question: {question}
Answer: """

prompt = PromptTemplate(input_variables=["question"], template=template2)
llm_chain = LLMChain(llm=llm, prompt=prompt)

answer2 = llm_chain.run(question2)
print(answer2)

### Conversational Approach

In [40]:
from gpt4all import GPT4All

In [None]:
# NOTE: `GPT4All` here is the class imported from the `gpt4all` package in the
# preceding cell. It replaces the langchain class of the same name imported
# earlier in the notebook, so the earlier `llm = GPT4All(...)` cell should not
# be re-run after this point without re-running its import cell.
#
# Path to the local model weights; override with the GPT4ALL_MODEL_PATH
# environment variable. The fallback is the path the notebook was written against.
model_path = os.environ.get(
    "GPT4ALL_MODEL_PATH",
    "C:\\Users\\aravp\\Downloads\\orca-mini-3b.ggmlv3.q4_0.bin",
)
model = GPT4All(model_path)

In [None]:
# One-off generation outside a chat session (temp=0 is passed to generate);
# surrounding whitespace is stripped before printing.
hypothermia_answer = model.generate("What are treatments for hypothermia?", temp=0)
print(hypothermia_answer.strip())

In [None]:
system_template = 'A chat between a patient and an artificial intelligence medical assistant.'

# Interactive loop inside a single chat session: each user line is sent to the
# model and the reply is printed, until the user types exactly 'end'.
with model.chat_session(system_template):
    print("AI: Hello there! What can I help you with today?")
    while True:
        prompt = input("USER: ")
        if prompt == 'end':
            break
        response = model.generate(prompt)
        print(response + '\n')
    print("Thanks for chatting!")