## Create DB

In [18]:
from dotenv import load_dotenv

load_dotenv()

True

In [19]:
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings

embeddings = OpenAIEmbeddings()

In [20]:
import pandas as pd

df = pd.read_csv('./faker/dataset_3.csv', dtype={'Patient Phone': str})
# Change date to YYYYMMDD-interger 
df['Visit Date'] = pd.to_datetime(df['Visit Date']).dt.strftime('%Y%m%d').astype(int)
df

Unnamed: 0,Visit ID,Description,Visit Date,Patient ID,Diagnosis Category,Diagnosis Sub Category,Treatment Category,Treatment Sub Category,New Patient,Consulting Physician,...,Patient Gender,Patient Age,Patient Age Range,Patient Blood Type,Patient Insurance Number,Patient Phone,Patient Address,Patient Occupation,Patient Emergency Contact,Intended Purposes
0,VQC513203,The patient presented with chronic lower back ...,20170101,PZM508653,Chronic Pain,Vertebral Disc Problem,Pharmacy/Prescription Drugs,Non-FDA Approved Use,True,Dr. Jerry Daniels,...,Female,43,41-50,O-,G264037622,017588215469,Pärtzeltweg 2\n22301 Neunburg vorm Wald,"Surveyor, quantity","Jennifer Bailey, 015680180768.","['Care', 'Research', 'Insurance', 'Support', '..."
1,VVC435406,The patient presented with symptoms of fatigue...,20170101,PSN036517,Endocrine/ Metabolic,Hormone Deficiency,Pharmacy/Prescription Drugs,Hormones,True,Dr. Michelle Lamb,...,Female,36,31-40,O+,Y133547589,015087781378,Kira-Gorlitz-Allee 8\n67100 Rosenheim,Copy,"Dennis Carlson, 015182104709.","['Care', 'Research', 'Insurance', 'Support', '..."
2,VKT437745,The patient presented with delayed speech deve...,20170101,PPD253419,Pediatrics,Delayed Speech,Rehabilitation Services - Outpatient,Speech Therapy,True,Dr. Michelle Lamb,...,Male,5,0-10,O+,F115599209,016368315207,Thiesstr. 3/5\n68745 Bremen,Claims inspector/assessor,"Veronica Harris, 015034485673.","['Care', 'Research', 'Insurance', 'Support', '..."
3,VJG208744,The patient presented with symptoms of muscle ...,20170101,PBN488954,Central Nervous System/ Neuromuscular,,Pharmacy/Prescription Drugs,Non-FDA Approved Use,True,Dr. James Barber,...,Male,52,51-64,AB+,W720918648,016217695326,Schmidtkeallee 53\n54913 Siegen,Tourism officer,"Michelle Graham, 017438105819.","['Care', 'Research', 'Insurance', 'Support', '..."
4,VAF235393,The patient presented with a diagnosis of brea...,20170101,PJG173047,Cancer,Breast Cancer,Cancer Treatment,Surgery,True,Dr. James Barber,...,Female,36,31-40,A+,Q198981012,017645195473,Ida-Fliegner-Ring 7/6\n53118 Wolgast,Engineering geologist,"Peter Stout, 015017527431.","['Care', 'Research', 'Insurance', 'Support', '..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11684,VTL998110,"Today, I met with a patient who was diagnosed ...",20181230,PND058072,Mental,Depression,Electrical/ Thermal/ Radiofreq. Interventions,Transcranial Magnetic Stimulation,False,Dr. Alexandria Gaines,...,Male,51,51-64,A-,Private,016976645290,Wolfram-auch Schlauchin-Ring 3/3\n94816 Neuruppin,"Research officer, political party","Emily Garrett, 016593887796.","['Care', 'Research', 'Insurance', 'Support', '..."
11685,VLH032189,The patient was diagnosed with liver cancer af...,20181230,PPC895292,Cancer,Liver Cancer,Special Procedure,,False,Dr. Eddie Young,...,Male,62,51-64,B-,Q100759966,016394851011,Sölzerring 05\n21768 Altötting,"Engineer, petroleum","Linda Flores, 016973980632.","['Care', 'Insurance', 'Support', 'Public', 'Tr..."
11686,VQZ422704,Today I saw a patient with a chromosomal anoma...,20181230,PFS771409,Genetic,Chromosomal Anomalies,Pharmacy/Prescription Drugs,Hormones,False,Dr. James Barber,...,Female,7,0-10,A+,L605331052,016453400773,Mielcarekplatz 1/9\n83357 Melle,Scientific laboratory technician,"Michael Weiss, 017716313028.","['Care', 'Research', 'Insurance', 'Support', '..."
11687,VUT814433,The patient presented with symptoms of persist...,20181230,PVV930449,Mental,,,,True,Dr. Eddie Young,...,Female,16,10-20,A+,G215159968,017155908480,Dussen vangasse 501\n61560 Aachen,Data processing manager,"Tammy Whitney, 017700151177.","['Care', 'Research', 'Insurance', 'Support', '..."


In [21]:
from langchain_core.documents import Document

def generate_db_docs(df, page_content_column, limit = None):
    if limit is not None:
        df = df.iloc[:limit]

    # Ensure the specified page_content_column exists in the DataFrame
    if page_content_column not in df.columns:
        raise ValueError(f"{page_content_column} does not exist in the DataFrame.")

    docs = []
    cols = list(df.columns)

    for row in df.itertuples(index=False, name=None):
        # Dynamically get the page content using the column name
        page_content_index = cols.index(page_content_column)
        page_content = row[page_content_index]

        # Construct metadata dictionary from the rest of the columns
        metadata = {col: row[cols.index(col)] for col in cols if col != page_content_column}

        # Create Document object and append to docs list
        docs.append(Document(page_content=page_content, metadata=metadata))

    return docs

docs = generate_db_docs(df, 'Description', 1000)


In [22]:
from langchain.chains.query_constructor.base import AttributeInfo

metadata_field_info = [
    AttributeInfo(
        name="Visit ID",
        description="Unique ID for patient visting a medical center.",
        type="string",
    ),
    AttributeInfo(
        name="Visit Date",
        description="The date the visit to the medical center took place. Format: YYYYMMDD.",
        type="integer",
    ),
    AttributeInfo(
        name="Patient ID",
        description="The ID corresponding to a patient. A patient might visit multiple times, therefore the ID is not unique.",
        type="string",
    ),
    AttributeInfo(
        name="Diagnosis Category", 
        description="Category for medical diagnosis.", 
        type="string"
    ),
    AttributeInfo(
        name="Diagnosis Sub Category", 
        description="Sub category for medical diagnosis.", 
        type="string"
    ),
    AttributeInfo(
        name="Treatment Category", 
        description="Category for medical treatment based on diganosis.", 
        type="string"
    ),
    AttributeInfo(
        name="Treatment Sub Category", 
        description="Sub category for medical treatment based on diganosis.", 
        type="string"
    ),
    AttributeInfo(
        name="New Patient", 
        description="A flag indicating if it is the first time a patient is visiting the medical center. 'True' if it is the first time, 'False' if not.", 
        type="boolean"
    ),
    AttributeInfo(
        name="Consulting Physician", 
        description="The name of the consulting physician. Availbale options: 'Dr. Jerry Daniels', 'Dr. Michelle Lamb', 'Dr. James Barber', 'Dr. Shelly Hunt', 'Dr. Alexandria Gaines', 'Dr. Eddie Young'.", 
        type="string"
    ),
    AttributeInfo(
        name="Patient Name", 
        description="The name of the patient seeking medical assistance.", 
        type="string"
    ),
    AttributeInfo(
        name="Patient Gender", 
        description="The gender of the patient seeking medical assistance. Available options: 'Female', 'Male', 'Other'.", 
        type="string"
    ),
    AttributeInfo(
        name="Patient Age", 
        description="The age of the patient seeking medical assistance.", 
        type="integer"
    ),
    AttributeInfo(
        name="Patient Age Range", 
        description="The age range of the patient seeking medical assistance. Available options: '0-10', '10-20', '21-30', '31-40', '41-50', '51-64', '65+'.", 
        type="string"
    ),
    AttributeInfo(
        name="Patient Blood Type", 
        description="The blood type of the patient seeking medical assistance. Available options: 'O-', 'O+', 'AB+', 'A+', 'B+', 'A-', 'B-', 'AB-'.", 
        type="string"
    ),
    AttributeInfo(
        name="Patient Insurance Number", 
        description="The insurance number of the patient seeking medical assistance.", 
        type="string"
    ),
    AttributeInfo(
        name="Patient Phone", 
        description="The moblie phone number of the patient seeking medical assistance.", 
        type="string"
    ),
    AttributeInfo(
        name="Patient Address", 
        description="The address of the patient seeking medical assistance.", 
        type="string"
    ),
    AttributeInfo(
        name="Patient Occupation", 
        description="The occupation of the patient seeking medical assistance.", 
        type="string"
    ),
    AttributeInfo(
        name="Patient Emergency Contact", 
        description="The emergency contact of the patient seeking medical assistance. The contact includes a name and an address of the emergency contact.", 
        type="string"
    ),
    AttributeInfo(
        name="Intended Purposes", 
        description="A list of intended access purposes for the data generated at the visit to the medical center. The list can contain a set of the following options: 'Care', 'Research', 'Insurance', 'Support', 'Public', 'Trial', 'Product', 'Marketing'.", 
        type="list[string]"
    ),
]


## Self-Query

In [35]:
import os

persist_directory = "./chroma_medical_db"

# Check if the persistence directory exists
if not os.path.exists(persist_directory):
    # If not, create using docs
    os.makedirs(persist_directory, exist_ok=True)
    vectorstore = Chroma.from_documents(docs, OpenAIEmbeddings(), persist_directory=persist_directory)
    print('Creating vectorstore...')
else:
    # If the directory exists, initialize Chroma to use the existing data
    vectorstore = Chroma(persist_directory=persist_directory, embedding_function=OpenAIEmbeddings())
    print('Using existing vectorstore...')

Using existing vectorstore...


In [40]:
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain_openai import ChatOpenAI

document_content_description = "A summary of of a visit at a medical center for the docters perspective. The text includes symptoms dignosis and proposed treatment."
llm = ChatOpenAI(temperature=0)
retriever = SelfQueryRetriever.from_llm(
    llm,
    vectorstore,
    document_content_description,
    metadata_field_info,
    verbose=True,
    return_intermediate_steps=True
    #enable_limit=True,
)

In [42]:
response = retriever.invoke("Get me all visits with diganosis of cancer.")
response

[Document(page_content='The patient presented with a confirmed diagnosis of skin cancer. They exhibited symptoms such as a persistent, growing skin lesion with irregular borders and changes in color. As part of their cancer treatment plan, I have recommended chemotherapy to target and destroy the cancer cells. I have discussed the potential side effects of chemotherapy with the patient and emphasized the importance of closely monitoring their response to the treatment.', metadata={'Consulting Physician': 'Dr. Jerry Daniels', 'Diagnosis Category': 'Cancer', 'Diagnosis Sub Category': 'Skin Cancer', 'Intended Purposes': "['Care', 'Research', 'Insurance', 'Support', 'Trial']", 'New Patient': True, 'Patient Address': 'Adolfine-Dobes-Platz 2/2\n06040 Freital', 'Patient Age': 61, 'Patient Age Range': '51-64', 'Patient Blood Type': 'A+', 'Patient Emergency Contact': 'Joseph Russell, 015748674955.', 'Patient Gender': 'Female', 'Patient ID': 'PZJ572884', 'Patient Insurance Number': 'G238368884',

In [44]:
response = retriever.invoke("Get me patients that allow data access for research.")
response

[Document(page_content='The patient has been diagnosed with breast cancer. They are being considered for an investigational treatment as part of their cancer treatment plan. This treatment approach involves exploring innovative therapies that are still undergoing clinical trials and research. The goal is to provide the patient with access to cutting-edge treatments that may offer promising results in their fight against breast cancer. Close monitoring and participation in research protocols will be essential throughout the course of this investigational treatment.', metadata={'Consulting Physician': 'Dr. Shelly Hunt', 'Diagnosis Category': 'Cancer', 'Diagnosis Sub Category': 'Breast Cancer', 'Intended Purposes': "['Care', 'Research', 'Insurance', 'Support']", 'New Patient': True, 'Patient Address': 'Schmiedtallee 8/5\n56041 Chemnitz', 'Patient Age': 53, 'Patient Age Range': '51-64', 'Patient Blood Type': 'B+', 'Patient Emergency Contact': 'Dr. Christopher Pennington DDS, 015314134358.'

In [45]:
response = retriever.invoke("Get me patients with blood type A-")
response

[Document(page_content='Today I met with a patient diagnosed with advanced lung cancer. We discussed the option of enrolling in an investigational treatment program as part of their cancer treatment plan. I explained the potential benefits and risks associated with this innovative approach, and the patient expressed interest in exploring this opportunity further. We will continue to closely monitor their condition and progress throughout the course of this investigational treatment.', metadata={'Consulting Physician': 'Dr. Shelly Hunt', 'Diagnosis Category': 'Cancer', 'Diagnosis Sub Category': 'Lung Cancer', 'Intended Purposes': "['Care', 'Insurance', 'Support', 'Public', 'Marketing']", 'New Patient': True, 'Patient Address': 'Kabusgasse 9/4\n03639 Ebersberg', 'Patient Age': 51, 'Patient Age Range': '51-64', 'Patient Blood Type': 'A-', 'Patient Emergency Contact': 'Michael Clark, 015224630202.', 'Patient Gender': 'Male', 'Patient ID': 'PDU358831', 'Patient Insurance Number': 'C61438042

In [49]:
response = retriever.invoke("Get me vists where patients struggle with high blood pressure.")
response

[]

### Include Query

In [50]:
from langchain.chains.query_constructor.base import (
    StructuredQueryOutputParser,
    get_query_constructor_prompt,
)

prompt = get_query_constructor_prompt(
    document_content_description,
    metadata_field_info,
)
output_parser = StructuredQueryOutputParser.from_components()
query_constructor = prompt | llm | output_parser

In [51]:
from langchain.retrievers.self_query.chroma import ChromaTranslator

retriever = SelfQueryRetriever(
    query_constructor=query_constructor,
    vectorstore=vectorstore,
    structured_query_translator=ChromaTranslator(),
)

In [52]:
print(query_constructor.invoke(
        {
            "query": "Get all patients with any kind of cancer that allow their data to be used to research."
        }
    )
)

print(retriever.invoke(
        "Get all patients with any kind of cancer that allow their data to be used to research."
    )
)

OutputParserException: Parsing text
```json
{
    "query": "cancer",
    "filter": "and(contains(\"Diagnosis Category\", \"cancer\"), in(\"Intended Purposes\", [\"Research\"]))"
}
```
 raised following error:
Received unrecognized function contains. Valid functions are [<Operator.AND: 'and'>, <Operator.OR: 'or'>, <Operator.NOT: 'not'>, <Comparator.EQ: 'eq'>, <Comparator.NE: 'ne'>, <Comparator.GT: 'gt'>, <Comparator.GTE: 'gte'>, <Comparator.LT: 'lt'>, <Comparator.LTE: 'lte'>, <Comparator.CONTAIN: 'contain'>, <Comparator.LIKE: 'like'>, <Comparator.IN: 'in'>, <Comparator.NIN: 'nin'>]

In [34]:
retriever.invoke(
    "Get all patients with cancer that allow their data to be used to research."
)

ValueError: Received disallowed comparator in. Allowed comparators are [<Comparator.EQ: 'eq'>, <Comparator.NE: 'ne'>, <Comparator.GT: 'gt'>, <Comparator.GTE: 'gte'>, <Comparator.LT: 'lt'>, <Comparator.LTE: 'lte'>]

## QA

https://github.com/insightbuilder/python_de_learners_data/blob/main/code_script_notebooks/projects/exploring_bard/selfQueryingRetriever_QAChains.ipynb

https://www.youtube.com/watch?v=J9mZdEksz3c&t=3s

In [15]:
import pandas as pd

df = pd.read_csv('./faker/dataset_3.csv', dtype={'Patient Phone': str})
# Change date to YYYYMMDD-interger 
df['Visit Date'] = pd.to_datetime(df['Visit Date']).dt.strftime('%Y%m%d').astype(int)
# Add 'source' column to match later retrievals
df['source'] = df['Visit ID']
df

Unnamed: 0,Visit ID,Description,Visit Date,Patient ID,Diagnosis Category,Diagnosis Sub Category,Treatment Category,Treatment Sub Category,New Patient,Consulting Physician,...,Patient Age,Patient Age Range,Patient Blood Type,Patient Insurance Number,Patient Phone,Patient Address,Patient Occupation,Patient Emergency Contact,Intended Purposes,source
0,VQC513203,The patient presented with chronic lower back ...,20170101,PZM508653,Chronic Pain,Vertebral Disc Problem,Pharmacy/Prescription Drugs,Non-FDA Approved Use,True,Dr. Jerry Daniels,...,43,41-50,O-,G264037622,017588215469,Pärtzeltweg 2\n22301 Neunburg vorm Wald,"Surveyor, quantity","Jennifer Bailey, 015680180768.","['Care', 'Research', 'Insurance', 'Support', '...",VQC513203
1,VVC435406,The patient presented with symptoms of fatigue...,20170101,PSN036517,Endocrine/ Metabolic,Hormone Deficiency,Pharmacy/Prescription Drugs,Hormones,True,Dr. Michelle Lamb,...,36,31-40,O+,Y133547589,015087781378,Kira-Gorlitz-Allee 8\n67100 Rosenheim,Copy,"Dennis Carlson, 015182104709.","['Care', 'Research', 'Insurance', 'Support', '...",VVC435406
2,VKT437745,The patient presented with delayed speech deve...,20170101,PPD253419,Pediatrics,Delayed Speech,Rehabilitation Services - Outpatient,Speech Therapy,True,Dr. Michelle Lamb,...,5,0-10,O+,F115599209,016368315207,Thiesstr. 3/5\n68745 Bremen,Claims inspector/assessor,"Veronica Harris, 015034485673.","['Care', 'Research', 'Insurance', 'Support', '...",VKT437745
3,VJG208744,The patient presented with symptoms of muscle ...,20170101,PBN488954,Central Nervous System/ Neuromuscular,,Pharmacy/Prescription Drugs,Non-FDA Approved Use,True,Dr. James Barber,...,52,51-64,AB+,W720918648,016217695326,Schmidtkeallee 53\n54913 Siegen,Tourism officer,"Michelle Graham, 017438105819.","['Care', 'Research', 'Insurance', 'Support', '...",VJG208744
4,VAF235393,The patient presented with a diagnosis of brea...,20170101,PJG173047,Cancer,Breast Cancer,Cancer Treatment,Surgery,True,Dr. James Barber,...,36,31-40,A+,Q198981012,017645195473,Ida-Fliegner-Ring 7/6\n53118 Wolgast,Engineering geologist,"Peter Stout, 015017527431.","['Care', 'Research', 'Insurance', 'Support', '...",VAF235393
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11684,VTL998110,"Today, I met with a patient who was diagnosed ...",20181230,PND058072,Mental,Depression,Electrical/ Thermal/ Radiofreq. Interventions,Transcranial Magnetic Stimulation,False,Dr. Alexandria Gaines,...,51,51-64,A-,Private,016976645290,Wolfram-auch Schlauchin-Ring 3/3\n94816 Neuruppin,"Research officer, political party","Emily Garrett, 016593887796.","['Care', 'Research', 'Insurance', 'Support', '...",VTL998110
11685,VLH032189,The patient was diagnosed with liver cancer af...,20181230,PPC895292,Cancer,Liver Cancer,Special Procedure,,False,Dr. Eddie Young,...,62,51-64,B-,Q100759966,016394851011,Sölzerring 05\n21768 Altötting,"Engineer, petroleum","Linda Flores, 016973980632.","['Care', 'Insurance', 'Support', 'Public', 'Tr...",VLH032189
11686,VQZ422704,Today I saw a patient with a chromosomal anoma...,20181230,PFS771409,Genetic,Chromosomal Anomalies,Pharmacy/Prescription Drugs,Hormones,False,Dr. James Barber,...,7,0-10,A+,L605331052,016453400773,Mielcarekplatz 1/9\n83357 Melle,Scientific laboratory technician,"Michael Weiss, 017716313028.","['Care', 'Research', 'Insurance', 'Support', '...",VQZ422704
11687,VUT814433,The patient presented with symptoms of persist...,20181230,PVV930449,Mental,,,,True,Dr. Eddie Young,...,16,10-20,A+,G215159968,017155908480,Dussen vangasse 501\n61560 Aachen,Data processing manager,"Tammy Whitney, 017700151177.","['Care', 'Research', 'Insurance', 'Support', '...",VUT814433


In [16]:
from langchain_core.documents import Document

def generate_db_docs(df, page_content_column, limit = None):
    if limit is not None:
        df = df.iloc[:limit]

    # Ensure the specified page_content_column exists in the DataFrame
    if page_content_column not in df.columns:
        raise ValueError(f"{page_content_column} does not exist in the DataFrame.")

    docs = []
    cols = list(df.columns)

    for row in df.itertuples(index=False, name=None):
        # Dynamically get the page content using the column name
        page_content_index = cols.index(page_content_column)
        page_content = row[page_content_index]

        # Construct metadata dictionary from the rest of the columns
        metadata = {col: row[cols.index(col)] for col in cols if col != page_content_column}

        # Create Document object and append to docs list
        docs.append(Document(page_content=page_content, metadata=metadata))

    return docs

docs = generate_db_docs(df, 'Description', 1000)


In [17]:
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings
from langchain.chains.query_constructor.base import AttributeInfo
import os

embeddings = OpenAIEmbeddings()

metadata_field_info = [
    AttributeInfo(
        name="Visit ID",
        description="Unique ID for patient visting a medical center.",
        type="string",
    ),
    AttributeInfo(
        name="Visit Date",
        description="The date the visit to the medical center took place. Format: YYYYMMDD.",
        type="integer",
    ),
    AttributeInfo(
        name="Patient ID",
        description="The ID corresponding to a patient. A patient might visit multiple times, therefore the ID is not unique.",
        type="string",
    ),
    AttributeInfo(
        name="Diagnosis Category", 
        description="Category for medical diagnosis.", 
        type="string"
    ),
    AttributeInfo(
        name="Diagnosis Sub Category", 
        description="Sub category for medical diagnosis.", 
        type="string"
    ),
    AttributeInfo(
        name="Treatment Category", 
        description="Category for medical treatment based on diganosis.", 
        type="string"
    ),
    AttributeInfo(
        name="Treatment Sub Category", 
        description="Sub category for medical treatment based on diganosis.", 
        type="string"
    ),
    AttributeInfo(
        name="New Patient", 
        description="A flag indicating if it is the first time a patient is visiting the medical center. 'True' if it is the first time, 'False' if not.", 
        type="boolean"
    ),
    AttributeInfo(
        name="Consulting Physician", 
        description="The name of the consulting physician. Availbale options: 'Dr. Jerry Daniels', 'Dr. Michelle Lamb', 'Dr. James Barber', 'Dr. Shelly Hunt', 'Dr. Alexandria Gaines', 'Dr. Eddie Young'.", 
        type="string"
    ),
    AttributeInfo(
        name="Patient Name", 
        description="The name of the patient seeking medical assistance.", 
        type="string"
    ),
    AttributeInfo(
        name="Patient Gender", 
        description="The gender of the patient seeking medical assistance. Available options: 'Female', 'Male', 'Other'.", 
        type="string"
    ),
    AttributeInfo(
        name="Patient Age", 
        description="The age of the patient seeking medical assistance.", 
        type="integer"
    ),
    AttributeInfo(
        name="Patient Age Range", 
        description="The age range of the patient seeking medical assistance. Available options: '0-10', '10-20', '21-30', '31-40', '41-50', '51-64', '65+'.", 
        type="string"
    ),
    AttributeInfo(
        name="Patient Blood Type", 
        description="The blood type of the patient seeking medical assistance. Available options: 'O-', 'O+', 'AB+', 'A+', 'B+', 'A-', 'B-', 'AB-'.", 
        type="string"
    ),
    AttributeInfo(
        name="Patient Insurance Number", 
        description="The insurance number of the patient seeking medical assistance.", 
        type="string"
    ),
    AttributeInfo(
        name="Patient Phone", 
        description="The moblie phone number of the patient seeking medical assistance.", 
        type="string"
    ),
    AttributeInfo(
        name="Patient Address", 
        description="The address of the patient seeking medical assistance.", 
        type="string"
    ),
    AttributeInfo(
        name="Patient Occupation", 
        description="The occupation of the patient seeking medical assistance.", 
        type="string"
    ),
    AttributeInfo(
        name="Patient Emergency Contact", 
        description="The emergency contact of the patient seeking medical assistance. The contact includes a name and an address of the emergency contact.", 
        type="string"
    ),
    AttributeInfo(
        name="Intended Purposes", 
        description="A list of intended access purposes for the data generated at the visit to the medical center. The list can contain a set of the following options: 'Care', 'Research', 'Insurance', 'Support', 'Public', 'Trial', 'Product', 'Marketing'.", 
        type="list[string]"
    ),
]

persist_directory = "./chroma_medical_qa_db"

# Check if the persistence directory exists
if not os.path.exists(persist_directory):
    # If not, create using docs
    os.makedirs(persist_directory, exist_ok=True)
    vectorstore = Chroma.from_documents(docs, OpenAIEmbeddings(), persist_directory=persist_directory)
    print('Creating vectorstore...')
else:
    # If the directory exists, initialize Chroma to use the existing data
    vectorstore = Chroma(persist_directory=persist_directory, embedding_function=OpenAIEmbeddings())
    print('Using existing vectorstore...')


Creating vectorstore...


In [18]:
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain_openai import ChatOpenAI

document_content_description = "A summary of of a visit at a medical center for the docters perspective. The text includes symptoms dignosis and proposed treatment."
llm = ChatOpenAI(temperature=0)
retriever = SelfQueryRetriever.from_llm(
    llm,
    vectorstore,
    document_content_description,
    metadata_field_info,
    verbose=True,
    return_intermediate_steps=True
    #enable_limit=True,
)

In [19]:
from langchain.chains import RetrievalQAWithSourcesChain

chain = RetrievalQAWithSourcesChain.from_chain_type(llm, 
                                                    chain_type="stuff", 
                                                    retriever=retriever)

In [20]:
chain({"question":"How many male patients have cancer?"}, return_only_outputs=False)

{'question': 'How many male patients have cancer?',
 'answer': 'The number of male patients with cancer is not specified in the provided content.\n',
 'sources': ''}

In [21]:
chain({"question":"What is a common symptom?"}, return_only_outputs=False)

{'question': 'What is a common symptom?',
 'answer': 'A common symptom is persistent fatigue.\n',
 'sources': 'VMD613427'}

In [22]:
chain({"question":"How many male patients visited the medical center?"}, return_only_outputs=False)

{'question': 'How many male patients visited the medical center?',
 'answer': 'There were 4 male patients who visited the medical center.\n',
 'sources': 'VYW041077, VAQ243746, VNP039940, VQB689845'}