In [None]:
#!pip3.11 install langchain pypdf openai chromadb

In [23]:
import os
from langchain.document_loaders import PyPDFLoader
from langchain.document_loaders import TextLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.llms import OpenAI
from langchain.chains import RetrievalQA

##### Load "Guide to Laboratory Tests" PDF file 

In [2]:
loader = PyPDFLoader("https://www.ampath.co.za/pdfs/Desk-Reference-web.pdf")
pages = loader.load()

# Replace "�" with "."
pages = [page.page_content.replace("�", ".") for page in pages]
len(pages)

232

##### Extract BIOCHEMISTRY and HAEMATOLOGY sections

In [3]:
def save_section(file_name, section):
  with open(f"{file_name}.txt", "w") as f:
    for page in section:
        f.write(page)

Abbreviations_and_Symbols = pages[11:18]
BIOCHEMISTRY = pages[19:66]
HAEMATOLOGY = pages[114:146]

save_section("Abbreviations_and_Symbols", Abbreviations_and_Symbols)
save_section("BIOCHEMISTRY", BIOCHEMISTRY)
save_section("HAEMATOLOGY", HAEMATOLOGY)

##### Extract and Save sub-sections as list items

In [4]:
def extract_sections_from_text(file_name, section_titles):
    with open(f"./{file_name}.txt", 'r', encoding='utf-8') as txt_file:
        lines = txt_file.readlines()

        sections = {}
        current_section = None

        for line in lines:
            for title in section_titles[:-1]:
                if title in line:
                    current_section = title
                    sections[current_section] = ""

                if current_section is not None and line not in sections[current_section]:
                    sections[current_section] += line

            # Check if the current line contains the last section title
            if current_section == section_titles[-1] and section_titles[-1] in line:
                break

    return sections

In [5]:
BIOCHEMISTRY_section_titles = ["Electrolytes and renal function", "Diagnosis of Chronic Kidney Disease (CKD)",
                  "Calcium, magnesium, phosphate, vitamin D and PTH", "Liver function tests",
                  "Pancreas", "Inflammatory markers",
                  "Cardiac and skeletal muscle markers", "Carbohydrate metabolism",
                  "Lipid metabolism", "Iron studies", "Folate and vitamin B12", "ENDOCRINOLOGY"]
BIOCHEMISTRY_subsections = extract_sections_from_text("BIOCHEMISTRY", BIOCHEMISTRY_section_titles)
len(BIOCHEMISTRY_subsections)

11

In [6]:
HAEMATOLOGY_section_titles = ["Full blood count (FBC)", "ESR (erythrocyte sedimentation rate)",
                              "Investigation of a bleeding disorder", "Disseminated intravascular coagulation (DIC) screen",
                              "Tests used in the investigation of a thrombotic tendency",
                              "Testing for the presence of a lupus anticoagulant",
                              "Monitoring of anticoagulation therapy",
                              "Bone marrow investigation", "Flow cytometry",
                              "Tests used in the investigation of a haemolytic process",
                              "Testing for inherited enzyme abnormalities",
                              "Malaria testing", "JAK2 V617F PCR", "IMMUNOLOGY"]
HAEMATOLOGY_subsections = extract_sections_from_text("HAEMATOLOGY", HAEMATOLOGY_section_titles)
len(HAEMATOLOGY_subsections)

13

##### Initialize text splitter

In [7]:
text_splitter = CharacterTextSplitter(
    chunk_size = 1500,
    chunk_overlap  = 250,
)

##### Split each sub-section into chunks

In [10]:
Abbreviations_and_Symbols_chunks = text_splitter.create_documents([('\n').join(Abbreviations_and_Symbols)])

BIOCHEMISTRY_subsections_chunks = {}

for i, d in BIOCHEMISTRY_subsections.items():
        print(f"importing section: {i}")
        BIOCHEMISTRY_subsections_chunks[i] = text_splitter.create_documents([BIOCHEMISTRY_subsections[i]])

HAEMATOLOGY_subsections_chunks = {}

for i, d in HAEMATOLOGY_subsections.items():
        print(f"importing section: {i}")
        HAEMATOLOGY_subsections_chunks[i] = text_splitter.create_documents([HAEMATOLOGY_subsections[i]])

importing section: Electrolytes and renal function
importing section: Diagnosis of Chronic Kidney Disease (CKD)
importing section: Calcium, magnesium, phosphate, vitamin D and PTH
importing section: Liver function tests
importing section: Pancreas
importing section: Inflammatory markers
importing section: Cardiac and skeletal muscle markers
importing section: Carbohydrate metabolism
importing section: Lipid metabolism
importing section: Iron studies
importing section: Folate and vitamin B12
importing section: Full blood count (FBC)
importing section: ESR (erythrocyte sedimentation rate)
importing section: Investigation of a bleeding disorder
importing section: Disseminated intravascular coagulation (DIC) screen
importing section: Tests used in the investigation of a thrombotic tendency
importing section: Testing for the presence of a lupus anticoagulant
importing section: Monitoring of anticoagulation therapy
importing section: Bone marrow investigation
importing section: Flow cytometr

##### create the embedding function

In [11]:
embedding = OpenAIEmbeddings()
Abbreviations_and_Symbols_vectors = Chroma.from_documents(Abbreviations_and_Symbols_chunks, embedding=embedding)
chain = RetrievalQA.from_chain_type(llm = OpenAI(),
                                    retriever = Abbreviations_and_Symbols_vectors.as_retriever(),
                                    chain_type="stuff")

In [13]:
query = "what is CrCl in laboratry and explain it "
chain.run(query)

Number of requested results 4 is greater than number of elements in index 1, updating n_results = 1


' CrCl stands for Creatinine Clearance, which is used to measure how well the kidneys are filtering wastes from blood. It is calculated by measuring the creatinine in urine and comparing it to the creatinine in blood.'

In [None]:
embedding = OpenAIEmbeddings()
Abbreviations_and_Symbols_with_embeddings = Chroma.from_documents(documents = Abbreviations_and_Symbols_chunks,
                                                                  embedding=embedding)
query = "what is CrCl in laboratry and explain it"
query_context = Abbreviations_and_Symbols_with_embeddings.similarity_search(query)
Abbreviations_and_Symbols_vectors = Chroma.from_documents(query_context, embedding=embedding)
chain = RetrievalQA.from_chain_type(llm = OpenAI(),
                                    retriever = Abbreviations_and_Symbols_vectors.as_retriever(),
                                    chain_type="stuff")

chain.run(query)

##### Save each sub-section data in "persist_directory"

In [25]:
persist_dir = "./persist_db/"
embedding = OpenAIEmbeddings()
Abbreviations_and_Symbols_vectordb = Chroma.from_documents(documents = Abbreviations_and_Symbols_chunks,
                                                           embedding = embedding,
                                                           persist_directory = f"{persist_dir}Abbreviations_and_Symbols")
Abbreviations_and_Symbols_vectordb.persist()

In [26]:
for key in BIOCHEMISTRY_subsections.keys():
    BIOCHEMISTRY_subsection_vectordb = Chroma.from_documents(documents = BIOCHEMISTRY_subsections_chunks[key],
                                                           embedding = embedding,
                                                           persist_directory = f"{persist_dir}BIOCHEMISTRY/{key}")
    BIOCHEMISTRY_subsection_vectordb.persist()

for key in HAEMATOLOGY_subsections.keys():
    HAEMATOLOGY_subsection_vectordb = Chroma.from_documents(documents = HAEMATOLOGY_subsections_chunks[key],
                                                           embedding = embedding,
                                                           persist_directory = f"{persist_dir}HAEMATOLOGY/{key}")
    HAEMATOLOGY_subsection_vectordb.persist()


In [32]:
vectordb = None
embedding = OpenAIEmbeddings()
vectordb = Chroma(embedding_function = embedding,
                  persist_directory = f"{persist_dir}Abbreviations_and_Symbols")

chain = RetrievalQA.from_chain_type(llm = OpenAI(),
                                    retriever = vectordb.as_retriever(),
                                    chain_type="stuff")
generated_text = chain.run(query)
generated_text

Number of requested results 4 is greater than number of elements in index 1, updating n_results = 1


" CrCl stands for Creatinine Clearance. It is a measure of how well the kidneys are filtering creatinine, which is a waste product in the blood. It is calculated by dividing the amount of creatinine in the urine by the amount of creatinine in the blood and multiplying by the patient's body surface area. It is typically used to assess kidney function."

In [33]:
print(generated_text)

 CrCl stands for Creatinine Clearance. It is a measure of how well the kidneys are filtering creatinine, which is a waste product in the blood. It is calculated by dividing the amount of creatinine in the urine by the amount of creatinine in the blood and multiplying by the patient's body surface area. It is typically used to assess kidney function.
