In [1]:
import os

# Move to the parent directory of wherever the notebook was started.
# A bare relative os.chdir("../") climbs one more level every time this cell
# is re-run, so the starting directory is remembered on the first run and the
# target is always derived from it. The first run behaves exactly as before.
if "NOTEBOOK_START_DIR" not in globals():
    NOTEBOOK_START_DIR = os.getcwd()
os.chdir(os.path.join(NOTEBOOK_START_DIR, os.pardir))

In [2]:
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
def load_pdf_files(data):
    """Load the PDF files of a directory.

    Args:
        data: Directory path handed to ``DirectoryLoader``.

    Returns:
        The result of ``DirectoryLoader.load()`` for files matching the
        ``*.pdf`` glob, each read through ``PyPDFLoader``.
    """
    pdf_loader = DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader)
    return pdf_loader.load()

In [None]:
# Load the PDFs from the "data" directory (relative to the current working directory).
extracted_data = load_pdf_files("data")

In [None]:
from typing import List
from langchain.schema import Document
def filter_to_minimal_docs(docs: List[Document]) -> List[Document]:
    """Copy documents, keeping only the page text and the ``source`` metadata.

    Args:
        docs: Documents to reduce.

    Returns:
        A new list, in input order, of new ``Document`` objects whose
        metadata is exactly ``{"source": ...}``. The value is ``None`` when
        the input metadata has no ``"source"`` key.
    """
    return [
        Document(
            page_content=original.page_content,
            metadata={"source": original.metadata.get("source")},
        )
        for original in docs
    ]

In [None]:
# Strip every loaded document down to its text plus the "source" metadata key.
minimal_docs = filter_to_minimal_docs(extracted_data)

In [None]:
def text_split(minimal_docs, chunk_size=500, chunk_overlap=20):
    """Split documents into smaller chunks.

    Args:
        minimal_docs: Documents to split; passed unchanged to
            ``RecursiveCharacterTextSplitter.split_documents``.
        chunk_size: ``chunk_size`` given to the splitter. Defaults to 500,
            the value that used to be hard-coded.
        chunk_overlap: ``chunk_overlap`` given to the splitter. Defaults to
            20, the value that used to be hard-coded.

    Returns:
        The chunks returned by ``split_documents``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(minimal_docs)

In [None]:
# Chunk the reduced documents; these chunks are what the commented-out upload cell would index.
texts_chunk = text_split(minimal_docs)

In [11]:
from langchain.embeddings import HuggingFaceEmbeddings

def download_embeddings():
    """Create the embedding model object.

    Returns:
        A ``HuggingFaceEmbeddings`` instance configured with the
        ``sentence-transformers/all-MiniLM-L6-v2`` model name.
    """
    # NOTE(review): the recorded cell output echoes this constructor call,
    # which is how Python displays a warning. TODO: confirm whether this
    # import path is deprecated in the installed LangChain version.
    return HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Instantiate the embedding model once; the vector store cell reuses this object.
embeddings = download_embeddings()

  embeddings = HuggingFaceEmbeddings(


In [12]:
from dotenv import load_dotenv
# Populate the process environment via python-dotenv before the API keys are read.
# The call is the last expression of the cell, so its return value is displayed.
load_dotenv()

True

In [13]:
def _require_env(name):
    """Return the value of environment variable ``name``.

    Raises:
        RuntimeError: If the variable is unset or empty. This replaces the
            opaque ``TypeError`` that ``os.environ[name] = None`` used to
            raise when a key was missing.
    """
    value = os.getenv(name)
    if not value:
        raise RuntimeError(
            f"Environment variable {name} is not set; add it to your .env file "
            "or export it before running this notebook."
        )
    return value


# The values are read from os.environ, so writing them back to os.environ is a
# no-op and has been dropped; only validation is needed here.
# "PINECOIN" is a historical misspelling kept because other cells use the name;
# PINECONE_API_KEY is the correctly spelled alias.
PINECOIN_API_KEY = _require_env("PINECONE_API_KEY")
PINECONE_API_KEY = PINECOIN_API_KEY
OPENAI_API_KEY = _require_env("OPENAI_API_KEY")

In [None]:
from pinecone import Pinecone
# Reuse the key read from the environment (the source variable is spelled "PINECOIN").
pinecone_api_key = PINECOIN_API_KEY

# Pinecone client; in the visible cells it is referenced only by commented-out index-creation code.
pc = Pinecone(api_key=pinecone_api_key)

In [8]:
# Import needed only by the commented-out index creation further down.
# from pinecone import ServerlessSpec

# Name of the Pinecone index that the vector store cell connects to.
index_name = "medical-chatbot"

# One-time index creation, kept commented out.
# NOTE(review): presumably disabled because the index already exists; confirm
# before re-enabling. The dimension must match the embedding model's output
# size (TODO confirm 384 for the model in use).
# if not pc.has_index(index_name):
#     pc.create_index(
#         name=index_name,
#         dimension=384,
#         metric = "cosine",
#         spec=ServerlessSpec(
#             cloud = "aws",
#             region = "us-east-1",
#         )
#     )
# index = pc.Index(index_name)

In [None]:
# from langchain_pinecone import PineconeVectorStore
# docsearch = PineconeVectorStore.from_documents(
#     documents = texts_chunk,
#     embedding = embeddings,
#     index_name = index_name
# )

In [14]:
from langchain_pinecone import PineconeVectorStore
# Connect to the existing Pinecone index instead of uploading documents again.
docsearch = PineconeVectorStore.from_existing_index(index_name=index_name, embedding=embeddings)

In [15]:
# Similarity-search retriever configured with k=3.
retriever = docsearch.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 3},
)

In [16]:
# Smoke-test the retriever with a sample question; the bare name on the last line displays the result.
# NOTE(review): "retreived" is a misspelling of "retrieved"; the name is left as-is in case other cells use it.
retreived_docs = retriever.invoke("What is diabetes?") 
retreived_docs

[Document(id='ac6c59b2-75b5-46b2-aa15-e4b3ad93580f', metadata={'source': 'data\\Medical_book.pdf'}, page_content='begin to fall. A person with diabetes mellitus either does\nnot make enough insulin, or makes insulin that does not\nwork properly. The result is blood sugar that remains\nhigh, a condition called hyperglycemia.\nDiabetes must be diagnosed as early as possible. If\nleft untreated, it can damage or cause failure of the eyes,\nkidneys, nerves, heart, blood vessels, and other body\norgans. Hypoglycemia, or low blood sugar, may also be\ndiscovered through blood sugar testing. Hypoglycemia is'),
 Document(id='ceca3b47-03f4-45a3-a7b0-6cf5af4a2fd7', metadata={'source': 'data\\Medical_book.pdf'}, page_content='Resources\nBOOKS\nBerkow, Robert, ed. The Merck Manual of Medical Informa-\ntion: Home Edition. Whitehouse Station, NJ: Merck &\nCo., Inc., 1997.\nKEY TERMS\nAplastic —Exhibiting incomplete or faulty devel-\nopment.\nDiabetes mellitus —A disorder of carbohydrate\nmetabolism b

In [17]:
from langchain_openai import ChatOpenAI
# Chat model used to generate the final answers.
chatModel = ChatOpenAI(model="gpt-4o")

In [18]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents.stuff import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

In [None]:
# System instructions, one per line. "{context}" is a template placeholder
# and must stay literal here.
system_prompt = "\n".join(
    [
        "You are a helpful medical assistant. Use the following context to answer the question.",
        "If you don't know the answer, just say you don't know. Do not try to make up an answer.",
        "Never use anything outside of the context to answer the question.",
        "Context: {context}",
    ]
)

# Two-message chat template: the system instructions, then the user's question via "{input}".
prompt = ChatPromptTemplate.from_messages(
    [("system", system_prompt), ("human", "{input}")]
)

In [20]:
# Build the answering chain from the chat model and the prompt, then wrap it
# together with the retriever into a single retrieval chain.
# NOTE(review): presumably the retrieved documents fill the prompt's "{context}" placeholder; confirm against the LangChain docs.
question_answer_chain = create_stuff_documents_chain(chatModel, prompt)
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

In [21]:
# Run the retrieval chain on a sample question and print only the "answer" entry of the result.
response = rag_chain.invoke({"input": "what is gigantism?"})
print(response["answer"])

Gigantism is a condition that occurs in children when there is an abnormal release of growth hormone (GH) from the pituitary gland in the brain, leading to exceptional growth of long bones. This results in excessive height and size due to the continued growth of the skeleton, as the bony growth plates have not yet closed.
