In [2]:
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [3]:
# Extract data from the PDF files
def load_pdf_file(data):
    """Load the PDF files found under the ``data`` directory.

    A ``DirectoryLoader`` is pointed at ``data``, restricted to files
    matching the glob ``*.pdf`` and told to parse each one with
    ``PyPDFLoader``. The result of the loader's ``load()`` call is
    returned unchanged.
    """
    return DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader).load()

In [4]:
from langchain_community.document_loaders import PyPDFLoader

# Load the medical reference PDF with PyPDFLoader.
# NOTE(review): the path below is absolute and specific to one workspace
# layout; it will need adjusting when the notebook runs elsewhere.
# NOTE(review): load_pdf_file() defined above is not called in the cells
# visible here; this cell loads the single file directly instead.
loader = PyPDFLoader("/workspaces/Medical-Assistant-Chatbot/Data/Medical_book.pdf")
documents = loader.load()


#print(documents[0].page_content[:500])

In [6]:
from typing import List
from langchain.schema import Document

def filter_to_minimal_docs(docs: List[Document]) -> List[Document]:
    """
    Build a slimmed-down copy of every document in ``docs``.

    Each returned Document carries the original ``page_content`` and a
    metadata dict holding only the ``'source'`` key (``None`` when the
    input metadata has no such key). New Document objects are created;
    the inputs are only read.
    """
    return [
        Document(
            page_content=original.page_content,
            metadata={"source": original.metadata.get("source")},
        )
        for original in docs
    ]

In [7]:
# Alias the documents loaded from the PDF above (no copy is made).
extracted_data = documents

In [8]:
# Keep only page_content and the 'source' metadata key for each document.
minimal_docs = filter_to_minimal_docs(extracted_data)

In [9]:
# Split the data into text chunks
def text_split(minimal_docs, chunk_size=500, chunk_overlap=20):
    """Split documents into overlapping text chunks.

    Args:
        minimal_docs: Documents to split; passed straight to
            ``RecursiveCharacterTextSplitter.split_documents``.
        chunk_size: Value forwarded as the splitter's ``chunk_size``.
            Defaults to 500, the value this notebook has always used.
        chunk_overlap: Value forwarded as the splitter's
            ``chunk_overlap``. Defaults to 20.

    Returns:
        The list of chunk documents produced by the splitter.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(minimal_docs)

In [10]:
# Chunk the slimmed-down documents and report how many chunks were produced.
text_chunks=text_split(minimal_docs)
print("Length of Text Chunks", len(text_chunks))

Length of Text Chunks 5859


In [11]:
from langchain.embeddings import HuggingFaceEmbeddings
# Download the embeddings model from Hugging Face
def download_hugging_face_embeddings(model_name='sentence-transformers/all-MiniLM-L6-v2'):
    """Create a ``HuggingFaceEmbeddings`` instance for ``model_name``.

    Args:
        model_name: Model identifier passed to ``HuggingFaceEmbeddings``.
            Defaults to ``'sentence-transformers/all-MiniLM-L6-v2'``, the
            model this notebook was written against (its query vectors
            have length 384, as checked in the next cell).

    Returns:
        The constructed ``HuggingFaceEmbeddings`` object.
    """
    return HuggingFaceEmbeddings(model_name=model_name)

# Instantiate the embedding model once; later cells reuse this object.
embeddings = download_hugging_face_embeddings()

  embeddings=HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')
  from .autonotebook import tqdm as notebook_tqdm


In [12]:
# Sanity check: embed a sample query and print the vector length
# (expected to be 384 for this model).
query_result = embeddings.embed_query("Hello world")
print("Length", len(query_result))

Length 384


In [13]:
from dotenv import load_dotenv
import os
# Load variables from a .env file into the process environment
# (python-dotenv searches for the file when no path is given).
load_dotenv()

True

In [14]:
# Read both API keys from the environment in one pass; a key that is not
# set comes back as None.
PINECONE_API_KEY, GROQ_API_KEY = (
    os.environ.get(name) for name in ("PINECONE_API_KEY", "GROQ_API_KEY")
)

In [15]:
# Re-export the keys into the process environment. os.environ only accepts
# str values, so a key that was not found (None) would otherwise fail with
# an opaque TypeError; raise a message that names the missing key instead.
for _key_name, _key_value in (
    ("PINECONE_API_KEY", PINECONE_API_KEY),
    ("GROQ_API_KEY", GROQ_API_KEY),
):
    if _key_value is None:
        raise RuntimeError(
            f"{_key_name} is not set; add it to your .env file or environment."
        )
    os.environ[_key_name] = _key_value

In [16]:
from pinecone import Pinecone
# Build the Pinecone client from the API key read from the environment.
pinecone_api_key = PINECONE_API_KEY 

pc = Pinecone(api_key = pinecone_api_key)

In [17]:
# Display the client object's repr to confirm it was constructed.
pc

<pinecone.pinecone.Pinecone at 0x77f7e52a7020>

In [18]:
from pinecone import ServerlessSpec

# Name of the Pinecone index used by the rest of the notebook.
index_name = "medical-bot"  

# Create the serverless index only when it does not exist yet, so that
# re-running this cell does not attempt to create it a second time.
# dimension=384 is chosen to match the length of the embedding vectors
# checked earlier in the notebook.
if not pc.has_index(index_name): 
    pc.create_index( 
        name=index_name,
        dimension=384,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )

# Handle to the index (existing or just created).
index = pc.Index(index_name) 

In [19]:
# Embed each chunk and insert the embeddings into your Pinecone index.
import hashlib

from langchain_pinecone import PineconeVectorStore

# Derive a stable ID for every chunk from its position and content.
# Without explicit IDs each run inserts the chunks under new random IDs,
# so re-running this cell would store the whole corpus again as duplicates.
# With deterministic IDs a re-run overwrites the same vectors instead.
# The position is part of the hash so chunks with identical text (for
# example repeated page headers) still get distinct IDs.
chunk_ids = [
    hashlib.sha256(
        f"{position}:{chunk.page_content}".encode("utf-8", errors="replace")
    ).hexdigest()
    for position, chunk in enumerate(text_chunks)
]

docsearch = PineconeVectorStore.from_documents(
    documents=text_chunks,
    index_name=index_name,
    embedding=embeddings,
    ids=chunk_ids,
)

In [20]:
# Display the vector store object's repr to confirm it was created.
docsearch

<langchain_pinecone.vectorstores.PineconeVectorStore at 0x77f7e7b1d5b0>

In [21]:
# Wrap the vector store as a retriever: similarity search with k=3 results
# requested per query.
retriever = docsearch.as_retriever(search_type="similarity", search_kwargs={"k":3})

In [22]:
# Smoke-test retrieval on a sample question and display the returned documents.
retrieved_docs = retriever.invoke("What is Acne?") 
retrieved_docs

[Document(id='fc786d75-c667-42d9-9dcf-89eb696a6ce2', metadata={'source': '/workspaces/Medical-Assistant-Chatbot/Data/Medical_book.pdf'}, page_content='GALE ENCYCLOPEDIA OF MEDICINE 226\nAcne\nGEM - 0001 to 0432 - A  10/22/03 1:41 PM  Page 26'),
 Document(id='9e508b79-1e92-48e9-b0d4-5d3f04d49842', metadata={'source': '/workspaces/Medical-Assistant-Chatbot/Data/Medical_book.pdf'}, page_content='GALE ENCYCLOPEDIA OF MEDICINE 2 25\nAcne\nAcne vulgaris affecting a woman’s face. Acne is the general\nname given to a skin disorder in which the sebaceous\nglands become inflamed. (Photograph by Biophoto Associ-\nates, Photo Researchers, Inc. Reproduced by permission.)\nGEM - 0001 to 0432 - A  10/22/03 1:41 PM  Page 25'),
 Document(id='4076885c-3389-4a93-823e-b99f0ec4af2b', metadata={'source': '/workspaces/Medical-Assistant-Chatbot/Data/Medical_book.pdf'}, page_content='Acidosis see Respiratory acidosis; Renal\ntubular acidosis; Metabolic acidosis\nAcne\nDefinition\nAcne is a common skin disease 

In [23]:
from langchain_groq import ChatGroq

# Chat model accessed through Groq. The API key is read from the process
# environment; indexing with [] raises KeyError if GROQ_API_KEY is absent.
chatModel = ChatGroq(
    groq_api_key=os.environ["GROQ_API_KEY"],
    model="llama-3.3-70b-versatile"   # you can pick another model like mixtral-8x7b
)

In [24]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

In [25]:
# System instructions for the answering model. The {context} placeholder
# at the end is a template variable of the prompt built below.
system_prompt = (
    "You are an assistant for question-answering tasks. Use the following "
    "pieces of retrieved context to answer the question. If you don't know "
    "the answer, say that you don't know. Use three sentences maximum and "
    "keep the answer concise."
    "\n\n{context}"
)

# Two-message chat template: the system instructions, then the user's
# question supplied through the {input} variable.
prompt = ChatPromptTemplate.from_messages(
    [("system", system_prompt), ("human", "{input}")]
)

In [26]:
# Chain that passes the retrieved documents to the chat model through the
# prompt template defined above.
question_answer_chain = create_stuff_documents_chain(chatModel, prompt) 
# Full RAG pipeline: the retriever supplies documents to the
# question-answering chain.
rag_chain = create_retrieval_chain(retriever, question_answer_chain) 

In [27]:
# Example query: run the chain and print only the "answer" field of the result.
response = rag_chain.invoke({"input": "What is Acne?"}) 
print(response["answer"])

Acne is a common skin disease characterized by pimples on the face, chest, and back. It occurs when the pores of the skin become clogged with oil, dead skin cells, and bacteria. Acne vulgaris, also known as common acne, is the most common skin disease, affecting nearly 17 million people in the United States.


In [28]:
# Second example query; `response` from the previous cell is overwritten.
response = rag_chain.invoke({"input": "What are the medicine for Acne?"}) 
print(response["answer"])

The medicines for acne include topical drugs such as tretinoin, benzoyl peroxide, adapalene, and salicylic acid. Topical antibiotics may also be added to the treatment regimen when acne is complicated by inflammation. Additionally, some natural remedies like milk thistle, essential fatty acids, and Chinese herbal remedies like cnidium seed and honeysuckle flower are also recommended.
