In [1]:
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [2]:
#Extract the data from the PDF file

def load_pdf_file(data):
    loader = DirectoryLoader(data,
                             glob="*pdf",
                             loader_cls=PyPDFLoader
                            )
    documents = loader.load()
    return documents

In [3]:
extracted_data = load_pdf_file(data = "../data/")

In [4]:
#Split our data into text chunks
def text_split(extracted_data):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500,
                                                    chunk_overlap=20
                                                    )
    text_chunks = text_splitter.split_documents(extracted_data)
    return text_chunks

In [None]:
extracted_data = load_pdf_file(data="../data/")
print(f"Number of documents loaded: {len(extracted_data)}")
print(f"Sample document: {extracted_data[0] if extracted_data else 'No documents found'}")

Length of text chunks:  6973


In [6]:
from langchain.embeddings import HuggingFaceEmbeddings

In [7]:
#download the embeddding from huggingface
def download_embeddings():
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    return embeddings 

In [8]:
embeddings = download_embeddings()

  embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
  from .autonotebook import tqdm as notebook_tqdm


In [9]:
query_result = embeddings.embed_query("What is Acne?")
print("Query length: ", len(query_result))

Query length:  384


In [10]:
from dotenv import load_dotenv
import os
load_dotenv()

True

In [11]:
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
OPEN_AI_API_KEY = os.environ.get("OPEN_AI_API_KEY")
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")

In [44]:
from pinecone.grpc import PineconeGRPC as Pinecone
from pinecone import ServerlessSpec
index_name = "medikbot-index"
pc = Pinecone(
    api_key=PINECONE_API_KEY,
)

pc.create_index(
    name=index_name,
    dimension=384,
    metric="cosine",
    spec=ServerlessSpec(
        cloud="aws",
        region="us-east-1"
    ) 
)

KeyboardInterrupt: 

In [13]:
import os
os.environ["PINECONE_API_KEY"] = PINECONE_API_KEY
os.environ["OPEN_AI_API_KEY"] = OPEN_AI_API_KEY
os.environ["GOOGLE_API_KEY"] = GOOGLE_API_KEY

In [None]:
from langchain_pinecone import PineconeVectorStore


docsearch = PineconeVectorStore.from_documents(
    documents=text_chunks,
    embedding=embeddings,  
    index_name=index_name
)


KeyboardInterrupt: 

In [15]:
#Load existing Index

from langchain_pinecone import Pinecone
docsearch = Pinecone.from_existing_index(
    index_name=index_name,      
    embedding=embeddings
)

In [16]:
docsearch

<langchain_pinecone.vectorstores.Pinecone at 0x1ffb3879960>

In [17]:
retriever = docsearch.as_retriever(search_type="similarity", search_kwargs={"k": 3})

In [18]:
retrieved_docs = retriever.invoke("What is Acne?")

In [19]:
retrieved_docs

[Document(id='cb08f8be-bf07-482a-9b3c-4c372543573c', metadata={'author': '', 'creationdate': '2017-05-01T10:37:35-07:00', 'creator': '', 'keywords': '', 'moddate': '2017-05-01T10:37:35-07:00', 'page': 425.0, 'page_label': '426', 'producer': 'GPL Ghostscript 9.10', 'source': '..\\data\\medical_book.pdf', 'subject': '', 'title': '', 'total_pages': 759.0}, page_content='Corticosteriod —A group of synthetic hormones\nthat are used to prevent or reduce inflammation.\nToxic effects may result from rapid withdrawal after\nprolonged use or from continued use of large doses.\nPatch test—A skin test that is done to identify aller-\ngens. A suspected substance is applied to the skin.\nAfter 24–48 hours, if the area is red and swollen,\nthe test is positive for that substance. If no reaction\noccurs, another substance is applied. This is con-'),
 Document(id='4f733a88-30dd-4c28-8226-4ed06afa05da', metadata={'author': '', 'creationdate': '2017-05-01T10:37:35-07:00', 'creator': '', 'keywords': '', '

In [26]:
from langchain_google_genai import ChatGoogleGenerativeAI

llm = ChatGoogleGenerativeAI(
    model="gemini-2.0-flash",
    temperature=0.4,
    max_output_tokens=500,
    google_api_key=GOOGLE_API_KEY
)



In [31]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

system_prompt = (
    "You are an assistant for question-answering tasks"
    "Use the following pieces of retrived contexts to answer"
    "the question, If you don't know the answer say, you don't know. Use three sentences in order to keep the answers concise"
    "\n\n"
    "{context}"
)

prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("user", "Question: {input}"),  # Changed {question} to {input}
    ]
)
question_answer_chain = create_stuff_documents_chain(llm, prompt)
retrieval_chain = create_retrieval_chain(retriever, question_answer_chain)

In [28]:
question_answer_chain = create_stuff_documents_chain(llm, prompt)
retrieval_chain = create_retrieval_chain(retriever,question_answer_chain)

In [43]:
response = retrieval_chain.invoke({"input": "what do you know about things similar  to malaria"})
print(response["answer"])

Bed nets and insect repellents can help avoid contact with bugs. Blood screening is not always effective in areas where infection is common. It is important to carefully screen people who have moved from Central and South America before they donate blood.
