In [1]:
# Smoke-test cell: prints a fixed marker string.
print("start")

start


In [2]:
#%pwd

In [4]:
#%cd ..
# now I am in root project directory

In [5]:
#to upload the pdf, since pdf is in a directory(Data) I need DirectoryLoader
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
#after uploading pdf, make chunks of text
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [6]:
#Extract Data From the PDF File
#creating function that will load the pdf and return documents
def load_pdf_file(data):
    """Load the PDF files found in the ``data`` directory.

    Files are selected with the glob pattern ``*.pdf`` and each one is read
    with ``PyPDFLoader``. Other extensions are ignored by this pattern.

    Args:
        data: Path of the directory that holds the PDF files.

    Returns:
        Whatever ``DirectoryLoader.load()`` yields for the matched files.
    """
    pdf_loader = DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader)
    return pdf_loader.load()

In [7]:
# Load the PDFs from the 'Data/' directory using load_pdf_file.
# NOTE(review): 'Data/' is a relative path, so this cell depends on the
# kernel's working directory. The commented-out `%cd ..` cell above suggests
# it has to be the project root — confirm before a fresh Restart & Run All.
extracted_data=load_pdf_file(data='Data/')

In [8]:
#Splitting the Data into Text Chunks
#defining function to split the text into chunks
def text_split(extracted_data, chunk_size=500, chunk_overlap=20):
    """Split loaded documents into overlapping text chunks.

    Args:
        extracted_data: Documents to split, e.g. the result of
            ``load_pdf_file``.
        chunk_size: Target maximum size of each chunk, passed straight to
            ``RecursiveCharacterTextSplitter``. Defaults to 500, the value
            that used to be hard-coded here.
        chunk_overlap: Overlap between consecutive chunks, passed straight
            to ``RecursiveCharacterTextSplitter``. Defaults to 20, the value
            that used to be hard-coded here.

    Returns:
        The chunks produced by
        ``RecursiveCharacterTextSplitter.split_documents``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(extracted_data)

In [9]:
# Split the loaded documents into chunks and print how many were produced.
text_chunks=text_split(extracted_data)
print("Length of Text Chunks", len(text_chunks))

Length of Text Chunks 5860


In [10]:
# now I will convert chunks into vectors using HuggingFaceEmbeddings
from langchain.embeddings import HuggingFaceEmbeddings

In [11]:
#Download the Embeddings from Hugging Face
def download_hugging_face_embeddings():
    """Build the Hugging Face embedding model used throughout the notebook.

    Returns:
        A ``HuggingFaceEmbeddings`` instance configured with the
        ``sentence-transformers/all-MiniLM-L6-v2`` model.
    """
    return HuggingFaceEmbeddings(
        model_name='sentence-transformers/all-MiniLM-L6-v2',
    )

In [12]:
# Instantiate the embedding model once; it is reused below for the sanity
# check and when building the Pinecone vector store.
embeddings = download_hugging_face_embeddings()

  embeddings=HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')
  from .autonotebook import tqdm as notebook_tqdm


In [13]:
# Sanity-check the embeddings: embed a short string and print the vector length.
# The recorded output is 384, the same value passed as `dimension` when the
# Pinecone index is created further down.
query_result = embeddings.embed_query("Hello world")
print("Length", len(query_result))

Length 384


In [15]:
# Next, a Pinecone database (to store the vectors) and an OpenAI API key are needed.
# load_dotenv() is called before the keys are read from the environment —
# presumably it loads them from a local .env file (confirm).
from dotenv import load_dotenv
load_dotenv()

True

In [16]:
import os

# Read both API keys from the process environment.
# os.getenv yields None when a variable is not set.
PINECONE_API_KEY = os.getenv('PINECONE_API_KEY')
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')

In [17]:
# Instead of manually creating the index on the Pinecone site, create it from
# Python.
from pinecone.grpc import PineconeGRPC as Pinecone
from pinecone import ServerlessSpec
import os

pc = Pinecone(api_key=PINECONE_API_KEY)

index_name = "medicalbot"

# Creating an index whose name is already taken is rejected by Pinecone, so an
# unconditional create_index made this cell fail on every re-run. Only create
# the index when it does not exist yet.
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=384,  # matches the embedding length printed earlier (384)
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1",
        ),
    )

In [18]:
# Embed each chunk and upsert the embeddings into the Pinecone index.
# NOTE(review): re-running this cell presumably upserts all chunks again —
# verify whether that duplicates vectors before re-executing it against an
# index that is already populated.
from langchain_pinecone import PineconeVectorStore

docsearch = PineconeVectorStore.from_documents(
    documents=text_chunks,
    index_name=index_name,
    embedding=embeddings, 
)

In [19]:
# Display the vector store object created in the previous cell.
docsearch

<langchain_pinecone.vectorstores.PineconeVectorStore at 0x14e426530>

In [21]:
# Create a retriever over the vector store. It is configured for similarity
# search with k=3, i.e. three chunks are requested per query.
retriever = docsearch.as_retriever(search_type="similarity", search_kwargs={"k":3})

In [22]:
# Test the retriever with a sample question.
retrieved_docs = retriever.invoke("What is fever")

In [23]:
# Inspect the documents returned for the sample question.
retrieved_docs

[Document(id='7dbe00f7-ff75-4b18-8110-960f84014d77', metadata={'page': 69.0, 'page_label': '70', 'source': 'Data/Medical_book.pdf'}, page_content='fever in children. This disease is most often caused by\ntypes 3 and 7. Symptoms, which appear suddenly and\nusually disappear in less than a week, include:\n• inflammation of the lining of the eyelid (conjunctivitis)\n•f e v e r\n• sore throat (pharyngitis)\n• runny nose\n• inflammation of lymph glands in the neck (cervical\nadenitis)\nGALE ENCYCLOPEDIA OF MEDICINE 256\nAdenovirus infections\nGEM - 0001 to 0432 - A  10/22/03 1:41 PM  Page 56'),
 Document(id='ba40be16-4e4d-4db7-bd89-6224ae27a0dc', metadata={'page': 60.0, 'page_label': '61', 'source': 'Data/Medical_book.pdf'}, page_content='(38°–40°C). In addition, a general ill feeling, muscle\naches,headache, chills, and loss of appetite may be felt.\nDiagnosis\nIf lymphangitis is suspected, the person should call\nhis or her doctor immediately or go to an emergency\nroom. Acute lymphangiti

In [25]:
# Set up the LLM. In the chain built below it receives the retrieved chunks
# together with the query and generates the final answer.
# NOTE(review): no API key is passed explicitly, so the client presumably reads
# OPENAI_API_KEY from the environment — confirm.
from langchain_openai import OpenAI
llm = OpenAI(temperature=0.4, max_tokens=500)

In [26]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate


# System instructions for the answering step. The adjacent literals are
# concatenated into one string, one sentence per literal; the "{context}"
# placeholder at the end is where the retrieved chunks are inserted.
system_prompt = (
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer the question. "
    "If you don't know the answer, say that you don't know. "
    "Use three sentences maximum and keep the answer concise."
    "\n\n"
    "{context}"
)

# Two-message chat prompt: the system instructions above, followed by the
# user's question supplied through the "{input}" placeholder.
prompt = ChatPromptTemplate.from_messages([
    ("system", system_prompt),
    ("human", "{input}"),
])

In [27]:
# Build the answering chain from the LLM and the prompt, then combine it with
# the retriever so that a single call both fetches context and generates the answer.
question_answer_chain = create_stuff_documents_chain(llm, prompt)
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

In [28]:
# Run the RAG chain on a sample question and print the generated answer,
# which is stored under the "answer" key of the response.
response = rag_chain.invoke({"input": "what is paralysis?"})
print(response["answer"])



Paralysis is a condition in which a person loses the ability to move or control certain muscles in their body. It can be caused by various factors, such as damage to the nerves or spinal cord, stroke, or diseases like multiple sclerosis. Paralysis can also affect other bodily functions, such as breathing and speaking, depending on the location and severity of the paralysis. It is a serious condition that can greatly impact a person's daily life and may require ongoing medical treatment.


In [29]:
# my code is perfectly running now I will perform modular coding