In [1]:
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [28]:
# Extract the contents of the PDF files in a directory
def load_pdf_file(data):
    """Load the PDF files found in the ``data`` directory.

    Args:
        data: Path of the directory to scan. Files matching the ``*.pdf``
            glob are read with ``PyPDFLoader``.

    Returns:
        The result of ``DirectoryLoader.load()`` for that directory.
    """
    pdf_loader = DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader)
    return pdf_loader.load()


In [29]:
# Read every PDF in the relative `Data/` directory.
extracted_data = load_pdf_file(data="Data/")

In [30]:
# Split the Data into Text Chunks
def text_split(extracted_data, chunk_size=500, chunk_overlap=20):
    """Split loaded documents into smaller chunks.

    Args:
        extracted_data: Documents to split, as returned by ``load_pdf_file``.
        chunk_size: Value forwarded to ``RecursiveCharacterTextSplitter`` as
            ``chunk_size``. Defaults to 500, the value previously hard-coded.
        chunk_overlap: Value forwarded as ``chunk_overlap``. Defaults to 20,
            the value previously hard-coded.

    Returns:
        The result of ``RecursiveCharacterTextSplitter.split_documents``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(extracted_data)

In [31]:
# Chunk the loaded documents and report how many chunks were produced.
text_chunks = text_split(extracted_data)
print("Length of Text Chunks", len(text_chunks))

Length of Text Chunks 8706


In [None]:

def load_pdf_file(data_path):
    """Read the ``*.pdf`` files in ``data_path`` with ``PyPDFLoader``.

    Args:
        data_path: Directory handed to ``DirectoryLoader``.

    Returns:
        The result of ``DirectoryLoader.load()``.
    """
    return DirectoryLoader(data_path, glob="*.pdf", loader_cls=PyPDFLoader).load()

In [6]:
import os
# Move the working directory one level up so relative paths such as 'Data/'
# are resolved from there. NOTE: the path is relative to the current working
# directory, so re-running this cell moves up one more level each time.
os.chdir("../")

In [7]:
# Pass the directory positionally. The `load_pdf_file` definition in effect at
# this point names its parameter `data_path`, so the keyword form `data=...`
# raises TypeError; a positional argument works with either definition.
extracted_data = load_pdf_file('Data/')

In [None]:
from dotenv import load_dotenv
from langchain_openai import OpenAIEmbeddings

# Load variables from `.env` before creating the client. No API key is passed
# explicitly here, so the key has to be available in the environment when
# `OpenAIEmbeddings()` is constructed. On a fresh kernel that is only true if
# `.env` has been loaded (or the variable is already exported).
load_dotenv()
embeddings = OpenAIEmbeddings()

In [9]:
# Build the OpenAI embeddings client (this uses OpenAI, not Hugging Face)
def download_OpenAIEmbeddings():
    """Return a new ``OpenAIEmbeddings`` instance created with default arguments."""
    return OpenAIEmbeddings()

In [10]:
# Bind `embeddings` to a client built by the helper; later cells use this name.
embeddings = download_OpenAIEmbeddings()

In [11]:
# Display the client's repr to inspect its configuration (model, retries, ...).
embeddings

OpenAIEmbeddings(client=<openai.resources.embeddings.Embeddings object at 0x0000017B758BBA90>, async_client=<openai.resources.embeddings.AsyncEmbeddings object at 0x0000017B67490FD0>, model='text-embedding-ada-002', dimensions=None, deployment='text-embedding-ada-002', openai_api_version=None, openai_api_base=None, openai_api_type=None, openai_proxy=None, embedding_ctx_length=8191, openai_api_key=SecretStr('**********'), openai_organization=None, allowed_special=None, disallowed_special=None, chunk_size=1000, max_retries=2, request_timeout=None, headers=None, tiktoken_enabled=True, tiktoken_model_name=None, show_progress_bar=False, model_kwargs={}, skip_empty=False, default_headers=None, default_query=None, retry_min_seconds=4, retry_max_seconds=20, http_client=None, http_async_client=None, check_embedding_ctx_length=True)

In [22]:
# Embed a short probe string and print the vector length; this is the value
# the Pinecone index `dimension` has to match.
query_result = embeddings.embed_query("Hi")
print("Length", len(query_result))

Length 1536


In [13]:
from dotenv import load_dotenv
# Load variables from a `.env` file into the process environment; the call's
# return value is shown as the cell output.
load_dotenv()

True

In [14]:
# Read the service credentials from the process environment.
PINECONE_API_KEY = os.environ.get('PINECONE_API_KEY')
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY')

# `os.environ.get` returns None for an unset variable. Stop here with a clear
# message instead of letting a None key cause a less obvious failure later.
missing_keys = [
    key_name
    for key_name, key_value in (
        ("PINECONE_API_KEY", PINECONE_API_KEY),
        ("OPENAI_API_KEY", OPENAI_API_KEY),
    )
    if not key_value
]
if missing_keys:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(missing_keys)
    )

In [24]:
import os

from pinecone import Pinecone, ServerlessSpec

# Pinecone client authenticated with the key read from the environment.
pc = Pinecone(api_key=PINECONE_API_KEY)

In [25]:


index_name = "medicalbot"

# Only create the index when it does not exist yet. Creating an index whose
# name is already taken fails, which made this cell impossible to re-run.
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=1536,  # must equal the embedding vector length (1536 for the model used here)
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1",
        ),
    )

In [26]:
import os
# Write the keys back into the process environment under the same names they
# were read from. NOTE: assigning None to `os.environ[...]` raises TypeError,
# so both values must be set before this cell runs.
os.environ["PINECONE_API_KEY"] = PINECONE_API_KEY
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

In [32]:
# Embed each chunk and upsert the embeddings into your Pinecone index.
from langchain_pinecone import PineconeVectorStore

docsearch = PineconeVectorStore.from_documents(
    documents=text_chunks,
    index_name=index_name,
    embedding=embeddings, 
)

In [33]:
# Load the existing index: attach a vector store to the index that is already
# there instead of uploading the documents again.
from langchain_pinecone import PineconeVectorStore

docsearch = PineconeVectorStore.from_existing_index(
    embedding=embeddings,
    index_name=index_name,
)

In [34]:
# Display the vector store object to confirm it was created.
docsearch

<langchain_pinecone.vectorstores.PineconeVectorStore at 0x17b706999c0>

In [35]:
# Similarity-search retriever over the vector store, configured with k=3.
retriever = docsearch.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 3},
)

In [36]:
# Display the retriever's repr to check its tags and search settings.
retriever

VectorStoreRetriever(tags=['PineconeVectorStore', 'OpenAIEmbeddings'], vectorstore=<langchain_pinecone.vectorstores.PineconeVectorStore object at 0x0000017B706999C0>, search_kwargs={'k': 3})

In [37]:
# Smoke-test retrieval with a sample question.
retrieved_docs = retriever.invoke("What is Acne?")

In [38]:
# Display the retrieved documents (ids, metadata and page content).
retrieved_docs

[Document(id='1699655f-e2ee-41f9-a19a-b9d55f84c223', metadata={'page': 624.0, 'source': 'Data\\Gale Encyclopedia of Medicine Vol. 4 (N-S).pdf'}, page_content='followed by a rash.\nKEY TERMS\nBlackhead —A plug of fatty cells capped with a\nblackened mass.\nErythema—A diffuse red and inflamed area of the\nskin.\nPapule—A small hard elevation of the skin.\nPustule—A small pus-filled elevation of the skin.\nRetinoid—A synthetic vitamin A derivative used in\nthe treatment of a variety of skin disorders.\nRhinophyma —Long-term swelling and over-\ngrowth in skin tissue of the nose that leaves it with\na knobby bulb-like look.'),
 Document(id='b3bac283-dc3c-4079-b751-08518882315a', metadata={'page': 526.0, 'source': 'Data\\Gale Encyclopedia of Medicine Vol. 4 (N-S).pdf'}, page_content='will come and go, but usually responds to a good clean-\ning after meals. About a third of all infants develop\n“acne” usually after the third week of life in response to\ntheir mothers’ hormones before birth. T

In [39]:
from langchain_openai import OpenAI

# LLM used to generate the answers.
llm = OpenAI(
    max_tokens=200,
    temperature=0.4,
)

In [40]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

# System instructions for the model; the `{context}` placeholder marks where
# the retrieved documents go.
system_prompt = (
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer "
    "the question. If you don't know the answer, say that you "
    "don't know. Use three sentences maximum and keep the "
    "answer concise."
    "\n\n"
    "{context}"
)

# Two-message chat prompt: the system instructions, then the user's question,
# which fills the `{input}` placeholder.
prompt = ChatPromptTemplate.from_messages(
    [("system", system_prompt), ("human", "{input}")]
)

In [41]:
# Combine the LLM and prompt into a question-answering chain, then wrap it
# with the retriever to form the full retrieval chain.
question_answer_chain = create_stuff_documents_chain(llm, prompt)
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

In [42]:
# Ask a sample question; the generated text is stored under the "answer" key.
response = rag_chain.invoke({"input": "what is Acromegaly and gigantism?"})
print(response["answer"])



Acromegaly and gigantism are both conditions that result from abnormalities in the pituitary gland, specifically in the production of growth hormone. Acromegaly is characterized by excessive growth in the bones and soft tissues, while gigantism is the excessive growth of the entire body, including the bones, muscles, and organs. Both conditions can lead to serious health complications if left untreated.


In [47]:
# Ask a second sample question and print the generated answer.
response = rag_chain.invoke({"input": "What is Lesion?"})
print(response["answer"])



Lesion is a tissue disruption or loss of function caused by a disease process. It can also refer to an abnormal growth or area of skin that does not resemble the surrounding skin.
