## Select proper directory

In [1]:
import os
%pwd

'e:\\AIML\\BappyML\\GenAI-medical-chatbot\\Medical-Chatbot-GenerativeAI\\research'

In [2]:
os.chdir("../")
%pwd

'e:\\AIML\\BappyML\\GenAI-medical-chatbot\\Medical-Chatbot-GenerativeAI'

In [3]:
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

## helper functions:

### load pdf using langchain

In [4]:
# Extract Data from the pdf file
def load_pdf_file(data):
    loader = DirectoryLoader(
        data,
        glob="*.pdf",
        loader_cls=PyPDFLoader,
    )
    documents = loader.load()
    return documents

extracted_data = load_pdf_file(data="data/")

### split text using langchain

In [5]:
def text_split(extracted_data):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=500,
        chunk_overlap=20,
    )
    text_chunks = text_splitter.split_documents(extracted_data)
    return text_chunks

text_chunks = text_split(extracted_data)

### Create Embeddings using langchain HuggingfaceEmbedding class

Model used all-MiniLM-L6

In [6]:
!pip install sentence_transformers -q
!pip install huggingface_hub -q

In [7]:
from langchain.embeddings import HuggingFaceEmbeddings
def download_embedding():
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    return embeddings

embeddings = download_embedding()

  embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
  from .autonotebook import tqdm as notebook_tqdm


In [8]:
# length of embeddings output
query_result = embeddings.embed_query("What is a heart disease?")
print("Length", len(query_result))

Length 384


## Create Vector Indexing using Pinecone

In [9]:
!pip install pinecone[grpc] -q

In [10]:
from pinecone.grpc import PineconeGRPC as Pinecone
from pinecone import ServerlessSpec
from dotenv import load_dotenv
import time

load_dotenv()

PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
os.environ["PINECONE_API_KEY"] = PINECONE_API_KEY

pc = Pinecone(api_key= PINECONE_API_KEY)

index_name = "testbot"

if not pc.has_index(index_name):
    pc.create_index(
        name=index_name,
        dimension=384,
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws", 
            region="us-east-1"
        ) 
    ) 

# Wait for the index to be ready
while not pc.describe_index(index_name).status['ready']:
    time.sleep(1)

In [17]:
# Embed each chunk and upsert the embedding into your Pinecone index
from langchain_pinecone import PineconeVectorStore

# docsearch = PineconeVectorStore.from_documents(
#     documents=text_chunks, 
#     embedding=embeddings, 
#     index_name=index_name)

In [13]:
from langchain_pinecone import PineconeVectorStore

# Existing Index
docsearch = PineconeVectorStore.from_existing_index(
    embedding=embeddings, 
    index_name=index_name)

In [14]:
docsearch

<langchain_pinecone.vectorstores.PineconeVectorStore at 0x1d7b31458a0>

In [15]:
retriever = docsearch.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 3}
)

In [16]:
retrieved_docs = retriever.get_relevant_documents("What is a heart disease?")
print(retrieved_docs)

  retrieved_docs = retriever.get_relevant_documents("What is a heart disease?")


[Document(id='4f0706d5-11c3-4c66-8afd-f0d38b59e1a4', metadata={'creationdate': '2006-10-16T20:19:33+02:00', 'creator': 'Adobe Acrobat 6.0', 'moddate': '2006-10-16T22:03:45+02:00', 'page': 763.0, 'page_label': '734', 'producer': 'PDFlib+PDI 6.0.3 (SunOS)', 'source': 'data\\Gale_encyclopedia.pdf', 'total_pages': 4505.0}, page_content='Definition\nCardiomyopathy is a chronic disease of the heart\nmuscle (myocardium), in which the muscle is abnor-\nmally enlarged, thickened, and/or stiffened. The wea-\nkened heart muscle loses the ability to pump blood\neffectively, resulting in irregular heartbeats (arrhyth-\nmias) and possibly evenheart failure.\nDescription\nCardiomyopathy, a disease of the heart muscle,\nprimarily affects the left ventricle, which is the main\npumping chamber of the heart. The disease is often'), Document(id='cee615d1-a251-4016-bb35-f82c9b9baf90', metadata={'creationdate': '2006-10-16T20:19:33+02:00', 'creator': 'Adobe Acrobat 6.0', 'moddate': '2006-10-16T22:03:45+02:0

## Load Chat Model

In [19]:
from langchain.chat_models import ChatOllama
llm = ChatOllama(temperature=0.4, max_tokens=600, model="llama2")

## Create a chain

In [None]:
from langchain.chains.retrieval import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

system_prompt = (
    "You are an assistant for a question answering system. "
    "Use the following pieces of context to answer the user's question."
    "If you don't know the answer, just say that you don't know, don't try to make up an answer."
    "Answer correctly."
    "\n\n"
    "{context}"
    "\n\n"
)

prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}"),
    ]
)

## Testing

In [21]:
question_answer_chain = create_stuff_documents_chain(llm, prompt)
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

In [22]:
response = rag_chain.invoke({"input": "What is Acne?"})
print(response["answer"])

According to the provided context, acne is a common skin disease characterized by pimples on the face, chest, and back. It occurs when the pores of the skin become clogged with oil, dead skin cells, and bacteria. The definition provided in the context states that acne is a condition where the sebaceous glands become inflamed, leading to pimples and other skin problems.


In [24]:
response = rag_chain.invoke({"input": "What is Statistics?"})
print(response["answer"])

Thank you for your question! I'm happy to help you understand statistics.

Statistics is a branch of mathematics that deals with the collection, analysis, interpretation, and presentation of data. It involves the use of mathematical techniques to summarize, visualize, and draw conclusions from data. The main goal of statistics is to extract meaningful information from data, make predictions, and support decision-making in various fields such as business, healthcare, social sciences, and many more.

In your question, you mentioned the Stanford-Binet scale, which is a widely used intelligence test. The standard deviation of 16 in this case indicates how far scores are distributed above or below the mean score of 100. A higher standard deviation means that scores are more spread out, while a lower standard deviation means that scores are closer to the mean.

In statistics, we use measures such as the mean and standard deviation to understand the distribution of data. For example, in a nor

In [23]:
response = rag_chain.invoke({"input": "What is Heart Desease?"})
print(response["answer"])

Based on the provided context, I can answer your question as follows:

Heart disease refers to any condition that affects the heart's function or structure. The context you provided mentions several types of heart conditions, including:

1. Ischemia: a condition where the heart muscle receives an insufficient supply of blood and slowly starves.
2. Mitral stenosis: narrowing or constricting of the mitral valve, which separates the left atrium from the left ventricle.
3. Pulmonary edema: fluid accumulation in the lungs and respiratory system.
4. Heart murmur: a sound during the heartbeat caused by a heart valve that does not close properly.
5. Rheumatic heart disease: a condition caused by a streptococcus infection that can result in permanent heart damage.
6. Mitral valve stenosis: narrowing or constricting of the mitral valve, which separates the left atrium from the left ventricle.

These conditions can affect the heart's ability to pump blood efficiently, leading to symptoms such as 