In [14]:
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Pinecone
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.llms import CTransformers
from pinecone import Pinecone, ServerlessSpec
from langchain_pinecone import PineconeVectorStore

In [3]:
# Extract the data from PDF
def load_pdf(data):
    loader = DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader)
    documents = loader.load()
    return documents

In [4]:
extracted_data = load_pdf("../data/")

In [5]:
# Create Text Chunks
def text_split(extracted_data):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=20)
    text_chunks = text_splitter.split_documents(extracted_data)
    return text_chunks

In [6]:
text_chunks = text_split(extracted_data)
print("Length of my chunks:", len(text_chunks))

Length of my chunks: 5859


In [7]:
# Download embedding model
def download_hugging_face_embeddings():
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    return embeddings

In [8]:
embeddings = download_hugging_face_embeddings()

  embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")


In [9]:
embeddings

HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
), model_name='sentence-transformers/all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={}, multi_process=False, show_progress=False)

In [10]:
query_result = embeddings.embed_query("hello world")
print("Dimention Length: ", len(query_result))

Dimention Length:  384


In [19]:
# Step1 Create Pinecone client instance
pc = Pinecone(api_key="pcsk_4APhah_9hT1yGGEnY1MRXmJH5Cqf1gpxF1qC8UbrKfsuew3fBsyufWCQZ6VNQEjUdV7YJf")  # Replace with your actual API key

In [21]:
pc.list_indexes()

[
    {
        "name": "testing-v3",
        "metric": "cosine",
        "host": "testing-v3-k9cmw6y.svc.aped-4627-b74a.pinecone.io",
        "spec": {
            "serverless": {
                "cloud": "aws",
                "region": "us-east-1"
            }
        },
        "status": {
            "ready": true,
            "state": "Ready"
        },
        "vector_type": "dense",
        "dimension": 1536,
        "deletion_protection": "disabled",
        "tags": null
    },
    {
        "name": "medical-chatbot-test",
        "metric": "cosine",
        "host": "medical-chatbot-test-k9cmw6y.svc.aped-4627-b74a.pinecone.io",
        "spec": {
            "serverless": {
                "cloud": "aws",
                "region": "us-east-1"
            }
        },
        "status": {
            "ready": true,
            "state": "Ready"
        },
        "vector_type": "dense",
        "dimension": 384,
        "deletion_protection": "disabled",
        "tags": null,
   

In [None]:
pc.delete_index("medical-chatbot-test-1")

NotFoundException: (404)
Reason: Not Found
HTTP response headers: HTTPHeaderDict({'content-type': 'text/plain; charset=utf-8', 'access-control-allow-origin': '*', 'vary': 'origin,access-control-request-method,access-control-request-headers', 'access-control-expose-headers': '*', 'x-pinecone-api-version': '2025-01', 'x-cloud-trace-context': '07dabbf92ecf445a702fa47772ab72ce', 'date': 'Wed, 21 May 2025 05:08:56 GMT', 'server': 'Google Frontend', 'Content-Length': '97', 'Via': '1.1 google', 'Alt-Svc': 'h3=":443"; ma=2592000,h3-29=":443"; ma=2592000'})
HTTP response body: {"error":{"code":"NOT_FOUND","message":"Resource medical-chatbot-test-2 not found"},"status":404}


In [24]:
# Step 2: Create index if it doesn't exist
pc.create_index(
    name="medical-chatbot-test-1",
    dimension=384,
    metric="cosine",
    spec=ServerlessSpec(
        cloud="aws",
        region="us-east-1"
    )
)

{
    "name": "medical-chatbot-test-1",
    "metric": "cosine",
    "host": "medical-chatbot-test-1-k9cmw6y.svc.aped-4627-b74a.pinecone.io",
    "spec": {
        "serverless": {
            "cloud": "aws",
            "region": "us-east-1"
        }
    },
    "status": {
        "ready": true,
        "state": "Ready"
    },
    "vector_type": "dense",
    "dimension": 384,
    "deletion_protection": "disabled",
    "tags": null
}

In [25]:
# Step 3: Connect to existing index
index = pc.Index("medical-chatbot-test")

In [26]:
import os
os.environ["PINECONE_API_KEY"] = "pcsk_4APhah_9hT1yGGEnY1MRXmJH5Cqf1gpxF1qC8UbrKfsuew3fBsyufWCQZ6VNQEjUdV7YJf"

In [27]:
# Creating Embeddings for each for the text chunks and storing
docsearch = PineconeVectorStore.from_texts([t.page_content for t in text_chunks], embedding=embeddings, index_name="medical-chatbot-test")

In [29]:
# If we already have an index we can load it like this
docsearch = PineconeVectorStore.from_existing_index("medical-chatbot-test", embeddings)

query = "What are allergies?"

docs = docsearch.similarity_search(query, k=3)

print("Result: ", docs)

Result:  [Document(id='2094eda9-2a29-4b57-84aa-eeb5d0bc3e6b', metadata={}, page_content='reaction. Allergic rhinitis is characterized by an itchy,\nrunny nose, often with a scratchy or irritated throat due\nto post-nasal drip. Inflammation of the thin membrane\ncovering the eye (allergic conjunctivitis) causes redness,\nirritation, and increased tearing in the eyes. Asthma caus-\nes wheezing, coughing, and shortness of breath. Symp-\ntoms of food allergies depend on the tissues most sensi-\ntive to the allergen and whether the allergen spread sys-'), Document(id='44f2eeba-e1fc-493e-b330-978174d6019b', metadata={}, page_content='reaction. Allergic rhinitis is characterized by an itchy,\nrunny nose, often with a scratchy or irritated throat due\nto post-nasal drip. Inflammation of the thin membrane\ncovering the eye (allergic conjunctivitis) causes redness,\nirritation, and increased tearing in the eyes. Asthma caus-\nes wheezing, coughing, and shortness of breath. Symp-\ntoms of food al

In [30]:
prompt_template = """
    Use the following pieces of information to answer the user's question.
    If you don't know the answer, just say that you don't know, don't try to make up an answer.
    
    Context: {context}
    Question: {question}
    
    Only return the helpful answer below and nothing else.
    Helpful answer:
"""

In [31]:
prompt = PromptTemplate(template=prompt_template, input_variables=["context", "question"])
chain_type_kwargs={"prompt": prompt}

In [None]:
llm = CTransformers(model="../model/llama-2-7b-chat.ggmlv3.q2_K.bin", model_type="llama", config={'max_new_tokens': 512, 'temperature': 0.8})

In [33]:
question_answer = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=docsearch.as_retriever(search_kwargs={'k': 2}), return_source_documents=True, chain_type_kwargs=chain_type_kwargs)

In [None]:
while True:
    user_input = input(f"Input Prompt:")
    result = question_answer({'query': user_input})
    print("Response : ", result["result"])

Response :  Abuse refers to any harmful or injurious actions taken against a person, including physical, sexual, emotional, or substance abuse. It can also involve wrongful misuse of something or someone. The different types of abuse include child abuse, adult abuse, elderly abuse, and emotional abuse. Physical abuse of a child includes inflicting injury by an other person through means such as punching, kicking, biting, burning, beating, or pulling the victim's hair.
Response :      The incubation period for AIDS is typically between 2 to 10 years after exposure, but it can range from a few weeks to several decades.
Response :      AIDS is a disease caused by the human immunodeficiency virus (HIV). HIV attacks the body's immune system, making it difficult to fight off infections and diseases. There is no cure for AIDS, but antiretroviral therapy (ART) can help manage the disease and slow its progression.


Response :  Esophageal manometry is a diagnostic test used to assess the functi