In [33]:
# NOTE(review): leftover debug cell — it was executed as In [33] although it
# sits first in the notebook; consider removing it before sharing.
print("WASUP")

WASUP


In [1]:
%pwd

'c:\\Projects\\Medical-Chatbot-Gen-AI\\research'

In [2]:
import os
# Move up to the project root, but only while the kernel is still inside the
# `research` folder. An unconditional chdir("../") climbs one more level each
# time this cell is re-run, which silently breaks every relative path below.
if os.path.basename(os.getcwd()).lower() == "research":
    os.chdir("../")

In [3]:
%pwd

'c:\\Projects\\Medical-Chatbot-Gen-AI'

In [4]:
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [5]:
#Extract Data From the PDF File
def load_pdf_file(data):
    """Load the PDF files found in a directory.

    NOTE: a later cell redefines ``load_pdf_file`` with path validation, so
    this definition is shadowed once that cell has run.

    Args:
        data: Path of the directory handed to ``DirectoryLoader``.

    Returns:
        Whatever ``DirectoryLoader.load()`` returns for files matching the
        ``*.pdf`` glob, each read with ``PyPDFLoader``.
    """
    pdf_loader = DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader)
    return pdf_loader.load()


In [6]:
import os
from langchain.document_loaders import PyPDFLoader, DirectoryLoader

def load_pdf_file(data):
    """Load the PDF files found in a directory.

    Args:
        data: Path (relative or absolute) of the directory holding the PDFs.
            It is converted to an absolute path before use.

    Returns:
        Whatever ``DirectoryLoader.load()`` returns for files matching the
        ``*.pdf`` glob, each read with ``PyPDFLoader``.

    Raises:
        FileNotFoundError: If the resolved path does not exist.
        NotADirectoryError: If the resolved path exists but is not a directory.
    """
    data_path = os.path.abspath(data)  # Convert to absolute path

    if not os.path.exists(data_path):
        raise FileNotFoundError(f"Directory not found: '{data_path}'")

    # `exists` alone also accepts a plain file; reject it here with a message
    # that names the real problem instead of failing later inside the loader.
    if not os.path.isdir(data_path):
        raise NotADirectoryError(f"Expected a directory of PDF files, got a file: '{data_path}'")

    loader = DirectoryLoader(data_path, glob="*.pdf", loader_cls=PyPDFLoader)
    return loader.load()

# Load the PDFs from the project's `Data` folder. The working directory was
# switched to the project root earlier in the notebook, so a relative path is
# enough and avoids hardcoding a machine-specific absolute path
# (load_pdf_file resolves it to an absolute path itself).
extracted_data = load_pdf_file("Data")


In [7]:
#extracted_data

In [8]:
#Split the Data into Text Chunks
def text_split(extracted_data, chunk_size=500, chunk_overlap=50):
    """Split loaded documents into smaller text chunks.

    Args:
        extracted_data: Documents to split (passed to ``split_documents``).
        chunk_size: ``chunk_size`` given to ``RecursiveCharacterTextSplitter``.
            Defaults to 500, the value previously hard-coded here.
        chunk_overlap: ``chunk_overlap`` given to the splitter. Defaults to
            50, the value previously hard-coded here.

    Returns:
        The chunks returned by ``RecursiveCharacterTextSplitter.split_documents``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(extracted_data)

In [9]:
# Chunk the loaded documents and report how many pieces were produced.
text_chunks = text_split(extracted_data)
print(f"Length of Text Chunks {len(text_chunks)}")

Length of Text Chunks 5961


In [10]:
#text_chunks

In [11]:
from langchain.embeddings import HuggingFaceEmbeddings

In [12]:
#Download the Embeddings from Hugging Face
def download_hugging_face_embeddings(model_name='sentence-transformers/all-MiniLM-L6-v2'):
    """Create the Hugging Face embedding model used by this notebook.

    Args:
        model_name: Model identifier passed to ``HuggingFaceEmbeddings``.
            Defaults to the model previously hard-coded here; the notebook's
            smoke test reports 384-dimensional vectors for it, which is the
            dimension the Pinecone index is created with. If you pick another
            model, the index dimension must match it.

    Returns:
        The ``HuggingFaceEmbeddings`` instance.
    """
    return HuggingFaceEmbeddings(model_name=model_name)


In [13]:
# Instantiate the embedding model once; the same object is reused below for
# the smoke test, for indexing the chunks and for retrieval.
embeddings = download_hugging_face_embeddings()

  embeddings=HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')
  from .autonotebook import tqdm as notebook_tqdm


In [14]:
# Smoke-test the embedding model and show the size of the vector it returns.
query_result = embeddings.embed_query("Hello world")
print(f"Length {len(query_result)}")

Length 384


In [15]:
#query_result

In [16]:
from dotenv import load_dotenv
# Load variables from the local .env file; the API keys are read from the
# environment in the next cell.
load_dotenv()

True

In [17]:
# Read both API keys from the environment; each is None when the variable is unset.
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")

In [18]:
import pinecone
import os
from dotenv import load_dotenv  # Load API key securely

import time  # local import: only needed to poll for index readiness below

# Load environment variables from .env file
load_dotenv()

# Securely get Pinecone API key from environment
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
PINECONE_ENV = "us-east-1"  # Check your Pinecone console for correct environment

# ✅ Correct initialization for Pinecone v3+
pc = pinecone.Pinecone(api_key=PINECONE_API_KEY)

# Define index name
index_name = "medicalbot"

# Check if index exists, then create it if missing
if index_name not in [index.name for index in pc.list_indexes()]:
    pc.create_index(
        name=index_name,
        dimension=384,  # Must match embedding model dimensions
        metric="cosine",
        spec=pinecone.ServerlessSpec(
            cloud="aws",  # Change based on your Pinecone setup
            region=PINECONE_ENV,
        ),
    )
    # A freshly created index is not usable immediately; block until Pinecone
    # reports it ready so the upsert in a later cell does not hit an index
    # that is still initializing.
    while not pc.describe_index(index_name).status["ready"]:
        time.sleep(1)

print("✅ Pinecone index setup completed successfully!")


✅ Pinecone index setup completed successfully!


In [19]:
import os
# Write the keys back into the process environment (presumably so downstream
# libraries can pick them up from there). Fail with a clear message when a key
# is missing: assigning None to os.environ would otherwise raise an opaque
# "TypeError: str expected, not NoneType".
for _key_name, _key_value in (
    ("PINECONE_API_KEY", PINECONE_API_KEY),
    ("GEMINI_API_KEY", GEMINI_API_KEY),
):
    if _key_value is None:
        raise EnvironmentError(f"{_key_name} is not set; add it to your .env file.")
    os.environ[_key_name] = _key_value

In [20]:
# Embed each chunk and upsert the embeddings into your Pinecone index.
from langchain_pinecone import PineconeVectorStore

# Upload the chunks only when the index is still empty. Re-running an
# unconditional from_documents() adds the chunks again: the saved outputs of
# this notebook show 6569 records in the index for only 5961 chunks, which
# suggests a re-run already inserted duplicates.
# NOTE: index stats may lag briefly behind a recent upsert, and a partially
# filled index is treated as populated — clear the index to force a re-upload.
_existing_vector_count = pc.Index(index_name).describe_index_stats()["total_vector_count"]

if _existing_vector_count == 0:
    docsearch = PineconeVectorStore.from_documents(
        documents=text_chunks,
        index_name=index_name,
        embedding=embeddings,
    )
else:
    docsearch = PineconeVectorStore.from_existing_index(
        index_name=index_name,
        embedding=embeddings,
    )

In [21]:
# Load Existing index 

from langchain_pinecone import PineconeVectorStore
# Reuse the existing Pinecone index instead of uploading the chunks again.
docsearch = PineconeVectorStore.from_existing_index(index_name=index_name, embedding=embeddings)

In [22]:
docsearch

<langchain_pinecone.vectorstores.PineconeVectorStore at 0x200b6acccb0>

In [23]:
# Similarity-search retriever configured with k=3.
retriever = docsearch.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 3},
)

In [24]:
# Sanity-check retrieval with a sample medical question.
retrieved_docs = retriever.invoke("What is Acne?")

In [25]:
retrieved_docs

[Document(id='fb146ddc-c0af-4d29-9615-eb01c411ba95', metadata={'creationdate': '2004-12-18T17:00:02-05:00', 'creator': 'PyPDF', 'moddate': '2004-12-18T16:15:31-06:00', 'page': 37.0, 'page_label': '38', 'producer': 'PDFlib+PDI 5.0.0 (SunOS)', 'source': 'C:\\Projects\\Medical-Chatbot-Gen-AI\\Data\\Medical_book.pdf', 'total_pages': 637.0}, page_content='Nancy J. Nordenson\nAcid reflux see Heartburn\nAcidosis see Respiratory acidosis; Renal\ntubular acidosis; Metabolic acidosis\nAcne\nDefinition\nAcne is a common skin disease characterized by\npimples on the face, chest, and back. It occurs when the\npores of the skin become clogged with oil, dead skin\ncells, and bacteria.\nDescription\nAcne vulgaris, the medical term for common acne, is\nthe most common skin disease. It affects nearly 17 million\npeople in the United States. While acne can arise at any'),
 Document(id='cb7de8b7-b145-4313-ac13-00af1af59d70', metadata={'creationdate': '2004-12-18T17:00:02-05:00', 'creator': 'PyPDF', 'modda

In [26]:
# Block 1: Initialize the language model (llm)
from langchain_google_genai import ChatGoogleGenerativeAI
import os

# Gemini chat model that generates the final answers.
llm = ChatGoogleGenerativeAI(
    google_api_key=os.environ.get("GEMINI_API_KEY"),  # Ensure this key is set
    model="gemini-1.5-flash",
    max_tokens=500,
    temperature=0.4,
)


In [27]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

# Define system prompt for Gemini.
# It instructs the model to answer from the retrieved context, to admit when
# it does not know, and to keep answers to at most three sentences.
# `{context}` is a template placeholder — presumably filled with the retrieved
# documents by the chain built from this prompt (TODO confirm).
system_prompt = """You are an assistant for question-answering tasks. 
Use the following pieces of retrieved context to answer the question. 
If you don't know the answer, say that you don't know. 
Use three sentences maximum and keep the answer concise.

{context}"""

# Chat prompt: the system instructions first, then the user's question.
prompt = ChatPromptTemplate.from_messages([
    ("system", system_prompt),
    ("human", "{input}"),
])


In [28]:
# Answering step: the Gemini model combined with the chat prompt.
question_answer_chain = create_stuff_documents_chain(llm=llm, prompt=prompt)

# Full RAG (Retrieval-Augmented Generation) pipeline: retriever + answering step.
rag_chain = create_retrieval_chain(
    retriever=retriever,
    combine_docs_chain=question_answer_chain,
)


In [29]:
# Ask a question the indexed book should be able to answer.
response = rag_chain.invoke(dict(input="what is Acromegaly and gigantism?"))
print(response["answer"])

Acromegaly is a disorder where the pituitary gland releases an abnormal amount of a chemical, causing increased bone and soft tissue growth and other bodily disturbances.  Gigantism is related, but the timing of the hormone release differs, resulting in overall body size increase.  More information can be found in the cited resources.


In [30]:
# Ask a second sample question and print the model's answer.
response = rag_chain.invoke(dict(input="What is stats?"))
print(response["answer"])

I am sorry, but this document does not contain the answer to this question.  I do not have access to external websites or specific files online.


In [31]:
# Show the raw chunks the retriever returns for the same question.
retrieved_docs = retriever.invoke("What is stats?")
for position, document in enumerate(retrieved_docs, start=1):
    print(f"Document {position}: {document.page_content}\n")


Document 1: Blood count
Definition
One of the most commonly ordered clinical labora-
tory tests, a blood count, also called a complete blood
count (CBC), is a basic evaluation of the cells (red blood
cells, white blood cells, and platelets) suspended in the
liquid part of the blood (plasma). It involves determining
the numbers, concentrations, and conditions of the differ-
ent types of blood cells.
Purpose
The CBC is a useful screening and diagnostic test

Document 2: The blood count is performed relatively inexpen-
sively and quickly. Most laboratories routinely use some
type of automated equipment to dilute the blood, sample
a measured volume of the diluted suspension, and count
the cells in that volume. In addition to counting actual
numbers of red cells, white cells, and platelets, the auto-
mated cell counters also measure the hemoglobin and
calculate the hematocrit and the red blood cell indices
(measures of the size and hemoglobin content of the red

Document 3: cal conduction s

In [32]:
from pinecone import Pinecone
import os
from dotenv import load_dotenv

# Load environment variables
load_dotenv()
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")

# Initialize Pinecone client
pc = Pinecone(api_key=PINECONE_API_KEY)

# Define index name
index_name = "medicalbot"

# Report how many vectors the index holds, or explain how to create it.
# NOTE(review): the saved output shows 6569 records although only 5961 chunks
# were produced earlier in this notebook, which suggests duplicate uploads.
if index_name in [index.name for index in pc.list_indexes()]:
    index = pc.Index(index_name)
    stats = index.describe_index_stats()
    print(f"✅ Total records in Pinecone: {stats['total_vector_count']}")
else:
    # Use index_name so the message stays correct if the name above changes.
    print(f"❌ Index '{index_name}' does not exist. You need to run `store_index.py`.")


✅ Total records in Pinecone: 6569
