In [1]:
%pwd

'c:\\Users\\asif8\\OneDrive\\Documents\\practice-work\\genai\\medical-chatbot-genai\\research'

In [2]:
import os

# The notebook starts inside research/, but later cells use paths relative to
# the project root (e.g. "Data/"). Only step up while we are still inside
# research/, so re-running this cell does not climb one directory further
# each time it is executed.
if os.path.basename(os.getcwd()) == "research":
    os.chdir('../')

In [3]:
%pwd

'c:\\Users\\asif8\\OneDrive\\Documents\\practice-work\\genai\\medical-chatbot-genai'

In [4]:
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [5]:
# Extract Data from the PDF file
def load_pdf_file(data):
    """Load the PDF files found in a directory.

    Args:
        data: Directory path handed to ``DirectoryLoader``.

    Returns:
        The result of ``DirectoryLoader.load()`` for the files matching the
        ``*.pdf`` glob, each read with ``PyPDFLoader``.
    """
    pdf_loader = DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader)
    return pdf_loader.load()

In [6]:
# Read the PDFs stored in the project's Data/ folder
extracted_data = load_pdf_file("Data/")

In [9]:
#extracted_data

In [10]:
# Split the data into chunks
def split_data(documents, chunk_size=500, chunk_overlap=20):
    """Split loaded documents into smaller chunks.

    Args:
        documents: Documents to split (e.g. the output of ``load_pdf_file``).
        chunk_size: Forwarded to ``RecursiveCharacterTextSplitter`` as
            ``chunk_size``. Defaults to 500, the value that used to be
            hard-coded here.
        chunk_overlap: Forwarded to ``RecursiveCharacterTextSplitter`` as
            ``chunk_overlap``. Defaults to 20, the value that used to be
            hard-coded here.

    Returns:
        The chunks returned by ``split_documents`` for ``documents``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(documents)


In [11]:
# Chunk the loaded documents and report how many chunks were produced
text_chunks = split_data(extracted_data)
chunk_count = len(text_chunks)
print("Length of text chunks: ", chunk_count)

Length of text chunks:  39994


## Embedding the data into the vector database

In [12]:
from langchain.embeddings import HuggingFaceEmbeddings

In [21]:
# Download the Embeddings from HuggingFace
def download_hugging_face_embeddings():
    """Build the embedding model used by this notebook.

    Returns:
        A ``HuggingFaceEmbeddings`` instance configured with the
        ``sentence-transformers/all-MiniLM-L6-v2`` model.
    """
    return HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2"
    )

In [23]:
# Instantiate the embedding model once; later cells reuse this object for the
# sample query, for writing to the vector store, and for retrieval.
embeddings = download_hugging_face_embeddings()

In [26]:
# Sanity check: embed one sentence and show the length of the resulting vector
sample_query = "What is the name of the patient?"
query_result = embeddings.embed_query(sample_query)
print("Length: ", len(query_result))

Length:  384


## Create Pinecone index

In [48]:
from dotenv import load_dotenv
# Populate the process environment via python-dotenv before reading the keys
load_dotenv()

# Each value is None when the corresponding variable is not set
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")


In [35]:
from pinecone.grpc import PineconeGRPC as Pinecone
from pinecone import ServerlessSpec
import os

pc = Pinecone(api_key=PINECONE_API_KEY)

index_name = "medibot"

# Creating an index that already exists fails, which would break a
# "Restart & Run All" after the first successful run. Create it only when it
# is missing so this cell can be executed repeatedly.
if not pc.has_index(index_name):
    pc.create_index(
        index_name,
        dimension=384,  # must match the embedding length printed earlier (384)
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1"
        )
    )

# Show the index description as the cell output whether or not it was just created
pc.describe_index(index_name)

{
    "name": "medibot",
    "metric": "cosine",
    "host": "medibot-mtvkost.svc.aped-4627-b74a.pinecone.io",
    "spec": {
        "serverless": {
            "cloud": "aws",
            "region": "us-east-1"
        }
    },
    "status": {
        "ready": true,
        "state": "Ready"
    },
    "vector_type": "dense",
    "dimension": 384,
    "deletion_protection": "disabled",
    "tags": null
}

## Save the embeddings in the Pinecone Vector Store

In [39]:
# Embed each chunk and upsert the embeddings into your Pinecone index
from langchain_pinecone import PineconeVectorStore

# Embed every chunk with `embeddings` and write the vectors to the index.
# NOTE(review): this runs on every execution of the cell; re-running it
# presumably uploads the same chunks again - confirm before a full re-run.
docsearch = PineconeVectorStore.from_documents(
    text_chunks,
    embeddings,
    index_name=index_name,
)

## Load the Existing Index


In [41]:
from langchain_pinecone import PineconeVectorStore

# Attach to the index that is already populated instead of uploading again;
# the same embedding object is passed for use by the store.
docsearch = PineconeVectorStore.from_existing_index(
    embedding=embeddings,
    index_name=index_name,
)

In [42]:
docsearch

<langchain_pinecone.vectorstores.PineconeVectorStore at 0x21b8153a1d0>

In [43]:
# Similarity-search retriever configured with k=3
retriever = docsearch.as_retriever(
    search_kwargs={"k": 3},
    search_type="similarity",
)

In [46]:
# Try the retriever on a sample question before wiring it into a chain
sample_question = "What is the vaginal acne?"
retrieved_docs = retriever.invoke(sample_question)

In [47]:
retrieved_docs

[Document(id='8d0d101e-02ad-4957-9e16-e4c0d172a490', metadata={'creationdate': '2006-10-16T20:19:33+02:00', 'creator': 'Adobe Acrobat 6.0', 'moddate': '2006-10-16T22:03:45+02:00', 'page': 55.0, 'page_label': '26', 'producer': 'PDFlib+PDI 6.0.3 (SunOS)', 'source': 'Data\\Medical_book.pdf', 'total_pages': 4505.0}, page_content='Researchers, Inc. Reproduced by permission.)\n26 GALE ENCYCLOPEDIA OF MEDICINE\nAcne'),
 Document(id='f7b1c7fd-561e-46be-91c3-a95171c5d921', metadata={'creationdate': '2006-10-16T20:19:33+02:00', 'creator': 'Adobe Acrobat 6.0', 'moddate': '2006-10-16T22:03:45+02:00', 'page': 997.0, 'page_label': '968', 'producer': 'PDFlib+PDI 6.0.3 (SunOS)', 'source': 'Data\\Medical_book.pdf', 'total_pages': 4505.0}, page_content='Vagina— The genital canal in the female, leading\nfrom the vulva to the uterus.\n968 GALE ENCYCLOPEDIA OF MEDICINE\nCondom'),
 Document(id='e89d97df-58f1-4394-bc05-77601ede353f', metadata={'creationdate': '2006-10-16T20:19:33+02:00', 'creator': 'Adobe Ac

## Initialize and use the LLM (Gemini) through LangChain

In [None]:
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate
from dotenv import load_dotenv

import getpass

# Load environment variables
load_dotenv()

# Get the Gemini API key from environment variables
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY")

# If the API key is not set, ask for it interactively. getpass is used instead
# of input() so the typed key is not echoed into the cell output, which would
# be saved with the notebook.
if not GEMINI_API_KEY:
    print("Please enter your Gemini API key (you can get one from https://makersuite.google.com/app/apikey):")
    GEMINI_API_KEY = getpass.getpass("Gemini API key: ").strip()
    if not GEMINI_API_KEY:
        # Fail here with a clear message rather than later inside the client
        raise ValueError("A Gemini API key is required to initialize the model.")
    # Save it to environment variable for this session
    os.environ["GEMINI_API_KEY"] = GEMINI_API_KEY

# Initialize the Gemini model through LangChain
# NOTE(review): confirm that "gemini-pro" is still an available model id.
llm = ChatGoogleGenerativeAI(model="gemini-pro", temperature=0.4, google_api_key=GEMINI_API_KEY)

# Create the prompt template. create_retrieval_chain reads the user's text
# from the "input" key and forwards the input dict to the document chain, so
# the prompt must use {input}; {context} receives the retrieved documents.
prompt = ChatPromptTemplate.from_template("""
You are a medical assistant. Use the following context to answer the question.
If you don't know the answer, just say that you don't know.
Keep the answer concise and professional.

Context: {context}
Question: {input}

Answer: """)

# Create the document chain (fills {context} with the retrieved documents)
document_chain = create_stuff_documents_chain(llm=llm, prompt=prompt)

# Create the retrieval chain (retrieve with the "input" text, then answer)
retrieval_chain = create_retrieval_chain(retriever=retriever, combine_docs_chain=document_chain)


def get_medical_answer(question: str) -> str:
    """Answer a question using the retrieval chain.

    Args:
        question: The user's question. It is sent to the chain under the
            "input" key, which is the key the retrieval chain and the prompt
            template both read.

    Returns:
        The value stored under "answer" in the chain's response.
    """
    response = retrieval_chain.invoke({"input": question})
    return response["answer"]


In [None]:
# Example usage: read one question interactively and print the model's reply
user_question = input("Enter your medical question: ")
answer = get_medical_answer(user_question)
print("\nResponse from Gemini via LangChain:\n", answer, sep="\n")