In [None]:
print("Ok")

In [None]:
%pwd

In [None]:
import os
# Switch the working directory to the parent folder; later cells use relative paths
# such as 'Data/' (presumably located in that parent folder — confirm).
# NOTE(review): this is a relative chdir, so it is not idempotent — every re-run of this
# cell climbs one more directory. Run it exactly once per kernel session.
os.chdir("..")

In [None]:
%pwd

In [None]:
#!pip install pypdf langchain-community pinecone langchain_pinecone langchain_google_genai

In [None]:
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter # used to split larger text documents into smaller, more manageable chunks

In [None]:
#Extract Data From the PDF File
def load_pdf_file(data):
    """Load the PDF files matched by ``*.pdf`` inside a directory.

    Args:
        data: Path of the directory to scan; it is handed to ``DirectoryLoader``.

    Returns:
        The result of ``DirectoryLoader.load()`` with ``PyPDFLoader`` used as the
        per-file loader class.
    """
    pdf_directory_loader = DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader)
    return pdf_directory_loader.load()


In [None]:
extracted_data=load_pdf_file(data='Data/')

In [None]:
extracted_data[0:3]

In [None]:
#Split the Data into Text Chunks
def text_split(extracted_data, chunk_size=500, chunk_overlap=20):
    """Split loaded documents into smaller, overlapping text chunks.

    Args:
        extracted_data: Documents to split; passed unchanged to
            ``RecursiveCharacterTextSplitter.split_documents``.
        chunk_size: Target chunk length in characters. Defaults to 500, the value
            this notebook has always used.
        chunk_overlap: Number of characters consecutive chunks share, which helps
            preserve context across chunk boundaries. Defaults to 20.

    Returns:
        The chunks produced by ``split_documents``.
    """
    # The sizes are parameters so other settings can be tried without editing the function.
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(extracted_data)

In [None]:
text_chunks=text_split(extracted_data)
print("Length of Text Chunks", len(text_chunks))

In [None]:
text_chunks[1:3]

In [None]:
from langchain.embeddings import HuggingFaceEmbeddings
# from langchain_google_genai import GoogleGenerativeAIEmbeddings

In [None]:
#For gemini
import os

# Read the Gemini API key from Colab's secret store when running on Colab.
# Outside Colab the `google.colab` module does not exist, so fall back to the
# GOOGLE_API_KEY environment variable instead of crashing on the import.
# NOTE: the `load_dotenv()` cell runs later in this notebook, so at this point only
# variables already present in the process environment are visible.
try:
    from google.colab import userdata
except ImportError:
    GOOGLE_API_KEY = os.environ.get('GOOGLE_API_KEY')
else:
    GOOGLE_API_KEY = userdata.get('GOOGLE_API_KEY')

In [None]:
#Download the Embeddings from Hugging Face
def download_hugging_face_embeddings():
    """Build the embedding model used throughout this notebook.

    Returns:
        A ``HuggingFaceEmbeddings`` instance configured with the
        ``sentence-transformers/all-MiniLM-L6-v2`` model.
    """
    # Alternative kept for reference (Gemini embeddings):
    # GoogleGenerativeAIEmbeddings(model="models/embedding-001", google_api_key=GOOGLE_API_KEY)
    hf_model_name = 'sentence-transformers/all-MiniLM-L6-v2'
    return HuggingFaceEmbeddings(model_name=hf_model_name)


In [None]:
embeddings = download_hugging_face_embeddings()

In [None]:
query_result = embeddings.embed_query("Hello world")
print("Length", len(query_result))

In [None]:
query_result

In [None]:
from dotenv import load_dotenv
load_dotenv()

In [None]:
# PINECONE_API_KEY=os.environ.get('PINECONE_API_KEY')
# OPENAI_API_KEY=os.environ.get('OPENAI_API_KEY')

In [None]:
import os

# Read the Pinecone API key from Colab's secret store when running on Colab.
# Outside Colab the `google.colab` module does not exist, so fall back to the
# PINECONE_API_KEY environment variable (populated by the earlier `load_dotenv()`
# call when a .env file is present) instead of crashing on the import.
try:
    from google.colab import userdata
except ImportError:
    PINECONE_API_KEY = os.environ.get('PINECONE_API_KEY')
else:
    PINECONE_API_KEY = userdata.get('PINECONE_API_KEY')

In [None]:
from pinecone.grpc import PineconeGRPC as Pinecone
from pinecone import ServerlessSpec
import os

pc = Pinecone(api_key=PINECONE_API_KEY)

index_name = "medicalbot"

# Create the Pinecone index only when it does not exist yet. Re-running this cell is
# then a no-op, while genuine failures (bad API key, quota, network) are raised
# instead of being printed and silently ignored.
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=384,  # must equal the embedding length; the embedding model used here produces 384-dimensional vectors
        # dimension=768,  # alternative value kept from the original cell
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )


In [None]:
import os
os.environ["PINECONE_API_KEY"] = PINECONE_API_KEY
# os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

In [None]:
# Embed each chunk and upsert the embeddings into your Pinecone index.
from langchain_pinecone import PineconeVectorStore

# NOTE(review): this cell writes to the remote index every time it runs; re-running it
# presumably upserts the same chunks again (verify how ids are assigned). Once the index
# is populated, use the "Load Existing index" cell below instead.
docsearch = PineconeVectorStore.from_documents(
    documents=text_chunks,
    index_name=index_name,
    embedding=embeddings,
)



In [None]:
# While using Google's embeddings (length 768), the upsert fails with request-size errors
# such as the following (the limit of 4194304 bytes is 4 MiB):
# found 4202300 bytes, the limit is: 4194304 bytes
#  found 4198828 bytes, the limit is: 4194304 bytes
# large: found 4194631 bytes, the limit is: 4194304 bytes
#
# You can try dimensionality reduction as follows.
'''
import pinecone
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from sklearn.decomposition import PCA

# ... (your existing code) ...

embeddings = GoogleGenerativeAIEmbeddings(
    model="models/embedding-001", google_api_key=GOOGLE_API_KEY
)

# Apply PCA to reduce dimensionality (e.g., to 384 dimensions)
pca = PCA(n_components=384)
embeddings_reduced = pca.fit_transform(embeddings) # Assuming 'embeddings' is a NumPy array

# Now use 'embeddings_reduced' when upserting into Pinecone
index = pinecone.Index(index_name)  # Assuming you have already initialized Pinecone
index.upsert([(id, embeddings_reduced.tolist()) for id, embeddings_reduced in zip(ids, embeddings_reduced)])
'''

In [None]:
# Load Existing index

from langchain_pinecone import PineconeVectorStore
# Connect to the index named `index_name` as it already exists, using the same embedding
# object for queries. This rebinds `docsearch`, replacing the value assigned by the
# `from_documents` cell above.
docsearch = PineconeVectorStore.from_existing_index(
    index_name=index_name,
    embedding=embeddings
)

In [None]:
docsearch

In [None]:
# Wrap the vector store in a retriever object that can be queried with `.invoke(...)`.
#   search_type="similarity"  -> return the stored chunks most similar to the query.
#   search_kwargs={"k": 3}    -> return the top 3 most similar chunks for each query.
retriever = docsearch.as_retriever(search_type="similarity", search_kwargs={"k":3})

In [None]:
retrieved_docs = retriever.invoke("What is Acne?")

In [None]:
retrieved_docs[0]

In [None]:
retrieved_docs[1]

In [None]:
retrieved_docs[2]

In [None]:
# from langchain_openai import OpenAI
# llm = OpenAI(temperature=0.4, max_tokens=500)

In [None]:
#For gemini
from google import genai
from google.genai import types

In [None]:
#For gemini
from langchain_google_genai import ChatGoogleGenerativeAI
# config = types.GenerateContentConfig(temperature=0.4, max_output_tokens=500)

# Initialize the Gemini model
llm = ChatGoogleGenerativeAI(
    model="gemini-2.0-flash",
    google_api_key=GOOGLE_API_KEY,
    temperature=0.4,
)

In [None]:
# create_retrieval_chain: This function is used for creating a chain that retrieves relevant documents and then uses them to answer questions.
# create_stuff_documents_chain: This function combines multiple documents into a single context to be passed to the language model.
# ChatPromptTemplate: This class is used to create prompt templates that can be formatted with user input and context.
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

system_prompt = (
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer "
    "the question. If you don't know the answer, say that you "
    "don't know. Use three sentences maximum and keep the "
    "answer concise."
    "\n\n"
    "{context}"
)


prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}"),
    ]
)

In [None]:
# Combine the retrieved documents into the prompt's {context} slot and pass the result to the LLM.
question_answer_chain = create_stuff_documents_chain(llm, prompt)
# Full RAG pipeline: fetch documents with `retriever`, then answer with the chain above.
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

In [None]:
response = rag_chain.invoke({"input": "what is Acromegaly and gigantism?"})
print(response["answer"])

In [None]:
response = rag_chain.invoke({"input": "What is stats?"})
response["answer"]

In [None]:
response["answer"]