# Medical Chatbot with Latest Pinecone SDK (v7+)
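This notebook demonstrates a retrieval-augmented generation (RAG) workflow: PDF pages are loaded and chunked with LangChain, embedded with a HuggingFace sentence-transformer model, stored in and retrieved from a Pinecone index (Pinecone SDK v7+), and passed to an OpenAI LLM to generate the answer.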

In [3]:
# Set Working Directory
# Relative paths used later in the notebook (e.g. the PDF data folder) are
# resolved against this directory, so this cell has to run before the others.
# NOTE(review): this is an absolute, machine-specific Windows path; the cell
# fails on any other machine until the path is adjusted.
import os
os.chdir(r"C:\Users\t000020y\OneDrive - Trench Group\Desktop\Project\end-to-end_medical_chatbot")
# Echo the resulting working directory as the cell output.
%pwd

'C:\\Users\\t000020y\\OneDrive - Trench Group\\Desktop\\Project\\end-to-end_medical_chatbot'

In [4]:
# Load Environment Variables
from dotenv import load_dotenv

# Copy the key/value pairs from a local .env file (if present) into os.environ.
load_dotenv()

PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

# Fail fast: os.getenv returns None for an unset variable, and without this
# check the problem only shows up later as an authentication error raised from
# inside the Pinecone or OpenAI client.
_missing_keys = [
    name
    for name, value in (
        ("PINECONE_API_KEY", PINECONE_API_KEY),
        ("OPENAI_API_KEY", OPENAI_API_KEY),
    )
    if not value
]
if _missing_keys:
    raise RuntimeError(
        "Missing required environment variable(s): "
        + ", ".join(_missing_keys)
        + ". Define them in the environment or in a .env file in the working directory."
    )

In [5]:
# Import PDF Loaders
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [6]:
# Define PDF Loader Function
def load_pdf_file(data):
    """
    Load the PDF files found in a directory with LangChain's DirectoryLoader
    and PyPDFLoader.

    The glob pattern is "*.pdf" and DirectoryLoader is left at its default
    (non-recursive) setting, so only PDFs located directly inside ``data`` are
    picked up, not those in sub-folders.

    Args:
        data (str): Path to the directory containing PDF files.

    Returns:
        list: LangChain Document objects. PyPDFLoader emits one Document per
        PDF page, so the length of the list is a page count, not a file count.
    """
    loader = DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader)
    documents = loader.load()
    if not documents:
        # An empty result would otherwise flow silently through chunking and
        # upserting and leave the index empty.
        print(f"Warning: no PDF pages were found directly inside {data}")
    # Report pages rather than "PDF documents": each list entry is one page.
    print(f"Loaded {len(documents)} pages from the PDF files in {data}")
    return documents

In [None]:
# Extract PDF Documents
# The source PDF lives in data/external/. load_pdf_file only matches "*.pdf"
# files directly inside the folder it is given, so pointing it at the parent
# 'data' folder would load nothing.
extracted_data = load_pdf_file(data='data/external/')

Loaded 4505 PDF documents from data/external/


In [8]:
# Define Chunking Function
def text_split(extracted_data, chunk_size=500, chunk_overlap=20):
    """
    Split loaded documents into smaller, slightly overlapping chunks.

    Args:
        extracted_data (list): LangChain Document objects to split.
        chunk_size (int): Maximum chunk length handed to
            RecursiveCharacterTextSplitter. Defaults to 500.
        chunk_overlap (int): Overlap between consecutive chunks handed to
            RecursiveCharacterTextSplitter. Defaults to 20.

    Returns:
        list: The Document chunks produced by the splitter.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    text_chunks = text_splitter.split_documents(extracted_data)
    print(f"Split into {len(text_chunks)} chunks")
    return text_chunks

In [9]:
# Split text into chunks
# The resulting list is what gets embedded and upserted into Pinecone later on.
text_chunks = text_split(extracted_data)

Split into 40000 chunks


In [10]:
# Set Up Embeddings
from langchain.embeddings import HuggingFaceEmbeddings
from sentence_transformers import SentenceTransformer

# Build the Hugging Face embedding model
def download_hugging_face_embeddings():
    """
    Create the sentence-embedding model used by the notebook.

    Returns:
        HuggingFaceEmbeddings: wrapper around
        "sentence-transformers/all-MiniLM-L6-v2", configured to run on the CPU
        and to normalize the vectors it produces.
    """
    model_options = {'device': 'cpu'}
    encode_options = {'normalize_embeddings': True}
    return HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2",
        model_kwargs=model_options,
        encode_kwargs=encode_options,
    )

# Instantiate the model once; the same object is reused for upserting the
# chunks and for embedding queries in the retriever.
embeddings = download_hugging_face_embeddings()

  from .autonotebook import tqdm as notebook_tqdm
  embeddings = HuggingFaceEmbeddings(


In [11]:
# Initialize Pinecone with the latest SDK (v7+)
from pinecone import Pinecone, ServerlessSpec

# Connect to Pinecone with the API key loaded from the environment
pc = Pinecone(api_key=PINECONE_API_KEY)

# Name of the index that stores the chunk embeddings
index_name = 'medicalbot'

# Reuse the index when it already exists; otherwise create it first
if any(existing.name == index_name for existing in pc.list_indexes()):
    print(f"Using existing index: {index_name}")
else:
    pc.create_index(
        name=index_name,
        dimension=384,  # dimension of all-MiniLM-L6-v2 embeddings
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )
    print(f"Created new index: {index_name}")

# Handle used by the upsert and query cells below
index = pc.Index(index_name)

Created new index: medicalbot


In [12]:
# Define a function to upsert documents to Pinecone
def upsert_to_pinecone(documents, embedding_function, batch_size=100):
    """
    Embed documents and upsert them to the Pinecone index in batches.

    Relies on the notebook-level globals ``index`` (target Pinecone index) and
    ``index_name`` (used only in the final status message).

    Each vector gets the positional ID ``doc_<n>``, where ``n`` is the
    document's position in ``documents``. Its metadata is the document's own
    metadata plus a ``'text'`` entry holding the chunk text.

    Args:
        documents: List of LangChain Document objects
        embedding_function: Object exposing ``embed_documents(texts)``, which
            must return one vector per input text
        batch_size: Number of documents embedded and upserted per request;
            must be at least 1

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    # A negative step would make range() yield nothing, so the function would
    # report success without upserting anything; reject it explicitly.
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    for start in range(0, len(documents), batch_size):
        batch = documents[start:start + batch_size]
        texts = [doc.page_content for doc in batch]
        # One embedding call per batch rather than per document
        embeddings_batch = embedding_function.embed_documents(texts)
        vectors = [
            {
                'id': f'doc_{start + offset}',
                'values': embeddings_batch[offset],
                # Spread the loader metadata first so the chunk text always
                # wins: a pre-existing 'text' key in doc.metadata must not
                # replace the content that is read back at query time.
                'metadata': {**doc.metadata, 'text': doc.page_content},
            }
            for offset, doc in enumerate(batch)
        ]
        # Upsert to Pinecone
        index.upsert(vectors=vectors)
    print(f"Upserted {len(documents)} documents to Pinecone index '{index_name}'")

In [13]:
# Upsert documents to Pinecone
# Embeds every chunk and writes it to the index in batches. Vector IDs are
# positional (doc_0, doc_1, ...), so a re-run with the same chunk list targets
# the same IDs again.
upsert_to_pinecone(text_chunks, embeddings)

Upserted 40000 documents to Pinecone index 'medicalbot'


In [17]:
from langchain_core.retrievers import BaseRetriever
from langchain_core.documents import Document
from langchain_core.callbacks import CallbackManagerForRetrieverRun
from typing import List

class PineconeRetriever(BaseRetriever):
    """
    LangChain retriever that answers queries from a Pinecone index.

    The query text is embedded with the supplied embedding object, the index is
    queried for the ``k`` nearest vectors, and each match is converted into a
    Document. The text is taken from the ``"text"`` metadata entry and the
    remaining metadata is passed through unchanged.
    """

    # Number of matches requested per query (sent to Pinecone as top_k).
    k: int = 3

    def __init__(self, index, embedding_function, k=3, filter=None, namespace=None, **kwargs):
        """
        Args:
            index: Pinecone index handle exposing ``query(...)``.
            embedding_function: Object exposing ``embed_query(text)``.
            k: Number of matches to return per query.
            filter: Optional metadata filter forwarded to every query.
            namespace: Optional namespace forwarded to every query.
            **kwargs: Extra keyword arguments forwarded to BaseRetriever.
        """
        super().__init__(k=k, **kwargs)
        # Kept as underscore-prefixed instance attributes rather than declared
        # fields like ``k``.
        self._index = index
        self._embedding_function = embedding_function
        self._filter = filter
        self._namespace = namespace

    def _get_relevant_documents(
        self, query: str, *, run_manager: CallbackManagerForRetrieverRun
    ) -> List[Document]:
        """
        Embed ``query``, fetch the top ``k`` matches and wrap them as Documents.

        Args:
            query: The user's question.
            run_manager: Callback manager supplied by LangChain; not used here.

        Returns:
            One Document per match, in the order the index returned them.
        """
        query_embedding = self._embedding_function.embed_query(query)
        results = self._index.query(
            vector=query_embedding,
            top_k=self.k,
            include_metadata=True,
            filter=self._filter,
            namespace=self._namespace
        )
        documents = []
        for match in results["matches"]:
            # A match may carry no metadata at all or an explicit None; treat
            # both as empty instead of failing on ``None.get``.
            metadata = match.get("metadata") or {}
            doc = Document(
                # Fall back to an empty string when the text entry is missing or None.
                page_content=metadata.get("text") or "",
                metadata={key: value for key, value in metadata.items() if key != "text"}
            )
            documents.append(doc)
        return documents

In [18]:
# Build a retriever that returns the three closest chunks for each query
retriever = PineconeRetriever(index=index, embedding_function=embeddings, k=3)

# Smoke-test the retriever with a sample medical question
retrieved_docs = retriever.invoke("What is Acne?")
print(retrieved_docs)

[Document(metadata={'creationdate': '2006-10-16T20:19:33+02:00', 'creator': 'Adobe Acrobat 6.0', 'moddate': '2016-02-07T11:23:03+07:00', 'page': 55.0, 'page_label': '26', 'producer': 'PDFlib+PDI 6.0.3 (SunOS)', 'source': 'data\\external\\encyclopedia-of-medicine-vol-1-5-3rd-edition.pdf.pdf', 'total_pages': 4505.0}, page_content='Researchers, Inc. Reproduced by permission.)\n26 GALE ENCYCLOPEDIA OF MEDICINE\nAcne'), Document(metadata={'creationdate': '2006-10-16T20:19:33+02:00', 'creator': 'Adobe Acrobat 6.0', 'moddate': '2016-02-07T11:23:03+07:00', 'page': 55.0, 'page_label': '26', 'producer': 'PDFlib+PDI 6.0.3 (SunOS)', 'source': 'data\\external\\encyclopedia-of-medicine-vol-1-5-3rd-edition.pdf.pdf', 'total_pages': 4505.0}, page_content='Sebaceous follicles— A structure found within the\nskin that houses the oil-producing glands and hair\nfollicles, where pimples form.\nSebum— An oily skin moisturizer produced by\nsebaceous glands.\nTretinoin— A drug that works by increasing the\nturn

In [22]:
# Set up LLM
# No API key is passed explicitly here; presumably the client picks up
# OPENAI_API_KEY from the environment populated by load_dotenv() - verify if
# authentication fails.
from langchain_openai import OpenAI
llm = OpenAI(temperature=0.4, max_tokens=500)

# Create the RAG chain
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

# System instructions for the model; the {context} placeholder at the end is
# where the retrieved chunks are expected to be inserted.
system_prompt = (
    "You are an assistant for question-answering tasks. "
    "Use ONLY the following pieces of retrieved context to answer the question. "
    "If the context does not contain information relevant to the question, respond ONLY with 'I don't know.' "
    "Use three sentences maximum and keep the answer concise."
    "\n\n"
    "{context}"
)

# Two-message template: the system instructions above plus the user's
# question, supplied through the {input} placeholder.
prompt = ChatPromptTemplate.from_messages([
    ("system", system_prompt),
    ("human", "{input}"),
])

# Create the question-answering chain
# question_answer_chain combines the prompt with the LLM; rag_chain puts the
# retriever in front of it, so invoking rag_chain with {"input": ...} retrieves
# first and then generates the answer.
question_answer_chain = create_stuff_documents_chain(llm, prompt)
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

In [23]:
# Test the RAG chain
# The chain result is indexed like a dict; the generated text is read from its
# "answer" entry.
response = rag_chain.invoke({"input": "what is Acromegaly and gigantism?"})
print(response["answer"])


Acromegaly and gigantism are both disorders caused by abnormal release of a chemical from the pituitary gland in the brain. This chemical is responsible for increased growth in bone and soft tissue, as well as other disturbances in the body. These disorders can be confirmed through tests for underactivity or overproduction of the pituitary gland. Left untreated, the disease does not worsen.


In [24]:
# Ask a question the indexed medical material is not expected to cover; the
# system prompt instructs the model to answer "I don't know." in that case.
response = rag_chain.invoke({"input": "What is stats?"})
print(response["answer"])


I don't know.
