In [1]:
%pwd

'/home/pal194/RAG_guided_chat_bot/notebook'

In [3]:
import os
os.chdir("../")

In [4]:
%pwd

'/home/pal194/RAG_guided_chat_bot'

In [5]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.document_loaders import DirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
# Extract text from PDF files
def load_pdf_files(data):
    loader = DirectoryLoader(
        data,
        glob="*.pdf",
        loader_cls=PyPDFLoader
    )

    documents = loader.load()
    return documents

In [7]:
extracted_data = load_pdf_files("data")

In [8]:
extracted_data

[Document(metadata={'producer': 'iText 4.2.0 by 1T3XT', 'creator': 'John Wiley & Sons', 'creationdate': '2025-10-13T16:00:40+05:30', 'keywords': 'cancer; microbiome; precision oncology; treatment; tumor microenvironment', 'moddate': '2025-12-26T02:59:48-08:00', 'subject': 'iMeta 2025.4:e70070', 'wps-proclevel': '3', 'wps-journaldoi': '10.1002/(ISSN)2770-596X', 'author': 'Anqi Lin', 'title': 'The microbiome in cancer', 'wps-articledoi': '10.1002/imt2.70070', 'source': 'data/microbiome_in_cancer.pdf', 'total_pages': 104, 'page': 0, 'page_label': '1'}, page_content="Received: 14 April 2025 | Revised: 15 July 2025 | Accepted: 16 July 2025\nDOI: 10.1002/imt2.70070\nREVIEW ARTICLE\nThe microbiome in cancer\nAnqi Lin1,2 | Minying Xiong2 | Aimin Jiang3 | Lihaoyun Huang2 |\nHank Z. H. Wong4 | Suyin Feng1 | Chunyan Zhang2 | Yu Li2 | Li Chen5 |\nHao Chi6,7 | Pengpeng Zhang8 | Bicheng Ye9 | Hengguo Zhang10 |\nNan Zhang11 | Lingxuan Zhu2 | Weiming Mou2 | Junyi Shen2 | Kailai Li2 |\nWentao Xu2 | Hao

In [9]:
len(extracted_data)

118

In [10]:
from typing import List
# from langchain.schema import Document
from langchain_core.documents import Document

def filter_to_minimal_docs(docs: List[Document]) -> List[Document]:
    """
    Given a list of Document objects, return a new list of Document objects
    containing only 'source' in metadata and the original page_content.
    """
    minimal_docs: List[Document] = []
    for doc in docs:
        src = doc.metadata.get("source")
        minimal_docs.append(
            Document(
                page_content=doc.page_content,
                metadata={"source": src}
            )
        )
    return minimal_docs

In [11]:
minimal_docs = filter_to_minimal_docs(extracted_data)

In [12]:
minimal_docs[1:10]

[Document(metadata={'source': 'data/microbiome_in_cancer.pdf'}, page_content="Funding information\nMacao Science and Technology\nDevelopment Fund,\nGrant/Award Numbers: 0009/2022/AKP,\n0073/2021/A2; Hunan Youth Science and\nTechnology Talent Project,\nGrant/Award Number: NO.2023RC3074;\nthe Science and Technology Program of\nGuangzhou, Grant/Award Number:\n2023A04J1257; the Guangdong Basic and\nApplied Basic Research Foundation,\nGrant/Award Number:\n2022A1515111212; the National Natural\nScience Foundation of China,\nGrant/Award Numbers: 82172750,\n82172811, 82260546, 82373129; the\nNatural Science Foundation of\nGuangdong Province,\nGrant/Award Number:\n2021A1515012593; Multiyear research\ngrant (MYRG), Grant/Award Number:\nMYRG‐GRG2023‐00150‐FHS‐UMDF to\nK. M.\nmicrobiota transplantation, and engineered microbes offer new avenues for\nadjunctive cancer therapy. This review provides a roadmap for future investiga-\ntion and underscores the transformative promise of microbiome modulat

In [13]:
# Split the documents into smaller chunks
def text_split(minimal_docs):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=500,
        chunk_overlap=20,
    )
    texts_chunk = text_splitter.split_documents(minimal_docs)
    return texts_chunk

In [14]:
texts_chunk = text_split(minimal_docs)
print(f"Number of chunks: {len(texts_chunk)}")

Number of chunks: 1454


In [15]:
texts_chunk[1:5]

[Document(metadata={'source': 'data/microbiome_in_cancer.pdf'}, page_content='Jindong Xie15 | Xinpei Deng16 | Qi Wang17 | Jianying Xu18 |\nWenjie Shi19 | Chang Qi20 | Chunrun Qu21 | Xufeng Huang22,23 |\nAndrás Hajdu22 | Chaoqun Li24 | Changmin Peng25 | Xuanye Cao26 |\nGuangsheng Pei27 | Lin Zhang28 | Yujia Huo28 | Jiabao Xu29 |\nAntonino Glaviano30 | Attila Gábor Szöllősi31 | Sicheng Bian32 |\nZhengrui Li33 | Hailin Tang15 | Bufu Tang34 | Zaoqu Liu35 |\nJian Zhang2 | Kai Miao5,36 | Quan Cheng37,38 | Ting Wei2 |\nShuofeng Yuan39,40 | Peng Luo1,2,40\nCorrespondence'),
 Document(metadata={'source': 'data/microbiome_in_cancer.pdf'}, page_content="Correspondence\nQuan Cheng, Department of Neurosurgery,\nXiangya Hospital, Central South University,\nChangsha 410008, China.\nEmail: chengquan@csu.edu.cn\nTing Wei, Department of Oncology,\nZhujiang Hospital, Southern Medical\nUniversity, Guangzhou 510282, China.\nEmail: weitingyouyou@qq.com\nShuofeng Yuan, Department of Infectious\nDisease and M

In [16]:
# from langchain.embeddings import HuggingFaceEmbeddings
from langchain_community.embeddings import HuggingFaceEmbeddings


def download_embeddings():
    """
    Download and return the HuggingFace embeddings model.
    """
    model_name = "sentence-transformers/all-MiniLM-L6-v2"
    embeddings = HuggingFaceEmbeddings(
        model_name=model_name,
        model_kwargs={"device": "cpu"},                 # for CPU
        # encode_kwargs={"normalize_embeddings": True}, # for CPU
    )
    return embeddings

embedding = download_embeddings()

  embeddings = HuggingFaceEmbeddings(


In [17]:
embedding

HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False, 'architecture': 'BertModel'})
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
), model_name='sentence-transformers/all-MiniLM-L6-v2', cache_folder=None, model_kwargs={'device': 'cpu'}, encode_kwargs={}, multi_process=False, show_progress=False)

In [18]:
vector = embedding.embed_query("Hello world")
vector

[-0.03447726368904114,
 0.031023193150758743,
 0.0067349947057664394,
 0.02610894665122032,
 -0.03936201333999634,
 -0.16030244529247284,
 0.06692396849393845,
 -0.006441481877118349,
 -0.04745052754878998,
 0.014758824370801449,
 0.07087526470422745,
 0.05552756413817406,
 0.019193341955542564,
 -0.026251332834362984,
 -0.010109550319612026,
 -0.026940450072288513,
 0.022307418286800385,
 -0.022226607427001,
 -0.1496926248073578,
 -0.01749303564429283,
 0.007676276378333569,
 0.0543522946536541,
 0.00325443921610713,
 0.03172590211033821,
 -0.08462145179510117,
 -0.0294059868901968,
 0.051595594733953476,
 0.048124048858881,
 -0.003314807778224349,
 -0.05827918276190758,
 0.04196925461292267,
 0.022210685536265373,
 0.1281888484954834,
 -0.02233896590769291,
 -0.01165627222508192,
 0.06292841583490372,
 -0.03287633880972862,
 -0.09122608602046967,
 -0.031175406649708748,
 0.052699532359838486,
 0.04703480750322342,
 -0.08420310169458389,
 -0.030056176707148552,
 -0.02074483036994934,


In [19]:
print( "Vector length:", len(vector))

Vector length: 384


In [57]:
from dotenv import load_dotenv
import os
load_dotenv()

True

In [59]:
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
os.environ["PINECONE_API_KEY"] = PINECONE_API_KEY
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY


In [22]:
from pinecone import Pinecone 
pinecone_api_key = PINECONE_API_KEY

pc = Pinecone(api_key=pinecone_api_key)

In [23]:
pc

<pinecone.pinecone.Pinecone at 0x74f0651b0af0>

In [24]:
from pinecone import ServerlessSpec 

index_name = "rag-guided-chatbot" #lowercase and hyphens

if not pc.has_index(index_name):
    pc.create_index(
        name = index_name,
        dimension=384,  # Dimension of the embeddings
        metric= "cosine",  # Cosine similarity
        spec=ServerlessSpec(cloud="aws", region="us-east-1")
    )


index = pc.Index(index_name)

In [25]:
from langchain_pinecone import PineconeVectorStore

docsearch = PineconeVectorStore.from_documents(
    documents=texts_chunk,
    embedding=embedding,
    index_name=index_name
)

In [26]:
# Load Existing index 
# Embed each chunk and upsert the embeddings into your Pinecone index.
docsearch = PineconeVectorStore.from_existing_index(
    index_name=index_name,
    embedding=embedding
)

## Add sample data to the existing Pinecone index

In [27]:
ukw_info = Document(
    page_content="Mr. UKW has a youtube channel on learning Machine learning technologies.",
    metadata={"source": "Youtube"}
)

In [28]:
docsearch.add_documents(documents=[ukw_info])

['8b0b79d3-bcbe-4051-917a-f3d011812c71']

In [30]:
retriever = docsearch.as_retriever(search_type="similarity", search_kwargs={"k":2})

In [37]:
retrieved_docs = retriever.invoke("What does gut microbiomes contribute to?")
retrieved_docs
retrieved_docs2 = retriever.invoke("What does Mr UKW have?")
retrieved_docs2

[Document(id='8b0b79d3-bcbe-4051-917a-f3d011812c71', metadata={'source': 'Youtube'}, page_content='Mr. UKW has a youtube channel on learning Machine learning technologies.'),
 Document(id='ab9437fa-62d1-4389-ac2b-736af56802ad', metadata={'source': 'data/microbiome_in_cancer.pdf'}, page_content='This is an open access article under the terms of theCreative Commons AttributionLicense, which permits use, distribution and reproduction in any medium, provided\nthe original work is properly cited.\nFor affiliations refer to page 65.\nAnqi Lin, Minying Xiong, Aimin Jiang, Lihaoyun Huang, Hank Z. H. Wong, and Suyin Feng contributed equally.\nZhengrui Li, Hailin Tang, Bufu Tang, Zaoqu Liu, Jian Zhang, Kai Miao were senior authors.')]

In [48]:
from langchain_openai import ChatOpenAI

chatModel = ChatOpenAI(model="gpt-4o-mini")

In [49]:
# from langchain.chains import create_retrieval_chain
# from langchain.chains.combine_documents import create_stuff_documents_chain
# from langchain_core.prompts import ChatPromptTemplate


from langchain_classic.chains.retrieval import create_retrieval_chain
from langchain_classic.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

In [51]:
system_prompt = (
    "You are an Medical assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer "
    "the question. If you don't know the answer, say that you "
    "don't know. Use three sentences maximum and keep the "
    "answer concise."
    "\n\n"
    "{context}"
)


prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}"),
    ]
)

In [54]:
question_answer_chain = create_stuff_documents_chain(chatModel, prompt)
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

In [55]:
response = rag_chain.invoke({"input": "What does gut microbiomes contribute to?"})   #what is Acromegaly and gigantism?"})
print(response["answer"])

Gut microbiomes contribute to nutrient absorption, maintaining gastrointestinal health, and regulating immune function. They help orchestrate microbial balance and can influence cancer pathogenesis. Additionally, they are involved in conditions like inflammatory bowel disease and colorectal cancers.


In [56]:
response = rag_chain.invoke({"input": "does ukw has a youtube channel?"})
print(response["answer"])

Yes, Mr. UKW has a YouTube channel focused on learning machine learning technologies.
