In [None]:
# Document ingestion: read the biology textbook PDF into LangChain documents.
from langchain_community.document_loaders import (
    TextLoader,
    PyPDFLoader,
)

PDF_SOURCE = 'data/ConceptsofBiology.pdf'

# `doc` holds whatever PyPDFLoader.load() returns; the chunking cell feeds it
# straight to the text splitter.
loader = PyPDFLoader(PDF_SOURCE)
doc = loader.load()

In [3]:
# Document transformation: cut the loaded documents into overlapping chunks.
from langchain.text_splitter import RecursiveCharacterTextSplitter

# chunk_size / chunk_overlap are passed through unchanged: 1000 and 200.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)
split_documents = text_splitter.split_documents(doc)

In [24]:
import os, yaml

# Read the app.yaml file
with open('config/app.yaml', 'r') as file:
    # safe_load returns None for an empty document; fall back to an empty
    # mapping so the key lookup below reports a clear error instead of an
    # AttributeError on None.
    config = yaml.safe_load(file) or {}

if not isinstance(config, dict):
    raise ValueError("config/app.yaml must contain a YAML mapping")

# os.environ only accepts str values, so a missing key (config.get -> None)
# would otherwise surface as an opaque "str expected, not NoneType" TypeError.
openai_api_key = config.get('openai-api-key')
if not openai_api_key:
    raise ValueError("'openai-api-key' is missing or empty in config/app.yaml")
os.environ["OPENAI_API_KEY"] = openai_api_key

# Vector Embeddings and Vectorstore
# NOTE(review): LangChain documents langchain_community's OpenAIEmbeddings as
# deprecated in favour of the langchain_openai package - consider migrating
# once that dependency is available.
from langchain_community.embeddings import OpenAIEmbeddings, OllamaEmbeddings
from langchain_community.vectorstores import Chroma,FAISS

# OpenAIEmbeddings() is built without an explicit key, so it presumably picks
# up OPENAI_API_KEY from the environment variable set above. The chunks from
# the splitting cell are embedded and indexed in a FAISS vector store.
vector_db = FAISS.from_documents(split_documents, OpenAIEmbeddings())

In [25]:
# Similarity search: fetch the stored chunks closest to the question text.
query = "What is mitochondria?"

results = vector_db.similarity_search(query=query)

In [26]:
# Echo the documents returned by the similarity search above.
results

[Document(metadata={'source': 'data/ConceptsofBiology.pdf', 'page': 81}, page_content='smaller in prokaryotic cells. They are particularly abundant in immature red blood cells for the synthesis of\nhemoglobin, which functions in the transport of oxygen throughout the body.\nMitochondria\nMitochondria(singular = mitochondrion) are often called the “powerhouses” or “energy factories” of a cell because\nthey are responsible for making adenosine triphosphate (ATP), the cell’s main energy-carrying molecule. The\nformation of ATP from the breakdown of glucose is known as cellular respiration. Mitochondria are oval-shaped,\ndouble-membrane organelles (Figure 3.14) that have their own ribosomes and DNA. Each membrane is a\nphospholipid bilayer embedded with proteins. The inner layer has folds called cristae, which increase the surface\narea of the inner membrane. The area surrounded by the folds is called the mitochondrial matrix. The cristae and\nthe matrix have different roles in cellular re

In [27]:
# Bring an LLM into the RAG flow via the Ollama integration.
from langchain_ollama import OllamaLLM

OLLAMA_MODEL_NAME = 'llama3.2'
model = OllamaLLM(model=OLLAMA_MODEL_NAME)

In [28]:
# Echo the LLM object to confirm which model it was configured with.
model

OllamaLLM(model='llama3.2')

In [29]:
# Prompt template with two placeholders: {context} and {input}.
from langchain_core.prompts import ChatPromptTemplate

# Built from explicit pieces so the leading newline and the trailing spaces
# on the first two sentences stay visible.
RAG_TEMPLATE = (
    "\n"
    "Answer the questions based only on the provided context. \n"
    "Think step by step before providing a detailed answer. \n"
    "<context> {context} </context>\n"
    "Question: {input}"
)

prompt = ChatPromptTemplate.from_template(RAG_TEMPLATE)

In [30]:
# Create the document chain: it combines the LLM with the prompt template
# defined in the earlier cells.
from langchain.chains.combine_documents import create_stuff_documents_chain

doc_chain = create_stuff_documents_chain(
    llm=model,
    prompt=prompt,
)

In [32]:
# Adding retriever: expose the FAISS vector store through the retriever
# interface so it can be handed to create_retrieval_chain.
retriever = vector_db.as_retriever()
# Bare expression so the notebook echoes the retriever's repr.
retriever

VectorStoreRetriever(tags=['FAISS', 'OpenAIEmbeddings'], vectorstore=<langchain_community.vectorstores.faiss.FAISS object at 0x32faae590>, search_kwargs={})

In [33]:
# Wire retrieval and answer generation together: the retriever supplies the
# documents and doc_chain turns them into an answer.
from langchain.chains import create_retrieval_chain

retriever_chain = create_retrieval_chain(
    retriever=retriever,
    combine_docs_chain=doc_chain,
)


In [35]:
# Run the full RAG chain on a question; the chain expects it under "input".
chain_inputs = {"input": "What is mitochondria? Explain the concept in details."}
response = retriever_chain.invoke(chain_inputs)

# Keep only the generated text from the response mapping.
answer = response['answer']

In [36]:
# Echo the generated answer extracted from the chain response.
answer

'Based on the provided context, here\'s a detailed explanation of what mitochondria are:\n\nMitochondria are often referred to as the "powerhouses" or "energy factories" of a cell because they play a crucial role in making adenosine triphosphate (ATP), the cell\'s main energy-carrying molecule. This process is known as cellular respiration.\n\nThe formation of ATP from the breakdown of glucose occurs within the mitochondria, releasing energy stored in nutrients. Mitochondria are oval-shaped, double-membrane organelles that have their own ribosomes and DNA. Each membrane is a phospholipid bilayer embedded with proteins. The inner layer has folds called cristae, which increase the surface area of the inner membrane. The area surrounded by the folds is called the mitochondrial matrix.\n\nThe cristae and the matrix have different roles in cellular respiration. The cristae are involved in releasing energy stored in nutrients, while the matrix is responsible for storing and releasing ATP dur

In [37]:
# Second question through the same RAG chain (overwrites response/answer).
chain_inputs = {"input": "Which innate immune system component uses MHC class I molecules directly in its defense strategy"}
response = retriever_chain.invoke(chain_inputs)

# Keep only the generated text from the response mapping.
answer = response['answer']

In [38]:
# Echo the generated answer for the second question.
answer

"To answer this question, let's analyze the information provided in the context.\n\nAccording to the text, NK cells identify intracellular infections, especially from viruses, by the altered expression of major histocompatibility complex (MHC) I molecules on the surface of infected cells. This indicates that NK cells directly use MHC class I molecules as part of their defense strategy.\n\nSo, based on this information, the answer to question 9 is:\n\nc. NK cells"