In [1]:
# Document ingestion: read the Concepts of Biology PDF into LangChain documents.
from langchain_community.document_loaders import (
    TextLoader,
    PyPDFLoader,
)

# The loader is kept in its own variable so it can be inspected or reused.
loader = PyPDFLoader(file_path='data/ConceptsofBiology.pdf')
# `doc` holds whatever loader.load() returns; it is the input to the chunking step.
doc = loader.load()

In [2]:
# Document transformation: break the loaded documents into overlapping chunks.
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,    # upper bound on the size of each chunk
    chunk_overlap=200,  # amount shared between neighbouring chunks
)
split_documents = text_splitter.split_documents(documents=doc)

In [4]:
import os
import yaml

# Vector Embeddings and Vectorstore
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma

# Read the app.yaml file
with open('config/app.yaml', 'r') as file:
    config = yaml.safe_load(file)

# yaml.safe_load yields None for an empty file and may yield a list or scalar
# for other layouts; fail here with a clear message rather than on .get() below.
if not isinstance(config, dict):
    raise ValueError("config/app.yaml must contain a mapping with an 'openai-api-key' entry")

# os.environ only accepts string values, so a missing key (None) would
# otherwise surface as an opaque TypeError on the assignment.
api_key = config.get('openai-api-key')
if not isinstance(api_key, str) or not api_key.strip():
    raise ValueError("'openai-api-key' in config/app.yaml must be a non-empty string")
os.environ["OPENAI_API_KEY"] = api_key
del api_key  # do not leave the secret behind in an extra notebook variable

# Embed the chunks with OpenAI embeddings and index them in a Chroma vector store.
vector_db = Chroma.from_documents(split_documents, OpenAIEmbeddings())

In [5]:
# Similarity search: ask the vector DB for the stored chunks that best match the question.
query = "What is mitochondria?"
results = vector_db.similarity_search(query=query)

In [6]:
# Display the documents returned by the similarity search for `query`.
results

[Document(metadata={'page': 81, 'source': 'data/ConceptsofBiology.pdf'}, page_content='smaller in prokaryotic cells. They are particularly abundant in immature red blood cells for the synthesis of\nhemoglobin, which functions in the transport of oxygen throughout the body.\nMitochondria\nMitochondria(singular = mitochondrion) are often called the “powerhouses” or “energy factories” of a cell because\nthey are responsible for making adenosine triphosphate (ATP), the cell’s main energy-carrying molecule. The\nformation of ATP from the breakdown of glucose is known as cellular respiration. Mitochondria are oval-shaped,\ndouble-membrane organelles (Figure 3.14) that have their own ribosomes and DNA. Each membrane is a\nphospholipid bilayer embedded with proteins. The inner layer has folds called cristae, which increase the surface\narea of the inner membrane. The area surrounded by the folds is called the mitochondrial matrix. The cristae and\nthe matrix have different roles in cellular re

In [7]:
# Incorporate an LLM into the RAG flow: create the OpenAI chat model that will
# generate answers. The model name is fixed to 'gpt-3.5-turbo'.
# NOTE(review): presumably authenticates through the OPENAI_API_KEY environment
# variable set in the configuration cell - confirm.
from langchain_openai import ChatOpenAI
model = ChatOpenAI(model='gpt-3.5-turbo')

In [8]:
# Inspect the configured chat model object.
model

ChatOpenAI(client=<openai.resources.chat.completions.Completions object at 0x177d8db70>, async_client=<openai.resources.chat.completions.AsyncCompletions object at 0x177dff640>, root_client=<openai.OpenAI object at 0x117d688b0>, root_async_client=<openai.AsyncOpenAI object at 0x177d8dba0>, model_kwargs={}, openai_api_key=SecretStr('**********'))

In [9]:
# Prompt Template
# The template restricts the model to the supplied context and has two
# placeholders: {context} and {input}. The retrieval chain is invoked with an
# "input" key, and its response carries matching 'input' and 'context' entries.
from langchain_core.prompts import ChatPromptTemplate
prompt = ChatPromptTemplate.from_template("""
Answer the questions based only on the provided context. 
Provide detailed and clear answer based on the context. 
<context> {context} </context>
Question: {input}""")

In [10]:
# Assemble the RAG pipeline from the model, the prompt and the vector store.
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain

# Expose the vector store as a retriever for the chain to query.
retriever = vector_db.as_retriever()

# Chain that passes the supplied documents to the model through the prompt.
doc_chain = create_stuff_documents_chain(llm=model, prompt=prompt)

# Full pipeline: retrieve documents for the input, then hand them to doc_chain.
retriever_chain = create_retrieval_chain(
    retriever=retriever,
    combine_docs_chain=doc_chain,
)

In [11]:
# Run the retrieval chain end to end; the question is passed under the "input"
# key, which matches the {input} placeholder of the prompt template.
response = retriever_chain.invoke({"input" : "What is mitochondria? Explain the concept in details."})

In [12]:
# Display the chain output: a dict that includes the original 'input' and the
# retrieved 'context' documents.
response

{'input': 'What is mitochondria? Explain the concept in details.',
 'context': [Document(metadata={'page': 81, 'source': 'data/ConceptsofBiology.pdf'}, page_content='smaller in prokaryotic cells. They are particularly abundant in immature red blood cells for the synthesis of\nhemoglobin, which functions in the transport of oxygen throughout the body.\nMitochondria\nMitochondria(singular = mitochondrion) are often called the “powerhouses” or “energy factories” of a cell because\nthey are responsible for making adenosine triphosphate (ATP), the cell’s main energy-carrying molecule. The\nformation of ATP from the breakdown of glucose is known as cellular respiration. Mitochondria are oval-shaped,\ndouble-membrane organelles (Figure 3.14) that have their own ribosomes and DNA. Each membrane is a\nphospholipid bilayer embedded with proteins. The inner layer has folds called cristae, which increase the surface\narea of the inner membrane. The area surrounded by the folds is called the mitoch