# RAG Pipeline with vector database

In [4]:
from langchain_community.document_loaders import TextLoader

# Read the sample text file into a list of LangChain Document objects.
loader = TextLoader(file_path="file.txt")
docs = loader.load()

In [5]:
# Display the current contents of `docs` (the Document list produced by loader.load()).
docs

[Document(metadata={'source': 'file.txt'}, page_content='This book is aimed at the data scientist with some familiarity with the R\nprogramming language, and with some prior (perhaps spotty or ephemeral)\nexposure to statistics. Both of us came to the world of data science from the world\nof statistics, so we have some appreciation of the contribution that statistics can\nmake to the art of data science. At the same time, we are well aware of the\nlimitations of traditional statistics instruction: statistics as a discipline is a century\nand a half old, and most statistics textbooks and courses are laden with the\nmomentum and inertia of an ocean liner.')]

In [6]:
import os
from dotenv import load_dotenv

# Load variables from a .env file (if present) into the process environment so
# the API key never has to be hardcoded in the notebook.
load_dotenv()
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

# Fail fast with a clear message: os.getenv returns None when the variable is
# missing, and the problem would otherwise only surface later, inside the
# embedding cell, as a much less obvious error.
if not OPENAI_API_KEY:
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Add it to your environment or to a .env "
        "file before running the embedding cells."
    )

In [7]:
from langchain_community.document_loaders import PyPDFLoader

# Load the PDF into a list of Document objects.
# NOTE: this rebinds `loader` and `docs`, replacing the values created by the
# plain-text loading cell earlier in the notebook.
loader = PyPDFLoader("Stats.pdf")
docs = loader.load()

# Preview only the first couple of entries instead of dumping the whole list:
# the PDF metadata reports 562 pages, which would flood the notebook output.
docs[:2]

[Document(metadata={'producer': 'calibre 2.85.1 [https://calibre-ebook.com]', 'creator': 'calibre 2.85.1 [https://calibre-ebook.com]', 'creationdate': '2017-07-25T15:12:08+00:00', 'author': 'Peter Bruce & Andrew Bruce', 'title': 'Practical Statistics for Data Scientists: 50 Essential Concepts', 'moddate': '2017-07-25T20:45:33+05:30', 'source': 'Stats.pdf', 'total_pages': 562, 'page': 0, 'page_label': '1'}, page_content=''),
 Document(metadata={'producer': 'calibre 2.85.1 [https://calibre-ebook.com]', 'creator': 'calibre 2.85.1 [https://calibre-ebook.com]', 'creationdate': '2017-07-25T15:12:08+00:00', 'author': 'Peter Bruce & Andrew Bruce', 'title': 'Practical Statistics for Data Scientists: 50 Essential Concepts', 'moddate': '2017-07-25T20:45:33+05:30', 'source': 'Stats.pdf', 'total_pages': 562, 'page': 1, 'page_label': '2'}, page_content='Practical\tStatistics\tfor\t\nData\tScientists\n50\tEssential\tConcepts\nPeter\tBruce\tand\tAndrew\tBruce'),
 Document(metadata={'producer': 'calibr

In [8]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Splitter settings, named so they are easy to find and tune in one place.
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 200

# Split the loaded documents into overlapping chunks for embedding.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)
chunk_documents = text_splitter.split_documents(docs)

# Show a small preview rather than every chunk, to keep the cell output readable.
chunk_documents[:2]

[Document(metadata={'producer': 'calibre 2.85.1 [https://calibre-ebook.com]', 'creator': 'calibre 2.85.1 [https://calibre-ebook.com]', 'creationdate': '2017-07-25T15:12:08+00:00', 'author': 'Peter Bruce & Andrew Bruce', 'title': 'Practical Statistics for Data Scientists: 50 Essential Concepts', 'moddate': '2017-07-25T20:45:33+05:30', 'source': 'Stats.pdf', 'total_pages': 562, 'page': 1, 'page_label': '2'}, page_content='Practical\tStatistics\tfor\t\nData\tScientists\n50\tEssential\tConcepts\nPeter\tBruce\tand\tAndrew\tBruce'),
 Document(metadata={'producer': 'calibre 2.85.1 [https://calibre-ebook.com]', 'creator': 'calibre 2.85.1 [https://calibre-ebook.com]', 'creationdate': '2017-07-25T15:12:08+00:00', 'author': 'Peter Bruce & Andrew Bruce', 'title': 'Practical Statistics for Data Scientists: 50 Essential Concepts', 'moddate': '2017-07-25T20:45:33+05:30', 'source': 'Stats.pdf', 'total_pages': 562, 'page': 2, 'page_label': '3'}, page_content='Practical\tStatistics\tfor\tData\tScientist

In [9]:
from langchain_community.embeddings import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS

# Embed the chunked documents with OpenAI embeddings and index them in FAISS.
embeddings = OpenAIEmbeddings()
db = FAISS.from_documents(documents=chunk_documents, embedding=embeddings)
db

  db = FAISS.from_documents(chunk_documents, OpenAIEmbeddings())


<langchain_community.vectorstores.faiss.FAISS at 0x2b50f914980>

In [10]:
# Run a similarity search against the vector store and print the text of the
# first document it returns.
query = "KEY TERMS FOR DATA TYPES"
retrieved_result = db.similarity_search(query)
first_match = retrieved_result[0]
print(first_match.page_content)


Further	Reading
Data	types	can	be	confusing,	since	types	may	overlap,	and	the	taxonomy	in
one	software	may	differ	from	that	in	another.
	The	
R-Tutorial	website
	covers
the	taxonomy	for	R.
Databases	are	more	detailed	in	their	classification	of	data	types,
incorporating	considerations	of	precision	levels,	fixed-	or	variable-length
fields,	and	more;
	see	the	
W3Schools	guide	for	SQL
.


In [11]:
from langchain_core.prompts import ChatPromptTemplate

# Prompt text with two placeholders: {context} receives the retrieved document
# text and {input} receives the user's question. The string is kept verbatim
# (including surrounding whitespace) because it is sent to the model as-is.
PROMPT_TEMPLATE = """
Answer the following questions based on only on the provided context.
Think step by step before providing a detailed answer.
I will tip you $1000 if the user finds the answer helpful.
<context>
{context}
</context>

Question :{input}
                                          """
prompt = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)

In [12]:
from langchain_community.llms import Ollama

# Name of the Ollama model used to generate answers.
OLLAMA_MODEL = "deepseek-r1:1.5b"

llm = Ollama(model=OLLAMA_MODEL)
llm

  llm = Ollama(model="deepseek-r1:1.5b")


Ollama(model='deepseek-r1:1.5b')

In [13]:
# Chain: format the supplied documents into the prompt's {context} slot, pass
# the filled prompt to the LLM, and parse the model output to a string.
from langchain.chains.combine_documents import create_stuff_documents_chain

doc_chain = create_stuff_documents_chain(llm=llm, prompt=prompt)
doc_chain

RunnableBinding(bound=RunnableBinding(bound=RunnableAssign(mapper={
  context: RunnableLambda(format_docs)
}), kwargs={}, config={'run_name': 'format_inputs'}, config_factories=[])
| ChatPromptTemplate(input_variables=['context', 'input'], input_types={}, partial_variables={}, messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'input'], input_types={}, partial_variables={}, template='\nAnswer the following questions based on only on the provided context.\nThink step by step before providing a detailed answer.\nI will tip you $1000 if the user finds the answer helpful.\n<context>\n{context}\n</context>\n\nQuestion :{input}\n                                          '), additional_kwargs={})])
| Ollama(model='deepseek-r1:1.5b')
| StrOutputParser(), kwargs={}, config={'run_name': 'stuff_documents_chain'}, config_factories=[])