## RAG PIPELINE WITH VECTOR DATABASE

In [1]:
##Data ingestion
#txt file  loader
from langchain_community.document_loaders import TextLoader
loader=TextLoader("speech.txt")
docs=loader.load()
docs

[Document(metadata={'source': 'speech.txt'}, page_content='The world must be made safe for democracy. Its peace must be planted upon the tested foundations of political liberty. We have no selfish ends to serve. We desire no conquest, no dominion. We seek no indemnities for ourselves, no material compensation for the sacrifices we shall freely make. We are but one of the champions of the rights of mankind. We shall be satisfied when those rights have been made as secure as the faith and the freedom of nations can make them.\n\nJust because we fight without rancor and without selfish object, seeking nothing for ourselves but what we shall wish to share with all free peoples, we shall, I feel confident, conduct our operations as belligerents without passion and ourselves observe with proud punctilio the principles of right and of fair play we profess to be fighting for.\n\n…\n\nIt will be all the easier for us to conduct ourselves as belligerents in a high spirit of right and fairness be

In [2]:
import os
from dotenv import load_dotenv
load_dotenv()
langchain_api_key = os.getenv("LANGCHAIN_API_KEY")
if not langchain_api_key:
    raise ValueError("LANGCHAIN_API_KEY environment variable not set.")
os.environ["LANGCHAIN_API_KEY"] = langchain_api_key
os.environ["LANGCHAIN_TRACING_V2"] = "true"

In [3]:
#web site loader
from langchain_community.document_loaders import WebBaseLoader
import bs4

loader=WebBaseLoader(web_path=("https://en.wikipedia.org/wiki/Thakurgaon_District#:~:text=Thakurgaon%20is%20in%20the%20north,of%20the%20Himalayan%20plain%20land.",),
                     bs_kwargs=dict(parse_only=bs4.SoupStrainer(
                                class_=("mw-page-container"))))
docs=loader.load()
docs

USER_AGENT environment variable not set, consider setting it to identify your requests.


[Document(metadata={'source': 'https://en.wikipedia.org/wiki/Thakurgaon_District#:~:text=Thakurgaon%20is%20in%20the%20north,of%20the%20Himalayan%20plain%20land.'}, page_content='\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nContents\nmove to sidebar\nhide\n\n\n\n\n(Top)\n\n\n\n\n\n1\nSubdistricts\n\n\n\n\n\n\n\n\n2\nEtymology\n\n\n\n\n\n\n\n\n3\nHistory\n\n\n\n\n\n\n\n\n4\nEthnography\n\n\n\n\n\n\n\n\n5\nGeography\n\n\n\n\n\n\n\n\n6\nDemographics\n\n\n\n\n\n\n\n\n7\nEconomy\n\n\n\n\n\n\n\n\n8\nPlaces of interest[13]\n\n\n\n\n\n\n\n\n9\nCulture[14]\n\n\n\n\n\n\n\n\n10\nNotable people\n\n\n\n\n\n\n\n\n11\nGallery\n\n\n\n\n\n\n\n\n12\nSee also\n\n\n\n\n\n\n\n\n13\nNotes\n\n\n\n\n\n\n\n\n14\nReferences\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nToggle the table of contents\n\n\n\n\n\n\n\nThakurgaon District\n\n\n\n26 languages\n\n\n\n\nবাংলা閩南語 / Bân-lâm-gúCebuanoDeutschडोटेलीΕλληνικάEspañolفارسیFrançais한국어हिन्दीবিষ্ণুপ্রিয়া মণিপুরীItalianoNederlandsनेपालीپنجابیRomânăРусскийᱥᱟᱱᱛᱟᱲᱤSimple EnglishSvenska

In [4]:
#pdf loader
from langchain_community.document_loaders import PyPDFLoader
loader=PyPDFLoader("attention.pdf")
docs=loader.load()
docs

[Document(metadata={'source': 'attention.pdf', 'page': 0}, page_content='Provided proper attribution is provided, Google hereby grants permission to\nreproduce the tables and figures in this paper solely for use in journalistic or\nscholarly works.\nAttention Is All You Need\nAshish Vaswani∗\nGoogle Brain\navaswani@google.comNoam Shazeer∗\nGoogle Brain\nnoam@google.comNiki Parmar∗\nGoogle Research\nnikip@google.comJakob Uszkoreit∗\nGoogle Research\nusz@google.com\nLlion Jones∗\nGoogle Research\nllion@google.comAidan N. Gomez∗ †\nUniversity of Toronto\naidan@cs.toronto.eduŁukasz Kaiser∗\nGoogle Brain\nlukaszkaiser@google.com\nIllia Polosukhin∗ ‡\nillia.polosukhin@gmail.com\nAbstract\nThe dominant sequence transduction models are based on complex recurrent or\nconvolutional neural networks that include an encoder and a decoder. The best\nperforming models also connect the encoder and decoder through an attention\nmechanism. We propose a new simple network architecture, the Transformer,\n

In [5]:
#split the data into chunks
from langchain.text_splitter import RecursiveCharacterTextSplitter
spliter=RecursiveCharacterTextSplitter(chunk_size=1000,chunk_overlap=200)
chunk_documents=spliter.split_documents(docs)
len(chunk_documents)


52

In [6]:
#embedding into vector and vector store
#from langchain_community.embeddings import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import OllamaEmbeddings
import faiss

from langchain_community.docstore.in_memory import InMemoryDocstore

embeddings=OllamaEmbeddings(model="llama3.2")
single_vector=embeddings.embed_query("aiman")
ndex=faiss.IndexFlatL2(len(single_vector))
vector_store=FAISS(
    embedding_function=embeddings,
    index=ndex,
    docstore=InMemoryDocstore(),
    index_to_docstore_id={}
)
ids=vector_store.add_documents(documents=chunk_documents)


  embeddings=OllamaEmbeddings(model="llama3.2")


In [7]:
len(ids)

52

In [10]:
# #store into local 
db_name="myData"
#vector_store.save_local(db_name)

In [11]:
#load local
new_vector=FAISS.load_local(db_name,embeddings=embeddings,allow_dangerous_deserialization=True)
len(new_vector.index_to_docstore_id)

52

In [12]:
question="what is neural network"
docs=new_vector.search(query=question,search_type='similarity')
for doc in docs:
    print(doc.page_content)
    print("\n\n")

1 Introduction
Recurrent neural networks, long short-term memory [ 13] and gated recurrent [ 7] neural networks
in particular, have been firmly established as state of the art approaches in sequence modeling and
transduction problems such as language modeling and machine translation [ 35,2,5]. Numerous
efforts have since continued to push the boundaries of recurrent language models and encoder-decoder
architectures [38, 24, 15].
Recurrent models typically factor computation along the symbol positions of the input and output
sequences. Aligning the positions to steps in computation time, they generate a sequence of hidden
states ht, as a function of the previous hidden state ht−1and the input for position t. This inherently
sequential nature precludes parallelization within training examples, which becomes critical at longer
sequence lengths, as memory constraints limit batching across examples. Recent work has achieved



One is the total computational complexity per layer. Another is 

RAG WITH MULTI DATA SOURCE


In [24]:
#wikipedia tool
from langchain_community.tools import WikipediaQueryRun
from langchain_community.utilities import WikipediaAPIWrapper

api_wrapper=WikipediaAPIWrapper(top_k_results=1,doc_content_chars_max=200)
wiki=WikipediaQueryRun(api_wrapper=api_wrapper)

In [63]:
#retriver custom tool for pdf
from langchain.tools.retriever import create_retriever_tool
retriever=new_vector.as_retriever(search_type='mmr',search_kwargs={'k':3,'fetch_k':100,'lamda_mult':1})
retriever
retriver_tool=create_retriever_tool(retriever,
                                    name="langsmit_search",
                                    description="search for information about langsmit")
retriver_tool.args


{'query': {'description': 'query to look up in retriever',
  'title': 'Query',
  'type': 'string'}}

In [64]:
#arxiv tool
from langchain_community.tools import ArxivQueryRun
from langchain_community.utilities import ArxivAPIWrapper
arxiv_wrapper=ArxivAPIWrapper(top_k_results=1,doc_content_chars_max=200)
arxiv=ArxivQueryRun(api_wrapper=arxiv_wrapper)

In [None]:
#request sequence
from langchain_community.agent_toolkits.openapi.base import OpenAPIToolkit

from langchain_core.tools import tool
# Define your tools
tools = ['wiki','arxiv', 'retriver_tool']



In [113]:
#initalize the LLm model
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_community.llms import Ollama

import streamlit as st
import os
from dotenv import load_dotenv

load_dotenv()

# Verify environment variable
langchain_api_key = os.getenv("LANGCHAIN_API_KEY")
if not langchain_api_key:
    raise ValueError("LANGCHAIN_API_KEY environment variable not set.")
os.environ["LANGCHAIN_API_KEY"] = langchain_api_key
os.environ["LANGCHAIN_TRACING_V2"] = "true"
from langchain_ollama import ChatOllama

llm = ChatOllama(
    model="llama3.2",
    temperature=0,
    # other params...
).bind_tools()

TypeError: ChatOllama.bind_tools() missing 1 required positional argument: 'tools'

In [109]:
#promt templete
from langchain import hub
prompt=hub.pull("hwchase17/openai-functions-agent")
prompt.messages

[SystemMessagePromptTemplate(prompt=PromptTemplate(input_variables=[], input_types={}, partial_variables={}, template='You are a helpful assistant'), additional_kwargs={}),
 MessagesPlaceholder(variable_name='chat_history', optional=True),
 HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['input'], input_types={}, partial_variables={}, template='{input}'), additional_kwargs={}),
 MessagesPlaceholder(variable_name='agent_scratchpad')]

In [114]:
#creating agents

from langchain.chains.llm import LLMChain
from langchain.agents import create_json_chat_agent,AgentExecutor

agent=create_json_chat_agent(llm,tools,prompt)
# Create an agent executor by passing in the agent and tools
agent_executor = AgentExecutor(
    agent=agent, tools=tools, verbose=True, handle_parsing_errors=True
)

ValueError: Prompt missing required variables: {'tools', 'tool_names'}