In [3]:
## Data Ingestion
from langchain_community.document_loaders import TextLoader

# Load the speech transcript from disk as a list of LangChain Document objects.
# The encoding is pinned explicitly: without it TextLoader falls back to the
# platform's default codec, and the previously recorded output of this cell
# contained "â€¦" -- what a UTF-8 ellipsis looks like when it is decoded with a
# Windows code page.
# NOTE(review): assumes rag/speech.txt is stored as UTF-8 -- confirm.
loader = TextLoader("rag/speech.txt", encoding="utf-8")
text_documents = loader.load()

# Bare trailing expression so the notebook renders the loaded documents.
text_documents

[Document(page_content='The world must be made safe for democracy. Its peace must be planted upon the tested foundations of political liberty. We have no selfish ends to serve. We desire no conquest, no dominion. We seek no indemnities for ourselves, no material compensation for the sacrifices we shall freely make. We are but one of the champions of the rights of mankind. We shall be satisfied when those rights have been made as secure as the faith and the freedom of nations can make them.\n\nJust because we fight without rancor and without selfish object, seeking nothing for ourselves but what we shall wish to share with all free peoples, we shall, I feel confident, conduct our operations as belligerents without passion and ourselves observe with proud punctilio the principles of right and of fair play we profess to be fighting for.\n\nâ€¦\n\nIt will be all the easier for us to conduct ourselves as belligerents in a high spirit of right and fairness because we act without animus, not 

In [4]:
import os
from dotenv import load_dotenv

# Read key/value pairs from a local .env file (if there is one) into the
# process environment, so the OpenAI client used later can find its key.
load_dotenv()

# Fail fast with an actionable message when the key is missing.
# The earlier `os.environ[...] = os.getenv(...)` assignment only wrote the
# variable back onto itself when it was present, and raised an opaque
# TypeError (environment values must be str, not None) when it was absent.
if not os.getenv("OPENAI_API_KEY"):
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Export it in your shell or add it to a "
        ".env file before running this notebook."
    )

In [11]:
# Web-based loader
from langchain_community.document_loaders import WebBaseLoader
import bs4

# Fetch the blog post and hand BeautifulSoup a strainer so that only elements
# carrying one of the three CSS classes below are parsed.
# This cell only loads the page; no chunking or indexing happens here.
# NOTE: `loader` and `text_documents` are rebound here, replacing the values
# produced by the TextLoader cell above.
loader = WebBaseLoader(
    web_paths=("https://lilianweng.github.io/posts/2023-06-23-agent/",),
    bs_kwargs={
        "parse_only": bs4.SoupStrainer(
            class_=("post-title", "post-content", "post-header")
        )
    },
)

text_documents = loader.load()

In [12]:
# Display the documents returned by the web loader (bare expression -> rendered as the cell output).
text_documents

[Document(page_content='\n\n      LLM Powered Autonomous Agents\n    \nDate: June 23, 2023  |  Estimated Reading Time: 31 min  |  Author: Lilian Weng\n\n\nBuilding agents with LLM (large language model) as its core controller is a cool concept. Several proof-of-concepts demos, such as AutoGPT, GPT-Engineer and BabyAGI, serve as inspiring examples. The potentiality of LLM extends beyond generating well-written copies, stories, essays and programs; it can be framed as a powerful general problem solver.\nAgent System Overview#\nIn a LLM-powered autonomous agent system, LLM functions as the agent’s brain, complemented by several key components:\n\nPlanning\n\nSubgoal and decomposition: The agent breaks down large tasks into smaller, manageable subgoals, enabling efficient handling of complex tasks.\nReflection and refinement: The agent can do self-criticism and self-reflection over past actions, learn from mistakes and refine them for future steps, thereby improving the quality of final re

In [15]:
# PDF reader
# Loads rag/attention.pdf into `documents`. The recorded output of the next
# cell shows a list of Document objects whose metadata carries the 'source'
# path and a zero-based 'page' number.
# NOTE: this rebinds `loader`, which earlier cells used for other sources.
from langchain_community.document_loaders import PyPDFLoader
loader = PyPDFLoader('rag/attention.pdf')
documents = loader.load()

In [16]:
# Display every Document loaded from the PDF (bare expression -> rendered as the cell output).
documents

[Document(page_content='Deep Learning\nSequence to Sequence models: \nAttention Models\n1', metadata={'source': 'rag/attention.pdf', 'page': 0}),
 Document(page_content='Sequence-to-sequence modelling\n•Problem: \n–A sequence ଵேgoes in\n–A different sequence ଵெcomes out\n•E.g.\n–Speech recognition:  Speech goes in, a word sequence comes out\n•Alternately output may be phoneme or character sequence\n–Machine translation: Word sequence goes in, word sequence comes \nout\n–Dialog :  User statement goes in,  system response comes out\n–Question answering :  Question comes in, answer goes out\n•In general \n–No synchrony between and .\n2', metadata={'source': 'rag/attention.pdf', 'page': 1}),
 Document(page_content='Sequence to sequence\n•Sequence goes in,  sequence comes out\n•No notion of “time synchrony” between input and output\n–May even not even maintain order of symbols\n•E.g.   “I ate an apple” \uf0e0“Ich habeeinenapfelgegessen”\n–Or even seem related to the input\n•E.g. “My screen 

In [17]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Split the PDF documents into smaller, overlapping chunks for embedding.
# chunk_size / chunk_overlap are the splitter's size limit and the amount of
# text shared between neighbouring chunks (presumably counted in characters,
# the splitter's default length function -- confirm against the LangChain docs).
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)
docs = text_splitter.split_documents(documents)

# Preview only the first five chunks.
docs[:5]

[Document(page_content='Deep Learning\nSequence to Sequence models: \nAttention Models\n1', metadata={'source': 'rag/attention.pdf', 'page': 0}),
 Document(page_content='Sequence-to-sequence modelling\n•Problem: \n–A sequence ଵேgoes in\n–A different sequence ଵெcomes out\n•E.g.\n–Speech recognition:  Speech goes in, a word sequence comes out\n•Alternately output may be phoneme or character sequence\n–Machine translation: Word sequence goes in, word sequence comes \nout\n–Dialog :  User statement goes in,  system response comes out\n–Question answering :  Question comes in, answer goes out\n•In general \n–No synchrony between and .\n2', metadata={'source': 'rag/attention.pdf', 'page': 1}),
 Document(page_content='Sequence to sequence\n•Sequence goes in,  sequence comes out\n•No notion of “time synchrony” between input and output\n–May even not even maintain order of symbols\n•E.g.   “I ate an apple” \uf0e0“Ich habeeinenapfelgegessen”\n–Or even seem related to the input\n•E.g. “My screen 

In [18]:
# Display the full list of chunks produced by the splitter.
# NOTE(review): this dumps every chunk; the previous cell already previews docs[:5].
docs

[Document(page_content='Deep Learning\nSequence to Sequence models: \nAttention Models\n1', metadata={'source': 'rag/attention.pdf', 'page': 0}),
 Document(page_content='Sequence-to-sequence modelling\n•Problem: \n–A sequence ଵேgoes in\n–A different sequence ଵெcomes out\n•E.g.\n–Speech recognition:  Speech goes in, a word sequence comes out\n•Alternately output may be phoneme or character sequence\n–Machine translation: Word sequence goes in, word sequence comes \nout\n–Dialog :  User statement goes in,  system response comes out\n–Question answering :  Question comes in, answer goes out\n•In general \n–No synchrony between and .\n2', metadata={'source': 'rag/attention.pdf', 'page': 1}),
 Document(page_content='Sequence to sequence\n•Sequence goes in,  sequence comes out\n•No notion of “time synchrony” between input and output\n–May even not even maintain order of symbols\n•E.g.   “I ate an apple” \uf0e0“Ich habeeinenapfelgegessen”\n–Or even seem related to the input\n•E.g. “My screen 

In [19]:
# Vector embeddings and vector store (Chroma)
from langchain_community.embeddings import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma

# Build a Chroma store from the first 20 chunks only, embedding them with
# OpenAIEmbeddings (this needs OPENAI_API_KEY to be available).
db = Chroma.from_documents(
    documents=docs[:20],
    embedding=OpenAIEmbeddings(),
)

  warn_deprecated(


In [27]:
## Query the Chroma vector database
# Run a similarity search for the query text against `db` and show the text
# of the first returned document (presumably the closest match -- confirm the
# ordering guarantee in the vector store docs).
query = "Each word that is output depends only on current hidden state"
result = db.similarity_search(query)
result[0].page_content

'Simple recurrence : Text Modelling\n•Learn a model that can predict the next symbol \ngiven a sequence of symbols\n–Characters or words\n•After observing inputs it predicts \n–In reality, outputs a probability distribution for h-1\n\u0b34 ଵ ଶ ଷ ସ ହ \u0b3aଵ ଶ ଷ ସ ହ \u0b3a \u0b3b\n8'

In [29]:
# FAISS vector database
from langchain_community.vectorstores import FAISS

# Build a FAISS index from the same first 20 chunks, using a fresh
# OpenAIEmbeddings instance to embed them.
db1 = FAISS.from_documents(
    documents=docs[:20],
    embedding=OpenAIEmbeddings(),
)


In [30]:
## Query the FAISS vector database
# Same query as the Chroma cell, run against `db1`; shows the text of the
# first returned document. Note that `query` and `result` are rebound here.
query = "Each word that is output depends only on current hidden state"
result = db1.similarity_search(query)
result[0].page_content

'Simple recurrence : Text Modelling\n•Learn a model that can predict the next symbol \ngiven a sequence of symbols\n–Characters or words\n•After observing inputs it predicts \n–In reality, outputs a probability distribution for h-1\n\u0b34 ଵ ଶ ଷ ସ ହ \u0b3aଵ ଶ ଷ ସ ହ \u0b3a \u0b3b\n8'

In [33]:
# LanceDB vector database
from langchain_community.vectorstores import LanceDB

# Build a LanceDB store from the same first 20 chunks, using a fresh
# OpenAIEmbeddings instance to embed them.
db2 = LanceDB.from_documents(
    documents=docs[:20],
    embedding=OpenAIEmbeddings(),
)

In [35]:
# Query the LanceDB vector database
# Same query as the Chroma and FAISS cells, run against `db2`; shows the text
# of the first returned document. Note that `query` and `result` are rebound here.
query = "Each word that is output depends only on current hidden state"
result = db2.similarity_search(query)
result[0].page_content

'Simple recurrence : Text Modelling\n•Learn a model that can predict the next symbol \ngiven a sequence of symbols\n–Characters or words\n•After observing inputs it predicts \n–In reality, outputs a probability distribution for h-1\n\u0b34 ଵ ଶ ଷ ସ ହ \u0b3aଵ ଶ ଷ ସ ହ \u0b3a \u0b3b\n8'