In [4]:
## Data Ingestion
from langchain_community.document_loaders import TextLoader

# Read the local file speech.txt into a list of LangChain Document objects.
loader = TextLoader("speech.txt")
text_documents = loader.load()
# Bare last expression: the notebook renders the loaded list as the cell output.
text_documents

[Document(page_content='The world must be made safe for democracy. Its peace must be planted upon the tested foundations of political liberty. We have no selfish ends to serve. We desire no conquest, no dominion. We seek no indemnities for ourselves, no material compensation for the sacrifices we shall freely make. We are but one of the champions of the rights of mankind. We shall be satisfied when those rights have been made as secure as the faith and the freedom of nations can make them.\n\nJust because we fight without rancor and without selfish object, seeking nothing for ourselves but what we shall wish to share with all free peoples, we shall, I feel confident, conduct our operations as belligerents without passion and ourselves observe with proud punctilio the principles of right and of fair play we profess to be fighting for.\n\nI have said nothing of the governments allied with the Imperial government of Germany because they have not made war upon us or challenged us to defend

In [6]:
import os
from dotenv import load_dotenv

# Load variables from a local .env file (if one exists) into the environment.
load_dotenv()

# Fail fast with a readable message when the key is absent. The previous
# `os.environ[...] = os.getenv(...)` raised an opaque
# "TypeError: str expected, not NoneType" in that case, because os.environ
# only accepts string values.
openai_api_key = os.getenv("OPENAI_API_KEY")
if not openai_api_key:
    raise RuntimeError(
        "OPENAI_API_KEY is not set; add it to your .env file or export it "
        "before running this notebook."
    )
os.environ["OPENAI_API_KEY"] = openai_api_key


In [9]:
# Web-based loader
import bs4
from langchain_community.document_loaders import WebBaseLoader

# Load a single blog post. The SoupStrainer restricts parsing to elements
# whose CSS class is one of the three listed below.
# NOTE: this rebinds `loader` and `text_documents`, replacing the objects
# created from speech.txt.
loader = WebBaseLoader(
    web_paths=("https://lilianweng.github.io/posts/2023-06-23-agent/",),
    bs_kwargs={
        "parse_only": bs4.SoupStrainer(
            class_=("post-title", "post-content", "post-header")
        )
    },
)

text_documents = loader.load()

In [10]:
# Display the documents currently bound to `text_documents` (the web page load).
text_documents

[Document(page_content='\n\n      LLM Powered Autonomous Agents\n    \nDate: June 23, 2023  |  Estimated Reading Time: 31 min  |  Author: Lilian Weng\n\n\nBuilding agents with LLM (large language model) as its core controller is a cool concept. Several proof-of-concepts demos, such as AutoGPT, GPT-Engineer and BabyAGI, serve as inspiring examples. The potentiality of LLM extends beyond generating well-written copies, stories, essays and programs; it can be framed as a powerful general problem solver.\nAgent System Overview#\nIn a LLM-powered autonomous agent system, LLM functions as the agent’s brain, complemented by several key components:\n\nPlanning\n\nSubgoal and decomposition: The agent breaks down large tasks into smaller, manageable subgoals, enabling efficient handling of complex tasks.\nReflection and refinement: The agent can do self-criticism and self-reflection over past actions, learn from mistakes and refine them for future steps, thereby improving the quality of final re

In [12]:
## PDF Reader
from langchain_community.document_loaders import PyPDFLoader

# Load the PDF into `docs`. In the displayed result each Document carries a
# 'page' index in its metadata, and some pages (e.g. 0-2) have empty
# page_content. Note that `loader` is rebound here.
loader = PyPDFLoader("Like_War_The_Weaponization_of_Social_Media.pdf")
docs = loader.load()

In [13]:
# Display the full list of Documents loaded from the PDF.
docs

[Document(page_content='', metadata={'source': 'Like_War_The_Weaponization_of_Social_Media.pdf', 'page': 0}),
 Document(page_content='', metadata={'source': 'Like_War_The_Weaponization_of_Social_Media.pdf', 'page': 1}),
 Document(page_content='', metadata={'source': 'Like_War_The_Weaponization_of_Social_Media.pdf', 'page': 2}),
 Document(page_content='C o n t e n t s\nT itle Page\nContents\nCopyright\nEndpaper\nDedication\nThe W ar Begins\nEvery W ire a Nerve\nThe T ruth Is Out There\nThe Empires Strike Back\nThe Unreality Machine\nW in the Net, W in the Day\nLikeW ar\nMasters of the Universe\nConclusion\nAcknowledgments\nNotes\nIndex\nSample Chapter fr om GHOST FLEET\nBuy the Book\nAbout the Author\nEndpaper\nConnect with HMH', metadata={'source': 'Like_War_The_Weaponization_of_Social_Media.pdf', 'page': 3}),
 Document(page_content='Copyright © 2018 by P . W . Singer and Emerson T . Brooking\nAll rights reserved\nFor information about permission to reproduce selections from this book,

## Data loading complete; now moving on to the transformation step

![alt text](image.png)

In [17]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Split the loaded PDF pages into chunks of at most 1000 units, with 200
# units of overlap between neighbouring chunks, then preview the first five.
text_splittter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)
documents = text_splittter.split_documents(docs)
documents[:5]

[Document(page_content='C o n t e n t s\nT itle Page\nContents\nCopyright\nEndpaper\nDedication\nThe W ar Begins\nEvery W ire a Nerve\nThe T ruth Is Out There\nThe Empires Strike Back\nThe Unreality Machine\nW in the Net, W in the Day\nLikeW ar\nMasters of the Universe\nConclusion\nAcknowledgments\nNotes\nIndex\nSample Chapter fr om GHOST FLEET\nBuy the Book\nAbout the Author\nEndpaper\nConnect with HMH', metadata={'source': 'Like_War_The_Weaponization_of_Social_Media.pdf', 'page': 3}),
 Document(page_content='Copyright © 2018 by P . W . Singer and Emerson T . Brooking\nAll rights reserved\nFor information about permission to reproduce selections from this book, write to\ntrade.permissions@hmhco.com  or to Permissions, Houghton Mif flin Harcourt Publishing Company ,\n3 Park A venue, 19th Floor , New Y ork, New Y ork 10016.\nhmhco.com\nLibrary of Congr ess Cataloging-in-Publication Data is available.\nISBN  978-1-328-69574-1\ne ISBN  978-1-328-69575-8\nv1.0818\nCover design by Mark R. R

In [18]:
# Display every chunk produced by the splitter (the full list, not a preview).
documents

[Document(page_content='C o n t e n t s\nT itle Page\nContents\nCopyright\nEndpaper\nDedication\nThe W ar Begins\nEvery W ire a Nerve\nThe T ruth Is Out There\nThe Empires Strike Back\nThe Unreality Machine\nW in the Net, W in the Day\nLikeW ar\nMasters of the Universe\nConclusion\nAcknowledgments\nNotes\nIndex\nSample Chapter fr om GHOST FLEET\nBuy the Book\nAbout the Author\nEndpaper\nConnect with HMH', metadata={'source': 'Like_War_The_Weaponization_of_Social_Media.pdf', 'page': 3}),
 Document(page_content='Copyright © 2018 by P . W . Singer and Emerson T . Brooking\nAll rights reserved\nFor information about permission to reproduce selections from this book, write to\ntrade.permissions@hmhco.com  or to Permissions, Houghton Mif flin Harcourt Publishing Company ,\n3 Park A venue, 19th Floor , New Y ork, New Y ork 10016.\nhmhco.com\nLibrary of Congr ess Cataloging-in-Publication Data is available.\nISBN  978-1-328-69574-1\ne ISBN  978-1-328-69575-8\nv1.0818\nCover design by Mark R. R

In [19]:
# The entire PDF document is now divided into chunks; next we convert them into vectors.

In [21]:
## Vector Embeddings and Vector Store

# NOTE(review): newer LangChain releases may prefer the `langchain-openai`
# package for OpenAIEmbeddings - confirm against the installed version.
from langchain_community.embeddings import OpenAIEmbeddings

# Import Chroma from langchain_community, consistent with the FAISS import
# used later in this notebook; `langchain.vectorstores` is a deprecated
# re-export of the same class.
from langchain_community.vectorstores import Chroma

# Embed the first 20 chunks with OpenAI embeddings and index them in a Chroma
# vector store bound to `db`.
db = Chroma.from_documents(documents[:20], OpenAIEmbeddings())


  warn_deprecated(


In [41]:
# Run a similarity search against the Chroma store (`db`) and show the text
# of the first returned Document.
query = "design by"
result = db.similarity_search(query)
result[0].page_content

'Copyright © 2018 by P . W . Singer and Emerson T . Brooking\nAll rights reserved\nFor information about permission to reproduce selections from this book, write to\ntrade.permissions@hmhco.com  or to Permissions, Houghton Mif flin Harcourt Publishing Company ,\n3 Park A venue, 19th Floor , New Y ork, New Y ork 10016.\nhmhco.com\nLibrary of Congr ess Cataloging-in-Publication Data is available.\nISBN  978-1-328-69574-1\ne ISBN  978-1-328-69575-8\nv1.0818\nCover design by Mark R. Robinson\nCover images © Shutterstock\nSinger photograph © Sam Cole\nBrooking photograph © T im Coburn\nChapter 8 epigraph quote  is from the film Sneakers,  directed by Phil Alden Robinson. Universal\nPictures, 1992. Used by permission of W alter F . Parkes.\nEndpaper illustration copyright © by Doan T rang'

In [34]:
## FAISS Vector Database

from langchain_community.vectorstores import FAISS

# Build a second store (`db1`) from the same first 20 chunks, embedding them
# again with a new OpenAIEmbeddings() instance.
db1 = FAISS.from_documents(documents[:20], OpenAIEmbeddings())

In [39]:
# Run a similarity search against the FAISS store (`db1`) and show the text
# of the first returned Document. This rebinds `query` and `result`.
query = "It sounds a dreadful thing to say"
result = db1.similarity_search(query)
result[0].page_content

'It sounds a dreadful thing to say , but these are things that don’ t necessarily\nneed to be true as long as they’re believed.\n \n— ALEXANDER NIX'