## Library Imports

In [None]:
import os
import bs4
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import TextLoader
from langchain_community.embeddings.ollama import OllamaEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_community.vectorstores import FAISS


## Playing with the Data Ingestion

### Loading a text document 

In [None]:
# Read the local text file through LangChain's TextLoader.
txtloader = TextLoader(file_path="speech.txt")
txt_docs = txtloader.load()

# load() returns a list; show the text of its first entry.
print(txt_docs[0].page_content)

### Web Content Loading 
- To find the CSS class names to filter on, open the browser's developer tools (right-click → Inspect) and look at the `class` attributes of the page elements in the Elements panel.


In [None]:
# Load the HTML page, keeping only the elements whose CSS class is listed
# in the SoupStrainer below (everything else on the page is skipped).
webloader = WebBaseLoader(
    web_paths=("https://lilianweng.github.io/posts/2023-06-23-agent/",),
    bs_kwargs={
        "parse_only": bs4.SoupStrainer(
            class_=("post-title", "post-content", "post-header"),
        ),
    },
)
web_docs = webloader.load()

# Show the text of the first loaded document.
print(web_docs[0].page_content)

### PDF Content Loading

In [None]:
# Read the local PDF through LangChain's PyPDFLoader.
pdf_loader = PyPDFLoader(file_path="attention.pdf")
pdf_docs = pdf_loader.load()

# pdf_docs holds one entry per page, indexed 0 .. n-1 for an n-page file,
# so index 1 below is the second page.
print(pdf_docs[1].page_content)

## Playing with the Vector Store & Retrieving results using similarity search

- Using an LLM (here `wizardlm2` served through Ollama) to generate the embeddings yields better results than using a standalone open-source embedding model.

In [None]:
## Chroma DB
# Embed every entry of pdf_docs with the Ollama "wizardlm2:latest" model
# and store the resulting vectors in a Chroma vector store.
chroma_db = Chroma.from_documents(
    documents=pdf_docs,
    embedding=OllamaEmbeddings(model="wizardlm2:latest"),
)

In [None]:
# Ask the Chroma store for the documents most similar to the question.
query = "Who are the authors of the paper?"
retrieved_results = chroma_db.similarity_search(query=query)

# Show the text of the top-ranked match.
print(retrieved_results[0].page_content)

In [None]:
## FAISS Vector Database
from langchain_community.vectorstores import FAISS

# Embed every entry of pdf_docs with the Ollama "wizardlm2:latest" model
# and store the resulting vectors in a FAISS index.
fais_db = FAISS.from_documents(
    documents=pdf_docs,
    embedding=OllamaEmbeddings(model="wizardlm2:latest"),
)

In [None]:
# Ask the FAISS store for the documents most similar to the question.
query = "Who are the authors of the paper?"
retrieved_results = fais_db.similarity_search(query=query)

# Show the text of the top-ranked match.
print(retrieved_results[0].page_content)