In [13]:
from langchain_community.document_loaders import PyPDFLoader
import faiss
from langchain_community.vectorstores import FAISS
from langchain_community.docstore.in_memory import InMemoryDocstore

from utils import embeddings
from os import listdir, path
from uuid import uuid4

In [2]:
# Directory that holds the source PDFs to be indexed.
PDF_DIR = "./assets/pdf"

# Raw directory listing first, then narrow it down to names ending in ".pdf".
all_files = listdir(PDF_DIR)
pdf_files = list(filter(lambda name: name.endswith(".pdf"), all_files))
print(pdf_files)

["LLM Powered Autonomous Agents _ Lil'Log.pdf", "Diffusion Models for Video Generation _ Lil'Log.pdf", "Adversarial Attacks on LLMs _ Lil'Log.pdf", "Thinking about High-Quality Human Data _ Lil'Log.pdf", "Extrinsic Hallucinations in LLMs _ Lil'Log.pdf"]


In [14]:
def load_pdf_files(file_path: str):
    """Lazily load every PDF found directly inside a directory.

    Args:
        file_path: Directory to scan for files whose name ends in ``.pdf``.
            Despite its name this is a directory, not a single file; the
            parameter name is kept so existing keyword calls keep working.

    Yields:
        ``(file_name, docs)`` tuples, where ``file_name`` is the PDF's name
        inside the directory and ``docs`` is whatever
        ``PyPDFLoader(...).load()`` returns for that file.
    """
    for file_name in listdir(file_path):
        if not file_name.endswith(".pdf"):
            continue
        # Use a separate local for the per-file path so the directory
        # argument is not overwritten while iterating.
        pdf_path = path.join(file_path, file_name)
        docs = PyPDFLoader(file_path=pdf_path).load()
        yield file_name, docs

In [None]:
# Embed a throwaway query once to learn how long the embedding vectors are,
# then size the FAISS index with that length.
embedding_dim = len(embeddings.embed_query("Hi"))
index = faiss.IndexFlatL2(embedding_dim)

In [11]:
# Wrap the FAISS index in a vector store that starts out with a fresh
# in-memory docstore and an empty index-to-docstore-id mapping.
vector_store = FAISS(
    index=index,
    docstore=InMemoryDocstore(),
    index_to_docstore_id={},
    embedding_function=embeddings,
)

In [None]:
# Reuse the persisted FAISS index when one exists; otherwise build it from the
# PDFs and persist it so later runs can skip the embedding step.
FAISS_INDEX_DIR = "./assets/faiss_index/base"

# save_local() below is called without an index_name, so the index file is
# expected under LangChain's default name "index" -> "index.faiss".
# NOTE(review): confirm against the installed langchain_community version.
if path.exists(path.join(FAISS_INDEX_DIR, "index.faiss")):
    # load_local is a classmethod that returns a NEW store; the result must be
    # bound to vector_store, otherwise the loaded index is thrown away.
    # NOTE(security): allow_dangerous_deserialization=True unpickles the saved
    # docstore. Only keep this for index folders produced by this notebook.
    vector_store = FAISS.load_local(
        FAISS_INDEX_DIR,
        embeddings=embeddings,
        allow_dangerous_deserialization=True,
    )
else:
    print("Faiss Index not found")
    for file_name, docs in load_pdf_files(file_path=PDF_DIR):
        print(f"Indexing documents for {file_name}")
        # One random hex id per loaded document.
        uuids = [uuid4().hex for _ in docs]
        vector_store.add_documents(documents=docs, ids=uuids)

    vector_store.save_local(FAISS_INDEX_DIR)

Faiss Index not found
Indexing documents for LLM Powered Autonomous Agents _ Lil'Log.pdf
Indexing documents for Diffusion Models for Video Generation _ Lil'Log.pdf
Indexing documents for Adversarial Attacks on LLMs _ Lil'Log.pdf
Indexing documents for Thinking about High-Quality Human Data _ Lil'Log.pdf
Indexing documents for Extrinsic Hallucinations in LLMs _ Lil'Log.pdf
