In [None]:
from langchain_community.document_loaders import PyPDFLoader
import faiss
from langchain_community.vectorstores import FAISS
from langchain_community.docstore.in_memory import InMemoryDocstore
from os import listdir, path, chdir

# Move the working directory two levels up before importing `utils`.
# NOTE(review): presumably this lands on the project root so that `utils` and
# the relative `./assets/...` paths used below resolve — confirm.
# Caveat: this is not idempotent; re-running this cell climbs two more levels.
chdir("..")
chdir("..")

from utils import embeddings
from uuid import uuid4

USER_AGENT environment variable not set, consider setting it to identify your requests.


#### Identify dataset files

In [2]:
# Directory that holds the source PDF documents.
PDF_DIR = "./assets/pdf"

# List the directory once, then keep only the entries whose name ends in ".pdf".
all_files = listdir(PDF_DIR)
pdf_files = list(filter(lambda name: name.endswith(".pdf"), all_files))
print(pdf_files)

["LLM Powered Autonomous Agents _ Lil'Log.pdf", "Diffusion Models for Video Generation _ Lil'Log.pdf", "Adversarial Attacks on LLMs _ Lil'Log.pdf", "Thinking about High-Quality Human Data _ Lil'Log.pdf", "Extrinsic Hallucinations in LLMs _ Lil'Log.pdf"]


#### Read PDF content

In [3]:
def load_pdf_files(file_path: str):
    """
    Lazily load every PDF found in a directory.

    Args:
        file_path (str): Directory to scan for ``.pdf`` files. (The parameter
            name is kept so existing keyword callers continue to work.)

    Yields:
        tuple: ``(file_name, docs)`` where ``file_name`` is the bare file name
        and ``docs`` is whatever ``PyPDFLoader(...).load()`` returns for it.
    """
    # Scan the directory that was actually passed in instead of relying on the
    # notebook-level `pdf_files` / `PDF_DIR` globals (the argument used to be
    # ignored and immediately overwritten inside the loop).
    for file_name in listdir(file_path):
        if not file_name.endswith(".pdf"):
            continue
        loader = PyPDFLoader(file_path=path.join(file_path, file_name))
        yield file_name, loader.load()

#### Define index

In [4]:
# Embed a throwaway query once to learn the embedding width, then create a
# flat L2 index with that many dimensions.
probe_vector = embeddings.embed_query("Hi")
index = faiss.IndexFlatL2(len(probe_vector))

#### Ensure `index_to_docstore` is preserved

In [None]:
import json
import os

class JsonDictManager:
    """
    Keep a dictionary in sync with a JSON file on disk.

    Note:
        ``data`` is a class-level attribute: every instance reads and writes
        the same dictionary, and constructing a new manager rebinds it for all
        of them. Use a single manager at a time.

        JSON object keys are always strings, so non-string keys (for example
        integers) come back as strings after a save/load round trip.
    """

    # Shared by all instances; rebound by __init__.
    data = {}

    def __init__(self, file_path):
        """
        Initialize by loading JSON from a file into the class variable.

        A missing, unreadable, malformed, or non-object JSON file results in
        an empty dictionary (a message is printed in each case).

        Args:
            file_path (str): Path to the JSON file
        """
        self.file_path = file_path
        JsonDictManager.data = self._load(file_path)

    @staticmethod
    def _load(file_path):
        """Read ``file_path`` and return its content as a dict ({} on any failure)."""
        if not os.path.exists(file_path):
            print(f"File {file_path} not found. Initializing empty dictionary.")
            return {}
        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                loaded = json.load(file)
        except json.JSONDecodeError as e:
            print(f"Error decoding JSON: {e}")
            return {}
        except Exception as e:
            print(f"Error reading file: {e}")
            return {}
        # update_dict() needs a mapping; reject JSON arrays, scalars and null.
        if not isinstance(loaded, dict):
            print(f"Error reading file: expected a JSON object, got {type(loaded).__name__}")
            return {}
        return loaded

    def save(self):
        """
        Serialize the class dictionary to the JSON file.

        The parent directory is created when it does not exist yet. Failures
        are reported with a printed message rather than raised.
        """
        try:
            # Serialize before opening the file so a non-serializable value
            # cannot leave a truncated file behind.
            payload = json.dumps(JsonDictManager.data, indent=4)
            parent_dir = os.path.dirname(self.file_path)
            if parent_dir:
                os.makedirs(parent_dir, exist_ok=True)
            with open(self.file_path, 'w', encoding='utf-8') as file:
                file.write(payload)
            print(f"Dictionary saved to {self.file_path}")
        except Exception as e:
            print(f"Error saving to file: {e}")

    @classmethod
    def update_dict(cls, key, value):
        """
        Update the class dictionary with a new key-value pair.

        Args:
            key: Key to add or update
            value: Value to associate with the key
        """
        cls.data[key] = value

In [6]:
# Load the previously saved id mapping; it starts out empty when the file is absent.
index_to_docstore = JsonDictManager("./assets/faiss_index/base_docstore.json")

File ./assets/faiss_index/base_docstore.json not found. Initializing empty dictionary.


#### Define Vector Store

In [7]:
# Assemble a FAISS store from the index, a fresh in-memory docstore and the
# id mapping. The mapping dict itself is handed over (not a copy), so it is the
# same object as `index_to_docstore.data`.
# NOTE(review): FAISS is expected to key this mapping by integer position, while
# a mapping reloaded from JSON has string keys — confirm before reusing a saved file.
faiss_kwargs = dict(
    embedding_function=embeddings,
    index=index,
    docstore=InMemoryDocstore(),
    index_to_docstore_id=index_to_docstore.data,
)
vector_store = FAISS(**faiss_kwargs)

#### Add documents to Vector Store

In [8]:
# Reuse a previously saved index when one exists; otherwise embed every PDF
# and persist the result.
FAISS_INDEX_DIR = "./assets/faiss_index/base"

# `save_local` writes `index.faiss` (plus `index.pkl`) into the folder, so the
# presence of that file is used as the "index already built" signal. Checking
# explicitly means a genuine load error surfaces instead of silently
# triggering a full re-embedding.
if path.exists(path.join(FAISS_INDEX_DIR, "index.faiss")):
    # `load_local` returns a new store rather than filling the existing one,
    # so the result must be bound back to `vector_store`.
    # SECURITY: allow_dangerous_deserialization unpickles `index.pkl`; only
    # load index folders produced by this notebook or another trusted source.
    # NOTE(review): the loaded store carries its own id mapping, separate from
    # `index_to_docstore.data` — confirm whether the JSON copy is still needed.
    vector_store = FAISS.load_local(
        FAISS_INDEX_DIR,
        embeddings=embeddings,
        allow_dangerous_deserialization=True,
    )
else:
    print("Faiss Index not found")
    for file_name, docs in load_pdf_files(file_path=PDF_DIR):
        if not docs:
            # Nothing was extracted from this file; there is nothing to embed.
            continue
        print(f"Indexing documents for {file_name}")
        # One random hex id per loaded document.
        uuids = [uuid4().hex for _ in docs]
        vector_store.add_documents(documents=docs, ids=uuids)

    vector_store.save_local(FAISS_INDEX_DIR)

Faiss Index not found
Indexing documents for LLM Powered Autonomous Agents _ Lil'Log.pdf
Indexing documents for Diffusion Models for Video Generation _ Lil'Log.pdf
Indexing documents for Adversarial Attacks on LLMs _ Lil'Log.pdf
Indexing documents for Thinking about High-Quality Human Data _ Lil'Log.pdf
Indexing documents for Extrinsic Hallucinations in LLMs _ Lil'Log.pdf


In [10]:
# Number of entries currently held in the shared id mapping.
len(index_to_docstore.data)

113

In [11]:
# Smoke test: retrieve the stored documents most similar to a sample query.
vector_store.similarity_search("task planning")

[Document(id='af59975e7b7c46eca715ea2461814949', metadata={'producer': 'Skia/PDF m137', 'creator': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/137.0.0.0 Safari/537.36', 'creationdate': '2025-06-18T11:50:38+00:00', 'title': "LLM Powered Autonomous Agents | Lil'Log", 'moddate': '2025-06-18T11:50:38+00:00', 'source': "./assets/pdf/LLM Powered Autonomous Agents _ Lil'Log.pdf", 'total_pages': 23, 'page': 1, 'page_label': '2'}, page_content='Figure 1: Overview of a LLM-powered autonomous agent system.\nComponent One: Planning\nA complicated task usually involves many steps. An agent needs to know what they are and\nplan ahead.\nTask Decomposition\nChain of thought (CoT; Wei et al. 2022) has become a standard prompting technique for\nenhancing model performance on complex tasks. The model is instructed to “think step by\nstep” to utilize more test-time computation to decompose hard tasks into smaller and simpler\nsteps. CoT transforms big tasks int

In [12]:
# Write the id mapping out to its JSON file.
index_to_docstore.save()

Dictionary saved to ./assets/faiss_index/base_docstore.json
