# Install the dependencies

In [1]:
# Use the %pip magic instead of !pip so the packages are installed into the
# environment of the running kernel rather than whichever pip is first on PATH.
%pip install -qU langchain langchain-core langchain-community langchain-openai

In [2]:
# faiss-cpu provides the `faiss` module that this notebook imports; without it
# a fresh environment fails at the import cell. %pip (rather than !pip)
# installs into the environment of the running kernel.
%pip install -qU tiktoken pymupdf faiss-cpu

In [4]:
import faiss
import os
import openai
from getpass import getpass
from langchain_community.document_loaders import PyMuPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS

# Set the environment variables

In [5]:
# Prompt for the key without echoing it, then store the same value on the
# `openai` module and in the OPENAI_API_KEY environment variable.
openai.api_key = os.environ["OPENAI_API_KEY"] = getpass("Please provide your OpenAI Key: ")

Please provide your OpenAI Key:  ········


# Choose the PDF document to embed

In [20]:
# Base name (without extension) of the document to process.
file_name = "Seattle"

In [21]:
# Build both locations with os.path.join so the notebook resolves them on
# Windows, macOS and Linux alike; hard-coded "\\" separators only work on
# Windows (elsewhere the whole string is treated as a single file name).
path_raw = os.path.join("..", "data", "raw", file_name + ".pdf")  # The path where the raw documents are stored
path_processed = os.path.join("..", "data", "processed", file_name + ".faiss")  # The path where we will store the index

# Load the document

In [22]:
# Create a loader for the PDF at `path_raw` and read it into `documents`.
# NOTE(review): presumably yields one LangChain Document per PDF page; confirm
# against the PyMuPDFLoader documentation.
loader = PyMuPDFLoader(path_raw)
documents = loader.load()

# Split the document into chunks

In [23]:
# Configure the splitter: chunks of at most 700 units with 50 units of overlap
# between neighbours (units are presumably characters; confirm against the
# splitter's length function).
text_splitter = RecursiveCharacterTextSplitter(chunk_size=700, chunk_overlap=50)

# NOTE: this rebinds `documents` to the chunked result, so re-running this cell
# on its own feeds already-split chunks back into the splitter.
documents = text_splitter.split_documents(documents)

# Load the embedding model

In [24]:
# Embedding client for the "text-embedding-3-small" model.
# NOTE(review): presumably authenticates via the OPENAI_API_KEY environment
# variable set earlier in the notebook; confirm.
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

# Create a FAISS VectorStore

In [25]:
# Build the FAISS vector store from the chunked documents using the embedding
# client defined earlier in the notebook.
vector_store = FAISS.from_documents(documents=documents, embedding=embeddings)

# Save the vector store

In [26]:
# Persist the vector store to disk at `path_processed`.
# NOTE(review): save_local appears to treat this path as a directory and write
# its index files inside it, despite the ".faiss" suffix; confirm.
vector_store.save_local(folder_path=path_processed)