In [1]:
%reload_ext autoreload
%autoreload 2

In [9]:
import box
import yaml, os
from langchain.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.embeddings import HuggingFaceEmbeddings

from exteract.paths import BASE_DIR

# Load the project settings from config.yml under BASE_DIR. The parsed YAML is
# wrapped in a Box so the cells below can read settings by attribute
# (e.g. cfg.DATA_PATH, cfg.CHUNK_SIZE).
with open(BASE_DIR / "config.yml", "r", encoding="utf8") as ymlfile:
    cfg = box.Box(yaml.safe_load(ymlfile))


def run_ingest():
    """Build a FAISS vector store from the PDFs in the configured data folder.

    Loads every ``*.pdf`` file matched under ``BASE_DIR / cfg.DATA_PATH``,
    splits the loaded documents into chunks using ``cfg.CHUNK_SIZE`` and
    ``cfg.CHUNK_OVERLAP``, embeds the chunks on the CPU with the model named by
    ``cfg.EMBEDDINGS``, and saves the resulting store under
    ``BASE_DIR / cfg.DB_FAISS_PATH``.

    Raises:
        ValueError: If no documents were loaded from the data folder, or if
            splitting the loaded documents produced no chunks to embed.
    """
    data_dir = BASE_DIR / cfg.DATA_PATH
    loader = DirectoryLoader(data_dir, glob="*.pdf", loader_cls=PyPDFLoader)

    documents = loader.load()
    # Fail early with a clear message: an empty input would otherwise surface
    # later as an obscure error while the FAISS index is being built.
    if not documents:
        raise ValueError(f"No PDF documents were loaded from {data_dir}")

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=cfg.CHUNK_SIZE, chunk_overlap=cfg.CHUNK_OVERLAP
    )
    texts = text_splitter.split_documents(documents)
    # PDFs without extractable text can load fine yet yield nothing to embed.
    if not texts:
        raise ValueError(
            f"The documents loaded from {data_dir} produced no text chunks to index"
        )

    embeddings = HuggingFaceEmbeddings(
        model_name=cfg.EMBEDDINGS, model_kwargs={"device": "cpu"}
    )

    vectorstore = FAISS.from_documents(texts, embeddings)
    # realpath() returns a resolved string path for save_local.
    path = os.path.realpath(BASE_DIR / cfg.DB_FAISS_PATH)
    vectorstore.save_local(path)


# Executing this cell runs the whole ingest: the PDFs are re-read and
# re-embedded, and the vector store is written to disk each time.
run_ingest()

In [23]:
import timeit, os
from exteract.llm.wrapper import setup_qa_chain

# Set up the QA chain first so that its construction is not part of the timing.
qa_chain = setup_qa_chain()

# Time a single query against the chain.
question = "What is the seller's name?"
start = timeit.default_timer()
response = qa_chain({"query": question})
end = timeit.default_timer()
elapsed = end - start

# Report the answer, a separator line, and the elapsed wall-clock seconds.
answer = response["result"]
print(f'\nAnswer: {answer}')
print('=' * 50)

print(f"Time to retrieve data: {elapsed}")


Answer: The seller's name is Chapman, Kim and Green.
Time to retrieve data: 29.53804608300004
