### Import all necessary libraries

In [None]:
import os
import warnings
from dotenv import load_dotenv


from langchain.llms import OpenAI
from langchain.chains import RetrievalQA
from langchain.vectorstores import Chroma
from langchain.prompts import PromptTemplate
from langchain.embeddings import OpenAIEmbeddings
from langchain.memory import ConversationBufferMemory
from langchain.document_loaders import DirectoryLoader, PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Silence every warning category for the rest of the session.
warnings.simplefilter("ignore")

# Pull variables from a .env file into the process environment.
load_dotenv()

### Initialize the parameters

In [None]:
# Text-splitting parameters handed to the splitter in a later cell.
chunk_size, chunk_overlap = 1000, 200

# Locations, relative to the notebook's working directory: the folder of
# source PDFs and the directory where the vector store is persisted.
PDFs_PATH, persist_directory = "../../data/PDFs", "../../db"

### Load the OpenAI API key

In [None]:
# Read the OpenAI API key from the process environment (load_dotenv() is
# called earlier in this file). os.getenv() never raises for a missing
# variable -- it returns None -- so the value is checked explicitly and a
# missing or empty key is reported here instead of passing silently.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
if not OPENAI_API_KEY:
    raise RuntimeError(
        "OPENAI_API_KEY is not set; export it or add it to the .env file."
    )

### Extract the dataset

In [None]:
# Collect every file matching **/*.pdf under PDFs_PATH, parsing each one
# with PyPDFLoader; multithreaded loading is enabled.
loader = DirectoryLoader(
    PDFs_PATH,
    glob="**/*.pdf",
    loader_cls=PyPDFLoader,
    use_multithreading=True,
)
documents = loader.load()

# Split the loaded documents into chunks governed by chunk_size and
# chunk_overlap; `documents` is rebound to the resulting chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=chunk_overlap,
    chunk_size=chunk_size,
)
documents = text_splitter.split_documents(documents)

### Store the document chunks in the vector database

In [None]:
# Build a Chroma vector store from the split documents using OpenAI
# embeddings, with persist_directory as its on-disk location.
vectordb = Chroma.from_documents(
    documents,
    OpenAIEmbeddings(),
    persist_directory=persist_directory,
)