In [1]:
from llama_index.readers.file import UnstructuredReader
from pathlib import Path

# Subjects to ingest; each one is read from ./data/test/pdf/<subject>.pdf
subjects = ['Biology', 'Chemistry', 'Physics', 'Mathematics', 'Computer_science']

loader = UnstructuredReader()
doc_set = {}   # subject name -> list of documents loaded for that subject
all_docs = []  # flat list of every loaded document, in `subjects` order
for subject in subjects:
    pdf_path = Path(f"./data/test/pdf/{subject}.pdf")
    subject_docs = loader.load_data(file=pdf_path, split_documents=False)

    # Replace each document's metadata with the subject / file identifiers
    # (a fresh dict per document, so documents never share one mapping).
    for doc in subject_docs:
        doc.metadata = {
            "subject": subject,
            "document_name": f"{subject}.pdf",
            "source": subject,
        }

    all_docs += subject_docs
    doc_set[subject] = subject_docs

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from llama_index.core import VectorStoreIndex, StorageContext, Settings
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

# Document chunking size, set on the shared llama_index Settings object
Settings.chunk_size = 512

# Open source embeddings model, passed explicitly to every index built below
embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")

index_set = {}  # subject name -> vector index built from that subject's documents
for subject in subjects:
    persist_dir = f"./storage/{subject}"
    subject_documents = doc_set[subject]

    # Each subject gets its own storage context so it is persisted separately.
    storage_context = StorageContext.from_defaults()
    cur_index = VectorStoreIndex.from_documents(
        subject_documents,
        storage_context=storage_context,
        embed_model=embed_model,
    )

    index_set[subject] = cur_index
    storage_context.persist(persist_dir=persist_dir)
