# pdf.py

# Unstructured data
# Unstructured data (such as a PDF) does not fit neatly into a table, so we use a vector store index instead:
# all of the data is converted into embeddings so that it can be indexed and queried quickly.
import os 
from llama_index.core import StorageContext, VectorStoreIndex, load_index_from_storage
from llama_index.readers.file import PDFReader # look at docs to find others readers like ppt, word, etc

def get_index(data, index_name):
    """Return the vector index stored under ``index_name``, creating it if absent.

    If nothing exists on disk at ``index_name``, a new ``VectorStoreIndex``
    is built from ``data`` and persisted to that directory. Otherwise
    ``data`` is ignored and the index is loaded from the persisted storage.

    Args:
        data: Documents to index; only used when the index must be built.
        index_name: Path of the persistence directory for the index.

    Returns:
        The freshly built or previously persisted index.
    """
    if os.path.exists(index_name):
        # A persisted copy is already on disk: load it instead of rebuilding.
        storage = StorageContext.from_defaults(persist_dir=index_name)
        return load_index_from_storage(storage)

    print("building index", index_name)
    built = VectorStoreIndex.from_documents(data, show_progress=True)
    # Save the new index so later runs take the load path above.
    built.storage_context.persist(persist_dir=index_name)
    return built


# Module-level pipeline: these statements run when the module is imported.
# 1. Locate the source PDF under the "data" directory.
pdf_path = os.path.join("data", "Canada.pdf")
# 2. Parse the PDF into documents (this happens on every import, even when
#    get_index ends up loading an already-persisted index).
canada_pdf = PDFReader().load_data(file=pdf_path)
# 3. Build the vector index, or load it from the "canada" directory if present.
canada_index = get_index(canada_pdf, "canada")
# 4. Expose the vector store index as a query engine so it can be used as an
#    additional tool (the original note compares this to the population-data
#    engine, which is presumably defined elsewhere -- TODO confirm).
canada_engine = canada_index.as_query_engine()