# Inserting into an existing index

This notebook assumes that we want to update an existing index, but are unable to isolate the new documents up-front.

To do this we:
1. Load the existing index
2. Load the new data file (which includes data we've already indexed as well as some new docs :( )
3. Work out overlap and isolate new documents
4. Add new documents to index
5. Save new index

TODO:
- extend to identify data to delete (outdated pages)
- auto identify CHUNK_SIZE_LIMIT if possible

In [None]:
import os

In [None]:
# The service context needs *some* OpenAI key to be present; the value itself
# is a placeholder, not a real credential.
_DUMMY_OPENAI_KEY = "foo"
os.environ["OPENAI_API_KEY"] = _DUMMY_OPENAI_KEY

In [None]:
import pandas as pd
from tqdm.auto import tqdm

from llama_index import Document, LangchainEmbedding
from llama_index.storage import StorageContext
from llama_index import ServiceContext
from llama_index import load_index_from_storage
from langchain.embeddings.huggingface import HuggingFaceEmbeddings

In [None]:
# Existing index we want to add to (resolved against the working directory).
ALL_DATA_INDEX_FPATH = os.path.abspath(
    "./data/llama_index_indices/all_data"
)

# Output path for the newly constructed index.
NEW_INDEX_FPATH = os.path.abspath(
    "./data/llama_index_indices/all_data_new"
)

# CHECK THIS! Known value used for the existing index. It needs to match what
# the existing input was chunked with, and it is not clear whether it can be
# recovered from the index files alone.
CHUNK_SIZE_LIMIT = 2048

In [None]:
# Relative path to the CSV that holds the new data; it is read with pandas in a
# later cell, so it is resolved against the notebook's working directory.
website_data = "./data/public/turingacuk-no-boilerplate.csv" # our file containing new data

In [None]:
# Build a storage context from the directory the existing index was persisted to.
storage_context = StorageContext.from_defaults(persist_dir=ALL_DATA_INDEX_FPATH)

In [None]:
# HuggingFace embeddings with no model named here (i.e. whatever the library
# defaults to), wrapped in LangchainEmbedding for use in the service context.
# NOTE(review): presumably this has to be the same embedding model the existing
# index was built with -- confirm before inserting.
hfemb = HuggingFaceEmbeddings()
embed_model = LangchainEmbedding(hfemb)

In [None]:
# Service context used both to load the index and to chunk the new documents.
# We supply our own embedding model and the chunk size of the existing index;
# the LLM predictor and prompt helper are explicitly passed as None.
service_context = ServiceContext.from_defaults(
    embed_model=embed_model,
    chunk_size=CHUNK_SIZE_LIMIT,
    llm_predictor=None,
    prompt_helper=None,
)

In [None]:
# Load the single index held in the storage context, attaching our service
# context so later operations use the same embedding model and chunk size.
index = load_index_from_storage(
    storage_context=storage_context,
    service_context=service_context,
)

In [None]:
# Load our new data, then discard every row that has a missing value in any
# column.
df = pd.read_csv(website_data)
df = df.dropna()

In [None]:
# Convert df to Documents: the "body" column becomes the document text and the
# "url" column is stored under the "filename" key of extra_info.
# The two columns are iterated directly instead of using iterrows(), which
# builds a Series for every row and whose index value was never used here.
candidate_docs = [
    Document(body, extra_info={"filename": url})
    for body, url in zip(df["body"], df["url"])
]

In [None]:
from typing import Sequence
from llama_index.storage.docstore import DocumentStore

def get_hashset_for_docstore(docstore: DocumentStore) -> set[str]:
    """Collect the hashes of everything held in a docstore.

    Args:
        docstore: Store whose ``docs`` mapping is scanned.

    Returns:
        The distinct ``doc_hash`` values of the stored entries.
    """
    return {stored.doc_hash for stored in docstore.docs.values()}

In [None]:
# Hashes of everything currently held in the loaded index's docstore; used
# below to recognise documents that are already indexed.
hashes = get_hashset_for_docstore(docstore=index.docstore)

In [None]:
# Cheap first pass: skip any candidate whose whole-document hash is already in
# the doc store. The doc store hashes are taken after chunking, so this will
# not catch every already-indexed document; it is done first for speed, and a
# more complex check is applied to whatever remains.
maybe_new_docs = [
    candidate
    for candidate in candidate_docs
    if candidate.doc_hash not in hashes
]

In [None]:
# Display how many candidates were loaded vs. how many survived the first filter.
len(candidate_docs), len(maybe_new_docs)

In [None]:
# Chunk every remaining document on its own, so the result holds one list of
# nodes per document, in the same order as maybe_new_docs.
docs_chunked = [
    service_context.node_parser.get_nodes_from_documents([candidate])
    for candidate in tqdm(maybe_new_docs)
]

In [None]:
# TODO collapse this into a single comprehension?
# For each doc's chunks, check whether every chunk hash already exists in the
# index. Passing a generator lets all() stop at the first unseen chunk instead
# of building the full list first.
# NOTE: a document that produced no chunks counts as existing, because all()
# of an empty sequence is True, so it is not inserted.
exists = [
    all(node.doc_hash in hashes for node in chunks)
    for chunks in docs_chunked
]
# Use this to filter to the actually new docs. docs_chunked was built from
# maybe_new_docs in order, so the two lists pair up position by position.
new_docs = [
    doc
    for doc, already_indexed in zip(maybe_new_docs, exists)
    if not already_indexed
]

In [None]:
# Display the number of documents that are about to be inserted.
len(new_docs)

In [None]:
# Insert the new documents into the loaded index one at a time, with a
# progress bar. This modifies `index` in memory; nothing is written to disk
# in this cell.
for doc in tqdm(new_docs):
    index.insert(document=doc)

In [None]:
# Save the updated index under NEW_INDEX_FPATH, i.e. to a different directory
# from the one the existing index was loaded from.
index.storage_context.persist(persist_dir=NEW_INDEX_FPATH)

In [None]:
# List the files (with sizes) of the existing index, for comparison with the new one.
! ls -lh $ALL_DATA_INDEX_FPATH

In [None]:
# List the files (with sizes) of the newly persisted index.
! ls -lh $NEW_INDEX_FPATH