In [2]:
import pandas as pd
import numpy as np
import faiss
import zlib
import asyncpg
import psycopg2

from langchain_openai import OpenAIEmbeddings
random_seed = 42

## temporary open source embedding
from sentence_transformers import SentenceTransformer

## openai api key
# Prefer the OPENAI_API_KEY environment variable; otherwise prompt with getpass
# so the secret is not echoed into the notebook's saved cell output
# (a plain input() prompt displays whatever is typed).
import os
from getpass import getpass

openai_api_key = os.environ.get("OPENAI_API_KEY") or getpass("Enter the OpenAI API key: ")
model_embeddings = "text-embedding-3-small"

  from tqdm.autonotebook import tqdm, trange


## Incremental dataset updates PoC

In [24]:
### load the data

# Placeholder data used for the proof of concept.
test_dataset = pd.read_csv(
    '../002_create_test_dataset/001_questions_per_section/cleaned_test_dataset_with_clusters.csv'
)

# "Initial" snapshot: first 10 rows.
initial_dataset = test_dataset.loc[:, ['question', 'answer']].iloc[:10]
# Late arrivals: rows 11 to 15.
late_dataset = test_dataset.loc[:, ['question', 'answer']].iloc[10:15]
# "v2" snapshot: first 15 rows (initial + late).
v2_dataset = test_dataset.loc[:, ['question', 'answer']].iloc[:15]

In [4]:
### first, testing with SentenceTransformer (opensource)
# Smoke test of the local model on two fixed sentences before any OpenAI call is made.
# `model` is reused by later cells to embed the dataset questions.
model = SentenceTransformer('all-MiniLM-L6-v2')
sentences = ["This is an example sentence", "Each sentence is converted"]
embeddings = model.encode(sentences)

# NOTE(review): this prints the whole array; printing only its shape would keep the output short.
# NOTE(review): `embeddings` is rebound to an OpenAIEmbeddings object in a later cell.
print(embeddings)



[[ 6.76568896e-02  6.34958372e-02  4.87130992e-02  7.93049559e-02
   3.74480002e-02  2.65277550e-03  3.93748470e-02 -7.09843775e-03
   5.93614727e-02  3.15369666e-02  6.00980483e-02 -5.29051460e-02
   4.06067446e-02 -2.59308070e-02  2.98427530e-02  1.12692360e-03
   7.35149309e-02 -5.03819771e-02 -1.22386597e-01  2.37028338e-02
   2.97265388e-02  4.24768552e-02  2.56337505e-02  1.99520006e-03
  -5.69190979e-02 -2.71598492e-02 -3.29035781e-02  6.60248548e-02
   1.19007118e-01 -4.58791591e-02 -7.26215392e-02 -3.25839818e-02
   5.23414165e-02  4.50552553e-02  8.25298391e-03  3.67023274e-02
  -1.39415376e-02  6.53919727e-02 -2.64272522e-02  2.06360957e-04
  -1.36643481e-02 -3.62809449e-02 -1.95042938e-02 -2.89738737e-02
   3.94270606e-02 -8.84090737e-02  2.62422441e-03  1.36714354e-02
   4.83063050e-02 -3.11565213e-02 -1.17329143e-01 -5.11690415e-02
  -8.85287598e-02 -2.18961649e-02  1.42986402e-02  4.44168150e-02
  -1.34814540e-02  7.43392408e-02  2.66382471e-02 -1.98762249e-02
   1.79190

### Creating and merging embeddings

In [25]:
# Encode every question individually with the local model, one call per row.
initial_embeddings = initial_dataset['question'].apply(model.encode)
late_embeddings = late_dataset['question'].apply(model.encode)

### Differentiating v1 and v2 datasets

In [26]:
def get_new_data_entries(df1, df2, column):
    """Return the rows of ``df1`` whose ``column`` value does not occur in ``df2``.

    Args:
        df1: DataFrame to filter (e.g. the newer snapshot).
        df2: DataFrame providing the values already known.
        column: Name of the column compared between the two frames.

    Returns:
        The subset of ``df1`` (original index and columns kept) whose
        ``column`` value is absent from ``df2[column]``.
    """
    already_known = df1[column].isin(df2[column])
    return df1.loc[~already_known]

In [27]:
# Rows of the v2 snapshot whose 'answer' does not occur in the initial snapshot.
# NOTE(review): novelty is decided on 'answer', while the following cell embeds 'question' — confirm this is intended.
new_entries = get_new_data_entries(v2_dataset, initial_dataset, column = 'answer')

In [28]:
# Encode only the questions of the newly detected rows with the local model.
new_entries_embeddings = new_entries['question'].apply(model.encode)

### Now with OpenAI Embeddings and FAISS indexing

In [29]:
# project into the embedding space
# NOTE(review): this rebinds `embeddings`, which earlier held the SentenceTransformer output array.
embeddings = OpenAIEmbeddings(model=model_embeddings, openai_api_key=openai_api_key)

# Three separate embedding requests over the 'answer' column: the initial rows,
# the full v2 snapshot, and the late rows on their own. The v2 result is the
# "re-embed everything" baseline that the incremental result is compared against below.
# NOTE(review): a pandas Series is passed here; presumably embed_documents expects a list of strings — confirm.
opai_initial_embeddings = embeddings.embed_documents(initial_dataset['answer'])
opai_v2_embeddings = embeddings.embed_documents(v2_dataset['answer'])
opai_late_embeddings = embeddings.embed_documents(late_dataset['answer'])


In [30]:
# Sanity check: print the shape of each embedding result (initial, v2, late).
print(np.shape(opai_initial_embeddings))
print(np.shape(opai_v2_embeddings))
print(np.shape(opai_late_embeddings))

(10, 1536)
(15, 1536)
(5, 1536)


In [39]:
# Incremental result: initial embeddings with the late embeddings stacked underneath (row order matches v2).
incremented_embeddings = np.vstack([opai_initial_embeddings, opai_late_embeddings])

In [40]:
# Number of elements that are exactly (bit-for-bit) equal between the full re-embedding and the incremental stack.
np.sum(opai_v2_embeddings == incremented_embeddings)

12288

In [41]:
# Mean signed element-wise difference between the two matrices.
# NOTE(review): positive and negative deviations can cancel in a signed mean; a mean or max of absolute differences would be stricter.
np.mean(opai_v2_embeddings - incremented_embeddings)

6.143214860143687e-08

In [42]:
# Mean over all elements of the fully re-embedded v2 matrix (scale reference for the difference above).
np.mean(opai_v2_embeddings)

-0.0006648267771866926

In [43]:
# Mean over all elements of the incrementally stacked matrix, for comparison with the v2 mean.
np.mean(incremented_embeddings)

-0.0006648882093352941

In [49]:
# Ratio of the overall mean embedding value to the mean signed difference between the two matrices.
# NOTE(review): despite its name, `diff` holds a ratio, not a difference.
diff = np.mean(incremented_embeddings)/np.mean(opai_v2_embeddings - incremented_embeddings)
print(f"On average, the proportion between a regular embedding value and the subtraction result is in the order of 10000 (exactly {diff:.2f})")

On average, the difference of a regular embedding value and the subtraction result is in the order of 10000 (exactly -10823.13)


This makes it reasonable to assume that appending the embeddings of the new rows to the existing ones gives practically the same result as re-embedding the whole dataset.

In [45]:
# Sum over all elements of the fully re-embedded v2 matrix.
np.sum(opai_v2_embeddings)

-15.317608946381398

In [46]:
# Sum over all elements of the incrementally stacked matrix, for comparison with the v2 sum.
np.sum(incremented_embeddings)

-15.319024343085175

In [50]:
### embeds both datasets with the given client and stacks the results
def merge_openai_embeddings(dataset1, dataset2, embedding):
    """Embed two collections of texts and stack the vectors into one matrix.

    Args:
        dataset1: Iterable of texts; its vectors form the first rows.
        dataset2: Iterable of texts; its vectors are appended after ``dataset1``.
        embedding: Object exposing ``embed_documents(texts)`` that returns one
            vector per text (e.g. an ``OpenAIEmbeddings`` instance).

    Returns:
        A 2-D ``numpy.ndarray`` with one row per text, ``dataset1`` rows first.
        When both datasets are empty, an array of shape ``(0, 0)`` is returned.
    """
    parts = []
    for dataset in (dataset1, dataset2):
        texts = list(dataset)
        # Skip empty batches: no embedding request is needed for them, and an
        # empty result cannot be stacked with a (n, dim) matrix.
        if texts:
            parts.append(np.asarray(embedding.embed_documents(texts)))

    if not parts:
        return np.empty((0, 0))

    # np.vstack takes ONE sequence of arrays; passing the arrays as separate
    # positional arguments raises TypeError.
    return np.vstack(parts)
    

### Testing in the actual database

In [None]:
# df = pd.read_sql('new_database_dump.sql', con = 'sqlalchemy')
# df.head()

In [None]:
# async def main(model: str):
#     data = await DataExporter.get_langchain_documents()

#     llm = model_utils.access_APIs.get_llm(model)
#     embeddings = OpenAIEmbeddings(model=EMBEDDING_MODEL)

#     questions_index = {}
#     keywords_index = {}
#     for db_name, contexts in data.items():
#         db = FAISS.from_documents(contexts, embeddings)
#         db.save_local(f"dbs/{db_name}_db/faiss/{EMBEDDING_MODEL}")

#         q_index, kw_index = generate_indexes_from_fragment(contexts, llm)

    #     for q, urls in q_index.items():
    #         if q not in questions_index:
    #             questions_index[q] = []
    #         questions_index[q].extend(urls)

    #     for k, urls in kw_index.items():
    #         if k not in keywords_index:
    #             keywords_index[k] = []
    #         keywords_index[k].extend(urls)

    # op_artifacts_pkg = importlib.resources.files(op_artifacts)
    # with open(op_artifacts_pkg.joinpath("index_questions.json"), "w") as f:
    #     json.dump(questions_index, f, indent=4)
    # index_questions = list(questions_index.keys())
    # index_questions_embed = np.array(embeddings.embed_documents(index_questions))
    # np.savez_compressed(
    #     op_artifacts_pkg.joinpath("index_questions.npz"), index_questions_embed
    # )

    # with open(op_artifacts_pkg.joinpath("index_keywords.json"), "w") as f:
    #     json.dump(keywords_index, f, indent=4)
    # index_keywords = list(keywords_index.keys())
    # index_keywords_embed = np.array(embeddings.embed_documents(index_keywords))
    # np.savez_compressed(
    #     op_artifacts_pkg.joinpath("index_keywords.npz"), index_keywords_embed
    # )

    # await reorder_file(op_artifacts_pkg.joinpath("index_questions.json"))
    # await reorder_file(op_artifacts_pkg.joinpath("index_keywords.json"))

In [None]:
def merge_faiss_indexes_same_type(index1, index2):
    """Return a new FAISS index holding the vectors of ``index1`` followed by those of ``index2``.

    Neither input index is modified.

    Args:
        index1: FAISS index used as the base; it is cloned, so the result has
            its type and parameters.
        index2: FAISS index whose vectors are appended. It must support
            ``reconstruct_n`` (e.g. a flat index).

    Returns:
        A new index containing ``index1.ntotal + index2.ntotal`` vectors.

    Raises:
        ValueError: If the two indexes have different dimensionality.
    """
    if index1.d != index2.d:
        raise ValueError(
            f"Cannot merge indexes of different dimensions ({index1.d} vs {index2.d})"
        )

    # clone_index makes a deep copy that already contains index1's vectors,
    # so only index2's vectors have to be added (re-adding index1's vectors
    # would store them twice).
    merged_index = faiss.clone_index(index1)

    if index2.ntotal > 0:
        # In the Python API reconstruct_n returns a single (n, d) array,
        # not a tuple.
        vectors2 = index2.reconstruct_n(0, index2.ntotal)
        merged_index.add(vectors2)

    return merged_index

In [None]:
def simple_incremental_faiss_update(db_params, new_small_index):
    """Append a small FAISS index to the latest index stored in PostgreSQL.

    The latest stored index is fetched (or a fresh ``IndexFlatL2`` is created
    when nothing is stored yet), the vectors of ``new_small_index`` are added
    to it, and the serialized result is written back and committed.

    Args:
        db_params: dict of keyword arguments forwarded to ``psycopg2.connect``.
        new_small_index: faiss.Index with the new vectors. It must support
            ``reconstruct_n`` (e.g. a flat index); it is not modified.

    Note:
        Both SQL statements are still placeholders (see the todo markers).
    """
    conn = psycopg2.connect(**db_params)
    try:
        cur = conn.cursor()
        try:
            # 1. Get the latest version of the FAISS index from PostgreSQL
            cur.execute("Query here: get the latest version ordered by timestamp LastEmbeddedAt") ## todo the query
            result = cur.fetchone()

            if result:
                # deserialize_index expects a uint8 numpy array rather than the
                # raw buffer returned by the driver.
                latest_index = faiss.deserialize_index(np.frombuffer(result[0], dtype=np.uint8))
            else:  # nothing stored yet: start from an empty flat index
                latest_index = faiss.IndexFlatL2(new_small_index.d)

            # 2. Merge it with the smaller FAISS index.
            # faiss has no `merge_indexes` function; copy the vectors out of the
            # small index and add them, which leaves the small index untouched.
            if new_small_index.ntotal > 0:
                latest_index.add(new_small_index.reconstruct_n(0, new_small_index.ntotal))
            merged_index = latest_index

            # 3. Send the merged FAISS index back to PostgreSQL
            # serialize_index returns a uint8 array; convert it to plain bytes.
            merged_index_data = faiss.serialize_index(merged_index).tobytes()
            cur.execute("Query here: send the new version ordered by timestamp LastEmbeddedAt",  ## todo the query
                         (psycopg2.Binary(merged_index_data),))

            conn.commit()
            print("FAISS index successfully updated in PostgreSQL.")
        finally:
            cur.close()
    finally:
        # Always release the connection, including when a step above raises.
        conn.close()

In [None]:
### async version by Claude

async def async_simple_incremental_faiss_update(db_params, new_small_index):
    """
    Asynchronous function to incrementally update a FAISS index stored in PostgreSQL.

    The latest stored index is fetched (or a fresh ``IndexFlatL2`` is created
    when no row exists), the vectors of ``new_small_index`` are added to it,
    and the serialized result is sent back to the database.

    :param db_params: dict, database connection parameters forwarded to ``asyncpg.connect``
    :param new_small_index: faiss.Index, the new small index to merge; it must
        support ``reconstruct_n`` (e.g. a flat index) and is not modified
    :raises Exception: any error is printed and re-raised; the connection is
        closed in both cases
    """
    conn = await asyncpg.connect(**db_params)

    try:
        # 1. Get the latest version of the FAISS index from PostgreSQL
        result = await conn.fetchrow("SELECT index_data FROM faiss_index ORDER BY LastEmbeddedAt DESC LIMIT 1")  ## todo replace query 

        if result:
            # deserialize_index expects a uint8 numpy array, not a bytes object.
            latest_index = faiss.deserialize_index(np.frombuffer(result['index_data'], dtype=np.uint8))
        else:
            latest_index = faiss.IndexFlatL2(new_small_index.d)

        # 2. Merge it with the smaller FAISS index.
        # faiss has no `merge_indexes` function; copy the vectors out of the
        # small index and add them, which leaves the small index untouched.
        if new_small_index.ntotal > 0:
            latest_index.add(new_small_index.reconstruct_n(0, new_small_index.ntotal))
        merged_index = latest_index

        # 3. Send the merged FAISS index back to PostgreSQL
        # serialize_index returns a uint8 array; convert it to plain bytes
        # before passing it as a query argument.
        merged_index_data = faiss.serialize_index(merged_index).tobytes()
        await conn.execute(
            "Query here",                 ## todo
            merged_index_data
        )

        print("FAISS index successfully updated in PostgreSQL.")

    except Exception as e:
        print(f"An error occurred: {str(e)}")
        raise

    finally:
        await conn.close()