In [None]:
import os
from bs4 import BeautifulSoup
from langchain_community.graphs import Neo4jGraph
from langchain_community.vectorstores import Neo4jVector
from langchain.text_splitter import RecursiveCharacterTextSplitter, SentenceTransformersTokenTextSplitter
from langchain.chains import RetrievalQAWithSourcesChain
import warnings
from langchain_community.document_loaders import DirectoryLoader
from llama_index.core import SimpleDirectoryReader, StorageContext
warnings.filterwarnings("ignore")
from langchain_community.embeddings import HuggingFaceEmbeddings
from sentence_transformers import CrossEncoder
import numpy as np
from trulens_eval import Tru
from trulens_eval.tru_custom_app import instrument
from llmlingua import PromptCompressor
import re
from IPython.core.display import display, HTML
import psycopg2

In [None]:
import numpy as np
import umap
from tqdm import tqdm

import torch

In [None]:
# Shared HuggingFace embedding model; every embed_query call in this notebook goes through it.
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

In [None]:
# Neo4j connection settings, read from the environment.
# The host is read with os.environ[...] so that a missing NEO4J_HOST fails with
# a KeyError naming the variable, instead of an opaque "str + None" TypeError.
NEO4J_URI = 'bolt://' + os.environ['NEO4J_HOST'] + ':7687'
NEO4J_USERNAME = os.getenv('NEO4J_USER')
NEO4J_PASSWORD = os.getenv('NEO4J_PASSWORD')
# The database name is fixed; the NEO4J_DB environment variable is not consulted.
NEO4J_DATABASE = 'neo4j'
print(NEO4J_URI)
print(NEO4J_DATABASE)

In [None]:
# Graph client used by the vector-search helpers below to run Cypher queries.
kg = Neo4jGraph(
    url=NEO4J_URI, username=NEO4J_USERNAME, password=NEO4J_PASSWORD, database=NEO4J_DATABASE
)

In [None]:
def neo4j_vector_search(question, index_name, top_k=10):
  """Search for similar nodes using the Neo4j vector index.

  Args:
    question: Text to embed with the notebook-level ``embeddings`` model.
    index_name: Name of the Neo4j vector index to query.
    top_k: Number of nearest nodes requested from the index (default 10,
      the value that was previously hard-coded).

  Returns:
    Whatever ``kg.query`` returns for the search; each record carries the
    similarity score plus the node's ActId, sectionId, sectionName, url,
    text and RegId (aliased ``Regulations``).
  """
  query_embedding = embeddings.embed_query(question)
  vector_search_query = """
    CALL db.index.vector.queryNodes($index_name, $top_k, $question) yield node, score
    RETURN score, node.ActId, node.sectionId, node.sectionName, node.url, node.text AS text, node.RegId AS Regulations
  """
  # The Cypher parameter is named $question but carries the embedding vector.
  return kg.query(
      vector_search_query,
      params={
          'question': query_embedding,
          'index_name': index_name,
          'top_k': top_k,
      },
  )

In [None]:
from urllib.parse import quote

from trulens_eval import Tru
from trulens_eval.tru_custom_app import instrument

# TruLens database settings, read from the environment.
TRULENS_USER = os.getenv('TRULENS_USER')
TRULENS_PASSWORD = os.getenv('TRULENS_PASSWORD')
TRULENS_DB = os.getenv('TRULENS_DB')
TRULENS_PORT = os.getenv('TRULENS_PORT')
TRULENS_HOST = os.getenv('TRULENS_HOST')

# Credentials are percent-encoded before being placed in the URL: a raw '@',
# ':' or '/' in the user name or password would otherwise be parsed as a URL
# delimiter. str() keeps the previous rendering for unset variables.
_trulens_user = quote(str(TRULENS_USER), safe='')
_trulens_password = quote(str(TRULENS_PASSWORD), safe='')

TRULENS_CONNECTION_STRING = (
    f'postgresql+psycopg2://{_trulens_user}:{_trulens_password}'
    f'@{TRULENS_HOST}:{TRULENS_PORT}/{TRULENS_DB}'
)
tru = Tru(database_url=TRULENS_CONNECTION_STRING)

In [None]:
# Direct psycopg2 connection to the TruLens database, used for raw SQL below.
# The port is passed explicitly so this connection targets the same server as
# TRULENS_CONNECTION_STRING, which already includes TRULENS_PORT.
# NOTE(review): psycopg2 is expected to ignore keyword arguments that are None,
# so an unset TRULENS_PORT should still fall back to the default port - confirm.
conn = psycopg2.connect(
    host=TRULENS_HOST,
    port=TRULENS_PORT,
    database=TRULENS_DB,
    user=TRULENS_USER,
    password=TRULENS_PASSWORD
)

In [None]:
cur = conn.cursor()  # cursor on the TruLens connection; reused by the SQL query cell below

In [None]:
# Fetch every record of the 'TopK_Feedback_System_v1' app, left-joined with its
# feedback. Downstream cells index the result tuples by position:
#   row[0] = input, row[1] = record_json, row[2] = record_id,
#   row[3] = multi_result (None when the record has no feedback row),
#   row[4] = app_id.
# NOTE(review): the alias "result" is attached to R.app_id, not to
# F.multi_result - confirm this is intended.
cur.execute("""
SELECT R.input as input, R.record_json , R.record_id, F.multi_result, R.app_id result FROM public.records R 
LEFT Join feedbacks F ON F.record_id = R.record_id WHERE
R.app_id='TopK_Feedback_System_v1' 
""")
rows = cur.fetchall()

In [None]:
# Number of (record, feedback) rows returned by the query above.
print(len(rows))

In [None]:
import json

In [None]:
# Accumulates each distinct 'node.ActId' seen in the retrieved results, in first-seen order.
unique_acts = []

In [None]:
# Walk every fetched record and collect the distinct ActIds of the nodes that
# the first recorded call returned.
for record in rows:
    question, record_json, feedback_json = record[0], record[1], record[3]
    print(question)
    print(feedback_json)
    # Records without feedback or without a stored call trace are skipped.
    if feedback_json is None or record_json is None:
        continue
    retrieved_nodes = json.loads(record_json)['calls'][0]['rets']
    for node in retrieved_nodes:
        act_id = node['node.ActId']
        if act_id not in unique_acts:
            unique_acts.append(act_id)

In [None]:
# Number of distinct ActIds collected by the loop above.
print(len(unique_acts))

In [None]:
def getembeddings(data):
    """Return the embedding of ``data`` produced by the shared ``embeddings`` model."""
    return embeddings.embed_query(data)

In [None]:
# Parallel lists filled by the loop below: entry i of each list belongs to the
# same (query, retrieved document, feedback label) training example.
adapter_query_embeddings = []
adapter_doc_embeddings = []
adapter_labels = []

In [None]:
# Build one (query embedding, document embedding, label) example per document
# retrieved for each record that has feedback attached.
for query in tqdm(rows):
    # Without feedback (row[3]) there are no labels; without a stored call
    # trace (row[1]) there are no documents. json.loads(None) would raise.
    if query[3] is None or query[1] is None:
        continue
    result = json.loads(query[3])['bulk']
    documents = json.loads(query[1])['calls'][0]['rets']
    if not documents:
        continue
    # The question text is the same for all of its documents: embed it once
    # instead of once per document.
    query_vector = getembeddings(query[0])
    for d, document in enumerate(documents):
        adapter_query_embeddings.append(query_vector)
        adapter_doc_embeddings.append(getembeddings(document['text']))
        # Label d of the feedback's 'bulk' list is paired with document d.
        adapter_labels.append(result[d])

In [None]:
# Sanity check: the three accumulator lists are expected to have equal length.
len(adapter_query_embeddings)

In [None]:
# Number of labels collected; compare with the embedding counts.
len(adapter_labels)

In [None]:
# Number of document embeddings collected; compare with the query and label counts.
len(adapter_doc_embeddings) 

In [None]:
def _as_tensor(values):
    """Pack a sequence into a ``torch.Tensor`` by way of a NumPy array."""
    return torch.Tensor(np.array(values))


# Rebind the accumulator names to tensors; labels get a trailing axis of size 1.
adapter_query_embeddings = _as_tensor(adapter_query_embeddings)
adapter_doc_embeddings = _as_tensor(adapter_doc_embeddings)
adapter_labels = _as_tensor(adapter_labels).unsqueeze(1)

In [None]:
# Bundle the three aligned tensors; iterating yields (query, document, label) triples.
dataset = torch.utils.data.TensorDataset(adapter_query_embeddings, adapter_doc_embeddings, adapter_labels)

In [None]:
def model(query_embedding, document_embedding, adaptor_matrix):
    """Score a document against a query after adapting the query.

    The query embedding is left-multiplied by ``adaptor_matrix`` and the
    cosine similarity (along dim 0) between the adapted query and the
    document embedding is returned.
    """
    adapted_query = adaptor_matrix @ query_embedding
    return torch.nn.functional.cosine_similarity(
        adapted_query, document_embedding, dim=0
    )

In [None]:
def mse_loss(query_embedding, document_embedding, adaptor_matrix, label):
    """Mean-squared error between the adapted similarity score and ``label``.

    Args:
        query_embedding: Query vector passed to ``model``.
        document_embedding: Document vector passed to ``model``.
        adaptor_matrix: Matrix applied to the query inside ``model``.
        label: Target tensor for the similarity score.

    Returns:
        The ``torch.nn.MSELoss`` value as a tensor.
    """
    prediction = model(query_embedding, document_embedding, adaptor_matrix)
    target = label
    # A TensorDataset sample yields a label of shape (1,) while the prediction
    # is 0-dim. Align the shapes when the element counts match, so MSELoss
    # does not fall back on broadcasting (which PyTorch warns about). The
    # loss value is unchanged.
    if target.shape != prediction.shape and target.numel() == prediction.numel():
        target = target.reshape(prediction.shape)
    return torch.nn.MSELoss()(prediction, target)

In [None]:
# Square adapter matrix sized to the embedding dimension.
mat_size = len(adapter_query_embeddings[0])
# Random initialisation from a locally seeded generator, so Restart & Run All
# reproduces the same starting matrix without touching the global torch RNG.
adapter_matrix = torch.randn(
    mat_size,
    mat_size,
    generator=torch.Generator().manual_seed(0),
    requires_grad=True,
)

In [None]:
# Hyperparameters of the per-sample gradient-descent loop below.
NUM_EPOCHS = 100
LEARNING_RATE = 0.01

min_loss = float('inf')
best_matrix = None

for epoch in tqdm(range(NUM_EPOCHS)):
    for query_embedding, document_embedding, label in dataset:
        loss = mse_loss(query_embedding, document_embedding, adapter_matrix, label)

        # NOTE(review): the "best" matrix is chosen by the loss of a single
        # sample, not by the loss over the whole dataset - confirm this is
        # the intended selection criterion.
        if loss < min_loss:
            # Store a detached value so the running minimum does not keep a
            # reference into the autograd graph of this step.
            min_loss = loss.detach()
            best_matrix = adapter_matrix.clone().detach().numpy()

        loss.backward()
        # Manual SGD step, then clear the gradient for the next sample.
        with torch.no_grad():
            adapter_matrix -= LEARNING_RATE * adapter_matrix.grad
            adapter_matrix.grad.zero_()

In [None]:
# Smallest single-sample loss observed during training (min_loss must be a tensor here).
print(f"Best loss: {min_loss.detach().numpy()}")

In [None]:
# Push an all-ones column vector through the best matrix; each output entry is
# the sum of the corresponding matrix row.
test_vector = torch.ones((mat_size,1))
# Multiply NumPy by NumPy. Passing a torch.Tensor to np.matmul and then
# calling .numpy() on the result only works if that result is implicitly
# converted back into a Tensor, which is fragile across library versions.
scaled_vector = np.matmul(best_matrix, test_vector.numpy())

In [None]:
import matplotlib.pyplot as plt
# One bar per embedding dimension: the entries of scaled_vector, flattened.
plt.bar(range(len(scaled_vector)), scaled_vector.flatten())
plt.show()


In [None]:
def project_embeddings(embeddings, umap_transform):
    """Project each embedding to 2-D with an already fitted UMAP transform.

    Embeddings are transformed one at a time (with a tqdm progress bar) and
    written into row ``i`` of a ``(len(embeddings), 2)`` NumPy array, which is
    returned.
    """
    projected = np.empty((len(embeddings), 2))
    for row_index, vector in enumerate(tqdm(embeddings)):
        projected[row_index] = umap_transform.transform([vector])
    return projected

In [None]:
# Fit UMAP on the document embeddings, with both seeds fixed to 0.
umap_transform = umap.UMAP(random_state=0, transform_seed=0).fit(adapter_doc_embeddings)

In [None]:
# 2-D projection of every document embedding (the gray background points of the scatter plot).
projected_dataset_embeddings = project_embeddings(adapter_doc_embeddings, umap_transform)

In [None]:
query_embeddings = adapter_query_embeddings
# Apply best_matrix to every query embedding: transpose so queries are columns,
# multiply, then transpose back so each row is one adapted query.
adapted_query_embeddings = np.matmul(best_matrix, np.array(query_embeddings).T).T

# Project both the original and the adapted queries with the same fitted UMAP transform.
projected_query_embeddings = project_embeddings(query_embeddings, umap_transform)
projected_adapted_query_embeddings = project_embeddings(adapted_query_embeddings, umap_transform)

In [None]:
# Plot the projected documents (gray), the original queries (red X) and the
# adapted queries (green X) in the shared 2-D UMAP space.
plt.figure()
plt.scatter(projected_dataset_embeddings[:, 0], projected_dataset_embeddings[:, 1], s=10, color='gray')
plt.scatter(projected_query_embeddings[:, 0], projected_query_embeddings[:, 1], s=150, marker='X', color='r', label="original")
plt.scatter(projected_adapted_query_embeddings[:, 0], projected_adapted_query_embeddings[:, 1], s=150, marker='X', color='green', label="adapted")

# Equal scaling on both axes; axes are hidden.
plt.gca().set_aspect('equal', 'datalim')
plt.title("Adapted Queries")
plt.axis('off')
plt.legend()

In [None]:
# Example question used below to compare retrieval with and without the adapter matrix.
query = "What are the legal requirements for residents in a cooperative housing society?"

In [None]:
# Baseline: search the 'Acts_Updatedchunks' index with the unmodified query embedding.
neo4j_vector_search(query, 'Acts_Updatedchunks')

In [None]:
# Embed the example question; rebinds query_embedding, which the training loop also used as its loop variable.
query_embedding = embeddings.embed_query(query)

In [None]:
# Adapted query: the learned best_matrix applied to the query embedding (result is a NumPy array).
new_query = np.matmul(best_matrix, np.array(query_embedding))

In [None]:
def neo4j_vector_search_2(question, index_name, query_embedding, top_k=10):
  """Search the Neo4j vector index with a precomputed query embedding.

  Args:
    question: Original question text. Unused by the search itself; kept so
      the call signature stays unchanged.
    index_name: Name of the Neo4j vector index to query.
    query_embedding: Embedding vector to search with. A plain list, or any
      array-like exposing ``tolist()`` (e.g. a NumPy array).
    top_k: Number of nearest nodes requested from the index (default 10,
      the value that was previously hard-coded).

  Returns:
    Whatever ``kg.query`` returns for the search; each record carries the
    similarity score plus the node's ActId, sectionId, sectionName, url,
    text and RegId (aliased ``Regulations``).
  """
  # Hand the driver a plain Python list: array objects such as numpy.ndarray
  # may not be accepted as query parameters, depending on the driver version.
  if hasattr(query_embedding, 'tolist'):
    query_embedding = query_embedding.tolist()
  vector_search_query = """
    CALL db.index.vector.queryNodes($index_name, $top_k, $question) yield node, score
    RETURN score, node.ActId, node.sectionId, node.sectionName, node.url, node.text AS text, node.RegId AS Regulations
  """
  # The Cypher parameter is named $question but carries the embedding vector.
  return kg.query(
      vector_search_query,
      params={
          'question': query_embedding,
          'index_name': index_name,
          'top_k': top_k,
      },
  )

In [None]:
# Same index as the baseline search, but queried with the adapter-transformed embedding.
neo4j_vector_search_2(query,  'Acts_Updatedchunks', new_query)