In [14]:
import pandas as pd
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.vector_stores.neo4jvector import Neo4jVectorStore
from llama_index.core import StorageContext, ServiceContext, VectorStoreIndex
from llama_index.core.schema import Document

import logging

In [2]:
from utils.generic import get_driver, get_credentials, Models

In [4]:
# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

In [5]:
driver = get_driver()
embed_model = HuggingFaceEmbedding(model_name=Models.BAAI_BGE_SMALL_EN_V1_5.value)

INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: BAAI/bge-small-en-v1.5
INFO:sentence_transformers.SentenceTransformer:2 prompts are loaded, with the keys: ['query', 'text']


In [6]:
abstracts_df = pd.read_csv('../data/processed/ncbi_dev_abstracts.csv')
annotations_df = pd.read_csv('../data/processed/ncbi_dev_annotations.csv')

In [7]:
embedding_dimension = 384  # BAAI/bge-small-en-v1.5 embedding dimension
uri = get_credentials('uri')
username = get_credentials('username')
password = get_credentials('password')

In [8]:
neo4j_vector_hybrid_BAAI = Neo4jVectorStore(
    url=uri,
    username=username,
    password=password,
    hybrid_search=True,
    embedding_dimension=embedding_dimension,
    embedding_node_property="DiseaseEmbedding-BAAI-bge-small-en-v1_5",
    text_node_property="DiseaseName"
)

In [9]:
storage_context_BAAI = StorageContext.from_defaults(
    vector_store=neo4j_vector_hybrid_BAAI
)

In [17]:
service_context_BAAI = ServiceContext.from_defaults(
    embed_model=embed_model,
    llm=None
)

LLM is explicitly disabled. Using MockLLM.


  service_context_BAAI = ServiceContext.from_defaults(


In [10]:
def extract_graph_data(driver):
    with driver.session() as session:
        result = session.run("MATCH (n) RETURN n")
        nodes = []
        for record in result:
            node = record["n"]
            nodes.append(node)
    return nodes

nodes = extract_graph_data(driver)

In [11]:
def nodes_to_documents(nodes):
    documents = []
    for node in nodes:
        try:
            # Create a combined text from node properties for the document content
            content = " ".join([f"{key}: {value}" for key, value in node.items()])
            doc = Document(
                text=content,
                metadata={"labels": list(node.labels), "element_id": node.element_id}
            )
            documents.append(doc)
        except Exception as e:
            logger.error(f"Error processing node {node.element_id}: {e}")
    return documents

documents = nodes_to_documents(nodes)

In [18]:
try:
    index_BAAI = VectorStoreIndex.from_documents(
        documents,
        storage_context=storage_context_BAAI,
        service_context=service_context_BAAI,
        show_progress=True,
        embed_model='local',
    )
    logger.info("Vector store index created successfully.")
except Exception as e:
    logger.error(f"Error creating vector store index: {e}")

Parsing nodes:   0%|          | 0/13298 [00:00<?, ?it/s]

ERROR:__main__:Error creating vector store index: The `model_name` argument must be provided.


In [None]:
query_engine_BAAI = index_BAAI.as_query_engine()

In [None]:
query = "This patient has glucose-6-phosphate dehydrogenase (G6PD) deficiency."
results = query_engine_BAAI.query(query)
print(results)

In [None]:
import pickle

with open('../db/vector_store/vector_store_index_BAAI.pkl', 'wb') as f:
    pickle.dump(index_BAAI, f)