**Notebook Workflow:**

- **Setup Connections:** Connect to Neo4j and OpenAI.
- **Fetch and Embed Data:** Retrieve node data from Neo4j and generate embeddings using OpenAI.
- **Create FAISS Index:** Create and populate a FAISS index with node embeddings.
- **Query and Search:** Perform similarity searches on the FAISS index.
- **Format and Save Results:** Format detailed node and relationship information and save results to JSON.

# Setup

In [39]:
import pandas as pd
from neo4j import GraphDatabase
from openai import OpenAI
from dotenv import load_dotenv
import os
from utils import *
import json

# Load environment variables from .env file
load_dotenv()

True

In [33]:
# Get the URI and authentication credentials from environment variables
URI = os.getenv("NEO4J_URI")
AUTH = (os.getenv("NEO4J_USER"), os.getenv("NEO4J_PASSWORD"))

# initalize the driver
driver = GraphDatabase.driver(URI, auth=AUTH)

# connect to OpenAI
client = OpenAI(
    api_key=os.environ.get("OPENAI_API_KEY"),  # This is the default and can be omitted
)

In [65]:
def fetch_graph_as_json(driver):
    """
    Fetch all nodes and relationships from Neo4j and return as JSON.
    Args:
        driver: The Neo4j driver instance.
    Returns:
        A JSON object containing nodes and relationships.
    """
    query = """
    MATCH (n)-[r]->(m)
    RETURN n, r, m
    """
    graph_data = {"nodes": [], "relationships": []}
    node_ids = set()  # To avoid duplicate nodes

    with driver.session() as session:
        results = session.run(query)
        for record in results:
            # Add start node
            start_node = record["n"]
            if start_node.element_id not in node_ids:
                graph_data["nodes"].append({
                    "id": start_node.element_id,
                    "labels": list(start_node.labels),
                    "properties": dict(start_node)
                })
                node_ids.add(start_node.element_id)

            # Add end node
            end_node = record["m"]
            if end_node.element_id not in node_ids:
                graph_data["nodes"].append({
                    "id": end_node.element_id,
                    "labels": list(end_node.labels),
                    "properties": dict(end_node)
                })
                node_ids.add(end_node.element_id)

            # Add relationship
            relationship = record["r"]
            graph_data["relationships"].append({
                "id": relationship.element_id,
                "type": relationship.type,
                "startNode": relationship.start_node.element_id,
                "endNode": relationship.end_node.element_id,
                "properties": dict(relationship)
            })

    return graph_data

def remove_embedding_property(graph_data):
    """
    Remove the 'embedding' property from all nodes in the graph data.
    Args:
        graph_data: The JSON object containing nodes and relationships.
    Returns:
        The updated graph data with 'embedding' removed from node properties.
    """
    for node in graph_data["nodes"]:
        if "embedding" in node["properties"]:
            del node["properties"]["embedding"]
    return graph_data


In [66]:
# Fetch the graph data
graph_json = fetch_graph_as_json(driver)

# Remove 'embedding' property
graph_json = remove_embedding_property(graph_json)

# Save to a JSON file
with open("graph_data.json", "w") as f:
    json.dump(graph_json, f, indent=4)

print("Graph data saved to graph_data.json without embeddings")

Graph data saved to graph_data.json without embeddings


# Vectorize KG


## get node data

In [67]:
def get_node_types():
    """
    Fetch all node labels (types) from the Neo4j database.
    Returns:
        List of node labels.
    """
    query = "CALL db.labels() YIELD label RETURN label"
    with driver.session() as session:
        results = session.run(query)
        return [record["label"] for record in results]

node_types = get_node_types()

In [68]:
def get_nodes_by_type(node_type):
    """
    Fetch all nodes and their properties for a specific node type.
    Args:
        node_type: The label of the node type to query.
    Returns:
        List of dictionaries representing nodes and their properties.
    """
    query = f"""
    MATCH (n:{node_type})
    RETURN properties(n) AS node_properties
    """
    with driver.session() as session:
        results = session.run(query)
        return [record["node_properties"] for record in results]

# Example usage
all_nodes = {}
for node_type in node_types:
    all_nodes[node_type] = get_nodes_by_type(node_type)

In [69]:
def get_all_nodes():
    """
    Fetch all nodes of all types with their properties.
    Returns:
        List of dictionaries representing all nodes with their type.
    """
    all_nodes = []
    for node_type in node_types:
        nodes = get_nodes_by_type(node_type)
        for node in nodes:
            node["type"] = node_type  # Add the node type for context
            all_nodes.append(node)
    return all_nodes

# Example usage
graph_data = get_all_nodes()

## make embeddings for nodes

In [70]:
def generate_openai_embeddings_for_all(graph_data, batch_size=10):
    """
    Generate embeddings for all nodes using OpenAI embeddings in batches.
    Args:
        graph_data: List of dictionaries representing graph nodes.
        batch_size: Number of nodes to process in each batch.
    Returns:
        Dictionary mapping node names to their embeddings.
    """
    embeddings = {}
    
    # Process data in batches
    for i in range(0, len(graph_data), batch_size):
        batch = graph_data[i:i + batch_size]  # Get the current batch
        descriptions = []

        # Create descriptions for each node in the batch
        for node in batch:
            description_parts = [
                f"Type: {node.get('type', 'Unknown')}",
                f"Name: {node.get('name', 'Unknown')}",
                f"Label: {node.get('label', '')}",
                f"Format: {node.get('format', '')}"
            ]
            descriptions.append(" | ".join(part for part in description_parts if part.strip()))

        try:
            # Generate embeddings for the entire batch
            response = client.embeddings.create(
                input=descriptions,
                model="text-embedding-ada-002"
            )
            
            # Extract embeddings for each description in the batch
            for idx, node in enumerate(batch):
                embedding = response.data[idx].embedding
                embeddings[node["name"]] = embedding

        except Exception as e:
            print(f"Error embedding batch starting at index {i}: {e}")

    return embeddings

In [71]:
# Generate embeddings
node_embeddings = generate_openai_embeddings_for_all(graph_data)
if "Error" in node_embeddings:
    print("Node Embeddings:", node_embeddings)
else:
    print("Embeddings made")

Embeddings made


In [72]:
# store embeddings
with open("node_embeddings.json", "w") as f:
    json.dump(node_embeddings, f)

## Store Embeddings w faiss

In [74]:
import faiss
import numpy as np

def create_faiss_index(embeddings):
    """
    Create and populate a FAISS index with embeddings.
    Args:
        embeddings: Dictionary mapping node names to their embeddings.
    Returns:
        A tuple containing the FAISS index and a list of node names.
    """
    # Extract embeddings and corresponding names
    node_names = list(embeddings.keys())
    embedding_vectors = np.array(list(embeddings.values()), dtype="float32")

    # Create FAISS index (cosine similarity)
    dimension = embedding_vectors.shape[1]
    index = faiss.IndexFlatIP(dimension)  # Inner Product (cosine similarity with normalized vectors)

    # Normalize vectors for cosine similarity
    faiss.normalize_L2(embedding_vectors)

    # Add vectors to the index
    index.add(embedding_vectors)
    return index, node_names

# Example usage
faiss_index, node_names = create_faiss_index(node_embeddings)
print(f"FAISS index created with {faiss_index.ntotal} nodes.")

FAISS index created with 555 nodes.


In [75]:
faiss.write_index(faiss_index, "graph_embeddings.index")
with open("node_names.txt", "w") as f:
    for name in node_names:
        f.write(name + "\n")

In [76]:
faiss_index = faiss.read_index("graph_embeddings.index")
with open("node_names.txt", "r") as f:
    node_names = [line.strip() for line in f.readlines()]

# Perform Similarity Search

In [51]:
def query_faiss_index(query_text, index, node_names, client):
    """
    Query the FAISS index for similar nodes.
    Args:
        query_text: The user's query in plain text.
        index: The FAISS index containing node embeddings.
        node_names: List of node names corresponding to the embeddings in the index.
        client: OpenAI client instance to generate query embeddings.
    Returns:
        List of top matching nodes and their similarity scores.
    """
    # Embed the query using OpenAI
    response = client.embeddings.create(
        input=query_text,
        model="text-embedding-ada-002"
    )
    
    # Extract the embedding vector
    query_vector = np.array(response.data[0].embedding, dtype="float32").reshape(1, -1)

    # Normalize the query vector for cosine similarity
    faiss.normalize_L2(query_vector)

    # Perform similarity search (retrieve top 5 results)
    distances, indices = index.search(query_vector, k=5)

    # Map indices back to node names and distances
    results = [(node_names[idx], distances[0][i]) for i, idx in enumerate(indices[0])]
    return results


In [52]:
# Example query
query = "What data is available about degrees?"
results = query_faiss_index(query, faiss_index, node_names, client)
print("Top Results:", results)

Top Results: [('offering_highest_degree', np.float32(0.823972)), ('degree_seeking', np.float32(0.8232179)), ('degree_granting', np.float32(0.79930216)), ('medical_degree', np.float32(0.79743224)), ('other_degree_offered', np.float32(0.79340917))]


# Return to sender
- aka Format the Results for Display

In [53]:
def fetch_relationships_from_neo4j(node_name, driver):
    """
    Fetch relationships for a specific node from the Neo4j database.
    Args:
        node_name: The name of the node.
        driver: The Neo4j driver instance.
    Returns:
        List of dictionaries representing relationships.
    """
    query = """
    MATCH (n {name: $node_name})-[r]->(m)
    RETURN type(r) AS relationship_type, m.name AS target
    """
    with driver.session() as session:
        results = session.run(query, node_name=node_name)
        return [{"type": record["relationship_type"], "target": record["target"]} for record in results]

In [54]:
def fetch_node_details_with_relationships(results, graph_data, driver):
    """
    Retrieve detailed information about matching nodes and their relationships dynamically.
    Args:
        results: List of tuples containing node names and similarity scores.
        graph_data: List of dictionaries representing graph nodes.
        driver: The Neo4j driver instance.
    Returns:
        List of dictionaries with detailed node and relationship information.
    """
    details = []
    for node_name, score in results:
        node = next((n for n in graph_data if n["name"] == node_name), {})
        if node:
            node["similarity"] = score
            node["relationships"] = fetch_relationships_from_neo4j(node_name, driver)
            details.append(node)
    return details

In [55]:
# test 
node_details = fetch_node_details_with_relationships(results, graph_data, driver)

In [56]:
def format_node_details(node_details):
    """
    Format detailed node and relationship information into a user-friendly string.
    Args:
        node_details: List of dictionaries containing detailed node and relationship information.
    Returns:
        A formatted string.
    """
    if not node_details:
        return "No results found."

    response = "Here are the top matching nodes:\n\n"
    for idx, node in enumerate(node_details, start=1):
        # Basic node information
        response += (
            f"{idx}. Name: {node['name']}\n"
            f"   Type: {node['type']}\n"
            f"   Label: {node['label']}\n"
            f"   Format: {node['format']}\n"
            f"   Similarity: {node['similarity']:.4f}\n"
        )

        # Relationship information
        if "relationships" in node and node["relationships"]:
            response += "   Relationships:\n"
            for rel in node["relationships"]:
                response += f"     - {rel['type']} {rel['target']}\n"

        response += "\n"
    return response

# Format the results
formatted_output = format_node_details(node_details)
print(formatted_output)

Here are the top matching nodes:

1. Name: offering_highest_degree
   Type: Variable
   Label: Highest degree offered
   Format: offering_highest_degree
   Similarity: 0.8240
   Relationships:
     - EXISTS_IN directory
     - PROVIDED_BY ipeds

2. Name: degree_seeking
   Type: Variable
   Label: Degree/certificate-seeking
   Format: degree_seeking
   Similarity: 0.8232
   Relationships:
     - EXISTS_IN fall-enrollment-race
     - EXISTS_IN fall-enrollment-age
     - EXISTS_IN sfa-grants-and-net-price
     - PROVIDED_BY ipeds

3. Name: degree_granting
   Type: Variable
   Label: Degree-granting status
   Format: yes_no
   Similarity: 0.7993
   Relationships:
     - EXISTS_IN directory
     - PROVIDED_BY ipeds

4. Name: medical_degree
   Type: Variable
   Label: Institution grants a medical degree
   Format: yes_no
   Similarity: 0.7974
   Relationships:
     - EXISTS_IN directory
     - PROVIDED_BY ipeds

5. Name: other_degree_offered
   Type: Variable
   Label: Other degree offered (

In [57]:
import json
import numpy as np

def save_results_to_json(node_details, filename="call_node_details.json"):
    """
    Save node details and relationships to a JSON file.
    Args:
        node_details: List of dictionaries containing node details and relationships.
        filename: Name of the file to save results to.
    """
    # Convert all numpy.float32 to Python float and ensure all data is serializable
    def make_serializable(data):
        if isinstance(data, np.float32):  # Handle numpy floats
            return float(data)
        if isinstance(data, list):  # Handle nested lists
            return [make_serializable(item) for item in data]
        if isinstance(data, dict):  # Handle nested dictionaries
            return {key: make_serializable(value) for key, value in data.items()}
        return data  # Return native types as-is

    serializable_data = [make_serializable(node) for node in node_details]

    with open(filename, "w") as f:
        json.dump(serializable_data, f, indent=4)
    print(f"Results saved to {filename}")

# Example usage
save_results_to_json(node_details)

Results saved to call_node_details.json


In [64]:
# see graph
from pyvis.network import Network

# Create a pyvis network
net = Network(height="750px", width="100%", directed=True)

# Add nodes
for node in graph_json["nodes"]:
    net.add_node(node["id"], label=node["label"], title=f"Type: {node['label']}")

# Add edges
for rel in graph_json["relationships"]:
    net.add_edge(rel["startNode"], rel["endNode"], title=rel["type"], label=rel["type"])

# Show the graph
net.show("knowledge_graph.html")

KeyError: 'label'