# Setup

In [15]:
import pandas as pd
from neo4j import GraphDatabase
from dotenv import load_dotenv
import os
import glob
import json

# Load environment variables from a local .env file so the Neo4j connection
# settings can be read with os.getenv() in the next cell
load_dotenv()

True

In [11]:

# Get the URI and authentication credentials from environment variables
# (os.getenv returns None for any variable that is not set)
URI = os.getenv("NEO4J_URI")
AUTH = (os.getenv("NEO4J_USER"), os.getenv("NEO4J_PASSWORD"))

# Initialize the driver that the notebook's query helpers use via the global name `driver`
driver = GraphDatabase.driver(URI, auth=AUTH)


# Functions

In [7]:
# Function to run a query
def run_query(query, parameters=None):
    """Run a single Cypher statement in its own session and discard its result.

    Args:
        query: Cypher query text.
        parameters: Optional dict of query parameters; ``None`` means the
            query takes no parameters.

    Returns:
        None.
    """
    with driver.session() as session:
        # Consume the result explicitly so the statement is fully executed and
        # any error it reports is raised to the caller here, instead of being
        # left to whatever the session does with a pending result on close.
        session.run(query, parameters).consume()
        
# Create Nodes and Relationships in Neo4j
def create_knowledge_graph(data, source, endpoint_name):
    """Load one codebook's variables into Neo4j and link them to an endpoint and a source.

    The ``Source`` node, the ``Endpoint`` node and the
    ``(Endpoint)-[:PROVIDED_BY]->(Source)`` relationship are merged once, because
    they depend only on the arguments and not on the row. Then, for every row of
    ``data``, a ``Variable`` node is merged (its ``format`` and ``label`` are
    overwritten with the row's values whether the node is new or already
    exists) and linked with ``EXISTS_IN`` to the endpoint and ``PROVIDED_BY``
    to the source.

    Args:
        data: DataFrame with ``variable``, ``format`` and ``label`` columns.
        source: Name stored on the ``Source`` node.
        endpoint_name: Name stored on the ``Endpoint`` node.

    Raises:
        KeyError: If ``data`` has rows but lacks one of the required columns.
            Nothing is written to the graph in that case.
    """
    # Nothing to load: leave the graph untouched, exactly as an empty loop would.
    if len(data.index) == 0:
        return

    # Fail before writing anything if a column the loop reads is absent.
    required_columns = ("variable", "format", "label")
    missing = [column for column in required_columns if column not in data.columns]
    if missing:
        raise KeyError(f"codebook data is missing required column(s): {missing}")

    # Create or update Source node (row-independent, so done once)
    source_query = """
        MERGE (s:Source {name: $name})
        """
    run_query(source_query, {"name": source})

    # Create or update Endpoint node (row-independent, so done once)
    endpoint_query = """
        MERGE (e:Endpoint {name: $name})
        """
    run_query(endpoint_query, {"name": endpoint_name})

    # Link the endpoint to its source (row-independent, so done once)
    relationship_query_2 = """
        MATCH (e:Endpoint {name: $endpoint_name})
        MATCH (s:Source {name: $source})
        MERGE (e)-[:PROVIDED_BY]->(s)
        """
    run_query(relationship_query_2, {"endpoint_name": endpoint_name, "source": source})

    for _, row in data.iterrows():
        variable_name = row['variable']
        variable_format = row['format']  # not named `format`, which would shadow the builtin
        label = row['label']

        # Create or update Variable node
        variable_query = """
        MERGE (v:Variable {name: $name})
        ON CREATE SET v.format = $format, v.label = $label
        ON MATCH SET v.format = $format, v.label = $label
        """
        run_query(variable_query, {"name": variable_name, "format": variable_format, "label": label})

        # Link the variable to the endpoint it appears in
        relationship_query_1 = """
        MATCH (v:Variable {name: $variable_name})
        MATCH (e:Endpoint {name: $endpoint_name})
        MERGE (v)-[:EXISTS_IN]->(e)
        """
        run_query(relationship_query_1, {"variable_name": variable_name, "endpoint_name": endpoint_name})

        # Link the variable to the source that provides it
        relationship_query_3 = """
        MATCH (v:Variable {name: $variable_name})
        MATCH (s:Source {name: $source})
        MERGE (v)-[:PROVIDED_BY]->(s)
        """
        run_query(relationship_query_3, {"variable_name": variable_name, "source": source})

# Process all codebook files in the folder
def process_codebooks(folder_path):
    """Load every ``codebook_*.xlsx`` file in ``folder_path`` into the knowledge graph.

    File names are expected to follow ``codebook_<topic>_<source>_<endpoint>.xlsx``.
    The name is split on the first three underscores only, so an endpoint name
    may itself contain underscores. Each file's ``variables`` sheet is read and
    passed to ``create_knowledge_graph`` together with the source and endpoint
    taken from the file name.

    Args:
        folder_path: Directory that contains the codebook Excel files.

    Raises:
        ValueError: If a matching file name does not have at least the four
            underscore-separated parts described above.
    """
    # Get all Excel files matching the naming convention. Sorted because glob
    # returns matches in arbitrary order and a later file overwrites the
    # format/label of a Variable it shares with an earlier one.
    codebook_files = sorted(glob.glob(os.path.join(folder_path, "codebook_*.xlsx")))

    for file_path in codebook_files:
        # Extract metadata from the filename
        filename = os.path.basename(file_path)
        # Strip only the trailing extension, not every ".xlsx" substring.
        stem, _extension = os.path.splitext(filename)
        parts = stem.split('_', 3)
        if len(parts) != 4:
            raise ValueError(
                f"Unexpected codebook file name {filename!r}; "
                "expected codebook_<topic>_<source>_<endpoint>.xlsx"
            )
        _prefix, _topic, source, endpoint_name = parts

        # Load the Excel file
        df = pd.read_excel(file_path, sheet_name='variables')

        # Add source and endpoint_name to DataFrame so each row carries its provenance
        df['source'] = source
        df['endpoint_name'] = endpoint_name

        # Populate the knowledge graph
        create_knowledge_graph(df, source, endpoint_name)

# Process Codebooks

In [12]:
# Path to the folder containing codebook files (relative to the notebook's working directory)
folder_path = "data/codebooks/"

# Process all codebooks in the folder: every codebook_*.xlsx file found there is loaded into Neo4j
process_codebooks(folder_path)

print("Knowledge graph populated for all codebooks!")

Knowledge graph populated for all codebooks!


- Next step: scale this across all codebooks

In [20]:
# Attempt to connect to Neo4j Aura
try:
    # The temporary driver gets its own name: binding it to `driver` would
    # replace the notebook-wide driver with one that is closed as soon as this
    # `with` block exits, breaking every later cell that calls driver.session().
    with GraphDatabase.driver(URI, auth=AUTH) as probe_driver:
        with probe_driver.session() as session:
            result = session.run("RETURN 1 AS test")
            for record in result:
                print("Connected! Test Query Result:", record["test"])
except Exception as e:
    # Connectivity probe only: report the failure instead of stopping the notebook.
    print("Error:", e)

Connected! Test Query Result: 1


In [None]:
# Function to run a query
def run_query(query, parameters=None):
    """Run a single Cypher statement in its own session and discard its result.

    Args:
        query: Cypher query text.
        parameters: Optional dict of query parameters; ``None`` means the
            query takes no parameters.

    Returns:
        None.
    """
    with driver.session() as session:
        # Consume the result explicitly so the statement is fully executed and
        # any error it reports is raised to the caller here, instead of being
        # left to whatever the session does with a pending result on close.
        session.run(query, parameters).consume()

# Create Nodes and Relationships in Neo4j
def create_knowledge_graph(data):
    """Populate Neo4j from a frame whose rows carry their own source and endpoint.

    Every row must provide ``variable``, ``format``, ``label``, ``source`` and
    ``endpoint_name``. For each row the Variable, Source and Endpoint nodes are
    merged and then linked: Variable -EXISTS_IN-> Endpoint,
    Endpoint -PROVIDED_BY-> Source and Variable -PROVIDED_BY-> Source.
    A Variable's ``format`` and ``label`` are written only when the node is
    first created; an existing Variable keeps its current values.

    Args:
        data: DataFrame with the five columns listed above.
    """
    for _, record in data.iterrows():
        var_name = record['variable']
        var_format = record['format']
        var_label = record['label']
        source_name = record['source']
        endpoint = record['endpoint_name']

        # Node statements
        merge_variable = """
        MERGE (v:Variable {name: $name})
        ON CREATE SET v.format = $format, v.label = $label
        """
        merge_source = """
        MERGE (s:Source {name: $name})
        """
        merge_endpoint = """
        MERGE (e:Endpoint {name: $name})
        """

        # Relationship statements
        link_variable_endpoint = """
        MATCH (v:Variable {name: $variable_name})
        MATCH (e:Endpoint {name: $endpoint_name})
        MERGE (v)-[:EXISTS_IN]->(e)
        """
        link_endpoint_source = """
        MATCH (e:Endpoint {name: $endpoint_name})
        MATCH (s:Source {name: $source})
        MERGE (e)-[:PROVIDED_BY]->(s)
        """
        link_variable_source = """
        MATCH (v:Variable {name: $variable_name})
        MATCH (s:Source {name: $source})
        MERGE (v)-[:PROVIDED_BY]->(s)
        """

        # Nodes are merged before the relationships that MATCH on them.
        steps = (
            (merge_variable, {"name": var_name, "format": var_format, "label": var_label}),
            (merge_source, {"name": source_name}),
            (merge_endpoint, {"name": endpoint}),
            (link_variable_endpoint, {"variable_name": var_name, "endpoint_name": endpoint}),
            (link_endpoint_source, {"endpoint_name": endpoint, "source": source_name}),
            (link_variable_source, {"variable_name": var_name, "source": source_name}),
        )
        for cypher, params in steps:
            run_query(cypher, params)

# Call the function with your DataFrame
# NOTE(review): no notebook-level `df` is assigned in the visible cells (only a
# local inside process_codebooks) — confirm where this frame comes from.
create_knowledge_graph(df)

print("Knowledge graph populated!")

  with driver.session() as session:


Knowledge graph populated!


In [13]:
# Check that the Neo4j connection works
# The temporary driver gets its own name so the notebook-wide `driver` is not
# rebound to one that is closed when this `with` block exits.
with GraphDatabase.driver(URI, auth=AUTH) as check_driver:
    with check_driver.session() as session:
        result = session.run("RETURN 'Connection successful!' AS message")
        for record in result:
            print(record["message"])

Connection successful!


## Write knowledge graph to JSON

In [17]:
def fetch_graph_as_json(driver):
    """
    Export every relationship in Neo4j, with the nodes at both ends, as a JSON-ready dict.

    The query matches ``(n)-[r]->(m)`` patterns, so only nodes that take part
    in at least one relationship are included.

    Args:
        driver: The Neo4j driver instance.
    Returns:
        A dict with two lists: ``"nodes"`` (each node once, in first-seen
        order, as ``id`` / ``labels`` / ``properties``) and ``"relationships"``
        (one entry per matched relationship, as ``id`` / ``type`` /
        ``startNode`` / ``endNode`` / ``properties``).
    """
    cypher = """
    MATCH (n)-[r]->(m)
    RETURN n, r, m
    """
    nodes = []
    relationships = []
    seen_node_ids = set()  # element ids already exported

    def add_node(node):
        """Append ``node`` to the export unless its element id was already seen."""
        node_id = node.element_id
        if node_id in seen_node_ids:
            return
        nodes.append({
            "id": node_id,
            "labels": list(node.labels),
            "properties": dict(node),
        })
        seen_node_ids.add(node_id)

    with driver.session() as session:
        for record in session.run(cypher):
            # Start node first, then end node, so node order follows the result order.
            add_node(record["n"])
            add_node(record["m"])

            rel = record["r"]
            relationships.append({
                "id": rel.element_id,
                "type": rel.type,
                "startNode": rel.start_node.element_id,
                "endNode": rel.end_node.element_id,
                "properties": dict(rel),
            })

    return {"nodes": nodes, "relationships": relationships}

def remove_embedding_property(graph_data):
    """
    Strip the 'embedding' entry from every node's properties, in place.

    Relationship properties are left untouched.

    Args:
        graph_data: The JSON object containing nodes and relationships.
    Returns:
        The same ``graph_data`` object, with 'embedding' removed from node properties.
    """
    for entry in graph_data["nodes"]:
        # pop with a default is a no-op for nodes that have no embedding
        entry["properties"].pop("embedding", None)
    return graph_data


In [16]:
# Fetch the graph data (relationships plus the nodes at their two ends)
graph_json = fetch_graph_as_json(driver)

# Remove 'embedding' property from the node properties before export
graph_json = remove_embedding_property(graph_json)

# Save to a JSON file in the current working directory
with open("graph_data.json", "w") as f:
    json.dump(graph_json, f, indent=4)

print("Graph data saved to graph_data.json without embeddings")

  with driver.session() as session:


Graph data saved to graph_data.json without embeddings


# Query the Graph

In [None]:
def get_variables_and_sources():
    """Print one line for every Variable -> Endpoint -> Source path in the graph.

    Follows ``EXISTS_IN`` from a Variable to an Endpoint and ``PROVIDED_BY``
    from that Endpoint to a Source, and prints the variable name, variable
    label, endpoint name and source name of each match.
    """
    cypher = """
    MATCH (v:Variable)-[:EXISTS_IN]->(e:Endpoint)-[:PROVIDED_BY]->(s:Source)
    RETURN v.name AS variable, v.label AS label, e.name AS endpoint, s.name AS source
    """
    with driver.session() as session:
        for row in session.run(cypher):
            fields = (
                f"Variable: {row['variable']}",
                f"Label: {row['label']}",
                f"Endpoint: {row['endpoint']}",
                f"Source: {row['source']}",
            )
            print(", ".join(fields))

# Call the function to test (prints one line per variable/endpoint/source match)
get_variables_and_sources()

In [None]:
def get_relationships():
    """Print every relationship in the graph as ``From / Relationship / To``.

    Each line shows the ``name`` property of the start node, the relationship
    type and the ``name`` property of the end node. A header line is printed
    once before the relationships.
    """
    cypher = """
    MATCH (a)-[r]->(b)
    RETURN a.name AS from_node, type(r) AS relationship, b.name AS to_node
    """
    with driver.session() as session:
        rows = session.run(cypher)
        print("Relationships in the Graph:")
        for row in rows:
            fields = (
                f"From: {row['from_node']}",
                f"Relationship: {row['relationship']}",
                f"To: {row['to_node']}",
            )
            print(", ".join(fields))

# Call the function to test (prints a header, then one line per relationship)
get_relationships()

# Add colleges metadata

In [None]:

# Load the metadata file
with open("data/colleges_meta_data.json") as meta_file:
    metadata = json.load(meta_file)
    
# Extract metadata for sources and endpoints
# source_metadata: each item's "source" value -> the full metadata item
source_metadata = {item["source"]: item for item in metadata}
# endpoint_metadata: lower-cased "endpoint" value -> that endpoint's entry,
# collected from every item's optional "endpoints" list
endpoint_metadata = {}
for item in metadata:
    for endpoint in item.get("endpoints", []):
        endpoint_metadata[endpoint["endpoint"].lower()] = endpoint

In [22]:
# Function to update node properties
def update_node_properties(tx, label, name, properties):
    """Add ``properties`` to the node that has the given label and name.

    ``label`` is inserted directly into the Cypher text rather than sent as a
    query parameter, so only trusted label names should be passed.

    Args:
        tx: Object whose ``run`` method executes the statement.
        label: Node label to match.
        name: Value of the node's ``name`` property.
        properties: Mapping applied to the node with ``SET n += $properties``.
    """
    template = """
    MATCH (n:{label} {{name: $name}})
    SET n += $properties
    """
    cypher = template.format(label=label)
    tx.run(cypher, name=name, properties=properties)

# Update graph nodes with the descriptive fields from the colleges metadata
try:
    # Create session and update nodes (one write transaction per node)
    with driver.session() as session:
        # Update source nodes; the node is looked up by the lower-cased source key
        for source, data in source_metadata.items():
            session.execute_write(
                update_node_properties, 
                "Source", 
                source.lower(), 
                {"description": data.get("description"), "url": data.get("url")}
            )

        # Update endpoint nodes; the node is looked up by the lower-cased endpoint key
        for endpoint, data in endpoint_metadata.items():
            session.execute_write(
                update_node_properties, 
                "Endpoint", 
                endpoint.lower(), 
                {"description": data.get("description"), "data_range": data.get("data_range")}
            )
finally:
    # Close the driver after all operations are complete (runs even if an update fails)
    # NOTE(review): this closes the notebook-wide `driver`; re-create it before running further queries
    driver.close()

  with driver.session() as session:
