In [1]:
# Required libraries. Uncomment the lines below to install them; `%pip` (rather
# than `!pip`) installs into the environment of the running kernel.
# %pip install pandas
# %pip install graphrag
# %pip install networkx
# %pip install python-louvain
# %pip install matplotlib
# %pip install pyvis
# %pip install openai
# %pip install python-dotenv
# %pip install neo4j

In [84]:
# Section 1: Import Required Libraries
import pandas as pd
import networkx as nx
from pathlib import Path
from community import community_louvain  # For clustering
import matplotlib.pyplot as plt
from pyvis.network import Network
import openai
from openai import OpenAI
import os
from dotenv import load_dotenv
from neo4j import GraphDatabase

In [85]:
# Section 2: Load Environment Variables and Neo4j Connection Settings
# - Load the OpenAI API key and the Neo4j connection settings from `.env`
# - Only settings are defined here; the Neo4j driver is created by the cells
#   that actually talk to the database

# Load environment variables from the .env file in the working directory
env_path = Path('.') / '.env'
load_dotenv(dotenv_path=env_path)

# Fail fast when the OpenAI API key is missing
openai_api_key = os.getenv('OPENAI_API_KEY')
if not openai_api_key:
    raise ValueError("No OpenAI API key found. Please set the OPENAI_API_KEY environment variable.")

# Neo4j connection settings. NEO4J_URI / NEO4J_USER / NEO4J_PASSWORD take
# precedence when set (and non-empty); otherwise the local-development
# defaults below are used, so real credentials never need to be written into
# the notebook.
neo4j_uri = os.getenv('NEO4J_URI') or "bolt://localhost:7687"
neo4j_user = os.getenv('NEO4J_USER') or "neo4j"
neo4j_password = os.getenv('NEO4J_PASSWORD') or "password"

In [89]:
# Section 3: Load Existing Graph or Create a New One
# - Ask whether a previously exported graph should be reused; otherwise read
#   the metadata files a new graph is built from
load_choice = input("Do you want to load an existing graph? (yes/no): ")
load_existing_graph = load_choice.strip().lower() == 'yes'

if not load_existing_graph:
    # Section 4: Load and Combine Metadata
    # - One CSV per decade of census metadata, plus the datasets metadata
    census_metadata_1980s = pd.read_csv('./census_metadata_1980s.csv')
    census_metadata_1990s = pd.read_csv('./census_metadata_1990s.csv')
    census_metadata_2000s = pd.read_csv('./census_metadata_2000s.csv', low_memory=False)
    census_metadata_2010s = pd.read_csv('./census_metadata_2010s.csv')
    census_metadata_2020s = pd.read_csv('./census_metadata_2020s.csv')
    census_metadata_unknown = pd.read_csv('./census_metadata_Unknown.csv')
    datasets_metadata = pd.read_csv('./datasets_metadata.csv')

    # Stack every frame into a single dataframe with a fresh 0..n-1 index
    metadata_frames = [
        census_metadata_1980s,
        census_metadata_1990s,
        census_metadata_2000s,
        census_metadata_2010s,
        census_metadata_2020s,
        census_metadata_unknown,
        datasets_metadata,
    ]
    data_metadata_combined = pd.concat(metadata_frames, ignore_index=True)
else:
    # Reuse a graph that was exported to GML earlier
    graph_file = input("Enter the path of the graph file to load (e.g., 'SurveyKnowledgeGraph.gml'): ").strip()
    knowledge_graph = nx.read_gml(graph_file)
    print(f"Loaded graph from {graph_file}")

Do you want to load an existing graph? (yes/no):  no


In [90]:
    # Section 5: Initialize Knowledge Graph
    # - Create an empty NetworkX graph, but only when a new graph is being built.
    #   This cell runs on its own (it is not inside the if/else of Section 3),
    #   so without the check it would replace a graph loaded from file with an
    #   empty one.
    if not load_existing_graph:
        knowledge_graph = nx.Graph()

In [91]:
    # Section 6: Add Nodes and Edges to the Graph
    # - Add datasets and variables as nodes
    # - Create edges between datasets and related variables
    # - Skipped when an existing graph was loaded: the metadata frames are only
    #   read when a new graph is being built, so this cell would otherwise fail
    #   with a NameError
    if not load_existing_graph:
        # Columns read with row[...]: a missing column raises KeyError
        required_dataset_fields = [
            'title', 'year', 'description', 'contact', 'access_level',
            'modified', 'publisher', 'references', 'keywords',
        ]
        # Columns read with row.get(...): a missing column is stored as None
        optional_dataset_fields = [
            'geographic_coverage', 'survey_method', 'frequency',
            'units_of_analysis', 'target_population', 'data_source',
            'temporal_coverage', 'related_publications',
        ]

        # One 'Dataset' node per metadata row, keyed by the row's identifier
        for _, row in data_metadata_combined.iterrows():
            dataset_id = row['identifier']
            dataset_attributes = {field: row[field] for field in required_dataset_fields}
            for field in optional_dataset_fields:
                dataset_attributes[field] = row.get(field)
            knowledge_graph.add_node(dataset_id, type='Dataset', **dataset_attributes)

        # Variable nodes exist only when the datasets metadata carries a
        # 'VariableID' column; each one is linked to its dataset
        if 'VariableID' in datasets_metadata.columns:
            for _, row in datasets_metadata.iterrows():
                variable_id = row['VariableID']
                dataset_id = row['identifier']
                knowledge_graph.add_node(
                    variable_id,
                    type='Variable',
                    name=row['Name'],
                    description=row['Description'],
                    data_type=row['DataType']
                )
                knowledge_graph.add_edge(dataset_id, variable_id, relation='CONTAINS')

In [92]:
    # Section 7: Add Decades as Nodes and Link Datasets
    # - Add nodes representing decades and link datasets to them
    # - Skipped when an existing graph was loaded: the combined metadata frame
    #   is only created when a new graph is being built
    if not load_existing_graph:
        year_column = data_metadata_combined['year']
        for raw_year in year_column.unique():
            if pd.isna(raw_year):
                continue
            try:
                year = int(raw_year)
            except (ValueError, TypeError, OverflowError):
                print(f"Skipping invalid year value: {raw_year}")
                continue

            decade_str = f"{(year // 10) * 10}s"
            knowledge_graph.add_node(decade_str, type='Decade')

            # Select rows by the raw value rather than the converted int: a
            # year stored as the string "1995" never compares equal to the
            # integer 1995, so filtering on the int would leave those datasets
            # without a decade edge.
            matching_ids = data_metadata_combined.loc[year_column == raw_year, 'identifier']
            for dataset_id in matching_ids:
                knowledge_graph.add_edge(decade_str, dataset_id, relation='BELONGS_TO')

Skipping invalid year value: Unknown


In [93]:
from neo4j import GraphDatabase

# Function to clear all records in the Neo4j database
def clear_neo4j_database(uri, user, password):
    """Delete every node and relationship in a Neo4j database after confirmation.

    The user is prompted on stdin; the deletion only runs when the answer is
    "yes" (case-insensitive, surrounding whitespace ignored). Any other answer
    cancels the operation without opening a connection.

    Args:
        uri: Connection URI passed to ``GraphDatabase.driver``.
        user: Username for authentication.
        password: Password for authentication.

    Raises:
        Whatever the driver raises while connecting or running the query; the
        driver is closed before the exception propagates.
    """
    # Asking for user confirmation
    confirmation = input("Are you sure you want to clear all records in the Neo4j database? This action is irreversible. (yes/no): ")

    if confirmation.strip().lower() != 'yes':
        print("Operation canceled. No records were deleted.")
        return

    # Cypher query to delete all nodes and relationships
    clear_query = """
        MATCH (n)
        DETACH DELETE n
        """

    driver = GraphDatabase.driver(uri, auth=(user, password))
    try:
        with driver.session() as session:
            session.run(clear_query)
    finally:
        # Release the connection pool even when the query fails
        driver.close()

    print("All records cleared from the database.")

# Clear the database with confirmation.
# The connection settings (neo4j_uri, neo4j_user, neo4j_password) are the ones
# defined in Section 2; they are reused here instead of being repeated as
# literals so that the credentials are maintained in a single place.
clear_neo4j_database(neo4j_uri, neo4j_user, neo4j_password)


Are you sure you want to clear all records in the Neo4j database? This action is irreversible. (yes/no):  yes


All records cleared from the database.


In [94]:
    # Section 8: Perform Clustering and Save Graph
    # - Use Louvain method to cluster nodes
    # - Save the graph to a GML file (this cell does not write to Neo4j)

    # Louvain is randomized; a fixed random_state keeps the cluster assignment
    # identical across re-runs of the notebook.
    partition = community_louvain.best_partition(knowledge_graph, random_state=42)
    for node, cluster in partition.items():
        knowledge_graph.nodes[node]['cluster'] = cluster

    # The stringizer is used for values GML cannot store natively; None becomes ''
    nx.write_gml(knowledge_graph, 'SurveyKnowledgeGraph.gml', stringizer=lambda x: str(x) if x is not None else '')
    print("Knowledge graph created and exported successfully!")

Knowledge graph created and exported successfully!


In [95]:
from neo4j import GraphDatabase
from neo4j.exceptions import ServiceUnavailable

# neo4j_uri = "bolt://localhost:7687"
# neo4j_user = "neo4j"
# neo4j_password = "password"


# Connection details are taken from the neo4j_uri / neo4j_user /
# neo4j_password variables assigned earlier in the notebook
URI = neo4j_uri
AUTH = (neo4j_user, neo4j_password)

# Module-level driver; verify_connectivity() reads it as a global
driver = GraphDatabase.driver(URI, auth=AUTH)

def verify_connectivity():
    """Check that the module-level ``driver`` can query the ``neo4j`` database.

    Runs a node-count query and prints either the number of nodes found or a
    note that the database is empty. A ``ServiceUnavailable`` error is reported
    on stdout instead of being raised.

    The driver is closed before returning, whether or not the check succeeded,
    so a new driver has to be created for any later database work.
    """
    try:
        with driver.session(database="neo4j") as session:  # Specify database name
            record = session.run("MATCH (n) RETURN count(n) as count").single()
            # count(n) yields a row even for an empty database, so emptiness
            # has to be read from the counted value, not from the row's presence
            node_count = record["count"] if record else 0
            if node_count:
                print(f"Connection successful! Found {node_count} nodes.")
            else:
                print("Connection successful but database is empty.")
    except ServiceUnavailable as e:
        print(f"Error connecting to Neo4j database: {e}")
    finally:
        # Always close the driver connection when done
        driver.close()

verify_connectivity()

Connection successful! Found 0 nodes.


In [96]:
import uuid
import pandas as pd
from neo4j import GraphDatabase
from neo4j.exceptions import ServiceUnavailable

# Connection details are taken from the neo4j_uri / neo4j_user /
# neo4j_password variables assigned earlier in the notebook, so they are not
# repeated here as literals
URI = neo4j_uri
AUTH = (neo4j_user, neo4j_password)

# Attempt to connect to Neo4j.
# Creating the driver object does not contact the server, so
# verify_connectivity() is called to make sure the success message reflects an
# actual connection.
try:
    driver = GraphDatabase.driver(URI, auth=AUTH)
    driver.verify_connectivity()
    print("Connected to Neo4j successfully.")
except Exception as e:
    print(f"Failed to connect to Neo4j: {e}")

# Function to replace NaN values with a default value in node data
def replace_nan_with_default(data):
    """Replace missing property values in ``data`` with the string "Unknown".

    A value counts as missing when it is a scalar for which ``pd.isna`` is
    true (``None``, ``NaN``, ...). List-like values are left untouched.

    Args:
        data: Mapping of property name to value. It is modified in place.

    Returns:
        The same ``data`` object that was passed in.
    """
    for key, value in data.items():
        # pd.isna() returns an element-wise array for list-like input, which
        # cannot be used as an `if` condition, so only scalars are checked.
        if pd.api.types.is_scalar(value) and pd.isna(value):
            data[key] = "Unknown"
    return data

# Function to save nodes and relationships to Neo4j
def save_to_neo4j(tx, graph):
    """Write every node and edge of ``graph`` to Neo4j through ``tx``.

    Each graph node becomes a ``:Node`` with ``id``, ``type``, ``title``,
    ``year`` and ``description`` properties; each edge becomes a
    ``RELATES_TO`` relationship carrying a ``relation`` property. Progress and
    failures are printed; a failing statement is reported and the loop moves on.

    Args:
        tx: Object exposing ``run(query, **params)`` (a Neo4j transaction).
        graph: Graph exposing ``nodes(data=True)`` and ``edges(data=True)``.
            ``None`` or an empty graph is reported and nothing is written.
    """
    if graph is None or len(graph) == 0:
        print("The knowledge_graph is empty. Nothing to save.")
        return

    print(f"Saving nodes and relationships for graph with {len(graph.nodes)} nodes and {len(graph.edges)} relationships.")

    def _is_missing(identifier):
        # None and NaN-like identifiers cannot be used as a node id
        return identifier is None or pd.isna(identifier)

    node_query = "CREATE (n:Node {id: $id, type: $type, title: $title, year: $year, description: $description})"
    relationship_query = (
        "MATCH (a:Node {id: $source}), (b:Node {id: $target}) "
        "CREATE (a)-[:RELATES_TO {relation: $relation}]->(b)"
    )
    node_fields = ('type', 'title', 'year', 'description')

    # --- nodes ---
    for node_id, attributes in graph.nodes(data=True):
        if _is_missing(node_id):
            # Give nodes without a usable id a fresh unique one
            node_id = str(uuid.uuid4())
            print(f"Generated new unique id: {node_id} for node with data: {attributes}")

        # Missing property values are replaced before the node is written
        attributes = replace_nan_with_default(attributes)

        print(f"Creating node with id: {node_id}, data: {attributes}")  # Debug statement
        try:
            node_params = {field: attributes.get(field) for field in node_fields}
            tx.run(node_query, id=node_id, **node_params)
        except Exception as error:
            print(f"Failed to create node with id {node_id}: {error}")

    # --- relationships ---
    for source, target, attributes in graph.edges(data=True):
        endpoints_usable = not (_is_missing(source) or _is_missing(target))
        if not endpoints_usable:
            print(f"Skipping relationship from {source} to {target} due to invalid node ID.")  # Debug statement for invalid relationships
            continue

        print(f"Creating relationship from {source} to {target} with relation: {attributes.get('relation')}")  # Debug statement
        try:
            tx.run(
                relationship_query,
                source=source,
                target=target,
                relation=attributes.get('relation', "RELATES_TO"),
            )
        except Exception as error:
            print(f"Failed to create relationship from {source} to {target}: {error}")

# Function to validate node import in Neo4j
def validate_neo4j_import():
    """Print how many ``:Node`` nodes exist in Neo4j, plus one sample node.

    Uses the module-level ``driver`` and queries the ``neo4j`` database, the
    same database ``main`` writes to. Any error is reported on stdout instead
    of being raised.
    """
    try:
        # Name the database explicitly so validation reads what main() wrote
        with driver.session(database="neo4j") as session:
            # Count nodes
            node_count = session.run("MATCH (n:Node) RETURN count(n) AS count").single()["count"]
            print(f"Number of nodes in Neo4j: {node_count}")

            # Verify a sample node
            sample_node = session.run("MATCH (n:Node) RETURN n LIMIT 1").single()
            if sample_node:
                print(f"Sample node: {sample_node['n']}")
            else:
                print("No nodes found in Neo4j for verification.")
    except Exception as e:
        print(f"Failed to validate nodes in Neo4j: {e}")

# Function to validate relationships in Neo4j
def validate_neo4j_relationships():
    """Print how many ``RELATES_TO`` relationships exist in Neo4j.

    Uses the module-level ``driver`` and queries the ``neo4j`` database, the
    same database ``main`` writes to. Any error is reported on stdout instead
    of being raised.
    """
    try:
        # Name the database explicitly so validation reads what main() wrote
        with driver.session(database="neo4j") as session:
            # Count relationships
            relationship_count = session.run("MATCH ()-[r:RELATES_TO]->() RETURN count(r) AS count").single()["count"]
            if relationship_count > 0:
                print(f"Relationships successfully created: {relationship_count}")
            else:
                print("No relationships were created. Please verify if nodes are correctly linked.")
    except Exception as e:
        print(f"Failed to validate relationships in Neo4j: {e}")

# Main function to handle Neo4j operations
def main(knowledge_graph):
    """Save ``knowledge_graph`` to Neo4j, then validate what was written.

    The graph is written in a single write transaction on the ``neo4j``
    database via ``save_to_neo4j``; afterwards the node and relationship
    validators run. Any exception is reported on stdout rather than raised.

    The module-level ``driver`` is closed at the end in every case, so it has
    to be re-created before ``main`` can be called again.

    Args:
        knowledge_graph: The graph handed to ``save_to_neo4j``.
    """
    try:
        # Step 1: persist nodes and relationships
        with driver.session(database="neo4j") as neo4j_session:
            neo4j_session.execute_write(save_to_neo4j, knowledge_graph)
        print("Knowledge graph saved to Neo4j successfully!")

        # Step 2: check the imported nodes, then the relationships
        validate_neo4j_import()
        validate_neo4j_relationships()
    except Exception as error:
        print(f"An error occurred during Neo4j operations: {error}")
    finally:
        # Step 3: release the driver regardless of the outcome
        driver.close()
        print("Neo4j driver closed.")

# Example usage (make sure 'knowledge_graph' is defined before calling)
# main(knowledge_graph)


Connected to Neo4j successfully.


In [98]:
# Section 9: Ask Questions Using OpenAI (Before Filtering)
# - Use OpenAI GPT-4o to answer a free-text question
# - NOTE(review): ask_openai_question sends only the question text to the
#   model; nothing from knowledge_graph is added to the prompt in this cell
client = OpenAI()  # This will use your OPENAI_API_KEY environment variable

def ask_openai_question(question, model="gpt-4o", max_tokens=1500, temperature=0.1):
    """Send a single-turn question to the OpenAI chat API and return the reply.

    Uses the module-level ``client``. The question is sent as the only
    message, with the "user" role.

    Args:
        question: Text of the question.
        model: Chat model to query.
        max_tokens: Upper bound on the length of the generated reply.
        temperature: Sampling temperature.

    Returns:
        The reply text with surrounding whitespace removed, or an empty string
        when the response carries no text content.
    """
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": question}],
        max_tokens=max_tokens,
        temperature=temperature,
        top_p=1.0,
        frequency_penalty=0.5,
        presence_penalty=0.0
    )
    # The message content can be None, which has no .strip()
    content = response.choices[0].message.content
    return (content or "").strip()

# Example question; the string is passed to ask_openai_question unchanged
question = "Generate the API calls to get me the variables to allow annalysis of communit time by age, sex and income for urban areas in maryland with the most recent data "
answer = ask_openai_question(question)
# Echo the question together with the model's reply
print(f"Q: {question}\nA: {answer}")

Q: Generate the API calls to get me the variables to allow annalysis of communit time by age, sex and income for urban areas in maryland with the most recent data 
A: To analyze commute time by age, sex, and income for urban areas in Maryland using the most recent data, you can utilize the U.S. Census Bureau's American Community Survey (ACS) data via their API. Below are the steps and example API calls to retrieve this information:

1. **Identify Variables**: First, identify the relevant variables from the ACS that correspond to commute time, age, sex, and income.

2. **API Endpoint**: The ACS API endpoint is typically structured as follows:
   ```
   https://api.census.gov/data/{year}/acs/acs5
   ```

3. **Select Variables**: You will need to select variables related to:
   - Commute time (e.g., `B08303_001E` for total travel time)
   - Age (e.g., `B01001_001E` for total population by age)
   - Sex (e.g., `B01001_002E` for male population)
   - Income (e.g., `B19013_001E` for median h