In [1]:
# Install required libraries (uncomment and run if not already installed)
# !pip install pandas neo4j

# Import necessary libraries
import pandas as pd
from neo4j import GraphDatabase


## 2. Load the Files

In [54]:
# File paths for variables metadata (by decade)
variables_file_paths = {
    '1980s': "./data/census_metadata_1980s.csv",
    '1990s': "./data/census_metadata_1990s.csv",
    '2000s': "./data/census_metadata_2000s.csv",
    '2010s': "./data/census_metadata_2010s.csv",
    '2020s': "./data/census_metadata_2020s.csv",
    'UNKN' : "./data/census_metadata_Unknown.csv"
}

# Function to load datasets and variables metadata
def load_datasets_metadata():
    # Load the datasets metadata (this is the combined file for all decades)
    datasets_metadata = pd.read_csv(datasets_file_path)
    print("Datasets Metadata Loaded:")
    display(datasets_metadata.head(1))
    return datasets_metadata

def load_variables_metadata():
    # Load the variables metadata for each decade and concatenate them into one dataframe
    all_variables_metadata = []
    for decade, file_path in variables_file_paths.items():
        variables_metadata = pd.read_csv(file_path, low_memory=False)
        print(f"Variables Metadata for {decade} Loaded:")
        display(variables_metadata.head(1))

        # Handle 'Unknown' year values specifically for the UNKN dataset
        # When creating the Knowledge Graph and relationships, Year is important
        # We'll be able to still use this data to enrich our KG at the variable level
        
        # If it's the 'UNKN' dataset, replace 'Unknown' year with '9999'
        if decade == 'UNKN':
            # Ensure the 'year' column is treated as a string before replacing 'Unknown'
            variables_metadata['year'] = variables_metadata['year'].astype(str)
            variables_metadata['year'] = variables_metadata['year'].replace('Unknown', '9999')

        # Handle 'Unknown' year values specifically for the UNKN dataset
        if decade == 'UNKN':
            # Ensure the 'year' column is treated as a string before replacing 'Unknown'
            variables_metadata['year'] = variables_metadata['year'].astype(str)
            variables_metadata['year'] = variables_metadata['year'].replace('Unknown', '9999')  # Placeholder year

        
        all_variables_metadata.append(variables_metadata)

    # Combine all variables metadata into one dataframe
    combined_variables_metadata = pd.concat(all_variables_metadata, ignore_index=True)
    
    # Ensure the 'year' column is numeric and handle any 'NaN' values
    combined_variables_metadata['year'] = pd.to_numeric(combined_variables_metadata['year'], errors='coerce')
    combined_variables_metadata['year'].fillna(9999)  # Replace NaNs with 9999
    combined_variables_metadata['year'] = combined_variables_metadata['year'].astype(int)  # Convert to integer type
    
    return combined_variables_metadata

# Load both datasets and variables metadata
datasets_metadata = load_datasets_metadata()
combined_variables_metadata = load_variables_metadata()

# Display combined datasets and variables metadata
print("Combined Datasets Metadata:")
display(datasets_metadata.head(1))

print("Combined Variables Metadata:")
display(combined_variables_metadata.head(1))

print("Data ingest completed...")

# Save the cleaned data to a new CSV file (optional, for later use)
combined_variables_metadata.to_csv('./data/combined_variables_metadata.csv', index=False)
datasets_metadata.to_csv('./data/cleaned_datasets_metadata.csv', index=False)


Datasets Metadata Loaded:


Unnamed: 0,dataset_name,year,title,description,identifier,contact,access_level,modified,publisher,references,keywords
0,cps,1994,Jun 1994 Current Population Survey: Basic Monthly,"To provide estimates of employment, unemployme...",https://api.census.gov/data/id/CPSBASIC199406,CPS Staff,public,2019-10-09 15:05:36.0,U.S. Census Bureau,https://www.census.gov/developers/,census


Variables Metadata for 1980s Loaded:


Unnamed: 0,dataset_name,year,title,variable_name,label,concept,predicateType,group,limit,attributes,decade
0,cbp,1986,1986 County Business Patterns: Business Patterns,for,Census API FIPS 'for' clause,Census API Geography Specification,fips-for,,0,,1980s


Variables Metadata for 1990s Loaded:


Unnamed: 0,dataset_name,year,title,variable_name,label,concept,predicateType,group,limit,attributes,decade
0,cps/basic/jun,1994,Jun 1994 Current Population Survey: Basic Monthly,for,Census API FIPS 'for' clause,Census API Geography Specification,fips-for,,0,,1990s


Variables Metadata for 2000s Loaded:


Unnamed: 0,dataset_name,year,title,variable_name,label,concept,predicateType,group,limit,attributes,decade
0,cbp,2000,2000 County Business Patterns: Business Patterns,for,Census API FIPS 'for' clause,Census API Geography Specification,fips-for,,0,,2000s


Variables Metadata for 2010s Loaded:


Unnamed: 0,dataset_name,year,title,variable_name,label,concept,predicateType,group,limit,attributes,decade
0,cbp,2012,Annual Economic Surveys: Business Patterns: Co...,for,Census API FIPS 'for' clause,Census API Geography Specification,fips-for,,0,,2010s


Variables Metadata for 2020s Loaded:


Unnamed: 0,dataset_name,year,title,variable_name,label,concept,predicateType,group,limit,attributes,decade
0,cps/basic/jan,2021,Current Population Survey: Basic Monthly,for,Census API FIPS 'for' clause,Census API Geography Specification,fips-for,,0,,2020s


Variables Metadata for UNKN Loaded:


Unnamed: 0,dataset_name,year,title,variable_name,label,concept,predicateType,group,limit,attributes,decade
0,http://api.census.gov/data/timeseries/asm/stat...,Unknown,Time Series Annual Survey of Manufactures: Sta...,for,Census API FIPS 'for' clause,Census API Geography Specification,fips-for,,0,,Unknown


Combined Datasets Metadata:


Unnamed: 0,dataset_name,year,title,description,identifier,contact,access_level,modified,publisher,references,keywords
0,cps,1994,Jun 1994 Current Population Survey: Basic Monthly,"To provide estimates of employment, unemployme...",https://api.census.gov/data/id/CPSBASIC199406,CPS Staff,public,2019-10-09 15:05:36.0,U.S. Census Bureau,https://www.census.gov/developers/,census


Combined Variables Metadata:


Unnamed: 0,dataset_name,year,title,variable_name,label,concept,predicateType,group,limit,attributes,decade
0,cbp,1986,1986 County Business Patterns: Business Patterns,for,Census API FIPS 'for' clause,Census API Geography Specification,fips-for,,0,,1980s


Data ingest completed...


# 3. Clean the Data

## **Data Structuring and Relationship Design**

### **Overview**
The goal of structuring the data is to create a hierarchical and navigable knowledge graph that balances usability and precision. This involves grouping repeated surveys into logical parent-child relationships while maintaining links to their associated variables and years.

### **Key Design Decisions**
1. **Parent-Child Relationships for Datasets:**
   - **Why:** Many surveys are conducted multiple times per year, with variations in their metadata. A parent-child hierarchy simplifies navigation for users querying high-level information while retaining granularity for specific queries.
   - **How:** We use the `parent_dataset` field to represent the high-level grouping (e.g., `cps` for the Current Population Survey) and `dataset_name` for specific instances (e.g., `cps/basic/jan` for the January survey).

2. **Linking Variables to Child Datasets:**
   - **Why:** Each dataset instance includes specific variables. Establishing this connection allows users to query datasets for their variables or find which datasets a variable belongs to.
   - **How:** We create `Variable` nodes linked to `ChildDataset` nodes via an `INCLUDES` relationship.

3. **Year-Based Relationships:**
   - **Why:** Many datasets are time-specific. Linking datasets to their respective years ensures queries can filter datasets by year and handle temporal questions like, "What data is available for 1986?"
   - **How:** We create `Year` nodes and connect them to `ChildDataset` nodes via a `BELONGS_TO_YEAR` relationship.

### **Graph Schema**
Here is the schema we use to represent the relationships:
- **ParentDataset**: Represents high-level groupings of surveys (e.g., `cps`, `cbp`).
  - **Relationships:**
    - `PARENT_OF` → `ChildDataset`
- **ChildDataset**: Represents individual survey instances (e.g., `cps/basic/jan`).
  - **Relationships:**
    - `INCLUDES` → `Variable`
    - `BELONGS_TO_YEAR` → `Year`
- **Variable**: Represents specific data variables (e.g., `employment_status`).
- **Year**: Represents the temporal context for datasets (e.g., `1986`).

### **Why This Structure?**
This design ensures:
- **Scalability**: Easily add new datasets, variables, and years.
- **Usability**: Queries can target high-level overviews or specific details.
- **Flexibility**: Supports both general and granular user queries.


In [78]:
# Cleaning datasets metadata
cleaned_datasets = datasets_metadata.drop_duplicates().dropna(subset=['dataset_name'])
cleaned_datasets = cleaned_datasets[['dataset_name', 'year', 'title', 'description']]

# Handle 'Unknown' and NaN in 'year' column and replace with 9999 for missing years
cleaned_datasets['year'] = pd.to_numeric(cleaned_datasets['year'], errors='coerce')  # Coerce invalid values to NaN
cleaned_datasets['year'] = cleaned_datasets['year'].fillna(9999)  # Replace NaNs with 9999 for 'Unknown' years

# Cleaning variables metadata
cleaned_variables = variables_metadata.drop_duplicates().dropna(subset=['dataset_name'])
cleaned_variables = cleaned_variables[
    ['dataset_name', 'year', 'variable_name', 'label', 'concept', 'predicateType']
]

# Handle 'Unknown' and NaN in 'year' column and replace with 9999 for missing years
cleaned_variables['year'] = pd.to_numeric(cleaned_variables['year'], errors='coerce')  # Coerce invalid values to NaN
cleaned_variables['year'] = cleaned_variables['year'].fillna(9999)  # Replace NaNs with 9999 for 'Unknown' years

# Create parent-child relationships
cleaned_variables['parent_dataset'] = cleaned_variables['dataset_name'].str.split('/').str[0]
cleaned_datasets['parent_dataset'] = cleaned_datasets['dataset_name'].str.split('/').str[0]

# Group parent-to-child relationships
parent_to_child = cleaned_datasets[['parent_dataset', 'dataset_name', 'year']].drop_duplicates()
child_to_variable = cleaned_variables[['dataset_name', 'variable_name', 'label', 'concept', 'predicateType']]

# Clean the 'year' entries in child_to_year to remove duplicates (e.g., duplicates of 9999)
cleaned_datasets = child_to_year.drop_duplicates(subset=['dataset_name', 'year'])
child_to_year = cleaned_datasets  # Update the original DataFrame

# Display cleaned data for verification
print("Parent to Child Relationships:")
display(parent_to_child.head())

print("Child to Variable Relationships:")
display(child_to_variable.head())

print("Child to Year Relationships:")
display(child_to_year.head())

print("Data cleaning completed...")


Parent to Child Relationships:


Unnamed: 0,parent_dataset,dataset_name,year
0,cps,cps,1994
1,basic,basic,1994
2,jun,jun,1994
3,cbp,cbp,1986
4,zbp,zbp,1994


Child to Variable Relationships:


Unnamed: 0,dataset_name,variable_name,label,concept,predicateType
0,cbp,for,Census API FIPS 'for' clause,Census API Geography Specification,fips-for
1,cbp,in,Census API FIPS 'in' clause,Census API Geography Specification,fips-in
2,cbp,ESTAB_F,Flag for Total number of Establishments,Employer Statistics,int
3,cbp,GEO_TTL,Title of Geography,Geographic Characteristics,
4,cbp,ST,FIPS State Code,Selectable Geographies,


Child to Year Relationships:


Unnamed: 0,dataset_name,year
0,cps,1994
1,basic,1994
2,jun,1994
3,cbp,1986
4,zbp,1994


Data cleaning completed...


# 4. Connect to Neo4j
- This cell sets up the Neo4j connection.

In [56]:
from neo4j import GraphDatabase

# Initialize Neo4j connection (replace with your credentials)
neo4j_uri = "bolt://localhost:7687"  # Update with your Neo4j URI
neo4j_user = "neo4j"
neo4j_password = "password"  # Update with your password

try:
    # Create a driver instance
    driver = GraphDatabase.driver(neo4j_uri, auth=(neo4j_user, neo4j_password))
    
    # Test connection by opening a session and executing a simple query
    with driver.session() as session:
        session.run("RETURN 1")  # Simple query to check connection
    print("Connection successful.")
except Exception as e:
    print(f"Connection failed: {e}")


Connection successful.


# 5. Ingest Data into Neo4j
- This cell contains the ingestion logic.

## Clear the Neo4J dB if necessary
To delete all the data in your Neo4j database, you can use the following Cypher query, which will remove all nodes and relationships: \
> MATCH (n) \
> DETACH DELETE n

### Explanation:
- MATCH (n): This matches all nodes in the graph.
- DETACH DELETE n: This deletes the nodes and any relationships attached to them.

### How to Run:
1. Open your Neo4j browser or a Neo4j client.
1. Paste the query and execute it.

This will completely clear your Neo4j database of all nodes and relationships, giving you a fresh starting point for your new data ingestion.


In [80]:
# Ensure the driver is opened before running queries
driver = GraphDatabase.driver(neo4j_uri, auth=(neo4j_user, neo4j_password))

def run_cypher_query(query, parameters=None):
    with driver.session() as session:
        session.run(query, parameters)

try:
    # Step 1: Create Parent Dataset Nodes
    print("Step 1: Creating Parent Dataset Nodes...")
    for _, row in parent_to_child.iterrows():
        query = """
        MERGE (p:ParentDataset {name: $parent_dataset})
        """
        run_cypher_query(query, {"parent_dataset": row['parent_dataset']})

    # Step 2: Link Child Datasets to Parents (with year)
    processed_count = 0
    for _, row in parent_to_child.iterrows():
        query = """
        MERGE (c:ChildDataset {name: $child_dataset})
        SET c.year = $year
        MERGE (p:ParentDataset {name: $parent_dataset})
        MERGE (p)-[:PARENT_OF]->(c)
        """
        run_cypher_query(query, {
            "child_dataset": row['dataset_name'],
            "parent_dataset": row['parent_dataset'],
            "year": row['year']
        })
        processed_count += 1
        if processed_count % 500 == 0:
            print(f"Processed {processed_count} rows so far...")
    
    print(f"Finished linking {processed_count} ChildDataset nodes to ParentDataset nodes.")

    # Step 2.5: Create Year Nodes (including year 9999 if necessary)
    print("Step X: Creating Year Nodes...")
    unique_years = child_to_year['year'].dropna().unique()
    
    # Make sure to include 9999 if it's not already in the unique years
    unique_years = list(set(unique_years).union({9999}))
    
    for year in unique_years:
        query = """
        MERGE (y:Year {value: $year})
        """
        try:
            with driver.session() as session:
                session.run(query, {"year": year})
                print(f"Created/Merged Year node for {year}")
        except Exception as e:
            print(f"Error creating Year node for {year}: {e}")
    
    print(f"Finished creating {len(unique_years)} Year nodes.")
        
    # Step 3: Link Variables to Child Datasets
    print("Step 3: Linking Variables to Child Datasets...")
    # Handle missing values in predicateType explicitly
    child_to_variable.loc[:, 'predicateType'] = child_to_variable['predicateType'].fillna("Unknown")
    for _, row in child_to_variable.iterrows():
        query = """
        MERGE (v:Variable {
            name: $variable_name,
            label: $label,
            concept: $concept,
            predicateType: $predicateType
        })
        MERGE (c:ChildDataset {name: $dataset_name})
        MERGE (c)-[:INCLUDES]->(v)
        """
        run_cypher_query(query, {
            "variable_name": row['variable_name'],
            "label": row['label'] if pd.notna(row['label']) else "Unknown",
            "concept": row['concept'] if pd.notna(row['concept']) else "Unknown",
            "predicateType": row['predicateType'],  # Already cleaned
            "dataset_name": row['dataset_name']
        })

    # Step 4: Link Child Datasets to Years (including year 9999)
    print("Step 4: Linking Child Datasets to Years...")
    processed_count = 0
    for _, row in child_to_year.iterrows():
        query = """
        MERGE (y:Year {value: $year})
        MERGE (c:ChildDataset {name: $dataset_name})
        MERGE (c)-[:BELONGS_TO_YEAR]->(y)
        """
        run_cypher_query(query, {
            "dataset_name": row['dataset_name'],
            "year": row['year']
        })
        processed_count += 1
        if processed_count % 500 == 0:
            print(f"Processed {processed_count} rows so far...")
    
    print(f"Finished linking {processed_count} ChildDataset nodes to Year nodes.")

finally:
    # Step 5: Close the driver after all operations
    print("Closing the Neo4j driver...")
    driver.close()
    print("Data ingestion complete!")


Step 1: Creating Parent Dataset Nodes...
Processed 500 rows so far...
Processed 1000 rows so far...
Processed 1500 rows so far...
Finished linking 1555 ChildDataset nodes to ParentDataset nodes.
Step X: Creating Year Nodes...
Created/Merged Year node for 9999
Created/Merged Year node for 1986
Created/Merged Year node for 1987
Created/Merged Year node for 1988
Created/Merged Year node for 1989
Created/Merged Year node for 1990
Created/Merged Year node for 1991
Created/Merged Year node for 1992
Created/Merged Year node for 1993
Created/Merged Year node for 1994
Created/Merged Year node for 1995
Created/Merged Year node for 1996
Created/Merged Year node for 1997
Created/Merged Year node for 1998
Created/Merged Year node for 1999
Created/Merged Year node for 2000
Created/Merged Year node for 2001
Created/Merged Year node for 2002
Created/Merged Year node for 2003
Created/Merged Year node for 2004
Created/Merged Year node for 2005
Created/Merged Year node for 2006
Created/Merged Year node f

Unique years: [1994 1986 1987 1995 1988 1989 1990 1997 1996 2012 1991 1992 1993 1998
 1999 2000 2001 2002 2013 2003 2010 2004 2005 2006 2007 2008 2021 2009
 2011 2020 2019 2014 2015 2016 2017 2018 2022 2023 2024]
