# Senzing and Neo4j Integration Example

## Set up the Python environment

First, we need to import the Python library dependencies which are required for the code we'll be running.

In [2]:
from dataclasses import dataclass, field
import csv
import json
import os
import pathlib
import sys
import typing

from icecream import ic
from tqdm import tqdm
import dotenv
import matplotlib.pyplot as plt
import neo4j
import pandas as pd
import seaborn as sns
import watermark

%load_ext watermark

Show a "watermark" of which versions are being used for system components and library dependencies. This may help in case you need to troubleshoot the dependencies on your system, e.g., if there's some conflict during installation.

In [2]:
%watermark
%watermark --iversions

Last updated: 2024-03-21T11:39:00.815526+00:00

Python implementation: CPython
Python version       : 3.11.6
IPython version      : 8.22.2

Compiler    : Clang 15.0.0 (clang-1500.0.40.1)
OS          : Darwin
Release     : 23.2.0
Machine     : x86_64
Processor   : i386
CPU cores   : 16
Architecture: 64bit

sys       : 3.11.6 (main, Oct  2 2023, 13:45:54) [Clang 15.0.0 (clang-1500.0.40.1)]
watermark : 2.4.3
matplotlib: 3.8.3
seaborn   : 0.13.2
neo4j     : 5.18.0
json      : 2.0.9
pandas    : 2.2.1
csv       : 1.0



## Build a Knowledge Graph in Neo4j

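Set up a Bolt driver for Neo4j Desktop, using credentials read from the environment variables `NEO4J_BOLT`, `NEO4J_USER`, and `NEO4J_PASS`, which `dotenv` loads from a local `.env` file.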

In [13]:
# Load the connection settings from the nearest `.env` file into the process
# environment, so that no credentials are hardcoded in this notebook.
dotenv.load_dotenv(dotenv.find_dotenv())

# `os.environ.get()` returns `None` for a variable which is not set
bolt_uri: typing.Optional[ str ] = os.environ.get("NEO4J_BOLT")
username: typing.Optional[ str ] = os.environ.get("NEO4J_USER")
password: typing.Optional[ str ] = os.environ.get("NEO4J_PASS")

# Fail fast with an actionable message, rather than letting an unset
# variable surface later as an obscure driver or authentication error.
missing_vars: typing.List[ str ] = [
    name
    for name, value in [
        ( "NEO4J_BOLT", bolt_uri, ),
        ( "NEO4J_USER", username, ),
        ( "NEO4J_PASS", password, ),
    ]
    if not value
]

if missing_vars:
    raise RuntimeError(
        "missing Neo4j settings in the environment or `.env` file: " + ", ".join(missing_vars)
    )

driver: neo4j.Driver = neo4j.GraphDatabase.driver(
    bolt_uri,
    auth = ( username, password, ),
)

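Optionally delete the previous graph data. The statements in the next cell are commented out to guard against accidental data loss; uncomment them to reset the graph before reloading.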

In [22]:
# Reset step, disabled by default: the three statements below are commented
# out, so running this cell leaves the database untouched (the body is `pass`).
# Uncomment them to delete every node along with its relationships, and to
# drop the `unique_record` and `unique_entity` constraints.
with driver.session() as session:
    #session.run("MATCH (x) DETACH DELETE x")
    #session.run("DROP CONSTRAINT unique_record")
    #session.run("DROP CONSTRAINT unique_entity")
    pass

In [23]:
# Require `uid` to be unique among Record nodes and among Entity nodes.
# Each statement carries `IF NOT EXISTS`, so this cell can be run again
# after the constraints have already been created.
with driver.session() as session:
    for query in (
        """
CREATE CONSTRAINT unique_record 
    IF NOT EXISTS FOR (rec:Record) 
    REQUIRE rec.uid IS UNIQUE
    """,
        """
CREATE CONSTRAINT unique_entity 
    IF NOT EXISTS FOR (ent:Entity) 
    REQUIRE ent.uid IS UNIQUE
    """,
    ):
        session.run(query)

### Populate nodes from the dataset records

Define utility functions used for loading the graph data.

In [9]:
def get_property_keys (
    df: pd.DataFrame,
    ) -> typing.List[ str ]:
    """
Derive Cypher property names from the column labels of a Pandas dataframe:
each label is lowercased and its spaces are replaced by underscores.
The names are returned in column order.
    """
    prop_names: typing.List[ str ] = []

    for col_label in df.columns.values.tolist():
        prop_names.append(col_label.lower().replace(" ", "_"))

    return prop_names


def safe_value (
    obj: typing.Any,
    ) -> typing.Any:
    """
Make a value safe to place between double quotes: within a string, every
double quote is replaced by a single quote. Any value which is not a string
is returned unchanged.
    """
    if not isinstance(obj, str):
        return obj

    # splitting on `"` and re-joining with `'` swaps each occurrence
    return "'".join(obj.split('"'))

In [7]:
def format_merge_record (
    keys: typing.List[ str ],
    vals: list,
    ) -> str:
    """
Format one MERGE statement in Cypher for the values of a given Record.

The node is identified by its `uid`, built as the upper-cased `data_source`,
then a dot, then the `record_id`. Every value is written as a double-quoted
Cypher string literal, so all of the properties get stored as strings, and
they are only set when the node gets created.

`keys` are the property names, which must include `data_source` and
`record_id`; `vals` are the property values, in the same order as `keys`.
Returns the text of the Cypher query.
    """
    def quote (
        val: typing.Any,
        ) -> str:
        # `safe_value()` has already replaced any double quotes; a backslash
        # must be doubled as well, otherwise Cypher would read it as the
        # start of an escape sequence within the string literal
        return '"' + str(val).replace("\\", "\\\\") + '"'

    safe_vals = [ safe_value(v) for v in vals ]
    row_dict: dict = dict(zip(keys, safe_vals))

    # coerce both parts through `str()`, since an identifier which looks
    # numeric may arrive here as a number instead of a string
    uid: str = str(row_dict["data_source"]).upper() + "." + str(row_dict["record_id"])

    props: str = ", ".join([
        f"rec.{key} = {quote(val)}"
        for key, val in row_dict.items()
    ])

    return f"""
MERGE (rec:Record {{ uid: {quote(uid)} }})
  ON CREATE
    SET {props}      
RETURN rec.data_source, rec.record_id
    """


def load_records (
    session: neo4j.Session,
    df: pd.DataFrame,
    ) -> None:
    """
Iterate over each Record from one dataset to load using Cypher.

Each row of the dataframe becomes one MERGE query, which is built by
`format_merge_record()` and then run on the given session. The column
names are converted into property names by `get_property_keys()`.
    """
    keys: typing.List[ str ] = get_property_keys(df)

    # `iterrows()` is a generator which has no length, so pass the row count
    # explicitly to let the progress bar show a percentage and an ETA
    for _, row in tqdm(df.iterrows(), total = len(df), desc = "merge nodes for records"):
        query: str = format_merge_record(keys, row.tolist())
        session.run(query)

In [10]:
# Four toy records from two data sources ("A" and "B"), used to try out
# the loader on a tiny dataframe.
example_data = [
    { "data_source": "A", "record_id": "1", "name": "Alice", "age": 30 },
    { "data_source": "A", "record_id": "2", "name": "Bob", "age": 25 },
    { "data_source": "B", "record_id": "3", "name": "Charlie", "age": 35 },
    { "data_source": "B", "record_id": "4", "name": "David", "age": 40 },
]

example_df = pd.DataFrame(data = example_data)

print(example_df)


  data_source record_id     name  age
0           A         1    Alice   30
1           A         2      Bob   25
2           B         3  Charlie   35
3           B         4    David   40


merge nodes for records: 4it [00:00,  6.36it/s]


In [11]:
# Load the small example dataframe through `load_records()`.
with driver.session() as session:
    load_records(session, example_df)

merge nodes for records: 4it [00:00, 171.51it/s]


In [25]:
# The same four example records, merged this time through a parameterized
# query: each record dict is bound to `$input`, so no values get spliced
# into the Cypher text.
example_data = [
    { "data_source": "A", "record_id": "1", "name": "Alice", "age": 30 },
    { "data_source": "A", "record_id": "2", "name": "Bob", "age": 25 },
    { "data_source": "B", "record_id": "3", "name": "Charlie", "age": 35 },
    { "data_source": "B", "record_id": "4", "name": "David", "age": 40 },
]

with driver.session() as session:
    for record in example_data:
        session.run(
            """
        WITH toUpper($input.data_source) + "." + toString($input.record_id) as uid
        MERGE (rec:Record { uid: uid })
          ON CREATE SET rec += $input  
        RETURN rec.data_source, rec.record_id                
        """,
            input = record,
        )
      

In [20]:
#df = df_poi.head(5)

In [27]:
# Load the `df_poi` dataset as Record nodes.
# NOTE(review): `df_poi` is not defined in the visible part of this notebook —
# confirm that the cell which builds it runs earlier, otherwise a run from a
# fresh kernel fails here.
with driver.session() as session:
    load_records(session, df_poi)

merge nodes for records: 98806it [1:11:58, 22.88it/s]


In [28]:
# Load the `df_dol` dataset as Record nodes.
# NOTE(review): `df_dol` is not defined in the visible part of this notebook — confirm where it gets built.
with driver.session() as session:
    load_records(session, df_dol)

merge nodes for records: 1533it [02:09, 11.80it/s]


In [29]:
# Load the `df_ppp` dataset as Record nodes.
# NOTE(review): `df_ppp` is not defined in the visible part of this notebook — confirm where it gets built.
with driver.session() as session:
    load_records(session, df_ppp)

merge nodes for records: 3488it [02:30, 23.16it/s]


In [46]:
# Spot check: fetch ten Record nodes and print each result row with `ic`.
with driver.session() as session:
    query: str = """
MATCH (rec:Record)
RETURN rec
LIMIT 10
    """
    
    # `ic` labels its output with the argument expression, here `record`
    for record in session.run(query):
        ic(record)

ic| record: <Record rec=<Node element_id='4:0f5270a7-dcca-435d-9c07-1268e61b9b6e:0' labels=frozenset({'Record'}) properties={'naics_code': '541110', 'is_intersection': 'nan', 'brands': '[]', 'business_geo_latitude': '36.145647', 'sub_category': 'Offices of Lawyers', 'location_name_org': 'Cantwell Michelle L Atty', 'category_tags': '[]', 'opened_on': 'nan', 'data_source': 'SAFEGRAPH', 'record_type': 'ORGANIZATION', 'business_addr_country': 'US', 'uid': 'SAFEGRAPH.225-222@5yv-j92-tn5', 'business_addr_full': '3320 W Sahara Ave Las Vegas NV 89102-3223', 'record_id': '225-222@5yv-j92-tn5', 'tracking_closed_since': '2019-07-01', 'business_geo_longitude': '-115.186399', 'placekey': '225-222@5yv-j92-tn5', 'mailing_verified_status': 'VERIFIED_PREMISE', 'closed_on': '2024-02-01', 'top_category': 'Legal Services', 'phone_number': '+17023627800'}>>
ic| record: <Record rec=<Node element_id='4:0f5270a7-dcca-435d-9c07-1268e61b9b6e:1' labels=frozenset({'Record'}) properties={'naics_code': '722513', 'i

### Populate nodes from the Senzing entities

In [39]:
# Merge on `uid` alone, which is the key covered by the `unique_entity`
# constraint, then assign `has_ref` separately. Were `has_ref` part of the
# MERGE pattern, an existing Entity whose `has_ref` had since changed would
# fail to match, and the attempt to create a second node with the same `uid`
# would violate the constraint.
# NOTE(review): `entities` is not defined in the visible part of this
# notebook — confirm that the cell which builds it runs earlier.
query = """
MERGE (ent:Entity {uid: $params.uid})
  SET ent.has_ref = $params.has_ref
"""

with driver.session() as session:
    for entity in tqdm(entities.values(), desc = "merge nodes for entities"):
        params = {
            "uid": entity.entity_id,
            "has_ref": entity.has_ref,
        }

        session.run(
            query,
            params = params,
        )

merge nodes for entities: 100%|████████████████████████████████████████████████████████████████████████| 99156/99156 [46:34<00:00, 35.48it/s]


In [31]:
# Peek at one entity, taking the first value straight from the iterator
# instead of copying every entity into a list beforehand.
entity = next(iter(entities.values()))
ic(entity);

ic| entity: Entity(entity_id=1,
                   num_recs=3,
                   records={'DOL_WHISARD.7874', 'DOL_WHISARD.7631', 'DOL_WHISARD.7789'},
                   related={113: {'ENTITY_ID': 113,
                                  'ERRULE_CODE': 'CNAME',
                                  'IS_AMBIGUOUS': 0,
                                  'IS_DISCLOSED': 0,
                                  'MATCH_KEY': '+NAME',
                                  'MATCH_LEVEL': 2,
                                  'MATCH_LEVEL_CODE': 'POSSIBLY_SAME',
                                  'RECORDS': [{'DATA_SOURCE': 'DOL_WHISARD',
                                               'RECORD_ID': '8898'},
                                              {'DATA_SOURCE': 'DOL_WHISARD',
                                               'RECORD_ID': '8903'},
                                              {'DATA_SOURCE': 'DOL_WHISARD',
                                               'RECORD_ID': '8128'},
               

### Connect the resolved records and related entities

In [49]:
# Link each Entity to the Records which it resolves. MATCH looks up both
# endpoints by `uid`; when either one is absent no row matches, so nothing
# gets merged for that pair.
query = """
MATCH
    (ent:Entity {uid: $params.entity_uid}),
    (rec:Record {uid: $params.record_uid})       
MERGE (ent)-[:RESOLVES]->(rec)
"""

with driver.session() as session:
    for entity in tqdm(entities.values(), desc = "merge entity->record"):
        for record_uid in entity.records:
            params = dict(
                entity_uid = entity.entity_id,
                record_uid = record_uid,
            )

            session.run(query, params = params)

merge entity->record: 100%|███████████████████████████████████████████████████████████████████████████| 99156/99156 [03:55<00:00, 421.28it/s]


In [14]:
# Connect each Entity to its related entities. The relationship is merged on
# its two endpoints only, and its properties are assigned with SET, so that
# re-running this cell updates an existing relationship in place rather than
# adding a duplicate whenever a property value differs.
query = """
MATCH
    (ent:Entity {uid: $params.entity_uid}),
    (rel_ent:Entity {uid: $params.rel_ent})
MERGE (ent)-[rel:RELATED]->(rel_ent)
  SET
    rel.ambiguous = $params.ambiguous,
    rel.disclosed = $params.disclosed,
    rel.match_key = $params.match_key,
    rel.match_level = $params.match_level,
    rel.match_level_code = $params.match_level_code
"""

with driver.session() as session:
    for entity in tqdm(entities.values(), desc = "merge entity->related"):
        # only the values are needed; each one carries its own `ENTITY_ID`
        for rel_ent in entity.related.values():
            params = {
                "entity_uid": entity.entity_id,
                "rel_ent": rel_ent["ENTITY_ID"],
                # the flags arrive as integers, presumably with nonzero meaning
                # "true" (verify against the Senzing output); testing `== 0`
                # would store the opposite of each flag
                "ambiguous": (rel_ent["IS_AMBIGUOUS"] != 0),
                "disclosed": (rel_ent["IS_DISCLOSED"] != 0),
                "match_key": rel_ent["MATCH_KEY"],
                "match_level": rel_ent["MATCH_LEVEL"],
                "match_level_code": rel_ent["MATCH_LEVEL_CODE"],
            }

            session.run(
                query,
                params = params,
            )

merge entity->related: 100%|█████████████████████████████████████████████████████████████████████████| 99156/99156 [2:18:18<00:00, 11.95it/s]


In [24]:
# List the 20 entities which resolve the most records. The `COUNT { ... }`
# subquery counts the RESOLVES relationships from each entity to Record nodes.
with driver.session() as session:
    query = """
MATCH (ent:Entity)
RETURN
    ent.uid, COUNT { (ent)-[:RESOLVES]->(:Record) } AS num_recs
ORDER BY num_recs DESC
LIMIT 20
    """
    
    # `ic` labels its output with the argument expression, here `ent`,
    # which holds one result row rather than a node
    for ent in session.run(query):
        ic(ent)

ic| ent: <Record ent.uid=121 num_recs=15>
ic| ent: <Record ent.uid=137 num_recs=11>
ic| ent: <Record ent.uid=509 num_recs=10>
ic| ent: <Record ent.uid=17 num_recs=8>
ic| ent: <Record ent.uid=65 num_recs=7>
ic| ent: <Record ent.uid=113 num_recs=7>
ic| ent: <Record ent.uid=3 num_recs=6>
ic| ent: <Record ent.uid=41 num_recs=6>
ic| ent: <Record ent.uid=225 num_recs=6>
ic| ent: <Record ent.uid=146 num_recs=6>
ic| ent: <Record ent.uid=151 num_recs=5>
ic| ent: <Record ent.uid=383 num_recs=5>
ic| ent: <Record ent.uid=374 num_recs=5>
ic| ent: <Record ent.uid=433 num_recs=4>
ic| ent: <Record ent.uid=368 num_recs=4>
ic| ent: <Record ent.uid=239 num_recs=4>
ic| ent: <Record ent.uid=105 num_recs=4>
ic| ent: <Record ent.uid=286 num_recs=4>
ic| ent: <Record ent.uid=134 num_recs=4>
ic| ent: <Record ent.uid=355 num_recs=4>
