In [1]:
import os
import dotenv
import pandas as pd
from graphdatascience import GraphDataScience

# Read settings from a .env file (located via find_dotenv) so that credentials
# stay out of the notebook itself
dotenv.load_dotenv(dotenv.find_dotenv())

# Load from environment
NEO4J_URI = os.getenv('NEO4J_BOLT')
NEO4J_USERNAME = os.getenv('NEO4J_USER')
NEO4J_PASSWORD = os.getenv('NEO4J_PASS')

# os.getenv returns None for an unset variable; stop here with a readable
# message instead of failing later inside the driver with an obscure error
if not NEO4J_URI:
    raise RuntimeError("NEO4J_BOLT is not set; define it in your .env file or environment")

# Plain literal: there is nothing to interpolate, so no f-string is needed
DATA_FILE = "./data/form13.csv"

print(f"Connecting to Neo4j at {NEO4J_URI} as {NEO4J_USERNAME}")
print(f"Using data from {DATA_FILE}")


Connecting to Neo4j at neo4j://localhost:7687 as neo4j
Using data from ./data/form13.csv


In [2]:
# Client used by every later cell to send Cypher to the database
gds = GraphDataScience(
    NEO4J_URI,
    auth=(NEO4J_USERNAME, NEO4J_PASSWORD),
)

# Smoke test: a trivial query that should come back as a one-row result
gds.run_cypher("RETURN 'hello' AS greeting")

Unnamed: 0,greeting
0,hello


In [3]:
# Node key on Manager.cik: the property must exist and be unique, and the
# index behind the constraint lets each `MERGE (mgr:Manager {cik: ...})`
# look the node up instead of scanning every Manager.
gds.run_cypher("""
CREATE CONSTRAINT `manager_node_key` IF NOT EXISTS
  FOR (mgr:Manager)
  REQUIRE mgr.cik IS NODE KEY
""")

# The loaders below also MERGE Company nodes on their `cik` property, so that
# key gets the same constraint; without it every Company MERGE scans the label
# and nothing guarantees that companies stay unique.
gds.run_cypher("""
CREATE CONSTRAINT `company_node_key` IF NOT EXISTS
  FOR (com:Company)
  REQUIRE com.cik IS NODE KEY
""")

In [4]:
# CUSIPs are identifiers, not numbers: read them as text so that an all-digit
# code keeps its leading zeros instead of being inferred as an integer
csv_df = pd.read_csv(DATA_FILE, dtype={"cusip6": str, "cusip": str})

# the csv has three entities per row: a management firm, a public company, and an investment
# each row will become ==> (:Manager)-[:OWNS_STOCK_IN]->(:Company)
csv_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52496 entries, 0 to 52495
Data columns (total 10 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   source                   52496 non-null  object 
 1   managerCik               52496 non-null  int64  
 2   managerAddress           52496 non-null  object 
 3   managerName              52496 non-null  object 
 4   reportCalendarOrQuarter  52496 non-null  object 
 5   cusip6                   52496 non-null  object 
 6   cusip                    52496 non-null  object 
 7   companyName              52496 non-null  object 
 8   value                    52496 non-null  float64
 9   shares                   52496 non-null  int64  
dtypes: float64(1), int64(2), object(7)
memory usage: 4.0+ MB


In [5]:
# Cypher for loading ONE csv row, supplied as the `$row` map parameter.
# - MERGE on Manager.cik / Company.cik creates each node only on first sight;
#   the ON CREATE clauses set name/address only when the node is new.
# - The Company node's `cik` property is filled from the row's `cusip6` column.
# - The final MERGE matches on the relationship's shares and value too, so a
#   row with different numbers for the same pair adds another relationship.
merge_row_cypher = """
MERGE (mgr:Manager {cik: $row.managerCik})
ON CREATE 
  SET mgr.name = $row.managerName,
      mgr.address = $row.managerAddress
MERGE (com:Company {cik: $row.cusip6})
  ON CREATE
    SET com.name = $row.companyName
MERGE (mgr)-[:OWNS_STOCK_IN { shares: $row.shares, value: $row.value }]->(com)
"""

In [6]:
%%time
for index, row in csv_df.iterrows():
    gds.run_cypher(merge_row_cypher, {"row": row.to_dict()})

CPU times: user 1min 40s, sys: 24.7 s, total: 2min 5s
Wall time: 5min 49s


In [7]:
# count all the things -- each query returns a single row, kept via .iloc[0]
mgrCount = gds.run_cypher(
    "MATCH (mgr:Manager) RETURN count(*) as count"
).iloc[0]
comCount = gds.run_cypher(
    "MATCH (com:Company) RETURN count(*) as count"
).iloc[0]
investmentCount = gds.run_cypher(
    "MATCH (:Manager)-[:OWNS_STOCK_IN]->(:Company) RETURN count(*) as count"
).iloc[0]

summary = f"Loaded {mgrCount['count']} managers, {comCount['count']} companies, and {investmentCount['count']} investments"
print(summary)

Loaded 4846 managers, 166 companies, and 52496 investments


In [8]:
# Start over: remove every node, and with DETACH every relationship attached
# to it, so the next loader runs against an empty graph
gds.run_cypher(
    "MATCH (n) DETACH DELETE n"
)

In [10]:
%%time

# "UNWIND" can turn a list like the $rows parameter into individual rows
# combined with a sub-query it's almost like a batched for loop 
unwind_rows_then_merge = """
UNWIND $rows AS row
CALL {
  WITH row
  MERGE (mgr:Manager {cik: row.managerCik})
  ON CREATE 
    SET mgr.name = row.managerName,
        mgr.address = row.managerAddress
  MERGE (com:Company {cik: row.cusip6})
    ON CREATE
      SET com.name = row.companyName
  MERGE (mgr)-[:OWNS_STOCK_IN { shares: row.shares, value: row.value }]->(com)
} IN TRANSACTIONS OF 10000 ROWS
"""

gds.run_cypher(unwind_rows_then_merge, {"rows": csv_df.to_dict(orient='records')})

CPU times: user 2.27 s, sys: 78.5 ms, total: 2.35 s
Wall time: 7.39 s


In [11]:
# count all the things again; the totals should match the row-by-row load
mgrCount = gds.run_cypher(
    "MATCH (mgr:Manager) RETURN count(*) as count"
).iloc[0]
comCount = gds.run_cypher(
    "MATCH (com:Company) RETURN count(*) as count"
).iloc[0]
investmentCount = gds.run_cypher(
    "MATCH (:Manager)-[:OWNS_STOCK_IN]->(:Company) RETURN count(*) as count"
).iloc[0]

summary = f"Loaded {mgrCount['count']} managers, {comCount['count']} companies, and {investmentCount['count']} investments"
print(summary)

Loaded 4846 managers, 166 companies, and 52496 investments
