In [None]:
import os
import dotenv
import pandas as pd
from graphdatascience import GraphDataScience

# Read variables from the .env file located by find_dotenv() into the process
# environment so the os.getenv() calls below can see them.
dotenv.load_dotenv(dotenv.find_dotenv())

# Neo4j connection settings. os.getenv() returns None for any variable that is
# not set, so a missing value shows up as "None" in the message printed below.
NEO4J_URI = os.getenv('NEO4J_BOLT')
NEO4J_USERNAME = os.getenv('NEO4J_USER')
NEO4J_PASSWORD = os.getenv('NEO4J_PASS')

# Path to the input CSV, relative to the notebook's working directory.
# Plain string literal: it has no placeholders, so no f-prefix is needed.
DATA_FILE = "./data/form13.csv"

# Echo the settings in use (the password is not printed).
print(f"Connecting to Neo4j at {NEO4J_URI} as {NEO4J_USERNAME}")
print(f"Using data from {DATA_FILE}")


In [None]:
# Create the client used by every later cell, authenticating with the
# username/password pair read from the environment.
gds = GraphDataScience(
    NEO4J_URI,
    auth=(NEO4J_USERNAME, NEO4J_PASSWORD),
)

# Smoke test: a trivial query that returns a single 'hello' value if the
# connection and credentials work.
gds.run_cypher("RETURN 'hello' AS greeting")

In [None]:
# Declare `cik` as the node key for :Manager nodes, i.e. the property must exist
# and be unique across nodes with that label. IF NOT EXISTS makes this cell safe
# to re-run.
# NOTE(review): node key constraints are an Enterprise Edition feature — confirm
# the target database supports them.
gds.run_cypher("""
CREATE CONSTRAINT `manager_node_key` IF NOT EXISTS
  FOR (mgr:Manager)
  REQUIRE mgr.cik IS NODE KEY
""")

In [None]:
# Load the whole CSV into a DataFrame; column dtypes are inferred by pandas.
csv_df = pd.read_csv(DATA_FILE)

# The CSV has three entities per row: a management firm, a public company, and an investment.
# Each row will become ==> (:Manager)-[:OWNS_STOCK_IN]->(:Company)
# Print column names, dtypes and non-null counts as a quick sanity check.
csv_df.info()

In [None]:
# Cypher that loads ONE csv row, passed in as the $row map parameter:
#   1. MERGE the Manager on cik; name and address are set only when the node is created.
#   2. MERGE the Company on cik (populated from the row's cusip6 value); name is set on create.
#   3. MERGE an OWNS_STOCK_IN relationship matched on BOTH shares and value, so a row with
#      different shares/value for the same manager/company pair adds another relationship.
# NOTE(review): Company.cik is filled from cusip6 — confirm that property name is intended.
merge_row_cypher = """
MERGE (mgr:Manager {cik: $row.managerCik})
ON CREATE 
  SET mgr.name = $row.managerName,
      mgr.address = $row.managerAddress
MERGE (com:Company {cik: $row.cusip6})
  ON CREATE
    SET com.name = $row.companyName
MERGE (mgr)-[:OWNS_STOCK_IN { shares: $row.shares, value: $row.value }]->(com)
"""

In [None]:
%%time
for index, row in csv_df.iterrows():
    gds.run_cypher(merge_row_cypher, {"row": row.to_dict()})

In [None]:
# Tally managers, companies and investments to check what the load produced.
count_queries = [
    "MATCH (mgr:Manager) RETURN count(*) as count",
    "MATCH (com:Company) RETURN count(*) as count",
    "MATCH (:Manager)-[:OWNS_STOCK_IN]->(:Company) RETURN count(*) as count",
]

# Each query is run in order and the first row of its result is kept; the
# tally is read from that row's 'count' column below.
mgrCount, comCount, investmentCount = [
    gds.run_cypher(count_query).iloc[0] for count_query in count_queries
]

print(f"Loaded {mgrCount['count']} managers, {comCount['count']} companies, and {investmentCount['count']} investments")

In [None]:
# Reset the database so the next load starts from an empty graph.

# Delete all nodes and relationships. The deletes are committed in batches
# (CALL { ... } IN TRANSACTIONS) instead of in one single transaction, so the
# memory needed does not grow with the size of the graph.
gds.run_cypher("""
MATCH (n)
CALL {
  WITH n
  DETACH DELETE n
} IN TRANSACTIONS OF 10000 ROWS
""")

# Delete the constraint (IF EXISTS: no error when it is already gone).
gds.run_cypher("DROP CONSTRAINT `manager_node_key` IF EXISTS")

# Create the constraint again, so Manager.cik is once more a node key
# before any data is re-loaded.
gds.run_cypher("""
CREATE CONSTRAINT `manager_node_key` IF NOT EXISTS
  FOR (mgr:Manager)
  REQUIRE mgr.cik IS NODE KEY
""")

In [None]:
%%time

# "UNWIND" can turn a list like the $rows parameter into individual rows
# combined with a sub-query it's almost like a batched for loop 
unwind_rows_then_merge = """
UNWIND $rows AS row
CALL {
  WITH row
  MERGE (mgr:Manager {cik: row.managerCik})
  ON CREATE 
    SET mgr.name = row.managerName,
        mgr.address = row.managerAddress
  MERGE (com:Company {cik: row.cusip6})
    ON CREATE
      SET com.name = row.companyName
  MERGE (mgr)-[:OWNS_STOCK_IN { shares: row.shares, value: row.value }]->(com)
} IN TRANSACTIONS OF 10000 ROWS
"""

gds.run_cypher(unwind_rows_then_merge, {"rows": csv_df.to_dict(orient='records')})

In [None]:
def _single_row(cypher):
    """Run `cypher` through the gds client and return the first row of its result."""
    return gds.run_cypher(cypher).iloc[0]


# Re-count managers, companies and investments after the batched load.
mgrCount = _single_row("MATCH (mgr:Manager) RETURN count(*) as count")
comCount = _single_row("MATCH (com:Company) RETURN count(*) as count")
investmentCount = _single_row("MATCH (:Manager)-[:OWNS_STOCK_IN]->(:Company) RETURN count(*) as count")

print(f"Loaded {mgrCount['count']} managers, {comCount['count']} companies, and {investmentCount['count']} investments")