In [2]:
import pandas as pd
import networkx as nx
from datetime import datetime
from typing import List, Dict, Any, Tuple
import numpy as np

In [2]:
# Implement a graph network using networkx in python.
# # Requirements:

# NODES:
# 1. Entity Types:
#    - Regular Entities (with account/country info)
#    - Standalone Entities (ID only, typically counterparties)
#    - CASH node (special node for cash transactions)

# 2. Account Nodes:
#    - Connected to their parent entities
#    - Some entities have multiple accounts
#    - Some entities have no accounts

# 3. Node Properties:
#    ```
#    Entity Node:
#    - entity_id (from party_Id/cparty_Id)
#    - node_type: 'ENTITY' or 'STANDALONE'
#    - country

#    Account Node:
#    - node_id: {entity_id}_{account}
#    - entity_id (parent)
#    - account number
#    - node_type: 'ACCOUNT'

#    Cash Node:
#    - node_id: 'CASH'
#    - node_type: 'CASH'
#    ```

# 4. Entity-Account Relationships:
#    - Entity → Account (HAS_ACCOUNT)
#    - Not all entities have accounts

# Input would be a dataframe containing all unique combinations of id, account, country (may be incomplete)
# Take into consideration that edges will be added for transactions.
# Include sample code on how do add future nodes and how to add transaction edges?
# Can we implement this in networkx and I want to be able to handle batch adding
# The entities/accounts are in a dataframe that I can load, columns are the following (Id, Account, Country)
# Add the code after this and if possible vectorised adding because I have 250k rows

from neo4j import GraphDatabase
import pandas as pd
from tqdm import tqdm
from datetime import datetime

class GraphBuilder:
    def __init__(self, uri="bolt://localhost:7687", user="neo4j", password="neo4j"):
        self.driver = GraphDatabase.driver(uri, auth=(user, password))
        
    def close(self):
        self.driver.close()
        
    def clear_database(self):
        with self.driver.session() as session:
            session.run("MATCH (n) DETACH DELETE n")
    
    def create_cash_node(self):
        with self.driver.session() as session:
            session.run("""
                MERGE (c:Node {node_id: 'CASH', node_type: 'CASH'})
            """)
    
    def batch_create_nodes(self, df: pd.DataFrame, batch_size=1000):
        """Create entity and account nodes in batches from dataframe"""
        total_batches = len(df) // batch_size + (1 if len(df) % batch_size != 0 else 0)
        
        for i in tqdm(range(total_batches)):
            start_idx = i * batch_size
            end_idx = min((i + 1) * batch_size, len(df))
            batch_df = df.iloc[start_idx:end_idx]
            
            # Convert batch to parameter lists
            params = {
                'entity_ids': batch_df['Id'].tolist(),
                'countries': batch_df['Country'].fillna('UNKNOWN').tolist(),
                'accounts': batch_df['Account'].fillna('').tolist()
            }
            
            with self.driver.session() as session:
                # Create entity nodes
                session.run("""
                    UNWIND range(0, size($entity_ids)-1) as i
                    MERGE (e:Node {
                        entity_id: $entity_ids[i],
                        node_type: CASE WHEN $countries[i] = 'UNKNOWN' THEN 'STANDALONE' ELSE 'ENTITY' END,
                        country: $countries[i]
                    })
                """, params)
                
                # Create account nodes and relationships
                session.run("""
                    UNWIND range(0, size($entity_ids)-1) as i
                    MATCH (e:Node {entity_id: $entity_ids[i]})
                    WITH e, $accounts[i] as account
                    WHERE account <> ''
                    MERGE (a:Node {
                        node_id: e.entity_id + '_' + account,
                        entity_id: e.entity_id,
                        account: account,
                        node_type: 'ACCOUNT'
                    })
                    MERGE (e)-[:HAS_ACCOUNT]->(a)
                """, params)
    
    def add_transaction_edge(self, from_id: str, to_id: str, amount: float, timestamp: datetime):
        """Add a transaction edge between nodes"""
        with self.driver.session() as session:
            session.run("""
                MATCH (from:Node), (to:Node)
                WHERE (from.node_id = $from_id OR from.entity_id = $from_id)
                AND (to.node_id = $to_id OR to.entity_id = $to_id)
                CREATE (from)-[:TRANSACTION {
                    amount: $amount,
                    timestamp: $timestamp
                }]->(to)
            """, {'from_id': from_id, 'to_id': to_id, 'amount': amount, 'timestamp': timestamp})

# Example usage:
# graph = GraphBuilder()
# graph.clear_database()  # Clear existing data
# graph.create_cash_node()  # Create CASH node

# # Load your dataframe
# df = pd.read_csv('your_data.csv')  # columns: Id, Account, Country
# graph.batch_create_nodes(df)

# # Example transaction
# graph.add_transaction_edge(
#     from_id="entity1_account1",  # or just "entity1" for entity-level transaction
#     to_id="CASH",
#     amount=1000.0,
#     timestamp=datetime.now()
# )

# graph.close()


In [9]:
df = pd.read_parquet('../data/jp_morgan/sorted/entities_aml.parquet')
graph = GraphBuilder()

In [10]:
graph.clear_database()
graph.create_cash_node()

In [11]:
graph.batch_create_nodes(df)

 23%|██▎       | 52/227 [2:21:24<7:55:53, 163.16s/it]  


ClientError: {code: Neo.ClientError.Statement.SemanticError} {message: Cannot merge the following node because of null property value for 'entity_id': (:Node {entity_id: null})}

In [None]:
# Implement a graph network using networkx in python.
# # Requirements:

# NODES:
# 1. Entity Types:
#    - Regular Entities (with account/country info)
#    - Standalone Entities (ID only, typically counterparties)
#    - CASH node (special node for cash transactions)

# 2. Account Nodes:
#    - Connected to their parent entities
#    - Some entities have multiple accounts
#    - Some entities have no accounts

# 3. Node Properties:
#    ```
#    Entity Node:
#    - entity_id (from party_Id/cparty_Id)
#    - node_type: 'ENTITY' or 'STANDALONE'
#    - country

#    Account Node:
#    - node_id: {entity_id}_{account}
#    - entity_id (parent)
#    - account number
#    - node_type: 'ACCOUNT'

#    Cash Node:
#    - node_id: 'CASH'
#    - node_type: 'CASH'
#    ```

# 4. Entity-Account Relationships:
#    - Entity → Account (HAS_ACCOUNT)
#    - Not all entities have accounts

# Input would be a dataframe containing all unique combinations of id, account, country (may be incomplete)
# Take into consideration that edges will be added for transactions.
# Include sample code on how do add future nodes and how to add transaction edges?
# Can we implement this in networkx and I want to be able to handle batch adding
# The entities/accounts are in a dataframe that I can load, columns are the following (Id, Account, Country)
# Add the code after this and if possible vectorised adding because I have 250k rows

import networkx as nx
import pandas as pd
from typing import Optional

class TransactionGraph:
    def __init__(self):
        self.G = nx.Graph()
        # Create CASH node
        self.G.add_node('CASH', node_type='CASH')
        
    def add_entity_node(self, entity_id: str, country: Optional[str] = None):
        """Add an entity node with optional country info"""
        node_type = 'ENTITY' if country else 'STANDALONE'
        attrs = {'node_type': node_type}
        if country:
            attrs['country'] = country
        self.G.add_node(entity_id, **attrs)
        
    def add_account_node(self, entity_id: str, account: str):
        """Add an account node and link it to its parent entity"""
        account_id = f"{entity_id}_{account}"
        self.G.add_node(account_id, 
                       node_type='ACCOUNT',
                       entity_id=entity_id,
                       account_number=account)
        self.G.add_edge(entity_id, account_id, relationship='HAS_ACCOUNT')
        
    def batch_create_nodes(self, df: pd.DataFrame):
        """Batch create nodes from a dataframe with Id, Account, Country columns"""
        # First create all entity nodes
        entities = df[['Id', 'Country']].drop_duplicates()
        for _, row in entities.iterrows():
            self.add_entity_node(str(row['Id']), row['Country'])
            
        # Then create account nodes where they exist
        accounts = df[df['Account'].notna()][['Id', 'Account']]
        for _, row in accounts.iterrows():
            self.add_account_node(str(row['Id']), str(row['Account']))
            
    def add_transaction_edge(self, from_id: str, to_id: str, 
                           transaction_id: str, amount: float, 
                           timestamp: str):
        """Add a transaction edge between nodes"""
        self.G.add_edge(from_id, to_id,
                       transaction_id=transaction_id,
                       amount=amount,
                       timestamp=timestamp,
                       relationship='TRANSACTION')

# Example usage:
# graph = TransactionGraph()
# df = pd.DataFrame({
#     'Id': ['E1', 'E2', 'E3'],
#     'Account': ['A1', 'A2', None],
#     'Country': ['US', 'UK', None]
# })
# graph.batch_create_nodes(df)
# 
# # Add a transaction
# graph.add_transaction_edge('E1_A1', 'E2_A2', 'T1', 1000.0, '2023-01-01')
