In [9]:
import pickle
import random

import networkx as nx

# Tunable inputs for this cell, kept together so they are easy to spot.
GRAPH_PATH = '../data/jp_morgan/pickled/graph_aml_final.pickle'
SAMPLE_PATH = '../data/jp_morgan/pickled/test.pickle'
SAMPLE_SIZE = 100
SAMPLE_SEED = 42  # fixed so the sampled test graph is identical on every run

# NOTE(security): pickle.load can execute arbitrary code embedded in the file;
# only load pickles that were produced by a trusted pipeline.
with open(GRAPH_PATH, 'rb') as f:
    G = pickle.load(f)

# Cap the request at the population size: random.sample raises ValueError
# when asked for more items than the population contains.
all_nodes = list(G.nodes())
rng = random.Random(SAMPLE_SEED)  # private RNG; leaves the global random state alone
random_nodes = rng.sample(all_nodes, min(SAMPLE_SIZE, len(all_nodes)))
print(len(random_nodes))

# Materialise the induced subgraph as its own MultiDiGraph before pickling.
random_subgraph = nx.MultiDiGraph(G.subgraph(random_nodes))

print(random_subgraph.nodes())

with open(SAMPLE_PATH, 'wb') as f:
    pickle.dump(random_subgraph, f)


100
['OWNER-369323-06', 'OWNER-289159-11_CHECKING-289162-11', 'CUSTOMER-1395495-00_CHECKING-1395496-00', 'OWNER-1641076-06', 'OWNER-404599-10_CHECKING-404602-10', 'OWNER-485904-14_CHECKING-485907-14', 'JPMC-CLIENT-2600467-00_CHECKING-2600475-00', 'BILLING-COMPANY-210823-10_CHECKING-210821-10', 'JPMC-CLIENT-951368-10_CHECKING-951374-10', 'JPMC-CUSTOMER-343887-00_CHECKING-343890-00', 'OWNER-137028-02_CHECKING-137031-02', 'JPMC-CLIENT-1856846-00_CHECKING-1856854-00', 'OWNER-521573-14_CHECKING-521576-14', 'OWNER-85257-01', 'OWNER-1487371-04_CHECKING-1487374-04', 'CHECKING-82600-14', 'OWNER-1518553-04_CHECKING-1518556-04', 'OWNER-58456-10', 'JPMC-CLIENT-1478392-00_CHECKING-1478399-00', 'BILLING-COMPANY-101103-14', 'STANDARD-COMPANY-2876363-00', 'OWNER-76443-06', 'OWNER-561212-11_CHECKING-561215-11', 'JPMC-CLIENT-1995266-00', 'OWNER-225681-01_CHECKING-225684-01', 'OWNER-1352044-06_CHECKING-1352047-06', 'BILLING-COMPANY-990233-05', 'OWNER-2153419-04', 'CHECKING-621854-04', 'OWNER-571827-11_CH

In [2]:
import networkx as nx
import pandas as pd
from typing import Dict, Any

def extract_node_metrics(G: nx.Graph) -> pd.DataFrame:
    """
    Extract various node-level metrics from a NetworkX graph.

    Connectivity is evaluated once and reused for every metric that is
    gated on it. Directed graphs are checked for strong connectivity
    (``nx.is_connected`` is undirected-only), undirected graphs for
    ordinary connectivity.

    Parameters:
    -----------
    G : nx.Graph
        Input NetworkX graph (undirected or directed).
        NOTE(review): some of the metrics below may not support
        multigraphs or the empty graph -- confirm against the networkx
        docs before passing such inputs.

    Returns:
    --------
    pd.DataFrame
        DataFrame containing node metrics with nodes as index. Columns:
        degree, degree_centrality, betweenness_centrality,
        closeness_centrality, eigenvector_centrality,
        clustering_coefficient, pagerank, eccentricity (all-missing when
        the graph is not connected), plus load_centrality and
        avg_neighbor_degree only when the graph is connected.
    """
    # Dictionary to store all metrics, keyed by output column name
    metrics: Dict[str, Dict[Any, float]] = {}

    # Basic centrality measures
    metrics['degree'] = dict(G.degree())
    metrics['degree_centrality'] = nx.degree_centrality(G)
    # NOTE(review): exact betweenness is typically the slowest step on large
    # graphs; nx.betweenness_centrality accepts k= for sampled estimation
    # if this becomes a bottleneck.
    metrics['betweenness_centrality'] = nx.betweenness_centrality(G)
    metrics['closeness_centrality'] = nx.closeness_centrality(G)
    metrics['eigenvector_centrality'] = nx.eigenvector_centrality(G, max_iter=1000)

    # Local clustering coefficient
    metrics['clustering_coefficient'] = nx.clustering(G)

    # PageRank
    metrics['pagerank'] = nx.pagerank(G)

    # Connectivity is needed three times below; compute it once. Directed
    # graphs need the strongly-connected test because nx.is_connected is
    # only defined for undirected graphs.
    if G.is_directed():
        connected = nx.is_strongly_connected(G)
    else:
        connected = nx.is_connected(G)

    # Load centrality (only reported when the graph is connected)
    if connected:
        metrics['load_centrality'] = nx.load_centrality(G)

    # Eccentricity needs finite path lengths between all pairs; the empty
    # dict keeps the column present (filled with missing values) otherwise.
    metrics['eccentricity'] = nx.eccentricity(G) if connected else {}

    # Convert to DataFrame (outer dict keys become columns, nodes the index)
    df_metrics = pd.DataFrame(metrics)

    # Average neighbor degree is only added for connected graphs
    if connected:
        df_metrics['avg_neighbor_degree'] = pd.Series(nx.average_neighbor_degree(G))

    return df_metrics

def analyze_network(G: nx.Graph) -> Dict[str, Any]:
    """
    Calculate global network metrics.

    For directed graphs, 'is_connected' and 'number_connected_components'
    are computed with the strongly-connected variants so the two values
    agree with each other; undirected graphs use the ordinary
    connected-component functions.

    Parameters:
    -----------
    G : nx.Graph
        Input NetworkX graph (undirected or directed)

    Returns:
    --------
    Dict[str, Any]
        Dictionary containing global network metrics with keys, in order:
        num_nodes, num_edges, density, average_clustering, is_connected,
        average_shortest_path_length (None if not connected),
        diameter (None if not connected), transitivity,
        number_connected_components, graph_assortativity (None if it
        could not be computed).
    """
    global_metrics: Dict[str, Any] = {
        'num_nodes': G.number_of_nodes(),
        'num_edges': G.number_of_edges(),
        'density': nx.density(G),
        'average_clustering': nx.average_clustering(G),
    }

    # Evaluate connectivity once and reuse it for the path-based metrics.
    directed = G.is_directed()
    connected = nx.is_strongly_connected(G) if directed else nx.is_connected(G)

    global_metrics['is_connected'] = connected
    # Path-length metrics are only computed when every pair is reachable.
    global_metrics['average_shortest_path_length'] = (
        nx.average_shortest_path_length(G) if connected else None
    )
    global_metrics['diameter'] = nx.diameter(G) if connected else None
    global_metrics['transitivity'] = nx.transitivity(G)
    global_metrics['number_connected_components'] = (
        nx.number_strongly_connected_components(G)
        if directed
        else nx.number_connected_components(G)
    )

    # Best-effort metric: fall back to None on failure. Catch Exception
    # rather than using a bare except so KeyboardInterrupt / SystemExit
    # still stop a long-running cell.
    try:
        global_metrics['graph_assortativity'] = nx.degree_assortativity_coefficient(G)
    except Exception:
        global_metrics['graph_assortativity'] = None

    return global_metrics

# NOTE(review): `graph` is not assigned in the cells visible here (the loading
# cell binds `G`); presumably it comes from another cell -- confirm this runs
# on a fresh kernel. The recorded run of this cell ended in KeyboardInterrupt.
node_metrics = extract_node_metrics(graph)

KeyboardInterrupt: 