In [1]:
import time
from pathlib import Path
import pandas as pd
import networkx as nx
import numpy as np
from scipy.sparse import csr_array
from graspologic.partition import leiden
from graspologic.embed import LaplacianSpectralEmbed
from graspologic.utils import pass_to_ranks
from umap import UMAP
from graspologic.plot import networkplot


In [3]:
t0 = time.time()


# load the three Elliptic CSV files (the features file is read without a header row)
df_features = pd.read_csv("./elliptic/elliptic_txs_features.csv", header=None)
df_classes = pd.read_csv("./elliptic/elliptic_txs_classes.csv")
df_edgelist = pd.read_csv("./elliptic/elliptic_txs_edgelist.csv")

# relabel the class codes: '1' -> "illicit", '2' -> "licit"; any other value is kept as is
df_classes['class'] = df_classes['class'].replace({'1': "illicit", '2': "licit"})

# naming columns: id, time step, then 93 local features and 72 aggregated features
df_features.columns = [
    "id",
    "time step",
    *(f"local_feat_{i}" for i in range(93)),
    *(f"agg_feat_{i}" for i in range(72)),
]
df_classes.columns = ["id", "class"]

# attach the class label to every feature row and move it right after the id column
df = df_features.merge(df_classes, how="inner", on="id")
second_column = df.pop('class')
df.insert(1, 'class', second_column)
df.head()

Unnamed: 0,id,class,time step,local_feat_0,local_feat_1,local_feat_2,local_feat_3,local_feat_4,local_feat_5,local_feat_6,...,agg_feat_62,agg_feat_63,agg_feat_64,agg_feat_65,agg_feat_66,agg_feat_67,agg_feat_68,agg_feat_69,agg_feat_70,agg_feat_71
0,230425980,unknown,1,-0.171469,-0.184668,-1.201369,-0.12197,-0.043875,-0.113002,-0.061584,...,-0.562153,-0.600999,1.46133,1.461369,0.018279,-0.08749,-0.131155,-0.097524,-0.120613,-0.119792
1,5530458,unknown,1,-0.171484,-0.184668,-1.201369,-0.12197,-0.043875,-0.113002,-0.061584,...,0.947382,0.673103,-0.979074,-0.978556,0.018279,-0.08749,-0.131155,-0.097524,-0.120613,-0.119792
2,232022460,unknown,1,-0.172107,-0.184668,-1.201369,-0.12197,-0.043875,-0.113002,-0.061584,...,0.670883,0.439728,-0.979074,-0.978556,-0.098889,-0.106715,-0.131155,-0.183671,-0.120613,-0.119792
3,232438397,licit,1,0.163054,1.96379,-0.646376,12.409294,-0.063725,9.782742,12.414558,...,-0.577099,-0.613614,0.241128,0.241406,1.072793,0.08553,-0.131155,0.677799,-0.120613,-0.119792
4,230460314,unknown,1,1.011523,-0.081127,-1.201369,1.153668,0.333276,1.312656,-0.061584,...,-0.511871,-0.400422,0.517257,0.579382,0.018279,0.277775,0.326394,1.29375,0.178136,0.179117


In [4]:
def symmetrze_nx(g):
    """Return the result of ``g.to_undirected()``.

    Leiden needs a symmetric (undirected) graph, so a directed graph is
    converted to its undirected form for the community detection step only.
    """
    return g.to_undirected()


In [5]:
def subsample_edges(adjacency, n_edges_kept=100_000, random_state=None):
    """Keep a random subset of the nonzero entries (edges) of ``adjacency``.

    Parameters
    ----------
    adjacency : 2-D array or scipy sparse array
        Adjacency structure; every nonzero entry counts as one edge.
    n_edges_kept : int, default 100_000
        Number of edges to keep, drawn without replacement.
    random_state : None, int, numpy.random.RandomState or numpy.random.Generator
        Source of randomness. ``None`` (the default) draws from NumPy's global
        ``np.random`` state exactly as before, so ``np.random.seed`` still
        controls the result. An int seeds a private ``RandomState``; an
        existing ``RandomState``/``Generator`` is used directly.

    Returns
    -------
    The input ``adjacency`` object itself when ``n_edges_kept`` exceeds the
    number of edges; otherwise a new ``csr_array`` of the same shape holding
    only the sampled edges with their original values.
    """
    row_inds, col_inds = np.nonzero(adjacency)
    n_edges = len(row_inds)
    if n_edges_kept > n_edges:
        # Nothing to drop: hand the input back untouched.
        return adjacency

    if random_state is None:
        rng = np.random  # module-level global state (legacy behaviour)
    elif isinstance(random_state, (np.random.RandomState, np.random.Generator)):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    choice_edge_inds = rng.choice(n_edges, size=n_edges_kept, replace=False)
    row_inds = row_inds[choice_edge_inds]
    col_inds = col_inds[choice_edge_inds]
    # Flatten so the values are always 1-D, whatever container the fancy
    # indexing returns.
    data = np.asarray(adjacency[row_inds, col_inds]).ravel()

    return csr_array((data, (row_inds, col_inds)), shape=adjacency.shape)

In [18]:
def plot(i):
    """Draw the transaction graph of one time step, coloured by community.

    Steps: select the edges whose source transaction (``txId1``) belongs to
    time step ``i``, build a directed graph, detect communities with Leiden on
    its undirected version, embed the adjacency with a Laplacian spectral
    embedding, project that embedding to 2-D with UMAP, and draw the result
    with ``networkplot``.

    Reads the module-level frames ``df`` (columns ``'id'`` and ``'time step'``)
    and ``df_edgelist`` (columns ``'txId1'`` and ``'txId2'``).

    Parameters
    ----------
    i
        Value compared against the ``'time step'`` column of ``df``.

    Returns
    -------
    The object returned by ``networkplot``, with its axis switched off.

    Raises
    ------
    ValueError
        If no row of ``df_edgelist`` has a ``txId1`` in the requested time step.
    """
    # ids of the transactions in this time step, then the edges leaving them
    step_ids = df.loc[df['time step'] == i, 'id']
    step_edges = df_edgelist.loc[df_edgelist['txId1'].isin(step_ids)]
    if step_edges.empty:
        raise ValueError(f"No edges found for time step {i!r}.")

    g = nx.from_pandas_edgelist(
        step_edges, source='txId1', target='txId2', create_using=nx.DiGraph()
    )

    # Leiden needs an undirected graph; run it once on the symmetrized copy.
    # (A second, discarded run on the summed adjacency used to precede this.)
    communities = leiden(symmetrze_nx(g))

    node_df = pd.Series(communities)
    node_df.index.name = "node_id"
    node_df.name = "community"
    node_df = node_df.to_frame()

    # Build the adjacency in node_df's row order so every per-node array
    # computed below lines up with the frame.
    nodelist = node_df.index
    adj = nx.to_scipy_sparse_array(g, nodelist=nodelist)

    # strength = row sum + column sum of the directed adjacency
    node_df["strength"] = adj.sum(axis=1) + adj.sum(axis=0)
    node_df['rank_strength'] = node_df['strength'].rank(method='dense')

    # NOTE(review): the embedding is fitted on the raw adjacency, exactly as
    # before. A pass_to_ranks(adj) result used to be computed here but was
    # never used; confirm whether the embedding was meant to use it.
    lse_components = 32
    lse = LaplacianSpectralEmbed(n_components=lse_components, concat=True)
    lse_embedding = lse.fit_transform(adj)

    n_neighbors = 32
    min_dist = 0.8
    metric = "cosine"
    reducer = UMAP(
        n_components=2,
        n_neighbors=n_neighbors,
        min_dist=min_dist,
        metric=metric,
    )
    umap_embedding = reducer.fit_transform(lse_embedding)

    node_df["x"] = umap_embedding[:, 0]
    node_df["y"] = umap_embedding[:, 1]

    # this is optional, may not need depending on the number of edges you have
    sub_adj = subsample_edges(adj, 100_000)

    ax = networkplot(
        sub_adj,
        x="x",
        y="y",
        node_data=node_df,
        node_size="rank_strength",
        node_sizes=(10, 80),
        figsize=(20, 20),
        node_hue="community",
        edge_linewidth=0.3,
    )

    ax.axis("off")

    print("Time Step ", i, " graph created.")

    return ax
