In [2]:
import pandas as pd
import numpy as np
from datetime import datetime
import networkx as nx

%store -r tx_level_data
%store -r wallet_analysis_df

pd.set_option('display.width', 200)  # Set a large width
pd.set_option('display.max_columns', None)  # Show all columns
pd.set_option('display.expand_frame_repr', False)  # Prevent line wrapping



print(tx_level_data.head())

            timestamp                                          signature      type          source tx_status  block_number                                token_address  token_amount direction                                        sender                                      receiver                                  counterparty    tx_fee                        program_id  PRE_BALANCE  POST_BALANCE SYMBOL TOKEN_NAME symbol        day       price  token_amount_usd
0 2025-04-06 12:12:19  6mPqrKS4AKYJeZ3JLjjubHSwNB96oiJdFX1DgeZNZEBHxM...  TRANSFER  SYSTEM_PROGRAM   success   331670303.0  So11111111111111111111111111111111111111111  1.000000e-09  received  5Hr7wZg7oBpVhH5nngRqzr5W7ZFUfCsfEhbziZJak7fr  AGPZnBZUxmhAtcp8XjT4n8bCia9dEYhhm16M2sfFvmTU  5Hr7wZg7oBpVhH5nngRqzr5W7ZFUfCsfEhbziZJak7fr  0.000006  11111111111111111111111111111111     0.017773      0.017773    SOL     Solana    NaN 2025-04-06  116.447500      1.164475e-07
1 2025-04-06 00:10:50  29aP1yrCx2dbJM4Mxk8257G5e87bY7CgxySDS6LiAAm

In [3]:
wallet_analysis_df.head()

Unnamed: 0,wallet_address,entity_label,num_transactions,total_sol_volume_sent,total_sol_volume_received,total_token_volume_sent,total_token_volume_recieved,first_tx_time,last_tx_time,avg_tx_interval (seconds),num_unique_senders,num_unique_receivers
0,AGPZnBZUxmhAtcp8XjT4n8bCia9dEYhhm16M2sfFvmTU,Unknown Entity,493,47.911934,47.641968,"{'token_amount': {'$WEN': 1287304.0, 'BLZE': 1...","{'token_amount': {'$WEN': 643652.0, 'BLZE': 19...",2023-11-14 15:20:07,2025-04-06 12:12:19,44681.231707,105,176


In [16]:
# Clean merge for sender

# Drop any previous 'sender_entity' and 'receiver_entity' columns if they exist
tx_level_data = tx_level_data.drop(columns=['sender_entity', 'receiver_entity'], errors='ignore')

tx_level_data = tx_level_data.merge(
    wallet_analysis_df[['wallet_address', 'entity_label']].rename(columns={
        'wallet_address': 'sender',
        'entity_label': 'sender_entity'
    }),
    how='left',
    on='sender'
)

# Clean merge for receiver
tx_level_data = tx_level_data.merge(
    wallet_analysis_df[['wallet_address', 'entity_label']].rename(columns={
        'wallet_address': 'receiver',
        'entity_label': 'receiver_entity'
    }),
    how='left',
    on='receiver'
)

tx_level_data['entity_label'] = tx_level_data['sender_entity'].fillna(tx_level_data['receiver_entity'])

tx_level_data.drop(columns=['sender_entity', 'receiver_entity'], inplace=True)

tx_level_data = tx_level_data[[col for col in tx_level_data.columns if not col.endswith('_x') and not col.endswith('_y')]]


tx_level_data.head()


Unnamed: 0,timestamp,signature,type,source,tx_status,block_number,token_address,token_amount,direction,sender,receiver,counterparty,tx_fee,program_id,PRE_BALANCE,POST_BALANCE,SYMBOL,TOKEN_NAME,symbol,day,price,token_amount_usd,entity_label
0,2025-04-06 12:12:19,6mPqrKS4AKYJeZ3JLjjubHSwNB96oiJdFX1DgeZNZEBHxM...,TRANSFER,SYSTEM_PROGRAM,success,331670303.0,So11111111111111111111111111111111111111111,1e-09,received,5Hr7wZg7oBpVhH5nngRqzr5W7ZFUfCsfEhbziZJak7fr,AGPZnBZUxmhAtcp8XjT4n8bCia9dEYhhm16M2sfFvmTU,5Hr7wZg7oBpVhH5nngRqzr5W7ZFUfCsfEhbziZJak7fr,6e-06,11111111111111111111111111111111,0.017773,0.017773,SOL,Solana,,2025-04-06,116.4475,1.164475e-07,Unknown Entity
1,2025-04-06 00:10:50,29aP1yrCx2dbJM4Mxk8257G5e87bY7CgxySDS6LiAAmsxX...,TRANSFER,SYSTEM_PROGRAM,success,331561835.0,So11111111111111111111111111111111111111111,1e-09,received,FLiPgGTXtBtEJoytikaywvWgbz5a56DdHKZU72HSYMFF,AGPZnBZUxmhAtcp8XjT4n8bCia9dEYhhm16M2sfFvmTU,FLiPgGTXtBtEJoytikaywvWgbz5a56DdHKZU72HSYMFF,5e-06,11111111111111111111111111111111,0.017773,0.017773,SOL,Solana,,2025-04-06,116.4475,1.164475e-07,Unknown Entity
2,2025-04-06 00:10:45,2D7D4ndxhZfwZ6TRN6pzY7PQEFkRKTQhQjQ1zpz5MyNDVS...,TRANSFER,SYSTEM_PROGRAM,success,331561822.0,So11111111111111111111111111111111111111111,1e-09,received,FLiPGqowc82LLR173hKiFYBq2fCxLZEST5iHbHwj8xKb,AGPZnBZUxmhAtcp8XjT4n8bCia9dEYhhm16M2sfFvmTU,FLiPGqowc82LLR173hKiFYBq2fCxLZEST5iHbHwj8xKb,5e-06,11111111111111111111111111111111,0.017773,0.017773,SOL,Solana,,2025-04-06,116.4475,1.164475e-07,Unknown Entity
3,2025-04-05 23:38:43,nPrzWnG7SLNepcPeBWjEMKadD6WNuWScRgEwEqSNsHiZNN...,TRANSFER,SYSTEM_PROGRAM,success,331557009.0,So11111111111111111111111111111111111111111,1e-05,received,6UgXZZBoydXRNX6SLjwFDUUgn19GhjBpvNAXU3iepzUV,AGPZnBZUxmhAtcp8XjT4n8bCia9dEYhhm16M2sfFvmTU,6UgXZZBoydXRNX6SLjwFDUUgn19GhjBpvNAXU3iepzUV,5e-06,11111111111111111111111111111111,0.017763,0.017773,SOL,Solana,,2025-04-05,120.140417,0.001201404,Unknown Entity
4,2025-04-05 23:38:25,yqSAMWuNg3pC9H5EZJmTr1MudCuUajZye13BMfaS2atLvC...,TRANSFER,SYSTEM_PROGRAM,success,331556964.0,So11111111111111111111111111111111111111111,1e-09,received,GUq7PhyAUZko2mPhv3CupmdJKQ61LH8VyrdsRL25q7zg,AGPZnBZUxmhAtcp8XjT4n8bCia9dEYhhm16M2sfFvmTU,GUq7PhyAUZko2mPhv3CupmdJKQ61LH8VyrdsRL25q7zg,5e-06,11111111111111111111111111111111,0.017763,0.017763,SOL,Solana,,2025-04-05,120.140417,1.201404e-07,Unknown Entity


In [19]:
# Initialize a graph
G = nx.Graph()

# Add edges from each transaction's sender and receiver
for _, row in tx_level_data.iterrows():
    sender = row['sender']
    receiver = row['receiver']
    G.add_edge(sender, receiver)

# Extract connected components as clusters
clusters = list(nx.connected_components(G))

# Prepare the cluster data
cluster_data = []
for cluster_id, wallets in enumerate(clusters):
    # Filter transactions involving cluster wallets
    mask = tx_level_data['sender'].isin(wallets) | tx_level_data['receiver'].isin(wallets)
    cluster_txs = tx_level_data[mask]
    #cluster_txs = tx_level_data['signature'].nunique()
    unique_entities = cluster_txs['entity_label'].dropna().unique().tolist()

    # Optionally infer cluster type based on entities present
    if len(unique_entities) == 1:
        cluster_type = unique_entities[0]
    elif len(unique_entities) == 0:
        cluster_type = "Unknown"
    else:
        cluster_type = "Mixed"


    # Aggregate metrics
    cluster_entry = {
        'cluster_id': cluster_id,
        'wallets_in_cluster': list(wallets),
        'total_transactions': cluster_txs['signature'].nunique(),
        'cluster_start_time': cluster_txs['timestamp'].min(),
        'cluster_end_time': cluster_txs['timestamp'].max(),
        'cluster_size': len(wallets),
        'cluster_type': cluster_type  # Placeholder for further analysis
    }
    cluster_data.append(cluster_entry)

# Create the DataFrame
cluster_df = pd.DataFrame(cluster_data)[
    ['cluster_id', 'wallets_in_cluster', 'total_transactions',
     'cluster_start_time', 'cluster_end_time', 'cluster_size', 'cluster_type']
]

In [21]:
cluster_df.head()

Unnamed: 0,cluster_id,wallets_in_cluster,total_transactions,cluster_start_time,cluster_end_time,cluster_size,cluster_type
0,0,"[Habp5bncMSsBC3vkChyebepym5dcTNRYeg2LVG464E96,...",493,2023-11-14 15:20:07,2025-04-06 12:12:19,243,Unknown Entity


In [22]:
from collections import Counter

# Add graph metrics to each cluster
for cluster in cluster_data:
    wallets = cluster['wallets_in_cluster']
    subgraph = G.subgraph(wallets)
    
    degrees = dict(subgraph.degree())
    avg_degree = sum(degrees.values()) / len(degrees)
    
    # Degree centrality
    central_wallets = sorted(degrees.items(), key=lambda x: x[1], reverse=True)[:3]
    central_wallets = [wallet for wallet, _ in central_wallets]
    
    cluster['avg_degree'] = avg_degree
    cluster['density'] = nx.density(subgraph)
    cluster['central_wallets'] = central_wallets


In [23]:
# Define thresholds
high_density_threshold = 0.3
small_cluster_threshold = 5
high_tx_rate_threshold = 5  # transactions per wallet
high_centrality_ratio = 0.5  # one wallet handles >50% of edges

for cluster in cluster_data:
    flags = []

    # High density + small cluster (potential wash trading or bot rings)
    if cluster['density'] > high_density_threshold and cluster['cluster_size'] < small_cluster_threshold:
        flags.append('Dense Small Cluster')

    # High transaction rate
    tx_rate = cluster['total_transactions'] / cluster['cluster_size']
    if tx_rate > high_tx_rate_threshold:
        flags.append('High Tx Rate')

    # Check for centralization
    wallets = cluster['wallets_in_cluster']
    subgraph = G.subgraph(wallets)
    degrees = dict(subgraph.degree())
    max_deg = max(degrees.values())
    if max_deg / (2 * cluster['total_transactions']) > high_centrality_ratio:
        flags.append('Centralized Flow')

    cluster['flags'] = flags if flags else ['Normal']


In [1]:
cluster_data

NameError: name 'cluster_data' is not defined

In [25]:
final_cluster_df = pd.DataFrame(cluster_data)[
    ['cluster_id', 'wallets_in_cluster', 'total_transactions',
     'cluster_start_time', 'cluster_end_time', 'cluster_size',
     'cluster_type', 'avg_degree', 'density', 'central_wallets', 'flags']
]

final_cluster_df.head()

Unnamed: 0,cluster_id,wallets_in_cluster,total_transactions,cluster_start_time,cluster_end_time,cluster_size,cluster_type,avg_degree,density,central_wallets,flags
0,0,"[Habp5bncMSsBC3vkChyebepym5dcTNRYeg2LVG464E96,...",493,2023-11-14 15:20:07,2025-04-06 12:12:19,243,Unknown Entity,1.99177,0.00823,"[AGPZnBZUxmhAtcp8XjT4n8bCia9dEYhhm16M2sfFvmTU,...",[Normal]
