In [5]:
import pandas as pd
import numpy as np
from datetime import datetime
import networkx as nx

%store -r tx_level_data

pd.set_option('display.width', 200)  # Set a large width
pd.set_option('display.max_columns', None)  # Show all columns
pd.set_option('display.expand_frame_repr', False)  # Prevent line wrapping



print(tx_level_data.head())

            timestamp                                          signature      type          source tx_status  block_number                                token_address  token_amount direction                                        sender                                      receiver                                  counterparty    tx_fee                        program_id  PRE_BALANCE  POST_BALANCE SYMBOL TOKEN_NAME symbol        day       price  token_amount_usd
0 2025-04-06 12:12:19  6mPqrKS4AKYJeZ3JLjjubHSwNB96oiJdFX1DgeZNZEBHxM...  TRANSFER  SYSTEM_PROGRAM   success   331670303.0  So11111111111111111111111111111111111111111  1.000000e-09  received  5Hr7wZg7oBpVhH5nngRqzr5W7ZFUfCsfEhbziZJak7fr  AGPZnBZUxmhAtcp8XjT4n8bCia9dEYhhm16M2sfFvmTU  5Hr7wZg7oBpVhH5nngRqzr5W7ZFUfCsfEhbziZJak7fr  0.000006  11111111111111111111111111111111     0.017773      0.017773    SOL     Solana    NaN 2025-04-06  116.447500      1.164475e-07
1 2025-04-06 00:10:50  29aP1yrCx2dbJM4Mxk8257G5e87bY7CgxySDS6LiAAm

In [6]:
# Initialize a graph
G = nx.Graph()

# Add edges from each transaction's sender and receiver
for _, row in tx_level_data.iterrows():
    sender = row['sender']
    receiver = row['receiver']
    G.add_edge(sender, receiver)

# Extract connected components as clusters
clusters = list(nx.connected_components(G))

# Prepare the cluster data
cluster_data = []
for cluster_id, wallets in enumerate(clusters):
    # Filter transactions involving cluster wallets
    mask = tx_level_data['sender'].isin(wallets) | tx_level_data['receiver'].isin(wallets)
    cluster_txs = tx_level_data[mask]
    #cluster_txs = tx_level_data['signature'].nunique()
    
    # Aggregate metrics
    cluster_entry = {
        'cluster_id': cluster_id,
        'wallets_in_cluster': list(wallets),
        'total_transactions': cluster_txs['signature'].nunique(),
        'cluster_start_time': cluster_txs['timestamp'].min(),
        'cluster_end_time': cluster_txs['timestamp'].max(),
        'cluster_size': len(wallets),
        'cluster_type': 'Unknown'  # Placeholder for further analysis
    }
    cluster_data.append(cluster_entry)

# Create the DataFrame
cluster_df = pd.DataFrame(cluster_data)[
    ['cluster_id', 'wallets_in_cluster', 'total_transactions',
     'cluster_start_time', 'cluster_end_time', 'cluster_size', 'cluster_type']
]

In [7]:
cluster_df.head()

Unnamed: 0,cluster_id,wallets_in_cluster,total_transactions,cluster_start_time,cluster_end_time,cluster_size,cluster_type
0,0,"[, CoaKnxNQCJ91FyyNqxmwxEHwzdw8YHmgF3ZpLNjf1Tz...",493,2023-11-14 15:20:07,2025-04-06 12:12:19,243,Unknown
