In [1]:
import pandas as pd
import numpy as np
from datetime import datetime

%store -r tx_level_data

# Console rendering: allow 200 characters per row, never truncate the column
# list, and keep each row on a single line instead of wrapping wide frames.
pd.options.display.width = 200
pd.options.display.max_columns = None
pd.options.display.expand_frame_repr = False

# Preview the first rows of the frame restored from the IPython store.
print(tx_level_data.head())

            timestamp                                          signature      type          source tx_status  block_number                                token_address  token_amount direction                                        sender                                      receiver                                  counterparty    tx_fee                        program_id  PRE_BALANCE  POST_BALANCE SYMBOL TOKEN_NAME symbol        day       price  token_amount_usd
0 2025-04-06 12:12:19  6mPqrKS4AKYJeZ3JLjjubHSwNB96oiJdFX1DgeZNZEBHxM...  TRANSFER  SYSTEM_PROGRAM   success   331670303.0  So11111111111111111111111111111111111111111  1.000000e-09  received  5Hr7wZg7oBpVhH5nngRqzr5W7ZFUfCsfEhbziZJak7fr  AGPZnBZUxmhAtcp8XjT4n8bCia9dEYhhm16M2sfFvmTU  5Hr7wZg7oBpVhH5nngRqzr5W7ZFUfCsfEhbziZJak7fr  0.000006  11111111111111111111111111111111     0.017773      0.017773    SOL     Solana    NaN 2025-04-06  116.447500      1.164475e-07
1 2025-04-06 00:10:50  29aP1yrCx2dbJM4Mxk8257G5e87bY7CgxySDS6LiAAm

In [2]:
tx_level_data.columns.to_list()

['timestamp',
 'signature',
 'type',
 'source',
 'tx_status',
 'block_number',
 'token_address',
 'token_amount',
 'direction',
 'sender',
 'receiver',
 'counterparty',
 'tx_fee',
 'program_id',
 'PRE_BALANCE',
 'POST_BALANCE',
 'SYMBOL',
 'TOKEN_NAME',
 'symbol',
 'day',
 'price',
 'token_amount_usd']

# Data Cleaning 

In [3]:
def clean_tx_data(df):
    """Return a cleaned copy of a transaction-level DataFrame.

    Steps, in order:

    1. Add any missing label columns (``sender_name``, ``receiver_name``,
       ``counterparty_name``, ``wallet_entity_label``, ``program_name``)
       filled with a placeholder string.
    2. If a ``wallet`` column exists, relabel every sender / receiver /
       counterparty address that is itself one of the wallets with that
       wallet's ``wallet_entity_label``.
    3. If a ``timestamp`` column exists, convert it with ``pd.to_datetime``.

    Parameters
    ----------
    df : pandas.DataFrame
        Transaction rows. When ``wallet`` is present, the ``sender``,
        ``receiver`` and ``counterparty`` columns must be present too.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input is not modified.
    """
    df = df.copy()  # work on a copy so the caller's frame is left untouched

    # Placeholder used for each label column that the input does not provide.
    placeholder_by_column = {
        'sender_name': 'Unknown Address',
        'receiver_name': 'Unknown Address',
        'counterparty_name': 'Unknown Address',
        'wallet_entity_label': 'Unknown Address',
        'program_name': 'Unknown Program',
    }
    for col, placeholder in placeholder_by_column.items():
        if col not in df.columns:
            df[col] = placeholder

    # Only apply fallback logic if 'wallet' column exists
    if 'wallet' in df.columns:
        # Look the label up per wallet (first non-null label seen for it).
        # Taking the label from the *current row* would be wrong whenever the
        # address is a tracked wallet other than the one this row belongs to.
        wallet_labels = df.groupby('wallet', sort=False)['wallet_entity_label'].first()

        for address_col, name_col in (
            ('sender', 'sender_name'),
            ('receiver', 'receiver_name'),
            ('counterparty', 'counterparty_name'),
        ):
            is_tracked = df[address_col].isin(wallet_labels.index)
            df[name_col] = np.where(
                is_tracked,
                df[address_col].map(wallet_labels),
                df[name_col],
            )

    # Convert timestamp to datetime. Plain column assignment replaces the
    # column; `df.loc[:, col] = ...` writes in place on pandas >= 2.0 and would
    # leave a string column as object dtype.
    if 'timestamp' in df.columns:
        df['timestamp'] = pd.to_datetime(df['timestamp'])

    return df


# Transaction Flow Aggregation

In [4]:
def build_tx_graph(df, amount_col=None):
    """Aggregate transaction rows into a node/edge structure.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``sender``, ``receiver``, ``sender_name`` and
        ``receiver_name`` columns.
    amount_col : str, optional
        Column holding the USD amount of each row. When omitted, the first of
        ``'amount_usd'`` / ``'token_amount_usd'`` found in ``df`` is used; if
        neither exists every amount is treated as 0. An explicitly named
        column that is absent raises ``KeyError``.

    Returns
    -------
    dict
        ``{"nodes": {address: {"id", "label", "amount_usd_sent",
        "amount_usd_received"}}, "edges": [{"from", "to", "value"}, ...]}``
        with one edge per input row. A node's label is the one from the first
        row in which the address appears.
    """
    if amount_col is None:
        # Keep honouring the legacy 'amount_usd' name, but fall back to
        # 'token_amount_usd' so frames using that name are not summed as zeros.
        amount_col = next(
            (col for col in ('amount_usd', 'token_amount_usd') if col in df.columns),
            None,
        )

    if amount_col is None:
        amounts = [0] * len(df)
    else:
        # Missing amounts (None / NaN) count as 0 so they neither poison the
        # running totals nor end up as a bare NaN token in JSON output.
        # float() also turns numpy scalars into plain Python floats.
        amounts = [0 if pd.isna(value) else float(value) for value in df[amount_col]]

    nodes = {}
    edges = []

    rows = zip(df['sender'], df['receiver'], df['sender_name'], df['receiver_name'], amounts)
    for sender, receiver, sender_label, receiver_label, amount_usd in rows:
        # Register each endpoint the first time it is seen.
        for address, label in ((sender, sender_label), (receiver, receiver_label)):
            if address not in nodes:
                nodes[address] = {
                    "id": address,
                    "label": label,
                    "amount_usd_sent": 0,
                    "amount_usd_received": 0
                }

        nodes[sender]["amount_usd_sent"] += amount_usd
        nodes[receiver]["amount_usd_received"] += amount_usd

        # One edge per transaction row.
        edges.append({
            "from": sender,
            "to": receiver,
            "value": amount_usd
        })

    return {"nodes": nodes, "edges": edges}


In [5]:
# Example usage: clean the restored frame, build the graph, write it to disk.
if __name__ == "__main__":
    import json

    tx_level_data_clean = clean_tx_data(tx_level_data)
    tx_graph = build_tx_graph(tx_level_data_clean)

    # Persist the node/edge structure as indented JSON in the working directory.
    with open("tx_graph.json", "w") as graph_file:
        json.dump(tx_graph, graph_file, indent=2)

    print("✅ Cleaned and graph data saved to tx_graph.json")

✅ Cleaned and graph data saved to tx_graph.json
