# Preprocessing

In [None]:
from preprocessing import preprocessing

# Load the IDEX trade export through the project's `preprocessing` helper.
trades = preprocessing("data/IDEXTrades.csv")
# Preview the first rows as a sanity check (assumes a pandas DataFrame is returned — TODO confirm).
trades.head()

# Algorithm

## Find important Cycles

In [6]:
import time

import matplotlib.pyplot as plt
import pandas as pd

from scc_algorithm import *

In [7]:
def benchmark_algorithms(df, runs=3):
    """Time the sequential and parallel SCC algorithms on the same input.

    Each algorithm is called ``runs`` times, alternating sequential and
    parallel, and every call receives its own ``df.copy()``.  The copy is made
    *before* the clock starts so that only the algorithm itself is measured,
    not the cost of duplicating the input.

    Args:
        df: Input handed (as a copy) to ``scc_algo_seq`` and
            ``scc_algo_parallel``; it must provide a ``copy()`` method.
        runs: Number of timed repetitions per algorithm.

    Returns:
        A ``(sequential_times, parallel_times)`` tuple of lists holding the
        wall-clock duration of every run in seconds.
    """
    sequential_times = []
    parallel_times = []

    def _timed_call(algorithm):
        # Fresh copy per call, in case the algorithm modifies its input;
        # created outside the timed region on purpose.
        frame = df.copy()
        started = time.perf_counter()
        algorithm(frame)
        return time.perf_counter() - started

    for _ in range(runs):
        sequential_times.append(_timed_call(scc_algo_seq))
        parallel_times.append(_timed_call(scc_algo_parallel))

    return sequential_times, parallel_times

In [None]:
seq_times, par_times = benchmark_algorithms(trades, runs=5)

# Plot results
# Use explicit 1-based run numbers so the "Run" axis shows whole runs
# (1, 2, ...) instead of implicit 0-based positions with fractional ticks.
run_numbers = list(range(1, len(seq_times) + 1))

fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(run_numbers, seq_times, label="Sequential", marker="o")
ax.plot(run_numbers, par_times, label="Parallel", marker="o")
ax.set_xticks(run_numbers)
ax.set_xlabel("Run")
ax.set_ylabel("Execution Time (seconds)")
ax.set_title("SCC Algorithm Performance: Sequential vs Parallel")
ax.legend()
ax.grid(True)
fig.tight_layout()
plt.show()

Processing tokens: 100%|██████████| 1199/1199 [05:37<00:00,  3.55it/s]
Processing tokens:   0%|          | 3/1199 [00:19<2:11:40,  6.61s/it]


## Volume Matching

In [17]:
import numpy as np
from collections import defaultdict

In [18]:
def seqlast(start: float, stop: float, step: int) -> list:
    """Mimics seqlast in R: evenly spaced break points that always end at ``stop``.

    The grid starts at ``start`` and advances by ``step`` (in seconds) until it
    reaches or passes ``stop``; whatever value ends up last is then replaced by
    ``stop`` itself, so the final interval may be shorter than ``step``.
    """
    grid = np.arange(start, stop + step, step)
    points = [*grid]
    # Overwrite the trailing grid value so the sequence finishes exactly on `stop`.
    points[len(points) - 1] = stop
    return points

In [19]:
# Time-window lengths in seconds: one hour, one day, one week.
window_sizes_in_seconds = [60 * 60, 24 * 60 * 60, 7 * 24 * 60 * 60]
# When True the matching runs on ether amounts; when False on token amounts.
ether = False

In [20]:
# Result store: wash_trades[scc_id][window_size] -> list (created on first access).
wash_trades = defaultdict(lambda: defaultdict(list))

# Hashes of the SCCs that will be examined.
relevant_scc = relevant["scc_hash"].tolist()

# The first time-window break is the smallest value of the "cut" column.
window_start = trades["cut"].min()

# Start with every trade unlabeled.
trades["wash_label"] = pd.NA

In [21]:
def detect_label_wash_trades(df: pd.DataFrame, margin: float = 0.01) -> pd.DataFrame:
    """Label the longest leading run of trades whose net positions cancel out.

    Every trade adds its ``amount`` to the buyer's balance and subtracts it
    from the seller's.  Starting with all trades and then dropping the last
    remaining trade one at a time (never going below two trades), the balances
    are divided by the mean amount of the trades still considered.  As soon as
    every normalized balance is within ``margin`` (in absolute value), the rows
    from the first trade up to the current last one get ``wash_label = True``.

    Args:
        df: Trades in the order to be examined; needs the columns ``buyer``,
            ``seller`` and ``amount``.  The input itself is not modified.
        margin: Largest tolerated absolute normalized balance per trader.

    Returns:
        A copy of ``df`` with a fresh 0..n-1 index.  ``wash_label`` is set to
        True on the matched leading rows; nothing is changed if no prefix
        qualifies or the mean amount of the remaining trades is zero.
    """
    labeled = df.copy().reset_index(drop=True)
    n_trades = len(labeled)

    buyer_col = labeled["buyer"]
    seller_col = labeled["seller"]
    amount_col = labeled["amount"]

    # Net position of each trader over all trades, plus the amounts in play.
    net_position = defaultdict(float)
    volumes = []
    for pos in range(n_trades):
        volume = amount_col[pos]
        volumes.append(volume)
        net_position[buyer_col[pos]] += volume
        net_position[seller_col[pos]] -= volume

    # Shrink the candidate prefix from the back until it balances.
    last = n_trades - 1
    while last >= 1:
        positions = np.array(list(net_position.values()))
        avg_volume = np.mean(volumes)
        if avg_volume == 0:
            # Nothing to normalize by; give up rather than divide by zero.
            break

        relative_positions = np.abs(positions / avg_volume)
        if np.all(relative_positions <= margin):
            # .loc slicing is label-inclusive, so rows 0..last are labeled.
            labeled.loc[:last, "wash_label"] = True
            return labeled

        # Take the trailing trade out of the running totals and try again.
        dropped = amount_col[last]
        volumes.pop(last)
        net_position[buyer_col[last]] -= dropped
        net_position[seller_col[last]] += dropped
        last -= 1

    return labeled

In [22]:
# Decide which trade columns play the buyer / seller / amount roles. This only
# depends on the `ether` flag, so it is resolved once instead of per SCC.
if ether:
    buyer_col, seller_col, amount_col = "eth_buyer", "eth_seller", "trade_amount_eth"
else:
    # Token view: buyer and seller roles are swapped relative to the ether view.
    buyer_col, seller_col, amount_col = "eth_seller", "eth_buyer", "trade_amount_token"

role_names = {buyer_col: "buyer", seller_col: "seller", amount_col: "amount"}
selected_columns = [
    "transactionHash", "token", "date", "timestamp",
    buyer_col, seller_col, amount_col, "trade_amount_dollar", "wash_label",
]

with tqdm(total=len(window_sizes_in_seconds) * len(relevant_scc), desc="Processing SCCs") as pbar:
    for window_size in window_sizes_in_seconds:
        breaks = seqlast(window_start, trades["timestamp"].max(), window_size)
        # With right=False, pd.cut builds half-open [a, b) bins, so a trade whose
        # timestamp equals the final break would receive a NaN window and be
        # silently dropped by the groupby below. Move the final break up by the
        # smallest possible float step so the last bin covers the latest trade.
        breaks[-1] = np.nextafter(float(breaks[-1]), np.inf)

        for scc_id in relevant_scc:
            scc_traders = global_scc_traders_map[scc_id]

            # Trades between members of this SCC that are not labeled as wash trades yet.
            not_yet_wash = trades["wash_label"].isna() | (trades["wash_label"] == False)
            scc_trades = trades[
                trades["eth_buyer_id"].isin(scc_traders)
                & trades["eth_seller_id"].isin(scc_traders)
                & not_yet_wash
            ].sort_values("cut")

            if scc_trades.empty:
                wash_trades[scc_id][str(window_size)] = []
                pbar.update(1)
                continue

            # Mark trades as "checked"
            trades.loc[trades["transactionHash"].isin(scc_trades["transactionHash"]), "wash_label"] = False

            # Narrow to the needed columns and give them role-based names.
            temp_trades = scc_trades[selected_columns].rename(columns=role_names)

            # Create window labels (left-inclusive, right-exclusive)
            temp_trades["window"] = pd.cut(
                temp_trades["timestamp"],
                bins=breaks,
                right=False,
                include_lowest=True,
            )

            # Run the volume matching separately for every token and time window.
            # detect_label_wash_trades copies its input, so groups are passed as-is.
            scc_wash_trades_all = [
                detect_label_wash_trades(group)
                for _, group in temp_trades.groupby(["token", "window"], observed=True)
            ]

            # Store
            wash_trades[scc_id][str(window_size)] = scc_wash_trades_all

            # Update labels in main trades DataFrame. pd.concat raises on an empty
            # list, so skip this step when no group was produced.
            if scc_wash_trades_all:
                checked_trades = pd.concat(scc_wash_trades_all, ignore_index=True)
                true_hashes = checked_trades.loc[checked_trades["wash_label"] == True, "transactionHash"]
                trades.loc[trades["transactionHash"].isin(true_hashes), "wash_label"] = True

            pbar.update(1)

Processing SCCs: 100%|██████████| 6/6 [00:00<00:00, 10.61it/s]


In [23]:
# Map every relevant SCC (keyed by its id as a string) to the addresses of its
# traders; SCCs missing from the trader map yield an empty list.
address_clusters = {
    str(scc_id): global_trader_hashes[
        global_trader_hashes["trader_id"].isin(global_scc_traders_map.get(scc_id, []))
    ]["trader_address"].tolist()
    for scc_id in relevant_scc
}

address_clusters

{'28350215842494700130262459757615087967611956572884224275235703561513727331982': ['0x0fdabcff4e4c7107d07ce7ec765b64bf2ad4581f',
  '0x5186fcad00a548b01f5274f686132644e7c6748b'],
 '90762889311631373556662398140887537943631964536540503316693059694673431708751': ['0x96b0e9762b3fb99405bc416d8c58d2a47b4b27ad',
  '0xa78dec35b794ef1ba2da9d3eed26858b344e597a']}