In [6]:
import pandas as pd
import networkx as nx
import csv
import matplotlib.pyplot as plt
from collections import Counter

In [9]:
# Read the CSV file and extract relevant data
def read_transactions(file_path):
    transactions = []
    with open(file_path, 'r') as file:
        reader = csv.DictReader(file)
        for row in reader:
            transactions.append({
                'from': row['from'],
                'to': row['to'],
                'value': row['value'],
                'hash': row['hash'],
                'timeStamp': row['timeStamp'],
                'blockNumber': row['blockNumber']
            })
    return transactions

# Create a directed graph from the transactions
def create_graph(transactions):
    G = nx.DiGraph()  # Directed graph since transactions are one-way (from -> to)
    
    for tx in transactions:
        from_addr = tx['from']
        to_addr = tx['to']
        value = float(tx['value'])
        
        # Add nodes (wallet addresses)
        G.add_node(from_addr)
        G.add_node(to_addr)
        
        # Add an edge for each transaction
        G.add_edge(from_addr, to_addr, weight=value)
    
    return G

In [10]:
from sklearn.cluster import KMeans
import numpy as np

# Function to extract features from the graph for each wallet
def extract_features(G):
    features = []
    addresses = []
    
    for node in G.nodes():
        in_degree = G.in_degree(node)
        out_degree = G.out_degree(node)
        total_out_value = sum([G[u][v]['weight'] for u, v in G.out_edges(node)])  # Sum of outgoing transactions
        total_in_value = sum([G[u][v]['weight'] for u, v in G.in_edges(node)])  # Sum of incoming transactions
        
        # Append features: in-degree, out-degree, total value sent, total value received
        features.append([in_degree, out_degree, total_in_value, total_out_value])
        addresses.append(node)
    
    return np.array(features), addresses

# Function to run K-Means on the extracted features
def run_kmeans(features, addresses, n_clusters=3):
    kmeans = KMeans(n_clusters=n_clusters, random_state=0).fit(features)
    labels = kmeans.labels_
    
    # Group addresses by cluster
    clusters = {}
    for i, label in enumerate(labels):
        if label not in clusters:
            clusters[label] = []
        clusters[label].append(addresses[i])
    
    return clusters

# Main function to run K-Means clustering
def main_kmeans(file_path):
    # Step 1: Read the transactions from CSV
    transactions = read_transactions(file_path)
    
    # Step 2: Create the graph
    G = create_graph(transactions)
    
    # Step 3: Extract features from the graph
    features, addresses = extract_features(G)
    
    # Step 4: Run K-Means clustering
    clusters = run_kmeans(features, addresses, n_clusters=3)  # You can adjust the number of clusters
    
    # Print the addresses in each cluster
    for cluster_id, cluster_addresses in clusters.items():
        print(f"Cluster {cluster_id}: {cluster_addresses}")

# Example usage
file_path = 'ethersccan_2000.csv'  # Path to your CSV file
main_kmeans(file_path)




Cluster 0: ['0xe60519c2cbac5f07ac2c0e1487fe46fdb8d0893c', '0xa5409ec958c83c3f309868babaca7c86dcb077c1', '0x45db714f24f5a313569c41683047f1d49e78ba07', '0xc02aaa39b223fe8d0a0e5c4f27ead9083c756cc2', '0x7be8076f4ea4a4ad08075c2508e481d6c946d12b', '0x881d40237659c251811cec9c364ef91dc08d300c', '0xabea9132b05a70803a4e85094fd0e1800777fbef', '0x4dbd4fc535ac27206064b68ffcf827b0a60bab3f', '0x4678f0a6958e4d2bc4f1baf7bc52e8f3564f3fe4', '0x6cb18ff2a33e981d1e38a663ca056c0a5265066a', '0x6b175474e89094c44da98b954eedeac495271d0f', '0xa0b86991c6218b36c1d19d4a2e9eb0ce3606eb48', '0x80c67432656d59144ceff962e8faf8926599bcf8', '0x4976a4a02f38326660d17bf34b431dc6e2eb2327', '0x3fc91a3afd70395cd496c647d5a6cc9d4b2b7fad', '0xae78736cd615f374d3085123a210448e74fc6393', '0x858646372cc42e1a627fce94aa7a7033e7cf075a', '0x39053d51b77dc0d36036fc1fcc8cb819df8ef37a', '0x035bdaeab85e47710c27eda7fd754ba80ad4ad02', '0xf7858da8a6617f7c6d0ff2bcafdb6d2eedf64840', '0x041333a0a76b365beb10d9e0a33bc5d327f48294', '0x4aa799c5dfc01ee7d79

In [11]:
file_path = 'ethersccan.csv'  # Path to your CSV file
main_kmeans(file_path)

Cluster 0: ['0xf726dc178d1a4d9292a8d63f01e0fa0a1235e65c', '0xd78e2f7328de3d66dec1da43e7474f0fc73ccf66', '0x275b69aa7c8c1d648a0557656bce1c286e69a29d', '0xb248d8d69cc9a74afc80f546fc2049f2aaec1975', '0xb2f91964a48c21e3325431a02537c3e02ea2603a', '0x754e697586902a0d42da842bec9bae7b32de63a6', '0x58fee717d64b9cd2eadf321fc6eb5c99ee9c2504', '0x3bdf7200e680fa9a0ed85a312a4cbdcc164c76d3', '0xb8ee423956e752286ed6cab2729001e8de0806e9', '0x96346a7cc7768e07d0e9bde6d22177d10a5e1c91', '0x38e0fcf756d046c4fcc6574f62b34f75d9f7374f', '0x533168e80d28f633c34571052190332493ba49d0', '0xccbe6c55e3fdb607222f31f8a3e3a778f74696e8', '0x9961bfbab74749a8eb1d303dfdfd13b45093ee77', '0x8fc3455ef398d059acd51f32d67deba0346dc3f5', '0x1bc653276cae38733c591c52389e3e00a3e28a32', '0x1d04a9b7f42568ec945dadc2529b879387b4fe8d', '0xf7858da8a6617f7c6d0ff2bcafdb6d2eedf64840', '0xa0a22c18d896f2949f4bc26d7c40b251302a66bd', '0x4aa799c5dfc01ee7d790e3bf1a7c2257ce1dceff', '0x58c346defd05db993938df88dd8eb9f34dac79d2', '0xf91882f02dda63ed168



In [12]:
import csv
import numpy as np
from sklearn.cluster import DBSCAN
import networkx as nx
import matplotlib.pyplot as plt

# Read the CSV file and extract relevant data
def read_transactions(file_path):
    transactions = []
    with open(file_path, 'r') as file:
        reader = csv.DictReader(file)
        for row in reader:
            transactions.append({
                'from': row['from'],
                'to': row['to'],
                'value': float(row['value']),
                'hash': row['hash'],
                'timeStamp': row['timeStamp'],
                'blockNumber': row['blockNumber']
            })
    return transactions

# Create a directed graph from the transactions
def create_graph(transactions):
    G = nx.DiGraph()  # Directed graph since transactions are one-way (from -> to)
    
    for tx in transactions:
        from_addr = tx['from']
        to_addr = tx['to']
        value = tx['value']
        
        # Add nodes (wallet addresses)
        G.add_node(from_addr)
        G.add_node(to_addr)
        
        # Add an edge for each transaction
        G.add_edge(from_addr, to_addr, weight=value)
    
    return G

# Generate features for each address
def generate_features(G):
    features = []
    addresses = list(G.nodes())
    
    for address in addresses:
        outgoing_tx = G.out_degree(address)
        incoming_tx = G.in_degree(address)
        total_out_value = sum([G.get_edge_data(address, nbr).get('weight', 0) for nbr in G.successors(address)])
        total_in_value = sum([G.get_edge_data(nbr, address).get('weight', 0) for nbr in G.predecessors(address)])
        unique_counterparties = len(set(G.successors(address)) | set(G.predecessors(address)))
        
        # Create a feature vector for each address
        features.append([outgoing_tx, incoming_tx, total_out_value, total_in_value, unique_counterparties])
    
    return np.array(features), addresses

# Apply DBSCAN to detect Sybil attacks
def detect_sybil_dbscan(features, addresses, eps=0.5, min_samples=2):
    dbscan = DBSCAN(eps=eps, min_samples=min_samples)
    labels = dbscan.fit_predict(features)
    
    # Identify outliers (label == -1)
    flagged_addresses = [addresses[i] for i, label in enumerate(labels) if label == -1]
    
    return flagged_addresses

# Main function
def main(file_path):
    # Step 1: Read the transactions from CSV
    transactions = read_transactions(file_path)
    
    # Step 2: Create the graph
    G = create_graph(transactions)
    
    # Step 3: Generate features for DBSCAN
    features, addresses = generate_features(G)
    
    # Step 4: Apply DBSCAN to detect Sybil attacks
    flagged_addresses = detect_sybil_dbscan(features, addresses)
    
    # Step 5: Output the flagged addresses
    print(f"Flagged addresses (potential Sybil attackers): {len(flagged_addresses)}")
    unique_addresses = set(flagged_addresses)
    print(f"Unique addresses: {len(unique_addresses)}")


main('ethersccan.csv')


Flagged addresses (potential Sybil attackers): 1117
Unique addresses: 1117
