### GCN for ethereum phishing detection
M. Weber, G. Domeniconi, J. Chen, D. K. I. Weidele, C. Bellei, T. Robinson, C. E. Leiserson, "Anti-Money Laundering in Bitcoin: Experimenting with Graph Convolutional Networks for Financial Forensics", KDD ’19 Workshop on Anomaly Detection in Finance, August 2019, Anchorage, AK, USA.

https://www.kaggle.com/code/divyareddyyeruva/elliptic-gcn-pyg/notebook


In [1]:
import pandas as pd 
import os 
import warnings
import networkx as nx 

warnings.filterwarnings('ignore')

In [2]:
pwd = os.getcwd()

In [3]:
# 将边权转化为点权，只获取节点的度和余额

node_feature = pd.DataFrame()

In [None]:
# open数据集
for filename in os.listdir(pwd + '/../original_data/open/钓鱼一阶节点/'):
    if filename.endswith('.csv'):
        df = pd.read_csv(pwd + '/../original_data/open/钓鱼一阶节点/' + filename)

        # node feature
        features = {}
        features['address'] = filename.split('.')[0]
        features['value_out'] = df[df['From'] == filename.split('.')[0]]['Value'].sum()
        features['value_in'] = df[df['To'] == filename.split('.')[0]]['Value'].sum()
        features['balance'] = features['value_out'] - features['value_in']
        features['degree'] = len(df)
        features['label'] = 0

        node_feature = node_feature.append(features, ignore_index=True)



In [4]:
# open数据集
for filename in os.listdir(pwd + '/../original_data/open/非钓鱼一阶节点/'):
    if filename.endswith('.csv'):
        df = pd.read_csv(pwd + '/../original_data/open/非钓鱼一阶节点/' + filename)

        # node feature
        features = {}
        features['address'] = filename.split('.')[0]
        features['value_out'] = df[df['From'] == filename.split('.')[0]]['Value'].sum()
        features['value_in'] = df[df['To'] == filename.split('.')[0]]['Value'].sum()
        features['balance'] = features['value_out'] - features['value_in']
        features['degree'] = len(df)
        features['label'] = 1

        node_feature = node_feature.append(features, ignore_index=True)

In [5]:
# etherscan数据集

for filename in os.listdir(pwd + '/../original_data/etherscan/1d/phish-hack/'):
    if filename.endswith('.csv'):
        try:
            df = pd.read_csv(pwd + '/../original_data/etherscan/1d/phish-hack/' + filename)
        except:
            continue

        df['value'] = df['value'].astype('float')
        df['value'] = df['value'] / 1e18


        

        # node feature
        features = {}
        features['address'] = filename.split('.')[0]
        features['value_out'] = df[df['from'] == filename.split('.')[0]]['value'].sum()
        features['value_in'] = df[df['to'] == filename.split('.')[0]]['value'].sum()
        features['balance'] = features['value_out'] - features['value_in']
        features['degree'] = len(df)
        features['label'] = 0

        node_feature = node_feature.append(features, ignore_index=True)

In [6]:
# etherscan数据集

for filename in os.listdir(pwd + '/../original_data/etherscan/1d/normal/'):
    if filename.endswith('.csv'):
        try:
            df = pd.read_csv(pwd + '/../original_data/etherscan/1d/normal/' + filename)
        except:
            continue
        
        df['value'] = df['value'].astype('float')
        df['value'] = df['value'] / 1e18
        
        # node feature
        features = {}
        features['address'] = filename.split('.')[0]
        features['value_out'] = df[df['from'] == filename.split('.')[0]]['value'].sum()
        features['value_in'] = df[df['to'] == filename.split('.')[0]]['value'].sum()
        features['balance'] = features['value_out'] - features['value_in']
        features['degree'] = len(df)
        features['label'] = 1

        node_feature = node_feature.append(features, ignore_index=True)

In [7]:
node_feature.head()

Unnamed: 0,address,value_out,value_in,balance,degree,label
0,0x0000000000000000000000000000000000000000,0.0,4961.202218,-4961.202218,1377,1
1,0x000419c40a811a052b56372f4a80823be47db756,1545.456345,1567.301437,-21.845092,252,1
2,0x000e0e5701b14fb77160bcc7bfe7256522d5927b,4000.0,8100.0,-4100.0,5,1
3,0x00267e4a01f25072e0e9347a2ccdd67091d7aeac,950.094658,950.1,-0.005342,22,1
4,0x00316d956f5f35591ae021f4858a2a865c6ba02a,4861.91816,3495.49,1366.42816,33,1


In [8]:
edgelist = []

In [None]:
# open数据集
# 二阶节点特征
for floderename in os.listdir(pwd + '/../original_data/open/钓鱼二阶节点/'):
    if floderename == '.DS_Store':
        continue
    print("Process Phishing: ", floderename)
    for filename in os.listdir(pwd + '/../original_data/open/钓鱼二阶节点/' + floderename):
        if filename.endswith('.csv'):
            df = pd.read_csv(pwd + '/../original_data/open/钓鱼二阶节点/' + floderename + '/' + filename)

            # node feature
            features = {}
            features['address'] = filename.split('.')[0]
            features['value_out'] = df[df['From'] == filename.split('.')[0]]['Value'].sum()
            features['value_in'] = df[df['To'] == filename.split('.')[0]]['Value'].sum()
            features['balance'] = features['value_out'] - features['value_in']
            features['degree'] = len(df)
            features['label'] = 'unknown'

            node_feature = node_feature.append(features, ignore_index=True)
            edgelist.append((floderename.split('.')[0], filename.split('.')[0]))



In [9]:
# open数据集
for floderename in os.listdir(pwd + '/../original_data/open/非钓鱼二阶节点/'):
    if floderename == '.DS_Store':
        continue
    print("Process Normal: ", floderename)
    for filename in os.listdir(pwd + '/../original_data/open/非钓鱼二阶节点/' + floderename):
        if filename.endswith('.csv'):

            df = pd.read_csv(pwd + '/../original_data/open/非钓鱼二阶节点/' + floderename + '/' + filename)


            # node feature
            features = {}
            features['address'] = filename.split('.')[0]
            features['value_out'] = df[df['From'] == filename.split('.')[0]]['Value'].sum()
            features['value_in'] = df[df['To'] == filename.split('.')[0]]['Value'].sum()
            features['balance'] = features['value_out'] - features['value_in']
            features['degree'] = len(df)
            features['label'] = 'unknown'

            node_feature = node_feature.append(features, ignore_index=True)
            edgelist.append((floderename.split('.')[0], filename.split('.')[0]))

Process Normal:  0x0000000000000000000000000000000000000000
Process Normal:  0x000419c40a811a052b56372f4a80823be47db756
Process Normal:  0x000e0e5701b14fb77160bcc7bfe7256522d5927b
Process Normal:  0x00267e4a01f25072e0e9347a2ccdd67091d7aeac
Process Normal:  0x00316d956f5f35591ae021f4858a2a865c6ba02a
Process Normal:  0x0034cf6e02f4c47fb30df22fc81b8dedddbf1fb0
Process Normal:  0x004e3def0c754a921af751d1004df95f9650ea00
Process Normal:  0x005bdf2845064db405f5c99aaf9510d0b19e7ac2
Process Normal:  0x005e288d713a5fb3d7c9cf1b43810a98688c7223
Process Normal:  0x006deef69f1b3c01173b5ba74a5de9050e72d702
Process Normal:  0x007c508c6368d2ad35608cb8e98edc9ef1bf0e84
Process Normal:  0x00a2df284ba5f6428a39dff082ba7ff281852e06
Process Normal:  0x00a80d1f0649358af50c5b9263777e4b1dcec366
Process Normal:  0x00ce8ad7e44fea30ecbe2fb69963eaa26b5b56a0
Process Normal:  0x00d681a7b6584f978f63c81cfd847064ce19a080
Process Normal:  0x00e5c013694c9ee92b76ce6ad7ad3bcc20475d6f
Process Normal:  0x00e72cfa92cdf0a7f9109

In [10]:
# etherscan数据集

for floderename in os.listdir(pwd + '/../original_data/etherscan/2d/phish-hack/'):
    if floderename == '.DS_Store':
        continue
    print("Process Normal: ", floderename)
    for filename in os.listdir(pwd + '/../original_data/etherscan/2d/phish-hack/' + floderename):
        if filename.endswith('.csv'):
            try:
                df = pd.read_csv(pwd + '/../original_data/etherscan/2d/phish-hack/' + floderename + '/' + filename)
            except:
                continue
            
            df['value'] = df['value'].astype('float')
            df['value'] = df['value'] / 1e18

            # node feature
            features = {}
            features['address'] = filename.split('.')[0]
            features['value_out'] = df[df['from'] == filename.split('.')[0]]['value'].sum()
            features['value_in'] = df[df['to'] == filename.split('.')[0]]['value'].sum()
            features['balance'] = features['value_out'] - features['value_in']
            features['degree'] = len(df)
            features['label'] = 'unknown'

            node_feature = node_feature.append(features, ignore_index=True)
            edgelist.append((floderename.split('.')[0], filename.split('.')[0]))

Process Normal:  0x000000000532b45f47779fce440748893b257865
Process Normal:  0x00000000072d54638c2c2a3da3f715360269eea1
Process Normal:  0x0000000009324b6434d7766af41908e4c49ee1d7
Process Normal:  0x0000000086c5d614bec59dfd2c9b88f7cb57f23c
Process Normal:  0x00000000bf02300fd6251627aa3db8933a0eee83
Process Normal:  0x00000e32e51011e28958d4696627c82c3dacd5a6
Process Normal:  0x00067010f3ae17aa53e2b4d5142dda35380cf72d
Process Normal:  0x0020731604c882cf7bf8c444be97d17b19ea4316
Process Normal:  0x00278018530825863b765dc6cd581c0a8d471ade
Process Normal:  0x002bf459dc58584d58886169ea0e80f3ca95ffaf
Process Normal:  0x002f0c8119c16d310342d869ca8bf6ace34d9c39
Process Normal:  0x003eb9c77b5b896fcc27adead606d23def34510e
Process Normal:  0x003f12c49df27295fe97d4990b7901e2c855adf5
Process Normal:  0x0059b14e35dab1b4eee1e2926c7a5660da66f747
Process Normal:  0x0061fb5485dff4bb85c078dca80d19119224d97e
Process Normal:  0x00650a784ee109797ba8c05f3496bdbf2b2b7a1c
Process Normal:  0x0067b43cb530e683dbffb

In [11]:
# etherscan数据集

for floderename in os.listdir(pwd + '/../original_data/etherscan/2d/normal/'):
    if floderename == '.DS_Store':
        continue
    print("Process Normal: ", floderename)
    for filename in os.listdir(pwd + '/../original_data/etherscan/2d/normal/' + floderename):
        if filename.endswith('.csv'):
            try:
                df = pd.read_csv(pwd + '/../original_data/etherscan/2d/normal/' + floderename + '/' + filename)
            except:
                continue

            df['value'] = df['value'].astype('float')
            df['value'] = df['value'] / 1e18

            # node feature
            features = {}
            features['address'] = filename.split('.')[0]
            features['value_out'] = df[df['from'] == filename.split('.')[0]]['value'].sum()
            features['value_in'] = df[df['to'] == filename.split('.')[0]]['value'].sum()
            features['balance'] = features['value_out'] - features['value_in']
            features['degree'] = len(df)
            features['label'] = 'unknown'

            node_feature = node_feature.append(features, ignore_index=True)
            edgelist.append((floderename.split('.')[0], filename.split('.')[0]))

Process Normal:  0x0000000000000000000000000000000000000000
Process Normal:  0x0000000000000000000000000000000000000002
Process Normal:  0x0000000000000000000000000000000000000003
Process Normal:  0x0000000000000000000000000000000000000004
Process Normal:  0x0000000000000000000000000000000000000005
Process Normal:  0x0000000000000000000000000000000000000006
Process Normal:  0x0000000000000000000000000000000000000007
Process Normal:  0x0000000000000000000000000000000000000008
Process Normal:  0x0000000000000000000000000000000000000009
Process Normal:  0x000000000000000000000000000000000000dead
Process Normal:  0x0000000000004946c0e9f43f4dee607b0ef1fa1c
Process Normal:  0x0000000000013949f288172bd7e36837bddc7211
Process Normal:  0x0000000000075efbee23fe2de1bd0b7690883cc9
Process Normal:  0x0000000000085d4780b73119b644ae5ecd22b376
Process Normal:  0x00000000000c2e074ec69a0dfb2997ba6c7d2e1e
Process Normal:  0x00000000006c3852cbef3e08e8df289169ede581
Process Normal:  0x0000000031f7382a812c6

In [12]:
node_feature.head()

Unnamed: 0,address,value_out,value_in,balance,degree,label
0,0x0000000000000000000000000000000000000000,0.0,4961.202218,-4961.202218,1377,1
1,0x000419c40a811a052b56372f4a80823be47db756,1545.456345,1567.301437,-21.845092,252,1
2,0x000e0e5701b14fb77160bcc7bfe7256522d5927b,4000.0,8100.0,-4100.0,5,1
3,0x00267e4a01f25072e0e9347a2ccdd67091d7aeac,950.094658,950.1,-0.005342,22,1
4,0x00316d956f5f35591ae021f4858a2a865c6ba02a,4861.91816,3495.49,1366.42816,33,1


In [None]:
# 保存节点特征
node_feature.to_csv(pwd + '/../original_data/open/node_feature.csv', index=False)

In [13]:
import numpy as np
# 将edgelist转化为dataframe
edgelists = pd.DataFrame(edgelist, columns=['addr1', 'addr2'])
edgelists.head()

Unnamed: 0,addr1,addr2
0,0x0000000000000000000000000000000000000000,0x0002a6f06a636c370ad59a8024ea6e46ca161a5a
1,0x0000000000000000000000000000000000000000,0x000ddec4e987de213447a202e2780c1399b5a13a
2,0x0000000000000000000000000000000000000000,0x0027ca2de624752f3e39d7557d76bb5b09d92882
3,0x0000000000000000000000000000000000000000,0x002e89cc5acbe515b4ceb2a799758c40e0b46016
4,0x0000000000000000000000000000000000000000,0x0042d963dde2686a35b66d0dd666dff05d0af697


In [None]:
# 保存edgelist
edgelists.to_csv(pwd + '/../original_data/open/edgelist.csv', index=False)

In [14]:
# GCN to extract spatial features

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import GCNConv

In [15]:
# 使用GCN提取节点特征

class Net(torch.nn.Module):
    def __init__(self, in_channels, out_channels):
        super(Net, self).__init__()
        self.conv1 = GCNConv(in_channels, 16)
        self.conv2 = GCNConv(16, out_channels)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index

        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, training=self.training)
        x = self.conv2(x, edge_index)

        return F.log_softmax(x, dim=1)

In [16]:
import torch
import torch.nn.functional as F
from torch_geometric.data import Data
from torch_geometric.data import DataLoader

In [17]:
# 获取label为0或1的节点
classified_node = node_feature[node_feature['label'] != 'unknown']

In [18]:
# all nodes
all_nodes = node_feature['address'].tolist()
map_id = {j: i for i, j in enumerate(all_nodes)}

edges = edgelists.copy()
edges.addr1 = edges.addr1.map(map_id)
edges.addr2 = edges.addr2.map(map_id)
edges.astype(int)

edge_index = np.array(edges).T.tolist()

edge_index = torch.tensor(edge_index, dtype=torch.long).contiguous()
weights = torch.tensor([1] * len(edge_index[0]), dtype=torch.float)
print(edge_index.shape)
print(weights.shape)


torch.Size([2, 233637])
torch.Size([233637])


In [19]:
# maping txIds to corresponding indexes, to pass node features to the model
addr_features = node_feature.copy()

# 将address转化为id
addr_features.address = addr_features.address.map(map_id)

classified_index = addr_features[addr_features['label'] != 'unknown']['address'].tolist()
unclassified_index = addr_features[addr_features['label'] == 'unknown']['address'].tolist()

# repalce label 'unknown' with 1
addr_features['label'] = addr_features['label'].replace('unknown', 1)

In [20]:
labels = addr_features['label'].values

addr_features_tensor = torch.tensor(addr_features.drop(['address', 'label'], axis=1).values, dtype=torch.float)

data_train = Data(x=addr_features_tensor, edge_index=edge_index, y=torch.tensor(labels, dtype=torch.long))

In [21]:
data_train

Data(x=[242633, 4], edge_index=[2, 233637], y=[242633])

In [22]:
data_train.x[classified_index].shape

torch.Size([8996, 4])

In [23]:
from sklearn.model_selection import train_test_split

# 划分训练集和测试集
x_train, x_val, y_train, y_val, train_idx, val_idx = train_test_split(data_train.x[classified_index], data_train.y[classified_index], classified_index, test_size=0.2, random_state=42, stratify=data_train.y[classified_index])


In [24]:
data_train.y[classified_index].sum()

tensor(5027)

In [25]:
import torch.nn.functional as F
from sklearn.metrics import roc_auc_score
class Net(torch.nn.Module):
    def __init__(self, in_channels, out_channels):
        super(Net, self).__init__()
        self.conv1 = GCNConv(in_channels, 16)
        self.conv2 = GCNConv(16, 16)
        self.conv3 = GCNConv(32, 32)
        self.conv4 = GCNConv(16, out_channels)

    def forward(self, data, adj=None):
        x, edge_index = data.x, data.edge_index
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, p=0.1, training=self.training)
        # print(x.shape, edge_index.shape)
        x = self.conv4(x, edge_index)

        return F.sigmoid(x)

In [26]:
from sklearn.metrics import classification_report

In [27]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = Net(4, 1).to(device)
# model = Net().to(device)
model.float()
data_train = data_train.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=1e-5)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min')
criterion = torch.nn.BCELoss()

model.train()
for epoch in range(200):
    optimizer.zero_grad()
    out = model(data_train)
    # print(out.shape)

    # reshape the output to match the shape of the label
    loss = criterion(out[classified_index].reshape(-1), data_train.y[classified_index].float())

    loss.backward()
    optimizer.step()
    scheduler.step(loss)

    # auc = roc_auc_score(data_train.y.detach().cpu().numpy()[train_idx], out.detach().cpu().numpy()[train_idx]) #[train_idx]

    result = classification_report(data_train.y.detach().cpu().numpy()[train_idx], (out.detach().cpu().numpy()[train_idx] > 0.5).astype(int), output_dict=True)['1'].values()
    result = list(result)

    if epoch%5 == 0:
      # print("epoch: {} - loss: {} - roc: {}".format(epoch, loss.item(), auc))
      print("epoch: {} - loss: {} - precision: {} - recall: {} - f1_score: {}".format(epoch, loss.item(), result[0], result[1], result[2]))

epoch: 0 - loss: 39.64671325683594 - precision: 0.88 - recall: 0.01641382740611788 - f1_score: 0.0322265625
epoch: 5 - loss: 22.628459930419922 - precision: 0.5408689248895434 - recall: 0.3653320069634419 - f1_score: 0.43609915392607984
epoch: 10 - loss: 13.812678337097168 - precision: 0.5491866769945778 - recall: 0.8816214871922408 - f1_score: 0.6767850324551354
epoch: 15 - loss: 14.772713661193848 - precision: 0.5580063019192208 - recall: 0.9689132056702313 - f1_score: 0.7081704989548305
epoch: 20 - loss: 15.411971092224121 - precision: 0.5589808558558559 - recall: 0.9875652822680925 - f1_score: 0.7138876404494382
epoch: 25 - loss: 15.541848182678223 - precision: 0.5594198817234581 - recall: 0.9880626709773688 - f1_score: 0.714375618088645
epoch: 30 - loss: 15.520478248596191 - precision: 0.5584415584415584 - recall: 0.9838348669485203 - f1_score: 0.7124718595227375
epoch: 35 - loss: 15.210552215576172 - precision: 0.5586576424139876 - recall: 0.9853270330763492 - f1_score: 0.7130387