### GCN for ethereum phishing detection
M. Weber, G. Domeniconi, J. Chen, D. K. I. Weidele, C. Bellei, T. Robinson, C. E. Leiserson, "Anti-Money Laundering in Bitcoin: Experimenting with Graph Convolutional Networks for Financial Forensics", KDD ’19 Workshop on Anomaly Detection in Finance, August 2019, Anchorage, AK, USA.

https://www.kaggle.com/code/divyareddyyeruva/elliptic-gcn-pyg/notebook


In [8]:
import pandas as pd 
import os 
import warnings
import networkx as nx 

warnings.filterwarnings('ignore')

In [9]:
pwd = os.getcwd()

In [10]:
# 将边权转化为点权，只获取节点的度和余额

node_feature = pd.DataFrame()

for filename in os.listdir(pwd + '/../original_data/open/钓鱼一阶节点/'):
    if filename.endswith('.csv'):
        df = pd.read_csv(pwd + '/../original_data/open/钓鱼一阶节点/' + filename)

        # node feature
        features = {}
        features['address'] = filename.split('.')[0]
        features['value_out'] = df[df['From'] == filename.split('.')[0]]['Value'].sum()
        features['value_in'] = df[df['To'] == filename.split('.')[0]]['Value'].sum()
        features['balance'] = features['value_out'] - features['value_in']
        features['degree'] = len(df)
        features['label'] = 0

        node_feature = node_feature.append(features, ignore_index=True)

         

for filename in os.listdir(pwd + '/../original_data/open/非钓鱼一阶节点/'):
    if filename.endswith('.csv'):
        df = pd.read_csv(pwd + '/../original_data/open/非钓鱼一阶节点/' + filename)

        # node feature
        features = {}
        features['address'] = filename.split('.')[0]
        features['value_out'] = df[df['From'] == filename.split('.')[0]]['Value'].sum()
        features['value_in'] = df[df['To'] == filename.split('.')[0]]['Value'].sum()
        features['balance'] = features['value_out'] - features['value_in']
        features['degree'] = len(df)
        features['label'] = 1

        node_feature = node_feature.append(features, ignore_index=True)


In [11]:
node_feature.head()

Unnamed: 0,address,value_out,value_in,balance,degree,label
0,0x002bf459dc58584d58886169ea0e80f3ca95ffaf,1.532693,1.533785,-0.001092,4,0
1,0x002f0c8119c16d310342d869ca8bf6ace34d9c39,1.317896,1.31882,-0.000924,5,0
2,0x0059b14e35dab1b4eee1e2926c7a5660da66f747,38.133767,37.556046,0.577721,101,0
3,0x0061fb5485dff4bb85c078dca80d19119224d97e,1.0,5.045869,-4.045869,12,0
4,0x00650a784ee109797ba8c05f3496bdbf2b2b7a1c,0.6793,0.68,-0.0007,3,0


In [12]:
edgelist = []

# 二阶节点特征
for floderename in os.listdir(pwd + '/../original_data/open/钓鱼二阶节点/'):
    if floderename == '.DS_Store':
        continue
    print("Process Phishing: ", floderename)
    for filename in os.listdir(pwd + '/../original_data/open/钓鱼二阶节点/' + floderename):
        if filename.endswith('.csv'):
            df = pd.read_csv(pwd + '/../original_data/open/钓鱼二阶节点/' + floderename + '/' + filename)

            # node feature
            features = {}
            features['address'] = filename.split('.')[0]
            features['value_out'] = df[df['From'] == filename.split('.')[0]]['Value'].sum()
            features['value_in'] = df[df['To'] == filename.split('.')[0]]['Value'].sum()
            features['balance'] = features['value_out'] - features['value_in']
            features['degree'] = len(df)
            features['label'] = 'unknown'

            node_feature = node_feature.append(features, ignore_index=True)
            edgelist.append((floderename.split('.')[0], filename.split('.')[0]))

for floderename in os.listdir(pwd + '/../original_data/open/非钓鱼二阶节点/'):
    if floderename == '.DS_Store':
        continue
    print("Process Normal: ", floderename)
    for filename in os.listdir(pwd + '/../original_data/open/非钓鱼二阶节点/' + floderename):
        if filename.endswith('.csv'):
            df = pd.read_csv(pwd + '/../original_data/open/非钓鱼二阶节点/' + floderename + '/' + filename)

            # node feature
            features = {}
            features['address'] = filename.split('.')[0]
            features['value_out'] = df[df['From'] == filename.split('.')[0]]['Value'].sum()
            features['value_in'] = df[df['To'] == filename.split('.')[0]]['Value'].sum()
            features['balance'] = features['value_out'] - features['value_in']
            features['degree'] = len(df)
            features['label'] = 'unknown'

            node_feature = node_feature.append(features, ignore_index=True)
            edgelist.append((floderename.split('.')[0], filename.split('.')[0]))

Process Phishing:  0x002bf459dc58584d58886169ea0e80f3ca95ffaf
Process Phishing:  0x002f0c8119c16d310342d869ca8bf6ace34d9c39
Process Phishing:  0x0059b14e35dab1b4eee1e2926c7a5660da66f747
Process Phishing:  0x0061fb5485dff4bb85c078dca80d19119224d97e
Process Phishing:  0x00650a784ee109797ba8c05f3496bdbf2b2b7a1c
Process Phishing:  0x0084515449b037205a33d6d3940a5684126aa4b5
Process Phishing:  0x00c33c49f9a2a920e3f3787204cbda9012d1912e
Process Phishing:  0x00eb6f5199cd0b671da371969b1a0f948e982fea
Process Phishing:  0x0128282ce73c72decabaeace9358344adff449fe
Process Phishing:  0x015c0e438b3a01511b98d928bd031d3dc50abb9e
Process Phishing:  0x015fca1d09287823e634ae51237ee1eeff03d9d5
Process Phishing:  0x0167409e6106ec3e3f05a09fcf04606918d21ad5
Process Phishing:  0x0177eb92b752fa0715ee0dce1d860eaf739b5cf4
Process Phishing:  0x017f86b90a46d8fd999eaefda1339355b98da12f
Process Phishing:  0x018f2f4b3c9e38607aa7ab4dea23e9d663e3e050
Process Phishing:  0x020b1573f2ca670190d33ca2f0a57b0c0399ad37
Process 

In [13]:
node_feature.head()

Unnamed: 0,address,value_out,value_in,balance,degree,label
0,0x002bf459dc58584d58886169ea0e80f3ca95ffaf,1.532693,1.533785,-0.001092,4,0
1,0x002f0c8119c16d310342d869ca8bf6ace34d9c39,1.317896,1.31882,-0.000924,5,0
2,0x0059b14e35dab1b4eee1e2926c7a5660da66f747,38.133767,37.556046,0.577721,101,0
3,0x0061fb5485dff4bb85c078dca80d19119224d97e,1.0,5.045869,-4.045869,12,0
4,0x00650a784ee109797ba8c05f3496bdbf2b2b7a1c,0.6793,0.68,-0.0007,3,0


In [14]:
# 保存节点特征
node_feature.to_csv(pwd + '/../original_data/open/node_feature.csv', index=False)

In [30]:
import numpy as np
# 将edgelist转化为dataframe
edgelists = pd.DataFrame(edgelist, columns=['addr1', 'addr2'])
edgelists.head()

Unnamed: 0,addr1,addr2
0,0x0061fb5485dff4bb85c078dca80d19119224d97e,0x0e2b7403c55e5cdb132d3bb968e7e8316da2b1ec
1,0x0061fb5485dff4bb85c078dca80d19119224d97e,0x294f24da5c42d127c6e875ba4acc096bc779f5b5
2,0x0061fb5485dff4bb85c078dca80d19119224d97e,0x59896b17c44101d68681ee52c45fe0c2da770d73
3,0x0061fb5485dff4bb85c078dca80d19119224d97e,0x701efa4f52300bfea9217fb7d7831d86d807a3af
4,0x0061fb5485dff4bb85c078dca80d19119224d97e,0x841220262567a9b472651e4f188a45ef271d98f1


In [31]:
# 保存edgelist
edgelists.to_csv(pwd + '/../original_data/open/edgelist.csv', index=False)

In [32]:
# GCN to extract spatial features

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import GCNConv

In [36]:
# 使用GCN提取节点特征

class Net(torch.nn.Module):
    def __init__(self, in_channels, out_channels):
        super(Net, self).__init__()
        self.conv1 = GCNConv(in_channels, 16)
        self.conv2 = GCNConv(16, out_channels)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index

        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, training=self.training)
        x = self.conv2(x, edge_index)

        return F.log_softmax(x, dim=1)

In [50]:
import torch
import torch.nn.functional as F
from torch_geometric.data import Data
from torch_geometric.data import DataLoader

In [37]:
# 获取label为0或1的节点
classified_node = node_feature[node_feature['label'] != 'unknown']

In [39]:
# all nodes
all_nodes = node_feature['address'].tolist()
map_id = {j: i for i, j in enumerate(all_nodes)}

edges = edgelists.copy()
edges.addr1 = edges.addr1.map(map_id)
edges.addr2 = edges.addr2.map(map_id)
edges.astype(int)

edge_index = np.array(edges).T.tolist()

edge_index = torch.tensor(edge_index, dtype=torch.long).contiguous()
weights = torch.tensor([1] * len(edge_index[0]), dtype=torch.float)
print(edge_index.shape)
print(weights.shape)


torch.Size([2, 92982])
torch.Size([92982])


In [60]:
# maping txIds to corresponding indexes, to pass node features to the model
addr_features = node_feature.copy()

# 将address转化为id
addr_features.address = addr_features.address.map(map_id)

classified_index = addr_features[addr_features['label'] != 'unknown']['address'].tolist()
unclassified_index = addr_features[addr_features['label'] == 'unknown']['address'].tolist()

# repalce label 'unknown' with 1
addr_features['label'] = addr_features['label'].replace('unknown', 1)

In [61]:
labels = addr_features['label'].values

addr_features_tensor = torch.tensor(addr_features.drop(['address', 'label'], axis=1).values, dtype=torch.float)

data_train = Data(x=addr_features_tensor, edge_index=edge_index, y=torch.tensor(labels, dtype=torch.long))

In [64]:
data_train

Data(x=[96342, 4], edge_index=[2, 92982], y=[96342])

In [68]:
data_train.x[classified_index].shape

torch.Size([3360, 4])

In [74]:
from sklearn.model_selection import train_test_split

# 划分训练集和测试集
x_train, x_val, y_train, y_val, train_idx, val_idx = train_test_split(data_train.x[classified_index], data_train.y[classified_index], classified_index, test_size=0.2, random_state=42, stratify=data_train.y[classified_index])


In [75]:
data_train.y[classified_index].sum()

tensor(1980)

In [96]:
import torch.nn.functional as F
from sklearn.metrics import roc_auc_score
class Net(torch.nn.Module):
    def __init__(self, in_channels, out_channels):
        super(Net, self).__init__()
        self.conv1 = GCNConv(in_channels, 16)
        self.conv2 = GCNConv(16, 16)
        self.conv3 = GCNConv(32, 32)
        self.conv4 = GCNConv(16, out_channels)

    def forward(self, data, adj=None):
        x, edge_index = data.x, data.edge_index
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, p=0.1, training=self.training)
        # print(x.shape, edge_index.shape)
        x = self.conv4(x, edge_index)

        return F.sigmoid(x)

In [100]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = Net(4, 1).to(device)
# model = Net().to(device)
model.float()
data_train = data_train.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=1e-5)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min')
criterion = torch.nn.BCELoss()

model.train()
for epoch in range(200):
    optimizer.zero_grad()
    out = model(data_train)
    # print(out.shape)

    # reshape the output to match the shape of the label
    loss = criterion(out[classified_index].reshape(-1), data_train.y[classified_index].float())

    loss.backward()
    optimizer.step()
    scheduler.step(loss)

    auc = roc_auc_score(data_train.y.detach().cpu().numpy()[train_idx], out.detach().cpu().numpy()[train_idx]) #[train_idx]
    # loss.backward()
    # optimizer.step()
    if epoch%5 == 0:
      print("epoch: {} - loss: {} - roc: {}".format(epoch, loss.item(), auc))

epoch: 0 - loss: 47.045318603515625 - roc: 0.1740022507685551
epoch: 5 - loss: 17.605337142944336 - roc: 0.6841887511894306
epoch: 10 - loss: 7.930608749389648 - roc: 0.877396302243449
epoch: 15 - loss: 6.2713117599487305 - roc: 0.8921255123700775
epoch: 20 - loss: 4.764793872833252 - roc: 0.8982233453191334
epoch: 25 - loss: 3.926542043685913 - roc: 0.9033976540770019
epoch: 30 - loss: 3.5580403804779053 - roc: 0.9088324366856975
epoch: 35 - loss: 3.6461384296417236 - roc: 0.9092047055701946
epoch: 40 - loss: 3.7589292526245117 - roc: 0.9054299791392184
epoch: 45 - loss: 3.280167818069458 - roc: 0.9118969358439467
epoch: 50 - loss: 3.03861665725708 - roc: 0.9137176795125165
epoch: 55 - loss: 3.101026773452759 - roc: 0.9102703324000878
epoch: 60 - loss: 3.022524833679199 - roc: 0.9099844115978627
epoch: 65 - loss: 3.164541482925415 - roc: 0.9066863151441955
epoch: 70 - loss: 2.7233500480651855 - roc: 0.9144024598338457
epoch: 75 - loss: 2.635331153869629 - roc: 0.9276688991362906
epoch