### Trans2Vec
Wu J, Yuan Q, Lin D, et al. Who are the phishers? phishing scam detection on ethereum via network embedding[J]. IEEE Transactions on Systems, Man, and Cybernetics: Systems, 2020.

In [112]:
import pandas as pd 
import os 
import networkx as nx 
import warnings

warnings.filterwarnings('ignore')

In [113]:
pwd = os.getcwd()

In [114]:
# 带重启的随机游走，有概率返回到起点，有概率继续游走, 当游走到终点时，返回到起点, 可以反向游走
import random

def random_walk(G, start, walk_length, restart_prob, timestamp_alpha, value_alpha):
    walk = [start]
    walk_edge = []
    while len(walk) < walk_length:
        cur = walk[-1]
        # 有概率返回到起点
        if random.random() < restart_prob:
            walk.append(random.choice(list(G.nodes())))
            walk_edge.append(0)
        else:
            # 有概率继续游走，timestamp_alpha控制时间因素，value_alpha控制价值因素, 两者都是0-1之间的数
            # 两者都为0时，就是普通的随机游走
            # 两者都为1时，就是只考虑时间因素或者价值因素
            if timestamp_alpha == 0 and value_alpha == 0:
                walk.append(random.choice(list(G.neighbors(cur))))
                walk_edge.append(G.get_edge_data(cur, walk[-1])['Value'])
            else:
                # 计算当前节点的邻居节点的权重
                neighbor_weights = []
                for neighbor in G.neighbors(cur):
                    edge = G.get_edge_data(cur, neighbor)
                    # 时间因素
                    timestamp_weight = 1 / (1 + edge['TimeStamp'])
                    # 价值因素
                    value_weight = edge['Value']
                    # 两者加权求和
                    neighbor_weight = timestamp_alpha * timestamp_weight + value_alpha * value_weight
                    neighbor_weights.append(neighbor_weight)
                # 根据权重随机选择一个邻居节点
                walk.append(random.choices(list(G.neighbors(cur)), weights=neighbor_weights)[0])
                walk_edge.append(G.get_edge_data(cur, walk[-1])['Value'])


    # print(walk_edge, len(walk_edge))
    return walk_edge, walk

In [115]:
# load data
# phishing data
embedding_features = pd.DataFrame()

for filename in os.listdir(pwd + '/../origin_data/钓鱼一阶节点/'):
    if filename.endswith('.csv'):
        phishing = pd.read_csv(pwd + '/../origin_data/钓鱼一阶节点/' + filename)
        G = nx.from_pandas_edgelist(phishing, source='From', target='To', edge_attr=['TimeStamp', 'Value'])
        # 转为无向图
        # G = G.to_undirected()
        # random walk
        start = random.choice(list(G.nodes()))
        walk_edge, _ = random_walk(G, start, 100, 0.8, 0, 0)
        embedding_features = embedding_features.append(pd.DataFrame(walk_edge).T)
        embedding_features['label'] = 1


embedding_features = embedding_features.reset_index(drop=True)

In [116]:
embedding_features.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,label
0,0.585408,0.0,0.0,0.585408,0.0,0.947284,0.0,0.0,0.0,0.0,...,0.0,0.585408,0.0,0.0,0.585408,0.0,0.0,0.585408,0.0,1
1,0.0,0.0,0.8178,0.0,0.8178,0.0,0.0,0.0,0.0,0.8178,...,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,1
2,0.000784,0.0,0.0,0.0,0.0,0.0,0.0,0.433767,0.0,0.0,...,0.007079,36.7,36.7,0.0,0.0,0.0,0.0,0.0,0.0,1
3,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,...,0.5,0.0,0.0,0.0,0.1,0.0,0.0,0.0,0.0,1
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.676,1


In [117]:
# load data
# phishing data
# embedding_features = pd.DataFrame()
embedding_features_normal = pd.DataFrame()

for filename in os.listdir(pwd + '/../origin_data/非钓鱼一阶节点/'):
    if filename.endswith('.csv'):
        phishing = pd.read_csv(pwd + '/../origin_data/非钓鱼一阶节点/' + filename)
        G = nx.from_pandas_edgelist(phishing, source='From', target='To', edge_attr=['TimeStamp', 'Value'])
        # 转为无向图
        # G = G.to_undirected()
        # random walk
        start = random.choice(list(G.nodes()))
        walk_edge, _ = random_walk(G, start, 100, 0.8, 0, 0)
        embedding_features_normal = embedding_features.append(pd.DataFrame(walk_edge).T)
        # 添加label
        embedding_features_normal['label'] = 0

embedding_features_normal = embedding_features_normal.reset_index(drop=True)

In [118]:
embedding_features.tail()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,label
1655,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,500.0,...,0.0,0.0,0.0,499.99874,0.0,0.0,0.0,0.0,3.236475,1
1656,0.0,0.0,0.0,0.0,0.0,15.375782,0.0,0.0,0.0,0.5,...,0.0,0.0,0.0,0.506303,0.0,1.119755,0.0,0.0,0.0,1
1657,0.0,0.0,0.0,0.0,19.999916,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1658,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.32856,...,0.0,0.055,0.0,21.31,0.5,0.0,0.0,0.0,0.0,1
1659,0.5,0.0,0.5,0.5,0.0,0.5,0.5,0.5,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,1


In [119]:
# merge
embedding_features = embedding_features.append(embedding_features_normal)

In [120]:
# 划分数据集
from sklearn.model_selection import train_test_split

X = embedding_features.iloc[:, :-1]
y = embedding_features.iloc[:, -1]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [121]:
# xgboost
import xgboost as xgb

xgb_model = xgb.XGBClassifier()

xgb_model.fit(X_train, y_train)

y_pred = xgb_model.predict(X_test)

from sklearn.metrics import classification_report

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.13      0.15      0.14       313
           1       0.09      0.07      0.08       352

    accuracy                           0.11       665
   macro avg       0.11      0.11      0.11       665
weighted avg       0.10      0.11      0.11       665

