In [5]:
import pandas as pd 

# 设置当前位置为项目根目录
import os
os.chdir('..')
import src.utils.preprocess as preprocess
import src.utils.graphlets as graphlets
import src.utils.shapelets as shapelets

### 对于不同大小的子图采样取值，对结果的影响

In [6]:
phishing_data_path = r'X:\Datasets\Blockchain\xblock.pro\eth-phishing-detection\original_data\open\钓鱼一阶节点'
normal_data_path = r'X:\Datasets\Blockchain\xblock.pro\eth-phishing-detection\original_data\open\非钓鱼一阶节点'

In [7]:
phishing_node_features, phishing_node_graphs, phishing_node_tx = preprocess.load_data_1d(phishing_data_path, 0, isetherscan=False)
print("phishing: ", phishing_node_features.shape, len(phishing_node_graphs), phishing_node_tx.shape)

normal_node_features, normal_node_graphs, normal_node_tx = preprocess.load_data_1d(normal_data_path, 1, isetherscan=False)
print("normal: ", normal_node_features.shape, len(normal_node_graphs), normal_node_tx.shape)

Preproocess: 0xffde23396d57e10abf58bd929bb1e856c7718218.csv, 1659/1660
phishing:  (1660, 13) 1660 (1660, 52)
Preproocess: 0xfffc21bc8b74fd9f9b86be03d7a35afdcda0f0fc.csv, 1699/1700
normal:  (1700, 13) 1700 (1700, 52)


In [20]:
num_walks = [2, 4, 8, 6, 10]
walk_length = [5, 10, 15, 20, 25, 30, 35, 40, 45, 50]
restart_prob = [0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95]

In [21]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import classification_report

In [17]:
# num_walks = 10, restart_prob = 0.8
wl_precision = []
wl_recall = []
wl_f1 = []

for wl in walk_length:
    print("wl: ", wl)
    phishing_node_graphlets = graphlets.extract_graphlets(phishing_node_graphs, 10, wl, 0.8, 0)
    normal_node_graphlets = graphlets.extract_graphlets(normal_node_graphs, 10, wl, 0.8, 1)
    print("phishing: ", len(phishing_node_graphlets), "normal: ", len(normal_node_graphlets))

    # 合并数据
    node_graphlets = pd.concat([phishing_node_graphlets, normal_node_graphlets], axis=0)
    print("node_graphlets: ", node_graphlets.shape)

    # 划分数据集
    # 去除address列
    X_train, X_test, y_train, y_test = train_test_split(node_graphlets.drop(['label', 'address'], axis=1), node_graphlets['label'], test_size=0.2, random_state=0)

    # 训练模型
    model = xgb.XGBClassifier()
    model.fit(X_train, y_train)

    # 预测
    y_pred = model.predict(X_test)

    # 评估
    res = classification_report(y_test, y_pred, digits=4, output_dict=True)
    wl_precision.append(res['0']['precision'])
    wl_recall.append(res['0']['recall'])
    wl_f1.append(res['0']['f1-score'])

    print("precision: ", res['0']['precision'])
    print("recall: ", res['0']['recall'])
    print("f1-score: ", res['0']['f1-score'])


print("wl_precision: ", wl_precision)
print("wl_recall: ", wl_recall)
print("wl_f1: ", wl_f1)

wl:  5
Process(graphlets): 0xffde23396d57e10abf58bd929bb1e856c7718218, 1659/1660
Process(graphlets): 0xfffc21bc8b74fd9f9b86be03d7a35afdcda0f0fc, 1699/1700
phishing:  1660 normal:  1700
node_graphlets:  (3360, 16)
precision:  0.7583081570996979
recall:  0.742603550295858
f1-score:  0.750373692077728
wl:  10
Process(graphlets): 0xffde23396d57e10abf58bd929bb1e856c7718218, 1659/1660
Process(graphlets): 0xfffc21bc8b74fd9f9b86be03d7a35afdcda0f0fc, 1699/1700
phishing:  1660 normal:  1700
node_graphlets:  (3360, 16)
precision:  0.7590361445783133
recall:  0.7455621301775148
f1-score:  0.7522388059701492
wl:  15
Process(graphlets): 0xffde23396d57e10abf58bd929bb1e856c7718218, 1659/1660
Process(graphlets): 0xfffc21bc8b74fd9f9b86be03d7a35afdcda0f0fc, 1699/1700
phishing:  1660 normal:  1700
node_graphlets:  (3360, 16)
precision:  0.7629179331306991
recall:  0.742603550295858
f1-score:  0.7526236881559221
wl:  20
Process(graphlets): 0xffde23396d57e10abf58bd929bb1e856c7718218, 1659/1660
Process(graph

In [22]:
# walk_length = 30, restart_prob = 0.8
nw_precision = []
nw_recall = []
nw_f1 = []

for nw in num_walks:
    print("nw: ", nw)
    phishing_node_graphlets = graphlets.extract_graphlets(phishing_node_graphs, nw, 30, 0.8, 0)
    normal_node_graphlets = graphlets.extract_graphlets(normal_node_graphs, nw, 30, 0.8, 1)
    print("phishing: ", len(phishing_node_graphlets), "normal: ", len(normal_node_graphlets))

    # 合并数据
    node_graphlets = pd.concat([phishing_node_graphlets, normal_node_graphlets], axis=0)
    print("node_graphlets: ", node_graphlets.shape)

    # 划分数据集
    # 去除address列
    X_train, X_test, y_train, y_test = train_test_split(node_graphlets.drop(['label', 'address'], axis=1), node_graphlets['label'], test_size=0.2, random_state=0)

    # 训练模型
    model = xgb.XGBClassifier()
    model.fit(X_train, y_train)

    # 预测
    y_pred = model.predict(X_test)

    # 评估
    res = classification_report(y_test, y_pred, digits=4, output_dict=True)
    nw_precision.append(res['0']['precision'])
    nw_recall.append(res['0']['recall'])
    nw_f1.append(res['0']['f1-score'])

    print("precision: ", res['0']['precision'])
    print("recall: ", res['0']['recall'])
    print("f1-score: ", res['0']['f1-score'])

print("nw_precision: ", nw_precision)
print("nw_recall: ", nw_recall)
print("nw_f1: ", nw_f1)

nw:  2
Process(graphlets): 0xffde23396d57e10abf58bd929bb1e856c7718218, 1659/1660
Process(graphlets): 0xfffc21bc8b74fd9f9b86be03d7a35afdcda0f0fc, 1699/1700
phishing:  1660 normal:  1700
node_graphlets:  (3360, 16)
precision:  0.7657657657657657
recall:  0.7544378698224852
f1-score:  0.7600596125186289
nw:  4
Process(graphlets): 0xffde23396d57e10abf58bd929bb1e856c7718218, 1659/1660
Process(graphlets): 0xfffc21bc8b74fd9f9b86be03d7a35afdcda0f0fc, 1699/1700
phishing:  1660 normal:  1700
node_graphlets:  (3360, 16)
precision:  0.7558823529411764
recall:  0.7603550295857988
f1-score:  0.7581120943952803
nw:  8
Process(graphlets): 0xffde23396d57e10abf58bd929bb1e856c7718218, 1659/1660
Process(graphlets): 0xfffc21bc8b74fd9f9b86be03d7a35afdcda0f0fc, 1699/1700
phishing:  1660 normal:  1700
node_graphlets:  (3360, 16)
precision:  0.778816199376947
recall:  0.7396449704142012
f1-score:  0.7587253414264037
nw:  6
Process(graphlets): 0xffde23396d57e10abf58bd929bb1e856c7718218, 1659/1660
Process(graphl

In [23]:
# walk_length = 30, num_walks = 10
rp_precision = []
rp_recall = []
rp_f1 = []

for rp in restart_prob:
    print("rp: ", rp)
    phishing_node_graphlets = graphlets.extract_graphlets(phishing_node_graphs, 10, 30, rp, 0)
    normal_node_graphlets = graphlets.extract_graphlets(normal_node_graphs, 10, 30, rp, 1)
    print("phishing: ", len(phishing_node_graphlets), "normal: ", len(normal_node_graphlets))

    # 合并数据
    node_graphlets = pd.concat([phishing_node_graphlets, normal_node_graphlets], axis=0)
    print("node_graphlets: ", node_graphlets.shape)

    # 划分数据集
    # 去除address列
    X_train, X_test, y_train, y_test = train_test_split(node_graphlets.drop(['label', 'address'], axis=1), node_graphlets['label'], test_size=0.2, random_state=0)

    # 训练模型
    model = xgb.XGBClassifier()
    model.fit(X_train, y_train)

    # 预测
    y_pred = model.predict(X_test)

    # 评估
    res = classification_report(y_test, y_pred, digits=4, output_dict=True)
    rp_precision.append(res['0']['precision'])
    rp_recall.append(res['0']['recall'])
    rp_f1.append(res['0']['f1-score'])

    print("precision: ", res['0']['precision'])
    print("recall: ", res['0']['recall'])
    print("f1-score: ", res['0']['f1-score'])

print("rp_precision: ", rp_precision)
print("rp_recall: ", rp_recall)
print("rp_f1: ", rp_f1)

rp:  0.5
Process(graphlets): 0xffde23396d57e10abf58bd929bb1e856c7718218, 1659/1660
Process(graphlets): 0xfffc21bc8b74fd9f9b86be03d7a35afdcda0f0fc, 1699/1700
phishing:  1660 normal:  1700
node_graphlets:  (3360, 16)
precision:  0.7791798107255521
recall:  0.7307692307692307
f1-score:  0.7541984732824427
rp:  0.55
Process(graphlets): 0xffde23396d57e10abf58bd929bb1e856c7718218, 1659/1660
Process(graphlets): 0xfffc21bc8b74fd9f9b86be03d7a35afdcda0f0fc, 1699/1700
phishing:  1660 normal:  1700
node_graphlets:  (3360, 16)
precision:  0.7626112759643917
recall:  0.7603550295857988
f1-score:  0.7614814814814815
rp:  0.6
Process(graphlets): 0xffde23396d57e10abf58bd929bb1e856c7718218, 1659/1660
Process(graphlets): 0xfffc21bc8b74fd9f9b86be03d7a35afdcda0f0fc, 1699/1700
phishing:  1660 normal:  1700
node_graphlets:  (3360, 16)
precision:  0.7643504531722054
recall:  0.7485207100591716
f1-score:  0.7563527653213752
rp:  0.65
Process(graphlets): 0xffde23396d57e10abf58bd929bb1e856c7718218, 1659/1660
Pro