### Trans2Vec
Wu J, Yuan Q, Lin D, et al. Who are the phishers? phishing scam detection on ethereum via network embedding[J]. IEEE Transactions on Systems, Man, and Cybernetics: Systems, 2020.

In [19]:
import pandas as pd 
import os 
import networkx as nx 
import warnings

warnings.filterwarnings('ignore')

In [20]:
# Current working directory of the kernel; the dataset paths in the loading
# cells below are built by appending '/../original_data/...' to this value.
pwd = os.getcwd()

In [22]:
# Convert dataset1 to the per-address graph format used by the walks below.
# `data` maps an address (the CSV file name up to its first '.') to a graph
# built from that file's From/To edge list; `label` maps the same address to
# the class id of the directory the file came from.
data = {}
label = {}

# (directory, class id) pairs, processed in this order.
dataset1_sources = (
    (pwd + '/../original_data/open/钓鱼一阶节点/', 0),
    (pwd + '/../original_data/open/非钓鱼一阶节点/', 1),
)

for folder, class_id in dataset1_sources:
    for csv_name in os.listdir(folder):
        if not csv_name.endswith('.csv'):
            continue
        transactions = pd.read_csv(folder + csv_name)
        graph = nx.from_pandas_edgelist(
            transactions,
            source='From',
            target='To',
            edge_attr=('Value', 'TimeStamp'),
        )
        address = csv_name.split('.')[0]
        data[address] = graph
        label[address] = class_id

In [3]:
# dataset2
# Build `data` (address -> transaction graph) and `label` (address -> class id)
# from the Etherscan exports, then add the second directory of the open dataset
# with class id 1.

# Divisor applied to the Etherscan 'value' column. The previous expression
# `1**18` evaluates to 1, which made the division a no-op.
# NOTE(review): assumes Etherscan values are in wei and the open dataset is in
# ether -- confirm against the raw files.
WEI_PER_ETHER = 10**18

# Etherscan column names -> the capitalised names used by the open dataset.
ETHERSCAN_COLUMNS = {'from': 'From', 'to': 'To', 'value': 'Value', 'timeStamp': 'TimeStamp'}

data = {}
label = {}
# (filename, error) for every Etherscan file that could not be converted.
skipped_files = []

etherscan_sources = (
    (pwd + '/../original_data/etherscan/1d/phish-hack/', 0),
    (pwd + '/../original_data/etherscan/1d/normal/', 1),
)

for folder, class_id in etherscan_sources:
    for filename in os.listdir(folder):
        if not filename.endswith('.csv'):
            continue
        print("Processing file: " + filename, end='\r')
        try:
            df = pd.read_csv(folder + filename)
            # Unify the column names with the open dataset.
            df = df.rename(columns=ETHERSCAN_COLUMNS)
            df['Value'] = df['Value'].astype(float) / WEI_PER_ETHER
            G = nx.from_pandas_edgelist(df, source='From', target='To', edge_attr=('Value', 'TimeStamp'))
        except Exception as exc:
            # Best effort: an unreadable or malformed file is skipped, but it is
            # recorded instead of vanishing silently. Catching Exception (not a
            # bare except) keeps the loop interruptible with Ctrl-C.
            skipped_files.append((filename, repr(exc)))
            continue
        addr = filename.split('.')[0]
        data[addr] = G
        label[addr] = class_id

# Files from the open dataset already use the capitalised column names and are
# loaded without rescaling; errors here are not suppressed.
open_normal_dir = pwd + '/../original_data/open/非钓鱼一阶节点/'
for filename in os.listdir(open_normal_dir):
    if filename.endswith('.csv'):
        print("Processing file: " + filename, end='\r')
        df = pd.read_csv(open_normal_dir + filename)
        G = nx.from_pandas_edgelist(df, source='From', target='To', edge_attr=('Value', 'TimeStamp'))
        addr = filename.split('.')[0]
        data[addr] = G
        label[addr] = 1

if skipped_files:
    print("\nSkipped %d Etherscan file(s); see `skipped_files` for details." % len(skipped_files))

Processing file: 0xfffc21bc8b74fd9f9b86be03d7a35afdcda0f0fc.csv

### 完全随机游走

In [23]:
# Number of address graphs currently held in `data`.
len(data)

3360

In [24]:
# random walk
import random
import numpy as np

def random_walk(G, walk_length, start_node=None):
    """Perform a uniform random walk on ``G`` and return the visited nodes.

    Args:
        G: Graph-like object exposing ``nodes()`` and ``neighbors(node)``.
        walk_length: Maximum number of nodes in the returned path. The start
            node is always included, even when ``walk_length`` is below 1.
        start_node: Node to start from. When ``None``, the start node is drawn
            uniformly from ``G.nodes()`` (uniform over nodes, not over edges).

    Returns:
        The visited nodes, each converted with ``str``. The list is shorter
        than ``walk_length`` when the walk reaches a node without neighbors.

    Raises:
        IndexError: If ``start_node`` is ``None`` and ``G`` has no nodes
            (``random.choice`` on an empty sequence).
    """
    # Compare against None explicitly: a plain truthiness test would discard
    # valid but falsy node ids such as 0 or '' and start somewhere random.
    if start_node is not None:
        path = [start_node]
    else:
        path = [random.choice(list(G.nodes()))]

    while len(path) < walk_length:
        # Materialise the neighbor list once per step.
        neighbors = list(G.neighbors(path[-1]))
        if not neighbors:
            break
        path.append(random.choice(neighbors))

    return [str(node) for node in path]

In [25]:
# Generate one uniform random walk (at most 20 nodes, random start node)
# per address graph, in the iteration order of `data`.
walks = [random_walk(data[addr], 20) for addr in data]


In [26]:
# word2vec
from gensim.models import Word2Vec

# Skip-gram (sg=1) embeddings trained on the walks; min_count=0 keeps every node.
embedding_dim = 64
model = Word2Vec(sentences=walks, vector_size=embedding_dim, window=2, min_count=0, sg=1, workers=4)

# One feature vector per walk: the sum of the embeddings of the nodes it visited.
node_vectors = model.wv
features = []
for walk_tokens in walks:
    walk_vector = np.zeros(embedding_dim)
    for token in walk_tokens:
        # Accumulate in place into the float64 buffer, one node at a time.
        np.add(walk_vector, node_vectors[token], out=walk_vector)
    features.append(walk_vector)

In [27]:
# Sanity check: `features` holds one vector per walk.
print(len(features))

3360


In [28]:
# classification
from sklearn.model_selection import train_test_split

# Class ids in the insertion order of `label`; hold out 20% with a fixed seed.
targets = list(label.values())
X_train, X_test, y_train, y_test = train_test_split(
    features,
    targets,
    test_size=0.2,
    random_state=42,
)


In [29]:
# xgboost
import xgboost as xgb
from sklearn.metrics import classification_report

# Fit a classifier with default hyper-parameters on the training split.
clf = xgb.XGBClassifier()
clf.fit(X_train, y_train)

# Predict the held-out split and print per-class metrics to four decimals.
y_pred = clf.predict(X_test)
report = classification_report(y_test, y_pred, digits=4)
print(report)


              precision    recall  f1-score   support

           0     0.6510    0.6676    0.6592       352
           1     0.6238    0.6062    0.6149       320

    accuracy                         0.6384       672
   macro avg     0.6374    0.6369    0.6370       672
weighted avg     0.6380    0.6384    0.6381       672



### 按照timestamp和value偏向随机游走

In [30]:
# random walk

import random

def baised_random_walk(G, walk_length, start_node=None, is_baised=True, baised_attr=('TimeStamp', 'Value'), baised_alpha=(0.5, 0.5)):
    """Walk on ``G``, optionally steering each step by two edge attributes.

    The function and parameter names keep their original spelling so that
    existing callers continue to work.

    Args:
        G: Graph-like object exposing ``nodes()``, ``neighbors(node)`` and
            ``G[u][v][attr]`` edge-attribute lookup.
        walk_length: Maximum number of nodes in the returned path. The start
            node is always included, even when ``walk_length`` is below 1.
        start_node: Node to start from. When ``None``, the start node is drawn
            uniformly from ``G.nodes()`` (uniform over nodes, not over edges).
        is_baised: When true, each step moves to the neighbor whose edge has
            the highest weighted score; when false, the neighbor is chosen
            uniformly at random.
        baised_attr: Names of the two edge attributes combined into the score.
        baised_alpha: Weights applied to the two attributes, in the same order.

    Returns:
        The visited nodes, each converted with ``str``. The list is shorter
        than ``walk_length`` when the walk reaches a node without neighbors.

    Raises:
        IndexError: If ``start_node`` is ``None`` and ``G`` has no nodes
            (``random.choice`` on an empty sequence).
    """
    # Compare against None explicitly: a plain truthiness test would discard
    # valid but falsy node ids such as 0 or '' and start somewhere random.
    if start_node is not None:
        path = [start_node]
    else:
        path = [random.choice(list(G.nodes()))]

    first_attr, second_attr = baised_attr[0], baised_attr[1]
    first_weight, second_weight = baised_alpha[0], baised_alpha[1]

    while len(path) < walk_length:
        cur = path[-1]
        # Materialise the neighbor list once per step.
        neighbors = list(G.neighbors(cur))
        if not neighbors:
            break
        if is_baised:
            # The biased step is greedy, not sampled: np.argmax picks the
            # highest score and resolves ties in favour of the first neighbor.
            # NOTE(review): on an undirected graph the best edge out of the new
            # node may be the edge just taken, so the walk can bounce between
            # two nodes -- confirm this is intended.
            scores = [
                G[cur][n][first_attr] * first_weight + G[cur][n][second_attr] * second_weight
                for n in neighbors
            ]
            path.append(neighbors[np.argmax(scores)])
        else:
            path.append(random.choice(neighbors))

    return [str(node) for node in path]

In [31]:
# Generate one attribute-steered walk (at most 20 nodes, default attributes
# and weights, random start node) per address graph, in `data` order.
walks = [baised_random_walk(data[addr], 20) for addr in data]

In [32]:
# word2vec
from gensim.models import Word2Vec

# Skip-gram (sg=1) embeddings trained on the walks; min_count=0 keeps every node.
embedding_dim = 64
model = Word2Vec(sentences=walks, vector_size=embedding_dim, window=10, min_count=0, sg=1, workers=4)

# One feature vector per walk: the sum of the embeddings of the nodes it visited.
node_vectors = model.wv
features = []
for walk_tokens in walks:
    walk_vector = np.zeros(embedding_dim)
    for token in walk_tokens:
        # Accumulate in place into the float64 buffer, one node at a time.
        np.add(walk_vector, node_vectors[token], out=walk_vector)
    features.append(walk_vector)

In [33]:
# classification
from sklearn.model_selection import train_test_split

# Class ids in the insertion order of `label`; hold out 20% with a fixed seed.
targets = list(label.values())
X_train, X_test, y_train, y_test = train_test_split(
    features,
    targets,
    test_size=0.2,
    random_state=42,
)

In [34]:
# xgboost
import xgboost as xgb
from sklearn.metrics import classification_report

# Fit a classifier with default hyper-parameters on the training split.
clf = xgb.XGBClassifier()
clf.fit(X_train, y_train)

# Predict the held-out split and print per-class metrics to four decimals.
y_pred = clf.predict(X_test)
report = classification_report(y_test, y_pred, digits=4)
print(report)

              precision    recall  f1-score   support

           0     0.8404    0.7926    0.8158       352
           1     0.7853    0.8344    0.8091       320

    accuracy                         0.8125       672
   macro avg     0.8128    0.8135    0.8124       672
weighted avg     0.8141    0.8125    0.8126       672

