### Trans2Vec
Wu J, Yuan Q, Lin D, et al. Who are the phishers? phishing scam detection on ethereum via network embedding[J]. IEEE Transactions on Systems, Man, and Cybernetics: Systems, 2020.

In [168]:
import pandas as pd 
import os 
import networkx as nx 
import warnings

warnings.filterwarnings('ignore')

In [169]:
# Current working directory; the data paths used below are built relative to it.
pwd = os.getcwd()

In [201]:
# Convert the raw per-address transaction CSVs into the structures used below:
#   data[addr]  -> graph built from that address's transaction edge list
#   label[addr] -> class id of the directory the CSV was found in
data = {}
label = {}

# (directory, class id) pairs.
# 钓鱼一阶节点 = phishing first-order nodes      -> 0
# 非钓鱼一阶节点 = non-phishing first-order nodes -> 1
labelled_dirs = (
    (os.path.join(pwd, '..', 'original_data', '钓鱼一阶节点'), 0),
    (os.path.join(pwd, '..', 'original_data', '非钓鱼一阶节点'), 1),
)

for data_dir, class_id in labelled_dirs:
    # os.listdir returns entries in arbitrary order; sorting keeps the dict
    # insertion order (and therefore the row order of features/labels fed to
    # the train/test split) reproducible across runs and machines.
    for filename in sorted(os.listdir(data_dir)):
        if not filename.endswith('.csv'):
            continue
        df = pd.read_csv(os.path.join(data_dir, filename))
        G = nx.from_pandas_edgelist(df, source='From', target='To', edge_attr=('Value', 'TimeStamp'))
        # Strip only the trailing extension so a file name containing extra
        # dots is not truncated at its first dot.
        addr = os.path.splitext(filename)[0]
        data[addr] = G
        label[addr] = class_id

### 完全随机游走

In [207]:
# random walk
import random
import numpy as np

def random_walk(G, walk_length, start_node=None):
    """Perform an unbiased random walk on ``G`` and return the visited nodes.

    Parameters
    ----------
    G : graph-like
        Object exposing ``nodes()`` and ``neighbors(node)``.
    walk_length : int
        Maximum number of nodes in the returned walk, start node included.
    start_node : hashable, optional
        Node to start from. When ``None`` the start node is drawn uniformly
        at random from ``G.nodes()`` (uniform over nodes, not over edges).

    Returns
    -------
    list of str
        The visited nodes, each converted with ``str``. The list is shorter
        than ``walk_length`` when the walk reaches a node with no neighbors.
    """
    # Compare against None explicitly: a truthiness test would discard
    # falsy-but-valid node ids such as 0 or "" and pick a random start instead.
    if start_node is not None:
        path = [start_node]
    else:
        path = [random.choice(list(G.nodes()))]

    while len(path) < walk_length:
        # Materialise the neighbor list once per step.
        neighbors = list(G.neighbors(path[-1]))
        if not neighbors:
            break  # dead end: stop early
        path.append(random.choice(neighbors))

    return [str(node) for node in path]

In [222]:
# Build the corpus: one uniform random walk (at most 20 nodes, random start
# node) for every address graph, in the iteration order of `data`.
walks = []

for addr, G in data.items():
    walks.append(random_walk(G, 20))


In [223]:
# Skip-gram Word2Vec over the walks, then one feature vector per walk
from gensim.models import Word2Vec

model = Word2Vec(
    walks,
    vector_size=64,
    window=2,
    min_count=0,
    sg=1,
    workers=4,
)

# A walk's feature is the element-wise sum of the embeddings of its nodes
# (a node visited several times contributes once per visit).
features = []
for walk_tokens in walks:
    walk_vector = np.zeros(64)
    for token in walk_tokens:
        walk_vector += model.wv[token]
    features.append(walk_vector)

In [224]:
# Sanity check: number of feature vectors built so far (one is appended per walk).
print(len(features))

3360


In [225]:
# Hold out 20% of the samples for evaluation (fixed split seed).
# Labels are read from `label` in insertion order; this relies on `label`
# and `data` having been filled together so rows line up with `features`.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    features,
    list(label.values()),
    test_size=0.2,
    random_state=42,
)


In [227]:
# Fit an XGBoost classifier with default hyper-parameters and report
# per-class precision / recall / F1 on the held-out split.
import xgboost as xgb
from sklearn.metrics import classification_report

clf = xgb.XGBClassifier()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.63      0.63      0.63       352
           1       0.59      0.59      0.59       320

    accuracy                           0.61       672
   macro avg       0.61      0.61      0.61       672
weighted avg       0.61      0.61      0.61       672



### 按照timestamp偏向随机游走

In [247]:
# random walk

import random

def baised_random_walk(G, walk_length, start_node=None, is_baised=True, baised_attr=('TimeStamp', 'Value'), baised_alpha=(0.5, 0.5)):
    """Walk on ``G``, steering each step by a weighted sum of two edge attributes.

    Parameters
    ----------
    G : graph-like
        Object exposing ``nodes()``, ``neighbors(node)`` and
        ``G[u][v][attr]`` edge-attribute lookup.
    walk_length : int
        Maximum number of nodes in the returned walk, start node included.
    start_node : hashable, optional
        Node to start from. When ``None`` the start node is drawn uniformly
        at random from ``G.nodes()``.
    is_baised : bool
        When true, each step moves to the neighbor whose edge score is
        largest; when false, the next node is chosen uniformly at random.
    baised_attr : sequence of two str
        Names of the two edge attributes combined into the score.
    baised_alpha : sequence of two numbers
        Weights applied to ``baised_attr[0]`` and ``baised_attr[1]``.

    Returns
    -------
    list of str
        The visited nodes, each converted with ``str``. The list is shorter
        than ``walk_length`` when the walk reaches a node with no neighbors.

    Notes
    -----
    The biased step is greedy (``argmax``), so after the start node the walk
    is fully deterministic. The two attributes are combined on their raw
    scales, without normalisation.
    NOTE(review): on an undirected graph a greedy step may bounce back and
    forth along a locally heaviest edge; sampling neighbors with probability
    proportional to the score may be what was intended -- confirm.
    """
    # Compare against None explicitly: a truthiness test would discard
    # falsy-but-valid node ids such as 0 or "" and pick a random start instead.
    if start_node is not None:
        path = [start_node]
    else:
        path = [random.choice(list(G.nodes()))]

    while len(path) < walk_length:
        cur = path[-1]
        # Materialise the neighbor list once per step.
        neighbors = list(G.neighbors(cur))
        if not neighbors:
            break  # dead end: stop early

        if is_baised:
            # Score of edge (cur, n) = alpha0 * attr0 + alpha1 * attr1.
            scores = [
                G[cur][n][baised_attr[0]] * baised_alpha[0]
                + G[cur][n][baised_attr[1]] * baised_alpha[1]
                for n in neighbors
            ]
            # argmax returns the first maximum, so ties go to the neighbor
            # that appears first in the neighbor iteration order.
            path.append(neighbors[np.argmax(scores)])
        else:
            path.append(random.choice(neighbors))

    return [str(node) for node in path]

In [248]:
# Build the corpus: one attribute-biased walk (at most 20 nodes, random start
# node, default bias settings) for every address graph, in the iteration
# order of `data`.
walks = []

for addr, G in data.items():
    walks.append(baised_random_walk(G, 20))

In [253]:
# Skip-gram Word2Vec over the biased walks, then one feature vector per walk
from gensim.models import Word2Vec

model = Word2Vec(
    walks,
    vector_size=64,
    window=10,
    min_count=0,
    sg=1,
    workers=4,
)

# A walk's feature is the element-wise sum of the embeddings of its nodes
# (a node visited several times contributes once per visit).
features = []
for walk_tokens in walks:
    walk_vector = np.zeros(64)
    for token in walk_tokens:
        walk_vector += model.wv[token]
    features.append(walk_vector)

In [254]:
# Hold out 20% of the samples for evaluation (fixed split seed).
# Labels are read from `label` in insertion order; this relies on `label`
# and `data` having been filled together so rows line up with `features`.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    features,
    list(label.values()),
    test_size=0.2,
    random_state=42,
)

In [255]:
# Fit an XGBoost classifier with default hyper-parameters and report
# per-class precision / recall / F1 on the held-out split.
import xgboost as xgb
from sklearn.metrics import classification_report

clf = xgb.XGBClassifier()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.85      0.81      0.83       352
           1       0.80      0.84      0.82       320

    accuracy                           0.83       672
   macro avg       0.83      0.83      0.83       672
weighted avg       0.83      0.83      0.83       672

