In [1]:
import pandas as pd 
import os 

In [2]:
pwd = os.getcwd()

In [3]:
# 获取地址的统计特征

def get_address_feature(df, address, label):
    # 获取地址的统计特征
    # df: 数据集
    # address: 地址
    # lable: 标签
    # return: 返回数据集
    result = {}
    result['address'] = address
    # 作为发送者的发送金额
    result['send_amount'] = df[df['From'] == address]['Value'].sum()
    # 作为接收者的接收金额
    result['receive_amount'] = df[df['To'] == address]['Value'].sum()
    # 作为发送者的发送次数
    result['send_count'] = df[df['From'] == address]['Value'].count()
    # 作为接收者的接收次数
    result['receive_count'] = df[df['To'] == address]['Value'].count()
    # 作为发送者的发送次数所占百分比
    result['send_count_ratio'] = result['send_count'] / len(df)
    # 作为接收者的接收次数所占百分比
    result['receive_count_ratio'] = result['receive_count'] / len(df)
    # 作为发送者的发送金额所占百分比
    result['send_amount_ratio'] = result['send_amount'] / df['Value'].sum()
    # 作为接收者的接收金额所占百分比
    result['receive_amount_ratio'] = result['receive_amount'] / df['Value'].sum()
    # 作为发送者的发送金额的平均值
    result['send_amount_mean'] = df[df['From'] == address]['Value'].mean()
    # 作为接收者的接收金额的平均值
    result['receive_amount_mean'] = df[df['To'] == address]['Value'].mean()
    # 作为发送者的发送金额的中位数
    result['send_amount_median'] = df[df['From'] == address]['Value'].median()
    # 作为接收者的接收金额的中位数
    result['receive_amount_median'] = df[df['To'] == address]['Value'].median()
    # 作为发送者的发送金额的最大值
    result['send_amount_max'] = df[df['From'] == address]['Value'].max()
    # 作为接收者的接收金额的最大值
    result['receive_amount_max'] = df[df['To'] == address]['Value'].max()
    # 作为发送者的发送金额的最小值
    result['send_amount_min'] = df[df['From'] == address]['Value'].min()
    # 作为接收者的接收金额的最小值
    result['receive_amount_min'] = df[df['To'] == address]['Value'].min()
    # 作为发送者的发送金额的标准差
    result['send_amount_std'] = df[df['From'] == address]['Value'].std()
    # 作为接收者的接收金额的标准差
    result['receive_amount_std'] = df[df['To'] == address]['Value'].std()
    # 作为发送者的发送金额四分位数
    result['send_amount_q1'] = df[df['From'] == address]['Value'].quantile(0.25)
    result['send_amount_q2'] = df[df['From'] == address]['Value'].quantile(0.5)
    result['send_amount_q3'] = df[df['From'] == address]['Value'].quantile(0.75)
    # 作为接收者的接收金额四分位数
    result['receive_amount_q1'] = df[df['To'] == address]['Value'].quantile(0.25)
    result['receive_amount_q2'] = df[df['To'] == address]['Value'].quantile(0.5)
    result['receive_amount_q3'] = df[df['To'] == address]['Value'].quantile(0.75)
    # 余额
    result['balance'] = result['receive_amount'] - result['send_amount']
    result['label'] = label
    # print(result)
    return pd.DataFrame(result, index=[0])

In [4]:

count = 0
phish_data = {}
statistics_data = pd.DataFrame()

for file in os.listdir(pwd + "/钓鱼一阶节点/"):
    if file.endswith(".csv"):
        df = pd.read_csv(pwd + "/钓鱼一阶节点/" + file)
        if len(df) > 5 and len(df) < 1000:
            # 按照时间戳进行排序
            print("Process: {}".format(file))
            df = df.sort_values(by='TimeStamp')
            df['is_out'] = df['From'] == file.split('.')[0]
            df['is_out'] = df['is_out'].apply(lambda x: -1 if x else 1)
            df['tx_value'] = df['Value'] * df['is_out']
            # 账户余额的变化
            df['balance'] = df['tx_value'].cumsum()
            # 与前一个时间戳的差值，第一个为0
            df['time_diff'] = df['TimeStamp'].diff()
            df['time_diff'] = df['time_diff'].fillna(0)
            phish_data[file.split('.')[0]] = [df[['balance', 'is_out', 'tx_value', 'time_diff']], 0]
            result = get_address_feature(df, file.split('.')[0], 0)
            statistics_data = pd.concat([statistics_data, result], axis=0, ignore_index=True)
            count += 1

Process: 0x0059b14e35dab1b4eee1e2926c7a5660da66f747.csv
Process: 0x0061fb5485dff4bb85c078dca80d19119224d97e.csv
Process: 0x0084515449b037205a33d6d3940a5684126aa4b5.csv
Process: 0x00c33c49f9a2a920e3f3787204cbda9012d1912e.csv
Process: 0x00eb6f5199cd0b671da371969b1a0f948e982fea.csv
Process: 0x0128282ce73c72decabaeace9358344adff449fe.csv
Process: 0x015c0e438b3a01511b98d928bd031d3dc50abb9e.csv
Process: 0x0167409e6106ec3e3f05a09fcf04606918d21ad5.csv
Process: 0x0177eb92b752fa0715ee0dce1d860eaf739b5cf4.csv
Process: 0x020b1573f2ca670190d33ca2f0a57b0c0399ad37.csv
Process: 0x024c344da7208e60356378a252dab771c34be111.csv
Process: 0x026e78f168df546aabb2733b37920c55b335be80.csv
Process: 0x0297a3211d69a1a268591e1ff6f570699ccc50ca.csv
Process: 0x02d0b53ec925f5c5907eb3dd85bededaa4362564.csv
Process: 0x02dfa0d5184c41689377d5d47054da210ce941f6.csv
Process: 0x03f034fb47965123ea4148e3147e2cfdc5b1f7a5.csv
Process: 0x043375ebc36fff7a4fcef1359f3c89afddf56e08.csv
Process: 0x051005cdcecd916fb8b98643d923646acc7e0

In [5]:
len(phish_data)

1161

In [6]:

count = 0
normal_data = {}

for file in os.listdir(pwd + "/非钓鱼一阶节点/"):
    if file.endswith(".csv"):
        df = pd.read_csv(pwd + "/非钓鱼一阶节点/" + file)
        if len(df) > 5 and len(df) < 1000:
            # 按照时间戳进行排序
            print("Process: {}".format(file))
            df = df.sort_values(by='TimeStamp')
            df['is_out'] = df['From'] == file.split('.')[0]
            df['is_out'] = df['is_out'].apply(lambda x: -1 if x else 1)
            df['tx_value'] = df['Value'] * df['is_out']
            # 账户余额的变化
            df['balance'] = df['tx_value'].cumsum()
                    # 与前一个时间戳的差值，第一个为0
            df['time_diff'] = df['TimeStamp'].diff()
            df['time_diff'] = df['time_diff'].fillna(0)
            normal_data[file.split('.')[0]] = [df[['balance', 'is_out', 'tx_value', 'time_diff']], 1]
            result = get_address_feature(df, file.split('.')[0], 1)
            statistics_data = pd.concat([statistics_data, result], axis=0, ignore_index=True)
            count += 1

Process: 0x000419c40a811a052b56372f4a80823be47db756.csv
Process: 0x00267e4a01f25072e0e9347a2ccdd67091d7aeac.csv
Process: 0x00316d956f5f35591ae021f4858a2a865c6ba02a.csv
Process: 0x0034cf6e02f4c47fb30df22fc81b8dedddbf1fb0.csv
Process: 0x004e3def0c754a921af751d1004df95f9650ea00.csv
Process: 0x005bdf2845064db405f5c99aaf9510d0b19e7ac2.csv
Process: 0x005e288d713a5fb3d7c9cf1b43810a98688c7223.csv
Process: 0x006deef69f1b3c01173b5ba74a5de9050e72d702.csv
Process: 0x007c508c6368d2ad35608cb8e98edc9ef1bf0e84.csv
Process: 0x00ce8ad7e44fea30ecbe2fb69963eaa26b5b56a0.csv
Process: 0x00d681a7b6584f978f63c81cfd847064ce19a080.csv
Process: 0x00e5c013694c9ee92b76ce6ad7ad3bcc20475d6f.csv
Process: 0x00e72cfa92cdf0a7f9109705f088c2c4e81c2f73.csv
Process: 0x00e9fec55aec66b9aece8e678fa8c99804afc334.csv
Process: 0x00f345f3766dd1498dc6970017db25adbf2d9ac1.csv
Process: 0x0194426ec9b4aaf73151d3a1aac1db84a8639019.csv
Process: 0x01adb5a14196d302004e3a1970a8bb3183dd2565.csv
Process: 0x01eb3d7559c2aa19a6b55dfb48508093ee471

In [7]:
len(normal_data)

1125

In [8]:
statistics_data.head()

Unnamed: 0,address,send_amount,receive_amount,send_count,receive_count,send_count_ratio,receive_count_ratio,send_amount_ratio,receive_amount_ratio,send_amount_mean,...,send_amount_std,receive_amount_std,send_amount_q1,send_amount_q2,send_amount_q3,receive_amount_q1,receive_amount_q2,receive_amount_q3,balance,label
0,0x0059b14e35dab1b4eee1e2926c7a5660da66f747,38.133767,37.556046,3,98,0.029703,0.970297,0.503816,0.496184,12.711256,...,20.776791,1.787557,0.716884,1.0,18.85,0.003978,0.0078,0.023875,-0.577721,0
1,0x0061fb5485dff4bb85c078dca80d19119224d97e,1.0,5.045869,1,11,0.083333,0.916667,0.165402,0.834598,1.0,...,,0.221008,1.0,1.0,1.0,0.5,0.5,0.530865,4.045869,0
2,0x0084515449b037205a33d6d3940a5684126aa4b5,2.749253,2.750975,2,7,0.222222,0.777778,0.499843,0.500157,1.374626,...,0.884641,0.379784,1.061858,1.374626,1.687394,0.129526,0.220971,0.622976,0.001722,0
3,0x00c33c49f9a2a920e3f3787204cbda9012d1912e,6.745249,2.518046,39,38,0.506494,0.493506,0.72817,0.27183,0.172955,...,0.270409,0.10124,0.002,0.109931,0.187,0.019943,0.039926,0.050389,-4.227203,0
4,0x00eb6f5199cd0b671da371969b1a0f948e982fea,29.535269,32.044408,1,22,0.043478,0.956522,0.479627,0.520373,29.535269,...,,2.096021,29.535269,29.535269,29.535269,0.519958,0.695,0.996,2.509139,0


In [9]:
len(statistics_data)

2286

In [10]:
statistics_data.describe()

Unnamed: 0,send_amount,receive_amount,send_count,receive_count,send_count_ratio,receive_count_ratio,send_amount_ratio,receive_amount_ratio,send_amount_mean,receive_amount_mean,...,send_amount_std,receive_amount_std,send_amount_q1,send_amount_q2,send_amount_q3,receive_amount_q1,receive_amount_q2,receive_amount_q3,balance,label
count,2286.0,2286.0,2286.0,2286.0,2286.0,2286.0,2286.0,2286.0,2102.0,2270.0,...,1720.0,2216.0,2102.0,2102.0,2102.0,2270.0,2270.0,2270.0,2286.0,2286.0
mean,8036.809,10132.48,16.407699,38.73972,0.336073,0.665894,0.412623,0.589135,675.345849,672.7791,...,1142.714943,1019.678128,122.745066,364.079439,778.402053,196.6003,327.2475,729.2171,2095.675,0.492126
std,79384.11,97852.8,41.960188,92.195753,0.273701,0.273405,0.218046,0.217536,12000.928035,3639.527,...,18609.982476,6736.903839,1077.54594,6857.971674,14799.743331,1925.192,2256.533,3562.612,60914.54,0.500047
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000206,9.11e-16,...,0.0,0.0,9e-06,3e-05,6.8e-05,9.11e-16,9.11e-16,9.11e-16,-427900.0,0.0
25%,7.478728,15.10395,2.0,6.0,0.092221,0.473684,0.330139,0.500003,3.587793,0.9825429,...,3.021013,1.244422,0.347456,1.0,3.00605,0.2,0.5,0.9851523,0.000861,0.0
50%,54.03516,347.7317,5.0,13.0,0.285714,0.714807,0.499878,0.500135,21.779357,9.529124,...,39.280852,15.158223,3.910265,10.0,20.0,0.5369656,1.042064,5.103148,0.01120427,0.0
75%,1570.95,2818.399,14.0,28.0,0.527778,0.909091,0.499997,0.675267,179.764887,250.857,...,312.242091,342.089201,23.713725,70.0,196.4375,12.88985,72.67171,236.8013,953.6261,1.0
max,3246786.0,3246786.0,600.0,964.0,1.0,1.0,1.0,1.0,541130.953016,91667.02,...,757984.974361,224536.387472,44400.0,308845.724537,668359.181552,75000.0,75000.0,75000.0,2564504.0,1.0


In [11]:
# 将所有的NaN替换为0
statistics_data = statistics_data.fillna(0)

In [12]:
# 对于每个账户的交易记录，使用LSTM计算出每个账户的交易特征, 输出中间隐藏变量
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

class LSTM(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(LSTM, self).__init__()
        self.hidden_size = hidden_size
        self.lstm = nn.LSTM(input_size, hidden_size)
        self.hidden2out = nn.Linear(hidden_size, output_size)
        self.hidden = self.init_hidden()

    def init_hidden(self):
        return (torch.zeros(1, 1, self.hidden_size),
                torch.zeros(1, 1, self.hidden_size))

    def forward(self, input):
        lstm_out, self.hidden = self.lstm(input.view(len(input), 1, -1), self.hidden)
        output = self.hidden2out(lstm_out.view(len(input), -1))
        return output, lstm_out


In [13]:
def get_lstm_hidden_vector(data, address):
    # data: pd.DataFrame
    # return: torch.Tensor
    print("Process: {}, length: {}".format(address, len(data)))
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    input_size = 4
    hidden_size = 32
    output_size = 4
    # dataframe转换为tensor
    data = torch.tensor(data.values, dtype=torch.float)
    # print(data.shape)
    lstm = LSTM(input_size, hidden_size, output_size)

    loss_function = nn.MSELoss()
    optimizer = optim.SGD(lstm.parameters(), lr=0.1)
    for i in range(50):
        lstm.zero_grad()
        lstm.hidden = lstm.init_hidden()
        output, lstm_out = lstm(data)
        loss = loss_function(output, data)
        loss.backward()
        optimizer.step()
    # 使用最后一个隐藏变量作为特征
    # print(output.shape)
    # print(lstm_out.shape)
    return output[-1].detach().numpy()

In [14]:
labels = []
features = []
max_count = 2
for key in phish_data:
    labels.append(phish_data[key][1])
    features.append(get_lstm_hidden_vector(phish_data[key][0], key))

for key in normal_data:
    labels.append(normal_data[key][1])
    features.append(get_lstm_hidden_vector(normal_data[key][0], key))

Process: 0x0059b14e35dab1b4eee1e2926c7a5660da66f747, length: 101
Process: 0x0061fb5485dff4bb85c078dca80d19119224d97e, length: 12
Process: 0x0084515449b037205a33d6d3940a5684126aa4b5, length: 9
Process: 0x00c33c49f9a2a920e3f3787204cbda9012d1912e, length: 77
Process: 0x00eb6f5199cd0b671da371969b1a0f948e982fea, length: 23
Process: 0x0128282ce73c72decabaeace9358344adff449fe, length: 93
Process: 0x015c0e438b3a01511b98d928bd031d3dc50abb9e, length: 20
Process: 0x0167409e6106ec3e3f05a09fcf04606918d21ad5, length: 9
Process: 0x0177eb92b752fa0715ee0dce1d860eaf739b5cf4, length: 16
Process: 0x020b1573f2ca670190d33ca2f0a57b0c0399ad37, length: 44
Process: 0x024c344da7208e60356378a252dab771c34be111, length: 22
Process: 0x026e78f168df546aabb2733b37920c55b335be80, length: 70
Process: 0x0297a3211d69a1a268591e1ff6f570699ccc50ca, length: 16
Process: 0x02d0b53ec925f5c5907eb3dd85bededaa4362564, length: 30
Process: 0x02dfa0d5184c41689377d5d47054da210ce941f6, length: 23
Process: 0x03f034fb47965123ea4148e3147e2c

In [15]:
print(features[0].shape)

(4,)


In [16]:
print(len(features))

2286


In [17]:
# 转化为numpy数组
import numpy as np

# 将torch.Tensor转化为numpy数组
features = np.array([feature for feature in features])
labels = np.array(labels)

In [18]:
# 输出features的维度
print(features.shape)

(2286, 4)


In [19]:
# 将features的维度转化为(样本数, 特征数)
features = features.reshape(features.shape[0], -1)

# 时间特征分类

In [20]:
# 生成训练集和测试集
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=42)

In [21]:
# 使用MLP进行分类
from sklearn.neural_network import MLPClassifier

clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(32, 32), random_state=1)
clf.fit(X_train, y_train)
print(clf.score(X_test, y_test))

0.5109170305676856


ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


# 统计特征分类

In [22]:
# 使用统计特征进行分类
# 划分训练集和测试集
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(statistics_data.drop(['label', 'address'], axis=1), statistics_data['label'], test_size=0.2, random_state=42)

In [23]:
# 使用MLP进行分类
from sklearn.neural_network import MLPClassifier

mlp_model = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(32, 32), random_state=1)
mlp_model.fit(X_train, y_train)
y_pred = mlp_model.predict(X_test)
print(mlp_model.score(X_test, y_test))

0.9039301310043668


ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


# 统计和时间特征

In [24]:
print(features.shape)
# 拼接统计特征和LSMT特征
featurex = np.concatenate((features, statistics_data.drop(['label', 'address'], axis=1).values), axis=1)


# 生成训练集和测试集
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(featurex, labels, test_size=0.2, random_state=42)

(2286, 4)


In [25]:
print(X_train.shape)

(1828, 29)


In [26]:
# 使用MLP进行分类
from sklearn.neural_network import MLPClassifier

mlp_model = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(32, 32), random_state=1)
mlp_model.fit(X_train, y_train)
y_pred = mlp_model.predict(X_test)
print(mlp_model.score(X_test, y_test))

ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


0.5262008733624454
