# 数据预处理
baseline 部分借鉴 https://blog.csdn.net/m0_63642362/article/details/128328456

In [1]:
! ls /kaggle/input/ueba-dataset/train_data

train_data.csv


In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [3]:
train_data = pd.read_csv("/kaggle/input/ueba-dataset/train_data/train_data.csv", encoding='gbk')

In [4]:
train_data.head()

Unnamed: 0,id,account,group,IP,url,port,vlan,switchIP,time,ret
0,1,xiaojiawei@qq.com,人事行政中心,192.168.1.50,http://123.6.4.41,15788,700,129.30.06.37,2021/6/16 7:56,0.1149
1,2,xiaojiawei@qq.com,人事行政中心,192.168.31.46,http://104.192.108.154,12665,700,152.91.89.45,2021/6/28 7:58,0.1801
2,3,xiaojiawei@qq.com,人事行政中心,192.168.2.3,http://42.236.37.80,25551,700,129.30.06.37,2021/6/1 6:37,0.369
3,4,xiaojiawei@qq.com,人事行政中心,192.168.145.30,http://uf-api6-client.ksord.com,18274,700,162.6.8.29,2021/5/5 8:18,0.1532
4,5,xiaojiawei@qq.com,人事行政中心,192.168.178.92,http://101.199.128.170,34572,700,162.0.1.84,2021/6/20 6:52,0.1449


字段名称	字段说明
id	日志数据记录编号
account	用户账号，已脱敏
group	用户归属部门
IP	终端IP
url	终端上网网址
port	终端上网应用端口
vlan	终端所在虚拟网域编号
switchIP	终端连接交换机IP
time	终端上网行为发生时间
ret	异常行为评价得分

In [5]:
# 查看列索引
train_data.columns

Index(['id', 'account', 'group', 'IP', 'url', 'port', 'vlan', 'switchIP',
       'time', 'ret'],
      dtype='object')

In [6]:
train_data['time'] = pd.to_datetime(train_data['time'])
train_data['hour'] = train_data['time'].dt.hour
train_data['weekday'] = train_data['time'].dt.weekday
train_data['year'] = train_data['time'].dt.year
train_data['month'] = train_data['time'].dt.month
train_data['day'] = train_data['time'].dt.day

In [7]:
# 查看训练集中各字段的数据类型
train_data.dtypes

id                   int64
account             object
group               object
IP                  object
url                 object
port                 int64
vlan                 int64
switchIP            object
time        datetime64[ns]
ret                float64
hour                 int64
weekday              int64
year                 int64
month                int64
day                  int64
dtype: object

In [8]:
# 查看各字段的基本统计学数据描述
train_data.describe()

Unnamed: 0,id,port,vlan,ret,hour,weekday,year,month,day
count,528690.0,528690.0,528690.0,528690.0,528690.0,528690.0,528690.0,528690.0,528690.0
mean,264345.5,25241.669929,1152.486902,0.257757,6.334313,2.993121,2021.0,5.500458,14.999383
std,152619.801246,8543.017235,438.932786,0.219326,4.258042,1.997915,0.0,0.500828,8.369525
min,1.0,11122.0,700.0,0.0001,0.0,0.0,2021.0,4.0,1.0
25%,132173.25,16865.0,900.0,0.0785,3.0,1.0,2021.0,5.0,8.0
50%,264345.5,25278.0,1000.0,0.208,6.0,3.0,2021.0,6.0,15.0
75%,396517.75,33474.0,1200.0,0.3774,9.0,5.0,2021.0,6.0,22.0
max,528690.0,38867.0,2000.0,1.0,23.0,6.0,2021.0,6.0,31.0


In [9]:
train_data.duplicated().any()

False

In [10]:
pd.notnull(train_data).all()

id          True
account     True
group       True
IP          True
url         True
port        True
vlan        True
switchIP    True
time        True
ret         True
hour        True
weekday     True
year        True
month       True
day         True
dtype: bool

In [11]:
# 肯定要再改进的地方
from sklearn.preprocessing import LabelEncoder

for feat in ['account', 'group', 'IP', 'url', 'switchIP']:
    labelencoder = LabelEncoder()
    train_data[feat] = labelencoder.fit_transform(train_data[feat])

In [12]:
# 查看编码后的数据集
train_data.head()

Unnamed: 0,id,account,group,IP,url,port,vlan,switchIP,time,ret,hour,weekday,year,month,day
0,1,113,1,18,216,15788,700,44,2021-06-16 07:56:00,0.1149,7,2,2021,6,16
1,2,113,1,101,157,12665,700,91,2021-06-28 07:58:00,0.1801,7,0,2021,6,28
2,3,113,1,81,373,25551,700,44,2021-06-01 06:37:00,0.369,6,1,2021,6,1
3,4,113,1,39,1135,18274,700,102,2021-05-05 08:18:00,0.1532,8,2,2021,5,5
4,5,113,1,77,57,34572,700,92,2021-06-20 06:52:00,0.1449,6,6,2021,6,20


- 选取了'account', 'group', 'IP', 'url', 'port', 'vlan', 'switchIP', 'hour', 'weekday', 'year', 'month', 'day'字段作为训练的特征（features）。
- 'ret'字段为想要预测的目标值/标签值（label）。
- 我们将数据集划分为：训练集：测试集 = 0.75：0.25

浅谈对所选特征的一些理解
时间相关的标签 -- 上网时间不对 比如深夜上网？可能就会导致 ret升高
url,IP -- 很好理解就不解释了
port -- 哪有天天访问非正常的port的行为 如果说访问攻击机的port 就有可能出现异常行为

In [13]:
data_X = train_data.loc[train_data.index[:], ['account', 'group', 'IP', 'url', 'port', 'vlan', 'switchIP', 'hour', 'weekday', 'year', 'month', 'day']]
data_Y = train_data['ret']

In [14]:
from sklearn.model_selection import train_test_split
# 数据集划分
x_train, x_test, y_train, y_test = train_test_split(data_X, data_Y, test_size=0.25, random_state=6)
print("训练集的特征值：\n", x_train, x_train.shape)
print("测试集的标签值：\n", y_test, y_test.shape)

print("The length of original data X is:", data_X.shape[0])
print("The length of train Data is:", x_train.shape[0])
print("The length of test Data is:", x_test.shape[0])

训练集的特征值：
         account  group   IP   url   port  vlan  switchIP  hour  weekday  year  \
83071       101      3   40   544  25518  1200       120     0        4  2021   
294361       56      4   69   597  36282  2000        70     1        2  2021   
504243      130      6  123   201  25278   900       103     2        6  2021   
285369       26      1   69   186  27548   700        83     0        6  2021   
290928       26      1  104   472  28534   700        85    10        4  2021   
...         ...    ...  ...   ...    ...   ...       ...   ...      ...   ...   
138315       41      5   57   575  12875  1000        23     3        6  2021   
457217      146      1   73  1060  14576   700       120     4        6  2021   
349465       19      3   15   678  17365  1200       124     7        2  2021   
195949        7      5  102  1166  13611  1000       120     0        1  2021   
162698       80      6   13  1117  35643   900        67     6        3  2021   

        month  da

训练集的特征数组（features）为二维数组，而训练集的标签数据（label）是一维数组，需要将其转变为二维数组再传入神经网络训练，保持维度一致，否则会报错

In [15]:
x_train = np.array(x_train, dtype='float32')
y_train = np.array(y_train, dtype='float32')

In [16]:
y_train

array([0.1724, 0.0298, 0.0739, ..., 0.3177, 0.0929, 0.1663], dtype=float32)

In [17]:
y_train = y_train.reshape(-1,1)
y_train

array([[0.1724],
       [0.0298],
       [0.0739],
       ...,
       [0.3177],
       [0.0929],
       [0.1663]], dtype=float32)

In [18]:
x_test = np.array(x_test, dtype='float32')
y_test = np.array(y_test, dtype='float32')

In [19]:
y_test = y_test.reshape(-1,1)
y_test

array([[0.5703],
       [0.3863],
       [0.1671],
       ...,
       [0.3912],
       [0.3341],
       [0.1708]], dtype=float32)

我们对训练集和测试集的特征值进行了标准化

In [20]:
# 标准化
from sklearn.preprocessing import StandardScaler
# 1. 实例化一个转换器类
transfer = StandardScaler()
# 2. 标准化
x_train = transfer.fit_transform(x_train)
x_test = transfer.fit_transform(x_test)

In [21]:
x_train

array([[ 0.5950775 , -0.09783325, -0.52306074, ...,  0.        ,
        -0.9989819 ,  1.5538428 ],
       [-0.4391312 ,  0.4092987 ,  0.21764496, ...,  0.        ,
        -0.9989819 ,  0.478375  ],
       [ 1.2615676 ,  1.4235625 ,  1.5968901 , ...,  0.        ,
        -0.9989819 , -0.71658915],
       ...,
       [-1.2894806 , -0.09783325, -1.1616002 , ...,  0.        ,
        -0.9989819 ,  0.478375  ],
       [-1.5652696 ,  0.91643053,  1.060517  , ...,  0.        ,
        -0.9989819 , -0.47759628],
       [ 0.11244678,  1.4235625 , -1.2126833 , ...,  0.        ,
        -0.9989819 , -0.23860347]], dtype=float32)

因为只是baseline 随便写写啦
但我们的标签是属于 [0,1]
https://stackoverflow.com/questions/66626700/difference-between-tensorflows-tf-keras-layers-dense-and-pytorchs-torch-nn-lin

In [22]:
x_train[0].shape

(12,)

In [23]:
import tensorflow as tf
from keras.layers import LeakyReLU

model = tf.keras.models.Sequential([
    tf.keras.layers.Dense(64, input_shape=(12,),activation=tf.keras.layers.LeakyReLU(alpha=0.3)),
    tf.keras.layers.Dense(128, input_shape=(64,),activation=tf.keras.layers.LeakyReLU(alpha=0.3)),
    tf.keras.layers.Dense(32, input_shape=(128,),activation=tf.keras.layers.LeakyReLU(alpha=0.3)),
    tf.keras.layers.Dense(8, input_shape=(32,),activation=tf.keras.layers.LeakyReLU(alpha=0.3)),
    tf.keras.layers.Dense(1, input_shape=(8,),activation="tanh"),
])

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 64)                832       
                                                                 
 dense_1 (Dense)             (None, 128)               8320      
                                                                 
 dense_2 (Dense)             (None, 32)                4128      
                                                                 
 dense_3 (Dense)             (None, 8)                 264       
                                                                 
 dense_4 (Dense)             (None, 1)                 9         
                                                                 
Total params: 13,553
Trainable params: 13,553
Non-trainable params: 0
_________________________________________________________________


In [24]:
predictions = model(x_train[:1]).numpy()
predictions

array([[-0.10745475]], dtype=float32)

In [25]:
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001), loss=tf.keras.losses.MeanSquaredError(),)

In [26]:
model.fit(x_train, y_train, epochs=500)

Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500
Epoch 40/500
Epoch 41/500
Epoch 42/500
Epoch 43/500
Epoch 44/500
Epoch 45/500
Epoch 46/500
Epoch 47/500
Epoch 48/500
Epoch 49/500
Epoch 50/500
Epoch 51/500
Epoch 52/500
Epoch 53/500
Epoch 54/500
Epoch 55/500
Epoch 56/500
Epoch 57/500
Epoch 58/500
Epoch 59/500
Epoch 60/500
Epoch 61/500
Epoch 62/500
Epoch 63/500
Epoch 64/500
Epoch 65/500
Epoch 66/500
Epoch 67/500
Epoch 68/500
Epoch 69/500
Epoch 70/500
Epoch 71/500
Epoch 72/500
Epoch 73/500
Epoch 74/500
Epoch 75/500
Epoch 76/500
Epoch 77/500
Epoch 78

<keras.callbacks.History at 0x7f6d6c6c6590>

In [27]:
test_loss = model.evaluate(x_test,  y_test)
print('\nTest loss:', test_loss)


Test loss: 0.014796343632042408


In [28]:
y_pred = model.predict(x_test)



In [29]:
mse = np.sum((y_pred - y_test) ** 2) / len(y_pred)
mse

0.01479634083975169

In [30]:
np.sum(abs(y_pred - y_test)) / len(y_pred)

0.08820940257985746

In [31]:
y_test[:1]

array([[0.5703]], dtype=float32)

In [32]:
model(x_test[:1]).numpy()

array([[0.51272994]], dtype=float32)

In [33]:
abs(y_pred - y_test)

array([[0.05756992],
       [0.01223585],
       [0.2009521 ],
       ...,
       [0.15340823],
       [0.00566551],
       [0.00132042]], dtype=float32)

In [34]:
model.save_weights('/kaggle/working/my_checkpoint')