In [27]:
import sys

# 將包含 deeplog 模組的目錄路徑添加到 sys.path
sys.path.append('../DeepLog/deeplog')
# import DeepLog and Preprocessor
from deeplog              import DeepLog
from preprocessor import Preprocessor
# Import pytorch
import torch

In [28]:
##############################################################################
#                                 Load data                                  #
##############################################################################

# Create preprocessor for cd loading data
preprocessor = Preprocessor(
    length  = 20,           # Extract sequences of 20 items
    timeout = float('inf'), # Do not include a maximum allowed time between events
)

training_data_path = "./data/IDS2018_train_Benign"

# Load data from csv file
#X, y, label, mapping = preprocessor.csv(training_data_path)
# Load data from txt file
X, y, label, mapping = preprocessor.text(training_data_path)

print("X:", X, "\nShape:", X.shape, "\nmapping:", mapping)


X: tensor([[47, 47, 47,  ..., 47, 47, 47],
        [47, 47, 47,  ..., 47, 47, 37],
        [47, 47, 47,  ..., 47, 37, 34],
        ...,
        [36, 37, 34,  ..., 34, 36, 37],
        [37, 34, 36,  ..., 36, 37, 37],
        [34, 36, 37,  ..., 37, 37, 37]]) 
Shape: torch.Size([9034, 20]) 
mapping: {0: 1, 1: 2, 2: 6, 3: 12, 4: 13, 5: 14, 6: 16, 7: 18, 8: 19, 9: 20, 10: 25, 11: 27, 12: 32, 13: 35, 14: 37, 15: 43, 16: 44, 17: 55, 18: 98, 19: 109, 20: 153, 21: 172, 22: 1014, 23: 1074, 24: 4200, 25: 6005, 26: 6006, 27: 6009, 28: 6013, 29: 7000, 30: 7001, 31: 7002, 32: 7009, 33: 7023, 34: 7024, 35: 7026, 36: 7031, 37: 7036, 38: 7040, 39: 7045, 40: 10016, 41: 10148, 42: 16962, 43: 50036, 44: 50037, 45: 51046, 46: 51047, 47: -1337}


In [29]:
##############################################################################
#                                  DeepLog                                   #
##############################################################################

# Create DeepLog object
deeplog = DeepLog(
    input_size  = 60, # Number of different events to expect
    hidden_size = 64 , # Hidden dimension, we suggest 64
    output_size = 100, # Number of different events to expect
)

# Optionally cast data and DeepLog to cuda, if available
if torch.cuda.is_available():
    deeplog = deeplog.to("cuda")
    X       = X      .to("cuda")
    y       = y      .to("cuda")

# Train deeplog
deeplog.fit(
    X          = X,
    y          = y,
    epochs     = 10,
    batch_size = 128,
)

[Epoch  1/10] average loss = 4.4885 ######################################## (100.00%) runtime 0:00:17.2
[Epoch  2/10] average loss = 4.1647 ######################################## (100.00%) runtime 0:00:15.5
[Epoch  3/10] average loss = 3.7711 ######################################## (100.00%) runtime 0:00:14.3
[Epoch  4/10] average loss = 3.2063 ######################################## (100.00%) runtime 0:00:15.8
[Epoch  5/10] average loss = 2.4081 ######################################## (100.00%) runtime 0:00:14.8
[Epoch  6/10] average loss = 1.7602 ######################################## (100.00%) runtime 0:00:15.5
[Epoch  7/10] average loss = 1.5042 ######################################## (100.00%) runtime 0:00:14.4
[Epoch  8/10] average loss = 1.4290 ######################################## (100.00%) runtime 0:00:14.8
[Epoch  9/10] average loss = 1.4044 ######################################## (100.00%) runtime 0:00:16.6
[Epoch 10/10] average loss = 1.3944 ###################

DeepLog(
  (lstm): LSTM(60, 64, num_layers=2, batch_first=True)
  (out): Linear(in_features=64, out_features=100, bias=True)
  (softmax): LogSoftmax(dim=-1)
)

In [30]:
##############################################################################
#                                  Predict                                   #
##############################################################################

data_path = "./data/IDS2018_test_Benign"

# Load data from csv file
#Xp, yp, label, mapping_p = preprocessor.csv("/home/ubuntu/DeepLog/examples/data/hdfs_train")
# Load data from txt file
Xp, yp, label, mapping_p = preprocessor.text(data_path)
print("Xp:", Xp, "\nShape:", X.shape, "\nmapping_p:", mapping_p)

# Predict using deeplog
y_pred, confidence = deeplog.predict(
    X = Xp,
    k = 9,
)

print("y_pred:", y_pred, "\nshape:", y_pred.shape)

Xp: tensor([[41, 41, 41,  ..., 41, 41, 41],
        [41, 41, 41,  ..., 41, 41, 34],
        [41, 41, 41,  ..., 41, 41, 41],
        ...,
        [30, 31, 28,  ..., 28, 30, 31],
        [31, 28, 30,  ..., 30, 31, 31],
        [28, 30, 31,  ..., 31, 31, 28]]) 
Shape: torch.Size([9034, 20]) 
mapping_p: {0: 1, 1: 2, 2: 6, 3: 12, 4: 13, 5: 14, 6: 18, 7: 19, 8: 20, 9: 25, 10: 27, 11: 32, 12: 43, 13: 44, 14: 55, 15: 98, 16: 109, 17: 153, 18: 172, 19: 1074, 20: 4200, 21: 6005, 22: 6006, 23: 6009, 24: 6013, 25: 7001, 26: 7002, 27: 7023, 28: 7024, 29: 7026, 30: 7031, 31: 7036, 32: 7040, 33: 7045, 34: 10016, 35: 10148, 36: 16962, 37: 50036, 38: 50037, 39: 51046, 40: 51047, 41: -1337}
[Epoch 1/1] average loss = 0.0000 ######################################## (100.00%) runtime 0:00:04.8
y_pred: tensor([[37, 36, 34,  ..., 46, 31, 99],
        [37, 36, 34,  ..., 46, 14, 31],
        [37, 36, 34,  ..., 46, 31, 99],
        ...,
        [37, 36, 34,  ..., 46, 14, 31],
        [37, 36, 34,  ..., 46, 14,

In [35]:
##############################################################################
#                                 Comparison                                 #
##############################################################################

# 讀取測試文件
with open(data_path, 'r') as file:
    data = file.readlines()

cleaned_data = [item.strip() for item in data]

# 使用列表推導式分隔每個字串，並將結果扁平化形成一個新的串列
data = [item for sublist in cleaned_data for item in sublist.split()]

print("data:", data, "\nsize:", len(data))

# 反轉 mapping，以便我們可以根據事件ID找到對應的編號
reverse_mapping = {v: k for k, v in mapping_p.items()}
print(reverse_mapping)

# 轉換 events 列表中的每個字串為數字，然後根據 reverse_mapping 進行映射
mapped_data = [reverse_mapping[int(event)] for event in data]

print("mapped_data:", mapped_data, "\nsize:", len(mapped_data))

# 初始化一個零張量，用於存儲比較結果，長度與 test_normal_data 相同
results = []

# 遍歷 test_normal_data 的每一行
for i in range(0, len(mapped_data)):
    match = False
    for j in y_pred[i]:
        if mapped_data[i] == j:
            match = True
            break
    
    # 如果有匹配，設置結果為1
    if match:
        results.append(1)
    else:
        results.append(0)

print("results:", results, "\nsize:", len(results))

data: ['10016', '10016', '10016', '7036', '7036', '7036', '7036', '6013', '1', '1', '7036', '7036', '7036', '1074', '10016', '6006', '7036', '7036', '7002', '7036', '7036', '7036', '7036', '7036', '7036', '7036', '7036', '7036', '7036', '7036', '7036', '7036', '7036', '7036', '7036', '7036', '7036', '7036', '7036', '51047', '50037', '7036', '7036', '7036', '7036', '6009', '6005', '6013', '7036', '7036', '7036', '7036', '7036', '7036', '109', '13', '12', '153', '20', '27', '25', '18', '32', '6', '6', '98', '6', '6', '172', '55', '14', '16962', '7036', '7036', '7036', '7036', '7036', '7036', '7036', '7036', '7036', '7036', '7036', '6', '6', '6', '7036', '7036', '7036', '7036', '7036', '7036', '50036', '7036', '7036', '7036', '51046', '7036', '7036', '7036', '7036', '7036', '7036', '7036', '7036', '7036', '7036', '7036', '7036', '7036', '7036', '7036', '7036', '10148', '7036', '7036', '7036', '7036', '7036', '7036', '7036', '7036', '7036', '7036', '7036', '7036', '7036', '7036', '7036', '

In [36]:
# 計算列表中0的數量
num_of_zero = results.count(0)
print("異常數量：%d" %num_of_zero)

# 計算機率
abnormal_rate = num_of_zero / len(results)

print("異常率：%.3f" %abnormal_rate)

異常數量：517
異常率：0.573
