In [2]:
import sys

# 將包含 deeplog 模組的目錄路徑添加到 sys.path
sys.path.append('../DeepLog/deeplog')
# import DeepLog and Preprocessor
from deeplog              import DeepLog
from preprocessor import Preprocessor
# Import pytorch
import torch

In [3]:
##############################################################################
#                                 Load data                                  #
##############################################################################

# Create preprocessor for cd loading data
preprocessor = Preprocessor(
    length  = 20,           # Extract sequences of 20 items
    timeout = float('inf'), # Do not include a maximum allowed time between events
)

training_data_path = "./data/IDS2018_train_benign"

# Load data from csv file
#X, y, label, mapping = preprocessor.csv(training_data_path)
# Load data from txt file
X, y, label, mapping = preprocessor.text(training_data_path)

print("X:", X, "\nShape:", X.shape, "\nmapping:", mapping)


X: tensor([[53, 53, 53,  ..., 53, 53, 53],
        [53, 53, 53,  ..., 53, 53, 41],
        [53, 53, 53,  ..., 53, 41, 41],
        ...,
        [40, 41, 38,  ..., 38, 40, 41],
        [41, 38, 40,  ..., 40, 41, 41],
        [38, 40, 41,  ..., 41, 41, 38]]) 
Shape: torch.Size([114026, 20]) 
mapping: {0: 1, 1: 2, 2: 6, 3: 12, 4: 13, 5: 14, 6: 15, 7: 16, 8: 18, 9: 19, 10: 20, 11: 25, 12: 27, 13: 32, 14: 35, 15: 37, 16: 43, 17: 44, 18: 47, 19: 50, 20: 55, 21: 98, 22: 109, 23: 153, 24: 172, 25: 1014, 26: 1074, 27: 4200, 28: 6005, 29: 6006, 30: 6009, 31: 6013, 32: 7000, 33: 7001, 34: 7002, 35: 7009, 36: 7022, 37: 7023, 38: 7024, 39: 7026, 40: 7031, 41: 7036, 42: 7040, 43: 7042, 44: 7045, 45: 10010, 46: 10016, 47: 10148, 48: 16962, 49: 50036, 50: 50037, 51: 51046, 52: 51047, 53: -1337}


In [4]:
##############################################################################
#                                  DeepLog                                   #
##############################################################################

# Create DeepLog object
deeplog = DeepLog(
    input_size  = 60, # Number of different events to expect
    hidden_size = 64 , # Hidden dimension, we suggest 64
    output_size = 100, # Number of different events to expect
)

# Optionally cast data and DeepLog to cuda, if available
if torch.cuda.is_available():
    deeplog = deeplog.to("cuda")
    X       = X      .to("cuda")
    y       = y      .to("cuda")

# Train deeplog
deeplog.fit(
    X          = X,
    y          = y,
    epochs     = 10,
    batch_size = 128,
)

[Epoch  1/10] average loss = 2.1706 ######################################## (100.00%) runtime 0:00:33.2
[Epoch  2/10] average loss = 1.3509 ######################################## (100.00%) runtime 0:00:34.9
[Epoch  3/10] average loss = 1.3430 ######################################## (100.00%) runtime 0:00:32.0
[Epoch  4/10] average loss = 1.3361 ######################################## (100.00%) runtime 0:00:35.2
[Epoch  5/10] average loss = 1.3295 ######################################## (100.00%) runtime 0:00:32.9
[Epoch  6/10] average loss = 1.3195 ######################################## (100.00%) runtime 0:00:32.7
[Epoch  7/10] average loss = 1.2444 ######################################## (100.00%) runtime 0:00:35.3
[Epoch  8/10] average loss = 1.1856 ######################################## (100.00%) runtime 0:00:33.6
[Epoch  9/10] average loss = 1.1670 ######################################## (100.00%) runtime 0:00:33.0
[Epoch 10/10] average loss = 1.1496 ###################

DeepLog(
  (lstm): LSTM(60, 64, num_layers=2, batch_first=True)
  (out): Linear(in_features=64, out_features=100, bias=True)
  (softmax): LogSoftmax(dim=-1)
)

In [8]:
##############################################################################
#                                  Predict                                   #
##############################################################################

data_path = "./data/IDS2018_test_abnormal_Infiltration"

# Load data from csv file
#Xp, yp, label, mapping_p = preprocessor.csv("/home/ubuntu/DeepLog/examples/data/hdfs_train")
# Load data from txt file
Xp, yp, label, mapping_p = preprocessor.text(data_path)
print("Xp:", Xp, "\nShape:", X.shape, "\nmapping_p:", mapping_p)

# Predict using deeplog
y_pred, confidence = deeplog.predict(
    X = Xp,
    k = 9,
)

print("y_pred:", y_pred, "\nshape:", y_pred.shape)

Xp: tensor([[15, 15, 15,  ..., 15, 15, 15],
        [15, 15, 15,  ..., 15, 15,  4],
        [15, 15, 15,  ..., 15,  4, 12],
        ...,
        [12, 13,  7,  ..., 12, 12, 12],
        [13,  7,  5,  ..., 12, 12, 10],
        [ 7,  5,  6,  ..., 12, 10, 11]]) 
Shape: torch.Size([114026, 20]) 
mapping_p: {0: 2, 1: 16, 2: 35, 3: 37, 4: 104, 5: 134, 6: 4200, 7: 4201, 8: 7000, 9: 7009, 10: 7024, 11: 7031, 12: 7036, 13: 7045, 14: 20001, 15: -1337}
[Epoch 1/1] average loss = 0.0000 ######################################## (100.00%) runtime 0:00:00.2
y_pred: tensor([[41, 40, 38,  ...,  0,  7, 15],
        [41, 40, 38,  ...,  0,  7, 15],
        [41, 40, 38,  ...,  0,  7, 15],
        ...,
        [41, 40, 38,  ...,  0,  7, 15],
        [41, 40, 38,  ...,  0,  7, 15],
        [41, 40, 38,  ...,  0,  7, 15]]) 
shape: torch.Size([226, 9])


In [9]:
##############################################################################
#                                 Comparison                                 #
##############################################################################

# 讀取測試文件
with open(data_path, 'r') as file:
    data = file.readlines()

cleaned_data = [item.strip() for item in data]

# 使用列表推導式分隔每個字串，並將結果扁平化形成一個新的串列
data = [item for sublist in cleaned_data for item in sublist.split()]

print("data:", data, "\nsize:", len(data))

# 反轉 mapping，以便我們可以根據事件ID找到對應的編號
reverse_mapping = {v: k for k, v in mapping_p.items()}

# 轉換 events 列表中的每個字串為數字，然後根據 reverse_mapping 進行映射
mapped_data = [reverse_mapping[int(event)] for event in data]

print("mapped_data:", mapped_data, "\nsize:", len(mapped_data))

# 初始化一個零張量，用於存儲比較結果，長度與 test_normal_data 相同
results = []

# 遍歷 test_normal_data 的每一行
for i in range(0, len(mapped_data)):
    match = False
    for j in y_pred[i]:
        if mapped_data[i] == j:
            match = True
            break
    
    # 如果有匹配，設置結果為1
    if match:
        results.append(1)
    else:
        results.append(0)

print("results:", results, "\nsize:", len(results))

data: ['104', '7036', '16', '7036', '7024', '7031', '7036', '7009', '7000', '7036', '7036', '7036', '37', '35', '7036', '7036', '7036', '7024', '7031', '7036', '7024', '7031', '7036', '7024', '7031', '7036', '7024', '7031', '7036', '7024', '7031', '7036', '7036', '7024', '7031', '7036', '7024', '7031', '7036', '7024', '7031', '7036', '7024', '7031', '7036', '7024', '7031', '7036', '7024', '7031', '7036', '7024', '7031', '7036', '7024', '7031', '7036', '7024', '7031', '7036', '7024', '7031', '7036', '7024', '7031', '7036', '7024', '7031', '7036', '7024', '7031', '7036', '7024', '7031', '7036', '7024', '7031', '7036', '7024', '7031', '7036', '7024', '7031', '7036', '7024', '7031', '7036', '7024', '7031', '7036', '7024', '7031', '7036', '7024', '7031', '7036', '7024', '7031', '7036', '7024', '7031', '7036', '7024', '7031', '7036', '7024', '7031', '7036', '7024', '7031', '7036', '7024', '7031', '7036', '7024', '7031', '7036', '7024', '7031', '7036', '7024', '7031', '7036', '7024', '7031', 

In [10]:
# 計算列表中0的數量
num_of_zero = results.count(0)
print("異常數量：%d" %num_of_zero)

# 計算機率
abnormal_rate = num_of_zero / len(results)

print("異常率：%.3f" %abnormal_rate)

異常數量：223
異常率：0.987
