In [55]:
import torch
import pandas as pd
from torch import nn
from torch.nn import functional as F
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
import matplotlib.pyplot as plt
import numpy as np
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
import seaborn as sns

In [56]:
data = pd.read_csv('Data_encoded\LSTM_data\combined_data_processed.csv')
data.head()

Unnamed: 0,duration,protocol_type_icmp,protocol_type_tcp,protocol_type_udp,service_IRC,service_X11,service_Z39_50,service_aol,service_auth,service_bgp,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,Class
0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.098039,0.17,0.03,0.17,0.0,0.0,0.0,0.05,0.0,Normal
1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.003922,0.0,0.6,0.88,0.0,0.0,0.0,0.0,0.0,Normal
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.101961,0.1,0.05,0.0,0.0,1.0,1.0,0.0,0.0,DOS
3,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.03,0.04,0.03,0.01,0.0,0.01,Normal
4,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Normal


In [57]:
# 自定义数据集类
class CustomDataset(Dataset):
    # 类的构造函数。它接受两个参数features和labels，分别表示数据集的特征和标签。
    # 在初始化过程中，将这些特征和标签存储在类的实例变量中。
    def __init__(self, features, labels):
        self.features = features
        self.labels = labels

    # 这是一个特殊方法，用于返回数据集的长度（即数据样本的数量）。
    # 在这个方法中，它返回了存储在features中的样本数量，即数据集的长度。
    def __len__(self):
        return len(self.features)

    # 这也是一个特殊方法，用于根据给定索引idx来获取数据集中的样本。
    # 在这个方法中，它根据索引idx从features和labels中获取对应索引的特征和标签，并将它们作为元组返回。
    def __getitem__(self, idx):
        # return self.features[idx], self.labels[idx]
        feature = self.features[idx]
        label = self.labels[idx]

        # 确保 feature 是一个数值型数组
        if isinstance(feature, np.ndarray):
            if feature.dtype.type is np.str_ or feature.dtype.type is np.object_:
                raise ValueError("Features must be numeric")

        # 如果 feature 不是一个 ndarray，或者它的 dtype 不是浮点数，尝试将其转换
        if not isinstance(feature, np.ndarray) or feature.dtype != 'float32':
            feature = np.array(feature, dtype=np.float32)

        # 转换为 PyTorch 张量
        feature = torch.tensor(feature, dtype=torch.float32)

        # 如果标签不是一个张量，转换它
        if not torch.is_tensor(label):
            label = torch.tensor(label, dtype=torch.long)

        return feature, label

In [58]:
# 定义BiLSTMLayer层模型
class BiLSTMLayer(nn.Module):
    def __init__(self, input_dim, hidden_dim, num_layers=1):
        super(BiLSTMLayer, self).__init__()
        self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers, batch_first=True, bidirectional=True)

    def forward(self, x):
        # LSTM的输出包括所有隐藏状态、最后的隐藏状态和最后的细胞状态
        output, _ = self.lstm(x)
        # 只返回输出张量，不返回隐藏状态和细胞状态
        # return output
        return output[:, -1, :]  # 只返回最后一个时间步的输出

In [59]:
# 定义网络结构
class CNNBiLSTMModel(nn.Module):
    def __init__(self):
        super(CNNBiLSTMModel, self).__init__()
        self.conv1d = nn.Conv1d(in_channels=1, out_channels=64, kernel_size=122, padding='same')  # 保持输出尺寸不变
        self.maxpool1 = nn.MaxPool1d(kernel_size=5)
        self.batchnorm1 = nn.BatchNorm1d(64)
        # out.shape=(batch=32, channel = 64, seq=24(122池化后的数字))

        # input_dim = 就是nn.LSTM(input_size(x的特征维度),hidden_size,...)中的input_size,
        # 在该数据中,input_size恒为1

        self.bilstm1 = BiLSTMLayer(input_dim=64, hidden_dim=64)  # hidden_size即为上一层的输出channel

        # 此处需要将(128, ) reshape为(1,128), 因为要沿着128的方向做池化,
        # 为啥要沿128的方向,个人理解128为预测出来的特征,故继续提取特征
        self.maxpool1d2 = nn.MaxPool1d(kernel_size=5)
        self.batchnorm2 = nn.BatchNorm1d(1)

        # 第二个BiLSTM
        # input=(input_size=1, hidden_size=128, 其他默认) ,seq=25(根据上一层的输出判断的)
        self.bilstm2 = BiLSTMLayer(input_dim=1, hidden_dim=128)  # BiLSTM只取了最后一个时间步的输出
        # out.shape = (batch=32, 1(啥意思暂不明白), 256(就是128池化后的数字))

        self.dropout = nn.Dropout(0.5)  # 将上一层随机丢弃一半传入下层
        self.fc = nn.Linear(256, 5)
        # self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        x = self.conv1d(x)
        x = F.relu(x)
        x = self.maxpool1(x)
        x = self.batchnorm1(x)
        # shape=(32, 64, 24)

        x = x.permute(0, 2, 1)  # 重排维度以适配LSTM输入
        # shape=(32, 24, 64)

        # 第一个BiLSTM
        # BiLSTM.output.shape = (batch, seq, hidden_size*2) = (32, 24, 128)
        x = self.bilstm1(x)  # 但此处只取了最后一个seq, 此时x.shape=(32,128)
        x = x.unsqueeze(1)  # 增加一个维度以适配MaxPool1d
        # shape=(32,1,128)

        x = self.maxpool1d2(x)  # shape=(32, 1, 25)
        x = self.batchnorm2(x)
        # out.shape=(32, 1, 25)

        x = x.permute(0, 2, 1)  # 重排维度以适配LSTM输入
        # out.shape=(32, 25, 1)

        # 第二个BiLSTM
        x = self.bilstm2(x)
        # out.shape=(batch=32, 256)

        x = self.dropout(x)
        # x = torch.flatten(x, 1)  # 展平除batch_size外的所有维度, 但是维度已经是(batch, 256)了,没得展了
        x = self.fc(x)
        # x = self.softmax(x)
        return x

In [60]:
# [调试用]打印每一层的输出形状
def print_layer_shapes(model, input_tensor):
    def hook(module, input, output):
        print(f"{module.__class__.__name__}: {output.shape}")

    # 注册hook
    hooks = []
    for layer in model.children():
        hook_handle = layer.register_forward_hook(hook)
        hooks.append(hook_handle)

    # 前向传播
    with torch.no_grad():
        model(input_tensor)

    # 移除hooks
    for hook in hooks:
        hook.remove()

In [61]:
# 查看模型每一层的输出
CNN_LSTM_model = CNNBiLSTMModel()
in_tensor = torch.randn(64, 1, 122)  # batch_size=32, in_channels=1, sequence_length=122
print_layer_shapes(CNN_LSTM_model, in_tensor)

Conv1d: torch.Size([64, 64, 122])
MaxPool1d: torch.Size([64, 64, 24])
BatchNorm1d: torch.Size([64, 64, 24])
BiLSTMLayer: torch.Size([64, 128])
MaxPool1d: torch.Size([64, 1, 25])
BatchNorm1d: torch.Size([64, 1, 25])
BiLSTMLayer: torch.Size([64, 256])
Dropout: torch.Size([64, 256])
Linear: torch.Size([64, 5])


In [62]:
# 检查CUDA
def try_device():
    if torch.cuda.is_available():
        # 选择第一个CUDA设备
        device = torch.device("cuda:0")
        print(f"Using CUDA device: {torch.cuda.get_device_name(0)}")
    else:
        device = torch.device("cpu")
        print("CUDA is not available. Using CPU instead.")
    return device

In [63]:
# 定义数据加载器
def loop_data_loder(data_features, data_labels, batch_size):
    # 设置features
    x_columns = data_features.columns  # 取训练features的全部列名,
    x_array = data_features[x_columns].values  # x_array即为本轮循环中,模型的train_features
    # x_array.shape = (-1, 122), x_array.class=ndarray

    # 重塑features.shape为(-1, c_in=1, seq=122),使其符合网络结构输入
    x_features = np.reshape(x_array, (x_array.shape[0], 1, x_array.shape[1]))
    # shape=(-1, 1, 122)

    # 设置Class
    # 如果data_labels已经是一个包含类别名称的Series或者列，你可以这样获取类别索引:
    # 假设data_labels是类别名称的Series，你需要将这些名称映射到索引
    # 首先获取类别名称到索引的映射字典
    label_to_idx = {label: idx for idx, label in enumerate(data_labels.unique())}
    # 然后将类别名称转换为索引
    y_labels = data_labels.replace(label_to_idx).values
    # print(y_labels)
    # print(y_labels.dtype)
    # train_labels = Index(['DOS', 'Probe', 'R2L', 'U2R', 'normal'], dtype='object')
    # train_labels.shape = (-1,5)

    # 创建数据集和数据加载器
    dataset = CustomDataset(x_features, y_labels)
    data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

    return data_loader

In [64]:
# 设定超参数
learning_rate = 0.01
numb_epochs = 10
batch_size = 64
weight_decay = 0.05
device = try_device()

Using CUDA device: NVIDIA GeForce RTX 3060 Laptop GPU


In [65]:
# 分层K折
num_folds = 6
k_fold = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=42)  # 随机种子固定,保证每次生成的都一样

In [66]:
k_fold

StratifiedKFold(n_splits=6, random_state=42, shuffle=True)

In [67]:
def init_weights(m):
    if type(m) == nn.Linear or type(m) == nn.Conv2d:
        nn.init.xavier_uniform_(m.weight)

In [68]:
# 评估accuracy
def evaluate_accuracy(net, data_loader, device):
    net.eval()
    correct = 0
    total = 0

    with torch.no_grad():
        for X, y in data_loader:
            X, y = X.to(device), y.to(device)
            outputs = net(X)
            _, predicted = torch.max(outputs, 1)
            total += y.size(0)
            correct += (predicted == y).sum().item()

    accuracy = correct / total
    return accuracy

In [69]:
# 实例化模型
model = CNNBiLSTMModel()

# 初始化模型参数
model.apply(init_weights)

# 模型传入device
print('Training on', device)
model.to(device)

# 设置优化器和损失函数
optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
loss_fn = nn.CrossEntropyLoss()
train_losses = []
train_accuracies = []
test_accuracies = []

oos_pred = []  # 用于存储每个验证集的准确率

# todo 检查是否还有其他需要初始化的


Training on cuda:0


In [70]:
train_data = data.copy()    # 不改变源数据
print(data.shape)
print(train_data.shape)
# 下面两项操作都不会改变train_data数据,在模型中不需要改变
labels = train_data['Class']
print(labels.shape)
print(train_data.shape)
features  = train_data.drop(['Class'], axis=1, inplace=False)
print(features.shape)

(148517, 123)
(148517, 123)
(148517,)
(148517, 123)
(148517, 122)


In [71]:
# todo 完善训练模型
for epoch in range(numb_epochs):
    print(f'Epoch {epoch + 1}/{numb_epochs}')
    # 全部K折完算一次epoch, 共需要经历numb_epochs次迭代
    train_loss = 0.0
    correct = 0
    total = 0
    
    fold = 0
    for (train_index, val_index) in k_fold.split(features, labels):
        fold += 1
        print(f'Fold {fold}/{num_folds}')   # 当前为第fold次K折
        # 根据K折设置训练集和验证集
        train_data, val_data = features.iloc[train_index], features.iloc[val_index]
        train_labels, val_labels = labels.iloc[train_index], labels.iloc[val_index]
        # print(train_data.shape, val_data.shape)
        # print(train_labels.shape, val_labels.shape)
        
        # 设置data_loder
        # loop_data_loder(data_features, data_labels, batch_size)
        train_loder = loop_data_loder(train_data, train_labels, batch_size)
        val_loder = loop_data_loder(val_data, val_labels, batch_size)
        # 在data_loder后将数据传到device

        # todo 在训练集上训练模型
        model.train()
        for train_batch, train_label_batch in train_loder:
            optimizer.zero_grad()
            train_batch, train_label_batch = train_batch.to(device), train_label_batch.to(device)
            train_batch = train_batch.float()  # 确保输入数据类型为FloatTensor
            train_label_batch = train_label_batch.long()  # 将目标张量转换为长整型
            
            y_hat = model(train_batch)
            # print(y_hat)
            loss = loss_fn(y_hat, train_label_batch)
            loss.backward()
            optimizer.step()

            train_loss += loss.item()
            _, train_pred = torch.max(y_hat, 1)
            total += train_label_batch.size(0)
            correct += (train_pred == train_label_batch).sum().item()

        train_accuracy = correct / total
        train_loss = train_loss / len(train_loder)

        # todo 在验证集上评估模型
        model.eval()
        val_total = 0
        val_correct = 0
        val_loss = 0.0

        with torch.no_grad():
            for val_batch, val_label_batch in val_loder:
                val_batch, val_label_batch = val_batch.to(device), val_label_batch.to(device)
                val_batch = val_batch.float()  # 确保输入数据类型为FloatTensor
                # val_label_batch = val_label_batch.long()    # 将目标张量转换为长整型
                y_val = model(val_batch)
                

                loss_val = loss_fn(y_val, val_label_batch)
                val_loss += loss_val.item()
                _, val_pred = torch.max(y_val, 1)
                val_total += val_label_batch.size(0)
                val_correct += (val_pred == val_label_batch).sum().item()

        val_accuracy = val_correct / val_total
        val_loss = val_loss / len(val_loder)

        print(f'Training Loss: {train_loss}, Training Accuracy: {train_accuracy}')
        print(f'Validation Loss: {val_loss}, Validation Accuracy: {val_accuracy}')

    print('--------------------------------------')

Epoch 1/10
Fold 1/6
Training Loss: 0.32583088131145704, Training Accuracy: 0.9226754144985618
Validation Loss: 3.1408667071537146, Validation Accuracy: 0.029491374782854605
Fold 2/6
Training Loss: 0.31194368914201914, Training Accuracy: 0.9246792282085259
Validation Loss: 0.3230449287389292, Validation Accuracy: 0.9282915202197713
Fold 3/6
Training Loss: 0.32685274381233137, Training Accuracy: 0.9248085065123945
Validation Loss: 0.5417083008960852, Validation Accuracy: 0.843816911081485
Fold 4/6
Training Loss: 0.3325381259369977, Training Accuracy: 0.9250246436766749
Validation Loss: 4.137502382584012, Validation Accuracy: 0.026703833878721772
Fold 5/6
Training Loss: 0.3283541427092591, Training Accuracy: 0.9254128825829805
Validation Loss: 4.297146988777535, Validation Accuracy: 0.033288894275441364
Fold 6/6
Training Loss: 0.3257765591206733, Training Accuracy: 0.925710861382872
Validation Loss: 0.503890749981545, Validation Accuracy: 0.8467598577892695
-------------------------------

In [72]:
# [测试]前向传播过程是否正常
import torch
import torch.nn as nn

# 假设有 3 个类别
num_classes = 3
batch_size = 5

# 创建一个简单的模型
model = nn.Sequential(
    nn.Linear(10, num_classes)
)

# 定义损失函数
loss_fn = nn.CrossEntropyLoss()

# 随机生成一些输入和标签
inputs = torch.randn(batch_size, 10)
targets = torch.randint(0, num_classes, (batch_size,))

# 前向传播
outputs = model(inputs)

# 计算损失
loss = loss_fn(outputs, targets)

print(loss)


tensor(1.0117, grad_fn=<NllLossBackward0>)
