In [1]:
# =====================================================================
# Hierarchical Transformer-Enhanced Network Intrusion Detection System
# 简化输出版本 - 保持核心功能，精简输出信息
# =====================================================================

# Cell 1: 环境准备和导入库
import subprocess
import sys

# Install/upgrade the required packages; pip's stdout and stderr are discarded.
_pip_upgrade_cmd = [
    sys.executable, "-m", "pip", "install", "--upgrade",
    "scikit-learn", "imbalanced-learn",
]
subprocess.check_call(
    _pip_upgrade_cmd,
    stdout=subprocess.DEVNULL,
    stderr=subprocess.DEVNULL,
)

import numpy as np
import pandas as pd
import os
import gc
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, f1_score
from sklearn.utils.class_weight import compute_class_weight
from imblearn.over_sampling import SMOTE, BorderlineSMOTE, ADASYN
from imblearn.under_sampling import RandomUnderSampler
from imblearn.combine import SMOTETomek
from collections import Counter

from tqdm import tqdm
import warnings
import math
warnings.filterwarnings('ignore')

# Seed every random number generator used by the pipeline
def set_seed(seed=42):
    """Seed the Python, NumPy and PyTorch RNGs and make cuDNN deterministic.

    Args:
        seed: Integer seed applied to every generator.
    """
    import random  # local import: `random` is not imported at file level

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
    # Give up cuDNN autotuning in exchange for run-to-run reproducibility.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

# Fix the seeds first, then pick the compute device (GPU when one is available).
set_seed(42)
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"环境初始化完成，使用设备: {device}")

# =====================================================================
# Cell 2: Transformer增强的模型架构
# =====================================================================

class MultiScaleAttention(nn.Module):
    """Combine class-specific gating, self-attention and feature gating.

    Three views of the same feature vector are computed and blended with
    learnable, softmax-normalised fusion weights:

    * one sigmoid gate per class (bottleneck ``input_dim // 4``),
    * a multi-head self-attention over a length-1 sequence,
    * a single sigmoid gate over all features (bottleneck ``input_dim // 8``).
    """

    def __init__(self, input_dim, num_classes):
        super().__init__()
        self.num_classes = num_classes
        self.input_dim = input_dim

        # Class-specific attention: an independent gate for every class.
        self.class_attention = nn.ModuleList(
            [self._make_gate(input_dim, 4) for _ in range(num_classes)]
        )

        # "Temporal" attention: multi-head self-attention over the features.
        self.temporal_attention = nn.MultiheadAttention(
            input_dim, num_heads=4, dropout=0.1, batch_first=True
        )

        # "Spatial" attention: a single gate shared by all classes.
        self.spatial_attention = self._make_gate(input_dim, 8)

        # Fusion logits, initialised uniform; softmax is applied in forward().
        self.fusion_weights = nn.Parameter(torch.ones(3) / 3)

    @staticmethod
    def _make_gate(dim, reduction):
        """Build a bottleneck MLP ending in a sigmoid (per-feature gate)."""
        hidden = dim // reduction
        return nn.Sequential(
            nn.Linear(dim, hidden),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(hidden, dim),
            nn.Sigmoid(),
        )

    def forward(self, x):
        """Return ``(class_attended, fused_features)`` for input ``x``.

        Args:
            x: Feature tensor of shape [B, input_dim].

        Returns:
            class_attended: [B, num_classes, input_dim], the input gated once
                per class.
            fused_features: [B, input_dim], weighted blend of the three views.
        """
        # 1. Per-class gating, stacked along a new class axis.
        class_attended = torch.stack(
            [gate(x) * x for gate in self.class_attention], dim=1
        )

        # 2. Self-attention; the sequence has length 1, so every sample
        #    attends only to itself.
        sequence = x.unsqueeze(1)  # [B, 1, D]
        attn_output, _ = self.temporal_attention(sequence, sequence, sequence)
        temporal_attended = attn_output.squeeze(1)  # [B, D]

        # 3. Global feature gating.
        spatial_attended = self.spatial_attention(x) * x

        # 4. Blend: class view is averaged over classes before mixing.
        mix = F.softmax(self.fusion_weights, dim=0)
        class_view = class_attended.mean(dim=1)  # [B, D]
        fused_features = (mix[0] * class_view +
                          mix[1] * temporal_attended +
                          mix[2] * spatial_attended)

        return class_attended, fused_features

class TransformerEnhancedEnsembleModel(nn.Module):
    """MLP encoder + Transformer block feeding class-specific and global heads.

    The final logits are a per-sample learned convex combination of the
    class-specific head outputs and the global classifier output.

    Args:
        input_dim: Number of input features.
        num_classes: Number of output classes (also the number of
            class-specific heads and attention gates).
        dropout_rate: Dropout used in the encoder and classification heads.
        use_pretrained: Stored on the instance; not read inside this class.
    """

    def __init__(self, input_dim, num_classes, dropout_rate=0.3, use_pretrained=False):
        super().__init__()
        self.num_classes = num_classes
        self.use_pretrained = use_pretrained

        # Shared encoder (the part that is pretrained and transferred).
        self.shared_encoder = nn.Sequential(
            nn.Linear(input_dim, 512),
            nn.BatchNorm1d(512),
            nn.ReLU(),
            nn.Dropout(dropout_rate),
            nn.Linear(512, 256),
            nn.BatchNorm1d(256),
            nn.ReLU(),
            nn.Dropout(dropout_rate / 2)
        )

        # Transformer encoder applied on top of the shared features.
        self.feature_dim = 256
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=self.feature_dim,
            nhead=8,
            dim_feedforward=512,
            dropout=0.1,
            batch_first=True,
            activation='gelu'
        )
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=2)

        # Learnable offset added to the single sequence token.
        self.pos_encoding = nn.Parameter(torch.randn(1, 1, self.feature_dim) * 0.1)

        # Multi-scale attention block.
        self.multi_scale_attention = MultiScaleAttention(256, num_classes)

        # One binary-style head (single logit) per class.
        self.class_specific_heads = nn.ModuleList([
            nn.Sequential(
                nn.Linear(256, 128),
                nn.BatchNorm1d(128),
                nn.ReLU(),
                nn.Dropout(dropout_rate),
                nn.Linear(128, 64),
                nn.ReLU(),
                nn.Linear(64, 1)
            ) for _ in range(num_classes)
        ])

        # Global classifier producing all class logits at once.
        self.global_classifier = nn.Sequential(
            nn.Linear(256, 128),
            nn.LayerNorm(128),
            nn.GELU(),
            nn.Dropout(dropout_rate),
            nn.Linear(128, num_classes)
        )

        # Adaptive fusion: two softmax weights per sample
        # (class-specific logits vs. global logits).
        self.fusion_network = nn.Sequential(
            nn.Linear(256, 64),
            nn.ReLU(),
            nn.Linear(64, 2),
            nn.Softmax(dim=-1)
        )

        # Placeholder kept for transfer-learning use; not called in forward().
        self.feature_extractor = nn.Identity()

        # Uncertainty head: one sigmoid-bounded score per sample.
        self.uncertainty_head = nn.Sequential(
            nn.Linear(256, 64),
            nn.ReLU(),
            nn.Linear(64, 1),
            nn.Sigmoid()
        )

    def forward(self, x, return_features=False, return_uncertainty=False):
        """Compute class logits for a batch.

        Args:
            x: Input tensor of shape [B, input_dim].
            return_features: If True, return the shared-encoder output
                ([B, 256]) immediately, before the Transformer stage.
            return_uncertainty: If True, return ``(logits, uncertainty)``
                where ``uncertainty`` has shape [B, 1].

        Returns:
            Logits of shape [B, num_classes], or one of the variants above.
        """
        shared_features = self.shared_encoder(x)

        if return_features:
            return shared_features

        # Treat each sample as a one-token sequence for the Transformer.
        transformer_input = shared_features.unsqueeze(1) + self.pos_encoding
        transformer_features = self.transformer_encoder(transformer_input)
        transformer_features = transformer_features.squeeze(1)  # [B, 256]

        # Residual connection followed by a parameter-free layer norm.
        enhanced_features = F.layer_norm(
            shared_features + transformer_features,
            normalized_shape=[self.feature_dim]
        )

        class_attended_features, fused_attention_features = self.multi_scale_attention(enhanced_features)

        # Each head sees only the features gated for its own class.
        class_specific_outputs = []
        for i in range(self.num_classes):
            output = self.class_specific_heads[i](class_attended_features[:, i, :])
            class_specific_outputs.append(output)
        class_specific_logits = torch.cat(class_specific_outputs, dim=1)

        # Global logits come from the fused attention features.
        global_logits = self.global_classifier(fused_attention_features)

        # Per-sample learned mix of the two logit sources.
        fusion_weights = self.fusion_network(enhanced_features)
        final_logits = (fusion_weights[:, 0:1] * class_specific_logits +
                        fusion_weights[:, 1:2] * global_logits)

        if return_uncertainty:
            uncertainty = self.uncertainty_head(enhanced_features)
            return final_logits, uncertainty

        return final_logits

    def load_pretrained_encoder(self, pretrained_model_path):
        """Copy ``shared_encoder.*`` weights from a saved state dict, if present.

        Only keys starting with ``shared_encoder`` are loaded; all other
        parameters keep their current values. A missing file is reported
        with a message and otherwise ignored.

        Args:
            pretrained_model_path: Path to a state dict saved by ``torch.save``.
        """
        if not os.path.exists(pretrained_model_path):
            print(f"预训练模型不存在: {pretrained_model_path}")
            return

        # Deserialize onto the CPU instead of relying on a module-level
        # `device` global; load_state_dict copies the values onto whatever
        # device this model's parameters already live on.
        pretrained_state = torch.load(pretrained_model_path, map_location="cpu")
        encoder_state = {
            key: value
            for key, value in pretrained_state.items()
            if key.startswith('shared_encoder')
        }

        self.load_state_dict(encoder_state, strict=False)
        print(f"成功加载预训练编码器: {pretrained_model_path}")

# =====================================================================
# Cell 3: 改进的损失函数和训练工具
# =====================================================================

class UncertaintyAwareFocalLoss(nn.Module):
    """Focal loss with per-class gamma, optional alpha and uncertainty weighting.

    Args:
        alpha: Optional 1-D tensor of per-class weights indexed by target.
        gamma: Default focusing exponent.
        class_specific_gamma: Optional ``{class_index: gamma}`` overrides.
        uncertainty_weight: Scale applied to the uncertainty term.
    """

    def __init__(self, alpha=None, gamma=2.0, class_specific_gamma=None, uncertainty_weight=1.0):
        super().__init__()
        self.alpha = alpha
        self.gamma = gamma
        self.class_specific_gamma = class_specific_gamma if class_specific_gamma else {}
        self.uncertainty_weight = uncertainty_weight

    def forward(self, inputs, targets, uncertainty=None):
        """Return the mean focal loss over the batch.

        Args:
            inputs: Logits of shape [B, C].
            targets: Integer class indices of shape [B].
            uncertainty: Optional per-sample scores; squeezed before use.
        """
        per_sample_ce = F.cross_entropy(inputs, targets, reduction='none')
        # exp(-CE) is the softmax probability of the true class.
        prob_true = torch.exp(-per_sample_ce)

        # Start from the default gamma, then override selected classes.
        focusing = torch.full_like(targets, self.gamma, dtype=torch.float)
        for cls, cls_gamma in self.class_specific_gamma.items():
            focusing[targets == cls] = cls_gamma

        loss = (1 - prob_true) ** focusing * per_sample_ce

        # Samples with a higher uncertainty score are weighted up.
        if uncertainty is not None:
            loss = (1 + self.uncertainty_weight * uncertainty.squeeze()) * loss

        # Class-balancing weights; moved lazily to the logits' device.
        if self.alpha is not None:
            if self.alpha.device != inputs.device:
                self.alpha = self.alpha.to(inputs.device)
            loss = self.alpha[targets] * loss

        return loss.mean()

def create_stratified_sampler(y_train, sampling_strategy):
    """Dispatch to the sampling-target builder named by the strategy.

    Args:
        y_train: Training labels, forwarded unchanged to the selected builder.
        sampling_strategy: Dict whose ``'method'`` key selects the builder
            (``'hierarchical_smote'`` or ``'adaptive_smote'``).

    Returns:
        The ``{class: target_count}`` dict produced by the selected builder,
        or ``None`` for any other method.

    Raises:
        KeyError: If ``sampling_strategy`` has no ``'method'`` key.
    """
    # The labels are not inspected here; each builder counts them itself,
    # so no per-call Counter/len pass is needed.
    method = sampling_strategy['method']
    if method == 'hierarchical_smote':
        return apply_hierarchical_smote(y_train, sampling_strategy)
    if method == 'adaptive_smote':
        return apply_adaptive_smote(y_train, sampling_strategy)
    return None

def apply_hierarchical_smote(y_train, strategy):
    """Build per-class over-sampling targets using two minority tiers.

    Args:
        y_train: Iterable of hashable class labels.
        strategy: Dict with optional keys
            ``'tier1_classes'`` (critical minority classes),
            ``'tier2_classes'`` (ordinary minority classes),
            ``'tier1_ratio'`` (default 0.5) and ``'tier2_ratio'``
            (default 0.3), each a fraction of the largest class size.

    Returns:
        ``{class: target_count}``. Classes in neither tier keep their count.
        A target is never lower than the class's current count, because an
        over-sampler cannot shrink a class. Empty input yields ``{}``.
    """
    tier1_classes = strategy.get('tier1_classes', [])
    tier2_classes = strategy.get('tier2_classes', [])
    tier1_ratio = strategy.get('tier1_ratio', 0.5)
    tier2_ratio = strategy.get('tier2_ratio', 0.3)

    class_counts = Counter(y_train)
    if not class_counts:
        return {}
    max_count = max(class_counts.values())

    sampling_ratios = {}
    for class_idx, count in class_counts.items():
        if class_idx in tier1_classes:
            target = int(max_count * tier1_ratio)
        elif class_idx in tier2_classes:
            target = int(max_count * tier2_ratio)
        else:
            # Majority class: left unchanged.
            target = count
        # A tier class already above its tier target keeps its own count.
        sampling_ratios[class_idx] = max(count, target)

    return sampling_ratios

def apply_adaptive_smote(y_train, strategy):
    """Build per-class over-sampling targets from each class's share of the data.

    Classes below 1% of the samples are raised to 5% of the total, classes
    below 5% are raised to 10% of the total, and every other class keeps its
    current count.

    Args:
        y_train: Sized iterable of hashable class labels.
        strategy: Accepted for signature parity with the other builders;
            not read by this function.

    Returns:
        ``{class: target_count}``; ``{}`` for empty input.
    """
    class_counts = Counter(y_train)
    total_samples = len(y_train)

    sampling_ratios = {}
    for class_idx, count in class_counts.items():
        class_ratio = count / total_samples
        if class_ratio < 0.01:  # very rare class
            target = int(total_samples * 0.05)
        elif class_ratio < 0.05:  # minority class
            target = int(total_samples * 0.1)
        else:
            # Majority class: use the count itself. Recomputing it as
            # int(total * (count / total)) can round down by one
            # (e.g. 29 of 100 becomes 28), which an over-sampler rejects.
            target = count

        sampling_ratios[class_idx] = target

    return sampling_ratios

def train_transformer_model(model, train_loader, val_loader, device, config):
    """Train ``model`` with focal loss, a one-cycle LR schedule and early stopping.

    The checkpoint with the best validation macro-F1 is written to disk.

    Args:
        model: Network whose forward accepts ``return_uncertainty=True`` and
            then returns ``(logits, uncertainty)``.
        train_loader: Iterable of ``(inputs, labels)`` training batches.
        val_loader: Iterable of ``(inputs, labels)`` validation batches.
        device: Device that batches are moved to.
        config: Dict with required keys ``'model_name'``, ``'lr'`` and
            ``'num_epochs'``, and optional keys ``'use_pretrained'``,
            ``'weight_decay'`` (default 1e-4), ``'class_weights'``,
            ``'minority_classes'``, ``'patience'`` (default 7) and
            ``'save_dir'`` (default ``/kaggle/working``).

    Returns:
        ``(best_model_path, best_val_f1)``.
    """
    print(f"开始训练 {config['model_name']}")

    fine_tuning = config.get('use_pretrained', False)
    base_lr = config['lr']
    weight_decay = config.get('weight_decay', 1e-4)

    if fine_tuning:
        # Fine-tuning: the pretrained encoder gets the smallest learning rate,
        # the Transformer a medium one, everything else the full rate.
        encoder_params = []
        transformer_params = []
        other_params = []

        for name, param in model.named_parameters():
            if 'shared_encoder' in name:
                encoder_params.append(param)
            elif 'transformer' in name or 'pos_encoding' in name:
                transformer_params.append(param)
            else:
                other_params.append(param)

        max_lr = [base_lr * 0.1, base_lr * 0.5, base_lr]
        optimizer = optim.AdamW([
            {'params': encoder_params, 'lr': max_lr[0]},
            {'params': transformer_params, 'lr': max_lr[1]},
            {'params': other_params, 'lr': max_lr[2]}
        ], weight_decay=weight_decay)
    else:
        max_lr = base_lr
        optimizer = optim.AdamW(model.parameters(), lr=base_lr,
                                weight_decay=weight_decay)

    # One-cycle schedule (10% warm-up, cosine annealing), stepped every batch.
    scheduler = optim.lr_scheduler.OneCycleLR(
        optimizer,
        max_lr=max_lr,
        epochs=config['num_epochs'],
        steps_per_epoch=len(train_loader),
        pct_start=0.1,
        anneal_strategy='cos'
    )

    # Optional per-class alpha weights for the loss.
    class_weights = config.get('class_weights')
    if class_weights is not None:
        alpha = torch.as_tensor(class_weights, dtype=torch.float32, device=device)
    else:
        alpha = None

    # Minority classes get a larger gamma (stronger focus on hard samples).
    class_specific_gamma = {
        class_idx: 3.0 for class_idx in (config.get('minority_classes') or [])
    }

    criterion = UncertaintyAwareFocalLoss(
        alpha=alpha,
        gamma=2.0,
        class_specific_gamma=class_specific_gamma,
        uncertainty_weight=0.5
    )

    # Start below any reachable F1 so the first epoch always writes a
    # checkpoint; otherwise the returned path could point to a missing file.
    best_val_f1 = float('-inf')
    patience_counter = 0
    save_dir = config.get('save_dir', '/kaggle/working')
    os.makedirs(save_dir, exist_ok=True)
    best_model_path = f"{save_dir}/best_{config['model_name']}.pth"

    for epoch in range(config['num_epochs']):
        # ---- training ----
        model.train()
        train_preds, train_labels = [], []

        pbar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{config['num_epochs']}")
        for inputs, labels in pbar:
            inputs, labels = inputs.to(device), labels.to(device)

            optimizer.zero_grad()

            outputs, uncertainty = model(inputs, return_uncertainty=True)
            loss = criterion(outputs, labels, uncertainty)

            loss.backward()

            # Clip gradients to guard against exploding updates.
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()
            scheduler.step()

            _, predicted = torch.max(outputs, 1)
            train_preds.extend(predicted.cpu().numpy())
            train_labels.extend(labels.cpu().numpy())

            pbar.set_postfix({'Loss': f'{loss.item():.4f}'})

        # ---- validation (only predictions are needed for model selection) ----
        model.eval()
        val_preds, val_labels = [], []

        with torch.no_grad():
            for inputs, labels in val_loader:
                inputs = inputs.to(device)
                outputs = model(inputs)

                _, predicted = torch.max(outputs, 1)
                val_preds.extend(predicted.cpu().numpy())
                val_labels.extend(labels.cpu().numpy())

        train_f1 = f1_score(train_labels, train_preds, average='macro')
        val_f1 = f1_score(val_labels, val_preds, average='macro')

        if (epoch + 1) % 5 == 0:  # report every 5th epoch only
            print(f"Epoch {epoch+1}: 训练F1: {train_f1:.4f} | 验证F1: {val_f1:.4f}")

        # Keep the best checkpoint; stop once patience is exhausted.
        if val_f1 > best_val_f1:
            best_val_f1 = val_f1
            patience_counter = 0
            torch.save(model.state_dict(), best_model_path)
        else:
            patience_counter += 1
            if patience_counter >= config.get('patience', 7):
                print(f"早停触发，最佳F1: {best_val_f1:.4f}")
                break

    return best_model_path, best_val_f1

# =====================================================================
# Cell 4: 数据加载和预处理（修正版 - 与simple.ipynb保持一致）
# =====================================================================

def load_and_preprocess_data(data_path='/kaggle/input/cicids2017'):
    """Load the CIC-IDS2017 parquet files and build features and labels.

    Steps: concatenate all ``*.parquet`` files, strip column names, replace
    inf/NaN with 0, clip numeric columns to their 1st-99th percentile range,
    drop duplicate rows, normalise label text, derive binary and grouped
    multi-class labels, and drop the ``Rare_Attacks`` group.

    Args:
        data_path: Directory containing the parquet files.

    Returns:
        ``(X, y_binary, y_multi, le_binary, le_multi)``: the feature
        DataFrame, the two encoded label arrays and their fitted
        ``LabelEncoder`` objects.

    Raises:
        FileNotFoundError: If ``data_path`` contains no ``.parquet`` file.
    """
    print("数据加载和预处理中...")

    # os.listdir returns entries in arbitrary order; sort so the row order
    # (and therefore every seeded split downstream) is reproducible.
    parquet_files = sorted(
        os.path.join(data_path, f)
        for f in os.listdir(data_path)
        if f.endswith('.parquet')
    )
    if not parquet_files:
        raise FileNotFoundError(f"No .parquet files found in {data_path}")

    df_list = []
    for file in tqdm(parquet_files, desc="加载数据文件", leave=False):
        df_list.append(pd.read_parquet(file))

    df = pd.concat(df_list, ignore_index=True)
    del df_list
    gc.collect()

    print(f"数据形状: {df.shape}")

    # Column names may carry stray whitespace.
    df.rename(columns={col: col.strip() for col in df.columns}, inplace=True)
    label_column = 'Label'

    # Infinite and missing values both become 0.
    df.replace([np.inf, -np.inf], np.nan, inplace=True)
    df.fillna(0, inplace=True)

    # Clip outliers column by column to the [1%, 99%] quantile range.
    numeric_columns = df.select_dtypes(include=np.number).columns
    numeric_columns = [col for col in numeric_columns if col != label_column]

    for col in tqdm(numeric_columns, desc="数据清理", leave=False):
        q01, q99 = df[col].quantile([0.01, 0.99])
        df[col] = df[col].clip(lower=q01, upper=q99)

    # Remove duplicate rows.
    rows_before = df.shape[0]
    df.drop_duplicates(inplace=True)
    if rows_before != df.shape[0]:
        print(f"移除了 {rows_before - df.shape[0]:,} 条重复记录")

    # Normalise labels: drop unexpected characters, collapse whitespace.
    df[label_column] = df[label_column].astype(str).str.replace(
        r'[^a-zA-Z0-9\s-]', '', regex=True
    ).str.replace(r'\s+', ' ', regex=True).str.strip()

    # 1. Binary label: Benign vs. everything else.
    df['Binary_Label'] = df[label_column].apply(
        lambda x: 'Benign' if x == 'Benign' else 'Malicious'
    )

    # 2. Multi-class label: fine-grained attack names grouped into families.
    #    Labels not listed here are kept as they are.
    multi_class_mapping = {
        'DoS Hulk': 'DoS',
        'DoS GoldenEye': 'DoS',
        'DoS slowloris': 'DoS',
        'DoS Slowhttptest': 'DoS',
        'FTP-Patator': 'Brute_Force',
        'SSH-Patator': 'Brute_Force',
        'Web Attack Brute Force': 'Web_Attack',
        'Web Attack XSS': 'Web_Attack',
        'Web Attack Sql Injection': 'Web_Attack',
        'PortScan': 'PortScan',
        'Bot': 'Bot',
        'Infiltration': 'Rare_Attacks',
        'Heartbleed': 'Rare_Attacks'
    }

    df['Multi_Label'] = df[label_column].replace(multi_class_mapping)

    # Drop the rare-attack group (too few samples).
    df = df[~df['Multi_Label'].isin(['Rare_Attacks'])]

    print("标签分布:")
    print(df['Multi_Label'].value_counts())

    # Features are every column except the three label columns.
    feature_columns = [col for col in df.columns
                       if col not in [label_column, 'Binary_Label', 'Multi_Label']]
    X = df[feature_columns].copy()

    # Encode both label variants as integers.
    le_binary = LabelEncoder()
    y_binary = le_binary.fit_transform(df['Binary_Label'])

    le_multi = LabelEncoder()
    y_multi = le_multi.fit_transform(df['Multi_Label'])

    print(f"特征维度: {X.shape[1]}")
    print(f"多分类类别: {list(le_multi.classes_)}")

    return X, y_binary, y_multi, le_binary, le_multi

# Run the data loading and preprocessing step
X, y_binary, y_multi, le_binary, le_multi = load_and_preprocess_data()

# =====================================================================
# Cell 5: 第一阶段 - 训练二分类预训练模型
# =====================================================================

print("\n=== 第一阶段：训练Transformer增强的二分类预训练模型 ===")

# Stratified split: 20% test, then 20% of the remainder for validation.
X_train, X_test, y_train_binary, y_test_binary = train_test_split(
    X, y_binary, test_size=0.2, random_state=42, stratify=y_binary
)
X_train, X_val, y_train_binary, y_val_binary = train_test_split(
    X_train, y_train_binary, test_size=0.2, random_state=42, stratify=y_train_binary
)

# Standardise features; the scaler is fitted on the training split only.
scaler_binary = StandardScaler()
X_train_scaled = scaler_binary.fit_transform(X_train)
X_val_scaled = scaler_binary.transform(X_val)
X_test_scaled = scaler_binary.transform(X_test)

print(f"训练集大小: {len(X_train_scaled):,}")

# Balanced class weights for the loss.
class_weights_binary = compute_class_weight(
    'balanced',
    classes=np.unique(y_train_binary),
    y=y_train_binary
)

# Data loaders.
binary_batch_size = 1024
train_dataset_binary = TensorDataset(
    torch.from_numpy(X_train_scaled).float(),
    torch.from_numpy(y_train_binary).long()
)
train_loader_binary = DataLoader(
    train_dataset_binary,
    batch_size=binary_batch_size, shuffle=True, num_workers=2,
    # BatchNorm1d raises in training mode on a batch of one sample, so drop
    # the trailing batch only when it would contain exactly one sample.
    drop_last=(len(train_dataset_binary) % binary_batch_size == 1)
)

val_loader_binary = DataLoader(
    TensorDataset(torch.from_numpy(X_val_scaled).float(),
                  torch.from_numpy(y_val_binary).long()),
    batch_size=binary_batch_size, num_workers=2
)

# Binary (Benign vs. Malicious) model.
binary_model = TransformerEnhancedEnsembleModel(
    input_dim=X.shape[1],
    num_classes=2,
    dropout_rate=0.3
).to(device)

print(f"模型参数量: {sum(p.numel() for p in binary_model.parameters()):,}")

# Training configuration.
binary_config = {
    'model_name': 'TransformerBinary_Pretrain',
    'num_epochs': 20,
    'lr': 0.001,
    'weight_decay': 1e-4,
    'patience': 5,
    'class_weights': class_weights_binary,
    'minority_classes': [1],
    'use_pretrained': False
}

# Train the binary model; its encoder is reused by the multi-class stage.
best_binary_path, best_binary_f1 = train_transformer_model(
    binary_model, train_loader_binary, val_loader_binary, device, binary_config
)

print(f"二分类预训练完成，最佳F1: {best_binary_f1:.4f}")

# Free the loaders that are no longer needed.
del train_loader_binary, val_loader_binary
gc.collect()

# =====================================================================
# Cell 6: 第二阶段 - 准备多分类数据和分层采样
# =====================================================================

print("\n=== 第二阶段：准备多分类数据和分层采样 ===")

# The multi-class stage only sees malicious traffic (binary label == 1).
malicious_indices = (y_binary == 1)
X_malicious = X[malicious_indices].copy()
y_malicious_original = y_multi[malicious_indices].copy()

print(f"恶意流量样本数: {len(X_malicious):,}")

# Relabel the remaining classes to a contiguous 0..K-1 range. np.unique
# returns the sorted original labels and, via return_inverse, each sample's
# position in that list, which is exactly the new label.
unique_labels, y_malicious = np.unique(y_malicious_original, return_inverse=True)

label_mapping = {old_label: new_label for new_label, old_label in enumerate(unique_labels)}
reverse_mapping = {new_label: old_label for old_label, new_label in label_mapping.items()}

# Label encoder restricted to the malicious classes.
le_multi_subset = LabelEncoder()
class_names_subset = [le_multi.classes_[reverse_mapping[i]] for i in range(len(unique_labels))]
le_multi_subset.classes_ = np.array(class_names_subset)

print("恶意流量类别分布:")
multi_class_counts = Counter(y_malicious)
for class_idx, count in sorted(multi_class_counts.items()):
    class_name = class_names_subset[class_idx]
    percentage = count / len(y_malicious) * 100
    print(f"  {class_name}: {count:,} 样本 ({percentage:.2f}%)")

# Minority tiers: tier 1 is below 5% of malicious samples, tier 2 below 20%.
tier1_classes = []
tier2_classes = []
total_malicious = len(y_malicious)

for class_idx, count in multi_class_counts.items():
    ratio = count / total_malicious
    if ratio < 0.05:
        tier1_classes.append(class_idx)
    elif ratio < 0.2:
        tier2_classes.append(class_idx)

# Stratified split: 20% test, then 20% of the remainder for validation.
X_train_m, X_test_m, y_train_m, y_test_m = train_test_split(
    X_malicious, y_malicious, test_size=0.2, random_state=42, stratify=y_malicious
)
X_train_m, X_val_m, y_train_m, y_val_m = train_test_split(
    X_train_m, y_train_m, test_size=0.2, random_state=42, stratify=y_train_m
)

print(f"多分类训练集: {len(X_train_m):,}")

# Tiered over-sampling targets. The class sizes are counted once here rather
# than re-scanning the label array for every class.
train_class_counts = Counter(y_train_m)
max_count = max(train_class_counts.values())
sampling_ratios = {}

for class_idx in range(len(class_names_subset)):
    current_count = train_class_counts[class_idx]
    if class_idx in tier1_classes:
        target_count = max(current_count, int(max_count * 0.4))
    elif class_idx in tier2_classes:
        target_count = max(current_count, int(max_count * 0.25))
    else:
        target_count = current_count

    sampling_ratios[class_idx] = target_count

# Apply SMOTE; on any failure fall back to the un-resampled training data.
try:
    smote = SMOTE(
        sampling_strategy=sampling_ratios,
        random_state=42,
        # k_neighbors must stay below the size of the smallest class.
        k_neighbors=min(5, min(train_class_counts.values()) - 1)
    )

    X_train_m_resampled, y_train_m_resampled = smote.fit_resample(X_train_m, y_train_m)
    print(f"SMOTE采样完成: {len(X_train_m):,} → {len(X_train_m_resampled):,} 样本")

except Exception as e:
    print(f"SMOTE采样失败: {str(e)}，使用原始数据")
    X_train_m_resampled, y_train_m_resampled = X_train_m, y_train_m

# =====================================================================
# Cell 7: 第三阶段 - 端到端微调多分类模型
# =====================================================================

print("\n=== 第三阶段：端到端微调多分类模型 ===")

# Standardise features; the scaler is fitted on the (resampled) training data.
scaler_multi = StandardScaler()
X_train_m_scaled = scaler_multi.fit_transform(X_train_m_resampled)
X_val_m_scaled = scaler_multi.transform(X_val_m)
X_test_m_scaled = scaler_multi.transform(X_test_m)

# Balanced class weights computed on the resampled labels.
unique_classes = np.arange(len(class_names_subset))
class_weights_multi = compute_class_weight(
    'balanced',
    classes=unique_classes,
    y=y_train_m_resampled
)

# Data loaders.
multi_batch_size = 512
train_dataset_multi = TensorDataset(
    torch.from_numpy(X_train_m_scaled).float(),
    torch.from_numpy(y_train_m_resampled).long()
)
train_loader_multi = DataLoader(
    train_dataset_multi,
    batch_size=multi_batch_size, shuffle=True, num_workers=0,
    # The resampled size is data-dependent. BatchNorm1d raises in training
    # mode on a batch of one sample, so drop the trailing batch only when it
    # would contain exactly one sample.
    drop_last=(len(train_dataset_multi) % multi_batch_size == 1)
)

val_loader_multi = DataLoader(
    TensorDataset(torch.from_numpy(X_val_m_scaled).float(),
                  torch.from_numpy(y_val_m).long()),
    batch_size=multi_batch_size, num_workers=0
)

test_loader_multi = DataLoader(
    TensorDataset(torch.from_numpy(X_test_m_scaled).float(),
                  torch.from_numpy(y_test_m).long()),
    batch_size=multi_batch_size, num_workers=0
)

# Multi-class model.
num_classes = len(class_names_subset)

multi_model = TransformerEnhancedEnsembleModel(
    input_dim=X.shape[1],
    num_classes=num_classes,
    dropout_rate=0.4,
    use_pretrained=True
).to(device)

# Initialise the shared encoder from the binary pretraining checkpoint.
multi_model.load_pretrained_encoder(best_binary_path)

# Fine-tuning configuration.
multi_config = {
    'model_name': 'TransformerMultiClass_FineTuned',
    'num_epochs': 25,
    'lr': 0.001,
    'weight_decay': 1e-4,
    'patience': 8,
    'class_weights': class_weights_multi,
    'minority_classes': tier1_classes + tier2_classes,
    'use_pretrained': True
}

# Fine-tune the multi-class model.
best_multi_path, best_multi_f1 = train_transformer_model(
    multi_model, train_loader_multi, val_loader_multi, device, multi_config
)

print(f"多分类微调完成，最佳F1: {best_multi_f1:.4f}")

# Persist the label mappings and tier assignments.
label_mapping_info = {
    'original_to_new': label_mapping,
    'new_to_original': reverse_mapping,
    'class_names_subset': class_names_subset,
    'tier1_classes': tier1_classes,
    'tier2_classes': tier2_classes
}

import pickle
with open('/kaggle/working/label_mapping.pkl', 'wb') as f:
    pickle.dump(label_mapping_info, f)

# =====================================================================
# Cell 8: 模型评估和对比
# =====================================================================

def evaluate_transformer_model(model, model_path, test_loader, device,
                               class_names, model_name):
    """Load a checkpoint, run it over ``test_loader`` and report metrics.

    Args:
        model: Network whose forward accepts ``return_uncertainty=True`` and
            then returns ``(logits, uncertainty)``.
        model_path: Path to the state dict to load into ``model``.
        test_loader: Iterable of ``(inputs, labels)`` batches; labels are
            converted with ``.numpy()`` and must therefore be CPU tensors.
        device: Device that inputs are moved to.
        class_names: Class names indexed by label.
        model_name: Display name used in the printed report.

    Returns:
        Dict with keys ``y_true``, ``y_pred``, ``y_scores`` (softmax
        probabilities), ``macro_f1``, ``weighted_f1``, ``accuracy``,
        ``avg_uncertainty``, ``class_names`` and ``unique_labels``.
    """
    print(f"\n评估 {model_name}")

    # Load the best checkpoint.
    model.load_state_dict(torch.load(model_path, map_location=device))
    model.eval()

    y_true, y_pred, y_scores, uncertainties = [], [], [], []

    with torch.no_grad():
        for inputs, labels in tqdm(test_loader, desc="模型评估", leave=False):
            inputs = inputs.to(device)
            outputs, uncertainty = model(inputs, return_uncertainty=True)

            # Class probabilities.
            probs = F.softmax(outputs, dim=1)
            y_scores.extend(probs.cpu().numpy())

            # Predicted labels.
            _, predicted = torch.max(outputs, 1)
            y_pred.extend(predicted.cpu().numpy())
            y_true.extend(labels.numpy())
            uncertainties.extend(uncertainty.cpu().numpy())

    y_true = np.array(y_true)
    y_pred = np.array(y_pred)
    y_scores = np.array(y_scores)
    uncertainties = np.array(uncertainties)

    # Force labels into the range covered by class_names.
    max_label = max(y_true.max(), y_pred.max())
    if max_label >= len(class_names):
        y_true = np.clip(y_true, 0, len(class_names)-1)
        y_pred = np.clip(y_pred, 0, len(class_names)-1)

    # Report only the labels that actually occur.
    unique_labels = np.unique(np.concatenate([y_true, y_pred]))
    actual_class_names = [class_names[i] for i in unique_labels if i < len(class_names)]

    # Classification report (best effort).
    try:
        report = classification_report(
            y_true, y_pred,
            labels=unique_labels,
            target_names=actual_class_names,
            digits=4,
            zero_division=0
        )
        print(f"\n{model_name} 分类报告:")
        print(report)
    except Exception as e:
        print(f"生成分类报告时出错: {e}")

    # Overall metrics. A failure is reported rather than silently turned
    # into zeros; the 0.0 fallback is kept so the caller still gets a result.
    try:
        macro_f1 = f1_score(y_true, y_pred, labels=unique_labels, average='macro')
        weighted_f1 = f1_score(y_true, y_pred, labels=unique_labels, average='weighted')
    except Exception as e:
        print(f"计算F1指标时出错: {e}")
        macro_f1 = 0.0
        weighted_f1 = 0.0

    overall_accuracy = np.mean(y_true == y_pred)
    avg_uncertainty = np.mean(uncertainties)

    print(f"\n{model_name} 整体性能:")
    print(f"  准确率: {overall_accuracy:.4f}")
    print(f"  宏平均F1: {macro_f1:.4f}")
    print(f"  加权平均F1: {weighted_f1:.4f}")
    print(f"  平均不确定性: {avg_uncertainty:.4f}")

    return {
        'y_true': y_true,
        'y_pred': y_pred,
        'y_scores': y_scores,
        'macro_f1': macro_f1,
        'weighted_f1': weighted_f1,
        'accuracy': overall_accuracy,
        'avg_uncertainty': avg_uncertainty,
        'class_names': actual_class_names,
        'unique_labels': unique_labels
    }

print("\n=== 层次化Transformer增强模型评估 ===")

# Binary model, evaluated on the held-out binary test split.
binary_test_dataset = TensorDataset(
    torch.from_numpy(X_test_scaled).float(),
    torch.from_numpy(y_test_binary).long(),
)
test_loader_binary = DataLoader(binary_test_dataset, batch_size=1024, num_workers=0)

binary_class_names = le_binary.classes_.tolist()
binary_results = evaluate_transformer_model(
    binary_model, best_binary_path, test_loader_binary, device,
    binary_class_names, "Transformer增强二分类模型"
)

# Multi-class model, evaluated on the malicious-only test split.
multi_results = evaluate_transformer_model(
    multi_model, best_multi_path, test_loader_multi, device,
    class_names_subset, "Transformer增强多分类模型"
)

# Side-by-side summary of both stages.
print("\n=== 层次化Transformer增强检测系统效果总结 ===")

print(f"\n二分类模型性能:")
print(f"   准确率: {binary_results['accuracy']:.4f}")
print(f"   宏平均F1: {binary_results['macro_f1']:.4f}")

print(f"\n多分类模型性能:")
print(f"   准确率: {multi_results['accuracy']:.4f}")
print(f"   宏平均F1: {multi_results['macro_f1']:.4f}")

# Collect checkpoint path, accuracy and macro-F1 for each model.
results_summary = {
    summary_key: {
        'path': checkpoint_path,
        'accuracy': stage_results['accuracy'],
        'macro_f1': stage_results['macro_f1'],
    }
    for summary_key, checkpoint_path, stage_results in (
        ('transformer_binary_model', best_binary_path, binary_results),
        ('transformer_multi_model', best_multi_path, multi_results),
    )
}

# Write the summary as JSON; a failure is reported but does not abort the run.
import json
try:
    with open('/kaggle/working/hierarchical_transformer_results.json', 'w') as f:
        json.dump(results_summary, f, indent=2, default=str)
    print("\n结果已保存到: /kaggle/working/hierarchical_transformer_results.json")
except Exception as e:
    print(f"保存结果时出错: {e}")

print("\n层次化Transformer增强的网络入侵检测系统完成！")

环境初始化完成，使用设备: cuda
数据加载和预处理中...


                                                           

数据形状: (2313810, 78)


                                                         

移除了 84,674 条重复记录
标签分布:
Multi_Label
Benign         1892659
DoS             193730
DDoS            128014
Brute_Force       9150
Web_Attack        2143
PortScan          1956
Bot               1437
Name: count, dtype: int64
特征维度: 77
多分类类别: ['Benign', 'Bot', 'Brute_Force', 'DDoS', 'DoS', 'PortScan', 'Web_Attack']

=== 第一阶段：训练Transformer增强的二分类预训练模型 ===
训练集大小: 1,426,616
模型参数量: 1,722,730
开始训练 TransformerBinary_Pretrain


Epoch 1/20: 100%|██████████| 1394/1394 [00:30<00:00, 45.21it/s, Loss=0.0040]
Epoch 2/20: 100%|██████████| 1394/1394 [00:27<00:00, 51.62it/s, Loss=0.0098]
Epoch 3/20: 100%|██████████| 1394/1394 [00:27<00:00, 50.36it/s, Loss=0.0027]
Epoch 4/20: 100%|██████████| 1394/1394 [00:26<00:00, 51.88it/s, Loss=0.0030]
Epoch 5/20: 100%|██████████| 1394/1394 [00:26<00:00, 52.33it/s, Loss=0.0024]


Epoch 5: 训练F1: 0.9831 | 验证F1: 0.9829


Epoch 6/20: 100%|██████████| 1394/1394 [00:26<00:00, 52.17it/s, Loss=0.0216]
Epoch 7/20: 100%|██████████| 1394/1394 [00:26<00:00, 51.92it/s, Loss=0.0047]
Epoch 8/20: 100%|██████████| 1394/1394 [00:27<00:00, 51.30it/s, Loss=0.0225]
Epoch 9/20: 100%|██████████| 1394/1394 [00:27<00:00, 50.50it/s, Loss=0.0030]
Epoch 10/20: 100%|██████████| 1394/1394 [00:27<00:00, 51.54it/s, Loss=0.0015]


Epoch 10: 训练F1: 0.9862 | 验证F1: 0.9898


Epoch 11/20: 100%|██████████| 1394/1394 [00:27<00:00, 51.47it/s, Loss=0.0015]
Epoch 12/20: 100%|██████████| 1394/1394 [00:27<00:00, 51.35it/s, Loss=0.0091]
Epoch 13/20: 100%|██████████| 1394/1394 [00:26<00:00, 52.07it/s, Loss=0.0035]
Epoch 14/20: 100%|██████████| 1394/1394 [00:27<00:00, 50.37it/s, Loss=0.0032]
Epoch 15/20: 100%|██████████| 1394/1394 [00:26<00:00, 52.26it/s, Loss=0.0050]


Epoch 15: 训练F1: 0.9879 | 验证F1: 0.9866


Epoch 16/20: 100%|██████████| 1394/1394 [00:26<00:00, 52.43it/s, Loss=0.0109]
Epoch 17/20: 100%|██████████| 1394/1394 [00:26<00:00, 51.95it/s, Loss=0.0033]
Epoch 18/20: 100%|██████████| 1394/1394 [00:26<00:00, 52.17it/s, Loss=0.0052]


早停触发，最佳F1: 0.9899
二分类预训练完成，最佳F1: 0.9899

=== 第二阶段：准备多分类数据和分层采样 ===
恶意流量样本数: 336,430
恶意流量类别分布:
  Bot: 1,437 样本 (0.43%)
  Brute_Force: 9,150 样本 (2.72%)
  DDoS: 128,014 样本 (38.05%)
  DoS: 193,730 样本 (57.58%)
  PortScan: 1,956 样本 (0.58%)
  Web_Attack: 2,143 样本 (0.64%)
多分类训练集: 215,315
SMOTE采样完成: 215,315 → 404,292 样本

=== 第三阶段：端到端微调多分类模型 ===
成功加载预训练编码器: /kaggle/working/best_TransformerBinary_Pretrain.pth
开始训练 TransformerMultiClass_FineTuned


Epoch 1/25: 100%|██████████| 790/790 [00:19<00:00, 40.55it/s, Loss=0.0112]
Epoch 2/25: 100%|██████████| 790/790 [00:19<00:00, 40.76it/s, Loss=0.0258]
Epoch 3/25: 100%|██████████| 790/790 [00:19<00:00, 40.58it/s, Loss=0.0044]
Epoch 4/25: 100%|██████████| 790/790 [00:19<00:00, 40.02it/s, Loss=0.0029]
Epoch 5/25: 100%|██████████| 790/790 [00:19<00:00, 39.61it/s, Loss=0.0177]


Epoch 5: 训练F1: 0.9904 | 验证F1: 0.9090


Epoch 6/25: 100%|██████████| 790/790 [00:19<00:00, 40.30it/s, Loss=0.0036]
Epoch 7/25: 100%|██████████| 790/790 [00:19<00:00, 40.38it/s, Loss=0.0039]
Epoch 8/25: 100%|██████████| 790/790 [00:19<00:00, 40.09it/s, Loss=0.0020]
Epoch 9/25: 100%|██████████| 790/790 [00:19<00:00, 40.39it/s, Loss=0.0056]
Epoch 10/25: 100%|██████████| 790/790 [00:20<00:00, 39.31it/s, Loss=0.0022]


Epoch 10: 训练F1: 0.9916 | 验证F1: 0.9297


Epoch 11/25: 100%|██████████| 790/790 [00:19<00:00, 39.55it/s, Loss=0.0009]


早停触发，最佳F1: 0.9687
多分类微调完成，最佳F1: 0.9687

=== 层次化Transformer增强模型评估 ===

评估 Transformer增强二分类模型


                                                           


Transformer增强二分类模型 分类报告:
              precision    recall  f1-score   support

      Benign     0.9979    0.9960    0.9970    378532
   Malicious     0.9777    0.9883    0.9830     67286

    accuracy                         0.9948    445818
   macro avg     0.9878    0.9922    0.9900    445818
weighted avg     0.9949    0.9948    0.9948    445818


Transformer增强二分类模型 整体性能:
  准确率: 0.9948
  宏平均F1: 0.9900
  加权平均F1: 0.9948
  平均不确定性: 0.0001

评估 Transformer增强多分类模型


                                                            


Transformer增强多分类模型 分类报告:
              precision    recall  f1-score   support

         Bot     0.9379    1.0000    0.9680       287
 Brute_Force     1.0000    0.9781    0.9890      1830
        DDoS     0.9999    0.9993    0.9996     25603
         DoS     0.9994    0.9979    0.9986     38746
    PortScan     0.9842    0.9540    0.9688       391
  Web_Attack     0.7992    0.9930    0.8857       429

    accuracy                         0.9976     67286
   macro avg     0.9534    0.9871    0.9683     67286
weighted avg     0.9979    0.9976    0.9977     67286


Transformer增强多分类模型 整体性能:
  准确率: 0.9976
  宏平均F1: 0.9683
  加权平均F1: 0.9977
  平均不确定性: 0.0027

=== 层次化Transformer增强检测系统效果总结 ===

二分类模型性能:
   准确率: 0.9948
   宏平均F1: 0.9900

多分类模型性能:
   准确率: 0.9976
   宏平均F1: 0.9683

结果已保存到: /kaggle/working/hierarchical_transformer_results.json

层次化Transformer增强的网络入侵检测系统完成！


In [2]:
# =====================================================================
# 模型打包和后端集成准备
# =====================================================================
import os
import json
import pickle
from datetime import datetime

# Guard: every object produced by the training/evaluation cells must exist
# before anything is written to disk.  An explicit membership test is used
# rather than ``assert`` (asserts are stripped under ``python -O``), all
# missing names are reported at once, and the cell aborts here rather than
# letting a later statement fail with a less helpful NameError.
_required_names = (
    'X', 'scaler_multi', 'le_binary', 'le_multi_subset', 'class_names_subset',
    'label_mapping_info', 'tier1_classes', 'tier2_classes', 'best_binary_path',
    'best_multi_path', 'binary_results', 'multi_results',
)
# At cell/module top level ``globals()`` is the same namespace that a
# top-level ``locals()`` call would return.
_missing_names = [name for name in _required_names if name not in globals()]
if _missing_names:
    _missing_text = ', '.join(_missing_names)
    print(f"❌ 变量检查失败: {_missing_text} 变量不存在")
    print("请确保在完整的训练和评估流程之后再运行此打包代码。")
    raise RuntimeError(f"变量检查失败: {_missing_text} 变量不存在")
print("✅ 所有必要变量检查通过，开始打包...")

# Create the directory that collects all deployable artifacts.
model_package_dir = "/kaggle/working/model_package"
os.makedirs(model_package_dir, exist_ok=True)


def _dump_pickle(payload, destination):
    """Pickle ``payload`` into the file at ``destination``."""
    with open(destination, 'wb') as handle:
        pickle.dump(payload, handle)


def _dump_json(payload, destination, **json_options):
    """Write ``payload`` to ``destination`` as JSON with a 2-space indent."""
    with open(destination, 'w') as handle:
        json.dump(payload, handle, indent=2, **json_options)


# Values reused by several artifacts below.  Each consumer gets its own
# copy of the name list so no two artifacts share a list object.
feature_names = X.columns.tolist()
feature_total = len(X.columns)
multi_class_total = len(class_names_subset)

# 1. Feature scaler (scaler.pkl)
# NOTE(review): only ``scaler_multi`` is packaged — confirm the binary model
# was trained on data scaled the same way.
scaler_path = os.path.join(model_package_dir, "scaler.pkl")
_dump_pickle(scaler_multi, scaler_path)

# 2. Label encoders together with their class lists (label_encoder.pkl)
label_encoders = dict(
    binary_encoder=le_binary,
    multi_encoder=le_multi_subset,
    binary_classes=le_binary.classes_.tolist(),
    multi_classes=class_names_subset,
    label_mapping=label_mapping_info,
)
label_encoder_path = os.path.join(model_package_dir, "label_encoder.pkl")
_dump_pickle(label_encoders, label_encoder_path)

# 3. Feature selector description (feature_selector.pkl); every column of X is kept.
feature_selector = dict(
    feature_columns=list(feature_names),
    selected_features=list(feature_names),
    feature_count=feature_total,
    selection_method='all_features',
)
feature_selector_path = os.path.join(model_package_dir, "feature_selector.pkl")
_dump_pickle(feature_selector, feature_selector_path)

# 4. Selected feature list (selected_features.json)
selected_features = dict(features=list(feature_names), count=feature_total)
selected_features_path = os.path.join(model_package_dir, "selected_features.json")
_dump_json(selected_features, selected_features_path)

# 5. Both checkpoints, loaded onto the CPU and bundled into one file (model.pth)
binary_checkpoint = torch.load(best_binary_path, map_location='cpu')
multi_checkpoint = torch.load(best_multi_path, map_location='cpu')
model_state = {
    'binary_model_state': binary_checkpoint,
    'multi_model_state': multi_checkpoint,
}
model_path = os.path.join(model_package_dir, "model.pth")
torch.save(model_state, model_path)

# 6. Model metadata (model_info.json)
architecture_info = {
    'input_features': feature_total,
    'binary_classes': 2,
    'multi_classes': multi_class_total,
    'dropout_rate': 0.3,
}
class_info = {
    'binary': le_binary.classes_.tolist(),
    'multi': class_names_subset,
}
performance_info = {
    'binary_stage': {
        'accuracy': float(binary_results['accuracy']),
        'macro_f1': float(binary_results['macro_f1']),
    },
    'multi_stage': {
        'accuracy': float(multi_results['accuracy']),
        'macro_f1': float(multi_results['macro_f1']),
    },
}
model_info = {
    'model_name': 'HierarchicalTransformerIDS',
    'model_version': '1.0.0',
    'architecture': architecture_info,
    'classes': class_info,
    'performance': performance_info,
    'features': {
        'total_features': feature_total,
        'feature_names': list(feature_names),
    },
}
model_info_path = os.path.join(model_package_dir, "model_info.json")
# ``default=str`` stringifies any value json cannot encode natively.
_dump_json(model_info, model_info_path, default=str)

# 7. Generate the standalone loader module (model_loader.py).
#
# The string below is written verbatim to ``model_loader.py`` inside the
# package directory.  It defines ``MultiScaleAttention`` and
# ``TransformerEnhancedEnsembleModel`` plus ``HierarchicalTransformerIDSLoader``,
# which reads model_info.json, scaler.pkl, label_encoder.pkl and model.pth
# from the package directory and exposes ``preprocess``/``predict``.
#
# ``predict`` runs the binary model on every row and the multi-class model
# only on rows whose binary argmax is index 1.  The multi-class output
# buffers are allocated on ``self.device``: they are indexed with a mask and
# assigned values that live on that device, so CPU buffers would fail on CUDA.
#
# NOTE(review): index 1 is assumed to be the malicious class — confirm
# against ``le_binary.classes_``.
# NOTE(review): one scaler (scaler.pkl) is applied before both models —
# confirm both were trained on identically scaled inputs.
# NOTE(security): the generated loader unpickles files from the package
# directory; only load packages that come from a trusted source.
model_loader_code = '''
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import pickle
import json
import os

class MultiScaleAttention(nn.Module):
    def __init__(self, input_dim, num_classes):
        super().__init__()
        self.num_classes = num_classes
        self.input_dim = input_dim
        self.class_attention = nn.ModuleList([
            nn.Sequential(
                nn.Linear(input_dim, input_dim // 4), nn.ReLU(), nn.Dropout(0.1),
                nn.Linear(input_dim // 4, input_dim), nn.Sigmoid()
            ) for _ in range(num_classes)
        ])
        self.temporal_attention = nn.MultiheadAttention(input_dim, num_heads=4, dropout=0.1, batch_first=True)
        self.spatial_attention = nn.Sequential(
            nn.Linear(input_dim, input_dim // 8), nn.ReLU(), nn.Dropout(0.1),
            nn.Linear(input_dim // 8, input_dim), nn.Sigmoid()
        )
        self.fusion_weights = nn.Parameter(torch.ones(3) / 3)
    
    def forward(self, x):
        class_features = [self.class_attention[i](x) * x for i in range(self.num_classes)]
        class_attended = torch.stack(class_features, dim=1)
        x_temporal = x.unsqueeze(1)
        temporal_attended, _ = self.temporal_attention(x_temporal, x_temporal, x_temporal)
        temporal_attended = temporal_attended.squeeze(1)
        spatial_weights = self.spatial_attention(x)
        spatial_attended = spatial_weights * x
        weights = F.softmax(self.fusion_weights, dim=0)
        class_attended_mean = class_attended.mean(dim=1)
        fused_features = (weights[0] * class_attended_mean + 
                         weights[1] * temporal_attended + 
                         weights[2] * spatial_attended)
        return class_attended, fused_features

class TransformerEnhancedEnsembleModel(nn.Module):
    def __init__(self, input_dim, num_classes, dropout_rate=0.3):
        super().__init__()
        self.num_classes = num_classes
        self.shared_encoder = nn.Sequential(
            nn.Linear(input_dim, 512), nn.BatchNorm1d(512), nn.ReLU(), nn.Dropout(dropout_rate),
            nn.Linear(512, 256), nn.BatchNorm1d(256), nn.ReLU(), nn.Dropout(dropout_rate / 2)
        )
        self.feature_dim = 256
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=self.feature_dim, nhead=8, dim_feedforward=512,
            dropout=0.1, batch_first=True, activation='gelu'
        )
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=2)
        self.pos_encoding = nn.Parameter(torch.randn(1, 1, self.feature_dim) * 0.1)
        self.multi_scale_attention = MultiScaleAttention(256, num_classes)
        self.class_specific_heads = nn.ModuleList([
            nn.Sequential(
                nn.Linear(256, 128), nn.BatchNorm1d(128), nn.ReLU(), nn.Dropout(dropout_rate),
                nn.Linear(128, 64), nn.ReLU(), nn.Linear(64, 1)
            ) for _ in range(num_classes)
        ])
        self.global_classifier = nn.Sequential(
            nn.Linear(256, 128), nn.LayerNorm(128), nn.GELU(),
            nn.Dropout(dropout_rate), nn.Linear(128, num_classes)
        )
        self.fusion_network = nn.Sequential(nn.Linear(256, 64), nn.ReLU(), nn.Linear(64, 2), nn.Softmax(dim=-1))
        self.uncertainty_head = nn.Sequential(nn.Linear(256, 64), nn.ReLU(), nn.Linear(64, 1), nn.Sigmoid())
        
    def forward(self, x, return_uncertainty=False):
        shared_features = self.shared_encoder(x)
        transformer_input = shared_features.unsqueeze(1) + self.pos_encoding
        transformer_features = self.transformer_encoder(transformer_input).squeeze(1)
        enhanced_features = F.layer_norm(shared_features + transformer_features, normalized_shape=[self.feature_dim])
        class_attended_features, fused_attention_features = self.multi_scale_attention(enhanced_features)
        class_specific_outputs = [self.class_specific_heads[i](class_attended_features[:, i, :]) for i in range(self.num_classes)]
        class_specific_logits = torch.cat(class_specific_outputs, dim=1)
        global_logits = self.global_classifier(fused_attention_features)
        fusion_weights = self.fusion_network(enhanced_features)
        final_logits = (fusion_weights[:, 0:1] * class_specific_logits + fusion_weights[:, 1:2] * global_logits)
        if return_uncertainty:
            return final_logits, self.uncertainty_head(enhanced_features)
        return final_logits

class HierarchicalTransformerIDSLoader:
    def __init__(self, model_package_path: str):
        self.model_package_path = model_package_path
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self._load_components()
    
    def _load_components(self):
        with open(os.path.join(self.model_package_path, "model_info.json"), 'r') as f:
            self.model_info = json.load(f)
        with open(os.path.join(self.model_package_path, "scaler.pkl"), 'rb') as f:
            self.scaler = pickle.load(f)
        with open(os.path.join(self.model_package_path, "label_encoder.pkl"), 'rb') as f:
            self.label_encoders = pickle.load(f)
        model_checkpoint = torch.load(os.path.join(self.model_package_path, "model.pth"), map_location=self.device)
        
        arch_info = self.model_info['architecture']
        self.binary_model = TransformerEnhancedEnsembleModel(
            input_dim=arch_info['input_features'], num_classes=arch_info['binary_classes'],
            dropout_rate=arch_info['dropout_rate']
        ).to(self.device)
        self.multi_model = TransformerEnhancedEnsembleModel(
            input_dim=arch_info['input_features'], num_classes=arch_info['multi_classes'],
            dropout_rate=arch_info['dropout_rate']
        ).to(self.device)
        
        self.binary_model.load_state_dict(model_checkpoint['binary_model_state'])
        self.multi_model.load_state_dict(model_checkpoint['multi_model_state'])
        
        self.binary_model.eval()
        self.multi_model.eval()
    
    def preprocess(self, data: np.ndarray) -> torch.Tensor:
        scaled_data = self.scaler.transform(data)
        return torch.from_numpy(scaled_data).float().to(self.device)
    
    def predict(self, data: np.ndarray):
        with torch.no_grad():
            processed_data = self.preprocess(data)
            binary_logits, binary_uncertainty = self.binary_model(processed_data, return_uncertainty=True)
            binary_probs = F.softmax(binary_logits, dim=1)
            binary_preds = torch.argmax(binary_probs, dim=1)
            
            multi_probs = torch.zeros(len(data), len(self.model_info['classes']['multi']), device=self.device)
            multi_uncertainty = torch.zeros(len(data), 1, device=self.device)
            
            malicious_mask = binary_preds == 1
            if malicious_mask.sum() > 0:
                malicious_data = processed_data[malicious_mask]
                multi_logits, multi_unc = self.multi_model(malicious_data, return_uncertainty=True)
                multi_probs[malicious_mask] = F.softmax(multi_logits, dim=1)
                multi_uncertainty[malicious_mask] = multi_unc
            
            return {
                'binary': {
                    'predictions': binary_preds.cpu().numpy(),
                    'probabilities': binary_probs.cpu().numpy(),
                    'uncertainty': binary_uncertainty.cpu().numpy(),
                    'classes': self.model_info['classes']['binary']
                },
                'multi': {
                    'probabilities': multi_probs.cpu().numpy(),
                    'uncertainty': multi_uncertainty.cpu().numpy(),
                    'classes': self.model_info['classes']['multi']
                }
            }
'''
model_loader_path = os.path.join(model_package_dir, "model_loader.py")
# Python source files are decoded as UTF-8 by default, so write UTF-8
# explicitly instead of relying on the platform's locale encoding.
with open(model_loader_path, 'w', encoding='utf-8') as f:
    f.write(model_loader_code)

# 8. Verify package completeness: report, per expected artifact, whether
# it exists inside the package directory.
print("\n📋 验证模型包...")
required_files = [
    "scaler.pkl",
    "label_encoder.pkl",
    "feature_selector.pkl",
    "selected_features.json",
    "model.pth",
    "model_info.json",
    "model_loader.py",
]
for file in required_files:
    file_path = os.path.join(model_package_dir, file)
    # One status line per file: check mark when present, cross when missing.
    status_line = f"  ✅ {file}" if os.path.exists(file_path) else f"  ❌ {file} - 缺失"
    print(status_line)

print(f"\n🎉 模型包已生成于: {model_package_dir}")

✅ 所有必要变量检查通过，开始打包...

📋 验证模型包...
  ✅ scaler.pkl
  ✅ label_encoder.pkl
  ✅ feature_selector.pkl
  ✅ selected_features.json
  ✅ model.pth
  ✅ model_info.json
  ✅ model_loader.py

🎉 模型包已生成于: /kaggle/working/model_package


In [3]:
# =====================================================================
# 模型打包和后端集成准备 - 简化版
# =====================================================================
import os
import json
import pickle
from datetime import datetime

print("🔧 开始模型打包...")

# Names that the training/evaluation cells must have defined before the
# package can be built.
required_vars = ['X', 'scaler_multi', 'le_binary', 'le_multi_subset', 'class_names_subset', 
                'label_mapping_info', 'tier1_classes', 'tier2_classes', 'best_binary_path', 
                'best_multi_path', 'binary_results', 'multi_results']

# Explicit membership test instead of ``assert`` (asserts are stripped under
# ``python -O``).  ``globals()`` is used because ``locals()`` inside a
# comprehension would refer to the comprehension's own scope.
missing_vars = [var for var in required_vars if var not in globals()]
if missing_vars:
    # Report every missing name at once, then stop with a non-zero status.
    # ``raise SystemExit`` does not depend on the ``exit`` helper injected by
    # ``site``/IPython.
    print(f"❌ {', '.join(missing_vars)} 变量不存在")
    print("请确保在完整训练流程后运行此代码")
    raise SystemExit(1)
print("✅ 变量检查通过")

# Create (or reuse) the directory that holds the deployable package.
model_package_dir = "/kaggle/working/model_package"
os.makedirs(model_package_dir, exist_ok=True)

print("📦 保存模型组件...")


def _package_path(filename):
    """Return the path of ``filename`` inside ``model_package_dir``."""
    return os.path.join(model_package_dir, filename)


def _stage_metrics(stage_results):
    """Extract accuracy, macro-F1 and weighted-F1 (0 if absent) as plain floats."""
    return {
        'accuracy': float(stage_results['accuracy']),
        'macro_f1': float(stage_results['macro_f1']),
        'weighted_f1': float(stage_results.get('weighted_f1', 0)),
    }


# 1. Feature scaler
# NOTE(review): only ``scaler_multi`` is packaged — confirm the binary model
# was trained on data scaled the same way.
with open(_package_path("scaler.pkl"), 'wb') as scaler_file:
    pickle.dump(scaler_multi, scaler_file)

# 2. Label encoders together with their class lists
label_encoders = dict(
    binary_encoder=le_binary,
    multi_encoder=le_multi_subset,
    binary_classes=le_binary.classes_.tolist(),
    multi_classes=class_names_subset,
    label_mapping=label_mapping_info,
)
with open(_package_path("label_encoder.pkl"), 'wb') as encoder_file:
    pickle.dump(label_encoders, encoder_file)

# 3. Feature selector description (every column of X is kept)
feature_selector = dict(
    feature_columns=X.columns.tolist(),
    selected_features=X.columns.tolist(),
    feature_count=len(X.columns),
    selection_method='all_features',
)
with open(_package_path("feature_selector.pkl"), 'wb') as selector_file:
    pickle.dump(feature_selector, selector_file)

# 4. Feature list with a creation timestamp
selected_features = dict(
    features=X.columns.tolist(),
    count=len(X.columns),
    creation_date=datetime.now().isoformat(),
)
with open(_package_path("selected_features.json"), 'w') as features_file:
    json.dump(selected_features, features_file, indent=2)

# 5. Model weights: both checkpoints loaded onto the CPU, plus the
# constructor arguments needed to rebuild the networks.
binary_checkpoint = torch.load(best_binary_path, map_location='cpu')
multi_checkpoint = torch.load(best_multi_path, map_location='cpu')
model_state = {
    'binary_model_state': binary_checkpoint,
    'multi_model_state': multi_checkpoint,
    'model_architecture': {
        'input_dim': X.shape[1],
        'binary_classes': 2,
        'multi_classes': len(class_names_subset),
        'dropout_rate': 0.3,
    },
}
torch.save(model_state, _package_path("model.pth"))

# 6. Model metadata, assembled section by section
created_stamp = datetime.now().isoformat()
architecture_section = {
    'input_features': len(X.columns),
    'binary_classes': 2,
    'multi_classes': len(class_names_subset),
    'dropout_rate': 0.3,
}
classes_section = {
    'binary': le_binary.classes_.tolist(),
    'multi': class_names_subset,
    # tier1_classes / tier2_classes hold indices into class_names_subset.
    'tier1_critical': [class_names_subset[i] for i in tier1_classes],
    'tier2_minority': [class_names_subset[i] for i in tier2_classes],
}
performance_section = {
    'binary_stage': _stage_metrics(binary_results),
    'multi_stage': _stage_metrics(multi_results),
}
features_section = {
    'total_features': len(X.columns),
    'feature_names': X.columns.tolist(),
    'preprocessing': 'StandardScaler + Outlier Clipping',
}
deployment_section = {
    'input_format': 'numpy array of shape (batch_size, n_features)',
    'requires_preprocessing': True,
    'batch_inference': True,
}
model_info = {
    'model_name': 'HierarchicalTransformerIDS',
    'model_version': '1.0.0',
    'created_date': created_stamp,
    'architecture': architecture_section,
    'classes': classes_section,
    'performance': performance_section,
    'features': features_section,
    'deployment': deployment_section,
}
# ``default=str`` stringifies any value json cannot encode natively.
with open(_package_path("model_info.json"), 'w') as info_file:
    json.dump(model_info, info_file, indent=2, default=str)

# 7. Generate the standalone loader module (model_loader.py).
#
# The string below is written verbatim to ``model_loader.py`` inside the
# package directory.  It defines ``MultiScaleAttention`` and
# ``TransformerEnhancedEnsembleModel`` plus ``HierarchicalTransformerIDSLoader``,
# which reads model_info.json, scaler.pkl, label_encoder.pkl and model.pth
# from the package directory and exposes ``preprocess``, ``predict`` and
# ``get_model_info``.
#
# ``predict`` runs the binary model on every row and the multi-class model
# only on rows whose binary argmax is index 1.  The multi-class output
# buffers are allocated on ``self.device``: they are indexed with a mask and
# assigned values that live on that device, so CPU buffers would fail on CUDA.
#
# NOTE(review): index 1 is assumed to be the malicious class — confirm
# against ``le_binary.classes_``.
# NOTE(review): one scaler (scaler.pkl) is applied before both models —
# confirm both were trained on identically scaled inputs.
# NOTE(security): the generated loader unpickles files from the package
# directory; only load packages that come from a trusted source.
model_loader_code = '''import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import pickle
import json
import os

class MultiScaleAttention(nn.Module):
    def __init__(self, input_dim, num_classes):
        super().__init__()
        self.num_classes = num_classes
        self.class_attention = nn.ModuleList([
            nn.Sequential(nn.Linear(input_dim, input_dim//4), nn.ReLU(), nn.Dropout(0.1),
                         nn.Linear(input_dim//4, input_dim), nn.Sigmoid())
            for _ in range(num_classes)
        ])
        self.temporal_attention = nn.MultiheadAttention(input_dim, num_heads=4, dropout=0.1, batch_first=True)
        self.spatial_attention = nn.Sequential(nn.Linear(input_dim, input_dim//8), nn.ReLU(), nn.Dropout(0.1),
                                             nn.Linear(input_dim//8, input_dim), nn.Sigmoid())
        self.fusion_weights = nn.Parameter(torch.ones(3) / 3)
    
    def forward(self, x):
        # 类别注意力
        class_features = [self.class_attention[i](x) * x for i in range(self.num_classes)]
        class_attended = torch.stack(class_features, dim=1)
        # 时间注意力
        x_temporal = x.unsqueeze(1)
        temporal_attended, _ = self.temporal_attention(x_temporal, x_temporal, x_temporal)
        temporal_attended = temporal_attended.squeeze(1)
        # 空间注意力
        spatial_attended = self.spatial_attention(x) * x
        # 融合
        weights = F.softmax(self.fusion_weights, dim=0)
        class_attended_mean = class_attended.mean(dim=1)
        fused = weights[0] * class_attended_mean + weights[1] * temporal_attended + weights[2] * spatial_attended
        return class_attended, fused

class TransformerEnhancedEnsembleModel(nn.Module):
    def __init__(self, input_dim, num_classes, dropout_rate=0.3):
        super().__init__()
        self.num_classes = num_classes
        # 编码器
        self.shared_encoder = nn.Sequential(
            nn.Linear(input_dim, 512), nn.BatchNorm1d(512), nn.ReLU(), nn.Dropout(dropout_rate),
            nn.Linear(512, 256), nn.BatchNorm1d(256), nn.ReLU(), nn.Dropout(dropout_rate/2)
        )
        # Transformer
        self.feature_dim = 256
        encoder_layer = nn.TransformerEncoderLayer(d_model=self.feature_dim, nhead=8, dim_feedforward=512,
                                                 dropout=0.1, batch_first=True, activation='gelu')
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=2)
        self.pos_encoding = nn.Parameter(torch.randn(1, 1, self.feature_dim) * 0.1)
        # 注意力机制
        self.multi_scale_attention = MultiScaleAttention(256, num_classes)
        # 分类头
        self.class_specific_heads = nn.ModuleList([
            nn.Sequential(nn.Linear(256, 128), nn.BatchNorm1d(128), nn.ReLU(), nn.Dropout(dropout_rate),
                         nn.Linear(128, 64), nn.ReLU(), nn.Linear(64, 1))
            for _ in range(num_classes)
        ])
        self.global_classifier = nn.Sequential(nn.Linear(256, 128), nn.LayerNorm(128), nn.GELU(),
                                             nn.Dropout(dropout_rate), nn.Linear(128, num_classes))
        self.fusion_network = nn.Sequential(nn.Linear(256, 64), nn.ReLU(), nn.Linear(64, 2), nn.Softmax(dim=-1))
        self.uncertainty_head = nn.Sequential(nn.Linear(256, 64), nn.ReLU(), nn.Linear(64, 1), nn.Sigmoid())
        
    def forward(self, x, return_uncertainty=False):
        # 特征提取
        shared_features = self.shared_encoder(x)
        # Transformer处理
        transformer_input = shared_features.unsqueeze(1) + self.pos_encoding
        transformer_features = self.transformer_encoder(transformer_input).squeeze(1)
        enhanced_features = F.layer_norm(shared_features + transformer_features, normalized_shape=[self.feature_dim])
        # 多尺度注意力
        class_attended_features, fused_attention_features = self.multi_scale_attention(enhanced_features)
        # 分类
        class_outputs = [self.class_specific_heads[i](class_attended_features[:, i, :]) for i in range(self.num_classes)]
        class_logits = torch.cat(class_outputs, dim=1)
        global_logits = self.global_classifier(fused_attention_features)
        fusion_weights = self.fusion_network(enhanced_features)
        final_logits = fusion_weights[:, 0:1] * class_logits + fusion_weights[:, 1:2] * global_logits
        
        if return_uncertainty:
            return final_logits, self.uncertainty_head(enhanced_features)
        return final_logits

class HierarchicalTransformerIDSLoader:
    def __init__(self, model_package_path: str):
        self.model_package_path = model_package_path
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self._load_components()
    
    def _load_components(self):
        # 加载配置和组件
        with open(os.path.join(self.model_package_path, "model_info.json"), 'r') as f:
            self.model_info = json.load(f)
        with open(os.path.join(self.model_package_path, "scaler.pkl"), 'rb') as f:
            self.scaler = pickle.load(f)
        with open(os.path.join(self.model_package_path, "label_encoder.pkl"), 'rb') as f:
            self.label_encoders = pickle.load(f)
        
        # 加载模型
        model_checkpoint = torch.load(os.path.join(self.model_package_path, "model.pth"), map_location=self.device)
        arch_info = self.model_info['architecture']
        
        self.binary_model = TransformerEnhancedEnsembleModel(
            input_dim=arch_info['input_features'], num_classes=arch_info['binary_classes'],
            dropout_rate=arch_info['dropout_rate']).to(self.device)
        self.multi_model = TransformerEnhancedEnsembleModel(
            input_dim=arch_info['input_features'], num_classes=arch_info['multi_classes'],
            dropout_rate=arch_info['dropout_rate']).to(self.device)
        
        self.binary_model.load_state_dict(model_checkpoint['binary_model_state'])
        self.multi_model.load_state_dict(model_checkpoint['multi_model_state'])
        self.binary_model.eval()
        self.multi_model.eval()
    
    def preprocess(self, data: np.ndarray) -> torch.Tensor:
        scaled_data = self.scaler.transform(data)
        return torch.from_numpy(scaled_data).float().to(self.device)
    
    def predict(self, data: np.ndarray):
        with torch.no_grad():
            processed_data = self.preprocess(data)
            
            # 二分类预测
            binary_logits, binary_uncertainty = self.binary_model(processed_data, return_uncertainty=True)
            binary_probs = F.softmax(binary_logits, dim=1)
            binary_preds = torch.argmax(binary_probs, dim=1)
            
            # 多分类预测（仅恶意流量）
            multi_probs = torch.zeros(len(data), len(self.model_info['classes']['multi']), device=self.device)
            multi_uncertainty = torch.zeros(len(data), 1, device=self.device)
            
            malicious_mask = binary_preds == 1
            if malicious_mask.sum() > 0:
                malicious_data = processed_data[malicious_mask]
                multi_logits, multi_unc = self.multi_model(malicious_data, return_uncertainty=True)
                multi_probs[malicious_mask] = F.softmax(multi_logits, dim=1)
                multi_uncertainty[malicious_mask] = multi_unc
            
            return {
                'binary': {
                    'predictions': binary_preds.cpu().numpy(),
                    'probabilities': binary_probs.cpu().numpy(),
                    'uncertainty': binary_uncertainty.cpu().numpy(),
                    'classes': self.model_info['classes']['binary']
                },
                'multi': {
                    'probabilities': multi_probs.cpu().numpy(),
                    'uncertainty': multi_uncertainty.cpu().numpy(),
                    'classes': self.model_info['classes']['multi']
                },
                'metadata': {
                    'model_name': self.model_info['model_name'],
                    'num_samples': len(data)
                }
            }
    
    def get_model_info(self):
        return self.model_info

# 使用示例
# loader = HierarchicalTransformerIDSLoader("/path/to/model_package")
# results = loader.predict(network_traffic_data)
'''

# The generated source contains non-ASCII comments and Python decodes source
# files as UTF-8 by default, so write UTF-8 explicitly instead of relying on
# the platform's locale encoding.
with open(os.path.join(model_package_dir, "model_loader.py"), 'w', encoding='utf-8') as f:
    f.write(model_loader_code)

# 8. Generate a quick smoke-test script (test_model.py).
#
# The template is an f-string: ``{len(X.columns)}`` is filled in now (the
# feature count is fixed at packaging time), while every ``{{...}}`` pair is
# emitted as literal braces so the expression is evaluated when the
# generated script runs.  The performance line uses doubled braces so the
# script reports the metrics stored in the package's own model_info.json
# rather than numbers frozen into the script text.
#
# The generated script resolves the package directory from ``__file__`` for
# both the import path and the loader, so it works from any working directory.
test_script = f'''import sys
import os
package_dir = os.path.dirname(os.path.abspath(__file__))
sys.path.append(package_dir)
from model_loader import HierarchicalTransformerIDSLoader
import numpy as np

def test_model():
    try:
        print("🔄 测试模型加载...")
        loader = HierarchicalTransformerIDSLoader(package_dir)
        print("✅ 模型加载成功")
        
        print("🔄 测试预测...")
        sample_data = np.random.randn(3, {len(X.columns)})
        results = loader.predict(sample_data)
        print("✅ 预测成功")
        print(f"二分类形状: {{results['binary']['probabilities'].shape}}")
        print(f"多分类形状: {{results['multi']['probabilities'].shape}}")
        
        model_info = loader.get_model_info()
        print(f"\\n📋 模型: {{model_info['model_name']}} v{{model_info['model_version']}}")
        print(f"性能: 二分类F1={{model_info['performance']['binary_stage']['macro_f1']:.3f}}, "
              f"多分类F1={{model_info['performance']['multi_stage']['macro_f1']:.3f}}")
        print("\\n🎉 测试通过！")
        return True
    except Exception as e:
        print(f"❌ 测试失败: {{e}}")
        return False

if __name__ == "__main__":
    success = test_model()
    sys.exit(0 if success else 1)
'''

# The script contains non-ASCII text and Python decodes source files as
# UTF-8 by default, so write UTF-8 explicitly.
with open(os.path.join(model_package_dir, "test_model.py"), 'w', encoding='utf-8') as f:
    f.write(test_script)

# 9. Verify completeness: print one status line per expected artifact and
# remember whether any of them is missing.
print("📋 验证模型包...")
required_files = [
    "scaler.pkl",
    "label_encoder.pkl",
    "feature_selector.pkl",
    "selected_features.json",
    "model.pth",
    "model_info.json",
    "model_loader.py",
    "test_model.py",
]

all_good = True
for file in required_files:
    if not os.path.exists(os.path.join(model_package_dir, file)):
        print(f"  ❌ {file} - 缺失")
        all_good = False
        continue
    print(f"  ✅ {file}")

# 10. Summary: on success report location, total size of every entry in the
# package directory (in MiB) and usage hints; otherwise report the failure.
if not all_good:
    print("\n❌ 模型包生成不完整，请检查错误信息")
else:
    package_bytes = 0
    for entry in os.listdir(model_package_dir):
        package_bytes += os.path.getsize(os.path.join(model_package_dir, entry))
    total_size = package_bytes / (1024*1024)
    print(f"\n🎉 模型包生成完成!")
    print(f"📍 位置: {model_package_dir}")
    print(f"📊 大小: {total_size:.1f}MB")
    print(f"🧪 运行测试: cd {model_package_dir} && python test_model.py")
    print(f"\n🚀 集成示例:")
    print(f"from model_loader import HierarchicalTransformerIDSLoader")
    print(f"loader = HierarchicalTransformerIDSLoader('/path/to/model_package')")
    print(f"results = loader.predict(data)")

print("\n✨ 打包完成！")

🔧 开始模型打包...
✅ 变量检查通过
📦 保存模型组件...
📋 验证模型包...
  ✅ scaler.pkl
  ✅ label_encoder.pkl
  ✅ feature_selector.pkl
  ✅ selected_features.json
  ✅ model.pth
  ✅ model_info.json
  ✅ model_loader.py
  ✅ test_model.py

🎉 模型包生成完成!
📍 位置: /kaggle/working/model_package
📊 大小: 14.4MB
🧪 运行测试: cd /kaggle/working/model_package && python test_model.py

🚀 集成示例:
from model_loader import HierarchicalTransformerIDSLoader
loader = HierarchicalTransformerIDSLoader('/path/to/model_package')
results = loader.predict(data)

✨ 打包完成！
