In [8]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.ensemble import RandomForestClassifier
from torch.utils.data import DataLoader, TensorDataset

# Load the dataset
data = pd.read_csv("02-16-2018.csv", low_memory=False)
data = data.sample(n=400000, random_state=42).reset_index(drop=True)

# Normalize label values
data['Label'] = data['Label'].replace({
    'DoS attacks-Hulk': 1,
    'DoS attacks-SlowHTTPTest': 2,
    'Benign': 0,
    'Other': 2
})
data['Label'] = pd.to_numeric(data['Label'], errors='coerce').fillna(0).astype(int)
pd.set_option('future.no_silent_downcasting', True)
data = data.infer_objects(copy=False)

# Inject 10% label noise
np.random.seed(42)
noise_indices = np.random.choice(len(data), size=int(0.10 * len(data)), replace=False)
data.loc[noise_indices, 'Label'] = np.random.randint(0, 3, size=len(noise_indices))

# Separate features
categorical_features = data.select_dtypes(include=['object']).columns.tolist()
numerical_features = data.select_dtypes(include=['number']).columns.tolist()
categorical_features = [col for col in categorical_features if col != 'Label']
numerical_features = [col for col in numerical_features if col != 'Label']

# Remove high-cardinality categorical features
categorical_features = [col for col in categorical_features if data[col].nunique() <= 100]

# One-hot encode categorical features
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=True)
encoded = encoder.fit_transform(data[categorical_features])
encoded = encoded.toarray() if encoded.shape[1] < 10000 else encoded

if isinstance(encoded, np.ndarray):
    encoded_df = pd.DataFrame(encoded, columns=encoder.get_feature_names_out(categorical_features))
    X = pd.concat([data[numerical_features], encoded_df], axis=1)
else:
    X = data[numerical_features]
y = data['Label']

# Feature selection using RandomForest importance
rf = RandomForestClassifier(n_estimators=100, random_state=42)  # Increased n_estimators
rf.fit(X.fillna(0), y)
importances = rf.feature_importances_
indices = np.argsort(importances)[::-1][:20]  # Selecting top 20 features
selected_feature_names = X.columns[indices]
X_selected = X[selected_feature_names]

print("Selected Features:", selected_feature_names)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.3, random_state=42)

# Normalize
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Mixup augmentation
def mixup_data(x, y, alpha=0.2):
    lam = np.random.beta(alpha, alpha)
    batch_size = x.size()[0]
    index = torch.randperm(batch_size)
    mixed_x = lam * x + (1 - lam) * x[index, :]
    y_a, y_b = y, y[index]
    return mixed_x, y_a, y_b, lam

# Label smoothing
class LabelSmoothingLoss(nn.Module):
    def __init__(self, classes, smoothing=0.1):
        super(LabelSmoothingLoss, self).__init__()
        self.confidence = 1.0 - smoothing
        self.smoothing = smoothing
        self.cls = classes

    def forward(self, x, target):
        log_probs = nn.functional.log_softmax(x, dim=-1)
        true_dist = torch.zeros_like(log_probs)
        true_dist.fill_(self.smoothing / (self.cls - 1))
        true_dist.scatter_(1, target.unsqueeze(1), self.confidence)
        return torch.mean(torch.sum(-true_dist * log_probs, dim=-1))

# Tensor conversion
X_train_tensor = torch.tensor(X_train, dtype=torch.float32).unsqueeze(1)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32).unsqueeze(1)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.long)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.long)

train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)
train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=128, shuffle=False)

# CNN+LSTM Model with Enhancements
class EnhancedCNN_LSTM(nn.Module):
    def __init__(self, input_dim, hidden_dim, num_classes):
        super(EnhancedCNN_LSTM, self).__init__()
        self.conv1 = nn.Conv1d(1, 64, kernel_size=3, padding=1)  # Increased filters
        self.conv2 = nn.Conv1d(64, 128, kernel_size=3, padding=1)  # Added second convolution layer
        self.relu = nn.ReLU()
        self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers=2, batch_first=True, dropout=0.3)  # More LSTM layers
        self.dropout = nn.Dropout(0.5)
        self.fc = nn.Linear(hidden_dim, num_classes)

    def forward(self, x):
        x = self.relu(self.conv1(x))
        x = self.relu(self.conv2(x))
        x, _ = self.lstm(x)
        x = self.dropout(x[:, -1, :])
        return self.fc(x)

# Initialize
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = EnhancedCNN_LSTM(X_train_tensor.shape[2], hidden_dim=128, num_classes=3).to(device)  # Larger hidden_dim
criterion = LabelSmoothingLoss(classes=3, smoothing=0.05)  # Reduced smoothing to 0.05 for less smoothing
optimizer = optim.Adam(model.parameters(), lr=0.0005, weight_decay=1e-4)  # Reduced learning rate

# Training
best_acc = 0
patience = 4
counter = 0
for epoch in range(20):
    model.train()
    running_loss = 0
    correct, total = 0, 0
    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        inputs, targets_a, targets_b, lam = mixup_data(inputs, labels)
        outputs = model(inputs)
        loss = lam * criterion(outputs, targets_a) + (1 - lam) * criterion(outputs, targets_b)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        _, predicted = torch.max(outputs, 1)
        correct += (predicted == labels).sum().item()
        total += labels.size(0)
    train_acc = 100 * correct / total

    # Evaluation
    model.eval()
    correct_test, total_test = 0, 0
    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            _, predicted = torch.max(outputs, 1)
            correct_test += (predicted == labels).sum().item()
            total_test += labels.size(0)
    test_acc = 100 * correct_test / total_test
    print(f"Epoch {epoch+1}/20, Loss: {running_loss/len(train_loader):.4f}, Train Acc: {train_acc:.2f}%, Test Acc: {test_acc:.2f}%")
    if test_acc > best_acc:
        best_acc = test_acc
        counter = 0
    else:
        counter += 1
        if counter >= patience:
            print("Early stopping.")
            break

# Final Evaluation
print(f"\nFinal Test Accuracy: {best_acc:.2f}%")


Selected Features: Index(['Pkt Len Max_935', 'Fwd Header Len_168', 'Tot Fwd Pkts_5',
       'Tot Bwd Pkts_0', 'Bwd Header Len_0', 'Subflow Bwd Pkts_0',
       'Fwd Seg Size Min_40', 'Fwd Act Data Pkts_1', 'Init Bwd Win Byts_-1',
       'Init Fwd Win Byts_225', 'Subflow Fwd Pkts_5', 'ACK Flag Cnt_1',
       'Fwd Header Len_40', 'Fwd Act Data Pkts_0', 'Init Fwd Win Byts_26883',
       'Init Bwd Win Byts_211', 'ACK Flag Cnt_0', 'Bwd Header Len_104',
       'Pkt Len Max_0', 'Subflow Bwd Pkts_3'],
      dtype='object')
Epoch 1/20, Loss: 0.3649, Train Acc: 70.15%, Test Acc: 77.75%
Epoch 2/20, Loss: 0.3472, Train Acc: 71.17%, Test Acc: 77.75%
Epoch 3/20, Loss: 0.3401, Train Acc: 67.96%, Test Acc: 77.76%
Epoch 4/20, Loss: 0.3439, Train Acc: 70.47%, Test Acc: 77.76%
Epoch 5/20, Loss: 0.3407, Train Acc: 69.62%, Test Acc: 77.76%
Epoch 6/20, Loss: 0.3384, Train Acc: 70.01%, Test Acc: 77.76%
Epoch 7/20, Loss: 0.3408, Train Acc: 70.28%, Test Acc: 77.76%
Early stopping.

Final Test Accuracy: 77.76%
