In [75]:
import os
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms
import torchvision.models as models
from torch.utils.data import Dataset, DataLoader, random_split
from PIL import Image
from sklearn.model_selection import train_test_split
import math

# Step 1: Load the CSV dataset
def load_data(csv_path):
    data = pd.read_csv(csv_path, low_memory=False)
    data = data.sample(frac=0.01, random_state=42)  # Shuffle data
    return data

# Step 2: Convert numerical features to grayscale images
def convert_to_image(data, save_dir):
    os.makedirs(save_dir, exist_ok=True)

    # Identify the label column dynamically
    label_col = None
    for col in data.columns:
        if "label" in col.lower() or "class" in col.lower() or "attack" in col.lower():
            label_col = col
            break

    if label_col is None:
        raise ValueError("No label column found in dataset. Check column names!")

    # Extract feature columns (excluding the label)
    feature_columns = data.select_dtypes(include=[np.number]).columns.tolist()
    if label_col in feature_columns:
        feature_columns.remove(label_col)

    print(f"Using label column: {label_col}")  # Debugging output

    # Ensure all feature columns are numeric and fill NaN values
    data[feature_columns] = data[feature_columns].apply(pd.to_numeric, errors='coerce')
    data[feature_columns] = data[feature_columns].fillna(0)  # Fill NaNs with 0

    images, labels = [], []
    num_features = len(feature_columns)
    image_size = math.ceil(math.sqrt(num_features))  # Find the next perfect square

    for i, row in enumerate(data.iterrows()):
        features = row[1][feature_columns].values
        
        # Normalize features to [0, 1] range
        min_features = np.min(features)
        max_features = np.max(features)
        
        # Avoid division by zero if max == min
        if max_features != min_features:
            normalized_features = (features - min_features) / (max_features - min_features)
        else:
            normalized_features = np.zeros_like(features)  # All values are the same, set to 0

        # Scale features to [0, 255] and convert to uint8
        scaled_features = (normalized_features * 255).astype(np.uint8)
        
        # Reshape into a square image
        padded_features = np.pad(scaled_features, (0, image_size**2 - num_features), 'constant')
        image_matrix = padded_features.reshape(image_size, image_size)

        # Convert to image
        img = Image.fromarray(image_matrix, mode='L')
        image_path = os.path.join(save_dir, f"{i}.png")
        img.save(image_path)

        # Append image path and label
        images.append(image_path)
        labels.append(row[1][label_col])

    return images, labels

# Step 3: Create custom dataset class
class DDoSDataset(Dataset):
    def __init__(self, image_paths, labels, transform=None):
        self.image_paths = image_paths
        self.labels = labels
        self.transform = transform
        
        # Create a dynamic label-to-index mapping
        self.label_map = {label: idx for idx, label in enumerate(sorted(set(labels)))}
        
        # Debugging: Print out the label mapping to ensure it's correct
        print(f"Label mapping: {self.label_map}")
        
        # Store the number of classes dynamically
        self.num_classes = len(self.label_map)

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, idx):
        # Load image
        img = Image.open(self.image_paths[idx])

        # Convert grayscale to RGB if the image is in grayscale
        if img.mode != 'RGB':
            img = img.convert('RGB')

        # Get the label and map it to integer (dynamic number of classes)
        label = self.labels[idx]
        label = self.label_map[label]  # Convert the label to an integer

        # Apply transformations if any
        if self.transform:
            img = self.transform(img)

        return img, torch.tensor(label)

# Step 4: Define transformations for images
transform = transforms.Compose([
    transforms.Resize((128, 128)),  # Resize to fixed size
    transforms.ToTensor(),  # Convert image to Tensor
    transforms.Normalize(mean=[0.5], std=[0.5])  # Normalize if needed
])

# Step 5: Load the data and convert to images
data = load_data("/Users/book_kuno/Desktop/DDoS 2018/02-21-2018.csv")  # Change to your file path
image_paths, labels = convert_to_image(data, "./ddos_images")

# Step 6: Split dataset into training, validation, and test sets
train_imgs, test_imgs, train_labels, test_labels = train_test_split(image_paths, labels, test_size=0.2, random_state=42)
train_imgs, val_imgs, train_labels, val_labels = train_test_split(train_imgs, train_labels, test_size=0.2, random_state=42)

# Step 7: Create datasets and dataloaders
train_dataset = DDoSDataset(train_imgs, train_labels, transform)
val_dataset = DDoSDataset(val_imgs, val_labels, transform)
test_dataset = DDoSDataset(test_imgs, test_labels, transform)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)


Using label column: Label
Label mapping: {'Benign': 0, 'DDOS attack-HOIC': 1, 'DDOS attack-LOIC-UDP': 2}
Label mapping: {'Benign': 0, 'DDOS attack-HOIC': 1, 'DDOS attack-LOIC-UDP': 2}
Label mapping: {'Benign': 0, 'DDOS attack-HOIC': 1, 'DDOS attack-LOIC-UDP': 2}


In [76]:
# Step 8: Load pre-trained ResNet-18 and modify it for dynamic number of classes
model = models.resnet18(pretrained=True) #modeify if want to custom the parameters?
num_features = model.fc.in_features
# Modify the fully connected layer to handle a dynamic number of classes
model.fc = nn.Linear(num_features, len(set(labels)))  # len(set(labels)) is the number of unique classes
model = model.to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))


# Step 9: Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Step 10: Define transformations (including resizing and normalization for ResNet)
transform = transforms.Compose([
    transforms.Resize((224, 224)),  # Resize images to 224x224 for ResNet
    transforms.ToTensor(),  # Convert image to Tensor
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])  # ResNet normalization
])

# Step 10: Train the model
def train_model(model, train_loader, val_loader, criterion, optimizer, epochs=10):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.train()
    for epoch in range(epochs):
        total_loss = 0
        for images, labels in train_loader:
            images, labels = images.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(images)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss/len(train_loader)}")
    return model

# Train the model
model = train_model(model, train_loader, val_loader, criterion, optimizer, epochs=10)

# Step 11: Evaluate on test data
def evaluate_model(model, test_loader):
    model.eval()
    correct = 0
    total = 0
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    with torch.no_grad():
        for images, labels in test_loader:
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    print(f"Test Accuracy: {100 * correct / total:.2f}%")

# Evaluate the model on the test set
evaluate_model(model, test_loader)



Epoch 1/10, Loss: 0.03523568095380532
Epoch 2/10, Loss: 0.00547225185848566
Epoch 3/10, Loss: 0.0028260180337340334
Epoch 4/10, Loss: 0.0004046818112328765
Epoch 5/10, Loss: 7.199259491590922e-05
Epoch 6/10, Loss: 2.1666055434772196e-05
Epoch 7/10, Loss: 1.2919169541549178e-05
Epoch 8/10, Loss: 1.0425077492809146e-05
Epoch 9/10, Loss: 9.57701368493179e-06
Epoch 10/10, Loss: 5.477775315891146e-06
Test Accuracy: 100.00%


<!-- 
Todo:
1)Add validation step (in/after training)
2)Visulaiations on the performance 
3)Parameters of resnet18

Todo: 2 datasets
1)two datasets version

Todo: Metrics
1)Performamce Metric
2)Resource Efficiency Metrics
-->