In [9]:
# Setup: imports and paths (DoS-focused version)
import os
import glob
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import torch

# Display defaults: seaborn's grid-backed theme, and up to 120 columns when a frame is rendered.
sns.set_theme(style='whitegrid')
pd.set_option('display.max_columns', 120)

# Input CSVs are looked up in ./datasets under the kernel's current working directory;
# fail immediately if that directory is absent.
DATA_DIR = os.path.join(os.getcwd(), 'datasets')
assert os.path.isdir(DATA_DIR), f"Missing datasets dir: {DATA_DIR}"

# Record the PyTorch version and the resolved data location in the cell output.
print('PyTorch:', torch.__version__)
print('Data dir:', DATA_DIR)


PyTorch: 2.8.0
Data dir: /Users/nitishmalluru/HW/CSCE_482/datasets


In [10]:
# Config: the knobs a reader is expected to tune, followed by RNG seeding.
LABEL_COL = 'Label'        # target column name (looked up after column names are stripped)
NROWS_PER_CSV = 200_000    # rows read from the top of each CSV; sample more for V2, adjust as needed
BATCH_SIZE = 512           # mini-batch size used by every DataLoader
EPOCHS = 5                 # number of passes over the training loader
LOG_INTERVAL = 50          # print the current batch loss every LOG_INTERVAL training batches
MAX_BATCHES = None         # optional cap on training batches per epoch; None/0 disables the cap
SEED = 42                  # seed shared by Python, NumPy, PyTorch and the sklearn splits

import random

# Seed every RNG used by the notebook so a fresh Run All repeats the same results.
random.seed(SEED)
np.random.seed(SEED)
# manual_seed returns the Generator; the trailing ';' keeps its repr out of the cell output.
torch.manual_seed(SEED);


<torch._C.Generator at 0x11ec9a830>

In [11]:
# Load CSVs and combine
# Files are read in sorted order so the row order of `data` is stable between runs.
csv_paths = sorted(glob.glob(os.path.join(DATA_DIR, '*.csv')))
assert csv_paths, 'No CSVs found in datasets/'
print('Found CSVs:', len(csv_paths))

frames = []
skipped = []  # basenames of files that could not be read, summarised after the loop
for csv_path in csv_paths:
    fname = os.path.basename(csv_path)
    try:
        # Only the first NROWS_PER_CSV rows of each file are read.
        frame = pd.read_csv(csv_path, nrows=NROWS_PER_CSV)
    except Exception as e:
        # Best effort: an unreadable file is reported and skipped rather than aborting the load.
        print('Skip', fname, e)
        skipped.append(fname)
        continue
    # Remember which file each row came from.
    frame['__source__'] = fname
    frames.append(frame)
    print('Loaded', fname, frame.shape)

if skipped:
    print(f'Skipped {len(skipped)} of {len(csv_paths)} CSVs:', skipped)
# Without this check pd.concat would fail with an unhelpful "No objects to concatenate".
assert frames, 'None of the CSVs in datasets/ could be loaded'

data = pd.concat(frames, ignore_index=True)
print('Combined shape:', data.shape)

data.head()


Found CSVs: 8
Loaded Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv (200000, 80)
Loaded Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv (200000, 80)
Loaded Friday-WorkingHours-Morning.pcap_ISCX.csv (191033, 80)
Loaded Monday-WorkingHours.pcap_ISCX.csv (200000, 80)
Loaded Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv (200000, 80)
Loaded Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv (170366, 80)
Loaded Tuesday-WorkingHours.pcap_ISCX.csv (200000, 80)
Loaded Wednesday-workingHours.pcap_ISCX.csv (200000, 80)
Combined shape: (1561399, 80)


Unnamed: 0,Destination Port,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,Bwd Packet Length Max,Bwd Packet Length Min,Bwd Packet Length Mean,Bwd Packet Length Std,Flow Bytes/s,Flow Packets/s,Flow IAT Mean,Flow IAT Std,Flow IAT Max,Flow IAT Min,Fwd IAT Total,Fwd IAT Mean,Fwd IAT Std,Fwd IAT Max,Fwd IAT Min,Bwd IAT Total,Bwd IAT Mean,Bwd IAT Std,Bwd IAT Max,Bwd IAT Min,Fwd PSH Flags,Bwd PSH Flags,Fwd URG Flags,Bwd URG Flags,Fwd Header Length,Bwd Header Length,Fwd Packets/s,Bwd Packets/s,Min Packet Length,Max Packet Length,Packet Length Mean,Packet Length Std,Packet Length Variance,FIN Flag Count,SYN Flag Count,RST Flag Count,PSH Flag Count,ACK Flag Count,URG Flag Count,CWE Flag Count,ECE Flag Count,Down/Up Ratio,Average Packet Size,Avg Fwd Segment Size,Avg Bwd Segment Size,Fwd Header Length.1,Fwd Avg Bytes/Bulk,Fwd Avg Packets/Bulk,Fwd Avg Bulk Rate,Bwd Avg Bytes/Bulk,Bwd Avg Packets/Bulk,Bwd Avg Bulk Rate,Subflow Fwd Packets,Subflow Fwd Bytes,Subflow Bwd Packets,Subflow Bwd Bytes,Init_Win_bytes_forward,Init_Win_bytes_backward,act_data_pkt_fwd,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label,__source__
0,54865,3,2,0,12,0,6,6,6.0,0.0,0,0,0.0,0.0,4000000.0,666666.6667,3.0,0.0,3,3,3,3.0,0.0,3,3,0,0.0,0.0,0,0,0,0,0,0,40,0,666666.6667,0.0,6,6,6.0,0.0,0.0,0,0,0,0,1,0,0,0,0,9.0,6.0,0.0,40,0,0,0,0,0,0,2,12,0,0,33,-1,1,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN,Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv
1,55054,109,1,1,6,6,6,6,6.0,0.0,6,6,6.0,0.0,110091.7,18348.62385,109.0,0.0,109,109,0,0.0,0.0,0,0,0,0.0,0.0,0,0,0,0,0,0,20,20,9174.311927,9174.311927,6,6,6.0,0.0,0.0,0,0,0,0,1,1,0,0,1,9.0,6.0,6.0,20,0,0,0,0,0,0,1,6,1,6,29,256,0,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN,Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv
2,55055,52,1,1,6,6,6,6,6.0,0.0,6,6,6.0,0.0,230769.2,38461.53846,52.0,0.0,52,52,0,0.0,0.0,0,0,0,0.0,0.0,0,0,0,0,0,0,20,20,19230.76923,19230.76923,6,6,6.0,0.0,0.0,0,0,0,0,1,1,0,0,1,9.0,6.0,6.0,20,0,0,0,0,0,0,1,6,1,6,29,256,0,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN,Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv
3,46236,34,1,1,6,6,6,6,6.0,0.0,6,6,6.0,0.0,352941.2,58823.52941,34.0,0.0,34,34,0,0.0,0.0,0,0,0,0.0,0.0,0,0,0,0,0,0,20,20,29411.76471,29411.76471,6,6,6.0,0.0,0.0,0,0,0,0,1,1,0,0,1,9.0,6.0,6.0,20,0,0,0,0,0,0,1,6,1,6,31,329,0,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN,Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv
4,54863,3,2,0,12,0,6,6,6.0,0.0,0,0,0.0,0.0,4000000.0,666666.6667,3.0,0.0,3,3,3,3.0,0.0,3,3,0,0.0,0.0,0,0,0,0,0,0,40,0,666666.6667,0.0,6,6,6.0,0.0,0.0,0,0,0,0,1,0,0,0,0,9.0,6.0,0.0,40,0,0,0,0,0,0,2,12,0,0,32,-1,1,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN,Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv


In [12]:
# Normalize columns and preview labels
# Column names are coerced to str and stripped of surrounding whitespace so later
# lookups by name (label column, feature list) match regardless of padding in the CSV header.
orig_cols = list(data.columns)
clean_cols = [str(name).strip() for name in orig_cols]
data.columns = clean_cols
if clean_cols != orig_cols:
    print('Normalized column names (stripped whitespace).')

assert LABEL_COL in data.columns, f"Label column '{LABEL_COL}' missing in data."
# Show at most the first 20 distinct label values as a quick sanity check.
label_preview = data[LABEL_COL].astype(str).unique()[:20]
print('Label unique values (sample):', label_preview)


Normalized column names (stripped whitespace).
Label unique values (sample): ['BENIGN' 'DDoS' 'PortScan' 'Bot' 'Infiltration'
 'Web Attack � Brute Force' 'Web Attack � XSS'
 'Web Attack � Sql Injection' 'FTP-Patator' 'SSH-Patator' 'DoS slowloris'
 'DoS Slowhttptest' 'DoS Hulk']


In [13]:
# DoS-focused feature list
# Canonical column names selected as model inputs. Each name is looked up in
# `data.columns`; the order here is the column order of the resulting feature frame.
FEATURES = [
    'Flow Packets/s',
    'Flow Duration',
    'Flow IAT Mean',
    'Idle Mean',
    'Fwd Packets/s',
    'Flow IAT Max',
    'Total Fwd Packets',
    'Packet Length Variance',
    'Init_Win_bytes_forward',
    'SYN Flag Count',
]

# Some datasets use slightly different naming; create a mapping fallback
# Maps a canonical feature name to alternative column names, tried in the listed order
# when the canonical name is absent. Features without an entry here have no fallback.
ALIASES = {
    'Fwd Packets/s': ['Fwd Pkts/s', 'Forward Packets/s'],
    'Flow Packets/s': ['Flow Pkts/s'],
    'Packet Length Variance': ['Pkt Len Var', 'Packet Len Var'],
    'Init_Win_bytes_forward': ['Init_Win_bytes_fwd', 'Init_Win_bytes Fwd', 'Init_Win_bytes forward'],
    'SYN Flag Count': ['SYN Flag Cnt'],
}

# Resolve every requested feature to a column that actually exists in `data`.
resolved_features = []
missing = []
for feature in FEATURES:
    # The canonical name wins whenever it is present.
    if feature in data.columns:
        resolved_features.append(feature)
        continue
    # Otherwise take the first alias (in declaration order) that exists in the data.
    alias = next((alt for alt in ALIASES.get(feature, []) if alt in data.columns), None)
    if alias is None:
        missing.append(feature)
        continue
    print(f"Alias used: '{feature}' -> '{alias}'")
    resolved_features.append(alias)

print('Resolved features:', resolved_features)
if missing:
    # Unresolved features are only reported; they are simply absent from X_df.
    print('Missing features (not found in data):', missing)

# Independent copy so later cleaning does not touch `data`.
X_df = data.loc[:, resolved_features].copy()


Resolved features: ['Flow Packets/s', 'Flow Duration', 'Flow IAT Mean', 'Idle Mean', 'Fwd Packets/s', 'Flow IAT Max', 'Total Fwd Packets', 'Packet Length Variance', 'Init_Win_bytes_forward', 'SYN Flag Count']


In [14]:
# Clean NaNs/Infs and scale
# +/-inf is mapped to NaN first so it is counted in the report and zero-filled together with real NaNs.
X_df = X_df.replace([np.inf, -np.inf], np.nan)
missing_mask = X_df.isna()
na_ratio = missing_mask.mean().sort_values(ascending=False)
print('NaN ratio (top):')
print(na_ratio.head(10))

X_df = X_df.fillna(0.0).astype(np.float32)

# Standard scaling (z-score); a zero std is replaced by 1 so constant columns do not divide by zero.
# NOTE(review): these statistics are computed over all rows, i.e. before the train/val/test
# split performed in a later cell, so validation and test rows influence the scaler.
X_mean, X_std = X_df.mean(), X_df.std().replace(0, 1)
X = X_df.sub(X_mean).div(X_std).astype(np.float32)

# Labels: integer codes follow the category order stored in class_names.
y_cat = data[LABEL_COL].astype('category')
class_names = y_cat.cat.categories.tolist()
num_classes = len(class_names)
y = y_cat.cat.codes.to_numpy(dtype=np.int64)
print('X shape:', X.shape, '| classes:', num_classes)
print('Classes:', class_names[:20])


NaN ratio (top):
Flow Packets/s            0.000956
Flow Duration             0.000000
Flow IAT Mean             0.000000
Idle Mean                 0.000000
Fwd Packets/s             0.000000
Flow IAT Max              0.000000
Total Fwd Packets         0.000000
Packet Length Variance    0.000000
Init_Win_bytes_forward    0.000000
SYN Flag Count            0.000000
dtype: float64
X shape: (1561399, 10) | classes: 13
Classes: ['BENIGN', 'Bot', 'DDoS', 'DoS Hulk', 'DoS Slowhttptest', 'DoS slowloris', 'FTP-Patator', 'Infiltration', 'PortScan', 'SSH-Patator', 'Web Attack � Brute Force', 'Web Attack � Sql Injection', 'Web Attack � XSS']


In [16]:
# Stratified 60/20/20 split using sklearn
from sklearn.model_selection import train_test_split

TEST_FRACTION = 0.20  # share of ALL rows held out as the final test set
VAL_FRACTION = 0.20   # share of ALL rows used for validation during training

# First split: hold out the test set (20% of all rows); the remaining 80% is "temp".
X_temp, X_test, y_temp, y_test = train_test_split(
    X.values, y, test_size=TEST_FRACTION, random_state=SEED, stratify=y
)
# Second split: carve validation out of temp. VAL_FRACTION is relative to the whole
# dataset, so it is rescaled to temp's size (0.20 / 0.80 = 0.25), leaving 60% for training.
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size=VAL_FRACTION / (1.0 - TEST_FRACTION),
    random_state=SEED, stratify=y_temp
)

# Every row must end up in exactly one of the three partitions.
assert len(X_train) + len(X_val) + len(X_test) == len(X)
print('Shapes:', X_train.shape, X_val.shape, X_test.shape)


ModuleNotFoundError: No module named 'sklearn'

In [None]:
# Torch datasets and loaders
import torch
from torch.utils.data import TensorDataset, DataLoader
import torch.nn as nn
import torch.optim as optim

# Wrap the NumPy splits as tensors: features are cast to float32, labels keep their NumPy dtype.
X_train_t, X_val_t, X_test_t = (
    torch.from_numpy(features).float() for features in (X_train, X_val, X_test)
)
y_train_t, y_val_t, y_test_t = (
    torch.from_numpy(labels) for labels in (y_train, y_val, y_test)
)

# One (features, label) dataset per split.
train_ds = TensorDataset(X_train_t, y_train_t)
val_ds = TensorDataset(X_val_t, y_val_t)
test_ds = TensorDataset(X_test_t, y_test_t)

# Only the training loader shuffles; validation and test iterate in dataset order.
train_dl = DataLoader(train_ds, shuffle=True, batch_size=BATCH_SIZE)
val_dl, test_dl = (DataLoader(ds, batch_size=BATCH_SIZE) for ds in (val_ds, test_ds))

class MLP(nn.Module):
    """Fully connected classifier: two hidden blocks (Linear -> ReLU -> BatchNorm1d) and a linear head.

    Args:
        in_dim: number of input features per sample.
        out_dim: number of output logits (one per class).
        hidden: width of both hidden layers. When omitted, the width is
            ``in_dim * 2`` clamped to the range [32, 512].
    """
    def __init__(self, in_dim: int, out_dim: int, hidden: "int | None" = None):
        super().__init__()
        if hidden is None:
            # Default width scales with the input size but stays within [32, 512].
            hidden = max(32, min(512, in_dim * 2))
        self.net = nn.Sequential(
            nn.Linear(in_dim, hidden),
            nn.ReLU(),
            nn.BatchNorm1d(hidden),
            nn.Linear(hidden, hidden),
            nn.ReLU(),
            nn.BatchNorm1d(hidden),
            nn.Linear(hidden, out_dim),
        )
    def forward(self, x):
        """Return the raw (un-normalized) class logits for a batch ``x`` of shape (batch, in_dim)."""
        return self.net(x)

# Input width comes from the training matrix; output width is the number of label classes.
in_dim = X_train.shape[1]
model = MLP(in_dim, num_classes)

# Device preference: CUDA first, then the MPS backend when this torch build exposes it, else CPU.
if torch.cuda.is_available():
    device_name = 'cuda'
elif hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
    device_name = 'mps'
else:
    device_name = 'cpu'
device = torch.device(device_name)
print('Using device:', device)
model.to(device)

# Cross-entropy over the class logits, optimised with Adam; both are created after the
# model has been moved to its device.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)


In [None]:
# Training
# tqdm is optional: only a missing package triggers the fallback, so any other
# import-time failure still surfaces instead of being silently swallowed.
try:
    from tqdm import tqdm
except ImportError:
    def tqdm(x, **kwargs):
        """No-op stand-in for tqdm: return the iterable unchanged and ignore all options."""
        return x


def accuracy(logits, targets):
    """Return the fraction of rows whose highest-scoring class (argmax over dim 1) equals the target.

    The result is a plain Python float extracted with ``.item()``.
    """
    hits = logits.argmax(dim=1) == targets
    return hits.float().mean().item()

# Per-epoch metrics; one value is appended to each list at the end of every epoch.
history = {'train_loss': [], 'val_loss': [], 'train_acc': [], 'val_acc': []}

for epoch in range(1, EPOCHS + 1):
    # ---- training pass: weights are updated after every batch ----
    model.train()
    train_loss, train_acc, num_train = 0.0, 0.0, 0
    for b_idx, (xb, yb) in enumerate(tqdm(train_dl, leave=False)):
        # MAX_BATCHES, when set, caps how many batches are trained per epoch.
        if MAX_BATCHES and b_idx >= MAX_BATCHES:
            break
        xb, yb = xb.to(device), yb.to(device)
        batch_n = xb.size(0)

        optimizer.zero_grad()
        out = model(xb)
        loss = criterion(out, yb.long())
        loss.backward()
        optimizer.step()

        # Batch means are weighted by batch size so the epoch figure is a per-sample average.
        train_loss += loss.item() * batch_n
        train_acc += accuracy(out, yb) * batch_n
        num_train += batch_n
        if (b_idx + 1) % LOG_INTERVAL == 0:
            print(f'Epoch {epoch} | Batch {b_idx+1} | loss={loss.item():.4f}')

    # max(1, ...) guards the division when no batch was processed.
    train_denom = max(1, num_train)
    train_loss, train_acc = train_loss / train_denom, train_acc / train_denom

    # ---- validation pass: eval mode, no gradient tracking ----
    model.eval()
    val_loss, val_acc, num_val = 0.0, 0.0, 0
    with torch.no_grad():
        for xb, yb in val_dl:
            xb, yb = xb.to(device), yb.to(device)
            batch_n = xb.size(0)
            out = model(xb)
            loss = criterion(out, yb.long())
            val_loss += loss.item() * batch_n
            val_acc += accuracy(out, yb) * batch_n
            num_val += batch_n
    val_denom = max(1, num_val)
    val_loss, val_acc = val_loss / val_denom, val_acc / val_denom

    # Record this epoch's four metrics.
    epoch_metrics = (
        ('train_loss', train_loss),
        ('val_loss', val_loss),
        ('train_acc', train_acc),
        ('val_acc', val_acc),
    )
    for metric_name, metric_value in epoch_metrics:
        history[metric_name].append(metric_value)

    print(f'Epoch {epoch}: train_loss={train_loss:.4f} val_loss={val_loss:.4f} train_acc={train_acc:.4f} val_acc={val_acc:.4f}')


In [None]:
# Test evaluation
model.eval()
from math import isnan  # NOTE(review): unused in this cell; kept in case another cell relies on it
from sklearn.metrics import confusion_matrix, classification_report
import seaborn as sns

# Single pass over the test set: accumulate the size-weighted loss and keep every
# prediction/target, so the loss, accuracy, report and confusion matrix all come
# from the same forward passes.
test_loss = 0.0
pred_batches = []
true_batches = []
with torch.no_grad():
    for xb, yb in test_dl:
        xb = xb.to(device)
        yb = yb.to(device)
        out = model(xb)
        test_loss += criterion(out, yb.long()).item() * xb.size(0)
        pred_batches.append(out.argmax(dim=1).cpu())
        true_batches.append(yb.cpu())

assert pred_batches, 'test_dl produced no batches'
all_preds = torch.cat(pred_batches).numpy()
all_true = torch.cat(true_batches).numpy()

num_test = len(all_true)
test_loss /= max(1, num_test)
# Accuracy from an integer count of correct predictions over the whole test set.
correct = int((all_preds == all_true).sum())
test_acc = correct / max(1, num_test)
print(f'Test: loss={test_loss:.4f} acc={test_acc:.4f}')

# Confusion matrix
# Every class id is passed explicitly so the matrix and report stay aligned with
# class_names even if some class never occurs among the test targets or predictions
# (otherwise the inferred label set can be shorter than class_names).
label_ids = list(range(num_classes))
cm = confusion_matrix(all_true, all_preds, labels=label_ids)
print('Classification report:')
print(classification_report(
    all_true, all_preds, labels=label_ids, target_names=class_names, zero_division=0
))

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(cm, annot=False, cmap='Blues',
            xticklabels=class_names, yticklabels=class_names, ax=ax)
ax.set_title('Confusion Matrix')
ax.set_xlabel('Predicted')
ax.set_ylabel('True')
plt.show()


In [None]:
# Curves
# Two side-by-side panels (loss, accuracy), each showing the train and validation
# series against the 1-based epoch number.
fig, axes = plt.subplots(1, 2, figsize=(12,4))
x_epochs = list(range(1, len(history['train_loss']) + 1))

# (train key, val key, train legend, val legend, title, y-axis label) per panel
panels = [
    ('train_loss', 'val_loss', 'Train Loss', 'Val Loss', 'Loss (Train / Val)', 'Loss'),
    ('train_acc', 'val_acc', 'Train Acc', 'Val Acc', 'Accuracy (Train / Val)', 'Accuracy'),
]
for ax, (train_key, val_key, train_label, val_label, title, ylabel) in zip(axes, panels):
    ax.plot(x_epochs, history[train_key], label=train_label)
    ax.plot(x_epochs, history[val_key], label=val_label)
    ax.set_title(title)
    ax.set_xlabel('Epoch')
    ax.set_ylabel(ylabel)
    ax.legend(loc='best')

plt.tight_layout()
plt.show()


### Notes
- This `modelV2.ipynb` focuses on DoS-relevant features only:
  - Flow Packets/s, Flow Duration, Flow IAT Mean, Idle Mean, Fwd Packets/s,
    Flow IAT Max, Total Fwd Packets, Packet Length Variance,
    Init_Win_bytes_forward, SYN Flag Count.
- Uses stratified 60/20/20 split (train/val/test) and a compact MLP.
- Adjust `NROWS_PER_CSV`, `EPOCHS`, and model width as needed for runtime/performance.
- If a feature column is not present under its canonical name, the alias map is used to look it up under known alternative names; features that still cannot be found are reported and left out of the model input.


In [None]:
# Save artifacts (optional)
# Everything is written to ./artifacts under the kernel's current working directory.
ARTIFACTS_DIR = os.path.join(os.getcwd(), 'artifacts')
os.makedirs(ARTIFACTS_DIR, exist_ok=True)

# Save model (the state_dict only, not the module object)
model_path = os.path.join(ARTIFACTS_DIR, 'dos_mlp.pt')
torch.save(model.state_dict(), model_path)
print('Saved model to:', model_path)

# Save scaler stats for reproducible preprocessing
# Column names are stored as a fixed-width unicode array instead of an object array,
# so the archive can be read back with np.load's default allow_pickle=False.
scaler_path = os.path.join(ARTIFACTS_DIR, 'scaler_stats.npz')
np.savez(
    scaler_path,
    mean=X_mean.values,
    std=X_std.values,
    columns=np.array(X_df.columns, dtype=str),
)
print('Saved scaler to:', scaler_path)

# Save class mapping (one class name per line; line order matches the integer label codes)
# UTF-8 is requested explicitly: the class names contain non-ASCII characters, which a
# platform-default encoding may be unable to write.
classes_path = os.path.join(ARTIFACTS_DIR, 'classes.txt')
with open(classes_path, 'w', encoding='utf-8') as fh:
    fh.writelines(str(cname) + '\n' for cname in class_names)
print('Saved classes to:', classes_path)
