In [52]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
import torch
from torch import nn
import joblib


In [60]:
import pandas as pd

paths = [
    "../../datasets/UNSW-NB15_1.csv",
]

raw = pd.read_csv(paths[0], header=None)
print("Raw shape:", raw.shape)
raw.head()


  raw = pd.read_csv(paths[0], header=None)


Raw shape: (700001, 49)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,39,40,41,42,43,44,45,46,47,48
0,59.166.0.0,1390,149.171.126.6,53,udp,CON,0.001055,132,164,31,...,0,3,7,1,3,1,1,1,,0
1,59.166.0.0,33661,149.171.126.9,1024,udp,CON,0.036133,528,304,31,...,0,2,4,2,3,1,1,2,,0
2,59.166.0.6,1464,149.171.126.7,53,udp,CON,0.001119,146,178,31,...,0,12,8,1,2,2,1,1,,0
3,59.166.0.5,3593,149.171.126.5,53,udp,CON,0.001209,132,164,31,...,0,6,9,1,1,1,1,1,,0
4,59.166.0.3,49664,149.171.126.0,53,udp,CON,0.001169,146,178,31,...,0,7,9,1,1,1,1,1,,0


In [61]:
feat_path = "../../datasets/NUSW-NB15_features.csv"
feat_df = pd.read_csv(feat_path, encoding="latin1")

print("Feature file shape:", feat_df.shape)
print(feat_df.head())
print(feat_df.tail())


Feature file shape: (49, 4)
   No.    Name    Type               Description
0    1   srcip  nominal        Source IP address
1    2   sport  integer       Source port number
2    3   dstip  nominal   Destination IP address
3    4  dsport  integer  Destination port number
4    5   proto  nominal     Transaction protocol
    No.              Name    Type   \
44   45  ct_src_dport_ltm  integer   
45   46  ct_dst_sport_ltm  integer   
46   47    ct_dst_src_ltm  integer   
47   48        attack_cat  nominal   
48   49             Label   binary   

                                          Description  
44  No of connections of the same source address (...  
45  No of connections of the same destination addr...  
46  No of connections of the same source (1) and t...  
47  The name of each attack category. In this data...  
48              0 for normal and 1 for attack records  


In [53]:
import pandas as pd

# 1) Load feature names
feat_path = "../../datasets/NUSW-NB15_features.csv"
feat_df = pd.read_csv(feat_path, encoding="latin1")

# Column with the feature names is usually "Name"
col_names = feat_df["Name"].tolist()
print("Number of feature names:", len(col_names))
print("Last 5 feature names:", col_names[-5:])

# 2) Load the 4 data parts (they have NO header)
paths = [
    "../../datasets/UNSW-NB15_1.csv",
    "../../datasets/UNSW-NB15_2.csv",
    "../../datasets/UNSW-NB15_3.csv",
    "../../datasets/UNSW-NB15_4.csv",
]

dfs = []
for p in paths:
    print("Loading:", p)
    dfs.append(pd.read_csv(p, header=None))

df = pd.concat(dfs, ignore_index=True)
print("Raw data shape:", df.shape)

# 3) If feature count and column count mismatch, trim/pad feature list
if len(col_names) > df.shape[1]:
    col_names = col_names[:df.shape[1]]
elif len(col_names) < df.shape[1]:
    col_names = col_names + [f"extra_{i}" for i in range(df.shape[1] - len(col_names))]

df.columns = col_names
print("Columns (last 10):", df.columns[-10:])

# 4) Try to detect a label-like column and normalize its name to exactly "label"
label_candidates = [c for c in df.columns if "label" in c.lower()]
print("Label-like columns:", label_candidates)

if "label" not in df.columns and label_candidates:
    df = df.rename(columns={label_candidates[0]: "label"})
    print(f"Renamed {label_candidates[0]} -> 'label'")

print("'label' in columns?", "label" in df.columns)


Number of feature names: 49
Last 5 feature names: ['ct_src_dport_ltm', 'ct_dst_sport_ltm', 'ct_dst_src_ltm', 'attack_cat', 'Label']
Loading: ../../datasets/UNSW-NB15_1.csv


  dfs.append(pd.read_csv(p, header=None))


Loading: ../../datasets/UNSW-NB15_2.csv


  dfs.append(pd.read_csv(p, header=None))


Loading: ../../datasets/UNSW-NB15_3.csv
Loading: ../../datasets/UNSW-NB15_4.csv
Raw data shape: (2540047, 49)
Columns (last 10): Index(['ct_ftp_cmd', 'ct_srv_src', 'ct_srv_dst', 'ct_dst_ltm', 'ct_src_ ltm',
       'ct_src_dport_ltm', 'ct_dst_sport_ltm', 'ct_dst_src_ltm', 'attack_cat',
       'Label'],
      dtype='object')
Label-like columns: ['Label']
Renamed Label -> 'label'
'label' in columns? True


In [56]:
keep_cols = [
    "srcip","sport","dstip","dsport","proto",
    "dur","sbytes","dbytes","sttl","dttl",
    "ct_state_ttl","label"
]

print("Missing from df:", [c for c in keep_cols if c not in df.columns])

df_clean = df[keep_cols].copy()
df_clean.head()


Missing from df: []


Unnamed: 0,srcip,sport,dstip,dsport,proto,dur,sbytes,dbytes,sttl,dttl,ct_state_ttl,label
0,59.166.0.0,1390,149.171.126.6,53,udp,0.001055,132,164,31,29,0,0
1,59.166.0.0,33661,149.171.126.9,1024,udp,0.036133,528,304,31,29,0,0
2,59.166.0.6,1464,149.171.126.7,53,udp,0.001119,146,178,31,29,0,0
3,59.166.0.5,3593,149.171.126.5,53,udp,0.001209,132,164,31,29,0,0
4,59.166.0.3,49664,149.171.126.0,53,udp,0.001169,146,178,31,29,0,0


In [58]:
# Start fresh from df
keep_feature_cols = [
    "srcip","sport","dstip","dsport","proto",
    "dur","sbytes","dbytes","sttl","dttl",
    "ct_state_ttl","attack_cat"   # note: include attack_cat for now
]

df_clean = df[keep_feature_cols].copy()
df_clean.head()


Unnamed: 0,srcip,sport,dstip,dsport,proto,dur,sbytes,dbytes,sttl,dttl,ct_state_ttl,attack_cat
0,59.166.0.0,1390,149.171.126.6,53,udp,0.001055,132,164,31,29,0,
1,59.166.0.0,33661,149.171.126.9,1024,udp,0.036133,528,304,31,29,0,
2,59.166.0.6,1464,149.171.126.7,53,udp,0.001119,146,178,31,29,0,
3,59.166.0.5,3593,149.171.126.5,53,udp,0.001209,132,164,31,29,0,
4,59.166.0.3,49664,149.171.126.0,53,udp,0.001169,146,178,31,29,0,


In [59]:
# New label: 1 = attack, 0 = normal
df_clean["label"] = (df_clean["attack_cat"] != "Normal").astype(int)

# We don't need attack_cat anymore in the features
df_clean.drop(columns=["attack_cat"], inplace=True)

print(df_clean["label"].value_counts())


label
1    2540047
Name: count, dtype: int64


In [55]:
# Convert proto to category
df_clean["proto"] = df_clean["proto"].astype("category").cat.codes

# Convert label: 1 = attack, 0 = normal
df_clean["label"] = df_clean["label"].apply(lambda x: 1 if x == "Attack" else 0)


KeyError: 'label'

In [None]:
# 1. Encode proto as category codes
df_clean["proto"] = df_clean["proto"].astype("category").cat.codes

# 2. Turn label into 0/1 if it isn't already
df_clean["label"] = df_clean["label"].apply(lambda x: 1 if x == "Attack" or x == 1 else 0)

# 3. Use ONLY numeric-friendly columns (drop srcip, dstip)
feature_cols = [
    "sport", "dsport", "proto",
    "dur", "sbytes", "dbytes",
    "sttl", "dttl", "ct_state_ttl"
]

# Make sure everything is numeric
for c in feature_cols:
    df_clean[c] = pd.to_numeric(df_clean[c], errors="coerce")

# Optional: replace NaNs from coercion with 0
df_clean[feature_cols] = df_clean[feature_cols].fillna(0)


In [None]:
X = df_clean[feature_cols]
y = df_clean["label"]

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42
)

X_train = torch.tensor(X_train, dtype=torch.float32)
y_train = torch.tensor(y_train.values, dtype=torch.float32).view(-1, 1)
X_test  = torch.tensor(X_test, dtype=torch.float32)
y_test  = torch.tensor(y_test.values, dtype=torch.float32).view(-1, 1)


In [None]:
class IDSModel(nn.Module):
    def __init__(self, in_dim):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(in_dim, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 1),
            nn.Sigmoid()
        )
        
    def forward(self, x):
        return self.net(x)

model = IDSModel(X_train.shape[1])
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)


In [None]:
for epoch in range(30):
    model.train()
    optimizer.zero_grad()
    
    preds = model(X_train)
    loss = criterion(preds, y_train)
    
    loss.backward()
    optimizer.step()
    
    print(f"Epoch {epoch+1}/10 - Loss: {loss.item():.4f}")


Epoch 1/10 - Loss: 0.6885
Epoch 2/10 - Loss: 0.6814
Epoch 3/10 - Loss: 0.6742
Epoch 4/10 - Loss: 0.6668
Epoch 5/10 - Loss: 0.6593
Epoch 6/10 - Loss: 0.6515
Epoch 7/10 - Loss: 0.6436
Epoch 8/10 - Loss: 0.6356
Epoch 9/10 - Loss: 0.6274
Epoch 10/10 - Loss: 0.6190
Epoch 11/10 - Loss: 0.6106
Epoch 12/10 - Loss: 0.6019
Epoch 13/10 - Loss: 0.5930
Epoch 14/10 - Loss: 0.5839
Epoch 15/10 - Loss: 0.5747
Epoch 16/10 - Loss: 0.5652
Epoch 17/10 - Loss: 0.5555
Epoch 18/10 - Loss: 0.5457
Epoch 19/10 - Loss: 0.5356
Epoch 20/10 - Loss: 0.5254
Epoch 21/10 - Loss: 0.5150
Epoch 22/10 - Loss: 0.5044
Epoch 23/10 - Loss: 0.4935
Epoch 24/10 - Loss: 0.4826
Epoch 25/10 - Loss: 0.4715
Epoch 26/10 - Loss: 0.4603
Epoch 27/10 - Loss: 0.4490
Epoch 28/10 - Loss: 0.4376
Epoch 29/10 - Loss: 0.4260
Epoch 30/10 - Loss: 0.4145


In [None]:
model.eval()
with torch.no_grad():
    preds = model(X_test)
    preds = (preds.numpy() > 0.5).astype(int)

print(classification_report(y_test.numpy(), preds))


              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00    508010

    accuracy                           1.00    508010
   macro avg       1.00      1.00      1.00    508010
weighted avg       1.00      1.00      1.00    508010



In [None]:
torch.save(model.state_dict(), "../models/ids_model.pt")
joblib.dump(scaler, "../models/scaler.pkl")

print("Model + scaler saved!")


Model + scaler saved!


In [None]:
print(df_clean["label"].value_counts())


label
0    2540047
Name: count, dtype: int64


In [None]:
df_clean[["label", "proto", "sbytes", "dsport"]].head(20)


Unnamed: 0,label,proto,sbytes,dsport
0,0,120,132,53.0
1,0,120,528,1024.0
2,0,120,146,53.0
3,0,120,132,53.0
4,0,120,146,53.0
5,0,120,568,111.0
6,0,120,132,53.0
7,0,6,46,0.0
8,0,120,146,53.0
9,0,120,132,53.0


In [None]:
print("df shape:", df.shape)
print("Number of feature names:", len(col_names))


df shape: (2540047, 49)
Number of feature names: 49


In [None]:
print(df.columns[-10:])


Index(['ct_ftp_cmd', 'ct_srv_src', 'ct_srv_dst', 'ct_dst_ltm', 'ct_src_ ltm',
       'ct_src_dport_ltm', 'ct_dst_sport_ltm', 'ct_dst_src_ltm', 'attack_cat',
       'label'],
      dtype='object')
