In [7]:
import pandas as pd

train_df = pd.read_parquet(f)

print("Training dataset loaded")
print("Shape:", train_df.shape)
train_df.head()


Training dataset loaded
Shape: (175341, 36)


Unnamed: 0,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,sload,...,trans_depth,response_body_len,ct_src_dport_ltm,ct_dst_sport_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,is_sm_ips_ports,attack_cat,label
0,0.121478,tcp,-,FIN,6,4,258,172,74.087486,14158.942383,...,0,0,1,1,0,0,0,0,Normal,0
1,0.649902,tcp,-,FIN,14,38,734,42014,78.473373,8395.112305,...,0,0,1,1,0,0,0,0,Normal,0
2,1.623129,tcp,-,FIN,8,16,364,13186,14.170161,1572.271851,...,0,0,1,1,0,0,0,0,Normal,0
3,1.681642,tcp,ftp,FIN,12,12,628,770,13.677108,2740.178955,...,0,0,1,1,1,1,0,0,Normal,0
4,0.449454,tcp,-,FIN,10,6,534,268,33.373825,8561.499023,...,0,0,2,1,0,0,0,0,Normal,0


In [9]:
test_df = pd.read_parquet("dataset/UNSW_NB15_testing-set.parquet")

print("Testing dataset loaded")
print("Shape:", test_df.shape)
test_df.head()


Testing dataset loaded
Shape: (82332, 36)


Unnamed: 0,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,sload,...,trans_depth,response_body_len,ct_src_dport_ltm,ct_dst_sport_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,is_sm_ips_ports,attack_cat,label
0,1.1e-05,udp,-,INT,2,0,496,0,90909.09375,180363632.0,...,0,0,1,1,0,0,0,0,Normal,0
1,8e-06,udp,-,INT,2,0,1762,0,125000.0,881000000.0,...,0,0,1,1,0,0,0,0,Normal,0
2,5e-06,udp,-,INT,2,0,1068,0,200000.0,854400000.0,...,0,0,1,1,0,0,0,0,Normal,0
3,6e-06,udp,-,INT,2,0,900,0,166666.65625,600000000.0,...,0,0,2,1,0,0,0,0,Normal,0
4,1e-05,udp,-,INT,2,0,2126,0,100000.0,850400000.0,...,0,0,2,1,0,0,0,0,Normal,0


In [10]:
print("Training columns:")
print(train_df.columns)

print("\nTesting columns:")
print(test_df.columns)


Training columns:
Index(['dur', 'proto', 'service', 'state', 'spkts', 'dpkts', 'sbytes',
       'dbytes', 'rate', 'sload', 'dload', 'sloss', 'dloss', 'sinpkt',
       'dinpkt', 'sjit', 'djit', 'swin', 'stcpb', 'dtcpb', 'dwin', 'tcprtt',
       'synack', 'ackdat', 'smean', 'dmean', 'trans_depth',
       'response_body_len', 'ct_src_dport_ltm', 'ct_dst_sport_ltm',
       'is_ftp_login', 'ct_ftp_cmd', 'ct_flw_http_mthd', 'is_sm_ips_ports',
       'attack_cat', 'label'],
      dtype='object')

Testing columns:
Index(['dur', 'proto', 'service', 'state', 'spkts', 'dpkts', 'sbytes',
       'dbytes', 'rate', 'sload', 'dload', 'sloss', 'dloss', 'sinpkt',
       'dinpkt', 'sjit', 'djit', 'swin', 'stcpb', 'dtcpb', 'dwin', 'tcprtt',
       'synack', 'ackdat', 'smean', 'dmean', 'trans_depth',
       'response_body_len', 'ct_src_dport_ltm', 'ct_dst_sport_ltm',
       'is_ftp_login', 'ct_ftp_cmd', 'ct_flw_http_mthd', 'is_sm_ips_ports',
       'attack_cat', 'label'],
      dtype='object')


In [11]:
df = pd.concat([train_df, test_df], ignore_index=True)
print("Merged shape:", df.shape)


Merged shape: (257673, 36)


In [12]:
X = df.drop(columns=["attack_cat"])
y = df["attack_cat"]


In [13]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

print("Classes:", label_encoder.classes_)


Classes: ['Analysis' 'Backdoor' 'DoS' 'Exploits' 'Fuzzers' 'Generic' 'Normal'
 'Reconnaissance' 'Shellcode' 'Worms']


In [14]:
import numpy as np
X = X.replace('-', np.nan)


  X = X.replace('-', np.nan)


In [23]:
# FORCE categorical columns (UNSW-NB15 specific)
force_cat_cols = ["proto", "service", "state"]

for col in force_cat_cols:
    if col in X.columns:
        X[col] = X[col].astype(str)

In [25]:
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

cat_cols = X.select_dtypes(include="object").columns
num_cols = X.select_dtypes(exclude="object").columns

num_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler())
])

cat_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore"))
])

preprocessor = ColumnTransformer([
    ("num", num_pipeline, num_cols),
    ("cat", cat_pipeline, cat_cols)
])

model = Pipeline([
    ("preprocess", preprocessor),
    ("classifier", RandomForestClassifier(
        n_estimators=50,
        random_state=42,
        n_jobs=-1
    ))
])


In [26]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded,
    test_size=0.2,
    random_state=42,
    stratify=y_encoded
)

model.fit(X_train, y_train)

y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))


Accuracy: 0.8706704181624139


In [27]:
import joblib

joblib.dump(model, "unsw_rf_model.pkl")
joblib.dump(label_encoder, "label_encoder.pkl")
print("Model saved")


Model saved


In [31]:
from sklearn.ensemble import IsolationForest

# ðŸ”¹ Use SAME preprocessing as ML model
X_processed = model.named_steps["preprocess"].transform(X)

# Train Isolation Forest on processed numeric data
threat_model = IsolationForest(
    n_estimators=100,
    contamination=0.05,
    random_state=42
)

threat_model.fit(X_processed)

print("Threat (anomaly) model trained successfully")


Threat (anomaly) model trained successfully


In [32]:
from sklearn.ensemble import IsolationForest

# ðŸ”¹ Use SAME preprocessing as ML model
X_processed = model.named_steps["preprocess"].transform(X)

# Train Isolation Forest on processed numeric data
threat_model = IsolationForest(
    n_estimators=100,
    contamination=0.05,
    random_state=42
)

threat_model.fit(X_processed)

print("Threat (anomaly) model trained successfully")


Threat (anomaly) model trained successfully


In [33]:
# Auto IP block (simulation) 
import random

def generate_ip():
    return f"192.168.1.{random.randint(1,254)}"

df["src_ip"] = [generate_ip() for _ in range(len(df))]


In [None]:
# Predict anomaly status using threat_model
anomaly_pred = threat_model.predict(X_processed)
df["threat_status"] = ["Threat" if x == -1 else "Normal" for x in anomaly_pred]

blocked_ips = set()   # already blocked IPs

def autonomous_response(row):
    if row["threat_status"] == "Threat":
        ip = row["src_ip"]
        blocked_ips.add(ip)
        return "BLOCKED"
    else:
        return "ALLOWED"

df["action"] = df.apply(autonomous_response, axis=1)


KeyError: 'threat_status'