In [None]:
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, classification_report
import matplotlib.pyplot as plt
import joblib
import pickle

# 1. Locations of the two CSV splits (relative to the working directory)
TRAIN_CSV = 'UNSW_NB15_training-set.csv'
TEST_CSV  = 'UNSW_NB15_testing-set.csv'

# Stop early with a readable message if either file is absent,
# before pandas gets a chance to raise its own error.
for csv_path in (TRAIN_CSV, TEST_CSV):
    assert os.path.exists(csv_path), f"{csv_path} not found"

# 2. Read both splits into DataFrames
train_df, test_df = pd.read_csv(TRAIN_CSV), pd.read_csv(TEST_CSV)

# 3. Remove the row id and the attack category from both splits, in place.
#    (Per the original note, the category is already reflected in 'label'.)
#    errors='ignore' makes this a no-op for a column that is already gone,
#    so the step can be re-run safely.
for df in (train_df, test_df):
    df.drop(columns=['id', 'attack_cat'], inplace=True, errors='ignore')

# 4. Separate the target column from the feature matrix
y_train, y_test = train_df['label'], test_df['label']
X_train = train_df.drop(columns='label')
X_test  = test_df.drop(columns='label')

# 5. Encode any categorical cols
# Every object-dtype feature column is mapped to integer codes. Each encoder is
# fitted on train and test values together so that transforming either split
# never meets a category it has not seen.
#
# The fitted encoders are kept (and persisted) because the scaler, selector and
# models saved further down can only be applied to new raw data if the same
# category -> code mapping is available.
label_encoders = {}
for col in X_train.select_dtypes(include=['object']).columns:
    # Cast to str so a stray non-string value (e.g. NaN) cannot break the
    # encoder's internal sort; values that are already strings are unchanged.
    train_vals = X_train[col].astype(str)
    test_vals  = X_test[col].astype(str)

    le = LabelEncoder()
    le.fit(pd.concat([train_vals, test_vals], axis=0))
    X_train[col] = le.transform(train_vals)
    X_test[col]  = le.transform(test_vals)
    label_encoders[col] = le

# One file holding {column name: fitted LabelEncoder}
joblib.dump(label_encoders, 'label_encoders.joblib')

# 6. Scale
# Standardisation statistics are learned from the training split only; the
# test split is transformed with those same statistics.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled  = scaler.transform(X_test)

# 7. Feature selection (top 30 by mutual info)
# mutual_info_classif is a randomised estimator, so without a fixed
# random_state the selected feature set can differ from run to run.
# functools.partial (rather than a lambda) is used to pin the seed because the
# selector is pickled with joblib later in this cell and a lambda would not
# pickle.
from functools import partial

selector = SelectKBest(partial(mutual_info_classif, random_state=42), k=30)
X_train_sel = selector.fit_transform(X_train_scaled, y_train)
X_test_sel  = selector.transform(X_test_scaled)

# get_support() is a boolean mask over the input columns, in column order
sel_feats = X_train.columns[selector.get_support()]
print("Top-30 selected features:")
print(list(sel_feats))

# 8. Define the models to compare
# Five classifiers are evaluated (SVC is imported at the top of the cell but is
# not part of this dictionary). The dictionary keys double as the artifact
# file stems when the fitted models are saved, so they must stay unchanged.
# Estimators with internal randomness get a fixed random_state so that the
# reported metrics and saved models are reproducible; 42 matches the seed used
# for the RandomForest in the attack-category cell of this notebook.
models = {
    'LogisticRegression': LogisticRegression(max_iter=1000),
    'RandomForest'      : RandomForestClassifier(n_estimators=100, random_state=42),
    'GradientBoosting'  : GradientBoostingClassifier(random_state=42),
    'KNN'               : KNeighborsClassifier(),
    'DecisionTree'      : DecisionTreeClassifier(random_state=42),
}

from sklearn.metrics import accuracy_score

# 9. Train, evaluate, save
# Each model is fitted on the selected training features, scored on the test
# split and persisted. One row of metrics per model is collected for the
# summary table printed at the end.
performance = []
for name, model in models.items():
    print(f"\n>>> Training & evaluating {name}")
    model.fit(X_train_sel, y_train)
    y_pred = model.predict(X_test_sel)

    # Classification report (attack label = 1). With output_dict=True the
    # per-class entries are keyed by the string form of the label, hence '1'.
    rpt = classification_report(y_test, y_pred, output_dict=True)
    atk = rpt.get('1', {})

    # Accuracy (overall)
    acc = accuracy_score(y_test, y_pred) * 100

    # Percentages; a metric missing from the report shows up as NaN
    performance.append({
        'Model'    : name,
        'Precision': atk.get('precision', np.nan) * 100,
        'Recall'   : atk.get('recall',    np.nan) * 100,
        'F1-score' : atk.get('f1-score',  np.nan) * 100,
        'Accuracy' : acc
    })

    # 9c. Save model artifacts (a joblib copy and a plain-pickle copy)
    joblib.dump(model, f'{name}.joblib')
    with open(f'{name}.pkl', 'wb') as f:
        pickle.dump(model, f)

# 10. Save preprocessing artifacts
# NOTE(review): the attack-category cell later in this notebook also writes a
# file called 'scaler.joblib', fitted on a different set of columns; whichever
# cell runs last overwrites the other's scaler.
joblib.dump(scaler, 'scaler.joblib')
joblib.dump(selector, 'feature_selector.joblib')

# 11. Print performance summary
# The table used to be built and printed twice (once rounded, once not);
# it is now printed once, rounded to two decimals.
perf_df = pd.DataFrame(performance).set_index('Model')
print("\n## Model Performance on ‘attack’ class:")
print(perf_df.round(2).to_markdown())


Top-30 selected features:
['dur', 'proto', 'state', 'spkts', 'dpkts', 'sbytes', 'dbytes', 'rate', 'sttl', 'dttl', 'sload', 'dload', 'sloss', 'dloss', 'sinpkt', 'dinpkt', 'sjit', 'djit', 'swin', 'stcpb', 'tcprtt', 'synack', 'ackdat', 'smean', 'dmean', 'ct_srv_src', 'ct_state_ttl', 'ct_src_dport_ltm', 'ct_dst_sport_ltm', 'ct_srv_dst']

>>> Training & evaluating LogisticRegression

>>> Training & evaluating RandomForest

>>> Training & evaluating GradientBoosting

>>> Training & evaluating KNN

>>> Training & evaluating DecisionTree

## Model Performance on ‘attack’ class:
| Model              |   Precision |   Recall |   F1-score |   Accuracy |
|:-------------------|------------:|---------:|-----------:|-----------:|
| LogisticRegression |       95.17 |    86.97 |      90.88 |      88.12 |
| RandomForest       |       97.95 |    88.69 |      93.09 |      91.04 |
| GradientBoosting   |       98.13 |    88.32 |      92.97 |      90.9  |
| KNN                |       97.06 |    84.25 |      

In [None]:
from sklearn.metrics import accuracy_score

# 9. Train, evaluate (same loop as the step-9 block of the first cell, but
#    without writing any model files)
#
# The 'attack' metrics below are read from the report entry for label 1. That
# is only the attack class while y_train / y_test hold the binary 'label'
# column. If another cell has rebound them to a multi-class target, '1' is
# just one of several categories and the table would be silently mislabelled,
# so fail fast with an explanation instead.
observed_labels = set(np.unique(y_train).tolist()) | set(np.unique(y_test).tolist())
if not observed_labels <= {0, 1}:
    raise ValueError(
        "Expected binary 0/1 targets in y_train/y_test but found labels "
        f"{sorted(observed_labels)}; re-run the cell that builds the binary "
        "'label' split before evaluating the 'attack' class."
    )

performance = []
for name, model in models.items():
    print(f"\n>>> Training & evaluating {name}")
    model.fit(X_train_sel, y_train)
    y_pred = model.predict(X_test_sel)

    # Classification report (attack label = 1). With output_dict=True the
    # per-class entries are keyed by the string form of the label, hence '1'.
    rpt = classification_report(y_test, y_pred, output_dict=True)
    atk = rpt.get('1', {})

    # Accuracy (overall)
    acc = accuracy_score(y_test, y_pred) * 100

    # Percentages; a metric missing from the report shows up as NaN
    performance.append({
        'Model'    : name,
        'Precision': atk.get('precision', np.nan) * 100,
        'Recall'   : atk.get('recall',    np.nan) * 100,
        'F1-score' : atk.get('f1-score',  np.nan) * 100,
        'Accuracy' : acc
    })

# Final performance table
perf_df = pd.DataFrame(performance).set_index('Model')
print("\n## Model Performance on ‘attack’ class:")
print(perf_df.round(2).to_markdown())



>>> Training & evaluating LogisticRegression


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



>>> Training & evaluating RandomForest

>>> Training & evaluating GradientBoosting

>>> Training & evaluating KNN

>>> Training & evaluating DecisionTree

## Model Performance on ‘attack’ class:
| Model              |   Precision |   Recall |   F1-score |   Accuracy |
|:-------------------|------------:|---------:|-----------:|-----------:|
| LogisticRegression |        0    |     0    |       0    |      68.3  |
| RandomForest       |      100    |     3.61 |       6.97 |      76.07 |
| GradientBoosting   |       76.19 |     6.41 |      11.83 |      76.31 |
| KNN                |        9.52 |     0.34 |       0.66 |      71.96 |
| DecisionTree       |       57.97 |     6.87 |      12.29 |      75.04 |


In [None]:
import pandas as pd
import numpy as np
import joblib

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

# 1-2. Load both CSV splits and replace every missing value with 0
#      (done as one chained expression per split instead of an in-place fill)
train_df = pd.read_csv("UNSW_NB15_training-set.csv").fillna(0)
test_df  = pd.read_csv("UNSW_NB15_testing-set.csv").fillna(0)

# 3. Target column and the 30 input columns used by this cell.
#    The list order fixes the column order of the feature matrix built from it.
TARGET_COL = 'attack_cat'
TOP_FEATURES = [
    'dur', 'proto', 'state', 'spkts', 'dpkts',
    'sbytes', 'dbytes', 'rate', 'sttl', 'dttl',
    'sload', 'dload', 'sloss', 'dloss', 'sinpkt',
    'dinpkt', 'sjit', 'djit', 'swin', 'stcpb',
    'tcprtt', 'synack', 'ackdat', 'smean', 'dmean',
    'ct_srv_src', 'ct_state_ttl', 'ct_src_dport_ltm', 'ct_dst_sport_ltm', 'ct_srv_dst',
]

# 4. Stack train on top of test (fresh 0..N-1 index) so that each encoder is
#    fitted on the categories of both splits
n_train = len(train_df)
combined = pd.concat([train_df, test_df], axis=0, ignore_index=True)

# 5-6. Fit one LabelEncoder per categorical column on the stacked data (cast
#      to str) and overwrite the column with its integer codes in the same step
le_proto = LabelEncoder()
combined['proto'] = le_proto.fit_transform(combined['proto'].astype(str))

le_state = LabelEncoder()
combined['state'] = le_state.fit_transform(combined['state'].astype(str))

le_attack_cat = LabelEncoder()
combined[TARGET_COL] = le_attack_cat.fit_transform(combined[TARGET_COL].astype(str))

# 7. Undo the stacking: the first n_train rows are the training split,
#    the remainder (re-indexed from 0) is the test split
train = combined.iloc[:n_train]
test  = combined.iloc[n_train:].reset_index(drop=True)

# Plain NumPy arrays: features in TOP_FEATURES order, targets as int codes
X_train, y_train = train[TOP_FEATURES].to_numpy(), train[TARGET_COL].to_numpy().astype(int)
X_test,  y_test  = test[TOP_FEATURES].to_numpy(),  test[TARGET_COL].to_numpy().astype(int)

# 8. Scale features
# The scaler is fitted on the training features only and then applied to both
# splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled  = scaler.transform(X_test)

# 9. Train the RandomForest model (fixed seed for reproducible results)
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train_scaled, y_train)

# 10. Evaluate
y_pred = model.predict(X_test_scaled)

# The encoder assigns codes 0..n-1 in the order of classes_. Passing the full
# code list explicitly keeps target_names aligned with the labels and keeps
# the confusion matrix square even if some category never occurs in y_test or
# y_pred; without it classification_report raises when the counts differ.
class_ids = np.arange(len(le_attack_cat.classes_))

print("\n=== Classification Report ===")
print(classification_report(
    y_test,
    y_pred,
    labels=class_ids,
    target_names=le_attack_cat.classes_,
    zero_division=0,   # a class that is never predicted scores 0 without a warning
))
print("\n=== Confusion Matrix ===")
# Rows are true classes, columns are predicted classes, both in class_ids order
print(confusion_matrix(y_test, y_pred, labels=class_ids))

# 11. Save all artifacts
# Everything is written next to the notebook (current working directory), under
# the same file names as before. The closing message reports the directory
# that was actually written to instead of a hard-coded path.
from pathlib import Path

ARTIFACT_DIR = Path(".")

joblib.dump(model,           ARTIFACT_DIR / "attack_category_model.pkl")
# NOTE(review): 'scaler.joblib' is also the file name used for the scaler saved
# by the binary-label pipeline in the first cell, which is fitted on a
# different set of columns; whichever cell runs last overwrites the other.
joblib.dump(scaler,          ARTIFACT_DIR / "scaler.joblib")
joblib.dump(le_proto,        ARTIFACT_DIR / "le_proto.joblib")
joblib.dump(le_state,        ARTIFACT_DIR / "le_state.joblib")
joblib.dump(le_attack_cat,   ARTIFACT_DIR / "le_attack_cat.joblib")

print(f"\nSaved model and preprocessing objects to {ARTIFACT_DIR.resolve()}")



=== Classification Report ===
                precision    recall  f1-score   support

      Analysis       0.00      0.00      0.00      2000
      Backdoor       0.98      0.03      0.07      1746
           DoS       0.34      0.54      0.42     12264
      Exploits       0.75      0.61      0.67     33393
       Fuzzers       0.64      0.26      0.37     18184
       Generic       0.93      0.98      0.96     40000
        Normal       0.78      0.97      0.86     56000
Reconnaissance       0.93      0.73      0.81     10491
     Shellcode       0.48      0.42      0.45      1133
         Worms       0.68      0.18      0.28       130

      accuracy                           0.76    175341
     macro avg       0.65      0.47      0.49    175341
  weighted avg       0.76      0.76      0.74    175341


=== Confusion Matrix ===
[[    0     0   964   306    41   138   540     1    10     0]
 [    0    60   964   452    57   109    74    13    17     0]
 [    0     0  6643  3517   38

In [None]:
import sklearn
# Show which scikit-learn release is installed in the running kernel.
print(sklearn.__version__)

1.6.1
