In [3]:
import pandas as pd
import numpy as np
import joblib
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, roc_auc_score
from imblearn.combine import SMOTETomek
import xgboost as xgb

In [5]:
train_data = pd.read_csv('UNSW_NB15_training-set.csv')
test_data = pd.read_csv('UNSW_NB15_testing-set.csv')

In [3]:
print("Training data shape:", train_data.shape)
print("Testing data shape:", test_data.shape)

Training data shape: (175341, 45)
Testing data shape: (82332, 45)


In [4]:
print(train_data.size)
print(test_data.size)

7890345
3704940


In [5]:
print("\nColumns in dataset:")
print(train_data.columns.tolist())

# Check data types
print("\nData types:")
print(train_data.dtypes)

# Check label distribution
print("\nTraining label distribution:")
print(train_data['label'].value_counts())
print(train_data['label'].value_counts(normalize=True))

print("\nTest label distribution:")
print(test_data['label'].value_counts())
print(test_data['label'].value_counts(normalize=True))

# Check attack categories if available
if 'attack_cat' in train_data.columns:
    print("\nAttack categories in training data:")
    print(train_data['attack_cat'].value_counts())
    
    print("\nAttack categories in test data:")
    print(test_data['attack_cat'].value_counts())

# Check for missing values
print("\nMissing values in training data:")
print(train_data.isnull().sum().sum())

# Get basic statistics for numerical features
print("\nSummary statistics for numerical features (sample):")
print(train_data.describe().iloc[:, :5])  # First 5 columns only for brevity



Columns in dataset:
['id', 'dur', 'proto', 'service', 'state', 'spkts', 'dpkts', 'sbytes', 'dbytes', 'rate', 'sttl', 'dttl', 'sload', 'dload', 'sloss', 'dloss', 'sinpkt', 'dinpkt', 'sjit', 'djit', 'swin', 'stcpb', 'dtcpb', 'dwin', 'tcprtt', 'synack', 'ackdat', 'smean', 'dmean', 'trans_depth', 'response_body_len', 'ct_srv_src', 'ct_state_ttl', 'ct_dst_ltm', 'ct_src_dport_ltm', 'ct_dst_sport_ltm', 'ct_dst_src_ltm', 'is_ftp_login', 'ct_ftp_cmd', 'ct_flw_http_mthd', 'ct_src_ltm', 'ct_srv_dst', 'is_sm_ips_ports', 'attack_cat', 'label']

Data types:
id                     int64
dur                  float64
proto                 object
service               object
state                 object
spkts                  int64
dpkts                  int64
sbytes                 int64
dbytes                 int64
rate                 float64
sttl                   int64
dttl                   int64
sload                float64
dload                float64
sloss                  int64
dloss         

In [6]:
X_train = train_data.drop(['id', 'attack_cat', 'label'], axis=1)
y_train = train_data['label']  # For binary classification


X_test = test_data.drop(['id', 'attack_cat', 'label'], axis=1)
y_test = test_data['label']


In [7]:
categorical_cols = ['proto', 'service', 'state']
X_train_encoded = pd.get_dummies(X_train, columns=categorical_cols, drop_first=False)
X_test_encoded = pd.get_dummies(X_test, columns=categorical_cols, drop_first=False)

# Ensure train and test have the same columns
train_cols = set(X_train_encoded.columns)
test_cols = set(X_test_encoded.columns)

# Add missing columns to test set
for col in train_cols - test_cols:
    X_test_encoded[col] = 0

# Add missing columns to train set
for col in test_cols - train_cols:
    X_train_encoded[col] = 0

X_test_encoded = X_test_encoded[X_train_encoded.columns]

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_encoded)
X_test_scaled = scaler.transform(X_test_encoded)


In [8]:
smote_tomek = SMOTETomek(random_state=42)
X_train_resampled, y_train_resampled = smote_tomek.fit_resample(X_train_scaled, y_train)

# Define best parameters for XGBoost (based on your original hyperparameter tuning)
best_params = {
    'n_estimators': 200,
    'learning_rate': 0.2,
    'max_depth': 5,
    'gamma': 0.2,
    'min_child_weight': 1,
    'subsample': 0.8,
    'colsample_bytree': 1.0,
    'scale_pos_weight': (y_train == 0).sum() / (y_train == 1).sum(),
    'random_state': 42
}

# Train the SMOTE-balanced XGBoost model
print("Training SMOTE-balanced XGBoost model...")
smote_model = xgb.XGBClassifier(**best_params)
smote_model.fit(X_train_resampled, y_train_resampled)

# Make predictions and evaluate the model
print("Evaluating model on test set...")
y_pred = smote_model.predict(X_test_scaled)
y_prob = smote_model.predict_proba(X_test_scaled)[:, 1]

accuracy = accuracy_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_prob)

print(f"Model Performance:")
print(f"Accuracy: {accuracy:.4f}")
print(f"ROC AUC: {auc:.4f}")

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Save the entire pipeline for later use
print("Saving model and preprocessing components...")

# Save the scaler
joblib.dump(scaler, 'network_intrusion_scaler.pkl')
print("Scaler saved as 'network_intrusion_scaler.pkl'")

# Save the model
joblib.dump(smote_model, 'network_intrusion_smote_model.pkl')
print("SMOTE model saved as 'network_intrusion_smote_model.pkl'")

# Optional: Save column names for future reference 
with open('model_features.txt', 'w') as f:
    for feature in X_train_encoded.columns:
        f.write(f"{feature}\n")
print("Feature names saved as 'model_features.txt'")

print("Done!")


Training SMOTE-balanced XGBoost model...
Evaluating model on test set...
Model Performance:
Accuracy: 0.9119
ROC AUC: 0.9846

Confusion Matrix:
[[31501  5499]
 [ 1752 43580]]

Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.85      0.90     37000
           1       0.89      0.96      0.92     45332

    accuracy                           0.91     82332
   macro avg       0.92      0.91      0.91     82332
weighted avg       0.91      0.91      0.91     82332

Saving model and preprocessing components...
Scaler saved as 'network_intrusion_scaler.pkl'
SMOTE model saved as 'network_intrusion_smote_model.pkl'
Feature names saved as 'model_features.txt'
Done!
