<a href="https://colab.research.google.com/github/allenjose24/BankApp/blob/main/AUTO_XGBOOST(NSL_KDD).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import load_model
import warnings

# Suppress warnings for cleaner output
warnings.filterwarnings('ignore')

print("Starting script...")

# === 1. LOAD DATA & ADD COLUMN HEADERS ===

# The files are read with header=None, so the column names are supplied here:
# the 41 feature names followed by the 2 label columns ('label', 'difficulty').
col_names = [
    'duration', 'protocol_type', 'service', 'flag', 'src_bytes', 'dst_bytes', 'land', 'wrong_fragment', 'urgent', 'hot',
    'num_failed_logins', 'logged_in', 'num_compromised', 'root_shell', 'su_attempted', 'num_root', 'num_file_creations',
    'num_shells', 'num_access_files', 'num_outbound_cmds', 'is_host_login', 'is_guest_login', 'count', 'srv_count',
    'serror_rate', 'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate', 'diff_srv_rate',
    'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count', 'dst_host_same_srv_rate', 'dst_host_diff_srv_rate',
    'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate', 'dst_host_serror_rate', 'dst_host_srv_serror_rate',
    'dst_host_rerror_rate', 'dst_host_srv_rerror_rate', 'label', 'difficulty'
]

# --- Load Training and Test Data ---
# !! Update these paths to where you downloaded the files !!
train_path = '/content/KDDTrain+.txt'
test_path = '/content/KDDTest+.txt'

try:
    df_train = pd.read_csv(train_path, header=None, names=col_names)
    df_test = pd.read_csv(test_path, header=None, names=col_names)
except FileNotFoundError:
    print("Error: Could not find dataset files.")
    print(f"Please make sure '{train_path}' and '{test_path}' exist, or update the paths above.")
    # Stop with a non-zero status. The bare exit() helper is injected by the
    # `site` module for interactive use and reports success (status 0).
    raise SystemExit(1)
else:
    print("Successfully loaded KDDTrain+.txt and KDDTest+.txt")

# === 2. PRE-PROCESSING ===

print("Starting pre-processing...")

# --- 2a. Handle Target Labels ---
# Binary target: rows labelled 'normal' become 0, every other label becomes 1.
df_train['label'] = df_train['label'].ne('normal').astype(int)
df_test['label'] = df_test['label'].ne('normal').astype(int)

# 'difficulty' is not used as a feature or a target, so it is removed.
df_train = df_train.drop(columns='difficulty')
df_test = df_test.drop(columns='difficulty')

# --- 2b. Handle Categorical Features (One-Hot Encoding) ---
categorical_cols = ['protocol_type', 'service', 'flag']

# Encode train and test stacked together so both end up with the same
# dummy columns in the same order.
train_features = df_train.drop(columns='label')
test_features = df_test.drop(columns='label')
combined_df = pd.get_dummies(
    pd.concat([train_features, test_features], axis=0),
    columns=categorical_cols,
    dtype=int,
)

# The first len(df_train) rows are the training rows; the rest are test rows.
n_train_rows = len(df_train)
X_train = combined_df.iloc[:n_train_rows]
X_test = combined_df.iloc[n_train_rows:]

# Targets come straight from the (already binarised) label columns.
y_train = df_train['label']
y_test = df_test['label']

print(f"Data pre-processed. Final feature count: {X_train.shape[1]}")

# --- 2c. Feature Scaling ---
# The scaler is fitted on the training features only and then applied to both
# splits; both become NumPy arrays from here on.
scaler = MinMaxScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# === 3. MODEL 1: XGBOOST (SUPERVISED CLASSIFIER) ===

print("\n--- Training Model 1: XGBoost (Supervised) ---")

# Constructor settings gathered in one place; the seed is fixed at 42.
xgb_settings = {
    'use_label_encoder': False,
    'eval_metric': 'logloss',
    'random_state': 42,
}
model_xgb = xgb.XGBClassifier(**xgb_settings)

# Supervised model: fitted on the full training set (normal + attack rows).
model_xgb.fit(X_train, y_train)

# Hard class predictions for the held-out test set.
y_pred_xgb = model_xgb.predict(X_test)

print("XGBoost training complete.")

# === 4. MODEL 2: AUTOENCODER (UNSUPERVISED ANOMALY DETECTOR) ===

print("\n--- Training Model 2: Autoencoder (Unsupervised) ---")


def _per_sample_mse(original, rebuilt):
    """Return the mean squared difference between two arrays, one value per row."""
    return np.mean(np.power(original - rebuilt, 2), axis=1)


# --- 4a. Prepare Data for Autoencoder ---
# Only training rows whose label is 0 ('normal') are used to fit the autoencoder.
normal_mask = (y_train == 0)
X_train_normal = X_train[normal_mask]
print(f"Autoencoder will be trained on {X_train_normal.shape[0]} 'normal' samples.")

input_dim = X_train.shape[1]
encoding_dim = 32  # Bottleneck width; a tunable hyperparameter

# --- 4b. Define Autoencoder Architecture ---
# input -> 64 -> encoding_dim -> 64 -> input
input_layer = Input(shape=(input_dim,))
encoder_hidden = Dense(64, activation='relu')(input_layer)
encoder = Dense(encoding_dim, activation='relu')(encoder_hidden)  # Encoded representation
decoder_hidden = Dense(64, activation='relu')(encoder)
decoder = Dense(input_dim, activation='sigmoid')(decoder_hidden)  # Output layer

autoencoder = Model(inputs=input_layer, outputs=decoder)
autoencoder.compile(optimizer='adam', loss='mean_squared_error')

# --- 4c. Train the Autoencoder ---
fit_options = {
    'epochs': 50,
    'batch_size': 256,
    'shuffle': True,
    'validation_split': 0.1,  # 10% of the normal rows are held out for validation
    'verbose': 0,  # Set to 1 to see training progress
}
# Input and target are the same array: the network learns to reproduce its input.
autoencoder.fit(X_train_normal, X_train_normal, **fit_options)
print("Autoencoder training complete.")

# --- 4d. Use AE for Anomaly Detection ---
# Reconstruct the whole test set; the per-row error is the anomaly score.
reconstructions = autoencoder.predict(X_test)
mse = _per_sample_mse(X_test, reconstructions)

# --- 4e. Find Anomaly Threshold ---
# The cut-off is the 95th percentile of the reconstruction error measured on
# the 'normal' training rows.
reconstructions_normal = autoencoder.predict(X_train_normal)
mse_normal = _per_sample_mse(X_train_normal, reconstructions_normal)
threshold = np.percentile(mse_normal, 95)
print(f"Anomaly threshold (95th percentile of normal error) set to: {threshold:.6f}")

# Rows whose error exceeds the threshold are flagged as attacks (1).
y_pred_ae = (mse > threshold).astype(int)

# === 5. COMPARE PERFORMANCES ===

print("\n" + "="*50)
print("           MODEL PERFORMANCE COMPARISON")
print("="*50 + "\n")


def print_metrics(model_name, y_true, y_pred, y_score=None):
    """Print classification metrics and the confusion matrix for one model.

    Args:
        model_name: Label shown in the report header.
        y_true: Ground-truth binary labels.
        y_pred: Hard 0/1 predictions; used for accuracy, precision, recall,
            F1 and the confusion matrix.
        y_score: Optional continuous scores where a larger value means
            "more likely the positive class" (e.g. a probability or an
            anomaly score). When given, AUC-ROC is computed from these.
            When omitted, AUC-ROC falls back to ``y_pred``, which only
            reflects the single operating point already chosen.
    """
    # ROC AUC measures ranking quality, so it needs graded scores rather than
    # thresholded labels whenever they are available.
    auc_input = y_pred if y_score is None else y_score

    print(f"--- Metrics for {model_name} ---")
    print(f"Accuracy:    {accuracy_score(y_true, y_pred):.4f}")
    print(f"Precision:   {precision_score(y_true, y_pred):.4f}")
    print(f"Recall:      {recall_score(y_true, y_pred):.4f}")
    print(f"F1-Score:    {f1_score(y_true, y_pred):.4f}")
    print(f"AUC-ROC:     {roc_auc_score(y_true, auc_input):.4f}")
    print("\nConfusion Matrix:")
    print(confusion_matrix(y_true, y_pred))
    print("\n")


# Continuous scores for AUC-ROC: the predicted probability of class 1 for
# XGBoost, and the raw reconstruction error for the autoencoder.
xgb_attack_proba = model_xgb.predict_proba(X_test)[:, 1]

# Print metrics for both models
print_metrics("XGBoost (Supervised)", y_test, y_pred_xgb, y_score=xgb_attack_proba)
print_metrics("Autoencoder (Unsupervised)", y_test, y_pred_ae, y_score=mse)

print("="*50)
print("Script finished.")

Starting script...
Successfully loaded KDDTrain+.txt and KDDTest+.txt
Starting pre-processing...
Data pre-processed. Final feature count: 122

--- Training Model 1: XGBoost (Supervised) ---
XGBoost training complete.

--- Training Model 2: Autoencoder (Unsupervised) ---
Autoencoder will be trained on 67343 'normal' samples.
Autoencoder training complete.
[1m705/705[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step
[1m2105/2105[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step
Anomaly threshold (95th percentile of normal error) set to: 0.000277

           MODEL PERFORMANCE COMPARISON

--- Metrics for XGBoost (Supervised) ---
Accuracy:    0.7879
Precision:   0.9681
Recall:      0.6487
F1-Score:    0.7769
AUC-ROC:     0.8103

Confusion Matrix:
[[9437  274]
 [4508 8325]]


--- Metrics for Autoencoder (Unsupervised) ---
Accuracy:    0.8799
Precision:   0.9151
Recall:      0.8697
F1-Score:    0.8918
AUC-ROC:     0.8815

Confusion Matrix:
[[ 8675  1036]
 [ 1672 11