In [1]:
import pandas as pd
import numpy as np

In [2]:
# -----------------------
# 1. Importing the Data
# -----------------------

columns = [
    "duration", "protocol_type", "service", "flag", "src_bytes", "dst_bytes", "land",
    "wrong_fragment", "urgent", "hot", "num_failed_logins", "logged_in",
    "num_compromised", "root_shell", "su_attempted", "num_root", "num_file_creations",
    "num_shells", "num_access_files", "num_outbound_cmds", "is_host_login", "is_guest_login",
    "count", "srv_count", "serror_rate", "srv_serror_rate", "rerror_rate", "srv_rerror_rate",
    "same_srv_rate", "diff_srv_rate", "srv_diff_host_rate", "dst_host_count",
    "dst_host_srv_count", "dst_host_same_srv_rate", "dst_host_diff_srv_rate",
    "dst_host_same_src_port_rate", "dst_host_srv_diff_host_rate", "dst_host_serror_rate",
    "dst_host_srv_serror_rate", "dst_host_rerror_rate", "dst_host_srv_rerror_rate",
    "label", "difficulty"
]

# Load the training data.
train_data = pd.read_csv("KDDTrain+.csv", names=columns)
train_data.drop(columns=['difficulty'], inplace=True)

test_data = pd.read_csv("KDDTest+.csv", names=columns)
test_data.drop(columns=['difficulty'], inplace=True)


In [3]:
train_data.head()

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,label
0,0,tcp,ftp_data,SF,491,0,0,0,0,0,...,25,0.17,0.03,0.17,0.0,0.0,0.0,0.05,0.0,normal
1,0,udp,other,SF,146,0,0,0,0,0,...,1,0.0,0.6,0.88,0.0,0.0,0.0,0.0,0.0,normal
2,0,tcp,private,S0,0,0,0,0,0,0,...,26,0.1,0.05,0.0,0.0,1.0,1.0,0.0,0.0,neptune
3,0,tcp,http,SF,232,8153,0,0,0,0,...,255,1.0,0.0,0.03,0.04,0.03,0.01,0.0,0.01,normal
4,0,tcp,http,SF,199,420,0,0,0,0,...,255,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,normal


In [4]:
test_data.head()

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,label
0,0,tcp,private,REJ,0,0,0,0,0,0,...,10,0.04,0.06,0.0,0.0,0.0,0.0,1.0,1.0,neptune
1,0,tcp,private,REJ,0,0,0,0,0,0,...,1,0.0,0.06,0.0,0.0,0.0,0.0,1.0,1.0,neptune
2,2,tcp,ftp_data,SF,12983,0,0,0,0,0,...,86,0.61,0.04,0.61,0.02,0.0,0.0,0.0,0.0,normal
3,0,icmp,eco_i,SF,20,0,0,0,0,0,...,57,1.0,0.0,1.0,0.28,0.0,0.0,0.0,0.0,saint
4,1,tcp,telnet,RSTO,0,15,0,0,0,0,...,86,0.31,0.17,0.03,0.02,0.0,0.0,0.83,0.71,mscan


In [5]:
# ------------------------------
# 2. Preprocessing the Data
# ------------------------------

# Identify categorical columns.
categorical_cols = ["protocol_type", "service", "flag"]

# Apply one-hot encoding to the categorical columns for both train and test sets.
train_data = pd.get_dummies(train_data, columns=categorical_cols)
test_data = pd.get_dummies(test_data, columns=categorical_cols)

# Separate features and labels.
X_train = train_data.drop(columns=['label'])
y_train = train_data['label']
X_test = test_data.drop(columns=['label'])
y_test = test_data['label']

# Align the train and test feature sets so that they have the same columns.
X_train, X_test = X_train.align(X_test, join='outer', axis=1, fill_value=0)

# Convert the label to a binary outcome:
#    - 0 for 'normal' (benign connections)
#    - 1 for any attack type (malicious connections)
y_train = y_train.apply(lambda x: 0 if x == 'normal' else 1)
y_test = y_test.apply(lambda x: 0 if x == 'normal' else 1)

In [6]:
# ------------------------------
# 3. Verification of the Processed Data
# ------------------------------

print("Training features shape:", X_train.shape)
print("Training labels shape:", y_train.shape)
print("Test features shape:", X_test.shape)
print("Test labels shape:", y_test.shape)

Training features shape: (125973, 122)
Training labels shape: (125973,)
Test features shape: (22543, 122)
Test labels shape: (22543,)


 This scaler standardizes features by removing the mean and scaling to unit variance. Note that while scaling is especially important for models like SVMs or neural networks, many tree-based models (e.g., Random Forests) are less sensitive to feature scaling

In [7]:
from sklearn.preprocessing import StandardScaler

# Initialize the StandardScaler.
scaler = StandardScaler()

# Fit the scaler on the training data and transform it.
X_train_scaled = scaler.fit_transform(X_train)

# Use the same scaler to transform the test data.
X_test_scaled = scaler.transform(X_test)

# Verify the shapes of the scaled features.
print("Scaled training features shape:", X_train_scaled.shape)
print("Scaled test features shape:", X_test_scaled.shape)

Scaled training features shape: (125973, 122)
Scaled test features shape: (22543, 122)


This code snippet initializes the classifier, trains it on the scaled training set, makes predictions on the scaled test set, and then prints out performance metrics such as accuracy, the classification report, and the confusion matrix.

In [8]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Initialize the Random Forest classifier.
rf_clf = RandomForestClassifier(random_state=42, n_jobs=-1)

# Train the model on the scaled training data.
rf_clf.fit(X_train_scaled, y_train)

# Predict on the test data.
y_pred = rf_clf.predict(X_test_scaled)

# Evaluate the model's performance.
accuracy = accuracy_score(y_test, y_pred)
print("Test Accuracy:", accuracy)
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


Test Accuracy: 0.7755844386283991

Classification Report:
               precision    recall  f1-score   support

           0       0.66      0.97      0.79      9710
           1       0.97      0.63      0.76     12833

    accuracy                           0.78     22543
   macro avg       0.82      0.80      0.77     22543
weighted avg       0.84      0.78      0.77     22543

Confusion Matrix:
 [[9458  252]
 [4807 8026]]


~77.6% of the test samples were correctly classified. While accuracy gives a broad view, it’s especially important to consider class-specific metrics in cybersecurity applications where the cost of false negatives (missed attacks) can be high.

For class 0 ("normal"):

Recall: 0.97 indicates that nearly all benign connections are correctly identified.
Precision: 0.66 suggests that when the model predicts "normal," there's a substantial chance it might be a misclassified attack (i.e., some false negatives are getting labeled as normal).

For class 1 ("attack"):

Precision: 0.97 means that when the model flags a connection as an attack, it’s usually correct.
Recall: 0.63 indicates that a significant number of attacks are being missed (i.e., false negatives are high).

Now perform hyperparameter tuning on the Random Forest classifier using Scikit-learn’s GridSearchCV. In this example, we focus on optimizing for the recall of class 1 (attacks) by using a custom scorer. This is especially important in cybersecurity, where missing an attack (false negative) can have severe consequences.

In [9]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import make_scorer, recall_score, classification_report, confusion_matrix
import numpy as np

# Define a custom scorer to optimize for recall of class 1 (attacks).
recall_scorer = make_scorer(recall_score, pos_label=1)

# Define the same parameter grid.
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2', None],
    'class_weight': [None, 'balanced']
}

# Initialize the base Random Forest classifier.
rf_clf = RandomForestClassifier(random_state=42, n_jobs=-1)

# Set up RandomizedSearchCV.
random_search = RandomizedSearchCV(estimator=rf_clf,
                                   param_distributions=param_grid,
                                   n_iter=100,           # Number of random combinations to try
                                   scoring=recall_scorer,  # Optimize for recall of the attack class
                                   cv=3,                 # 3-fold cross-validation
                                   n_jobs=-1,
                                   verbose=3,
                                   random_state=42)

# Run the randomized search on the scaled training data.
random_search.fit(X_train_scaled, y_train)

# Output the best hyperparameters and best score.
print("Best Hyperparameters:", random_search.best_params_)
print("Best Cross-Validation Recall Score for Attack Class:", random_search.best_score_)

# Use the best estimator to make predictions on the test data.
best_rf = random_search.best_estimator_
y_pred_best = best_rf.predict(X_test_scaled)

# Evaluate the tuned model on the test data.
accuracy = best_rf.score(X_test_scaled, y_test)
print("\nTest Accuracy after Randomized Search:", accuracy)
print("\nClassification Report:\n", classification_report(y_test, y_pred_best))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_best))


Fitting 3 folds for each of 100 candidates, totalling 300 fits
Best Hyperparameters: {'n_estimators': 300, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': None, 'max_depth': None, 'class_weight': None}
Best Cross-Validation Recall Score for Attack Class: 0.9985502317112344

Test Accuracy after Randomized Search: 0.8002927738100519

Classification Report:
               precision    recall  f1-score   support

           0       0.69      0.97      0.81      9710
           1       0.97      0.67      0.79     12833

    accuracy                           0.80     22543
   macro avg       0.83      0.82      0.80     22543
weighted avg       0.85      0.80      0.80     22543

Confusion Matrix:
 [[9417  293]
 [4209 8624]]


In [18]:
import numpy as np
from sklearn.metrics import roc_curve, recall_score, classification_report

# Get predicted probabilities for the positive class (attack)
y_probs = best_rf.predict_proba(X_test_scaled)[:, 1]

# Compute ROC curve
fpr, tpr, thresholds = roc_curve(y_test, y_probs, pos_label=1)

# Compute Youden's J statistic (tpr - fpr) for each threshold
j_scores = tpr - fpr

# Find the threshold that maximizes Youden's J statistic
optimal_threshold = thresholds[np.argmax(j_scores)]
print("Optimal threshold based on ROC:", optimal_threshold)

# Apply the new threshold to generate predictions
y_pred_adjusted = (y_probs >= optimal_threshold).astype(int)

# Evaluate the adjusted predictions
cm_tree_scaled = confusion_matrix(y_test, y_pred_adjusted)
cf_tree_scaled = classification_report(y_test, y_pred_adjusted)
print("Adjusted Classification Report:\n", cf_tree_scaled)
print("Adjusted Attack Recall:", recall_score(y_test, y_pred_adjusted))
print("\nAdjusted Confusion Matrix:\n", cm_tree_scaled)



Optimal threshold based on ROC: 0.0016666666666666668
Adjusted Classification Report:
               precision    recall  f1-score   support

           0       0.87      0.89      0.88      9710
           1       0.92      0.90      0.91     12833

    accuracy                           0.89     22543
   macro avg       0.89      0.89      0.89     22543
weighted avg       0.90      0.89      0.89     22543

Adjusted Attack Recall: 0.8980752746824593

Adjusted Confusion Matrix:
 [[ 8644  1066]
 [ 1308 11525]]


**XGBoost (Extreme Gradient Boosting)**

XGBoost is renowned for its excellent predictive performance on structured/tabular data It’s highly optimized and can handle large datasets efficiently. Built-in regularization helps to reduce overfitting.

In [19]:
import xgboost as xgb
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Initialize the XGBoost classifier.
xgb_clf = xgb.XGBClassifier(
    objective='binary:logistic',
    n_estimators=300,
    max_depth=10,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1
)

# Train the model on the scaled training data.
xgb_clf.fit(X_train_scaled, y_train)

# Predict on the test data.
y_pred_xgb = xgb_clf.predict(X_test_scaled)

# Evaluate the model.
cm_xgboost_scaled = confusion_matrix(y_test, y_pred_xgb)
cf_xgboost_scaled = classification_report(y_test, y_pred_xgb)
print("XGBoost Test Accuracy:", accuracy_score(y_test, y_pred_xgb))
print("\nXGBoost Classification Report:\n", cf_xgboost_scaled)
print("XGBoost Confusion Matrix:\n", cm_xgboost_scaled)

XGBoost Test Accuracy: 0.7919531561903917

XGBoost Classification Report:
               precision    recall  f1-score   support

           0       0.68      0.97      0.80      9710
           1       0.97      0.66      0.78     12833

    accuracy                           0.79     22543
   macro avg       0.82      0.81      0.79     22543
weighted avg       0.84      0.79      0.79     22543

XGBoost Confusion Matrix:
 [[9439  271]
 [4419 8414]]


**LightGBM**

LightGBM is designed to be faster and more memory efficient than some other gradient boosting frameworks, which can be beneficial on larger datasets.

In [21]:
import lightgbm as lgb
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Initialize the LightGBM classifier.
lgb_clf = lgb.LGBMClassifier(
    n_estimators=300,
    max_depth=10,
    learning_rate=0.1,
    random_state=42,
    n_jobs=-1
)

# Train the model.
lgb_clf.fit(X_train_scaled, y_train)

# Predict on the test data.
y_pred_lgb = lgb_clf.predict(X_test_scaled)

# Evaluate the model.
cm_lgb_scaled = confusion_matrix(y_test, y_pred_lgb)
cf_lgb_scaled = classification_report(y_test, y_pred_lgb)
print("LightGBM Test Accuracy:", accuracy_score(y_test, y_pred_lgb))
print("\nLightGBM Classification Report:\n", cf_lgb_scaled)
print("LightGBM Confusion Matrix:\n", cm_lgb_scaled)




[LightGBM] [Info] Number of positive: 58630, number of negative: 67343
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006163 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3333
[LightGBM] [Info] Number of data points in the train set: 125973, number of used features: 110
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.465417 -> initscore=-0.138552
[LightGBM] [Info] Start training from score -0.138552
LightGBM Test Accuracy: 0.783125582220645

LightGBM Classification Report:
               precision    recall  f1-score   support

           0       0.67      0.97      0.79      9710
           1       0.97      0.64      0.77     12833

    accuracy                           0.78     22543
   macro avg       0.82      0.81      0.78     22543
weighted avg       0.84      0.78      0.78     22543

LightGBM Confusion Matrix:
 [[9439  2

