In [32]:
import pandas as pd
import numpy as np
import os
import random
import joblib
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier

In [33]:
# Setting random seed to replicate results
DEFAULT_RANDOM_SEED = 2021

def seedBasic(seed=DEFAULT_RANDOM_SEED):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

def seedEverything(seed=DEFAULT_RANDOM_SEED):
    seedBasic(seed)

seedEverything(2021)

In [34]:
# Load dataset
df = pd.read_csv('train.csv')  # Replace with actual dataset path

# Convert labels
df['label'] = df['label'].apply(lambda x: 1 if x == 'BENIGN' else 0)

# Split features and labels
X = df.drop(columns=['label'])
y = df['label']

In [45]:
df.shape

(88472, 79)

In [35]:
df.head(5)

Unnamed: 0.1,Unnamed: 0,destination port,flow duration,total fwd packets,total backward packets,total length of fwd packets,total length of bwd packets,fwd packet length max,fwd packet length min,fwd packet length mean,...,min_seg_size_forward,active mean,active std,active max,active min,idle mean,idle std,idle max,idle min,label
0,127724,53,23855,1,1,42,114,42,42,42.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,1
1,74669,53108,115,1,1,0,0,0,0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,1
2,52277,53,30920,1,1,54,82,54,54,54.0,...,32,0.0,0.0,0,0,0.0,0.0,0,0,1
3,33733,443,116886492,17,17,946,5030,292,0,55.647059,...,20,392253.5,369954.7324,653851,130656,58000000.0,59459.90212,58000000,57900000,1
4,66283,21,8806152,9,15,109,188,31,0,12.111111,...,32,0.0,0.0,0,0,0.0,0.0,0,0,0


In [36]:
df.drop('Unnamed: 0',axis=1,inplace=True)

In [37]:
df.head(5)

Unnamed: 0,destination port,flow duration,total fwd packets,total backward packets,total length of fwd packets,total length of bwd packets,fwd packet length max,fwd packet length min,fwd packet length mean,fwd packet length std,...,min_seg_size_forward,active mean,active std,active max,active min,idle mean,idle std,idle max,idle min,label
0,53,23855,1,1,42,114,42,42,42.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,1
1,53108,115,1,1,0,0,0,0,0.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,1
2,53,30920,1,1,54,82,54,54,54.0,0.0,...,32,0.0,0.0,0,0,0.0,0.0,0,0,1
3,443,116886492,17,17,946,5030,292,0,55.647059,84.34597,...,20,392253.5,369954.7324,653851,130656,58000000.0,59459.90212,58000000,57900000,1
4,21,8806152,9,15,109,188,31,0,12.111111,10.469533,...,32,0.0,0.0,0,0,0.0,0.0,0,0,0


In [38]:
# Split dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=DEFAULT_RANDOM_SEED)

# Standardize features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [39]:
# Save scaler
joblib.dump(scaler, 'scaler.pkl')

['scaler.pkl']

In [40]:
# Define models with hyperparameter tuning
param_grids = {
    "Decision Tree": (DecisionTreeClassifier(), {'max_depth': [5, 10, None]}),
    "Random Forest": (RandomForestClassifier(), {'n_estimators': [50, 100], 'max_depth': [10, None]}),
    "MLP": (MLPClassifier(max_iter=500), {'hidden_layer_sizes': [(50,), (100,)]}),
    "Logistic Regression": (LogisticRegression(), {'C': [0.1, 1, 10]}),
    "LightGBM": (LGBMClassifier(), {'n_estimators': [50, 100]}),
    "AdaBoost": (AdaBoostClassifier(), {'n_estimators': [50, 100]}),
    "KNN": (KNeighborsClassifier(), {'n_neighbors': [3, 5, 7]}),
    "SVM": (SVC(), {'C': [0.1, 1, 10]}),
    "XGBoost": (XGBClassifier(), {'n_estimators': [50, 100]}),
    "Naive Bayes": (GaussianNB(), {})
}

In [41]:
# Train and evaluate models
results = []
best_params = {}
for name, (model, param_grid) in param_grids.items():
    if param_grid:  # Apply GridSearchCV only if parameters exist
        grid_search = GridSearchCV(model, param_grid, cv=5, scoring='f1', n_jobs=-1)
        grid_search.fit(X_train, y_train)
        best_model = grid_search.best_estimator_
        best_params[name] = grid_search.best_params_
    else:
        best_model = model.fit(X_train, y_train)
        best_params[name] = "No hyperparameters to tune"

    # Save trained model
    joblib.dump(best_model, f'{name.replace(" ", "_")}_model.pkl')

    y_pred = best_model.predict(X_test)
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
    false_positive_rate = fp / (fp + tn)

    results.append({
        "Model": name,
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred),
        "Recall": recall_score(y_test, y_pred),
        "F1 Score": f1_score(y_test, y_pred),
        "False Positive Rate": false_positive_rate
    })

# Convert results to DataFrame
results_df = pd.DataFrame(results)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[LightGBM] [Info] Number of positive: 47138, number of negative: 12138
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.020452 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 14656
[LightGBM] [Info] Number of data points in the train set: 59276, number of used features: 67
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.795229 -> initscore=1.356738
[LightGBM] [Info] Start training from score 1.356738




In [42]:
# Save results to CSV
results_df.to_csv('model_results.csv', index=False)

In [46]:
# Save best hyperparameters to a file
best_params_df = pd.DataFrame(list(best_params.items()), columns=['Model', 'Best Parameters'])
best_params_df.to_csv('best_hyperparameters.csv')

In [47]:
# Display results
display(results_df)

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score,False Positive Rate
0,Decision Tree,0.996746,0.997881,0.99801,0.997946,0.008065
1,Random Forest,0.997328,0.997969,0.998659,0.998314,0.007735
2,MLP,0.988697,0.994103,0.991609,0.992854,0.022383
3,Logistic Regression,0.934203,0.951023,0.966696,0.958795,0.189434
4,LightGBM,0.998459,0.999351,0.998702,0.999027,0.002469
5,AdaBoost,0.984073,0.984392,0.995675,0.990001,0.060072
6,KNN,0.974312,0.985376,0.982137,0.983754,0.055464
7,SVM,0.961604,0.978926,0.972448,0.975676,0.079658
8,XGBoost,0.998493,0.999308,0.998789,0.999048,0.002633
9,Naive Bayes,0.603131,0.997155,0.50026,0.666263,0.005431


In [48]:
display(best_params_df)

Unnamed: 0,Model,Best Parameters
0,Decision Tree,{'max_depth': None}
1,Random Forest,"{'max_depth': None, 'n_estimators': 100}"
2,MLP,"{'hidden_layer_sizes': (50,)}"
3,Logistic Regression,{'C': 10}
4,LightGBM,{'n_estimators': 100}
5,AdaBoost,{'n_estimators': 100}
6,KNN,{'n_neighbors': 3}
7,SVM,{'C': 10}
8,XGBoost,{'n_estimators': 100}
9,Naive Bayes,No hyperparameters to tune
