In [1]:
!pip install wandb



In [1]:
import wandb
import numpy as np
import pandas as pd
import time
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.datasets import fetch_covtype

In [2]:
wandb.login()

[34m[1mwandb[0m: Currently logged in as: [33mbrinashong[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [3]:
%run 'common.ipynb'

In [4]:
# Function to evaluate and print model performance
def evaluate_model(model, X_train, y_train, X_test, y_test):
    seconds = time.time()
    # Train the model
    model.fit(X_train, y_train)
    training_time = time.time() - seconds
    print("Train operation time: = ", training_time, "seconds")
    wandb.log({"Training time (seconds)": training_time})
    
    # Make predictions on the test set
    y_pred = model.predict(X_test)
    
    # Print model evaluation metrics
    return evaluate(y_test, y_pred, f"\nModel: {model.__class__.__name__}")

In [16]:
# Dictionary to store models and their names
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(n_estimators=100, random_state=42),
    "K-Nearest Neighbors": KNeighborsClassifier(n_neighbors=5),
    "Support Vector Machine (SVM)": SVC()
}

In [6]:
%run kdd.ipynb

Normal class:  0    smurf.
dtype: object
Feature names:  ['duration', 'protocol_type', 'service', 'flag', 'src_bytes', 'dst_bytes', 'land', 'wrong_fragment', 'urgent', 'hot', 'num_failed_logins', 'logged_in', 'num_compromised', 'root_shell', 'su_attempted', 'num_root', 'num_file_creations', 'num_shells', 'num_access_files', 'num_outbound_cmds', 'is_host_login', 'is_guest_login', 'count', 'srv_count', 'serror_rate', 'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count', 'dst_host_same_srv_rate', 'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate', 'dst_host_serror_rate', 'dst_host_srv_serror_rate', 'dst_host_rerror_rate', 'dst_host_srv_rerror_rate', 'attack']
NORMAL_TARGET 16
TARGET_DICT {0: 'apache2', 1: 'back', 2: 'buffer_overflow', 3: 'ftp_write', 4: 'guess_passwd', 5: 'httptunnel', 6: 'imap', 7: 'ipsweep', 8: 'land', 9: 'loadmodule', 10: 'mailbomb', 11: 'm

In [7]:
# Should already be one hot encoded and label encoded
all_df.head()

Unnamed: 0,duration,protocol_type_icmp,protocol_type_tcp,protocol_type_udp,service_IRC,service_X11,service_Z39_50,service_auth,service_bgp,service_courier,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,attack
0,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,254,1.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0,16
1,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,254,1.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0,16
2,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,254,1.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0,16
3,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,254,1.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0,28
4,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,254,1.0,0.01,0.01,0.0,0.0,0.0,0.0,0.0,28


In [8]:
# Get X and y from all_df
X_df = all_df.drop(columns=[TARGET_COLUMN])
y_df = all_df[TARGET_COLUMN]

# Split the data into training and testing sets (80% train, 20% test)
X_train_df, X_test_df, y_train_df, y_test_df = train_test_split(X_df, y_df, test_size=0.2, random_state=42)
print(X_train_df.shape, X_test_df.shape, y_train_df.shape, y_test_df.shape)

(248823, 117) (62206, 117) (248823,) (62206,)


In [9]:
# Scale the features (required for some models, especially SVM)
scaler = StandardScaler()

X_train_scaled_df = X_train_df.copy()
X_train_scaled_df[NUMERICAL_COLUMNS] = scaler.fit_transform(X_train_scaled_df[NUMERICAL_COLUMNS])

X_test_scaled_df = X_test_df.copy()
X_test_scaled_df[NUMERICAL_COLUMNS] = scaler.transform(X_test_scaled_df[NUMERICAL_COLUMNS])

In [17]:
# Loop through models and evaluate each one
for model_name, model in models.items():
    wandb.init(project="ML-model-baselines-kdd-dataset", name=model_name)
    
    # For SVM and Logistic Regression, use scaled data
    if model_name in ["Logistic Regression", "Support Vector Machine (SVM)"]:
        conf_matrix, class_report, acc_score = evaluate_model(model, X_train_scaled_df, y_train_df, X_test_scaled_df, y_test_df)
        wandb_log(conf_matrix, class_report, acc_score)
    else:
        conf_matrix, class_report, acc_score = evaluate_model(model, X_train_df, y_train_df, X_test_df, y_test_df)
        wandb_log(conf_matrix, class_report, acc_score)
        
    wandb.finish()

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average,

Train operation time: =  297.9739234447479 seconds

Model: LogisticRegression
Confusion Matrix:
[[181   0   0 ...   0   0   0]
 [  0 217   0 ...   0   0   0]
 [  0   0   1 ...   0   0   0]
 ...
 [  0   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   1]]

Classification Report:
{'0': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 181.0}, '1': {'precision': 0.9774774774774775, 'recall': 0.9819004524886877, 'f1-score': 0.9796839729119639, 'support': 221.0}, '2': {'precision': 0.25, 'recall': 0.3333333333333333, 'f1-score': 0.28571428571428575, 'support': 3.0}, '4': {'precision': 0.9966666666666667, 'recall': 1.0, 'f1-score': 0.9983305509181971, 'support': 897.0}, '5': {'precision': 1.0, 'recall': 0.9259259259259259, 'f1-score': 0.9615384615384615, 'support': 27.0}, '7': {'precision': 0.9649122807017544, 'recall': 0.9821428571428571, 'f1-score': 0.9734513274336283, 'support': 56.0}, '9': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'su

VBox(children=(Label(value='0.013 MB of 0.017 MB uploaded\r'), FloatProgress(value=0.7989043597352202, max=1.0…

0,1
Accuracy Score,▁
Training time (seconds),▁
f1-score_avg,▁
precision_avg,▁
recall_avg,▁

0,1
Accuracy Score,0.96721
Training time (seconds),297.97392
f1-score_avg,0.95479
precision_avg,0.94566
recall_avg,0.96721


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011119160322212135, max=1.0…

Train operation time: =  0.6720561981201172 seconds

Model: DecisionTreeClassifier
Confusion Matrix:
[[181   0   0 ...   0   0   0]
 [  0 220   0 ...   0   0   0]
 [  0   0   2 ...   0   0   0]
 ...
 [  0   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   1]]

Classification Report:
{'0': {'precision': 0.9890710382513661, 'recall': 1.0, 'f1-score': 0.9945054945054945, 'support': 181.0}, '1': {'precision': 0.995475113122172, 'recall': 0.995475113122172, 'f1-score': 0.995475113122172, 'support': 221.0}, '2': {'precision': 0.6666666666666666, 'recall': 0.6666666666666666, 'f1-score': 0.6666666666666666, 'support': 3.0}, '4': {'precision': 0.9966592427616926, 'recall': 0.9977703455964325, 'f1-score': 0.9972144846796657, 'support': 897.0}, '5': {'precision': 1.0, 'recall': 0.8888888888888888, 'f1-score': 0.9411764705882353, 'support': 27.0}, '7': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 56.0}, '9': {'precision': 0.0, 'recall': 0.0, 'f1-s

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


VBox(children=(Label(value='0.013 MB of 0.017 MB uploaded\r'), FloatProgress(value=0.7988, max=1.0)))

0,1
Accuracy Score,▁
Training time (seconds),▁
f1-score_avg,▁
precision_avg,▁
recall_avg,▁

0,1
Accuracy Score,0.98064
Training time (seconds),0.67206
f1-score_avg,0.98073
precision_avg,0.98085
recall_avg,0.98064


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011117883066658073, max=1.0…

Train operation time: =  8.593595027923584 seconds

Model: RandomForestClassifier
Confusion Matrix:
[[  181     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0]
 [    0   221     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0]
 [    0     0     2     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     1     0     0     0     0     0     0
      0     0     0     0     0     0]
 [    0     0     0   897     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0]
 [    0     0     0     0    25     0     0     0     0     0     0     0
      1     0     0     0     0     0     1     0     0     0     0     0
    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


VBox(children=(Label(value='0.015 MB of 0.015 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
Accuracy Score,▁
Training time (seconds),▁
f1-score_avg,▁
precision_avg,▁
recall_avg,▁

0,1
Accuracy Score,0.98116
Training time (seconds),8.5936
f1-score_avg,0.98113
precision_avg,0.98113
recall_avg,0.98116


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011113680711100136, max=1.0…

Train operation time: =  1136.2186801433563 seconds

Model: GradientBoostingClassifier
Confusion Matrix:
[[  181     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0]
 [    0   220     0     0     0     0     0     0     0     0     0     0
      0     0     0     1     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0]
 [    0     0     2     0     1     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0]
 [    0     0     4   865     0     0     0     0     0     0     1     0
      4     0     0     0     0     0     0     0     0     0     0     0
      0     0     0    23     0     0     0]
 [    0     0     0     0     6     0     0     0    20     0     0     0
      0     0     0     0     0     0     1     0

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


VBox(children=(Label(value='0.013 MB of 0.016 MB uploaded\r'), FloatProgress(value=0.810745693289965, max=1.0)…

0,1
Accuracy Score,▁
Training time (seconds),▁
f1-score_avg,▁
precision_avg,▁
recall_avg,▁

0,1
Accuracy Score,0.96287
Training time (seconds),1136.21868
f1-score_avg,0.9654
precision_avg,0.96911
recall_avg,0.96287


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011112776366644539, max=1.0…

Train operation time: =  0.06295037269592285 seconds

Model: KNeighborsClassifier
Confusion Matrix:
[[  170     0     0     0     0     0     0     0     0     0     0     5
      0     6     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0]
 [    1   219     0     0     0     0     0     0     0     0     0     0
      0     1     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0]
 [    0     0     0     0     0     0     0     0     0     0     0     0
      0     3     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0]
 [    0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0]
 [    0     0     0     0   893     0     0     0     0     0     0     0
      0     3     0     0     0     0     0     0     

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


VBox(children=(Label(value='0.013 MB of 0.016 MB uploaded\r'), FloatProgress(value=0.8098103574033552, max=1.0…

0,1
Accuracy Score,▁
Training time (seconds),▁
f1-score_avg,▁
precision_avg,▁
recall_avg,▁

0,1
Accuracy Score,0.9756
Training time (seconds),0.06295
f1-score_avg,0.97351
precision_avg,0.97265
recall_avg,0.9756


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011120939266644807, max=1.0…

Train operation time: =  167.85579824447632 seconds

Model: SVC
Confusion Matrix:
[[  181     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0]
 [    0   218     0     0     0     0     0     0     0     0     0     0
      3     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0]
 [    0     0     1     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     2     0     0     0]
 [    1     0     0   887     0     0     0     0     0     0     0     0
      9     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0]
 [    0     0     0     0    25     0     0     0     0     0     0     0
      2     0     0     0     0     0     0     0     0     0     0     0
      0     0     0   

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


VBox(children=(Label(value='0.015 MB of 0.015 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
Accuracy Score,▁
Training time (seconds),▁
f1-score_avg,▁
precision_avg,▁
recall_avg,▁

0,1
Accuracy Score,0.96393
Training time (seconds),167.8558
f1-score_avg,0.94835
precision_avg,0.93661
recall_avg,0.96393


In [15]:
wandb.finish()

VBox(children=(Label(value='0.009 MB of 0.022 MB uploaded (0.005 MB deduped)\r'), FloatProgress(value=0.395207…