### Prereq: install any necessary libraries you don't already have ↓

In [1]:
pip install -r requirements.txt

Note: you may need to restart the kernel to use updated packages.


In [15]:
#import libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier #Random Forest
from sklearn.metrics import accuracy_score, classification_report, precision_score, recall_score, f1_score, confusion_matrix,make_scorer, fbeta_score
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline

#from sklearn.datasets import fetch_kddcup99
#not sure if we need these
from mlxtend.plotting import plot_decision_regions
from sklearn.tree import plot_tree
from matplotlib import pyplot as plt

### Load data

In [3]:
#load in the data from the dataset
file_path = 'kddcup.data.gz'

# Define the column names based on the kddcup.names file
columns = [
    "duration", "protocol_type", "service", "flag", "src_bytes", "dst_bytes", 
    "land", "wrong_fragment", "urgent", "hot", "num_failed_logins", "logged_in", 
    "num_compromised", "root_shell", "su_attempted", "num_root", "num_file_creations", 
    "num_shells", "num_access_files", "num_outbound_cmds", "is_hot_login", 
    "is_guest_login", "count", "srv_count", "serror_rate", "srv_serror_rate", 
    "rerror_rate", "srv_rerror_rate", "same_srv_rate", "diff_srv_rate", 
    "srv_diff_host_rate", "dst_host_count", "dst_host_srv_count", 
    "dst_host_same_srv_rate", "dst_host_diff_srv_rate", "dst_host_same_src_port_rate", 
    "dst_host_srv_diff_host_rate", "dst_host_serror_rate", "dst_host_srv_serror_rate", 
    "dst_host_rerror_rate", "dst_host_srv_rerror_rate", "label"
]

# Read the gzipped file into a pandas dataframe
df = pd.read_csv(file_path, header=None, names=columns, compression='gzip')

### Basic EDA

In [4]:
df.head(5)

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,label
0,0,tcp,http,SF,215,45076,0,0,0,0,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,normal.
1,0,tcp,http,SF,162,4528,0,0,0,0,...,1,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,normal.
2,0,tcp,http,SF,236,1228,0,0,0,0,...,2,1.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,normal.
3,0,tcp,http,SF,233,2032,0,0,0,0,...,3,1.0,0.0,0.33,0.0,0.0,0.0,0.0,0.0,normal.
4,0,tcp,http,SF,239,486,0,0,0,0,...,4,1.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,normal.


In [5]:
df.tail(5)

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,label
4898426,0,tcp,http,SF,212,2288,0,0,0,0,...,255,1.0,0.0,0.33,0.05,0.0,0.01,0.0,0.0,normal.
4898427,0,tcp,http,SF,219,236,0,0,0,0,...,255,1.0,0.0,0.25,0.05,0.0,0.01,0.0,0.0,normal.
4898428,0,tcp,http,SF,218,3610,0,0,0,0,...,255,1.0,0.0,0.2,0.05,0.0,0.01,0.0,0.0,normal.
4898429,0,tcp,http,SF,219,1234,0,0,0,0,...,255,1.0,0.0,0.17,0.05,0.0,0.01,0.0,0.0,normal.
4898430,0,tcp,http,SF,219,1098,0,0,0,0,...,255,1.0,0.0,0.14,0.05,0.0,0.01,0.0,0.0,normal.


In [6]:
df.sample(10)

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,label
803783,0,tcp,http,SF,204,1329,0,0,0,0,...,255,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,normal.
989100,0,icmp,ecr_i,SF,1032,0,0,0,0,0,...,255,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,smurf.
1092737,0,tcp,private,S0,0,0,0,0,0,0,...,10,0.04,0.07,0.0,0.0,1.0,1.0,0.0,0.0,neptune.
1031931,0,tcp,http,SF,182,9141,0,0,0,0,...,255,1.0,0.0,0.02,0.04,0.0,0.0,0.0,0.0,normal.
3781397,0,tcp,private,S0,0,0,0,0,0,0,...,7,0.03,0.08,0.0,0.0,1.0,1.0,0.0,0.0,neptune.
4429647,0,icmp,ecr_i,SF,520,0,0,0,0,0,...,255,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,smurf.
2811434,0,icmp,ecr_i,SF,1032,0,0,0,0,0,...,255,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,smurf.
738570,0,udp,domain_u,SF,46,46,0,0,0,0,...,255,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,normal.
4755419,0,tcp,private,REJ,0,0,0,0,0,0,...,3,0.01,0.07,0.0,0.0,0.0,0.0,1.0,1.0,neptune.
73173,0,tcp,http,SF,202,13600,0,0,0,0,...,255,1.0,0.0,0.02,0.04,0.0,0.0,0.0,0.0,normal.


### Data processing

In [7]:
#data processing 1: drop nulls and NaNs
df.isnull().sum() + df.isna().sum()

duration                       0
protocol_type                  0
service                        0
flag                           0
src_bytes                      0
dst_bytes                      0
land                           0
wrong_fragment                 0
urgent                         0
hot                            0
num_failed_logins              0
logged_in                      0
num_compromised                0
root_shell                     0
su_attempted                   0
num_root                       0
num_file_creations             0
num_shells                     0
num_access_files               0
num_outbound_cmds              0
is_hot_login                   0
is_guest_login                 0
count                          0
srv_count                      0
serror_rate                    0
srv_serror_rate                0
rerror_rate                    0
srv_rerror_rate                0
same_srv_rate                  0
diff_srv_rate                  0
srv_diff_h

In [8]:
##Remove Duplicate Rows
df = df.drop_duplicates().reset_index(drop=True)

##Remove Rows with Missing Values
df = df.dropna().reset_index(drop=True)


# 3) Binary target (normal = 0, any attack = 1)
df["binary_label"] = (df["label"] != "normal.").astype(int)

In [9]:
#data processing 2: encoding
categorical_cols = ["protocol_type", "service", "flag"]
numeric_cols      = df.columns.difference(categorical_cols + ["label", "binary_label"])

# Categorical → One‑Hot
categorical_pipe = Pipeline([
    ("ohe", OneHotEncoder(handle_unknown="ignore"))
])


#data processing 3: normalization
#take all values in each column and put the value in the range (0, 1)
# Numeric → Standard‑Scale
numeric_pipe = Pipeline([
    ("scaler", StandardScaler())
])


# Combine
preprocess = ColumnTransformer([
    ("cat", categorical_pipe, categorical_cols),
    ("num", numeric_pipe, numeric_cols)
])

In [10]:
#data processing 4: deduplication

print(f"Number of rows before deduplication: {df.shape[0]}")

#remove duplicate rows
duplicate_rows = df.duplicated()
print(f"Number of duplicate records: {duplicate_rows.sum()}")

#print number of records after removing duplicates
deduplicated_df = df.drop_duplicates()
print(f"Number of rows after deduplication: {deduplicated_df.shape[0]}")

Number of rows before deduplication: 1074992
Number of duplicate records: 0
Number of rows after deduplication: 1074992


### Supervised Learners

In [11]:
#Model 1: Decision Trees

In [14]:
#Model 2: Random Forest TRAINING and SPLITTING DATA FOR RANDOM FOREST
# Split the data into features and target variable
df_tune = df.sample(frac=0.2, random_state=42)  # Use a smaller sample for tuning
X = df_tune.drop(["label", "binary_label"], axis=1)
y = df_tune["binary_label"]


# Full train/test split (used later for the final fit + evaluation)
X_full = df.drop(["label", "binary_label"], axis=1)
y_full = df["binary_label"]


# Split the data into training and testing sets
# Stratified split to maintain the distribution of the target variable
X_train, X_test, y_train, y_test = train_test_split(
    X_full, y_full, test_size=0.20, stratify=y_full, random_state=42)


rf = RandomForestClassifier(
    n_jobs       = -1,
    class_weight = "balanced",   # second hedge vs. minority class
    random_state = 42
)

pipeline = ImbPipeline([
    ("pre",   preprocess),   # encoding + scaling
    # ("smote", SMOTE(random_state=42)),
    ("rf",    rf)
])

param_grid = {
    "rf__n_estimators"     : [200, 400],
    "rf__max_depth"        : [None, 20, 40, 60],
    "rf__min_samples_leaf" : [1, 2, 4],
    "rf__max_features"     : ["sqrt", 0.3]  # 0.3 ≈ 30 % of features
}

def f2_score(y_true, y_pred):
    return f1_score(y_true, y_pred, beta=2)

cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

search = RandomizedSearchCV(
    estimator      = pipeline,
    param_distributions = param_grid,
    n_iter         = 6,
    scoring        = {"Accuracy":  make_scorer(accuracy_score),
                      "Recall":    make_scorer(recall_score),
                      "Precision": make_scorer(precision_score),
                      "F2":        make_scorer(f2_score)},
    refit          = "F2",      # keep the params with highest F2
    cv             = cv,
    verbose        = 1,
    n_jobs         = -1,
    random_state   = 42
)


search.fit(X, y)
print("Best hyper-parameters:", search.best_params_)

Fitting 3 folds for each of 6 candidates, totalling 18 fits


Traceback (most recent call last):
  File "/Users/chico/Desktop/CS549 Final Project/intrusion-detection/.venv/lib/python3.13/site-packages/sklearn/metrics/_scorer.py", line 140, in __call__
    score = scorer._score(
        cached_call, estimator, *args, **routed_params.get(name).score
    )
  File "/Users/chico/Desktop/CS549 Final Project/intrusion-detection/.venv/lib/python3.13/site-packages/sklearn/metrics/_scorer.py", line 388, in _score
    return self._sign * self._score_func(y_true, y_pred, **scoring_kwargs)
                        ~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/var/folders/fy/5f4q1_x564x1gwtwl_w_rg7c0000gn/T/ipykernel_50507/3849940800.py", line 39, in f2_score
  File "/Users/chico/Desktop/CS549 Final Project/intrusion-detection/.venv/lib/python3.13/site-packages/sklearn/utils/_param_validation.py", line 194, in wrapper
    params = func_sig.bind(*args, **kwargs)
  File "/opt/homebrew/Cellar/python@3.13/3.13.1/Frameworks/Python.framework/Versions/3.

Best hyper-parameters: {'rf__n_estimators': 400, 'rf__min_samples_leaf': 2, 'rf__max_features': 'sqrt', 'rf__max_depth': 40}


In [None]:
# Building RF with best parameters
best = search.best_params_




In [None]:
##RANDOM FOREST EVALUATION and CONFUSION MATRIX
y_pred = best_model.predict(X_test)

metrics = {
    "Accuracy": accuracy_score(y_test, y_pred),
    "Precision": precision_score(y_test, y_pred),
    "Recall":    recall_score(y_test, y_pred),
    "F1":        f1_score(y_test, y_pred),
    "F2":        f2_score(y_test, y_pred),
    "FNR":       1 - recall_score(y_test, y_pred),
}

print(pd.Series(metrics).round(4), "\n")
print("Confusion matrix:\n", confusion_matrix(y_test, y_pred), "\n")
print("Detailed classification report:\n", classification_report(y_test, y_pred), "\n")

### Unsupervised Learners

In [None]:
#Model 3: K-Means Clustering

In [None]:
#Model 4: Isolation Forest

### Evaluation

In [None]:
#Evaluation, Visualization

#mandatory
#accuracy
#recall
#precision

#optional
#F2
#False Negative Rate