In [22]:
import pandas as pd 

# Column names for the headerless NSL-KDD files, in file order:
# 41 feature columns followed by two trailing columns, 'label' and 'difficulty'
# (43 names in total).
column_names = [
    "duration", "protocol_type", "service", "flag",
    "src_bytes", "dst_bytes", "land", "wrong_fragment", "urgent",
    "hot", "num_failed_logins", "logged_in", "num_compromised",
    "root_shell", "su_attempted", "num_root", "num_file_creations",
    "num_shells", "num_access_files", "num_outbound_cmds",
    "is_host_login", "is_guest_login",
    "count", "srv_count",
    "serror_rate", "srv_serror_rate", "rerror_rate", "srv_rerror_rate",
    "same_srv_rate", "diff_srv_rate", "srv_diff_host_rate",
    "dst_host_count", "dst_host_srv_count",
    "dst_host_same_srv_rate", "dst_host_diff_srv_rate",
    "dst_host_same_src_port_rate", "dst_host_srv_diff_host_rate",
    "dst_host_serror_rate", "dst_host_srv_serror_rate",
    "dst_host_rerror_rate", "dst_host_srv_rerror_rate",
    # Trailing non-feature columns
    "label", "difficulty",
]

In [23]:
# Locations of the training and test splits inside the data folder
train_path = "../data/KDDTrain+.txt"
test_path = "../data/KDDTest+.txt"

# Read both files with identical settings: header=None treats the first line
# of each file as data, and the column names come from `column_names`.
df_train, df_test = (
    pd.read_csv(path, names=column_names, header=None)
    for path in (train_path, test_path)
)

# Report (rows, columns) for each split to confirm the files loaded
print("Train shape:", df_train.shape)
print("Test shape:", df_test.shape)

# Preview the first rows of the training split
df_train.head()

Train shape: (125973, 43)
Test shape: (22544, 43)


Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,label,difficulty
0,0,tcp,ftp_data,SF,491,0,0,0,0,0,...,0.17,0.03,0.17,0.0,0.0,0.0,0.05,0.0,normal,20
1,0,udp,other,SF,146,0,0,0,0,0,...,0.0,0.6,0.88,0.0,0.0,0.0,0.0,0.0,normal,15
2,0,tcp,private,S0,0,0,0,0,0,0,...,0.1,0.05,0.0,0.0,1.0,1.0,0.0,0.0,neptune,19
3,0,tcp,http,SF,232,8153,0,0,0,0,...,1.0,0.0,0.03,0.04,0.03,0.01,0.0,0.01,normal,21
4,0,tcp,http,SF,199,420,0,0,0,0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,normal,21


In [None]:
# Step 1: Convert the original 'label' column into a binary label:
#   - 0 for 'normal' (non-attack)
#   - 1 for anything else (attack)
#
# The comparison is vectorized over the whole column instead of calling a
# Python lambda once per row; the boolean result is cast to int64 so the new
# column holds the same 0/1 integers as before.

df_train["label_binary"] = df_train["label"].ne("normal").astype("int64")
df_test["label_binary"] = df_test["label"].ne("normal").astype("int64")


In [26]:
# Check how many normal (0) vs attack (1) records are in each dataset.
# Helps us understand class imbalance before any modelling.

print("Training set label breakdown:")
print(df_train["label_binary"].value_counts())

print("Test set label breakdown:")
print(df_test["label_binary"].value_counts())


Training set lavel breakdown:
label_binary
0    67343
1    58630
Name: count, dtype: int64
Test set label breakdown:
label_binary
1    12833
0     9711
Name: count, dtype: int64


In [29]:
# Step 2: One-hot encode categorical features

# These are the categorical (string-valued) columns in the dataset
categorical_cols = ["protocol_type", "service", "flag"]

# One-hot encode both train and test sets. The indicator dtype is pinned to
# bool so it does not depend on the installed pandas version's default.
df_train_encoded = pd.get_dummies(df_train, columns=categorical_cols, dtype=bool)
df_test_encoded = pd.get_dummies(df_test, columns=categorical_cols, dtype=bool)

# Ensure train and test sets have the same columns (left join on the column axis):
#   - columns present only in train are added to test, filled with False
#   - columns present only in test are dropped
# Padding with False rather than 0 keeps the added indicator columns the same
# bool dtype as the real ones, instead of mixing int64 zeros in with them.
df_train_encoded, df_test_encoded = df_train_encoded.align(
    df_test_encoded,
    join="left",
    axis=1,
    fill_value=False
)

# Both frames must now expose the same columns in the same order
assert df_train_encoded.columns.equals(df_test_encoded.columns), \
    "train/test columns differ after alignment"

# Sanity check
print("One-hot encoded train shape:", df_train_encoded.shape)
print("One-hot encoded test shape:", df_test_encoded.shape)
print(df_train_encoded.columns.tolist())

One-hot encoded train shape: (125973, 125)
One-hot encoded test shape: (22544, 125)
['duration', 'src_bytes', 'dst_bytes', 'land', 'wrong_fragment', 'urgent', 'hot', 'num_failed_logins', 'logged_in', 'num_compromised', 'root_shell', 'su_attempted', 'num_root', 'num_file_creations', 'num_shells', 'num_access_files', 'num_outbound_cmds', 'is_host_login', 'is_guest_login', 'count', 'srv_count', 'serror_rate', 'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count', 'dst_host_same_srv_rate', 'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate', 'dst_host_serror_rate', 'dst_host_srv_serror_rate', 'dst_host_rerror_rate', 'dst_host_srv_rerror_rate', 'label', 'difficulty', 'label_binary', 'protocol_type_icmp', 'protocol_type_tcp', 'protocol_type_udp', 'service_IRC', 'service_X11', 'service_Z39_50', 'service_aol', 'service_auth', 'service_bgp', 'service_courier', 'servic