# 🧪 NSL-KDD Preprocessing Notebook
This notebook loads and preprocesses the NSL-KDD dataset for use in intrusion detection models.

In [7]:
# 📦 Imports
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
import os

# Input and output locations, relative to the notebook's working directory
DATA_DIR, PROCESSED_DIR = 'data/raw/', 'data/processed/'

# Create the output directory up front; exist_ok makes a re-run harmless
os.makedirs(PROCESSED_DIR, exist_ok=True)

In [8]:
# 📂 Load NSL-KDD Data
# Column names are supplied here and applied positionally to both files.
# The 41 feature columns come first ...
feature_names = [
    'duration', 'protocol_type', 'service', 'flag', 'src_bytes', 'dst_bytes',
    'land', 'wrong_fragment', 'urgent', 'hot', 'num_failed_logins', 'logged_in',
    'num_compromised', 'root_shell', 'su_attempted', 'num_root',
    'num_file_creations', 'num_shells', 'num_access_files', 'num_outbound_cmds',
    'is_host_login', 'is_guest_login', 'count', 'srv_count', 'serror_rate',
    'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate',
    'diff_srv_rate', 'srv_diff_host_rate',
    'dst_host_count', 'dst_host_srv_count', 'dst_host_same_srv_rate',
    'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate',
    'dst_host_srv_diff_host_rate', 'dst_host_serror_rate',
    'dst_host_srv_serror_rate', 'dst_host_rerror_rate',
    'dst_host_srv_rerror_rate',
]
# ... followed by the two trailing columns.
column_names = feature_names + ['label', 'difficulty']

# Read both splits the same way. Because `names` is given and `header` is not,
# pandas treats the first line of each file as data rather than as a header.
train_df, test_df = (
    pd.read_csv(os.path.join(DATA_DIR, raw_file), names=column_names)
    for raw_file in ('KDDTrain+.txt', 'KDDTest+.txt')
)

print(f"Train shape: {train_df.shape}, Test shape: {test_df.shape}")

Train shape: (125973, 43), Test shape: (22544, 43)


In [9]:
# 🧹 Preprocessing
# Drop the 'difficulty' column.
# drop() returns a new frame instead of mutating in place, and errors='ignore'
# turns the call into a no-op when the column is already gone, so re-running
# this cell no longer raises KeyError.
train_df = train_df.drop(columns='difficulty', errors='ignore')
test_df = test_df.drop(columns='difficulty', errors='ignore')

# Binary label encoding
def label_encoder(label):
    """Map a raw label to the binary target: 0 for 'normal', 1 otherwise.

    Values that are already encoded (the integers 0 or 1) are returned
    unchanged. Without this, applying the function a second time (for
    example by re-running the cell that encodes the 'label' column) would
    turn every 0 into 1 and silently mark all rows as attacks.

    Parameters
    ----------
    label : str or int
        A raw label string, or an already-encoded 0/1 value.

    Returns
    -------
    int
        0 for 'normal' (or an existing 0), 1 for anything else.
    """
    # Pass through labels that were encoded by a previous run
    if not isinstance(label, str) and label in (0, 1):
        return int(label)
    return 0 if label == 'normal' else 1

# Overwrite the 'label' column of both splits with the value returned by
# label_encoder for each row
for frame in (train_df, test_df):
    frame['label'] = frame['label'].map(label_encoder)

In [10]:
# Encode categorical columns
categorical_cols = ['protocol_type', 'service', 'flag']

# One fitted encoder per column, keyed by column name. Re-fitting a single
# shared LabelEncoder would keep only the last column's mapping, leaving no way
# to decode the other columns or to encode new data consistently later.
encoders = {}

for col in categorical_cols:
    encoder = LabelEncoder()
    # Learn the category -> integer mapping from the training split only ...
    train_df[col] = encoder.fit_transform(train_df[col])
    # ... and reuse it for the test split. LabelEncoder.transform raises
    # ValueError if the test split contains a category not seen in training.
    test_df[col] = encoder.transform(test_df[col])
    encoders[col] = encoder

# After the loop `encoder` is still bound to the encoder fitted on the last
# column ('flag'), matching what the name referred to before this change.

In [11]:
# Normalize numerical columns
# Separate the target column from the feature columns for each split
y_train = train_df['label']
X_train = train_df.drop(columns='label')
y_test = test_df['label']
X_test = test_df.drop(columns='label')

# Fit the min-max ranges on the training features only, then apply those same
# ranges to both splits so the test split is scaled with training statistics
scaler = MinMaxScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [12]:
# Save processed data
# Output file name -> object to persist. The y_* entries are pandas Series;
# np.save accepts array-like input and writes them as arrays.
outputs = {
    'X_train.npy': X_train_scaled,
    'y_train.npy': y_train,
    'X_test.npy': X_test_scaled,
    'y_test.npy': y_test,
}

for filename, array in outputs.items():
    np.save(os.path.join(PROCESSED_DIR, filename), array)

# NOTE(review): the message shows '/data/processed/' while the files are
# written under PROCESSED_DIR ('data/processed/', a relative path).
print("✅ Preprocessing complete and saved to /data/processed/")

✅ Preprocessing complete and saved to /data/processed/
