In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time

In [3]:
# ==============================================================================
# 2. DATA LOADING AND EXPLORATION
# ==============================================================================

# Directory that holds the dataset files (note that it ends with a '/').
path = "/repos/smote_msfb/public_datasets/kdd1999/"

# Names assigned to the CSV columns, in file order. There are 42 entries;
# the final one, "label", is the class column.
column_headers = [
    "duration", "protocol_type", "service", "flag",
    "src_bytes", "dst_bytes", "land", "wrong_fragment", "urgent",
    "hot", "num_failed_logins", "logged_in", "num_compromised",
    "root_shell", "su_attempted", "num_root", "num_file_creations",
    "num_shells", "num_access_files", "num_outbound_cmds",
    "is_host_login", "is_guest_login",
    "count", "srv_count",
    "serror_rate", "srv_serror_rate", "rerror_rate", "srv_rerror_rate",
    "same_srv_rate", "diff_srv_rate", "srv_diff_host_rate",
    "dst_host_count", "dst_host_srv_count",
    "dst_host_same_srv_rate", "dst_host_diff_srv_rate",
    "dst_host_same_src_port_rate", "dst_host_srv_diff_host_rate",
    "dst_host_serror_rate", "dst_host_srv_serror_rate",
    "dst_host_rerror_rate", "dst_host_srv_rerror_rate",
    "label",
]

In [4]:
# Load the gzip-compressed CSV into a regular (CPU) pandas DataFrame.
# Because `names=` is given, pandas does not consume a header row from the
# file; `column_headers` supplies the column labels instead. The ".gz"
# suffix lets read_csv infer the compression on its own.

# os.path.join tolerates `path` with or without a trailing separator.
file_path = os.path.join(path, "kddcup.data.gz")
network_data = pd.read_csv(file_path, names=column_headers)

In [5]:
# Quick size check: (number of rows, number of columns) of the loaded frame.
network_data.shape

(4898431, 42)

In [7]:
# Peek at the first three rows to check that the assigned column names line up with the values.
network_data.head(3)

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,label
0,0,tcp,http,SF,215,45076,0,0,0,0,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,normal.
1,0,tcp,http,SF,162,4528,0,0,0,0,...,1,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,normal.
2,0,tcp,http,SF,236,1228,0,0,0,0,...,2,1.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,normal.


In [8]:
# List the distinct raw values of the 'label' column (each one carries a trailing '.').
network_data['label'].unique()

array(['normal.', 'buffer_overflow.', 'loadmodule.', 'perl.', 'neptune.',
       'smurf.', 'guess_passwd.', 'pod.', 'teardrop.', 'portsweep.',
       'ipsweep.', 'land.', 'ftp_write.', 'back.', 'imap.', 'satan.',
       'phf.', 'nmap.', 'multihop.', 'warezmaster.', 'warezclient.',
       'spy.', 'rootkit.'], dtype=object)

In [None]:
# Lookup from each raw label string (including its trailing '.') to a coarse
# category: 'normal', 'dos', 'probe', 'r2l' or 'u2r'.

attack_map = {
    'normal.': 'normal', 'back.': 'dos', 'buffer_overflow.': 'u2r',
    'ftp_write.': 'r2l', 'guess_passwd.': 'r2l', 'imap.': 'r2l',
    'ipsweep.': 'probe', 'land.': 'dos', 'loadmodule.': 'u2r',
    'multihop.': 'r2l', 'neptune.': 'dos', 'nmap.': 'probe',
    'perl.': 'u2r', 'phf.': 'r2l', 'pod.': 'dos',
    'portsweep.': 'probe', 'rootkit.': 'u2r', 'satan.': 'probe',
    'smurf.': 'dos', 'spy.': 'r2l', 'teardrop.': 'dos',
    'warezclient.': 'r2l', 'warezmaster.': 'r2l',
}

# network_data comes from pd.read_csv, so its 'label' column is already a
# pandas Series and can be mapped directly. A pandas Series has no
# `.to_pandas()` method, so the earlier call raised AttributeError.
# Any label absent from attack_map is returned as NaN by Series.map.

attack_category_pd = network_data['label'].map(attack_map)

In [11]:
# Report the frame's dimensions, then the number of rows per raw label value.
print("Dataset Shape:", network_data.shape)

# NOTE(review): the heading below says "Attack Category", but the counts are
# taken over the raw 'label' column, not over the mapped categories.
label_counts = network_data["label"].value_counts()
print("\nAttack Category Distribution:\n", label_counts)

Dataset Shape: (4898431, 42)

Attack Category Distribution:
 smurf.              2807886
neptune.            1072017
normal.              972781
satan.                15892
ipsweep.              12481
portsweep.            10413
nmap.                  2316
back.                  2203
warezclient.           1020
teardrop.               979
pod.                    264
guess_passwd.            53
buffer_overflow.         30
land.                    21
warezmaster.             20
imap.                    12
rootkit.                 10
loadmodule.               9
ftp_write.                8
multihop.                 7
phf.                      4
perl.                     3
spy.                      2
Name: label, dtype: int64


In [12]:
# Show the dtype pandas inferred for every column (object = string-valued, otherwise numeric).
network_data.dtypes

duration                         int64
protocol_type                   object
service                         object
flag                            object
src_bytes                        int64
dst_bytes                        int64
land                             int64
wrong_fragment                   int64
urgent                           int64
hot                              int64
num_failed_logins                int64
logged_in                        int64
num_compromised                  int64
root_shell                       int64
su_attempted                     int64
num_root                         int64
num_file_creations               int64
num_shells                       int64
num_access_files                 int64
num_outbound_cmds                int64
is_host_login                    int64
is_guest_login                   int64
count                            int64
srv_count                        int64
serror_rate                    float64
srv_serror_rate          