In [7]:
import os
import pandas as pd
import numpy as np
from urllib.request import urlretrieve
import zipfile
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import silhouette_score

def load_kddcup_data(filepath):
    columns = [
        'duration', 'protocol_type', 'service', 'flag', 'src_bytes', 'dst_bytes',
        'land', 'wrong_fragment', 'urgent', 'hot', 'num_failed_logins', 'logged_in',
        'num_compromised', 'root_shell', 'su_attempted', 'num_root', 'num_file_creations',
        'num_shells', 'num_access_files', 'num_outbound_cmds', 'is_host_login', 'is_guest_login',
        'count', 'srv_count', 'serror_rate', 'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate',
        'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count',
        'dst_host_same_srv_rate', 'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate',
        'dst_host_srv_diff_host_rate', 'dst_host_serror_rate', 'dst_host_srv_serror_rate',
        'dst_host_rerror_rate', 'dst_host_srv_rerror_rate', 'label'
    ]
    
    data = pd.read_csv(filepath, header=None, names=columns)
    return data

def preprocess_kddcup_data(data):
    # Separate features and labels
    X = data.drop(columns=['label'])
    
    # Define categorical and numerical columns
    categorical_features = ['protocol_type', 'service', 'flag']
    numerical_features = [col for col in X.columns if col not in categorical_features]
    
    # Preprocessing pipeline
    preprocessor = ColumnTransformer(
        transformers=[
            # Convert all numerical features to standard normal destribution
            ('num', StandardScaler(), numerical_features),
            # One hot encode categorical features
            ('cat', OneHotEncoder(), categorical_features)
        ]
    )
    
    return preprocessor.fit_transform(X)

def download_kddcup99(url, destination_file_name, destination_folder="/home/jovyan/work/data"):
    if not os.path.exists(destination_folder):
        os.makedirs(destination_folder)
        
    compressed_file = os.path.join(destination_folder, destination_file_name + ".gz")
    extracted_file = os.path.join(destination_folder, destination_file_name)
    
    # Download the dataset if not already downloaded
    if not os.path.exists(compressed_file):
        print("Downloading dataset...")
        urlretrieve(url, compressed_file)
        print("Download complete.")
    
    # Extract the dataset if not already extracted
    if not os.path.exists(extracted_file):
        print("Extracting dataset...")
        os.system(f"gunzip {compressed_file}")
        print("Extraction complete.")
    
    return extracted_file

In [8]:
url = "http://kdd.ics.uci.edu/databases/kddcup99/kddcup.data_10_percent.gz"
data_file_path = download_kddcup99(url, "kdd_10_percent")


Downloading dataset...
Download complete.
Extracting dataset...
Extraction complete.


In [9]:
datat load_kddcup_data(data_file_path)
processed_data = preprocess_kddcup_data(data)

NameError: name 'data' is not defined