In [25]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import OneHotEncoder, RobustScaler, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import category_encoders as ce

In [26]:
# Read the CSV (path is relative to the notebook's working directory) and
# preview the first five rows.
df = pd.read_csv(filepath_or_buffer="../datasets/dataframe_for_preprocessing.csv")
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,...,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate
0,0,0,tcp,http,SF,181,5450,0,0,0,...,9,9,1.0,0.0,0.11,0.0,0.0,0.0,0.0,0.0
1,1,0,tcp,http,SF,239,486,0,0,0,...,19,19,1.0,0.0,0.05,0.0,0.0,0.0,0.0,0.0
2,2,0,tcp,http,SF,235,1337,0,0,0,...,29,29,1.0,0.0,0.03,0.0,0.0,0.0,0.0,0.0
3,3,0,tcp,http,SF,219,1337,0,0,0,...,39,39,1.0,0.0,0.03,0.0,0.0,0.0,0.0,0.0
4,4,0,tcp,http,SF,217,2032,0,0,0,...,49,49,1.0,0.0,0.02,0.0,0.0,0.0,0.0,0.0


In [27]:
# Remove the "Unnamed: 0" column (presumably an index saved into the CSV by an
# earlier export; verify against the producing notebook).
# errors="ignore" keeps this cell idempotent: re-running it after the column
# has already been removed no longer raises KeyError.
df = df.drop(columns="Unnamed: 0", errors="ignore")
df.head()

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate
0,0,tcp,http,SF,181,5450,0,0,0,0,...,9,9,1.0,0.0,0.11,0.0,0.0,0.0,0.0,0.0
1,0,tcp,http,SF,239,486,0,0,0,0,...,19,19,1.0,0.0,0.05,0.0,0.0,0.0,0.0,0.0
2,0,tcp,http,SF,235,1337,0,0,0,0,...,29,29,1.0,0.0,0.03,0.0,0.0,0.0,0.0,0.0
3,0,tcp,http,SF,219,1337,0,0,0,0,...,39,39,1.0,0.0,0.03,0.0,0.0,0.0,0.0,0.0
4,0,tcp,http,SF,217,2032,0,0,0,0,...,49,49,1.0,0.0,0.02,0.0,0.0,0.0,0.0,0.0


In [28]:
# keep=False flags every member of a duplicate group, so this count includes
# the first occurrence of each duplicated row as well as its repeats.
n_rows_in_duplicate_groups = df.duplicated(keep=False).sum()
print(n_rows_in_duplicate_groups)

# Keep one row per duplicate group (modifies `df` in place), then report the
# resulting (rows, columns).
df.drop_duplicates(inplace=True)
print(df.shape)

367816
(145583, 41)


In [29]:
# Summary statistics with one row per column (transposed for readability).
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
duration,145583.0,132.027902,1224.17,0.0,0.0,0.0,0.0,58329.0
src_bytes,145583.0,7995.85817,1820402.0,0.0,0.0,147.0,288.0,693375640.0
dst_bytes,145583.0,2859.827384,60810.42,0.0,0.0,105.0,1164.5,5155468.0
land,145583.0,0.000137,0.0117201,0.0,0.0,0.0,0.0,1.0
wrong_fragment,145583.0,0.020202,0.2393704,0.0,0.0,0.0,0.0,3.0
urgent,145583.0,4.8e-05,0.01015049,0.0,0.0,0.0,0.0,3.0
hot,145583.0,0.100177,1.426813,0.0,0.0,0.0,0.0,30.0
num_failed_logins,145583.0,0.000515,0.02858572,0.0,0.0,0.0,0.0,5.0
logged_in,145583.0,0.491493,0.4999293,0.0,0.0,0.0,1.0,1.0
lnum_compromised,145583.0,0.026178,3.311407,0.0,0.0,0.0,0.0,884.0


In [30]:
# A column whose minimum equals its maximum holds a single value and carries
# no information, so collect those columns and drop them in place.
constant_cols = []
for col in df.columns:
    if df[col].min() == df[col].max():
        constant_cols.append(col)

df.drop(columns=constant_cols, inplace=True)

In [31]:
# Display the current (rows, columns) of `df`.
df.shape

(145583, 39)

In [32]:
# Summarise the text columns (count / unique / top / freq), one row per column.
# Selecting only "object" relies on deprecated behaviour under pandas 3, where
# text columns default to the `str` dtype (this is what triggered the
# string-migration warning). Asking for both "object" and "string" selects the
# same columns on pandas 2 and pandas 3.
df.describe(include=["object", "string"]).T

See https://pandas.pydata.org/docs/user_guide/migration-3-strings.html#string-migration-select-dtypes for details on how to write code that works with pandas 2 and 3.
  df.describe(include="object").T


Unnamed: 0,count,unique,top,freq
protocol_type,145583,3,tcp,130912
service,145583,66,http,62053
flag,145583,11,SF,87456


In [33]:
# Categorical columns grouped by number of distinct values; each group is
# routed to its own encoder in the ColumnTransformer.
low_cardinality = ['protocol_type']
medium_cardinality = ['flag']
high_cardinality = ['service']

In [34]:
def get_numerical(df):
    """Return the names of int64/float64 columns with more than two distinct values.

    Missing values are not counted as a distinct value. Column order follows
    the order of the columns in ``df``.
    """
    candidate_names = df.select_dtypes(include=['int64', 'float64']).columns
    return [
        name
        for name in candidate_names
        if df[name].nunique(dropna=True) > 2
    ]

def get_binary(df):
    """Return the names of int64/float64 columns with exactly two distinct values.

    Missing values are not counted as a distinct value. Column order follows
    the order of the columns in ``df``.
    """
    candidate_names = df.select_dtypes(include=['int64', 'float64']).columns
    return [
        name
        for name in candidate_names
        if df[name].nunique(dropna=True) == 2
    ]

In [35]:
# Split the numeric columns into continuous-like (>2 distinct values) and
# binary (exactly 2 distinct values) groups for separate preprocessing.
numerical_cols, binary_cols = get_numerical(df), get_binary(df)

In [36]:
def _to_float(X):
    """Cast the incoming columns to float before the log transform."""
    return X.astype(float)


# Numeric branch: cast to float -> log1p -> robust scaling (median / IQR).
# NOTE: log1p yields NaN for inputs below -1; the columns fed here are assumed
# to be non-negative counts/rates. Verify if new columns are added.
log_pipeline = Pipeline(
    steps=[
        # A named function is used instead of a lambda: a lambda inside
        # FunctionTransformer makes the fitted pipeline impossible to pickle.
        # feature_names_out='one-to-one' lets get_feature_names_out() work
        # through these steps.
        ('to_float', FunctionTransformer(_to_float, validate=False,
                                         feature_names_out='one-to-one')),
        ('log', FunctionTransformer(np.log1p, validate=True,
                                    feature_names_out='one-to-one')),
        ('scaler', RobustScaler())
    ]
)

# Column-wise preprocessing:
#   * numeric columns      -> log1p + RobustScaler (log_pipeline)
#   * binary columns       -> passed through unchanged
#   * low/medium-card cats -> one-hot encoding
#   * high-card category   -> binary encoding (category_encoders)
# Any column not listed is dropped.
#
# The one-hot encoders deliberately do NOT use drop='first'. Combined with
# handle_unknown='ignore', a category unseen during fit is encoded as all
# zeros, which is exactly the encoding of the dropped baseline category.
# Keeping every level leaves the all-zero row as an unambiguous "unknown"
# marker, and the downstream tree-based model does not need a dropped level
# to avoid collinearity.
preprocessor = ColumnTransformer(
    transformers=[
        ('num_log_scale', log_pipeline, numerical_cols),

        ('binary_pass', 'passthrough', binary_cols),

        ('low_ohe', OneHotEncoder(sparse_output=False, handle_unknown='ignore'),
         low_cardinality),

        ('medium_ohe', OneHotEncoder(sparse_output=False, handle_unknown='ignore'),
         medium_cardinality),

        ('high_binary', ce.BinaryEncoder(), high_cardinality)
    ],
    remainder='drop'
)

In [37]:
# Re-parse the numeric columns; errors='coerce' turns any value that cannot
# be parsed as a number into NaN instead of raising.
numeric_block = df[numerical_cols]
df[numerical_cols] = numeric_block.apply(pd.to_numeric, errors='coerce')

In [38]:
# Hold out 20% of the rows; the fixed seed makes the split reproducible.
X_train, X_test = train_test_split(
    df,
    random_state=42,
    test_size=0.2,
)

In [39]:
# Fit the preprocessing on the training rows only, then apply the already
# fitted transformers to the held-out rows. Tuple elements are evaluated left
# to right, so fitting happens before the test transform.
X_train_final, X_test_final = (
    preprocessor.fit_transform(X_train),
    preprocessor.transform(X_test),
)

In [40]:
# Isolation Forest with 100 trees; contamination=0.1 sets the expected share
# of anomalies used to derive the decision threshold. Seeded for reproducibility.
iso_forest = IsolationForest(
    random_state=42,
    contamination=0.1,
    n_estimators=100,
)