In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture

In [3]:
df = pd.read_csv("../datasets/dataframe_for_preprocessing.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,...,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate
0,0,0,tcp,http,SF,181,5450,0,0,0,...,9,9,1.0,0.0,0.11,0.0,0.0,0.0,0.0,0.0
1,1,0,tcp,http,SF,239,486,0,0,0,...,19,19,1.0,0.0,0.05,0.0,0.0,0.0,0.0,0.0
2,2,0,tcp,http,SF,235,1337,0,0,0,...,29,29,1.0,0.0,0.03,0.0,0.0,0.0,0.0,0.0
3,3,0,tcp,http,SF,219,1337,0,0,0,...,39,39,1.0,0.0,0.03,0.0,0.0,0.0,0.0,0.0
4,4,0,tcp,http,SF,217,2032,0,0,0,...,49,49,1.0,0.0,0.02,0.0,0.0,0.0,0.0,0.0


In [6]:
df.info()

<class 'pandas.DataFrame'>
RangeIndex: 494020 entries, 0 to 494019
Data columns (total 42 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   Unnamed: 0                   494020 non-null  int64  
 1   duration                     494020 non-null  int64  
 2   protocol_type                494020 non-null  str    
 3   service                      494020 non-null  str    
 4   flag                         494020 non-null  str    
 5   src_bytes                    494020 non-null  int64  
 6   dst_bytes                    494020 non-null  int64  
 7   land                         494020 non-null  int64  
 8   wrong_fragment               494020 non-null  int64  
 9   urgent                       494020 non-null  int64  
 10  hot                          494020 non-null  int64  
 11  num_failed_logins            494020 non-null  int64  
 12  logged_in                    494020 non-null  int64  
 13  lnum_compr

In [9]:
df=df.drop('Unnamed: 0',axis=1)

In [12]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
duration,494020.0,47.9794,707.747185,0.0,0.0,0.0,0.0,58329.0
src_bytes,494020.0,3025.615744,988219.101225,0.0,45.0,520.0,1032.0,693375640.0
dst_bytes,494020.0,868.530774,33040.034672,0.0,0.0,0.0,0.0,5155468.0
land,494020.0,4.5e-05,0.006673,0.0,0.0,0.0,0.0,1.0
wrong_fragment,494020.0,0.006433,0.134805,0.0,0.0,0.0,0.0,3.0
urgent,494020.0,1.4e-05,0.00551,0.0,0.0,0.0,0.0,3.0
hot,494020.0,0.034519,0.782103,0.0,0.0,0.0,0.0,30.0
num_failed_logins,494020.0,0.000152,0.01552,0.0,0.0,0.0,0.0,5.0
logged_in,494020.0,0.148245,0.355343,0.0,0.0,0.0,0.0,1.0
lnum_compromised,494020.0,0.010212,1.798328,0.0,0.0,0.0,0.0,884.0


In [15]:
df.drop_duplicates(inplace=True)
print(df.shape)

(145583, 41)


In [19]:
const_cols = [col for col in df.columns if df[col].min()==df[col].max()]
df.drop(columns = const_cols,inplace=True)

In [21]:
df.shape

(145583, 39)

In [22]:
df.describe(include="object").T

See https://pandas.pydata.org/docs/user_guide/migration-3-strings.html#string-migration-select-dtypes for details on how to write code that works with pandas 2 and 3.
  df.describe(include="object").T


Unnamed: 0,count,unique,top,freq
protocol_type,145583,3,tcp,130912
service,145583,66,http,62053
flag,145583,11,SF,87456


In [30]:
low_cardinality = ["protocol_type"]
medium_cardinality = ['flag']
high_cardinality = ['service']

In [31]:
def get_numerical(df):
    numerical_cols = []
    for col in df.select_dtypes(include=['int64', 'float64']).columns:
        if df[col].nunique(dropna=True) > 2:
            numerical_cols.append(col)
    return numerical_cols

def get_binary(df):
    binary_cols = []
    for col in df.select_dtypes(include=['int64', 'float64']).columns:
        if df[col].nunique(dropna=True) == 2:
            binary_cols.append(col)
    return binary_cols

def convert_to_folat(X):
    return X.astype(float)

In [32]:
numerical_cols = get_numerical(df)
binary_cols = get_binary(df)

In [33]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, RobustScaler, FunctionTransformer
import category_encoders as ce

log_pipeline = Pipeline(
    steps=[
        ('to_float', FunctionTransformer(convert_to_folat, validate=False)),
        ('log', FunctionTransformer(np.log1p, validate=True)),
        ('scaler', RobustScaler())
    ]
)

preprocessor = ColumnTransformer(
    transformers=[
        
        ('num_log_scale', log_pipeline, numerical_cols),

        ('binary_pass','passthrough',binary_cols),
        ('low_ohe', OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore'),
         low_cardinality),

        ('medium_ohe', OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore'),
         medium_cardinality),

        ('high_binary', ce.BinaryEncoder(), high_cardinality)
    ],
    remainder='drop' 
)