## **Scaling and Normalization**

- Robust Scaler 
- Log Scaler 

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import math 
from scipy import stats

**Data**

In [2]:
# Load the feature-engineered CSV into `data`.
# index_col=False tells pandas not to use any file column as the index.
try:
    data = pd.read_csv('data_versions/03_feature_engineered_data.csv', index_col=False)
    # Note: this counts every column in the file, not only model features.
    print(f"Successfully loaded {len(data.columns)} features")
except Exception as e:
    print(f"Error loading data: {e}")
    # Re-raise: every later cell needs `data`, so carrying on after a failed
    # load would only surface as a confusing NameError further down.
    raise

Successfully loaded 88 features


**Analysis** 

In [3]:
# Drop the first misc column ('Unnamed: 0' -- presumably a row index saved by
# an earlier to_csv call; confirm against the step that wrote the file).
# Rebinding with errors='ignore' keeps this cell safe to re-run: a second run
# is a no-op instead of a KeyError on the already-removed column.
data = data.drop(columns='Unnamed: 0', errors='ignore')

*Functions*

In [12]:
# Define Scaling functions
from sklearn.preprocessing import RobustScaler

def robust_scale_features(df):
    """
    Scales every column of the DataFrame using scikit-learn's RobustScaler.

    A fresh RobustScaler with default settings is fitted on `df` itself and
    then used to transform it, so the scaling statistics come from the data
    passed in. The input frame is not modified.

    Parameters:
        df (pd.DataFrame): The input DataFrame. All columns must be numeric.
    Returns:
        pd.DataFrame: The scaled features, with the same column names and the
            same index as `df`.
    """
    scaler = RobustScaler()
    scaled_values = scaler.fit_transform(df)
    # fit_transform returns a bare array; re-attach the caller's index as well
    # as the column names so that index-aligned operations on the result
    # (e.g. assigning a label Series) match the right rows.
    scaled_df = pd.DataFrame(scaled_values, columns=df.columns, index=df.index)
    return scaled_df


def log_transform_features(df):
    """
    Applies a log(1 + x) transformation to every value in the DataFrame.

    Values that are not >= 0 (negative numbers and NaN) are mapped to 0
    instead of being transformed. The input frame is not modified.

    Parameters:
        df (pd.DataFrame): The input DataFrame. All columns must be numeric.
    Returns:
        pd.DataFrame: The log-transformed values, with the same columns and
            index as `df`.
    """
    # Replace everything that fails `x >= 0` with 0 first; log1p(0) == 0, so
    # those cells end up as 0 exactly as in a per-element check. Doing this
    # on whole columns avoids the deprecated DataFrame.applymap and a Python
    # call per cell.
    non_negative = df.where(df >= 0, 0)
    return np.log1p(non_negative)


In [9]:
# Separate the label and features before transformation.
# Select the target by name rather than by position so that `y` and `x`
# always refer to the same column, wherever 'Label' sits in the frame.
y = data['Label']
x = data.drop(columns='Label')

In [20]:
# Fit a RobustScaler on the feature frame and keep the scaled copy
robust_scaled_data = robust_scale_features(df=x)

# Preview five random rows of the robust-scaled features
robust_scaled_data.sample(n=5)

Unnamed: 0,Flow ID,Src IP,Src Port,Dst IP,Dst Port,Protocol,Flow Duration,Total Fwd Packet,Total Bwd packets,Total Length of Fwd Packet,...,Idle Std,Idle Max,Idle Min,Total TCP Flow Time,Year,Month,Day,Hour,Minute,Second
1529019,0.283499,0.0,-1.447599,0.0,0.0,0.0,78.93145,0.5,1.0,0.919014,...,0.0,0.0,0.0,84.191753,0.0,0.0,0.0,0.454545,-0.172414,0.535714
2344960,0.451756,0.0,-0.029185,1.0,0.0,0.0,-0.305953,-2.5,-4.0,-0.915493,...,0.0,0.0,0.0,1985.899311,0.0,-1.5,15.0,-0.818182,-0.482759,0.5
2436031,0.167254,0.0,0.481219,1.0,0.0,0.0,15.229948,0.5,1.0,0.919014,...,0.0,0.0,0.0,16.252051,0.0,-1.5,15.0,-0.727273,0.068966,-0.357143
801730,0.355041,0.0,0.614108,0.0,0.0,0.0,78.910574,0.5,1.0,0.919014,...,0.0,0.0,0.0,84.169488,0.0,0.0,0.0,0.181818,-0.310345,-0.714286
1346055,-0.635189,-0.167571,-0.165864,-5.7e-05,1010.0,0.0,-0.301849,-2.5,-4.0,-0.915493,...,0.0,0.0,0.0,-0.31311,0.0,0.0,0.0,0.363636,0.689655,0.142857


In [21]:
# Merge the target column back into the robust-scaled features.
# Column assignment aligns `y` to the frame by index label, so the two must
# share the same index for each label to land on the right row.
robust_scaled_data['Label'] = y

In [14]:
# Apply log transformation to the feature frame
try:
    data_log_transformed = log_transform_features(x)
    print("Successfully log transformed!")
except Exception as e:
    print(f"Error {e}")
    # Re-raise: the preview below and the later cells need
    # `data_log_transformed`, so a swallowed failure would only resurface as
    # a NameError.
    raise

# sample data after log transformation
data_log_transformed.sample(5)

Successfully log transformed!


Unnamed: 0,Flow ID,Src IP,Src Port,Dst IP,Dst Port,Protocol,Flow Duration,Total Fwd Packet,Total Bwd packets,Total Length of Fwd Packet,...,Idle Std,Idle Max,Idle Min,Total TCP Flow Time,Year,Month,Day,Hour,Minute,Second
416646,0.002108,0.155101,10.573213,1.2e-05,8.997271,1.94591,10.050182,1.94591,1.791759,6.300786,...,0.0,0.0,0.0,10.050182,7.612831,2.564949,1.94591,2.890372,3.610918,4.025352
1260859,0.003123,0.155101,9.502861,1.2e-05,8.997271,1.94591,10.902869,1.94591,1.791759,6.23637,...,0.0,0.0,0.0,10.902869,7.612831,2.564949,1.94591,3.091042,2.70805,4.060443
52583,0.036101,0.155101,10.599206,0.000936,7.539559,1.94591,6.09357,1.94591,1.609438,6.063785,...,0.0,0.0,0.0,6.09357,7.612831,1.386294,3.044522,2.772589,3.637586,3.663562
3093713,0.005996,0.155101,10.486625,1.098612,7.539559,1.94591,6.50279,1.94591,1.791759,6.949856,...,0.0,0.0,0.0,6.50279,7.613325,1.791759,2.079442,2.890372,3.044522,4.043051
361356,0.007009,0.000205,10.766778,0.0,9.11504,1.94591,15.793369,1.609438,0.0,0.0,...,0.0,0.0,0.0,15.793369,7.612831,2.564949,1.94591,2.890372,3.367296,1.94591


In [18]:
# Re-attach the target: place `y` as an extra column next to the
# log-transformed features (rows are matched on index).
data_log_scaled = pd.concat(objs=[data_log_transformed, y], axis="columns")

For now, we save only the log-transformed data; the robust-scaled data may be used later if required.

**Save to csv**

In [23]:
# Write the log-transformed features plus label to disk, without the index
output_path = 'data_versions/04_log_scaled_data.csv'
data_log_scaled.to_csv(output_path, index=False)