In [5]:
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, PowerTransformer, QuantileTransformer
from scipy.stats import entropy

# Continuous columns that already exist in the raw datasets and get scaled.
ORIGINAL_COLUMNS_TO_SCALE = [
    'flow_time',
    'header_size',
    'packet_duration',
    'overall_rate',
    'src_rate',
    'dst_rate',
    'fin_packets',
    'urg_packets',
    'rst_packets',
    'max_value',
    'value_covariance',
]

# Continuous columns produced by feature engineering; these are scaled as well.
NEW_FEATURES_TO_SCALE = [
    'rate_ratio',
    'syn_to_ack',
    'rst_to_fin',
    'avg_pkt_size',
    'mean_interpkt',
    'std_interpkt',
    'p90_interpkt',
    'burstiness',
    'payload_entropy',
    'value_range',
    'flows_last_10s',
    'unique_dsts_last_10s',
    'hour_sin',
    'hour_cos',
]

# Everything handed to a scaler: the original columns followed by the engineered ones.
COLUMNS_TO_SCALE = [*ORIGINAL_COLUMNS_TO_SCALE, *NEW_FEATURES_TO_SCALE]

# Engineered indicator-style columns that are deliberately left unscaled.
BINARY_FEATURES = [
    'handshake_complete',
    'abrupt_reset',
    'tcp_syn_ratio',
    'udp_psh',
]

# Scaler/transformer instances to try, keyed by the label that is appended
# to each output file name.
SCALERS = dict(
    StandardScaler=StandardScaler(),
    MinMaxScaler=MinMaxScaler(),
    RobustScaler=RobustScaler(),
    PowerTransformer=PowerTransformer(method='yeo-johnson'),
    QuantileTransformer=QuantileTransformer(output_distribution='normal'),
)

# Top-level de-duplication strategy folders expected under DEDUP_DIR.
MAIN_FOLDERS = [
    'Direct_Removal',
    'Instance_Weighting',
    'Train_Test_Aware',
]

# Root data directory and the sub-directory holding the de-duplicated datasets.
BASE_DIR = 'Data'
DEDUP_DIR = os.path.join(BASE_DIR, 'deduplicated_datasets')

def create_new_features(df):
    """
    Return a copy of ``df`` with the engineered flow features appended.

    The input frame is not modified. Most engineered values are rough
    approximations derived from the columns that are actually available
    (there is no per-packet, payload or timestamp data), so treat them as
    heuristics rather than measurements.

    Required input columns:
        flow_time, overall_rate, src_rate, dst_rate, fin_packets,
        urg_packets, rst_packets, max_value, value_covariance,
        fin_flags, syn_flags, rst_flags, psh_flags, ack_flags,
        protocol_tcp, protocol_udp

    Columns appended (in this order):
        rate_ratio, syn_to_ack, rst_to_fin, total_packets_est,
        avg_pkt_size, mean_interpkt, std_interpkt, p90_interpkt,
        burstiness, payload_entropy, value_range, flows_last_10s,
        unique_dsts_last_10s, hour_sin, hour_cos, handshake_complete,
        abrupt_reset, tcp_syn_ratio, udp_psh

    Raises:
        KeyError: if any required input column is missing; the message
            lists every missing column.
    """
    required_columns = (
        'flow_time', 'overall_rate', 'src_rate', 'dst_rate',
        'fin_packets', 'urg_packets', 'rst_packets',
        'max_value', 'value_covariance',
        'fin_flags', 'syn_flags', 'rst_flags', 'psh_flags', 'ack_flags',
        'protocol_tcp', 'protocol_udp',
    )
    missing = [col for col in required_columns if col not in df.columns]
    if missing:
        # Fail up front with the full list instead of on the first lookup.
        raise KeyError(
            f"create_new_features: missing required columns: {missing}"
        )

    df_copy = df.copy()

    # Small epsilon value to avoid division by zero
    epsilon = 1e-10

    # 1. Continuous features that need scaling

    # rate_ratio: (src_rate + eps) / (dst_rate + eps)
    df_copy['rate_ratio'] = (df_copy['src_rate'] + epsilon) / (df_copy['dst_rate'] + epsilon)

    # syn_to_ack: (syn_flags + 1) / (ack_flags + 1) - flags stand in for packet counts
    df_copy['syn_to_ack'] = (df_copy['syn_flags'] + 1) / (df_copy['ack_flags'] + 1)

    # rst_to_fin: (rst_packets + 1) / (fin_packets + 1)
    df_copy['rst_to_fin'] = (df_copy['rst_packets'] + 1) / (df_copy['fin_packets'] + 1)

    # Estimated packet count: sum of the available per-type counts plus a
    # base of 2. This helper column is kept in the output.
    df_copy['total_packets_est'] = (
        df_copy['fin_packets'] + df_copy['urg_packets'] + df_copy['rst_packets'] + 2
    )

    # Denominator shared by every per-packet feature. Clipping to the stated
    # minimum of 2 keeps the division finite and positive even when the
    # input counts are zero or negative.
    packets = df_copy['total_packets_est'].clip(lower=2)

    # avg_pkt_size: approximate volume (rate * duration) per estimated packet
    df_copy['avg_pkt_size'] = (df_copy['overall_rate'] * df_copy['flow_time']) / packets

    # Inter-packet features using the estimated packet count
    avg_gap = df_copy['flow_time'] / packets
    df_copy['mean_interpkt'] = avg_gap
    df_copy['std_interpkt'] = avg_gap * (df_copy['value_covariance'].clip(lower=0.1))
    df_copy['p90_interpkt'] = avg_gap * 1.5  # Rough estimate for 90th percentile

    # Burstiness: rough approximation from value_covariance
    df_copy['burstiness'] = 2.0 + df_copy['value_covariance']

    # payload_entropy: approximated from value_covariance (floored at 0) plus 1
    df_copy['payload_entropy'] = df_copy['value_covariance'].clip(lower=0) + 1

    # value_range: no min_value column is available, so approximate from max_value
    df_copy['value_range'] = df_copy['max_value'] * 0.8

    # flows_last_10s and unique_dsts_last_10s: approximations from src_rate
    df_copy['flows_last_10s'] = (df_copy['src_rate'] * 10).clip(lower=1)
    df_copy['unique_dsts_last_10s'] = (df_copy['flows_last_10s'] * 0.7).clip(lower=1)

    # Time-based cyclical features: constant placeholders, no time data is used
    df_copy['hour_sin'] = 0  # Default placeholder
    df_copy['hour_cos'] = 1  # Default placeholder

    # 2. Binary features (no scaling needed)

    # handshake_complete: 1 when both syn_flags and ack_flags are > 0
    df_copy['handshake_complete'] = ((df_copy['syn_flags'] > 0) &
                                    (df_copy['ack_flags'] > 0)).astype(int)

    # abrupt_reset: 1 when rst_flags > 0 and fin_flags == 0
    df_copy['abrupt_reset'] = ((df_copy['rst_flags'] > 0) &
                              (df_copy['fin_flags'] == 0)).astype(int)

    # tcp_syn_ratio: syn_flags * protocol_tcp
    df_copy['tcp_syn_ratio'] = df_copy['syn_flags'] * df_copy['protocol_tcp']

    # udp_psh: psh_flags * protocol_udp
    df_copy['udp_psh'] = df_copy['psh_flags'] * df_copy['protocol_udp']

    return df_copy

def apply_scaling(dataframe, scaler, columns_to_scale):
    """
    Return a copy of ``dataframe`` with the requested columns scaled.

    Only the names in ``columns_to_scale`` that are actually present in the
    frame are used; the rest are silently ignored. ``scaler`` is fitted on
    those columns via ``fit_transform`` (so the scaler object itself is
    updated), and the transformed values overwrite the columns in the copy.
    The input frame is left untouched.
    """
    result = dataframe.copy()

    # Keep the caller's ordering, dropping names the frame does not have.
    present = [name for name in columns_to_scale if name in result.columns]
    if not present:
        return result

    # Fit on the raw values and write the transformed values straight back.
    result[present] = scaler.fit_transform(result[present].values)
    return result

def _engineer_and_scale_file(file_path, base_name, feature_added_path, scaled_path):
    """Add features to one CSV, save that intermediate, then save one scaled copy per scaler."""
    df_with_features = create_new_features(pd.read_csv(file_path))

    # Intermediate output: the engineered (unscaled) frame.
    feature_added_path_file = os.path.join(
        feature_added_path, f"{base_name}_with_features.csv"
    )
    df_with_features.to_csv(feature_added_path_file, index=False)
    print(f"Created: {feature_added_path_file}")

    # One output file per scaler; each is fitted on this file's own data.
    for scaler_name, scaler in SCALERS.items():
        scaled_df = apply_scaling(df_with_features, scaler, COLUMNS_TO_SCALE)
        output_path = os.path.join(scaled_path, f"{base_name}_{scaler_name}.csv")
        scaled_df.to_csv(output_path, index=False)
        print(f"Created: {output_path}")


def process_files():
    """
    Run feature engineering and scaling over the dataset directory tree.

    For every folder in MAIN_FOLDERS, each sub-directory of
    ``<DEDUP_DIR>/<folder>/outlier_handled_datasets`` is treated as one
    outlier-handling method. Inside it, every ``.csv`` file whose name
    contains ``X_`` is read, extended with the engineered features (saved
    under ``feature_added_datasets``) and then scaled once per entry in
    SCALERS (saved under ``scaled_datasets``).

    A failure on one file is reported with ``print`` and does not stop the
    remaining files from being processed.
    """
    for main_folder in MAIN_FOLDERS:
        outlier_path = os.path.join(DEDUP_DIR, main_folder, 'outlier_handled_datasets')

        if not os.path.exists(outlier_path):
            print(f"Path not found: {outlier_path}")
            continue

        for outlier_method in os.listdir(outlier_path):
            method_path = os.path.join(outlier_path, outlier_method)

            # Only directories are outlier-method folders.
            if not os.path.isdir(method_path):
                continue

            # Output folders are created up front, even if no file matches.
            scaled_path = os.path.join(method_path, 'scaled_datasets')
            feature_added_path = os.path.join(method_path, 'feature_added_datasets')
            for out_dir in (scaled_path, feature_added_path):
                os.makedirs(out_dir, exist_ok=True)

            # The filter matches any CSV with 'X_' in its name, not only
            # training files.
            # NOTE(review): each matching file gets its own fit_transform,
            # so different splits are scaled with independently fitted
            # scalers - confirm this is intended.
            for file in os.listdir(method_path):
                if not (file.endswith('.csv') and 'X_' in file):
                    continue

                file_path = os.path.join(method_path, file)
                try:
                    _engineer_and_scale_file(
                        file_path,
                        os.path.splitext(file)[0],
                        feature_added_path,
                        scaled_path,
                    )
                except Exception as e:
                    print(f"Error processing {file_path}: {e}")

# Entry point: run the whole pipeline only when this code is executed
# directly (not when it is imported), with a start and end message.
if __name__ == "__main__":
    print("Starting to process files for feature engineering and scaling...")
    process_files()
    print("Feature engineering and scaling process complete!")

Starting to process files for feature engineering and scaling...
Created: Data\deduplicated_datasets\Direct_Removal\outlier_handled_datasets\Direct_Removal\feature_added_datasets\phase2_Direct_Removal_X_train_DirectRemoval_with_features.csv
Created: Data\deduplicated_datasets\Direct_Removal\outlier_handled_datasets\Direct_Removal\scaled_datasets\phase2_Direct_Removal_X_train_DirectRemoval_StandardScaler.csv
Created: Data\deduplicated_datasets\Direct_Removal\outlier_handled_datasets\Direct_Removal\scaled_datasets\phase2_Direct_Removal_X_train_DirectRemoval_MinMaxScaler.csv
Created: Data\deduplicated_datasets\Direct_Removal\outlier_handled_datasets\Direct_Removal\scaled_datasets\phase2_Direct_Removal_X_train_DirectRemoval_RobustScaler.csv
Created: Data\deduplicated_datasets\Direct_Removal\outlier_handled_datasets\Direct_Removal\scaled_datasets\phase2_Direct_Removal_X_train_DirectRemoval_PowerTransformer.csv
Created: Data\deduplicated_datasets\Direct_Removal\outlier_handled_datasets\Direc

In [3]:
import os

# Top-level de-duplication strategy folders expected under DEDUP_DIR.
MAIN_FOLDERS = [
    'Direct_Removal',
    'Instance_Weighting',
    'Train_Test_Aware',
]

# Root data directory and the sub-directory holding the de-duplicated datasets.
BASE_DIR = 'Data'
DEDUP_DIR = os.path.join(BASE_DIR, 'deduplicated_datasets')

def recreate_scaled_directories():
    """
    Create a ``scaled_datasets`` folder inside every outlier-method directory.

    For each folder in MAIN_FOLDERS, every sub-directory of
    ``<DEDUP_DIR>/<folder>/outlier_handled_datasets`` gets a
    ``scaled_datasets`` child if it does not already have one. Progress is
    reported with ``print``, followed by the total number of directories
    created. A directory that cannot be created is reported and skipped.

    Returns:
        None
    """
    directories_created = 0

    print("Starting to recreate scaled_datasets directories...")

    for main_folder in MAIN_FOLDERS:
        outlier_path = os.path.join(DEDUP_DIR, main_folder, 'outlier_handled_datasets')

        # Skip if path doesn't exist
        if not os.path.exists(outlier_path):
            print(f"Path not found: {outlier_path}")
            continue

        # Loop through each outlier handling method folder
        for outlier_method in os.listdir(outlier_path):
            method_path = os.path.join(outlier_path, outlier_method)

            # Skip if not a directory
            if not os.path.isdir(method_path):
                continue

            scaled_path = os.path.join(method_path, 'scaled_datasets')

            # Attempt the creation and let the OS report "already exists",
            # rather than checking first and racing with other writers.
            try:
                os.makedirs(scaled_path)
            except FileExistsError:
                print(f"Directory already exists: {scaled_path}")
            except OSError as e:
                print(f"Error creating directory {scaled_path}: {e}")
            else:
                print(f"Created directory: {scaled_path}")
                directories_created += 1

    print(f"Recreation process complete! Total directories created: {directories_created}")

# Entry point: recreate the directories only when this code is executed
# directly, not when it is imported.
if __name__ == "__main__":
    recreate_scaled_directories()

Starting to recreate scaled_datasets directories...
Created directory: Data\deduplicated_datasets\Direct_Removal\outlier_handled_datasets\Direct_Removal\scaled_datasets
Created directory: Data\deduplicated_datasets\Direct_Removal\outlier_handled_datasets\Isolation_Forest\scaled_datasets
Created directory: Data\deduplicated_datasets\Direct_Removal\outlier_handled_datasets\Log1p_Winsorization\scaled_datasets
Created directory: Data\deduplicated_datasets\Direct_Removal\outlier_handled_datasets\Winsorization\scaled_datasets
Created directory: Data\deduplicated_datasets\Direct_Removal\outlier_handled_datasets\Z-Score_Trimming\scaled_datasets
Created directory: Data\deduplicated_datasets\Instance_Weighting\outlier_handled_datasets\Direct_Removal\scaled_datasets
Created directory: Data\deduplicated_datasets\Instance_Weighting\outlier_handled_datasets\Isolation_Forest\scaled_datasets
Created directory: Data\deduplicated_datasets\Instance_Weighting\outlier_handled_datasets\Log1p_Winsorization\s