In [16]:
# Cell 1: Import libraries and define helper functions
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.model_selection import train_test_split, GroupShuffleSplit
from sklearn.preprocessing import LabelEncoder, PowerTransformer
from sklearn.cluster import DBSCAN

In [18]:
# Cell 2: Define the transform_skewed_features and check_skewness functions
def transform_skewed_features(data, transformation_map):
    """
    Apply the chosen skew-reducing transformation to each mapped column.

    Parameters:
    -----------
    data : pandas DataFrame
        The dataframe containing the skewed columns. It is never modified.
    transformation_map : dict
        Dictionary mapping column names to their best transformation method.
        Recognised methods are 'Yeo–Johnson', 'Log1p', 'Combined'
        (Yeo-Johnson followed by log1p), 'Box-Cox + log1p' and 'Original'
        (leave the column untouched). Names are matched case-insensitively
        and an ASCII hyphen is accepted in place of the en dash, so
        'Yeo-Johnson' is equivalent to 'Yeo–Johnson'.

    Returns:
    --------
    transformed_data : pandas DataFrame
        A copy of the original dataframe with transformed columns. Missing
        values stay missing. Columns that are absent from `data`, contain no
        non-missing values, or are mapped to 'Original' are left unchanged.
        An unrecognised method name emits a warning and leaves the column
        unchanged.

    Notes:
    ------
    * Every PowerTransformer is fitted on `data` itself and then discarded;
      the fitted parameters are not returned.
    * np.log1p yields NaN for inputs below -1 and -inf at exactly -1, so the
      two-step methods may produce non-finite values if the first step
      outputs such numbers.
    """
    import warnings  # local import keeps the function self-contained

    known_methods = ('yeo-johnson', 'log1p', 'combined', 'box-cox + log1p')
    transformed_data = data.copy()

    for column, transform_method in transformation_map.items():
        if column not in data.columns:
            continue

        # Normalise the label so that the en dash / hyphen spelling and the
        # letter case cannot silently disable a transformation.
        method = str(transform_method).replace('\u2013', '-').strip().lower()
        if method == 'original':
            continue
        if method not in known_methods:
            warnings.warn(
                f"Unknown transformation {transform_method!r} for column "
                f"{column!r}; column left unchanged.",
                stacklevel=2,
            )
            continue

        # Work positionally (boolean mask) rather than by index label, so a
        # non-unique index cannot misalign the written-back values.
        present = data[column].notna().to_numpy()
        values = data[column].to_numpy()[present]
        if values.size == 0:
            # Nothing to fit on (empty or all-missing column).
            continue
        if values.dtype.kind != 'f':
            values = values.astype(np.float64)

        if method == 'log1p':
            result = np.log1p(values)

        elif method == 'yeo-johnson':
            pt = PowerTransformer(method='yeo-johnson', standardize=False)
            result = pt.fit_transform(values.reshape(-1, 1)).ravel()

        elif method == 'combined':
            # Apply Yeo-Johnson first, then log1p
            pt = PowerTransformer(method='yeo-johnson', standardize=False)
            result = np.log1p(pt.fit_transform(values.reshape(-1, 1)).ravel())

        else:  # 'box-cox + log1p'
            # Box-Cox requires strictly positive input: shift the data so the
            # minimum becomes 1 whenever a non-positive value is present.
            min_val = values.min()
            shifted = values - min_val + 1 if min_val <= 0 else values
            pt = PowerTransformer(method='box-cox', standardize=False)
            result = np.log1p(pt.fit_transform(shifted.reshape(-1, 1)).ravel())

        # Replace the whole column with a float array instead of assigning
        # floats into the existing (possibly integer) column in place.
        new_column = np.full(len(data), np.nan, dtype=result.dtype)
        new_column[present] = result
        transformed_data[column] = new_column

    return transformed_data


def check_skewness(original_data, transformed_data, columns):
    """
    Compare per-column skewness before and after transformation.

    Parameters:
    -----------
    original_data : pandas DataFrame
        Dataframe holding the untransformed columns.
    transformed_data : pandas DataFrame
        Dataframe holding the transformed columns.
    columns : iterable
        Column names to report on; names missing from either dataframe
        are skipped.

    Returns:
    --------
    pandas DataFrame
        One row per reported column with 'Feature', 'Original skew',
        'Transformed skew' and 'Improvement' (reduction in absolute skew).
    """
    def _summarise(name):
        # Skewness of one column in both frames plus the absolute reduction.
        before = original_data[name].skew()
        after = transformed_data[name].skew()
        return {
            'Feature': name,
            'Original skew': before,
            'Transformed skew': after,
            'Improvement': abs(before) - abs(after),
        }

    shared = (
        name for name in columns
        if name in original_data.columns and name in transformed_data.columns
    )
    return pd.DataFrame([_summarise(name) for name in shared])

In [20]:
# Cell 3: Load dataset and define constants
# --- Configuration ---------------------------------------------------------
# Location of the raw CSV (adjust as needed) and the directory used for saving
dataset_path = "Data/phase2_students_before_cleaning.csv"
base_dir = Path(dataset_path).parent

# Name of the target column (change to your actual target)
target_column = 'label'

# Numerical features on which deduplication is based
numeric_cols = [
    'flow_time',
    'header_size',
    'packet_duration',
    'overall_rate',
    'src_rate',
    'dst_rate',
    'fin_packets',
    'urg_packets',
    'rst_packets',
    'max_value',
    'value_covariance',
]

# Best transformation chosen for each numerical column
best_transforms = {
    'flow_time':        'Combined',
    'header_size':      'Yeo–Johnson',
    'packet_duration':  'Log1p',
    'overall_rate':     'Yeo–Johnson',
    'src_rate':         'Yeo–Johnson',
    'dst_rate':         'Yeo–Johnson',
    'fin_packets':      'Combined',
    'urg_packets':      'Combined',
    'rst_packets':      'Combined',
    'max_value':        'Box-Cox + log1p',
    'value_covariance': 'Combined',
}

# --- Data --------------------------------------------------------------------
# Read the dataset and split it into the feature matrix and the target vector
df = pd.read_csv(dataset_path)
X = df.drop(target_column, axis=1)
y = df[target_column]

# Encoder shared by the deduplication cells to turn labels into integers
label_encoder = LabelEncoder()

In [21]:
# Cell 4: Method 1 - Direct Removal
def direct_removal(X, y, subset=None):
    """
    Drop rows that duplicate an earlier row on the key features and the label.

    Parameters:
    -----------
    X : pandas DataFrame
        Feature matrix.
    y : pandas Series
        Target values sharing X's index. Its name is used as the label
        column; an unnamed series falls back to the notebook-level
        `target_column`.
    subset : list of str, optional
        Feature columns that define a duplicate. Defaults to the
        notebook-level `numeric_cols`.

    Returns:
    --------
    (X_dedup, y_dedup) : tuple
        Deduplicated features and labels. The first occurrence of each
        duplicate group is kept and the original index labels are preserved.
    """
    feature_keys = list(numeric_cols if subset is None else subset)
    label_name = y.name if y.name is not None else target_column

    combined = pd.concat([X, y.rename(label_name)], axis=1)
    dedup = combined.drop_duplicates(subset=feature_keys + [label_name])
    return dedup.drop(columns=[label_name]), dedup[label_name]

# Execute Method 1
print("Processing Direct_Removal...")
X_dedup, y_dedup = direct_removal(X, y)

# Encode the labels but keep them on the same index as X_dedup. Without the
# explicit index the encoded series gets a fresh 0..n-1 index while X_dedup
# keeps the gapped post-deduplication index, so the two are aligned only by
# position and any later label-based alignment would pair the wrong rows.
y_encoded = pd.Series(
    label_encoder.fit_transform(y_dedup),
    index=y_dedup.index,
    name=target_column,
)

# split with stratification (preserves class ratios)
X_train, X_test, y_train, y_test = train_test_split(
    X_dedup,
    y_encoded,
    test_size=0.2,
    random_state=42,
    stratify=y_encoded,
)

# Apply transformation only to training data
# NOTE(review): X_test is saved untransformed below, so the train and test
# features end up on different scales unless the test split is transformed
# somewhere else before evaluation - confirm.
applicable_transforms = {
    k: v for k, v in best_transforms.items() if k in X_train.columns
}
X_train_transformed = transform_skewed_features(X_train, applicable_transforms)

# Print skewness improvement for training data
# NOTE(review): the recorded output reports a transformed skew of exactly 0
# for 'dst_rate', which may mean the column collapsed to a constant - verify.
skew_comparison = check_skewness(
    X_train,
    X_train_transformed,
    list(applicable_transforms),
)
print("\nSkewness improvement for Direct_Removal:")
print(skew_comparison)

# Save transformed training data, the raw test features and both label sets
X_train_transformed.to_csv(base_dir / "phase2_Direct_Removal_X_train.csv", index=False)
X_test.to_csv(base_dir / "phase2_Direct_Removal_X_test.csv", index=False)
y_train.to_csv(base_dir / "phase2_Direct_Removal_y_train.csv", index=False)
y_test.to_csv(base_dir / "phase2_Direct_Removal_y_test.csv", index=False)


Processing Direct_Removal...

Skewness improvement for Direct_Removal:
             Feature  Original skew  Transformed skew  Improvement
0          flow_time     813.536437          1.336008   812.200429
1        header_size      83.332541          0.052562    83.279979
2    packet_duration      10.362799          7.842132     2.520666
3       overall_rate      35.407439          0.271302    35.136137
4           src_rate      35.407439          0.271302    35.136137
5           dst_rate     313.605366          0.000000   313.605366
6        fin_packets       3.916024          2.509267     1.406758
7        urg_packets      23.758800          2.093620    21.665179
8        rst_packets      12.467244          1.836099    10.631145
9          max_value       9.719423          0.962519     8.756904
10  value_covariance     102.462206          1.035220   101.426986


In [22]:
# Cell 7: Method 4 - Instance Weighting
def instance_weighting(X, y, subset=None):
    """
    Attach to every row the number of rows sharing its key features and label.

    Parameters:
    -----------
    X : pandas DataFrame
        Feature matrix.
    y : pandas Series
        Target values sharing X's index. Its name is used as the label
        column; an unnamed series falls back to the notebook-level
        `target_column`.
    subset : list of str, optional
        Feature columns that define a duplicate group. Defaults to the
        notebook-level `numeric_cols`.

    Returns:
    --------
    (X_w, y_w, sample_weights) : tuple
        Features, labels and per-row group sizes. All input rows are kept,
        in their original order, on a fresh 0..n-1 index.
    """
    label_name = y.name if y.name is not None else target_column
    key_columns = list(numeric_cols if subset is None else subset) + [label_name]

    combined = pd.concat([X, y.rename(label_name)], axis=1)

    # dropna=False keeps groups whose key contains NaN; with the default
    # (dropna=True) those rows get no weight and vanish from the result.
    weights = (
        combined.groupby(key_columns, dropna=False)
        .size()
        .reset_index(name='weight')
    )

    # Left merge: every input row is retained (null keys match null keys).
    weighted_df = combined.merge(weights, on=key_columns, how='left')

    X_w = weighted_df.drop(columns=[label_name, 'weight'])
    y_w = weighted_df[label_name]
    sample_weights = weighted_df['weight']
    return X_w, y_w, sample_weights

# Execute Method 4
print("Processing Instance_Weighting...")
X_w, y_w, w = instance_weighting(X, y)
y_encoded = pd.Series(label_encoder.fit_transform(y_w), name=target_column)

# Stratified 80/20 split of features, labels and weights (class ratios kept)
# NOTE(review): instance_weighting keeps every duplicate row and this split is
# per row, so identical rows can end up in both train and test - confirm that
# this is acceptable (GroupShuffleSplit is imported but not used in the
# visible cells).
(
    X_train, X_test,
    y_train, y_test,
    w_train, w_test,
) = train_test_split(
    X_w, y_encoded, w,
    test_size=0.2,
    random_state=42,
    stratify=y_encoded,
)

# Transform only the training split, restricted to columns that are present
applicable_transforms = {
    name: method
    for name, method in best_transforms.items()
    if name in X_train.columns
}
X_train_transformed = transform_skewed_features(X_train, applicable_transforms)

# Report how much the skewness of the training features changed
skew_comparison = check_skewness(
    X_train, X_train_transformed, list(applicable_transforms)
)
print("\nSkewness improvement for Instance_Weighting:")
print(skew_comparison)

# Persist the transformed training features, raw test features, labels and
# the sample weights of both splits
saved_outputs = [
    ("phase2_Instance_Weighting_X_train.csv", X_train_transformed),
    ("phase2_Instance_Weighting_X_test.csv", X_test),
    ("phase2_Instance_Weighting_y_train.csv", y_train),
    ("phase2_Instance_Weighting_y_test.csv", y_test),
    ("phase2_Instance_Weighting_weights_train.csv", pd.DataFrame({'weight': w_train})),
    ("phase2_Instance_Weighting_weights_test.csv", pd.DataFrame({'weight': w_test})),
]
for file_name, frame in saved_outputs:
    frame.to_csv(base_dir / file_name, index=False)


Processing Instance_Weighting...

Skewness improvement for Instance_Weighting:
             Feature  Original skew  Transformed skew  Improvement
0          flow_time     190.836679          1.460719   189.375960
1        header_size      90.252863          0.053950    90.198913
2    packet_duration      10.864631          8.302737     2.561894
3       overall_rate      22.719804          0.263820    22.455984
4           src_rate      22.719804          0.263820    22.455984
5           dst_rate     809.550333          0.000000   809.550333
6        fin_packets       3.515336          2.404278     1.111058
7        urg_packets      25.101142          2.106201    22.994941
8        rst_packets      13.193635          1.858091    11.335545
9          max_value      10.173905          0.918126     9.255780
10  value_covariance     110.565900          1.181848   109.384052


In [15]:
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.preprocessing import PowerTransformer

# Define path to training data only.
# The directory defaults to the original local location but can be overridden
# through the PHASE2_DATA_DIR environment variable, so the cell also runs on
# machines where that absolute Windows path does not exist.
import os

base_dir = Path(
    os.environ.get(
        "PHASE2_DATA_DIR",
        r"C:\Machine Learning\Phase 2\Data\deduplicated_datasets\Train_Test_Aware",
    )
)
X_train_path = base_dir / "phase2_TrainTestAware_X_train.csv"

# Define the transform_skewed_features function
def transform_skewed_features(data, transformation_map):
    """
    Apply the best transformation to each feature based on the provided mapping

    Parameters:
    -----------
    data : pandas DataFrame
        The dataframe containing the skewed columns. It is never modified.
    transformation_map : dict
        Maps column names to one of 'Yeo–Johnson', 'Log1p', 'Combined'
        (Yeo-Johnson followed by log1p), 'Box-Cox + log1p' or 'Original'
        (leave unchanged). Matching ignores letter case and treats an ASCII
        hyphen like the en dash.

    Returns:
    --------
    transformed_data : pandas DataFrame
        A copy of `data` with the mapped columns transformed. Missing values
        stay missing. Columns that are absent, hold no non-missing values or
        are mapped to 'Original' are left unchanged; an unrecognised method
        name emits a warning and leaves the column unchanged.

    Notes:
    ------
    The PowerTransformer objects are fitted on `data` and discarded. np.log1p
    yields NaN for inputs below -1 and -inf at exactly -1, so the two-step
    methods may produce non-finite values if the first step outputs such
    numbers.
    """
    import warnings  # local import keeps the function self-contained

    known_methods = ('yeo-johnson', 'log1p', 'combined', 'box-cox + log1p')
    transformed_data = data.copy()

    for column, transform_method in transformation_map.items():
        if column not in data.columns:
            continue

        # Normalise the label so that the en dash / hyphen spelling and the
        # letter case cannot silently disable a transformation.
        method = str(transform_method).replace('\u2013', '-').strip().lower()
        if method == 'original':
            continue
        if method not in known_methods:
            warnings.warn(
                f"Unknown transformation {transform_method!r} for column "
                f"{column!r}; column left unchanged.",
                stacklevel=2,
            )
            continue

        # Select the non-missing entries positionally so that a non-unique
        # index cannot misalign the written-back values.
        present = data[column].notna().to_numpy()
        values = data[column].to_numpy()[present]
        if values.size == 0:
            # Nothing to fit on (empty or all-missing column).
            continue
        if values.dtype.kind != 'f':
            values = values.astype(np.float64)

        if method == 'log1p':
            result = np.log1p(values)

        elif method == 'yeo-johnson':
            pt = PowerTransformer(method='yeo-johnson', standardize=False)
            result = pt.fit_transform(values.reshape(-1, 1)).ravel()

        elif method == 'combined':
            # Apply Yeo-Johnson first, then log1p
            pt = PowerTransformer(method='yeo-johnson', standardize=False)
            result = np.log1p(pt.fit_transform(values.reshape(-1, 1)).ravel())

        else:  # 'box-cox + log1p'
            # Box-Cox requires strictly positive input: shift the data so the
            # minimum becomes 1 whenever a non-positive value is present.
            min_val = values.min()
            shifted = values - min_val + 1 if min_val <= 0 else values
            pt = PowerTransformer(method='box-cox', standardize=False)
            result = np.log1p(pt.fit_transform(shifted.reshape(-1, 1)).ravel())

        # Replace the whole column with a float array instead of assigning
        # floats into the existing (possibly integer) column in place.
        new_column = np.full(len(data), np.nan, dtype=result.dtype)
        new_column[present] = result
        transformed_data[column] = new_column

    return transformed_data

def check_skewness(original_data, transformed_data, columns):
    """
    Tabulate the skewness of each requested column before and after
    transformation.

    Columns missing from either dataframe are skipped. The result has one row
    per reported column with 'Feature', 'Original skew', 'Transformed skew'
    and 'Improvement' (original absolute skew minus transformed absolute skew).
    """
    rows = []
    for name in columns:
        # Guard clause: report only columns available in both frames.
        if name not in original_data.columns or name not in transformed_data.columns:
            continue
        skew_before = original_data[name].skew()
        skew_after = transformed_data[name].skew()
        gain = abs(skew_before) - abs(skew_after)
        rows.append(
            {
                'Feature': name,
                'Original skew': skew_before,
                'Transformed skew': skew_after,
                'Improvement': gain,
            }
        )
    return pd.DataFrame(rows)

# Best transformation chosen for each column
best_transforms = {
    'flow_time':        'Combined',
    'header_size':      'Yeo–Johnson',
    'packet_duration':  'Log1p',
    'overall_rate':     'Yeo–Johnson',
    'src_rate':         'Yeo–Johnson',
    'dst_rate':         'Yeo–Johnson',
    'fin_packets':      'Combined',
    'urg_packets':      'Combined',
    'rst_packets':      'Combined',
    'max_value':        'Box-Cox + log1p',
    'value_covariance': 'Combined',
}

# Read the training features (this cell does not touch the test split)
print("Loading training data...")
X_train = pd.read_csv(X_train_path)
print(f"Loaded X_train with shape: {X_train.shape}")

# Restrict the mapping to columns that exist in X_train, then transform
print("Applying skewness transformations...")
applicable_transforms = {
    name: method
    for name, method in best_transforms.items()
    if name in X_train.columns
}
X_train_transformed = transform_skewed_features(X_train, applicable_transforms)

# Report skewness before and after for the transformed columns
skew_comparison = check_skewness(
    X_train, X_train_transformed, list(applicable_transforms)
)
print("\nSkewness improvement after transformations:")
print(skew_comparison)

# Make sure the output folder for transformed data exists
output_dir = base_dir.joinpath("transformed")
output_dir.mkdir(exist_ok=True)

# Write out the transformed training features only
print(f"\nSaving transformed training data to {output_dir}...")
X_train_transformed.to_csv(
    output_dir / "phase2_TrainTestAware_X_train_transformed.csv",
    index=False,
)

print("Done! Successfully applied skewness transformations to training data.")

Loading training data...
Loaded X_train with shape: (737580, 21)
Applying skewness transformations...

Skewness improvement after transformations:
             Feature  Original skew  Transformed skew  Improvement
0          flow_time     662.029643          1.440893   660.588751
1        header_size      87.959323          0.053391    87.905932
2    packet_duration      10.734725          8.226282     2.508443
3       overall_rate      31.262338          0.265660    30.996678
4           src_rate      31.262338          0.265660    30.996678
5           dst_rate     807.523598          0.000000   807.523598
6        fin_packets       3.576488          2.420061     1.156427
7        urg_packets      24.593968          2.098304    22.495664
8        rst_packets      13.012618          1.851864    11.160754
9          max_value      10.140544          0.924946     9.215598
10  value_covariance     107.381428          1.157036   106.224391

Saving transformed training data to C:\Machine L