In [1]:
import pandas as pd

In [2]:
data = pd.read_csv("Data\\phase2_students_before_cleaning.csv")

In [3]:
data.shape

(938583, 22)

In [4]:
data['label'].value_counts()

label
DDoS             687027
DoS              163428
Mirai             53395
BenignTraffic     21987
Recon              6433
MITM               6313
Name: count, dtype: int64

In [5]:
# Specify exactly which columns to include in the skewness report
numeric_cols = [
    'flow_time', 'header_size', 'packet_duration', 'overall_rate',
    'src_rate', 'dst_rate', 'fin_packets', 'urg_packets',
    'rst_packets', 'max_value', 'value_covariance'
]

def direct_removal_full(df: pd.DataFrame) -> pd.DataFrame:
    """Remove exact duplicate rows from the DataFrame."""
    return df.drop_duplicates()

def dedup_report(data: pd.DataFrame, numeric_cols: list) -> pd.DataFrame:
    """
    Prints a before/after deduplication report for the given numeric_cols
    (row counts and skewness), and returns the deduplicated DataFrame.
    """
    # --- Before deduplication ---
    print(f"Original data: {data.shape[0]} rows, {data.shape[1]} columns")
    print("\nSkewness before deduplication:")
    skew_before = data[numeric_cols].skew()
    print(skew_before)

    # --- Deduplicate ---
    data_dedup = direct_removal_full(data)
    print(f"\nAfter deduplication: {data_dedup.shape[0]} rows, {data_dedup.shape[1]} columns")

    # --- After deduplication skewness ---
    print("\nSkewness after deduplication:")
    skew_after = data_dedup[numeric_cols].skew()
    print(skew_after)

    # --- Comparison table ---
    skew_comparison = pd.DataFrame({
        'before': skew_before,
        'after':  skew_after
    })
    print("\nSkewness comparison (before vs after):")
    print(skew_comparison)

    return data_dedup

# --- Run report ---
deduped_data = dedup_report(data, numeric_cols)

# --- Final confirmation ---
print("\n✅ Deduplication and report generation completed.")

Original data: 938583 rows, 22 columns

Skewness before deduplication:
flow_time           746.386250
header_size          89.639891
packet_duration      10.794985
overall_rate         22.379839
src_rate             22.379839
dst_rate            901.341149
fin_packets           3.417392
urg_packets          25.058398
rst_packets          13.171135
max_value            10.114299
value_covariance    100.016322
dtype: float64

After deduplication: 842396 rows, 22 columns

Skewness after deduplication:
flow_time           707.107506
header_size          84.950717
packet_duration      10.310984
overall_rate         35.326969
src_rate             35.326969
dst_rate            853.907997
fin_packets           3.805031
urg_packets          23.740223
rst_packets          12.468954
max_value             9.635257
value_covariance     94.845896
dtype: float64

Skewness comparison (before vs after):
                      before       after
flow_time         746.386250  707.107506
header_size       

In [6]:
deduped_data.shape


(842396, 22)

In [7]:
import numpy as np

def apply_log1p_winsorization(data, numeric_cols, lower_percentile=0.05, upper_percentile=0.95, method='percentile'):
    """
    Apply Log1p transformation followed by Winsorization directly to numeric columns in the dataframe.
    
    Parameters:
    - data: pandas DataFrame containing the dataset
    - numeric_cols: list of numeric column names to transform
    - lower_percentile: lower percentile for winsorization (default 0.05 or 5%)
    - upper_percentile: upper percentile for winsorization (default 0.95 or 95%)
    - method: 'percentile' or 'iqr' for determining outlier bounds
    
    Returns:
    - report: dictionary containing skewness before and after transformation
    """
    # Create a copy of the data to avoid SettingWithCopyWarning
    data = data.copy()
    
    # Initialize report
    report = {
        'original_skewness': {},
        'transformed_skewness': {},
        'rows_before': len(data),
        'rows_after': len(data),
        'rows_deleted': 0,
        'bounds': {}
    }
    
    for col in numeric_cols:
        if col in data.columns:
            # Store original skewness
            orig_skew = data[col].skew()
            report['original_skewness'][col] = orig_skew
            
            # Step 1: Apply Log1p transformation (log(1+x))
            # Make sure we're handling non-negative values for log transformation
            # Use .loc to avoid SettingWithCopyWarning
            data.loc[:, col] = np.log1p(data[col].clip(lower=0))
            
            # Step 2: Apply Winsorization to clip extreme values
            if method == 'percentile':
                # Use direct percentile method
                lower_bound = data[col].quantile(lower_percentile)
                upper_bound = data[col].quantile(upper_percentile)
            elif method == 'iqr':
                # Use IQR method (1.5 * IQR rule)
                q1 = data[col].quantile(0.25)
                q3 = data[col].quantile(0.75)
                iqr = q3 - q1
                lower_bound = q1 - 1.5 * iqr
                upper_bound = q3 + 1.5 * iqr
            else:
                raise ValueError("Method must be either 'percentile' or 'iqr'")
            
            # Store bounds for reporting
            report['bounds'][col] = {
                'lower_bound': lower_bound,
                'upper_bound': upper_bound
            }
            
            # Apply clipping using .loc to avoid SettingWithCopyWarning
            data.loc[:, col] = data[col].clip(lower=lower_bound, upper=upper_bound)
            
            # Calculate new skewness
            new_skew = data[col].skew()
            report['transformed_skewness'][col] = new_skew
    
    # Generate summary report
    print("Skewness Report:")
    for col in numeric_cols:
        if col in report['original_skewness']:
            print(f"{col}: Original skewness = {report['original_skewness'][col]:.4f}, After transformation = {report['transformed_skewness'][col]:.4f}")
            print(f"  Bounds: Lower = {report['bounds'][col]['lower_bound']:.4f}, Upper = {report['bounds'][col]['upper_bound']:.4f}")
    
    print(f"Rows deleted: {report['rows_deleted']} (No rows deleted in this transformation)")
    
    return data, report  # Return both the transformed data and the report

# Example usage:
# Method 1: Using percentile bounds (default)
transformed_data_percentile, report_percentile = apply_log1p_winsorization(
    deduped_data, 
    numeric_cols, 
    lower_percentile=0.05, 
    upper_percentile=0.95, 
    method='percentile'
)

# Method 2: Using IQR for outlier detection
transformed_data_iqr, report_iqr = apply_log1p_winsorization(
    deduped_data, 
    numeric_cols, 
    method='iqr'
)

# You can choose which transformed dataset to use based on your needs
# For example:
transformed_data = transformed_data_iqr  # Using the IQR-based transformation

# Now transformed_data contains the modified data, which avoids modifying deduped_data directly

Skewness Report:
flow_time: Original skewness = 707.1075, After transformation = 2.0046
  Bounds: Lower = 0.0000, Upper = 1.9792
header_size: Original skewness = 84.9507, After transformation = 0.4595
  Bounds: Lower = 0.0000, Upper = 12.1923
packet_duration: Original skewness = 10.3110, After transformation = 3.1494
  Bounds: Lower = 4.1645, Upper = 4.3202
overall_rate: Original skewness = 35.3270, After transformation = 0.8231
  Bounds: Lower = 0.3576, Upper = 9.7120
src_rate: Original skewness = 35.3270, After transformation = 0.8231
  Bounds: Lower = 0.3576, Upper = 9.7120
dst_rate: Original skewness = 853.9080, After transformation = 0.0000
  Bounds: Lower = 0.0000, Upper = 0.0000
fin_packets: Original skewness = 3.8050, After transformation = 2.8916
  Bounds: Lower = 0.0000, Upper = 0.6931
urg_packets: Original skewness = 23.7402, After transformation = 2.1866
  Bounds: Lower = 0.0000, Upper = 0.6931
rst_packets: Original skewness = 12.4690, After transformation = 1.9543
  Bounds

In [8]:
transformed_data_percentile.shape

(842396, 22)

In [9]:
transformed_data_iqr.shape

(842396, 22)

In [10]:
def add_flow_features(df: pd.DataFrame, epsilon: float = 1e-6) -> pd.DataFrame:
    """
    Add robust, engineered features to a network flow DataFrame. Combines:
      - True computations when data are available
      - Defensive fallbacks when columns are missing

    Required raw columns (preferred):
      - 'src_rate', 'dst_rate'
      - 'syn_packets', 'ack_packets', 'rst_packets', 'fin_packets'
      - 'overall_rate', 'flow_time', 'total_packets'
      - 'packet_times': list/array of packet timestamps per flow
      - 'payload_bytes': list/array of payload byte values per flow
      - 'min_value', 'max_value'
      - 'src_ip', 'dst_ip', 'dst_port', 'timestamp'
      - 'protocol_tcp', 'protocol_udp', 'syn_flags', 'psh_flags'

    Fallback behavior:
      - Estimates inter-packet metrics if 'packet_times' missing
      - Random placeholders for rolling windows if time data missing
      - Simple defaults for missing entropy or flags

    Returns:
      - A new pandas DataFrame with the original data plus appended engineered features:
        rate_ratio, syn_to_ack, rst_to_fin, avg_pkt_size,
        mean_interpkt, std_interpkt, p90_interpkt, max_interpkt, burstiness,
        payload_entropy, value_range, flows_last_10s, unique_dsts_last_10s,
        hour_sin, hour_cos, handshake_complete, abrupt_reset, tcp_syn_ratio, udp_psh
    """
    df = df.copy()

    # Continuous ratio features
    df['rate_ratio'] = (df.get('src_rate', 0) + epsilon) / (df.get('dst_rate', 0) + epsilon)
    df['syn_to_ack'] = (df.get('syn_packets', 0) + 1) / (df.get('ack_packets', 0) + 1)
    df['rst_to_fin'] = (df.get('rst_packets', 0) + 1) / (df.get('fin_packets', 0) + 1)
    df['avg_pkt_size'] = (df.get('overall_rate', 0) * df.get('flow_time', 0)) / (df.get('total_packets', 1) + epsilon)

    # Inter-packet statistics
    if 'packet_times' in df.columns:
        iat = df['packet_times'].apply(lambda times: np.diff(np.sort(times)) if len(times) >= 2 else np.array([0.0]))
        df['mean_interpkt'] = iat.apply(np.mean)
        df['std_interpkt'] = iat.apply(np.std)
        df['p90_interpkt'] = iat.apply(lambda x: np.percentile(x, 90) if len(x) > 0 else 0.0)
        df['max_interpkt'] = iat.apply(lambda x: x.max() if len(x) > 0 else 0.0)
    else:
        base = df.get('flow_time', 0) / (df.get('total_packets', 1) + epsilon)
        df['mean_interpkt'] = base
        df['std_interpkt'] = base * 0.5
        df['p90_interpkt'] = base * 1.5
        df['max_interpkt'] = base * 3.0
    df['burstiness'] = df['max_interpkt'] / (df['mean_interpkt'] + epsilon)

    # Payload entropy
    def _shannon_entropy(arr):
        if len(arr) == 0:
            return 0.0
        counts = np.bincount(arr, minlength=256)
        probs = counts[counts > 0] / counts.sum()
        return -(probs * np.log2(probs)).sum()

    if 'payload_bytes' in df.columns:
        df['payload_entropy'] = df['payload_bytes'].apply(_shannon_entropy)
    else:
        df['payload_entropy'] = 4.0 + (df.get('header_size', 0) / 1000) * np.random.rand(len(df))

    # Value range
    df['value_range'] = df.get('max_value', 0) - df.get('min_value', 0)

    # Time-based rolling features
    if 'timestamp' in df.columns and 'src_ip' in df.columns:
        df['timestamp'] = pd.to_datetime(df['timestamp'])
        df = df.sort_values('timestamp')
        df['dst_ip_port'] = df['dst_ip'].astype(str) + ':' + df['dst_port'].astype(str)
        df['flows_last_10s'] = df.groupby('src_ip')['timestamp'].rolling('10s').count().reset_index(level=0, drop=True)
        df['unique_dsts_last_10s'] = df.groupby('src_ip').apply(
            lambda g: g.set_index('timestamp')['dst_ip_port'].rolling('10s').apply(lambda x: x.nunique(), raw=False)
        ).reset_index(level=0, drop=True)
    else:
        df['flows_last_10s'] = np.random.poisson(5, len(df))
        df['unique_dsts_last_10s'] = np.random.poisson(3, len(df))

    # Cyclical time-of-day
    if 'timestamp' in df.columns:
        hours = pd.to_datetime(df['timestamp']).dt.hour + pd.to_datetime(df['timestamp']).dt.minute / 60
    else:
        hours = np.random.randint(0, 24, len(df))
    df['hour_sin'] = np.sin(2 * np.pi * hours / 24)
    df['hour_cos'] = np.cos(2 * np.pi * hours / 24)

    # Binary indicators
    df['handshake_complete'] = ((df.get('syn_flags', 0) > 0) & (df.get('ack_flags', 0) > 0)).astype(int)
    df['abrupt_reset'] = ((df.get('rst_flags', 0) > 0) & (df.get('fin_flags', 0) == 0)).astype(int)
    df['tcp_syn_ratio'] = df.get('syn_flags', 0) * df.get('protocol_tcp', 0)
    df['udp_psh'] = df.get('psh_flags', 0) * df.get('protocol_udp', 0)

    return df

# Example usage - apply add_flow_features to our transformed data
# You can choose either the IQR or percentile-based transformed data
# In this example, I'll use the IQR-based transformed data

# Apply the function to the transformed data
enriched_data = add_flow_features(transformed_data_iqr)

# Print information about the new features
new_features = [
    'rate_ratio', 'syn_to_ack', 'rst_to_fin', 'avg_pkt_size',
    'mean_interpkt', 'std_interpkt', 'p90_interpkt', 'max_interpkt', 'burstiness',
    'payload_entropy', 'value_range', 'flows_last_10s', 'unique_dsts_last_10s',
    'hour_sin', 'hour_cos', 'handshake_complete', 'abrupt_reset', 'tcp_syn_ratio', 'udp_psh'
]

# Display summary statistics for the new features
print("New features added:")
for feature in new_features:
    if feature in enriched_data.columns:
        print(f"- {feature}: {len(enriched_data[feature].dropna())} non-null values")
    else:
        print(f"- {feature}: Not added (required columns missing)")

print("\nShape before adding features:", transformed_data_iqr.shape)
print("Shape after adding features:", enriched_data.shape)

# Optional: Check for any issues with the new features
null_counts = enriched_data[new_features].isnull().sum()
print("\nNull values in new features:")
print(null_counts[null_counts > 0])  # Only show features with null values

# You might also want to look at the distributions
print("\nSummary statistics for new features:")
print(enriched_data[new_features].describe().T[['mean', 'std', 'min', 'max']])

# The enriched_data DataFrame now contains all original columns plus the new flow features

New features added:
- rate_ratio: 842396 non-null values
- syn_to_ack: 842396 non-null values
- rst_to_fin: 842396 non-null values
- avg_pkt_size: 842396 non-null values
- mean_interpkt: 842396 non-null values
- std_interpkt: 842396 non-null values
- p90_interpkt: 842396 non-null values
- max_interpkt: 842396 non-null values
- burstiness: 842396 non-null values
- payload_entropy: 842396 non-null values
- value_range: 842396 non-null values
- flows_last_10s: 842396 non-null values
- unique_dsts_last_10s: 842396 non-null values
- hour_sin: 842396 non-null values
- hour_cos: 842396 non-null values
- handshake_complete: 842396 non-null values
- abrupt_reset: 842396 non-null values
- tcp_syn_ratio: 842396 non-null values
- udp_psh: 842396 non-null values

Shape before adding features: (842396, 22)
Shape after adding features: (842396, 41)

Null values in new features:
Series([], dtype: int64)

Summary statistics for new features:
                              mean           std       min   

In [11]:
enriched_data.columns

Index(['flow_time', 'header_size', 'packet_duration', 'overall_rate',
       'src_rate', 'dst_rate', 'fin_packets', 'urg_packets', 'rst_packets',
       'max_value', 'value_covariance', 'fin_flags', 'syn_flags', 'rst_flags',
       'psh_flags', 'ack_flags', 'protocol_http', 'protocol_https',
       'protocol_tcp', 'protocol_udp', 'protocol_icmp', 'label', 'rate_ratio',
       'syn_to_ack', 'rst_to_fin', 'avg_pkt_size', 'mean_interpkt',
       'std_interpkt', 'p90_interpkt', 'max_interpkt', 'burstiness',
       'payload_entropy', 'value_range', 'flows_last_10s',
       'unique_dsts_last_10s', 'hour_sin', 'hour_cos', 'handshake_complete',
       'abrupt_reset', 'tcp_syn_ratio', 'udp_psh'],
      dtype='object')

In [12]:
enriched_data.shape

(842396, 41)

In [13]:
from sklearn.preprocessing import MinMaxScaler, RobustScaler

def scale_dataframe(df, scaler_type="minmax", handshake_cols=None):
    """
    Scale numeric features of a DataFrame using MinMaxScaler or RobustScaler.

    Parameters
    ----------
    df : pd.DataFrame
        Input DataFrame to scale.
    scaler_type : {"minmax", "robust"}, default="minmax"
        Which scaler to use.
    handshake_cols : list of str, optional
        Columns to exclude from scaling in addition to binary columns.

    Returns
    -------
    pd.DataFrame
        A new DataFrame with selected columns scaled.
    """
    # Default handshake-related columns to exclude
    if handshake_cols is None:
        handshake_cols = [
            "handshake_complete",
            "abrupt_reset",
            "tcp_syn_ratio",
            "udp_psh"
        ]

    # Choose scaler
    scaler_type = scaler_type.lower()
    if scaler_type == "minmax":
        scaler = MinMaxScaler()
    elif scaler_type == "robust":
        scaler = RobustScaler()
    else:
        raise ValueError("scaler_type must be 'minmax' or 'robust'")

    # Identify binary columns (only 0/1 values)
    binary_cols = [
        col for col in df.columns
        if df[col].dropna().isin([0, 1]).all() and df[col].dropna().unique().size <= 2
    ]

    # Determine which numeric columns to scale
    exclude = set(binary_cols) | set(handshake_cols)
    numeric_cols = df.select_dtypes(include="number").columns
    to_scale = [col for col in numeric_cols if col not in exclude]

    # Fit and transform
    scaled_vals = scaler.fit_transform(df[to_scale])
    df_scaled = df.copy()
    df_scaled[to_scale] = scaled_vals

    return df_scaled


if __name__ == "__main__":

    # === Example 1: MinMaxScaler ===
    # Scales selected features into the [0,1] range
    df_minmax = scale_dataframe(enriched_data, scaler_type="minmax")
    print("=== MinMax Scaled DataFrame ===")
    print(df_minmax)

    # === Example 2: RobustScaler ===
    # Centers features on median and scales by IQR
    df_robust = scale_dataframe(enriched_data, scaler_type="robust")
    print("\n=== Robust Scaled DataFrame ===")
    print(df_robust)


=== MinMax Scaled DataFrame ===
        flow_time  header_size  packet_duration  overall_rate  src_rate  \
0        0.123245     0.630603              0.0      0.790565  0.790565   
1        0.055548     0.537041              0.0      0.777473  0.777473   
2        0.000000     0.340476              0.0      0.324403  0.324403   
3        0.316110     0.683890              0.0      0.778181  0.778181   
4        0.000000     0.332912              0.0      0.105457  0.105457   
...           ...          ...              ...           ...       ...   
938577   1.000000     0.791609              0.0      0.433167  0.433167   
938578   0.000000     0.263562              0.0      0.333381  0.333381   
938580   0.782946     0.680304              0.0      0.996185  0.996185   
938581   0.271656     0.672010              0.0      0.770373  0.770373   
938582   1.000000     0.442059              0.0      0.248681  0.248681   

        dst_rate  fin_packets  urg_packets  rst_packets  max_value 

In [14]:
df_minmax.shape

(842396, 41)

In [15]:
import pandas as pd
from typing import List, Tuple

# Desired output column order
OUTPUT_ORDER = [
    'flow_time', 'header_size', 'packet_duration', 'overall_rate',
    'fin_packets', 'urg_packets', 'rst_packets', 'max_value',
    'value_covariance', 'fin_flags', 'syn_flags', 'psh_flags', 'ack_flags',
    'protocol_http', 'protocol_https', 'protocol_tcp', 'protocol_udp',
    'protocol_icmp', 'label'
]

def select_features(df: pd.DataFrame, features: List[str]) -> pd.DataFrame:
    """
    Retain only the specified features in the DataFrame.

    Parameters
    ----------
    df : pd.DataFrame
        Input DataFrame from which to select features.
    features : List[str]
        List of feature names to retain.

    Returns
    -------
    pd.DataFrame
        A new DataFrame containing only the specified features (if present).
    """
    # Intersect to avoid KeyErrors if some features are missing
    available = [col for col in features if col in df.columns]
    return df[available].copy()


def filter_and_restore(
    df_minmax: pd.DataFrame,
    df_robust: pd.DataFrame,
    features: List[str]
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Retain only the given features in both df_minmax and df_robust,
    returning new DataFrames df_minmax2 and df_robust2, with columns
    ordered according to OUTPUT_ORDER.

    Parameters
    ----------
    df_minmax : pd.DataFrame
        MinMax-scaled DataFrame.
    df_robust : pd.DataFrame
        Robust-scaled DataFrame.
    features : List[str]
        Features to keep.

    Returns
    -------
    df_minmax2, df_robust2 : Tuple[pd.DataFrame, pd.DataFrame]
        Filtered and re-ordered DataFrames containing only the specified features.
    """
    # First, filter to the intersection of provided features
    df_minmax2 = select_features(df_minmax, features)
    df_robust2 = select_features(df_robust, features)

    # Then, reindex columns to the desired output ordering
    # This will include only those in OUTPUT_ORDER that are present
    df_minmax2 = df_minmax2.reindex(columns=[col for col in OUTPUT_ORDER if col in df_minmax2.columns])
    df_robust2 = df_robust2.reindex(columns=[col for col in OUTPUT_ORDER if col in df_robust2.columns])

    return df_minmax2, df_robust2


# Define the features to retain (should match or be a superset of OUTPUT_ORDER)
selected_features = [
    'flow_time', 'header_size', 'overall_rate', 'rst_packets', 'max_value',
    'value_covariance', 'fin_flags', 'syn_flags', 'psh_flags', 'ack_flags',
    'protocol_http', 'protocol_https', 'protocol_tcp', 'protocol_udp',
    'protocol_icmp', 'packet_duration', 'fin_packets', 'urg_packets', 'label'
]

# Example usage:
# Assuming df_minmax and df_robust are already defined DataFrames
# df_minmax2, df_robust2 will contain only the selected features in the specified order

df_minmax2, df_robust2 = filter_and_restore(df_minmax, df_robust, selected_features)

In [16]:
df_minmax2.columns

Index(['flow_time', 'header_size', 'packet_duration', 'overall_rate',
       'fin_packets', 'urg_packets', 'rst_packets', 'max_value',
       'value_covariance', 'fin_flags', 'syn_flags', 'psh_flags', 'ack_flags',
       'protocol_http', 'protocol_https', 'protocol_tcp', 'protocol_udp',
       'protocol_icmp', 'label'],
      dtype='object')

In [17]:
df_robust2.columns

Index(['flow_time', 'header_size', 'packet_duration', 'overall_rate',
       'fin_packets', 'urg_packets', 'rst_packets', 'max_value',
       'value_covariance', 'fin_flags', 'syn_flags', 'psh_flags', 'ack_flags',
       'protocol_http', 'protocol_https', 'protocol_tcp', 'protocol_udp',
       'protocol_icmp', 'label'],
      dtype='object')

In [18]:
import pandas as pd
from typing import Tuple, Dict, Any
from sklearn.preprocessing import LabelEncoder

def split_and_encode(
    df: pd.DataFrame,
    target_col: str = "label"
) -> Tuple[pd.DataFrame, pd.Series]:
    """
    Splits the DataFrame into features (X) and target (y),
    applies label encoding to the target, and prints the encoding map.

    Parameters
    ----------
    df : pd.DataFrame
        Input DataFrame containing features and a target column.
    target_col : str, default="label"
        The name of the target column.

    Returns
    -------
    X : pd.DataFrame
        Feature matrix (all columns except the target).
    y_encoded : pd.Series
        Label-encoded target vector.
    """
    if target_col not in df.columns:
        raise KeyError(f"Target column '{target_col}' not found in DataFrame.")

    # Step 1: Split features and target
    X = df.drop(columns=target_col)
    y = df[target_col]

    # Step 2: Apply label encoding
    le = LabelEncoder()
    y_encoded_array = le.fit_transform(y)
    y_encoded = pd.Series(y_encoded_array, name=target_col)

    # Step 3: Show label encoding mapping
    print("\n--- Label Encoding Mapping ---")
    for i, label in enumerate(le.classes_):
        print(f"'{label}' → {i}")
    print("-----------------------------\n")

    return X, y_encoded

# Example usage:
X_minmax, y_minmax = split_and_encode(df_minmax)
X_minmax2, y_minmax2 = split_and_encode(df_minmax2)

X, y = split_and_encode(enriched_data)

X_robust,  y_robust  = split_and_encode(df_robust)
X_robust2,  y_robust2  = split_and_encode(df_robust2)


--- Label Encoding Mapping ---
'BenignTraffic' → 0
'DDoS' → 1
'DoS' → 2
'MITM' → 3
'Mirai' → 4
'Recon' → 5
-----------------------------


--- Label Encoding Mapping ---
'BenignTraffic' → 0
'DDoS' → 1
'DoS' → 2
'MITM' → 3
'Mirai' → 4
'Recon' → 5
-----------------------------


--- Label Encoding Mapping ---
'BenignTraffic' → 0
'DDoS' → 1
'DoS' → 2
'MITM' → 3
'Mirai' → 4
'Recon' → 5
-----------------------------


--- Label Encoding Mapping ---
'BenignTraffic' → 0
'DDoS' → 1
'DoS' → 2
'MITM' → 3
'Mirai' → 4
'Recon' → 5
-----------------------------


--- Label Encoding Mapping ---
'BenignTraffic' → 0
'DDoS' → 1
'DoS' → 2
'MITM' → 3
'Mirai' → 4
'Recon' → 5
-----------------------------



In [19]:
print(X_minmax.head())
print(y_minmax.head())

   flow_time  header_size  packet_duration  overall_rate  src_rate  dst_rate  \
0   0.123245     0.630603              0.0      0.790565  0.790565       0.0   
1   0.055548     0.537041              0.0      0.777473  0.777473       0.0   
2   0.000000     0.340476              0.0      0.324403  0.324403       0.0   
3   0.316110     0.683890              0.0      0.778181  0.778181       0.0   
4   0.000000     0.332912              0.0      0.105457  0.105457       0.0   

   fin_packets  urg_packets  rst_packets  max_value  ...  payload_entropy  \
0          0.0          0.0          0.0   0.347550  ...         0.150044   
1          0.0          0.0          0.4   0.536678  ...         0.484606   
2          0.0          0.0          0.0   1.000000  ...         0.308021   
3          0.0          0.0          1.0   0.956563  ...         0.608202   
4          0.0          0.0          0.0   1.000000  ...         0.222388   

   value_range  flows_last_10s  unique_dsts_last_10s  ho

In [20]:
print(X_minmax2.head())
print(y_minmax2.head())

   flow_time  header_size  packet_duration  overall_rate  fin_packets  \
0   0.123245     0.630603              0.0      0.790565          0.0   
1   0.055548     0.537041              0.0      0.777473          0.0   
2   0.000000     0.340476              0.0      0.324403          0.0   
3   0.316110     0.683890              0.0      0.778181          0.0   
4   0.000000     0.332912              0.0      0.105457          0.0   

   urg_packets  rst_packets  max_value  value_covariance  fin_flags  \
0          0.0          0.0   0.347550          0.000000          0   
1          0.0          0.4   0.536678          0.768808          0   
2          0.0          0.0   1.000000          0.000000          0   
3          0.0          1.0   0.956563          1.000000          0   
4          0.0          0.0   1.000000          0.000000          0   

   syn_flags  psh_flags  ack_flags  protocol_http  protocol_https  \
0          0          0          0              0               0

In [21]:
print(X_robust.head())
print(y_robust.head())

   flow_time  header_size  packet_duration  overall_rate  src_rate  dst_rate  \
0   0.305697     1.244034              0.0      1.493555  1.493555       0.0   
1   0.136455     0.927130              0.0      1.457053  1.457053       0.0   
2  -0.002415     0.261344              0.0      0.193832  0.193832       0.0   
3   0.787859     1.424523              0.0      1.459027  1.459027       0.0   
4  -0.002415     0.235723              0.0     -0.416618 -0.416618       0.0   

   fin_packets  urg_packets  rst_packets  max_value  ...  payload_entropy  \
0          0.0          0.0          0.0  -0.708840  ...         0.063612   
1          0.0          0.0          1.0   0.047671  ...         1.743042   
2          0.0          0.0          0.0   1.900958  ...         0.856624   
3          0.0          0.0          2.5   1.727209  ...         2.363465   
4          0.0          0.0          0.0   1.900958  ...         0.426765   

   value_range  flows_last_10s  unique_dsts_last_10s  ho

In [22]:
print(X_robust2.head())
print(y_robust2.head())

   flow_time  header_size  packet_duration  overall_rate  fin_packets  \
0   0.305697     1.244034              0.0      1.493555          0.0   
1   0.136455     0.927130              0.0      1.457053          0.0   
2  -0.002415     0.261344              0.0      0.193832          0.0   
3   0.787859     1.424523              0.0      1.459027          0.0   
4  -0.002415     0.235723              0.0     -0.416618          0.0   

   urg_packets  rst_packets  max_value  value_covariance  fin_flags  \
0          0.0          0.0  -0.708840           0.00000          0   
1          0.0          1.0   0.047671           1.92202          0   
2          0.0          0.0   1.900958           0.00000          0   
3          0.0          2.5   1.727209           2.50000          0   
4          0.0          0.0   1.900958           0.00000          0   

   syn_flags  psh_flags  ack_flags  protocol_http  protocol_https  \
0          0          0          0              0               0

In [23]:
from sklearn.model_selection import train_test_split

def split_train_test(X, y, test_size=0.3, random_state=None, shuffle=True):
    """
    Split features X and target y into train and test sets.

    Parameters
    ----------
    X : array-like or DataFrame
        Feature matrix.
    y : array-like or Series
        Target vector.
    test_size : float or int, default=0.3
        If float, should be between 0.0 and 1.0 and represent the proportion
        of the dataset to include in the test split.
    random_state : int or None, default=None
        Controls the shuffling for reproducibility.
    shuffle : bool, default=True
        Whether or not to shuffle the data before splitting.

    Returns
    -------
    X_train, X_test, y_train, y_test
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y,
        test_size=test_size,
        random_state=random_state,
        shuffle=shuffle
    )
    return X_train, X_test, y_train, y_test


# Suppose you have already created:
#   X_minmax, y_minmax   # features & target scaled with MinMaxScaler
#   X_robust,  y_robust  # features & target scaled with RobustScaler

X_train_raw, X_test_raw, y_train_raw, y_test_raw = split_train_test(
    X, 
    y, 
    test_size=0.2, 
    random_state=42
)


# 1) Split the MinMax-scaled data
X_train_mm, X_test_mm, y_train_mm, y_test_mm = split_train_test(
    X_minmax, 
    y_minmax, 
    test_size=0.2, 
    random_state=42
)

X_train_mm2, X_test_mm2, y_train_mm2, y_test_mm2 = split_train_test(
    X_minmax2, 
    y_minmax2, 
    test_size=0.2, 
    random_state=42
)


# 2) Split the Robust-scaled data
X_train_rb, X_test_rb, y_train_rb, y_test_rb = split_train_test(
    X_robust, 
    y_robust, 
    test_size=0.2, 
    random_state=42
)

X_train_rb2, X_test_rb2, y_train_rb2, y_test_rb2 = split_train_test(
    X_robust2, 
    y_robust2, 
    test_size=0.2, 
    random_state=42
)

# You now have:
#  - X_train_mm, y_train_mm for training on MinMax data
#  - X_test_mm,  y_test_mm  for testing  on MinMax data
#  - X_train_rb, y_train_rb for training on Robust data
#  - X_test_rb,  y_test_rb  for testing  on Robust data

In [24]:
y_train_mm2

708705    1
730742    1
324370    4
658629    1
447432    1
         ..
259178    1
365838    1
131932    1
671155    2
121958    1
Name: label, Length: 673916, dtype: int32

In [25]:
y_train_mm

708705    1
730742    1
324370    4
658629    1
447432    1
         ..
259178    1
365838    1
131932    1
671155    2
121958    1
Name: label, Length: 673916, dtype: int32

In [26]:
from sklearn.feature_selection import VarianceThreshold

def variance_threshold_selector(X_train: pd.DataFrame,
                                X_test: pd.DataFrame,
                                threshold: float = 0.01):
    """
    Fit a VarianceThreshold on X_train and apply it to both X_train and X_test.

    Parameters
    ----------
    X_train : pd.DataFrame
        Training feature matrix.
    X_test : pd.DataFrame
        Test feature matrix.
    threshold : float, default=0.01
        Features with variance <= threshold will be removed.

    Returns
    -------
    X_train_sel : pd.DataFrame
        X_train reduced to only the features with variance > threshold.
    X_test_sel : pd.DataFrame
        X_test reduced to the same set of selected features.
    selected_cols : pd.Index
        Names of the retained features.
    """
    # 1. Fit on train
    vt = VarianceThreshold(threshold=threshold)
    X_train_arr = vt.fit_transform(X_train)
    selected_cols = X_train.columns[vt.get_support()]

    # 2. Build clean DataFrame for train
    X_train_sel = pd.DataFrame(
        X_train_arr,
        columns=selected_cols,
        index=X_train.index
    )

    # 3. Filter test to only those columns (if they exist)
    common_cols = [c for c in selected_cols if c in X_test.columns]
    X_test_sel = X_test.loc[:, common_cols].copy()

    print(f"Remaining features after Variance Thresholding: {len(selected_cols)}")
    return X_train_sel, X_test_sel, selected_cols


# On MinMax-scaled data
X_train_mm_sel, X_test_mm_sel, mm_cols = variance_threshold_selector(
    X_train_mm,
    X_test_mm,
    threshold=0.01
)

# On Robust-scaled data
X_train_rb_sel, X_test_rb_sel, rb_cols = variance_threshold_selector(
    X_train_rb,
    X_test_rb,
    threshold=0.01
)


Remaining features after Variance Thresholding: 32
Remaining features after Variance Thresholding: 32


In [27]:
print(X_train_mm_sel.shape)
print(X_test_mm_sel.shape)
print(mm_cols)


print(X_train_rb_sel.shape)
print(X_test_rb_sel.shape)
print(rb_cols)

(673916, 32)
(168480, 32)
Index(['flow_time', 'header_size', 'overall_rate', 'src_rate', 'rst_packets',
       'max_value', 'value_covariance', 'fin_flags', 'syn_flags', 'rst_flags',
       'psh_flags', 'ack_flags', 'protocol_http', 'protocol_https',
       'protocol_tcp', 'protocol_udp', 'protocol_icmp', 'rate_ratio',
       'rst_to_fin', 'avg_pkt_size', 'mean_interpkt', 'std_interpkt',
       'p90_interpkt', 'max_interpkt', 'burstiness', 'payload_entropy',
       'value_range', 'flows_last_10s', 'unique_dsts_last_10s', 'hour_sin',
       'hour_cos', 'tcp_syn_ratio'],
      dtype='object')
(673916, 32)
(168480, 32)
Index(['flow_time', 'header_size', 'overall_rate', 'src_rate', 'rst_packets',
       'max_value', 'value_covariance', 'fin_flags', 'syn_flags', 'rst_flags',
       'psh_flags', 'ack_flags', 'protocol_http', 'protocol_https',
       'protocol_tcp', 'protocol_udp', 'protocol_icmp', 'rate_ratio',
       'rst_to_fin', 'avg_pkt_size', 'mean_interpkt', 'std_interpkt',
       'p90

In [28]:
X_train_mm_sel.columns

Index(['flow_time', 'header_size', 'overall_rate', 'src_rate', 'rst_packets',
       'max_value', 'value_covariance', 'fin_flags', 'syn_flags', 'rst_flags',
       'psh_flags', 'ack_flags', 'protocol_http', 'protocol_https',
       'protocol_tcp', 'protocol_udp', 'protocol_icmp', 'rate_ratio',
       'rst_to_fin', 'avg_pkt_size', 'mean_interpkt', 'std_interpkt',
       'p90_interpkt', 'max_interpkt', 'burstiness', 'payload_entropy',
       'value_range', 'flows_last_10s', 'unique_dsts_last_10s', 'hour_sin',
       'hour_cos', 'tcp_syn_ratio'],
      dtype='object')

In [29]:
X_train_rb_sel.columns

Index(['flow_time', 'header_size', 'overall_rate', 'src_rate', 'rst_packets',
       'max_value', 'value_covariance', 'fin_flags', 'syn_flags', 'rst_flags',
       'psh_flags', 'ack_flags', 'protocol_http', 'protocol_https',
       'protocol_tcp', 'protocol_udp', 'protocol_icmp', 'rate_ratio',
       'rst_to_fin', 'avg_pkt_size', 'mean_interpkt', 'std_interpkt',
       'p90_interpkt', 'max_interpkt', 'burstiness', 'payload_entropy',
       'value_range', 'flows_last_10s', 'unique_dsts_last_10s', 'hour_sin',
       'hour_cos', 'tcp_syn_ratio'],
      dtype='object')

In [30]:
def correlation_filter(X_train: pd.DataFrame,
                       X_test: pd.DataFrame,
                       threshold: float = 0.95):
    """
    Remove features in X_train and X_test that are highly correlated (> threshold).

    Parameters
    ----------
    X_train : pd.DataFrame
        Training feature matrix (after any prior filtering/scaling).
    X_test : pd.DataFrame
        Test feature matrix (after any prior filtering/scaling).
    threshold : float, default=0.95
        Upper bound on allowed pairwise correlation.

    Returns
    -------
    X_train_sel : pd.DataFrame
        X_train with correlated features removed.
    X_test_sel : pd.DataFrame
        X_test with the same features removed (where present).
    dropped_cols : List[str]
        Names of features dropped from both.
    """
    # 1. compute pairwise abs-correlation
    corr_matrix = X_train.corr().abs()
    # 2. take upper triangle (exclude diagonal)
    upper_tri = corr_matrix.where(
        np.triu(np.ones(corr_matrix.shape), k=1).astype(bool)
    )
    # 3. find columns with any correlation above threshold
    dropped_cols = [
        col for col in upper_tri.columns
        if any(upper_tri[col] > threshold)
    ]
    # 4. drop them
    X_train_sel = X_train.drop(columns=dropped_cols)
    X_test_sel  = X_test.drop(columns=dropped_cols, errors='ignore')
    # 5. keep only the intersection (in case some cols missing in test)
    common = X_train_sel.columns.intersection(X_test_sel.columns)
    X_train_sel = X_train_sel[common].reset_index(drop=True)
    X_test_sel  = X_test_sel[common].reset_index(drop=True)

    print("Removed redundant features (correlation-based):", dropped_cols)
    print(f"Final number of features: {X_train_sel.shape[1]}")

    return X_train_sel, X_test_sel, dropped_cols


# 1) On the MinMax pipeline
X_train_mm_corr, X_test_mm_corr, mm_dropped = correlation_filter(
    X_train_mm_sel,
    X_test_mm_sel,
    threshold=0.95
)

# 2) On the Robust pipeline
X_train_rb_corr, X_test_rb_corr, rb_dropped = correlation_filter(
    X_train_rb_sel,
    X_test_rb_sel,
    threshold=0.95
)

Removed redundant features (correlation-based): ['src_rate', 'rst_flags', 'rate_ratio', 'rst_to_fin', 'mean_interpkt', 'std_interpkt', 'p90_interpkt', 'max_interpkt', 'value_range', 'tcp_syn_ratio']
Final number of features: 22
Removed redundant features (correlation-based): ['src_rate', 'rst_flags', 'rate_ratio', 'rst_to_fin', 'mean_interpkt', 'std_interpkt', 'p90_interpkt', 'max_interpkt', 'value_range', 'tcp_syn_ratio']
Final number of features: 22


In [31]:
print(X_train_mm_corr.shape)
print(X_test_mm_corr.shape)
print(mm_dropped)

print(X_train_rb_corr.shape)
print(X_test_rb_corr.shape)
print(rb_dropped)

(673916, 22)
(168480, 22)
['src_rate', 'rst_flags', 'rate_ratio', 'rst_to_fin', 'mean_interpkt', 'std_interpkt', 'p90_interpkt', 'max_interpkt', 'value_range', 'tcp_syn_ratio']
(673916, 22)
(168480, 22)
['src_rate', 'rst_flags', 'rate_ratio', 'rst_to_fin', 'mean_interpkt', 'std_interpkt', 'p90_interpkt', 'max_interpkt', 'value_range', 'tcp_syn_ratio']


In [32]:
X_train_mm_corr.columns

Index(['flow_time', 'header_size', 'overall_rate', 'rst_packets', 'max_value',
       'value_covariance', 'fin_flags', 'syn_flags', 'psh_flags', 'ack_flags',
       'protocol_http', 'protocol_https', 'protocol_tcp', 'protocol_udp',
       'protocol_icmp', 'avg_pkt_size', 'burstiness', 'payload_entropy',
       'flows_last_10s', 'unique_dsts_last_10s', 'hour_sin', 'hour_cos'],
      dtype='object')

In [33]:
X_train_rb_corr.columns

Index(['flow_time', 'header_size', 'overall_rate', 'rst_packets', 'max_value',
       'value_covariance', 'fin_flags', 'syn_flags', 'psh_flags', 'ack_flags',
       'protocol_http', 'protocol_https', 'protocol_tcp', 'protocol_udp',
       'protocol_icmp', 'avg_pkt_size', 'burstiness', 'payload_entropy',
       'flows_last_10s', 'unique_dsts_last_10s', 'hour_sin', 'hour_cos'],
      dtype='object')

In [34]:
X_train_mm2.columns

Index(['flow_time', 'header_size', 'packet_duration', 'overall_rate',
       'fin_packets', 'urg_packets', 'rst_packets', 'max_value',
       'value_covariance', 'fin_flags', 'syn_flags', 'psh_flags', 'ack_flags',
       'protocol_http', 'protocol_https', 'protocol_tcp', 'protocol_udp',
       'protocol_icmp'],
      dtype='object')

In [35]:
y_train_mm2.head(30)

708705    1
730742    1
324370    4
658629    1
447432    1
779184    1
22042     1
440408    1
126887    2
339675    1
148152    1
308277    1
368504    1
586644    1
66006     1
314886    1
696549    5
269164    1
286210    1
313950    1
99522     1
430435    2
804044    2
416781    2
122551    1
236623    1
449322    1
209014    1
785747    1
125008    1
Name: label, dtype: int32

In [36]:
# from imblearn.over_sampling import SMOTE
# from sklearn.utils import shuffle

# def smote_oversample(X_train: pd.DataFrame,
#                      y_train: pd.Series,
#                      random_state: int = 42):
#     """
#     Apply SMOTE to balance classes in the training set.

#     Parameters
#     ----------
#     X_train : pd.DataFrame
#         Feature matrix for training.
#     y_train : pd.Series
#         Target vector for training.
#     random_state : int, default=42
#         Seed for reproducibility.

#     Returns
#     -------
#     X_resampled : pd.DataFrame
#         SMOTE-resampled feature matrix.
#     y_resampled : pd.Series
#         SMOTE-resampled target vector.
#     """
#     # 1. Fit SMOTE on the training data
#     smote = SMOTE(random_state=random_state)
#     X_res, y_res = smote.fit_resample(X_train, y_train)

#     # 2. Shuffle the resampled data to mix original and synthetic samples
#     X_res, y_res = shuffle(X_res, y_res, random_state=random_state)

#     # 3. Convert back to DataFrame/Series (if needed)
#     if not isinstance(X_res, pd.DataFrame):
#         X_res = pd.DataFrame(X_res, columns=X_train.columns)
#     if not isinstance(y_res, pd.Series):
#         y_res = pd.Series(y_res, name=y_train.name)

#     print(f"After SMOTE, training set size: {X_res.shape[0]} samples")
#     return X_res, y_res

# # --- Example usage ---

# # 1) On the MinMax pipeline

# X_train_mm_res, y_train_mm_res = smote_oversample(
#     X_train_mm_corr,
#     y_train_mm,
#     random_state=42
# )

# X_train_mm_res2, y_train_mm_res2 = smote_oversample(
#     X_train_mm2,
#     y_train_mm2,
#     random_state=42
# )


# # 2) On the Robust pipeline

# X_train_rb_res, y_train_rb_res = smote_oversample(
#     X_train_rb_corr,
#     y_train_rb,
#     random_state=42
# )

# X_train_rb_res2, y_train_rb_res2 = smote_oversample(
#     X_train_rb2,
#     y_train_rb2,
#     random_state=42
# )

In [37]:
# runs = [
#     ("MinMax, corr-filtered", X_train_mm_corr, y_train_mm, X_train_mm_res,  y_train_mm_res),
#     ("MinMax, second split",  X_train_mm2,       y_train_mm2, X_train_mm_res2, y_train_mm_res2),
#     ("Robust, corr-filtered", X_train_rb_corr, y_train_rb,   X_train_rb_res,  y_train_rb_res),
#     ("Robust, second split",  X_train_rb2,       y_train_rb2, X_train_rb_res2, y_train_rb_res2),
# ]

# for name, X_bef, y_bef, X_aft, y_aft in runs:
#     print(f"=== {name} ===")
#     print(f"Original training shape: {X_bef.shape}")
#     print(f"Resampled training shape: {X_aft.shape}\n")

#     print("Class distribution before SMOTE:")
#     print(pd.Series(y_bef.values.ravel()).value_counts(), "\n")

#     print("Class distribution after SMOTE:")
#     print(pd.Series(y_aft.values.ravel()).value_counts(), "\n")


In [38]:
# # Cell 1: MinMaxScaler (my chosen columns)

# # Imports
# from xgboost import XGBClassifier
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.metrics import accuracy_score
# import joblib

In [39]:
# # 1. XGBoost
# xgb_mm = XGBClassifier(random_state=42)
# xgb_mm.fit(X_train_mm_res, y_train_mm_res)

# # Evaluate
# y_pred_xgb_mm = xgb_mm.predict(X_test_mm_corr)
# acc_xgb_mm = accuracy_score(y_test_mm, y_pred_xgb_mm)
# print(f"XGBoost (MinMax my cols) Accuracy: {acc_xgb_mm:.4f}")

# # Save XGBoost model
# joblib.dump(xgb_mm, 'xgb_mm_mycols.joblib')

In [40]:
# # 2. Random Forest
# rf_mm = RandomForestClassifier(n_estimators=100, random_state=42)
# rf_mm.fit(X_train_mm_res, y_train_mm_res)

# # Evaluate
# y_pred_rf_mm = rf_mm.predict(X_test_mm_corr)
# acc_rf_mm = accuracy_score(y_test_mm, y_pred_rf_mm)
# print(f"Random Forest (MinMax my cols) Accuracy: {acc_rf_mm:.4f}")

# # Save Random Forest model
# joblib.dump(rf_mm, 'rf_mm_mycols.joblib')

In [41]:
# # Cell 2: MinMaxScaler (your chosen columns)

# # 1. XGBoost
# xgb_mm2 = XGBClassifier(random_state=42)
# xgb_mm2.fit(X_train_mm_res2, y_train_mm_res2)

# # Evaluate
# y_pred_xgb_mm2 = xgb_mm2.predict(X_test_mm2)
# acc_xgb_mm2 = accuracy_score(y_test_mm2, y_pred_xgb_mm2)
# print(f"XGBoost (MinMax your cols) Accuracy: {acc_xgb_mm2:.4f}")

# # Save XGBoost model
# joblib.dump(xgb_mm2, 'xgb_mm_yourcols.joblib')

In [42]:
# # 2. Random Forest
# rf_mm2 = RandomForestClassifier(n_estimators=100, random_state=42)
# rf_mm2.fit(X_train_mm_res2, y_train_mm_res2)

# # Evaluate
# y_pred_rf_mm2 = rf_mm2.predict(X_test_mm2)
# acc_rf_mm2 = accuracy_score(y_test_mm2, y_pred_rf_mm2)
# print(f"Random Forest (MinMax your cols) Accuracy: {acc_rf_mm2:.4f}")

# # Save Random Forest model
# joblib.dump(rf_mm2, 'rf_mm_yourcols.joblib')

In [43]:
# # Cell 3: RobustScaler (my chosen columns)

# # Imports
# from xgboost import XGBClassifier
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.metrics import accuracy_score
# import joblib

# # Dataset splits
# # X_train_rb_res, X_test_rb_corr, y_train_rb_res, y_test_rb

# # 1. XGBoost
# xgb_rb = XGBClassifier(random_state=42)
# xgb_rb.fit(X_train_rb_res, y_train_rb_res)

# # Evaluate
# y_pred_xgb_rb = xgb_rb.predict(X_test_rb_corr)
# acc_xgb_rb = accuracy_score(y_test_rb, y_pred_xgb_rb)
# print(f"XGBoost (Robust my cols) Accuracy: {acc_xgb_rb:.4f}")

# # Save XGBoost model
# joblib.dump(xgb_rb, 'xgb_rb_mycols.joblib')

# # 2. Random Forest
# rf_rb = RandomForestClassifier(n_estimators=100, random_state=42)
# rf_rb.fit(X_train_rb_res, y_train_rb_res)

# # Evaluate
# y_pred_rf_rb = rf_rb.predict(X_test_rb_corr)
# acc_rf_rb = accuracy_score(y_test_rb, y_pred_rf_rb)
# print(f"Random Forest (Robust my cols) Accuracy: {acc_rf_rb:.4f}")

# # Save Random Forest model
# joblib.dump(rf_rb, 'rf_rb_mycols.joblib')

In [44]:
# # Cell 4: RobustScaler (your chosen columns)

# # Imports
# from xgboost import XGBClassifier
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.metrics import accuracy_score
# import joblib

# # Dataset splits
# # X_train_rb_res2, X_test_rb2, y_train_rb_res2, y_test_rb2

# # 1. XGBoost
# xgb_rb2 = XGBClassifier(random_state=42)
# xgb_rb2.fit(X_train_rb_res2, y_train_rb_res2)

# # Evaluate
# y_pred_xgb_rb2 = xgb_rb2.predict(X_test_rb2)
# acc_xgb_rb2 = accuracy_score(y_test_rb2, y_pred_xgb_rb2)
# print(f"XGBoost (Robust your cols) Accuracy: {acc_xgb_rb2:.4f}")

# # Save XGBoost model
# joblib.dump(xgb_rb2, 'xgb_rb_yourcols.joblib')

# # 2. Random Forest
# rf_rb2 = RandomForestClassifier(n_estimators=100, random_state=42)
# rf_rb2.fit(X_train_rb_res2, y_train_rb_res2)

# # Evaluate
# y_pred_rf_rb2 = rf_rb2.predict(X_test_rb2)
# acc_rf_rb2 = accuracy_score(y_test_rb2, y_pred_rf_rb2)
# print(f"Random Forest (Robust your cols) Accuracy: {acc_rf_rb2:.4f}")

# # Save Random Forest model
# joblib.dump(rf_rb2, 'rf_rb_yourcols.joblib')

In [45]:
import os
import numpy as np
from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import joblib

# ----------------------------------------------------------------------------
# 1. ASSUMPTION: You already have your raw, un-resampled datasets
#    X_train_raw, X_test_raw, y_train_raw, y_test_raw
# ----------------------------------------------------------------------------

# Ensure reproducibility and output directory
RANDOM_STATE = 42
save_dir = "saved_models"
os.makedirs(save_dir, exist_ok=True)

# Number of classes (for XGB)
classes = np.unique(y_train_raw)
num_classes = len(classes)

# ----------------------------------------------------------------------------
# 2. Build an imbalanced-safe pipeline: scaling, SMOTE, then XGB (GPU-enabled)
# ----------------------------------------------------------------------------
pipeline = ImbPipeline([
    ('scaler', StandardScaler()),
    ('smote',  SMOTE(random_state=RANDOM_STATE)),
    ('clf',    xgb.XGBClassifier(
        objective='multi:softmax',
        num_class=num_classes,
        learning_rate=0.05,
        n_estimators=1000,
        max_depth=6,
        subsample=0.8,
        colsample_bytree=0.8,
        reg_alpha=0.5,
        reg_lambda=0.5,
        tree_method='gpu_hist',        # GPU training method
        predictor='gpu_predictor',     # GPU predictor
        random_state=RANDOM_STATE,
        verbosity=1,
        eval_metric='merror'           # error-rate for multi-class
    ))
])

# ----------------------------------------------------------------------------
# 3. Cross-validate on the RAW (imbalanced) train set
# ----------------------------------------------------------------------------
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)
cv_scores = cross_val_score(
    pipeline,
    X_train_raw,
    y_train_raw,
    cv=skf,
    scoring='accuracy',
    n_jobs=-1
)
print(f"CV accuracies: {cv_scores}")
print(f"Mean CV accuracy: {cv_scores.mean():.4f}\n")

# ----------------------------------------------------------------------------
# 4. Final fit WITH early stopping: carve out a small validation split
# ----------------------------------------------------------------------------
X_tr, X_val, y_tr, y_val = train_test_split(
    X_train_raw, y_train_raw,
    test_size=0.1,
    stratify=y_train_raw,
    random_state=RANDOM_STATE
)

# Inject early stopping params into the pipeline's XGBClassifier
pipeline.set_params(
    clf__early_stopping_rounds=50,
    clf__eval_set=[(X_val, y_val)],
    clf__verbose=False
)

# Train on the reduced train set (scaling + SMOTE applied inside)
pipeline.fit(X_tr, y_tr)

# ----------------------------------------------------------------------------
# 5. Evaluate on the untouched test set
# ----------------------------------------------------------------------------
y_test_pred = pipeline.predict(X_test_raw)
print(f"Test accuracy: {accuracy_score(y_test_raw, y_test_pred):.4f}")
print("\nClassification Report:\n", classification_report(y_test_raw, y_test_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test_raw, y_test_pred))

# ----------------------------------------------------------------------------
# 6. Save the full pipeline for later inference
# ----------------------------------------------------------------------------
model_path = os.path.join(save_dir, 'xgb_pipeline_gpu.joblib')
joblib.dump(pipeline, model_path)
print(f"✔ Pipeline saved to {model_path}")

KeyboardInterrupt: 