In [1]:
import pandas as pd

In [2]:
# Forward slashes are accepted on Windows, Linux and macOS alike; the previous
# backslash-separated relative path could only be resolved on Windows.
data = pd.read_csv("Data/phase2_students_before_cleaning.csv")

In [4]:
# Print summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
print(data.describe())

          flow_time   header_size  packet_duration  overall_rate  \
count  9.385830e+05  9.385830e+05    938583.000000  9.385830e+05   
mean   1.342515e+01  1.018134e+05        82.664589  9.163496e+03   
std    5.898099e+03  1.801377e+06       166.986325  1.001806e+05   
min    0.000000e+00  0.000000e+00         0.000000  0.000000e+00   
25%    0.000000e+00  5.400000e+01        64.000000  2.077046e+00   
50%    0.000000e+00  5.400000e+01        64.000000  1.570377e+01   
75%    1.017542e-01  3.640000e+02        64.000000  1.177706e+02   
max    4.930147e+06  3.311174e+08      6525.740000  7.340032e+06   

           src_rate       dst_rate    fin_packets    urg_packets  \
count  9.385830e+05  938583.000000  938583.000000  938583.000000   
mean   9.163496e+03       0.000002       0.099474       5.850813   
std    1.001806e+05       0.000898       0.299712      70.715367   
min    0.000000e+00       0.000000       0.000000       0.000000   
25%    2.077046e+00       0.000000       0.0000

In [3]:
import pandas as pd

# Number of rows carrying each class label
counts = data["label"].value_counts()

# Share of each class label, expressed in percent
freqs = data["label"].value_counts(normalize=True) * 100

# Both measures side by side, largest class first
imbalance_data = pd.DataFrame({"count": counts, "percent": freqs})
imbalance_data = imbalance_data.sort_values(by="percent", ascending=False)

print(imbalance_data)


                count    percent
label                           
DDoS           687027  73.198321
DoS            163428  17.412205
Mirai           53395   5.688895
BenignTraffic   21987   2.342574
Recon            6433   0.685395
MITM             6313   0.672610


In [5]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import PowerTransformer


def fix_skewness(data, numeric_cols, method='log1p'):
    """
    Apply a skewness-reducing transformation to selected numeric columns.

    The input frame is never modified; a transformed copy is returned.

    Parameters:
    - data: pandas DataFrame
    - numeric_cols: column names to transform (any iterable of names, or a
      single name given as a string)
    - method: one of 'log1p', 'box-cox', 'yeo-johnson'

    Returns:
    - data_transformed: copy of `data` in which `numeric_cols` are transformed

    Raises:
    - ValueError: if `method` is not one of the supported names
    """
    # Validate first so a typo fails before the (possibly large) frame is copied.
    if method not in ('log1p', 'box-cox', 'yeo-johnson'):
        raise ValueError(f"Unsupported method: {method}")

    # Normalise the column selection to a list so every branch can iterate it.
    if isinstance(numeric_cols, str):
        numeric_cols = [numeric_cols]
    else:
        numeric_cols = list(numeric_cols)

    data_transformed = data.copy()

    # Nothing to transform: return the untouched copy. This also avoids handing
    # a zero-column frame to PowerTransformer, whose input validation rejects it.
    if not numeric_cols:
        return data_transformed

    if method == 'log1p':
        data_transformed[numeric_cols] = np.log1p(data_transformed[numeric_cols])
        return data_transformed

    if method == 'box-cox':
        # Box-Cox needs strictly positive input: shift any column whose minimum
        # is <= 0 so that its minimum becomes exactly 1.
        for col in numeric_cols:
            min_val = data_transformed[col].min()
            if min_val <= 0:
                data_transformed[col] = data_transformed[col] + (1 - min_val)

    # standardize=False keeps the transformed values unscaled.
    pt = PowerTransformer(method=method, standardize=False)
    data_transformed[numeric_cols] = pt.fit_transform(data_transformed[numeric_cols])

    return data_transformed


def compute_skewness_by_class(data, numeric_cols, label_col='label'):
    """
    Measure the skewness of the given numeric columns separately for each class.

    The frame is split on `label_col`; for every resulting group the skewness
    of each column in `numeric_cols` is calculated.

    Returns a DataFrame with one row per class label and one column per
    entry of `numeric_cols`.
    """
    per_class = {
        class_label: rows[numeric_cols].skew()
        for class_label, rows in data.groupby(label_col)
    }
    # The dict yields one column per class; transpose so classes become rows.
    return pd.DataFrame(per_class).transpose()


if __name__ == '__main__':
    # `data` must already be in memory (loaded by an earlier cell), e.g.:
    # data = pd.read_csv('your_dataset.csv')

    # ---- Work out which columns are genuinely continuous ----
    # Start from every column with a numeric dtype
    all_numeric = list(data.select_dtypes(include=[np.number]).columns)
    # Numeric columns holding exactly two distinct values
    binary_cols = [col for col in all_numeric if data[col].nunique() == 2]
    # Columns stored as object / category
    categorical_cols = list(data.select_dtypes(include=['object', 'category']).columns)
    # Numeric columns with fewer than 10 distinct values (likely encoded categories)
    low_cardinality = [col for col in all_numeric if data[col].nunique() < 10]

    # Everything listed above, plus the target column, is left untransformed
    exclude = {'label', *binary_cols, *categorical_cols, *low_cardinality}
    numeric_cols = [col for col in all_numeric if col not in exclude]

    print("Numeric columns to transform:", numeric_cols)

    # Baseline: per-class skewness of the raw columns
    skew_before = compute_skewness_by_class(data, numeric_cols, label_col='label')
    print("Skewness before transformations:\n", skew_before)

    # Re-measure the per-class skewness after each candidate transformation
    for method in ('log1p', 'box-cox', 'yeo-johnson'):
        data_trans = fix_skewness(data, numeric_cols, method=method)
        skew_after = compute_skewness_by_class(data_trans, numeric_cols, label_col='label')
        print(f"\nSkewness after {method}:\n", skew_after)


Numeric columns to transform: ['flow_time', 'header_size', 'packet_duration', 'overall_rate', 'src_rate', 'dst_rate', 'fin_packets', 'urg_packets', 'rst_packets', 'max_value', 'value_covariance']
Skewness before transformations:
                 flow_time  header_size  packet_duration  overall_rate  \
BenignTraffic   21.553083    22.626066        12.334105     18.514660   
DDoS           828.622012   471.864703         9.871309     19.948004   
DoS            109.027356    26.496179        10.514410     31.088226   
MITM            41.064835    19.889861        15.997384     19.045760   
Mirai          120.868408    30.754763        10.545856     17.715730   
Recon           79.703750    33.956999        12.913901     27.402579   

                src_rate    dst_rate  fin_packets  urg_packets  rst_packets  \
BenignTraffic  18.514660    0.000000   103.556296     6.831121     2.067410   
DDoS           19.948004  771.152026     2.473504   163.261103   101.573801   
DoS            31.088

In [None]:
def separate_columns(data: pd.DataFrame):
    """
    Split the columns of `data` into binary, numeric (non-binary) and categorical groups.

    Returns:
        binary_cols: Columns whose non-null values are exactly two distinct
            values, both drawn from {0, 1}.
        numeric_cols: Columns of numeric dtype that are not in binary_cols.
        categorical_cols: Columns of object or category dtype.
    """
    zero_one = {0, 1}

    # Binary flags: two distinct non-null values, all of them 0 or 1
    binary_cols = []
    for name in data.columns:
        series = data[name]
        if series.nunique(dropna=True) != 2:
            continue
        if set(series.dropna().unique()).issubset(zero_one):
            binary_cols.append(name)

    # Remaining numeric columns, in their original order
    numeric_cols = []
    for name in data.select_dtypes(include=['number']).columns:
        if name not in binary_cols:
            numeric_cols.append(name)

    # Text-like columns
    categorical_cols = list(data.select_dtypes(include=['object', 'category']).columns)

    return binary_cols, numeric_cols, categorical_cols

if __name__ == "__main__":
    # `data` is expected to have been loaded by an earlier cell.

    # Show every column present in the frame
    print("All columns in dataset:", list(data.columns))

    # Split the columns by kind
    binary_cols, numeric_cols, categorical_cols = separate_columns(data)

    # Show the three resulting groups
    print("\nBinary columns:     ", binary_cols)
    print("Numeric columns:    ", numeric_cols)
    print("Categorical columns:", categorical_cols)

    # Reference lists the dataset is supposed to match
    expected_numeric = [
        'flow_time', 'header_size', 'packet_duration', 'overall_rate',
        'src_rate', 'dst_rate', 'fin_packets', 'urg_packets', 'rst_packets',
        'max_value', 'value_covariance',
    ]
    expected_binary = [
        'fin_flags', 'syn_flags', 'rst_flags', 'psh_flags', 'ack_flags',
        'protocol_http', 'protocol_https', 'protocol_tcp', 'protocol_udp', 'protocol_icmp',
    ]
    expected_categorical = ['label']
    # The full column list is the three groups back to back
    expected_all = expected_numeric + expected_binary + expected_categorical

    def compare_lists(name, actual, expected):
        """Report whether `actual` and `expected` hold the same items (order and duplicates ignored)."""
        missing = set(expected).difference(actual)
        extra = set(actual).difference(expected)
        if not (missing or extra):
            print(f"[✓] {name} match exactly.")
            return
        if missing:
            print(f"[✗] {name} missing: {sorted(missing)}")
        if extra:
            print(f"[✗] {name} unexpected: {sorted(extra)}")

    print("\nVerification:")
    for name, actual, expected in (
        ("All columns", list(data.columns), expected_all),
        ("Binary columns", binary_cols, expected_binary),
        ("Numeric columns", numeric_cols, expected_numeric),
        ("Categorical columns", categorical_cols, expected_categorical),
    ):
        compare_lists(name, actual, expected)


All columns in dataset: ['flow_time', 'header_size', 'packet_duration', 'overall_rate', 'src_rate', 'dst_rate', 'fin_packets', 'urg_packets', 'rst_packets', 'max_value', 'value_covariance', 'fin_flags', 'syn_flags', 'rst_flags', 'psh_flags', 'ack_flags', 'protocol_http', 'protocol_https', 'protocol_tcp', 'protocol_udp', 'protocol_icmp', 'label']

Binary columns:      ['fin_flags', 'syn_flags', 'rst_flags', 'psh_flags', 'ack_flags', 'protocol_http', 'protocol_https', 'protocol_tcp', 'protocol_udp', 'protocol_icmp']
Numeric columns:     ['flow_time', 'header_size', 'packet_duration', 'overall_rate', 'src_rate', 'dst_rate', 'fin_packets', 'urg_packets', 'rst_packets', 'max_value', 'value_covariance']
Categorical columns: ['label']

Verification:
[✓] All columns match exactly.
[✓] Binary columns match exactly.
[✓] Numeric columns match exactly.
[✓] Categorical columns match exactly.


In [None]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns

# Path prefix of each deduplicated dataset; the split suffixes
# ('_X_train.csv', '_X_test.csv', '_y_train.csv', '_y_test.csv') are appended when loading.
base_paths = [
    'Data/deduplicated_datasets/Direct_Removal/phase2_Direct_Removal',
    'Data/deduplicated_datasets/Instance_Weighting/phase2_Instance_Weighting',
    'Data/deduplicated_datasets/Train_Test_Aware/phase2_TrainTestAware',
]

# Binary feature columns whose 0/1 balance is analysed
binary_cols = [
    'fin_flags',
    'syn_flags',
    'rst_flags',
    'psh_flags',
    'ack_flags',
    'protocol_http',
    'protocol_https',
    'protocol_tcp',
    'protocol_udp',
    'protocol_icmp',
]

# Function to calculate imbalance metrics
def calculate_imbalance(data, cols):
    """
    Summarise the balance between 0s and 1s in the given columns.

    Parameters:
    - data: pandas DataFrame
    - cols: iterable of column names; names absent from `data` are skipped

    Returns:
    dict mapping each analysed column name to a dict with the keys
    - 'count_0', 'count_1': number of entries equal to 0 / equal to 1
    - 'count_other': number of non-null entries that are neither 0 nor 1;
      a non-zero value means the column is not really binary and the
      remaining figures describe only its 0/1 subset
    - 'pct_0', 'pct_1': share of each value among the 0/1 entries, in percent
    - 'imbalance_ratio': larger share divided by smaller share
      (inf when one of the two values is absent)
    - 'majority': '0' or '1' ('1' when the shares are equal)
    - 'majority_pct': the larger of the two shares
    """
    results = {}
    for col in cols:
        if col not in data.columns:
            continue

        # Look the counts up in a plain dict: dict.get is purely label-based.
        # Series.get(0, 0) can instead fall back to *positional* access in older
        # pandas versions when the index is not integer-typed (e.g. string
        # labels), silently returning the count of an unrelated value.
        value_counts = data[col].value_counts().to_dict()
        count_0 = int(value_counts.get(0, 0))
        count_1 = int(value_counts.get(1, 0))
        count_other = int(sum(value_counts.values())) - count_0 - count_1
        total = count_0 + count_1

        # Percentages are relative to the 0/1 entries only
        pct_0 = count_0 / total * 100 if total > 0 else 0
        pct_1 = count_1 / total * 100 if total > 0 else 0

        # Higher value = more imbalanced; undefined (inf) when one side is empty
        smaller, larger = min(pct_0, pct_1), max(pct_0, pct_1)
        imbalance_ratio = larger / smaller if smaller > 0 else float('inf')

        results[col] = {
            'count_0': count_0,
            'count_1': count_1,
            'count_other': count_other,
            'pct_0': pct_0,
            'pct_1': pct_1,
            'imbalance_ratio': imbalance_ratio,
            'majority': '0' if pct_0 > pct_1 else '1',
            'majority_pct': larger
        }
    return results

# Imbalance statistics of every experiment, keyed by experiment name
all_results = {}

for base_path in base_paths:
    # The third '/'-separated path component is used as the experiment name
    experiment_name = base_path.split('/')[2]
    # Empty placeholder; it stays in place when the files cannot be found
    all_results[experiment_name] = {}

    try:
        # Read the feature and target splits
        X_train = pd.read_csv(f'{base_path}_X_train.csv')
        X_test = pd.read_csv(f'{base_path}_X_test.csv')
        y_train = pd.read_csv(f'{base_path}_y_train.csv')
        y_test = pd.read_csv(f'{base_path}_y_test.csv')

        # Balance of the binary feature columns
        train_imbalance = calculate_imbalance(X_train, binary_cols)
        test_imbalance = calculate_imbalance(X_test, binary_cols)

        # Balance of every column in the target files
        y_train_imbalance = calculate_imbalance(y_train, y_train.columns)
        y_test_imbalance = calculate_imbalance(y_test, y_test.columns)
    except FileNotFoundError as e:
        print(f"Could not find one or more files for {experiment_name}: {e}")
        continue

    all_results[experiment_name] = dict(
        X_train=train_imbalance,
        X_test=test_imbalance,
        y_train=y_train_imbalance,
        y_test=y_test_imbalance,
    )

# Function to print results in a readable format
def print_imbalance_summary(results, feature_cols=None):
    """
    Print a readable imbalance report for every experiment.

    Parameters:
    - results: dict mapping experiment name -> dict that may hold the keys
      'X_train', 'X_test', 'y_train' and 'y_test', each mapping a column name
      to its statistics dict ('majority', 'majority_pct', 'imbalance_ratio',
      'pct_1', ...)
    - feature_cols: feature names to report, in order; defaults to the
      module-level `binary_cols`

    For each experiment the target columns are reported first (only when both
    'y_train' and 'y_test' are present), then each feature that has statistics
    in both 'X_train' and 'X_test'. A warning line is printed when the share
    of 1s differs by more than 5 percentage points between train and test.
    """
    if feature_cols is None:
        feature_cols = binary_cols

    print("\n--- IMBALANCE ANALYSIS SUMMARY ---\n")

    for experiment, datasets in results.items():
        print(f"\n=== {experiment} ===")

        # First, check target variable imbalance
        if 'y_train' in datasets and 'y_test' in datasets:
            for target_col, target_stats in datasets['y_train'].items():
                train_majority = target_stats['majority']
                train_majority_pct = target_stats['majority_pct']

                # The target column may be missing from the test statistics
                test_stats = datasets['y_test'].get(target_col, {})
                test_majority = test_stats.get('majority', 'N/A')
                test_majority_pct = test_stats.get('majority_pct', 0)
                # Format the ratio only when it exists: applying ':.2f' to the
                # 'N/A' placeholder string would raise ValueError.
                test_ratio = test_stats.get('imbalance_ratio')
                test_ratio_text = 'N/A' if test_ratio is None else f"{test_ratio:.2f}"

                print(f"\nTarget Variable: {target_col}")
                print(f"  Train: {train_majority_pct:.2f}% class {train_majority} (Imbalance Ratio: {target_stats['imbalance_ratio']:.2f})")
                print(f"  Test:  {test_majority_pct:.2f}% class {test_majority} (Imbalance Ratio: {test_ratio_text})")

        # Then, feature imbalance
        print("\nBinary Features:")
        for feature in feature_cols:
            if feature in datasets.get('X_train', {}) and feature in datasets.get('X_test', {}):
                train_stats = datasets['X_train'][feature]
                test_stats = datasets['X_test'][feature]

                print(f"\n  {feature}:")
                print(f"    Train: {train_stats['majority_pct']:.2f}% class {train_stats['majority']} (Imbalance Ratio: {train_stats['imbalance_ratio']:.2f})")
                print(f"    Test:  {test_stats['majority_pct']:.2f}% class {test_stats['majority']} (Imbalance Ratio: {test_stats['imbalance_ratio']:.2f})")

                # Distribution shift: absolute train/test difference in the share of 1s
                train_pct_1 = train_stats['pct_1']
                test_pct_1 = test_stats['pct_1']
                dist_shift = abs(train_pct_1 - test_pct_1)

                if dist_shift > 5:  # More than 5% difference
                    print(f"    ⚠️ Distribution shift: {dist_shift:.2f}% difference between train and test")

# Print the imbalance report for every experiment collected in all_results
print_imbalance_summary(all_results)

# Make sure the folder that receives the saved charts exists
# (exist_ok=True: no error when it is already there)
os.makedirs('imbalance_analysis', exist_ok=True)

# Create visualizations
def create_imbalance_visualizations(results, feature_cols=None):
    """
    Save one grouped bar chart per experiment comparing train and test imbalance ratios.

    Parameters:
    - results: dict mapping experiment name -> dict with 'X_train' and 'X_test'
      entries, each mapping a feature name to a statistics dict that contains
      'imbalance_ratio'
    - feature_cols: feature names to plot, in order; defaults to the
      module-level `binary_cols`

    Each chart is written to 'imbalance_analysis/<experiment>_imbalance.png'
    (the directory must already exist). Ratios are capped at 100 so that very
    large or infinite values stay drawable. Only features with statistics in
    both splits are plotted; an experiment with no such feature (for example
    one whose files could not be loaded) is skipped instead of producing an
    empty chart.
    """
    if feature_cols is None:
        feature_cols = binary_cols
    ratio_cap = 100

    for experiment, datasets in results.items():
        train_stats = datasets.get('X_train', {})
        test_stats = datasets.get('X_test', {})

        # Require both splits so the two bar series always have equal length
        features = [col for col in feature_cols if col in train_stats and col in test_stats]
        if not features:
            print(f"No feature statistics for {experiment}; chart skipped")
            continue

        train_imbalance = [min(train_stats[col]['imbalance_ratio'], ratio_cap) for col in features]
        test_imbalance = [min(test_stats[col]['imbalance_ratio'], ratio_cap) for col in features]

        # Bar positions: train and test side by side around each tick
        x = np.arange(len(features))
        width = 0.35

        fig, ax = plt.subplots(figsize=(15, 10))
        try:
            ax.bar(x - width/2, train_imbalance, width, label='Train')
            ax.bar(x + width/2, test_imbalance, width, label='Test')

            ax.set_xlabel('Binary Features')
            ax.set_ylabel('Imbalance Ratio (capped at 100)')
            ax.set_title(f'Feature Imbalance - {experiment}')
            ax.set_xticks(x)
            ax.set_xticklabels(features, rotation=45)
            ax.legend()
            fig.tight_layout()

            fig.savefig(f'imbalance_analysis/{experiment}_imbalance.png')
        finally:
            # Always release the figure, even when drawing or saving fails
            plt.close(fig)

# Render and save the imbalance charts for the experiments in all_results
create_imbalance_visualizations(all_results)


--- IMBALANCE ANALYSIS SUMMARY ---


=== Direct_Removal ===

Target Variable: label
  Train: 96.44% class 1 (Imbalance Ratio: 27.08)
  Test:  96.44% class 1 (Imbalance Ratio: 27.08)

Binary Features:

  fin_flags:
    Train: 92.35% class 0 (Imbalance Ratio: 12.07)
    Test:  92.37% class 0 (Imbalance Ratio: 12.11)

  syn_flags:
    Train: 78.19% class 0 (Imbalance Ratio: 3.58)
    Test:  78.37% class 0 (Imbalance Ratio: 3.62)

  rst_flags:
    Train: 91.90% class 0 (Imbalance Ratio: 11.34)
    Test:  91.94% class 0 (Imbalance Ratio: 11.40)

  psh_flags:
    Train: 91.57% class 0 (Imbalance Ratio: 10.86)
    Test:  91.57% class 0 (Imbalance Ratio: 10.86)

  ack_flags:
    Train: 87.87% class 0 (Imbalance Ratio: 7.25)
    Test:  87.86% class 0 (Imbalance Ratio: 7.24)

  protocol_http:
    Train: 95.21% class 0 (Imbalance Ratio: 19.86)
    Test:  95.19% class 0 (Imbalance Ratio: 19.81)

  protocol_https:
    Train: 94.28% class 0 (Imbalance Ratio: 16.49)
    Test:  94.25% class 0 (Imbala

In [4]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Training-label file of each deduplication technique
target_paths = [
    'Data/deduplicated_datasets/Direct_Removal/phase2_Direct_Removal_y_train.csv',
    'Data/deduplicated_datasets/Instance_Weighting/phase2_Instance_Weighting_y_train.csv',
    'Data/deduplicated_datasets/Train_Test_Aware/phase2_TrainTestAware_y_train.csv',
]

# Display name of each technique, in the same order as target_paths
technique_names = [
    'Direct_Removal',
    'Instance_Weighting',
    'Train_Test_Aware',
]

# Function to analyze multi-class distribution
def analyze_multiclass_distribution(file_path):
    """
    Summarise the class distribution stored in a target CSV file.

    The 'label' column is analysed when present; otherwise the first column
    of the file is used.

    Returns a dict with the keys:
    - 'counts': class -> number of rows
    - 'percentages': class -> rows of that class as a percentage of all rows in the file
    - 'most_common_class' / 'least_common_class': class with the highest / lowest count
    - 'imbalance_ratio': highest count divided by lowest count
    - 'unique_classes': sorted list of the classes found
    """
    targets = pd.read_csv(file_path)
    column = 'label' if 'label' in targets.columns else targets.columns[0]

    # Rows per class, ordered by class value
    class_counts = targets[column].value_counts().sort_index()
    n_rows = len(targets)

    summary = {}
    summary['counts'] = class_counts.to_dict()
    summary['percentages'] = (class_counts / n_rows * 100).to_dict()
    summary['most_common_class'] = class_counts.idxmax()
    summary['least_common_class'] = class_counts.idxmin()
    summary['imbalance_ratio'] = class_counts.max() / class_counts.min()
    summary['unique_classes'] = sorted(class_counts.index.tolist())
    return summary

# Report header
print("\n=== MULTI-CLASS TARGET LABEL DISTRIBUTION ===\n")

# Summarise the label file of every technique
for i, path in enumerate(target_paths):
    try:
        print(f"\n{technique_names[i]}:")
        metrics = analyze_multiclass_distribution(path)
        class_pct = metrics['percentages']
        top_cls = metrics['most_common_class']
        rare_cls = metrics['least_common_class']

        print(f"  Unique classes: {metrics['unique_classes']}")
        print(f"  Class distribution:")

        for cls, pct in class_pct.items():
            count = metrics['counts'][cls]
            print(f"    - Class {cls}: {count:,} samples ({pct:.2f}%)")

        print(f"  Most common: Class {top_cls} ({class_pct[top_cls]:.2f}%)")
        print(f"  Least common: Class {rare_cls} ({class_pct[rare_cls]:.2f}%)")
        print(f"  Imbalance ratio: {metrics['imbalance_ratio']:.2f}")

    except Exception as e:
        # Any failure is reported for this technique and the loop moves on to the next one
        print(f"{technique_names[i]}: Error - {e}")

# Make sure the folder that receives the saved chart exists
import os
os.makedirs('imbalance_analysis', exist_ok=True)

# Visualize the multi-class distribution for each technique
def create_multiclass_visualizations(target_paths, technique_names):
    """
    Plot the class distribution of every technique as one bar-chart panel each.

    The panels are arranged top to bottom in a single figure, one per
    (path, name) pair, and every bar is annotated with its count. The 'label'
    column of each file is used when present, otherwise the first column.
    The figure is saved to 'imbalance_analysis/multiclass_distribution.png'
    and then closed.
    """
    n_panels = len(target_paths)
    plt.figure(figsize=(15, 10))

    for position, (csv_path, technique) in enumerate(zip(target_paths, technique_names), start=1):
        plt.subplot(n_panels, 1, position)

        # Rows per class, ordered by class value
        labels = pd.read_csv(csv_path)
        column = 'label' if 'label' in labels.columns else labels.columns[0]
        class_counts = labels[column].value_counts().sort_index()

        drawn = plt.bar(class_counts.index.astype(str), class_counts.values, color='steelblue')

        # Write each count just above its bar, centred horizontally
        for rect in drawn:
            top = rect.get_height()
            centre = rect.get_x() + rect.get_width()/2.
            plt.text(centre, top + 0.1, f'{top:,}', ha='center', va='bottom')

        plt.title(f'Class Distribution - {technique}')
        plt.xlabel('Class')
        plt.ylabel('Count')
        plt.grid(axis='y', linestyle='--', alpha=0.7)

    plt.tight_layout()
    plt.savefig('imbalance_analysis/multiclass_distribution.png')
    plt.close()

# Draw and save the per-technique class-distribution figure, then report where it was written
create_multiclass_visualizations(target_paths, technique_names)
print("\nVisualization saved to 'imbalance_analysis/multiclass_distribution.png'")


=== MULTI-CLASS TARGET LABEL DISTRIBUTION ===


Direct_Removal:
  Unique classes: [0, 1, 2, 3, 4, 5]
  Class distribution:
    - Class 0: 17,588 samples (2.62%)
    - Class 1: 476,204 samples (70.90%)
    - Class 2: 126,690 samples (18.86%)
    - Class 3: 5,050 samples (0.75%)
    - Class 4: 41,018 samples (6.11%)
    - Class 5: 5,146 samples (0.77%)
  Most common: Class 1 (70.90%)
  Least common: Class 3 (0.75%)
  Imbalance ratio: 94.30

Instance_Weighting:
  Unique classes: [0, 1, 2, 3, 4, 5]
  Class distribution:
    - Class 0: 17,590 samples (2.34%)
    - Class 1: 549,621 samples (73.20%)
    - Class 2: 130,742 samples (17.41%)
    - Class 3: 5,051 samples (0.67%)
    - Class 4: 42,716 samples (5.69%)
    - Class 5: 5,146 samples (0.69%)
  Most common: Class 1 (73.20%)
  Least common: Class 3 (0.67%)
  Imbalance ratio: 108.81

Train_Test_Aware:
  Unique classes: [0, 1, 2, 3, 4, 5]
  Class distribution:
    - Class 0: 17,558 samples (2.38%)
    - Class 1: 537,805 samples (72.91%)
 

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import os
from sklearn.preprocessing import LabelEncoder

# Location of the original (pre-cleaning) dataset
# NOTE(review): absolute, machine-specific path; the first cell appears to read
# the same file through a relative path — consider using one shared location.
file_path = r'C:\Machine Learning\Phase 2\Data\phase2_students_before_cleaning.csv'

# Candidate encodings, tried in order until one decodes the file
encodings_to_try = ['utf-8', 'latin1', 'ISO-8859-1', 'cp1252']
for enc in encodings_to_try:
    try:
        data = pd.read_csv(file_path, encoding=enc)
        print(f"✅ Successfully read '{file_path}' with encoding: {enc}")
        break
    except UnicodeDecodeError:
        print(f"⚠️  Failed to decode with {enc}, trying next…")
else:
    # Reached only when no encoding worked (the loop never hit `break`).
    # UnicodeDecodeError cannot be built from a single message: its constructor
    # requires (encoding, object, start, end, reason), so raising it with one
    # argument would itself fail with TypeError. ValueError is an ancestor of
    # UnicodeDecodeError and accepts a plain message.
    raise ValueError(f"Could not read '{file_path}' with any of {encodings_to_try}")

# Stop early with a descriptive error when the target column is missing
if 'label' not in data.columns:
    raise KeyError(f"'label' column not found in {file_path}; found columns: {data.columns.tolist()}")

# Encode the 'label' values as integers.
# Note: this adds a 'label_encoded' column to `data` in place.
le = LabelEncoder()
data['label_encoded'] = le.fit_transform(data['label'])

# Print each original label next to the integer code assigned to it
print("\nLabel encoding mapping:")
for orig, code in zip(le.classes_, le.transform(le.classes_)):
    print(f"  {orig!r} → {code}")

# Rows per encoded class (ordered by code) and their share in percent, rounded to 2 decimals
counts = data['label_encoded'].value_counts().sort_index()
total = counts.sum()
percentages = (counts / total * 100).round(2)

# Largest and smallest class, and the ratio between their counts
most_common = counts.idxmax()
least_common = counts.idxmin()
imbalance_ratio = counts.max() / counts.min()

# Display imbalance metrics
print("\n=== IMBALANCE ANALYSIS ON ENCODED LABELS ===")
print(f"Unique encoded classes: {list(counts.index)}")
print("Class distribution:")
for cls in counts.index:
    print(f"  - Class {cls}: {counts[cls]:,} samples ({percentages[cls]}%)")
print(f"\nMost common: Class {most_common} ({percentages[most_common]}%)")
print(f"Least common: Class {least_common} ({percentages[least_common]}%)")
print(f"Imbalance ratio: {imbalance_ratio:.2f}")

# Bar chart of the encoded class counts, saved under imbalance_analysis/
os.makedirs('imbalance_analysis', exist_ok=True)
plt.figure(figsize=(8, 6))
bars = plt.bar(counts.index.astype(str), counts.values, color='steelblue')
for bar in bars:
    h = bar.get_height()
    # Place the count slightly above the bar; the gap is 0.5% of the total row count
    plt.text(bar.get_x() + bar.get_width()/2, h + total*0.005,
             f'{int(h):,}', ha='center', va='bottom')
# NOTE(review): the title and the output file name below say "sample_submission",
# but `data` is read from phase2_students_before_cleaning.csv — confirm the intended naming.
plt.title('Encoded Class Distribution – sample_submission.csv')
plt.xlabel('Encoded Class')
plt.ylabel('Count')
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()

out_path = 'imbalance_analysis/sample_submission_distribution.png'
plt.savefig(out_path)
# Close the figure once it has been written to disk
plt.close()
print(f"\nVisualization saved to '{out_path}'")


✅ Successfully read 'C:\Machine Learning\Phase 2\Data\phase2_students_before_cleaning.csv' with encoding: utf-8

Label encoding mapping:
  'BenignTraffic' → 0
  'DDoS' → 1
  'DoS' → 2
  'MITM' → 3
  'Mirai' → 4
  'Recon' → 5

=== IMBALANCE ANALYSIS ON ENCODED LABELS ===
Unique encoded classes: [0, 1, 2, 3, 4, 5]
Class distribution:
  - Class 0: 21,987 samples (2.34%)
  - Class 1: 687,027 samples (73.2%)
  - Class 2: 163,428 samples (17.41%)
  - Class 3: 6,313 samples (0.67%)
  - Class 4: 53,395 samples (5.69%)
  - Class 5: 6,433 samples (0.69%)

Most common: Class 1 (73.2%)
Least common: Class 3 (0.67%)
Imbalance ratio: 108.83

Visualization saved to 'imbalance_analysis/sample_submission_distribution.png'
