In [1]:
import os
import pandas as pd
import numpy as np
from scipy import stats
from sklearn.ensemble import IsolationForest

# Define the columns to process for outlier handling
OUTLIER_COLUMNS = [
    'flow_time', 'header_size', 'packet_duration', 'overall_rate', 
    'src_rate', 'dst_rate', 'fin_packets', 'urg_packets', 'rst_packets', 
    'max_value', 'value_covariance'
]

# Define the base directory where data is stored
BASE_DIR = os.path.join(os.getcwd(), 'Data', 'deduplicated_datasets')

# Define the techniques to be applied
TECHNIQUES = [
    'Winsorization',
    'DirectRemoval',
    'ZScoreTrimming',
    'Log1pWinsorization',
    'IsolationForest'
]

def winsorization(df, columns, lower_percentile=0.05, upper_percentile=0.95):
    """Apply winsorization (percentile-based capping) to specified columns"""
    df_copy = df.copy()
    for col in columns:
        if col in df_copy.columns:
            lower_bound = df_copy[col].quantile(lower_percentile)
            upper_bound = df_copy[col].quantile(upper_percentile)
            df_copy[col] = df_copy[col].clip(lower=lower_bound, upper=upper_bound)
    return df_copy

def direct_removal(df, columns, lower_percentile=0.01, upper_percentile=0.99):
    """Remove outliers directly based on percentiles"""
    df_copy = df.copy()
    mask = pd.Series(True, index=df_copy.index)
    
    for col in columns:
        if col in df_copy.columns:
            lower_bound = df_copy[col].quantile(lower_percentile)
            upper_bound = df_copy[col].quantile(upper_percentile)
            col_mask = (df_copy[col] >= lower_bound) & (df_copy[col] <= upper_bound)
            mask = mask & col_mask
    
    return df_copy[mask]

def zscore_trimming(df, columns, threshold=3):
    """Apply Z-score trimming to remove outliers"""
    df_copy = df.copy()
    mask = pd.Series(True, index=df_copy.index)
    
    for col in columns:
        if col in df_copy.columns:
            z_scores = np.abs(stats.zscore(df_copy[col], nan_policy='omit'))
            col_mask = z_scores < threshold
            mask = mask & col_mask
    
    return df_copy[mask]

def log1p_winsorization(df, columns, lower_percentile=0.05, upper_percentile=0.95):
    """Apply log1p transformation followed by winsorization"""
    df_copy = df.copy()
    for col in columns:
        if col in df_copy.columns:
            # Apply log1p transformation
            df_copy[col] = np.log1p(df_copy[col].clip(lower=0))  # Ensuring non-negative values
            # Apply winsorization
            lower_bound = df_copy[col].quantile(lower_percentile)
            upper_bound = df_copy[col].quantile(upper_percentile)
            df_copy[col] = df_copy[col].clip(lower=lower_bound, upper=upper_bound)
    return df_copy

def isolation_forest_filtering(df, columns, contamination=0.05):
    """Use Isolation Forest to identify and remove outliers"""
    df_copy = df.copy()
    
    # Only use the specified columns for outlier detection
    cols_to_use = [col for col in columns if col in df_copy.columns]
    if not cols_to_use:
        return df_copy
    
    # Handle NaN values
    subset_df = df_copy[cols_to_use].fillna(df_copy[cols_to_use].mean())
    
    # Apply Isolation Forest
    model = IsolationForest(contamination=contamination, random_state=42)
    outlier_predictions = model.fit_predict(subset_df)
    
    # Keep only inliers (1 is inlier, -1 is outlier)
    return df_copy[outlier_predictions == 1]

def process_x_train_files():
    """Process each X_train file in the deduplicated datasets folders"""
    # Find all the deduplication technique folders
    technique_folders = [folder for folder in os.listdir(BASE_DIR) if os.path.isdir(os.path.join(BASE_DIR, folder))]
    
    for folder in technique_folders:
        folder_path = os.path.join(BASE_DIR, folder)
        
        # Find the X_train file in this folder
        for file in os.listdir(folder_path):
            if "X_train" in file and file.endswith(".csv"):
                input_file_path = os.path.join(folder_path, file)
                
                # Load the dataset
                df = pd.read_csv(input_file_path)
                
                # Base name for output files
                file_prefix = file.replace('.csv', '')
                
                # Apply each outlier handling technique
                print(f"Processing {file} in {folder}...")
                
                # 1. Winsorization
                winsorized_df = winsorization(df, OUTLIER_COLUMNS)
                winsorized_output_path = os.path.join(folder_path, f"{file_prefix}_Winsorization.csv")
                winsorized_df.to_csv(winsorized_output_path, index=False)
                print(f"  - Saved Winsorization result to {winsorized_output_path}")
                
                # 2. Direct Removal
                direct_removal_df = direct_removal(df, OUTLIER_COLUMNS)
                direct_removal_output_path = os.path.join(folder_path, f"{file_prefix}_DirectRemoval.csv")
                direct_removal_df.to_csv(direct_removal_output_path, index=False)
                print(f"  - Saved Direct Removal result to {direct_removal_output_path}")
                
                # 3. Z-Score Trimming
                zscore_df = zscore_trimming(df, OUTLIER_COLUMNS)
                zscore_output_path = os.path.join(folder_path, f"{file_prefix}_ZScoreTrimming.csv")
                zscore_df.to_csv(zscore_output_path, index=False)
                print(f"  - Saved Z-Score Trimming result to {zscore_output_path}")
                
                # 4. Log1p + Winsorization
                log_win_df = log1p_winsorization(df, OUTLIER_COLUMNS)
                log_win_output_path = os.path.join(folder_path, f"{file_prefix}_Log1pWinsorization.csv")
                log_win_df.to_csv(log_win_output_path, index=False)
                print(f"  - Saved Log1p+Winsorization result to {log_win_output_path}")
                
                # 5. Isolation Forest
                iso_forest_df = isolation_forest_filtering(df, OUTLIER_COLUMNS)
                iso_forest_output_path = os.path.join(folder_path, f"{file_prefix}_IsolationForest.csv")
                iso_forest_df.to_csv(iso_forest_output_path, index=False)
                print(f"  - Saved Isolation Forest result to {iso_forest_output_path}")

if __name__ == "__main__":
    process_x_train_files()
    print("Outlier handling completed for all X_train files!")

Processing phase2_Direct_Removal_X_train.csv in Direct_Removal...
  - Saved Winsorization result to c:\Machine Learning\Phase 2\Data\deduplicated_datasets\Direct_Removal\phase2_Direct_Removal_X_train_Winsorization.csv
  - Saved Direct Removal result to c:\Machine Learning\Phase 2\Data\deduplicated_datasets\Direct_Removal\phase2_Direct_Removal_X_train_DirectRemoval.csv
  - Saved Z-Score Trimming result to c:\Machine Learning\Phase 2\Data\deduplicated_datasets\Direct_Removal\phase2_Direct_Removal_X_train_ZScoreTrimming.csv
  - Saved Log1p+Winsorization result to c:\Machine Learning\Phase 2\Data\deduplicated_datasets\Direct_Removal\phase2_Direct_Removal_X_train_Log1pWinsorization.csv
  - Saved Isolation Forest result to c:\Machine Learning\Phase 2\Data\deduplicated_datasets\Direct_Removal\phase2_Direct_Removal_X_train_IsolationForest.csv
Processing phase2_Instance_Weighting_X_train.csv in Instance_Weighting...
  - Saved Winsorization result to c:\Machine Learning\Phase 2\Data\deduplicated

In [None]:
import os
import sys
import argparse
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_selection import SelectFromModel
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier, StackingClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
from xgboost import XGBClassifier
from joblib import dump, load

# Numeric features
num_cols = [
    "flow_time", "header_size", "packet_duration", "overall_rate",
    "src_rate", "dst_rate", "fin_packets", "urg_packets",
    "rst_packets", "max_value", "value_covariance"
]

def load_and_clean(path):
    """Load, dedupe, shuffle, winsorize, and log-transform numeric columns."""
    df = pd.read_csv(path)
    df = df.drop_duplicates().sample(frac=1, random_state=42).reset_index(drop=True)
    for col in num_cols:
        lo, hi = np.percentile(df[col], [1, 99])
        df[col] = df[col].clip(lo, hi)
        df[col] = np.log1p(df[col])
    return df


def fit_transformers(X, y):
    """Fit label encoder, scaler, and selector; dump them for later reuse."""
    le = LabelEncoder().fit(y)
    y_enc = le.transform(y)
    scaler = StandardScaler().fit(X[num_cols])
    X[num_cols] = scaler.transform(X[num_cols])
    selector = ExtraTreesClassifier(n_estimators=50, max_depth=10, random_state=42, n_jobs=-1)
    selector.fit(X, y_enc)
    sfm = SelectFromModel(selector, prefit=True, threshold="median")
    X_sel = sfm.transform(X)
    dump(le, 'le.joblib')
    dump(scaler, 'scaler.joblib')
    dump(sfm, 'selector.joblib')
    return X_sel, y_enc, le.classes_


def apply_transformers(df):
    """Load and apply saved transformers to new data."""
    le = load('le.joblib')
    scaler = load('scaler.joblib')
    sfm = load('selector.joblib')
    X = df.drop(columns=['label'])
    y = le.transform(df['label'].values)
    X[num_cols] = scaler.transform(X[num_cols])
    X_sel = sfm.transform(X)
    return X_sel, y, le.classes_


def train(args):
    df = load_and_clean(args.train_data)
    X = df.drop(columns=['label'])
    y = df['label'].values
    X_sel, y_enc, class_names = fit_transformers(X.copy(), y)
    X_train, X_test, y_train, y_test = train_test_split(
        X_sel, y_enc, test_size=0.2, stratify=y_enc, random_state=42
    )
    # Train XGBoost
    xgb = XGBClassifier(n_estimators=100, max_depth=6,
                        use_label_encoder=False, eval_metric='mlogloss',
                        random_state=42, n_jobs=-1)
    print("Training XGBoost...")
    xgb.fit(X_train, y_train)
    dump(xgb, 'xgb_model.joblib')
    preds = xgb.predict(X_test)
    print(f"XGBoost accuracy: {accuracy_score(y_test, preds):.4f}")
    print(classification_report(y_test, preds, target_names=class_names))
    # Train Stacking
    base_estimators = [
        ('dt', DecisionTreeClassifier(max_depth=10, class_weight='balanced', random_state=42)),
        ('et', ExtraTreesClassifier(n_estimators=100, max_depth=10, random_state=42, n_jobs=-1)),
        ('rf', RandomForestClassifier(n_estimators=100, max_depth=10, class_weight='balanced', random_state=42, n_jobs=-1)),
        ('brf', BalancedRandomForestClassifier(n_estimators=100, max_depth=10, random_state=42, n_jobs=-1)),
        ('xgb', XGBClassifier(n_estimators=100, max_depth=6, use_label_encoder=False, eval_metric='mlogloss', random_state=42, n_jobs=-1))
    ]
    stack = StackingClassifier(estimators=base_estimators,
                               final_estimator=DecisionTreeClassifier(max_depth=10, random_state=42),
                               stack_method='predict_proba', n_jobs=-1)
    print("Training Stacking Ensemble...")
    stack.fit(X_train, y_train)
    dump(stack, 'stacking_model.joblib')
    preds = stack.predict(X_test)
    print(f"Stacking accuracy: {accuracy_score(y_test, preds):.4f}")
    print(classification_report(y_test, preds, target_names=class_names))


def test(args):
    # Ensure models and transformers exist
    required = ['xgb_model.joblib', 'stacking_model.joblib', 'le.joblib', 'scaler.joblib', 'selector.joblib']
    if not all(os.path.exists(f) for f in required):
        raise FileNotFoundError("Missing files: run in train mode first.")
    df_test = load_and_clean(args.test_data)
    X_test, y_test, class_names = apply_transformers(df_test)
    for name, path in [('XGBoost', 'xgb_model.joblib'), ('Stacking', 'stacking_model.joblib')]:
        model = load(path)
        preds = model.predict(X_test)
        print(f"{name} accuracy on {os.path.basename(args.test_data)}: {accuracy_score(y_test, preds):.4f}")
        print(classification_report(y_test, preds, target_names=class_names))


def main():
    parser = argparse.ArgumentParser()
    sub = parser.add_subparsers(dest='mode')  # allow unknown args
    tr = sub.add_parser('train')
    tr.add_argument('--train-data', required=True)
    te = sub.add_parser('test')
    te.add_argument('--test-data', required=True)
    # ignore notebook args
    args, _ = parser.parse_known_args()
    if args.mode == 'train':
        train(args)
    elif args.mode == 'test':
        test(args)
    else:
        parser.print_help()

if __name__ == "__main__":
    main()


In [3]:
import pandas as pd
from joblib import load

# 1. Load your test set (must contain an 'Id' column)
test_df = pd.read_csv('C:\\Users\\HP\\Downloads\\test.csv')

# 2. Define the numeric→string mapping (invert of your label encoder)
num2label = {
    0: 'BenignTraffic',
    1: 'DDoS',
    2: 'DoS',
    3: 'MITM',
    4: 'Mirai',
    5: 'Recon'
}

# 3. List all your .joblib prediction files
prediction_files = {
    'xgb':      'C:\\Users\\HP\\Downloads\\xgb_preds.joblib',
    'stacking': 'C:\\Users\\HP\\Downloads\\stacking_preds.joblib'
}

# Immediately before building the DataFrame, add:
print("test rows:",   test_df.shape[0])
print("predictions:", len(preds_num))

# That will pin down whether it's your CSV or your preds array that’s “wrong”.


# 4. For each, load the numeric preds, map → strings, and save a submission CSV
for model_name, joblib_path in prediction_files.items():
    # load the array of numeric codes
    preds_num = load(joblib_path)
    
    # map to strings
    preds_str = [ num2label[i] for i in preds_num ]
    
    # build submission DataFrame
    sub = pd.DataFrame({
        'Id':     test_df['Id'],
        'Target': preds_str
    })
    
    # save
    filename = f'{model_name}_submission.csv'
    sub.to_csv(filename, index=False)
    print(f'Wrote {filename} ({len(sub)} rows)')


test rows: 104287
predictions: 168480


ValueError: array length 168480 does not match index length 104287

In [2]:
# Cyber Attack Classification Pipeline with Threading Backend Fix
# Complete solution from data preparation to model evaluation

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.svm import SVC
from imblearn.over_sampling import SMOTE
from joblib import parallel_backend
import warnings
warnings.filterwarnings('ignore')

# 1. Load the dataset
def load_data(file_path):
    print("Loading dataset...")
    df = pd.read_csv(file_path)
    print(f"Dataset loaded with shape: {df.shape}")
    return df

# 2. Data Exploration and Preprocessing
def explore_data(df):
    print("\n--- Data Exploration ---")
    print("Sample of data:")
    print(df.head())
    
    print("\nData information:")
    print(df.info())
    
    print("\nBasic statistics:")
    print(df.describe())
    
    print("\nClass distribution:")
    print(df['label'].value_counts())
    
    # Visualize class distribution
    plt.figure(figsize=(10, 6))
    sns.countplot(x='label', data=df)
    plt.title('Class Distribution')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.savefig('class_distribution.png')
    plt.close()
    
    return df

def preprocess_data(df):
    print("\n--- Data Preprocessing ---")
    
    # Check for missing values
    missing_values = df.isnull().sum()
    print(f"Missing values:\n{missing_values}")
    
    # Handle missing values if any
    if missing_values.sum() > 0:
        df = df.fillna(df.mean())
        print("Missing values have been filled with mean values.")
        
    # Check for duplicates
    duplicates = df.duplicated().sum()
    print(f"Number of duplicates: {duplicates}")
    
    if duplicates > 0:
        df = df.drop_duplicates()
        print("Duplicates have been removed.")
        
    # Identify numerical and binary features as per the document
    numerical_features = [
        'flow_time', 'header_size', 'packet_duration', 'overall_rate',
        'src_rate', 'dst_rate', 'fin_packets', 'urg_packets', 'rst_packets',
        'max_value', 'value_covariance'
    ]
    
    binary_features = [
        'fin_flags', 'syn_flags', 'rst_flags', 'psh_flags', 'ack_flags',
        'protocol_http', 'protocol_https', 'protocol_tcp', 'protocol_udp', 'protocol_icmp'
    ]
    
    # Outlier capping for numerical features
    print("\nChecking for outliers in numerical features...")
    for feature in numerical_features:
        if feature in df.columns:
            q1 = df[feature].quantile(0.25)
            q3 = df[feature].quantile(0.75)
            iqr = q3 - q1
            upper_bound = q3 + 1.5 * iqr
            lower_bound = q1 - 1.5 * iqr
            df[feature] = np.where(df[feature] > upper_bound, upper_bound, df[feature])
            df[feature] = np.where(df[feature] < lower_bound, lower_bound, df[feature])
    
    # Correlation matrix visualization
    plt.figure(figsize=(12, 10))
    correlation = df[numerical_features].corr()
    sns.heatmap(correlation, annot=True, cmap='coolwarm', fmt='.2f')
    plt.title('Feature Correlation Matrix')
    plt.tight_layout()
    plt.savefig('correlation_matrix.png')
    plt.close()
    
    return df, numerical_features, binary_features

# 3. Feature Engineering and Selection
def engineer_features(df, numerical_features, binary_features):
    print("\n--- Feature Engineering ---")
    
    # New feature: rate_ratio
    if all(col in df.columns for col in ['src_rate', 'dst_rate']):
        df['rate_ratio'] = df['src_rate'] / (df['dst_rate'] + 1)
        numerical_features.append('rate_ratio')
        print("Added new feature: rate_ratio (src_rate / dst_rate)")
    
    # New feature: control_packet_ratio
    if all(col in df.columns for col in ['fin_packets', 'rst_packets', 'urg_packets', 'flow_time']):
        df['control_packet_ratio'] = (
            df['fin_packets'] + df['rst_packets'] + df['urg_packets']
        ) / (df['flow_time'] + 0.1)
        numerical_features.append('control_packet_ratio')
        print("Added new feature: control_packet_ratio (control packets per second)")
    
    # Quick Random Forest for feature importance
    X = df[numerical_features + binary_features]
    y = df['label']
    rf = RandomForestClassifier(n_estimators=100, random_state=42)
    rf.fit(X, y)
    
    feature_importance = pd.DataFrame({
        'Feature': X.columns,
        'Importance': rf.feature_importances_
    }).sort_values('Importance', ascending=False)
    
    plt.figure(figsize=(12, 8))
    sns.barplot(x='Importance', y='Feature', data=feature_importance)
    plt.title('Feature Importance')
    plt.tight_layout()
    plt.savefig('feature_importance.png')
    plt.close()
    
    print("\nTop 10 important features:")
    print(feature_importance.head(10))
    
    return df, numerical_features, binary_features

# 4. Data Splitting and Preparation
def prepare_data_for_modeling(df, numerical_features, binary_features):
    print("\n--- Data Splitting and Preparation ---")
    X = df[numerical_features + binary_features]
    y = df['label']
    
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )
    print(f"Training set shape: {X_train.shape}, Testing set shape: {X_test.shape}")
    
    # Handle class imbalance with SMOTE
    class_counts = y_train.value_counts()
    majority_class_count = class_counts.max()
    minority_classes = class_counts[class_counts < 0.1 * majority_class_count].index.tolist()
    if minority_classes:
        print(f"\nApplying SMOTE to handle imbalance for classes: {minority_classes}")
        smote = SMOTE(random_state=42)
        X_train, y_train = smote.fit_resample(X_train, y_train)
        print(f"Training set shape after SMOTE: {X_train.shape}")
        print("Class distribution after SMOTE:")
        print(y_train.value_counts())
    
    return X_train, X_test, y_train, y_test

# 5. Model Building and Evaluation
def build_and_evaluate_models(
    X_train, X_test, y_train, y_test,
    numerical_features, binary_features
):
    print("\n--- Model Building and Evaluation ---")
    
    # Preprocessing
    numerical_transformer = StandardScaler()
    preprocessor = ColumnTransformer(
        transformers=[('num', numerical_transformer, numerical_features)],
        remainder='passthrough'
    )

    models = {
        'RandomForest': RandomForestClassifier(random_state=42),
        'GradientBoosting': GradientBoostingClassifier(random_state=42),
        'SVM': SVC(probability=True, random_state=42)
    }

    best_model = None
    best_accuracy = 0
    best_model_name = None

    for model_name, model in models.items():
        print(f"\nTraining {model_name}...")
        pipeline = Pipeline([
            ('preprocessor', preprocessor),
            ('classifier', model)
        ])

        # Hyperparameter grids
        if model_name == 'RandomForest':
            param_grid = {
                'classifier__n_estimators': [100, 200],
                'classifier__max_depth': [None, 10, 20],
                'classifier__min_samples_split': [2, 5]
            }
        elif model_name == 'GradientBoosting':
            param_grid = {
                'classifier__n_estimators': [100, 200],
                'classifier__learning_rate': [0.01, 0.1],
                'classifier__max_depth': [3, 5]
            }
        else:  # SVM
            param_grid = {
                'classifier__C': [1, 10],
                'classifier__gamma': ['scale', 'auto'],
                'classifier__kernel': ['rbf', 'linear']
            }

        print("Performing grid search with 3-fold cross-validation (threading backend)...")
        grid_search = GridSearchCV(
            pipeline,
            param_grid,
            cv=3,
            scoring='accuracy',
            n_jobs=-1
        )
        # Use threading to avoid pickling large arrays
        with parallel_backend('threading'):
            grid_search.fit(X_train, y_train)

        best_pipeline = grid_search.best_estimator_
        y_pred = best_pipeline.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)

        print(f"\n{model_name} Results:")
        print(f"Best parameters: {grid_search.best_params_}")
        print(f"Test Accuracy: {accuracy:.4f}")

        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_model = best_pipeline
            best_model_name = model_name

        report = classification_report(y_test, y_pred)
        print("\nClassification Report:")
        print(report)

        cm = confusion_matrix(y_test, y_pred)
        plt.figure(figsize=(10, 8))
        sns.heatmap(
            cm,
            annot=True,
            fmt='d',
            cmap='Blues',
            xticklabels=np.unique(y_test),
            yticklabels=np.unique(y_test)
        )
        plt.title(f'Confusion Matrix - {model_name}')
        plt.xlabel('Predicted')
        plt.ylabel('True')
        plt.tight_layout()
        plt.savefig(f'confusion_matrix_{model_name}.png')
        plt.close()

    print(f"\n--- Best Model: {best_model_name} with accuracy {best_accuracy:.4f} ---")
    return best_model

# 6. Main Function
def main():
    file_path = r"C:\Machine Learning\Phase 2\Data\phase2_students_before_cleaning.csv"
    df = load_data(file_path)
    df = explore_data(df)
    df, numerical_features, binary_features = preprocess_data(df)
    df, numerical_features, binary_features = engineer_features(df, numerical_features, binary_features)
    X_train, X_test, y_train, y_test = prepare_data_for_modeling(df, numerical_features, binary_features)
    best_model = build_and_evaluate_models(
        X_train, X_test, y_train, y_test,
        numerical_features, binary_features
    )
    # Save the best model
    import joblib
    joblib.dump(best_model, 'cyber_attack_classifier.pkl')
    print("\nBest model saved as 'cyber_attack_classifier.pkl'")

if __name__ == "__main__":
    main()


Loading dataset...
Dataset loaded with shape: (938583, 22)

--- Data Exploration ---
Sample of data:
   flow_time  header_size  packet_duration  overall_rate     src_rate  \
0   0.041268     15499.00            64.00   7805.845961  7805.845961   
1   0.018393      3702.54            64.00   6728.994198  6728.994198   
2   0.000000       182.00            64.00     38.559448    38.559448   
3   0.109292     35027.55            62.72   6783.234241  6783.234241   
4   0.000000       162.00            64.00      2.305494     2.305494   

   dst_rate  fin_packets  urg_packets  rst_packets  max_value  ...  syn_flags  \
0       0.0          0.0         0.00         0.00      50.00  ...          0   
1       0.0          0.0         0.00         0.01      54.28  ...          0   
2       0.0          0.0         0.00         0.00     182.00  ...          0   
3       0.0          0.0         0.03         0.11      65.11  ...          0   
4       0.0          0.0         0.00         0.00     

MemoryError: Unable to allocate 7.30 MiB for an array with shape (956634, 1) and data type float64

In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import classification_report, accuracy_score
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier, VotingClassifier, StackingClassifier
from sklearn.tree import DecisionTreeClassifier
from imblearn.ensemble import BalancedRandomForestClassifier

# Columns to transform
num_cols = [
    "flow_time", "header_size", "packet_duration", "overall_rate",
    "src_rate", "dst_rate", "fin_packets", "urg_packets",
    "rst_packets", "max_value", "value_covariance"
]

# 1. Load & clean data (dedupe, shuffle, winsorize, log1p)
def load_and_clean(path):
    df = pd.read_csv(path)
    df = df.drop_duplicates()
    df = df.sample(frac=1, random_state=42).reset_index(drop=True)
    for col in num_cols:
        lo, hi = np.percentile(df[col], [1, 99])
        df[col] = df[col].clip(lo, hi)
        df[col] = np.log1p(df[col])
    return df

# 2. Preprocess: encode and scale
def preprocess(df):
    X = df.drop(columns=["label"])
    y = df["label"].values
    le = LabelEncoder()
    y = le.fit_transform(y)
    class_names = le.classes_

    scaler = StandardScaler()
    X[num_cols] = scaler.fit_transform(X[num_cols])
    return X, y, class_names

# 3. Base models factory with runtime constraints
def get_base_models():
    return [
        ("dt", DecisionTreeClassifier(max_depth=10, class_weight="balanced", random_state=42)),
        ("et", ExtraTreesClassifier(n_estimators=100, max_depth=10, n_jobs=-1, random_state=42)),
        ("rf", RandomForestClassifier(n_estimators=100, max_depth=10, class_weight="balanced", n_jobs=-1, random_state=42)),
        ("brf", BalancedRandomForestClassifier(n_estimators=100, max_depth=10, n_jobs=-1, random_state=42))
    ]

# 4. Train ensembles
def train_ensembles(X_train, y_train):
    models = get_base_models()
    vote_clf = VotingClassifier(estimators=models, voting="soft", n_jobs=-1)
    vote_clf.fit(X_train, y_train)

    stack_clf = StackingClassifier(
        estimators=models,
        final_estimator=DecisionTreeClassifier(max_depth=5, random_state=42),
        stack_method="predict_proba",
        n_jobs=-1
    )
    stack_clf.fit(X_train, y_train)

    return vote_clf, stack_clf

# 5. Evaluation
def evaluate_model(model, X, y, class_names):
    preds = model.predict(X)
    print(f"== {model.__class__.__name__} ==")
    print("Accuracy:", accuracy_score(y, preds))
    print(classification_report(y, preds, target_names=class_names))

# 6. Main pipeline
def main():
    path = "C:\Machine Learning\Phase 2\Data\phase2_students_before_cleaning.csv"
    df = load_and_clean(path)
    X, y, class_names = preprocess(df)

    # split & shuffle
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, stratify=y, random_state=42
    )

    # 7. Feature selection using ExtraTrees
    selector = ExtraTreesClassifier(n_estimators=50, max_depth=10, random_state=42, n_jobs=-1)
    selector.fit(X_train, y_train)
    sfm = SelectFromModel(selector, prefit=True, threshold="median")
    X_train = sfm.transform(X_train)
    X_test = sfm.transform(X_test)
    selected_features = X.columns[sfm.get_support()]
    print(f"Selected {len(selected_features)} features out of {X.shape[1]}")

    # 8. Train and evaluate individual models
    for name, model in get_base_models():
        print(f"Training {name}...")
        model.fit(X_train, y_train)
        evaluate_model(model, X_test, y_test, class_names)

    # 9. Ensembling
    vote_clf, stack_clf = train_ensembles(X_train, y_train)
    evaluate_model(vote_clf, X_test, y_test, class_names)
    evaluate_model(stack_clf, X_test, y_test, class_names)

if __name__ == "__main__":
    main()


  path = "C:\Machine Learning\Phase 2\Data\phase2_students_before_cleaning.csv"


Selected 11 features out of 21
Training dt...
== DecisionTreeClassifier ==
Accuracy: 0.8509852801519469
               precision    recall  f1-score   support

BenignTraffic       0.85      0.82      0.83      4397
         DDoS       0.92      0.89      0.90    119580
          DoS       0.63      0.69      0.66     31699
         MITM       0.46      0.73      0.57      1263
        Mirai       0.99      0.99      0.99     10255
        Recon       0.39      0.73      0.51      1286

     accuracy                           0.85    168480
    macro avg       0.71      0.81      0.74    168480
 weighted avg       0.86      0.85      0.85    168480

Training et...
== ExtraTreesClassifier ==
Accuracy: 0.8627967711301044
               precision    recall  f1-score   support

BenignTraffic       0.76      0.94      0.84      4397
         DDoS       0.86      0.98      0.92    119580
          DoS       0.86      0.40      0.55     31699
         MITM       0.86      0.55      0.67      1

  path = "C:\Machine Learning\Phase 2\Data\phase2_students_before_cleaning.csv"


KeyboardInterrupt: 