In [27]:
# Preprocessing
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
import pandas as pd
import matplotlib.pyplot as plt

# 1. Load dataset
print("Loading the dataset")
data = pd.read_csv("Kernel_Events_1.csv")
print("Dataset Loaded")

# 2. Display the number of columns
print("Number of columns in the dataset:", len(data.columns))

# 3. Drop columns with a single unique value: a constant column cannot help
#    a classifier separate scenarios. `nunique()` skips NaN by default, so a
#    column holding one value plus missing entries is dropped here as well.
single_unique_cols = data.columns[data.nunique() == 1]
data = data.drop(columns=single_unique_cols)
print(f"Dropped {len(single_unique_cols)} constant value columns.")

# 4. Display the number of columns
print("Number of columns in the dataset:", len(data.columns))

# 5. Drop irrelevant columns for 'Scenario' prediction
columns_to_drop = ['time', 'Attack', 'Label', 'interface']
# Step 3 may already have removed one of these (if it happened to be constant
# in this capture); dropping a missing column would raise KeyError, so only
# the columns that are still present are dropped and reported.
present_columns_to_drop = [name for name in columns_to_drop if name in data.columns]
data = data.drop(columns=present_columns_to_drop)
print(f"Dropped columns irrelevant for scenario prediction: {present_columns_to_drop}")
print("Number of columns in the dataset:", len(data.columns))

# 6. Label-encode the categorical columns 'State' and 'Scenario'.
#    Work on a copy so `data` keeps the original string labels, and keep each
#    fitted encoder so integer codes can be mapped back to names later.
encoded_data = data.copy()
encoders = {}
columns_to_encode = ['State', 'Scenario']

for column_name in columns_to_encode:
    encoder = LabelEncoder()
    encoded_data[column_name] = encoder.fit_transform(encoded_data[column_name])
    encoders[column_name] = encoder

    print(f"\nEncoding for column '{column_name}':")
    # Frequencies come from the un-encoded frame so they can be looked up by label.
    label_frequency = data[column_name].value_counts()
    # A LabelEncoder assigns each class its position in `classes_`, so
    # enumerating the classes yields the same (label, code) pairs as
    # transforming them.
    for code, label in enumerate(encoder.classes_):
        print(f"'{label}' -> {code} (count: {label_frequency[label]})")

# --- Strictly Balance the dataset for 'Scenario' based on the minimum 'Idle' count ---

print("\n--- Balancing the dataset for 'Scenario' based on the minimum 'Idle' count ---")

# Integer codes the 'State' encoder assigned to the two states of interest.
idle_encoded = encoders['State'].transform(['idle'])[0]
charging_encoded = encoders['State'].transform(['Charging'])[0]

scenario_groups = encoded_data.groupby('Scenario')
# Smallest per-scenario idle count. Summing a boolean mask grouped by scenario
# gives the same numbers as applying a function to every group, without the
# pandas deprecation warning that whole-group `apply` raises when the grouping
# column is part of the frame. Scenarios with no idle rows still count as 0.
min_idle_samples = (
    (encoded_data['State'] == idle_encoded)
    .groupby(encoded_data['Scenario'])
    .sum()
    .min()
)
balanced_data = []
# Each scenario should contribute this many idle rows plus as many charging rows.
target_samples_per_scenario = 2 * min_idle_samples

for scenario, group in scenario_groups:
    original_scenario = encoders['Scenario'].inverse_transform([scenario])[0]
    print(f"\nProcessing Scenario: {original_scenario}")

    idle_group = group[group['State'] == idle_encoded]
    charging_group = group[group['State'] == charging_encoded]

    # Each draw is capped at min_idle_samples, so the combined sample can
    # never exceed target_samples_per_scenario; it can only fall short.
    idle_sampled = idle_group.sample(n=min(len(idle_group), min_idle_samples), random_state=42)
    charging_sampled = charging_group.sample(n=min(len(charging_group), min_idle_samples), random_state=42)

    sampled_group = pd.concat([idle_sampled, charging_sampled])

    diff = target_samples_per_scenario - len(sampled_group)
    if diff > 0:
        # Top up from rows not drawn yet: charging takes the larger half of
        # the shortfall, idle the smaller half, each limited by what is left.
        spare_charging = charging_group.drop(charging_sampled.index, errors='ignore')
        additional_charging = spare_charging.sample(
            n=min(diff // 2 + diff % 2, len(spare_charging)), random_state=42
        )
        sampled_group = pd.concat([sampled_group, additional_charging])

        spare_idle = idle_group.drop(idle_sampled.index, errors='ignore')
        additional_idle = spare_idle.sample(
            n=min(diff // 2, len(spare_idle)), random_state=42
        )
        sampled_group = pd.concat([sampled_group, additional_idle])

    balanced_data.append(sampled_group)
    print(f"  Balanced counts - Idle: {len(sampled_group[sampled_group['State'] == idle_encoded])}, Charging: {len(sampled_group[sampled_group['State'] == charging_encoded])}, Total: {len(sampled_group)}")

# Shuffle the stacked scenario samples and give the result a fresh 0..n-1 index.
balanced_df = pd.concat(balanced_data).sample(frac=1, random_state=42).reset_index(drop=True)

# --- Split data into training and testing sets with constraints ---

print("\n--- Splitting data into Training and Testing Sets with Constraints ---")

test_size = 0.2
random_state = 42
train_data = []
test_data = []

# Scenarios are visited in order of first appearance in balanced_df; that
# order fixes the row order fed to the final shuffles below.
for scenario_code in balanced_df['Scenario'].unique():
    scenario_df = balanced_df.loc[balanced_df['Scenario'] == scenario_code]
    train_n = int(len(scenario_df) * (1 - test_size))

    # Ensure equal number of Idle and Charging in training: Idle gets the
    # floor of half the quota, Charging the remainder.
    idle_quota = train_n // 2
    charging_quota = train_n - idle_quota

    idle_rows = scenario_df[scenario_df['State'] == idle_encoded]
    charging_rows = scenario_df[scenario_df['State'] == charging_encoded]

    train_scenario = pd.concat([
        idle_rows.sample(n=min(len(idle_rows), idle_quota), random_state=random_state),
        charging_rows.sample(n=min(len(charging_rows), charging_quota), random_state=random_state),
    ])
    train_data.append(train_scenario)

    # Every row of this scenario that was not drawn for training is held out.
    test_data.append(scenario_df.drop(train_scenario.index))

# Shuffle each split and reset to a clean positional index.
train_df = pd.concat(train_data).sample(frac=1, random_state=random_state).reset_index(drop=True)
test_df = pd.concat(test_data).sample(frac=1, random_state=random_state).reset_index(drop=True)

# --- Display Training Set Information ---

print("\n--- Training Set Information ---")
print("Number of rows in Training Set:", len(train_df))
print("\nTraining Set 'Scenario' Value Counts:")
# Count rows per scenario code, order by code, then relabel codes with names.
train_code_counts = train_df['Scenario'].value_counts().sort_index()
train_scenario_counts = train_code_counts.rename(
    index=lambda code: encoders['Scenario'].inverse_transform([code])[0]
)
print(train_scenario_counts)

print("\nTraining Set 'State' Counts per 'Scenario':")
train_grouped = train_df.groupby('Scenario')
for scenario_code, scenario_rows in train_grouped:
    scenario_label = encoders['Scenario'].inverse_transform([scenario_code])[0]
    n_idle = (scenario_rows['State'] == idle_encoded).sum()
    n_charging = (scenario_rows['State'] == charging_encoded).sum()
    print(f"Scenario: {scenario_label} - Idle: {n_idle}, Charging: {n_charging}")

# --- Display Testing Set Information ---

print("\n--- Testing Set Information ---")
print("Number of rows in Testing Set:", len(test_df))
print("\nTesting Set 'Scenario' Value Counts:")
test_code_counts = test_df['Scenario'].value_counts().sort_index()
test_scenario_counts = test_code_counts.rename(
    index=lambda code: encoders['Scenario'].inverse_transform([code])[0]
)
print(test_scenario_counts)

print("\nTesting Set 'State' Counts per 'Scenario':")
test_grouped = test_df.groupby('Scenario')
for scenario_code, scenario_rows in test_grouped:
    scenario_label = encoders['Scenario'].inverse_transform([scenario_code])[0]
    n_idle = (scenario_rows['State'] == idle_encoded).sum()
    n_charging = (scenario_rows['State'] == charging_encoded).sum()
    print(f"Scenario: {scenario_label} - Idle: {n_idle}, Charging: {n_charging}")

# --- Define Features (X) and Target (y) for both sets ---
# The encoded 'Scenario' column is the prediction target; every other column
# of each split is used as a feature.

print("\n--- Defining Features (X) and Target (y) for Training Set ---")
y_train = train_df['Scenario']
X_train = train_df.drop('Scenario', axis=1)
print("Shape of X_train:", X_train.shape)
print("Shape of y_train:", y_train.shape)

print("\n--- Defining Features (X) and Target (y) for Testing Set ---")
y_test = test_df['Scenario']
X_test = test_df.drop('Scenario', axis=1)
print("Shape of X_test:", X_test.shape)
print("Shape of y_test:", y_test.shape)

# --- Identify and scale numerical features for both sets ---
# The scaler is fitted on the training split only and then applied unchanged
# to the test split, so no test-set statistics leak into training.

print("\n--- Identifying and Scaling Numerical Features for Training Set ---")
# NOTE(review): any label-encoded column stored as int64 (presumably 'State')
# is selected here and scaled too — confirm that is intended.
numerical_cols_train = X_train.select_dtypes(include=['int64', 'float64']).columns.tolist()
scaler = StandardScaler()
if numerical_cols_train:
    X_train[numerical_cols_train] = scaler.fit_transform(X_train[numerical_cols_train])
    print("Numerical features scaled for training set.")
else:
    print("No numerical features to scale in training set.")

print("\n--- Identifying and Scaling Numerical Features for Testing Set ---")
# The fitted scaler can only transform the exact columns it was fitted on, so
# the test split reuses the training column list instead of deriving its own.
# This also avoids calling transform() on a scaler that was never fitted.
numerical_cols_test = list(numerical_cols_train)
if numerical_cols_test:
    X_test[numerical_cols_test] = scaler.transform(X_test[numerical_cols_test])
    print("Numerical features scaled for testing set.")
else:
    print("No numerical features to scale in testing set.")

Loading the dataset
Dataset Loaded
Number of columns in the dataset: 911
Dropped 685 constant value columns.
Number of columns in the dataset: 226
Dropped columns irrelevant for scenario prediction: ['time', 'Attack', 'Label', 'interface']
Number of columns in the dataset: 222

Encoding for column 'State':
'Charging' -> 0 (count: 3584)
'idle' -> 1 (count: 2582)

Encoding for column 'Scenario':
'Benign' -> 0 (count: 2302)
'Cryptojacking' -> 1 (count: 1793)
'DoS' -> 2 (count: 865)
'Recon' -> 3 (count: 1206)

--- Balancing the dataset for 'Scenario' based on the minimum 'Idle' count ---

Processing Scenario: Benign
  Balanced counts - Idle: 365, Charging: 365, Total: 730

Processing Scenario: Cryptojacking
  Balanced counts - Idle: 365, Charging: 365, Total: 730

Processing Scenario: DoS
  Balanced counts - Idle: 365, Charging: 365, Total: 730

Processing Scenario: Recon
  Balanced counts - Idle: 365, Charging: 365, Total: 730

--- Splitting data into Training and Testing Sets with Constraints ---

  min_idle_samples = scenario_groups.apply(lambda x: x[x['State'] == idle_encoded].shape[0]).min()


In [33]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, accuracy_score
import warnings

# NOTE(review): this silences every warning for the rest of the session,
# including deprecation and convergence warnings from the models below.
warnings.filterwarnings("ignore")

# Store models and their names
models = {
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    # NOTE(review): probability=True adds an internal cross-validation pass at
    # fit time; it is only needed if predict_proba is used on this model.
    "SVM": SVC(kernel='rbf', probability=True, random_state=42),
    # `multi_class` is deprecated since scikit-learn 1.5; with the lbfgs solver
    # a problem with more than two classes is fitted as multinomial by default.
    "Logistic Regression": LogisticRegression(solver='lbfgs', max_iter=1000, random_state=42),
}

# Passing the class codes alongside their names keeps the report rows aligned
# with the encoder even if a scenario is missing from y_test or y_pred.
scenario_class_names = encoders['Scenario'].classes_
scenario_class_codes = list(range(len(scenario_class_names)))

# Train and evaluate each model
for name, model in models.items():
    print(f"\n--- {name} ---")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    print(f"Accuracy: {acc:.4f}")
    print("Classification Report:")
    print(classification_report(
        y_test,
        y_pred,
        labels=scenario_class_codes,
        target_names=scenario_class_names,
    ))



--- Random Forest ---
Accuracy: 0.9675
Classification Report:
               precision    recall  f1-score   support

       Benign       0.97      1.00      0.99       146
Cryptojacking       1.00      1.00      1.00       146
          DoS       0.93      0.95      0.94       146
        Recon       0.96      0.92      0.94       146

     accuracy                           0.97       584
    macro avg       0.97      0.97      0.97       584
 weighted avg       0.97      0.97      0.97       584


--- SVM ---
Accuracy: 0.9623
Classification Report:
               precision    recall  f1-score   support

       Benign       0.95      1.00      0.98       146
Cryptojacking       1.00      1.00      1.00       146
          DoS       0.93      0.95      0.94       146
        Recon       0.96      0.90      0.93       146

     accuracy                           0.96       584
    macro avg       0.96      0.96      0.96       584
 weighted avg       0.96      0.96      0.96       584

In [32]:
from scipy.stats import ttest_ind
import pandas as pd

def compare_scenarios_vs_benign(df, feature_columns, scenario_column='Scenario', scenario_encoder=None, alpha=0.05):
    """
    Compares each attack scenario to 'Benign' using t-tests across all features.

    For every class known to ``scenario_encoder`` other than 'Benign', each
    feature is tested with an unequal-variance (Welch) two-sample t-test
    against the 'Benign' rows. No multiple-comparison correction is applied:
    every feature is judged on its own against ``alpha``. A one-line summary
    per scenario is printed.

    Args:
        df (pd.DataFrame): Encoded and preprocessed dataset.
        feature_columns (list): List of numerical feature names.
        scenario_column (str): Name of the column containing scenario labels.
        scenario_encoder (LabelEncoder): Encoder to map scenario names. Must
            expose ``classes_`` and ``transform``; required despite the
            ``None`` default.
        alpha (float): Significance level for the t-test.

    Returns:
        dict: Mapping from scenario name to its t-test results vs. Benign.
            Each value is a DataFrame sorted by ascending p-value with the
            columns 'feature', 't_statistic', 'p_value' and
            'significant_difference', plus 'error' when a test failed.

    Raises:
        ValueError: If ``scenario_encoder`` is not provided.
    """
    if scenario_encoder is None:
        raise ValueError("scenario_encoder is required to map scenario names to their codes")

    # Column layout used when there is nothing to test, so sorting and
    # counting below still work on an empty result table.
    result_columns = ['feature', 't_statistic', 'p_value', 'significant_difference']

    results = {}
    benign_code = scenario_encoder.transform(['Benign'])[0]
    benign_df = df[df[scenario_column] == benign_code]

    for scenario_name in scenario_encoder.classes_:
        if scenario_name == 'Benign':
            continue
        scenario_code = scenario_encoder.transform([scenario_name])[0]
        attack_df = df[df[scenario_column] == scenario_code]

        print(f"\nComparing Benign vs {scenario_name}:")

        scenario_results = []
        for feature in feature_columns:
            try:
                t_stat, p_val = ttest_ind(benign_df[feature], attack_df[feature], equal_var=False)
            except Exception as e:
                # Deliberately best-effort: one untestable feature (missing or
                # non-numeric column) is recorded and must not abort the rest.
                scenario_results.append({
                    'feature': feature,
                    't_statistic': None,
                    'p_value': None,
                    'significant_difference': False,
                    'error': str(e)
                })
            else:
                scenario_results.append({
                    'feature': feature,
                    't_statistic': t_stat,
                    'p_value': p_val,
                    # A NaN p-value compares as False, i.e. "not significant".
                    'significant_difference': bool(p_val < alpha)
                })

        df_results = pd.DataFrame(scenario_results)
        if df_results.empty:
            # An empty feature list would otherwise give a frame without a
            # 'p_value' column and make the sort below raise KeyError.
            df_results = pd.DataFrame(columns=result_columns)
        df_results = df_results.sort_values(by='p_value')
        significant_count = df_results['significant_difference'].sum()
        print(f"  Significant features (p < {alpha}): {significant_count} / {len(feature_columns)}")
        results[scenario_name] = df_results

    return results


# Run the Benign-vs-attack t-tests on the training rows, using every column
# of X_train as a feature to test.
print("\n--- Running t-test comparison: Benign vs Each Attack Scenario ---")
feature_columns = list(X_train.columns)
scenario_comparisons = compare_scenarios_vs_benign(
    train_df,
    feature_columns,
    scenario_column='Scenario',
    scenario_encoder=encoders['Scenario'],
)



--- Running t-test comparison: Benign vs Each Attack Scenario ---

Comparing Benign vs Cryptojacking:
  Significant features (p < 0.05): 136 / 221

Comparing Benign vs DoS:
  Significant features (p < 0.05): 148 / 221

Comparing Benign vs Recon:
  Significant features (p < 0.05): 174 / 221
