## Model Selection (Baseline Model)

In [1]:
import os
import pandas as pd
import numpy as np
import joblib
import matplotlib.pyplot as plt

from sklearn.model_selection import StratifiedKFold, cross_validate, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

RANDOM_STATE = 42

In [2]:
# Load the processed train/test splits; if they are not on disk, build them
# from the combined processed file instead.
train_path = "../data/processed/train/train_processed.csv"
test_path = "../data/processed/test/test_processed.csv"

# target column (binary classification: 0=normal, 1=attack)
# Defined before the split so the fallback branch can stratify on it.
TARGET = 'label'

if os.path.exists(train_path) and os.path.exists(test_path):
    train_df = pd.read_csv(train_path)
    test_df = pd.read_csv(test_path)
else:
    proc_path = "../data/processed/processed_all/processed_all.csv"
    proc_df = pd.read_csv(proc_path)
    # Stratify on the target so both splits keep the same class proportions;
    # a plain random split can skew the class balance between train and test.
    train_df, test_df = train_test_split(
        proc_df,
        test_size=0.2,
        random_state=RANDOM_STATE,
        stratify=proc_df[TARGET],
    )

# Separate features from the target for each split.
X_train = train_df.drop(columns=[TARGET])
y_train = train_df[TARGET]

X_test = test_df.drop(columns=[TARGET])
y_test = test_df[TARGET]

# Quick sanity check: split sizes and class balance of the training target.
print("Train shape:", X_train.shape, "Test shape:", X_test.shape)
print("Target distribution (train):\n", y_train.value_counts(normalize=True))


Train shape: (175341, 54) Test shape: (82332, 54)
Target distribution (train):
 label
1    0.680622
0    0.319378
Name: proportion, dtype: float64


In [4]:
# cv & evaluation helper functions
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

# Shared splitter for all model comparisons: 5 shuffled, stratified folds,
# seeded with RANDOM_STATE so the fold assignment is repeatable across runs.
cv = StratifiedKFold(
    random_state=RANDOM_STATE,
    shuffle=True,
    n_splits=5,
)

def cv_scores(model, X, y, cv=cv, scoring=None, n_jobs=1):
    """Cross-validate ``model`` and return the mean of each test metric.

    Parameters
    ----------
    model : estimator
        Passed unchanged to ``cross_validate``.
    X, y : array-like
        Features and target passed unchanged to ``cross_validate``.
    cv : cross-validation splitter, optional
        Defaults to the module-level ``cv`` splitter.
    scoring : list of str, optional
        Metrics to evaluate. ``None`` (the default) uses accuracy, precision,
        recall, f1 and roc_auc.
    n_jobs : int, optional
        Forwarded to ``cross_validate``; defaults to 1.

    Returns
    -------
    dict
        Maps each metric name (the result key with its leading ``'test_'``
        removed) to the mean of that metric across the folds.
    """
    if scoring is None:
        scoring = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']

    res = cross_validate(model, X, y, cv=cv, scoring=scoring, n_jobs=n_jobs)

    # Strip only the leading prefix: str.replace would also remove any later
    # 'test_' occurrence inside a metric name.
    prefix = 'test_'
    return {
        key[len(prefix):]: np.mean(values)
        for key, values in res.items()
        if key.startswith(prefix)
    }
