In [7]:
import mlflow 
import pandas as pd 
import numpy as np 
import os 
import json 
import pickle 

**Data Loading and Process**

In [8]:
# Load data 
data = pd.read_csv('05_data_pca.csv', index_col=False) # this csv file will be removed later, it is only used for experimenting
data = data.drop(columns='Unnamed: 0')

In [9]:
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
from collections import Counter


class UnderSampling:
    def __init__(self, sampling_strategy: dict):
        self.sampling_strategy = sampling_strategy

    def select(self, data: pd.DataFrame) -> pd.DataFrame:
        """UnderSampling of the majority class."""
        df_sampled = data.copy()

        X = df_sampled.drop(columns=['target'])
        y = df_sampled['target']

        # Apply Random UnderSampling to handle class imbalance
        rus = RandomUnderSampler(sampling_strategy=self.sampling_strategy, random_state=42)
        X_res, y_res = rus.fit_resample(X, y)

        # Convert the resampled arrays back to a DataFrame
        data_undersampled = pd.DataFrame(X_res, columns=X.columns)
        data_undersampled['target'] = y_res

        return data_undersampled


# Define a class for Over Sampling
class OverSampling:
    def __init__(self, sampling_strategy: dict): 
        self.sampling_strategy = sampling_strategy

    def select(self, data: pd.DataFrame) -> pd.DataFrame:
        """OverSampling of the minority class."""
        df_sampled = data.copy()
        
        X = df_sampled.drop(columns=['target'])
        y = df_sampled['target']

        # Process each column
        # Apply SMOTE to handle class imbalance
        smote = SMOTE(sampling_strategy=self.sampling_strategy, random_state=42)
        X_res, y_res = smote.fit_resample(X, y)

        # Convert the resampled arrays back to a DataFrame
        df_resampled = pd.DataFrame(X_res, columns=X.columns)
        df_resampled['target'] = y_res

        return df_resampled

In [10]:
# sampling 
oversampling_strategy = {8: 4000, 6: 3000, 3: 2000, 4: 2000, 7: 2000, 10: 1800, 5: 1500, 9: 1500}
undersampling_strategy = {0: 10000, 1: 8000, 2: 5000}

try:
    under_sampler = UnderSampling(sampling_strategy=undersampling_strategy)
    data_undersampled = under_sampler.select(data)
    print("Undersampling completed!") 
except Exception as e: 
    print(f'Error in undersampling: {e}')

try: 
    over_sampler = OverSampling(sampling_strategy=oversampling_strategy)
    data_resampled = over_sampler.select(data_undersampled)
    print("Oversampling completed!")
except Exception as e:
    print(f'Error in oversampling: {e}')

Undersampling completed!
Oversampling completed!


In [11]:
# save 
data_resampled.to_csv('06_data_resampled.csv', index=False) 

**Experiments**

In [12]:
from sklearn.model_selection import train_test_split 

In [13]:
# features and labels
features = data_resampled.drop(columns='target')
label = data_resampled['target']

# split
x_train, x_test, y_train, y_test = train_test_split(features, label, test_size=0.2, random_state=42)

In [14]:
# Shapes 
print(f"Train features shape: {x_train.shape}")
print(f"Test features shape: {x_test.shape}")
print(f"Train label shape: {y_train.shape}")
print(f"Test label shape: {y_test.shape}")

Train features shape: (39617, 9)
Test features shape: (9905, 9)
Train label shape: (39617,)
Test label shape: (9905,)


Models

In [15]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
import lightgbm as lgb
from lightgbm import LGBMClassifier
from sklearn.svm import SVC
import xgboost as xgb
import time 

In [16]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import cross_val_score

**Create Experiment**

In [17]:
import mlflow 

In [22]:
if mlflow.get_experiment_by_name("Misuse_Detection_in_Containers") == None:
    exp_id = mlflow.create_experiment(
        "Misuse_Detection_in_Containers", 
        artifact_location="/artifacts",
        tags={"trial": "1"}
    )
else: 
    exp_id = mlflow.get_experiment_by_name("Misuse_Detection_in_Containers")

In [25]:
print(f"Experiment: {exp_id}")

Experiment: <Experiment: artifact_location='mlflow-artifacts:/602439105628882616', creation_time=1733309188848, experiment_id='602439105628882616', last_update_time=1733309188848, lifecycle_stage='active', name='Misuse_Detection_in_Containers', tags={}>


In [30]:
exp_id.experiment_id 

'602439105628882616'

helper methods

In [39]:
def mlflow_logger(model, model_name, params, train_time, x_test, y_test):
    # predictions 
    pred = model.predict(x_test)
    print("--logging metrics--")
    # metrics 
    acc = accuracy_score(y_test, pred)
    cm = confusion_matrix(y_test, pred)
    # log metric 
    mlflow.log_metric("accuracy", acc)
    mlflow.log_metric("f1_score", 2*(0.5*acc)/(0.5+acc))
    mlflow.log_metric("precision", cm[1,1]/(cm[0,1]+cm[1,1]))
    mlflow.log_metric("recall", cm[1,1]/(cm[1,0]+cm[1,1]))
    print("--logging parameters--")
    # parameters
    for param_name, param_value in params.items():
        mlflow.log_param(param_name, param_value)
    print("--logging model--")
    # save models 
    mlflow.sklearn.log_model(model, f"artifacts/{model_name}")
    # locally 
    with open(f"artifacts/{model_name}.pkl", "wb") as f:
        pickle.dump(model, f)

KNN

In [40]:
# KNN 
with mlflow.start_run(experiment_id=exp_id.experiment_id, run_name="knn_v1"):
    start = time.time()
    model = KNeighborsClassifier(
        n_neighbors=5,
        weights='uniform', 
        algorithm='auto',
        metric='minkowski',
        p=1
    )

    print("--training model--")
    history = model.fit(x_train, y_train)
    end = time.time()
    train_time = end-start

    # log
    mlflow_logger(model, "knn_v1", model.get_params(), train_time, x_test, y_test)

    mlflow.end_run()

--training model--
--logging metrics--
--logging parameters--
--logging model--


In [41]:
# KNN v2
with mlflow.start_run(experiment_id=exp_id.experiment_id, run_name="knn_v2"):
    start = time.time()

    model = KNeighborsClassifier(
        n_neighbors=3, # fewer
        weights='distance', # inverse distance
        algorithm='auto',
        metric='minkowski',
        p=1
    )

    print("--training model--")
    history = model.fit(x_train, y_train)
    end = time.time()
    train_time = end-start

    # log
    mlflow_logger(model, "knn_v2", model.get_params(), train_time, x_test, y_test)

    mlflow.end_run()

--training model--
--logging metrics--
--logging parameters--
--logging model--


In [43]:
# KNN 

with mlflow.start_run(experiment_id=exp_id.experiment_id, run_name="knn_v3"):
    start = time.time()

    model = KNeighborsClassifier(
        n_neighbors=7, # smoothing predictions
        weights='uniform', 
        algorithm='ball_tree', # special algo for distance computation
        metric='minkowski',
        p=2
    )

    print("--training model--")
    history = model.fit(x_train, y_train)
    end = time.time()
    train_time = end-start

    # log
    mlflow_logger(model, "knn_v3", model.get_params(), train_time, x_test, y_test)

    mlflow.end_run()

--training model--
--logging metrics--
--logging parameters--
--logging model--


In [45]:
# KNN 

with mlflow.start_run(experiment_id=exp_id.experiment_id, run_name="knn_v4"):
    start = time.time()

    model = KNeighborsClassifier(
        n_neighbors=4,
        weights='distance', 
        algorithm='brute',
        metric='cosine',
        p=3
    )

    print("--training model--")
    history = model.fit(x_train, y_train)
    end = time.time()
    train_time = end-start

    # log
    mlflow_logger(model, "knn_v4", model.get_params(), train_time, x_test, y_test)

    mlflow.end_run()

--training model--
--logging metrics--
--logging parameters--
--logging model--


Gradient Boosting Classifier

In [None]:
# GB

with mlflow.start_run(experiment_id=exp_id.experiment_id, run_name="gb_v1"):
    start = time.time()

    model = GradientBoostingClassifier(
        n_estimators=100,
        learning_rate=0.1,
        max_depth=3,
        random_state=42
    )

    print("--training model--")
    history = model.fit(x_train, y_train)
    end = time.time()
    train_time = end-start

    # log
    mlflow_logger(model, "gb_v1", model.get_params(), train_time, x_test, y_test)

    mlflow.end_run()

--training model--
--logging metrics--
--logging parameters--
--logging model--


In [47]:
# GB

with mlflow.start_run(experiment_id=exp_id.experiment_id, run_name="gb_v2"):
    start = time.time()

    model = GradientBoostingClassifier(
        n_estimators=50,
        learning_rate=0.01,
        max_depth=4,
        random_state=42
    )

    print("--training model--")
    history = model.fit(x_train, y_train)
    end = time.time()
    train_time = end-start

    # log
    mlflow_logger(model, "gb_v2", model.get_params(), train_time, x_test, y_test)

    mlflow.end_run()

--training model--
--logging metrics--
--logging parameters--
--logging model--


LightGBM 

In [50]:
# LightGBM

with mlflow.start_run(experiment_id=exp_id.experiment_id, run_name="lgbm_v1"):
    start = time.time()

    model = LGBMClassifier(
        n_estimators=100,  # Number of boosting rounds
        learning_rate=0.1,  # Boosting learning rate
        max_depth=1,  # Maximum tree depth for base learners
        random_state=42
    )

    print("--training model--")
    history = model.fit(x_train, y_train)
    end = time.time()
    train_time = end-start

    # log
    mlflow_logger(model, "lgbm_v1", model.get_params(), train_time, x_test, y_test)

    mlflow.end_run()

--training model--
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.011350 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2295
[LightGBM] [Info] Number of data points in the train set: 39617, number of used features: 9
[LightGBM] [Info] Start training from score -1.605081
[LightGBM] [Info] Start training from score -1.814713
[LightGBM] [Info] Start training from score -2.295467
[LightGBM] [Info] Start training from score -3.194366
[LightGBM] [Info] Start training from score -3.222467
[LightGBM] [Info] Start training from score -3.499440
[LightGBM] [Info] Start training from score -2.788491
[LightGBM] [Info] Start training from score -3.210505
[LightGBM] [Info] Start training from score -2.509877
[LightGBM] [Info] Start training from score -3.466569
[LightGBM] [Info] Start training from score -3.320186
[LightGBM] [Info] Start training from score -1.748897
--logging metrics--
--logging parameters--
--

In [51]:
# LightGBM

with mlflow.start_run(experiment_id=exp_id.experiment_id, run_name="lgbm_v2"):
    start = time.time()

    model = LGBMClassifier(
        n_estimators= 200,
        learning_rate=0.05,
        max_depth=-1,  # No depth limit
        num_leaves=31,  # Default; adjusts automatically with max_depth
        random_state=42
    )

    print("--training model--")
    history = model.fit(x_train, y_train)
    end = time.time()
    train_time = end-start

    # log
    mlflow_logger(model, "lgbm_v2", model.get_params(), train_time, x_test, y_test)

    mlflow.end_run()

--training model--
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002607 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2295
[LightGBM] [Info] Number of data points in the train set: 39617, number of used features: 9
[LightGBM] [Info] Start training from score -1.605081
[LightGBM] [Info] Start training from score -1.814713
[LightGBM] [Info] Start training from score -2.295467
[LightGBM] [Info] Start training from score -3.194366
[LightGBM] [Info] Start training from score -3.222467
[LightGBM] [Info] Start training from score -3.499440
[LightGBM] [Info] Start training from score -2.788491
[LightGBM] [Info] Start training from score -3.210505
[LightGBM] [Info] Start training from score -2.509877
[LightGBM] [Info] Start training from score -3.466569
[LightGBM] [Info] Start training from score -3.320186
[LightGBM] [Info] Start training fr

In [52]:
# LightGBM

with mlflow.start_run(experiment_id=exp_id.experiment_id, run_name="lgbm_v3"):
    start = time.time()

    model = LGBMClassifier(
        n_estimators= 500,
        learning_rate=0.01,
        max_depth=5,  # No depth limit
        num_leaves=15,  # Default; adjusts automatically with max_depth
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42
    )

    print("--training model--")
    history = model.fit(x_train, y_train)
    end = time.time()
    train_time = end-start

    # log
    mlflow_logger(model, "lgbm_v3", model.get_params(), train_time, x_test, y_test)

    mlflow.end_run()

--training model--
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.006748 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2295
[LightGBM] [Info] Number of data points in the train set: 39617, number of used features: 9
[LightGBM] [Info] Start training from score -1.605081
[LightGBM] [Info] Start training from score -1.814713
[LightGBM] [Info] Start training from score -2.295467
[LightGBM] [Info] Start training from score -3.194366
[LightGBM] [Info] Start training from score -3.222467
[LightGBM] [Info] Start training from score -3.499440
[LightGBM] [Info] Start training from score -2.788491
[LightGBM] [Info] Start training from score -3.210505
[LightGBM] [Info] Start training from score -2.509877
[LightGBM] [Info] Start training from score -3.466569
[LightGBM] [Info] Start training from score -3.320186
[LightGBM] [Info] Start training from score -1.748897
--logging metrics--
--logging parameters--
--

SVM

In [None]:
# svc

with mlflow.start_run(experiment_id=exp_id.experiment_id, run_name="svm_v1"):
    start = time.time()

    model = SVC(
        C=1.0,
        kernel='poly',
        gamma='auto'
    )

    print("--training model--")
    history = model.fit(x_train, y_train)
    end = time.time()
    train_time = end-start

    # log
    mlflow_logger(model, "svm_v1", model.get_params(), train_time, x_test, y_test)

    mlflow.end_run()

--training model--
--logging metrics--
--logging parameters--
--logging model--


xgb 

In [None]:
# xgb

with mlflow.start_run(experiment_id=exp_id.experiment_id, run_name="xgb_v1"):
    start = time.time()

    model = xgb.XGBClassifier(
        n_estimators=800, 
        learning_rate=0.01, 
        max_depth=2,
        random_state=42,
        use_label_encoder=False,
        eval_metrics='logloss'
    )

    print("--training model--")
    history = model.fit(x_train, y_train)
    end = time.time()
    train_time = end-start

    # log
    mlflow_logger(model, "xgb_v1", model.get_params(), train_time, x_test, y_test)

    mlflow.end_run()

--training model--


Parameters: { "eval_metrics", "use_label_encoder" } are not used.



--logging metrics--
--logging parameters--
--logging model--


In [None]:
# Xgb

with mlflow.start_run(experiment_id=exp_id.experiment_id, run_name="xgb_v2"):
    start = time.time()

    model = xgb.XGBClassifier(
        n_estimators=500, 
        learning_rate=0.1, 
        max_depth=3,
        random_state=42,
        use_label_encoder=False,
    )

    print("--training model--")
    history = model.fit(x_train, y_train)
    end = time.time()
    train_time = end-start

    # log
    mlflow_logger(model, "xgb_v2", model.get_params(), train_time, x_test, y_test)

    mlflow.end_run()

--training model--


Parameters: { "use_label_encoder" } are not used.



--logging metrics--
--logging parameters--
--logging model--


In [58]:
# XGb

with mlflow.start_run(experiment_id=exp_id.experiment_id, run_name="xgb_v3"):
    start = time.time()

    model = xgb.XGBClassifier(
        n_estimators=500, 
        learning_rate=0.08, 
        max_depth=4,
        random_state=42,
        use_label_encoder=False,
    )

    print("--training model--")
    history = model.fit(x_train, y_train)
    end = time.time()
    train_time = end-start

    # log
    mlflow_logger(model, "xgb_v3", model.get_params(), train_time, x_test, y_test)

    mlflow.end_run()

--training model--


Parameters: { "use_label_encoder" } are not used.



--logging metrics--
--logging parameters--
--logging model--


knn_v4 is best model so far

with params: 

- n_neighbors = 4,
- weights = 'distance' 
- algorithm = 'brute'
- metric = 'cosine'
- p = 3

**process the scores.csv**

In [65]:
score_df = pd.read_csv("scores.csv")
# Save to parquet 
score_df.to_parquet("scores.parquet")