# First models

In [None]:
import numpy as np
import optuna
import pandas as pd
import sklearn
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier

In [None]:
# Load the dataset from a pickle file; the path is relative to the current
# working directory.
# NOTE(review): unpickling can execute arbitrary code, so this file must only
# ever come from a trusted source.
df = pd.read_pickle("../data_tcab/whole_feature_dataset.pickle")

In [None]:
# Column-name groups, selected by prefix.  The "tp_" prefix also matches every
# "tp_bert_" column, so tp_bert_features is a subset of tp_features.
tp_features = [col for col in df.columns if col.startswith("tp_")]
tp_bert_features = [col for col in df.columns if col.startswith("tp_bert_")]
lm_features = [col for col in df.columns if col.startswith("lm_")]
tm_features = [col for col in df.columns if col.startswith("tm_")]

In [None]:
# Identifier / metadata columns; they are separated from the remaining
# columns below.
id_vars = [
    "unique_id", "attack_name", "attack_toolchain", "attack_id",
    "scenario", "target_model", "target_model_dataset", "attack_id_bis"
]

# Metadata frame plus the binary target: 0 where attack_name is "clean",
# 1 for every other value.
index_df = df[id_vars].assign(
    label=np.where(df["attack_name"] == "clean", 0, 1)
)

# Everything that is not an identifier column.
var_df = df.drop(columns=id_vars)

In [None]:
# Stratified 80/20 split with a fixed seed.  The split is drawn over the row
# labels of var_df, and the feature matrices are then looked up by label.
# NOTE(review): the .loc lookups assume the index of var_df is unique.
train_idx, test_idx, y_train, y_test = train_test_split(
    var_df.index,
    index_df["label"],
    stratify=index_df["label"],
    random_state=42,
    test_size=0.2,
)
X_train = var_df.loc[train_idx]
X_test = var_df.loc[test_idx]

In [None]:
# Baseline model: standardise the columns, project onto 100 principal
# components, then fit a logistic regression.  The report is computed on the
# held-out split.
pipe = make_pipeline(
    StandardScaler(),
    # Seeded like the classifier so that the projection is repeatable even
    # when PCA selects a randomized SVD solver.
    PCA(n_components=100, random_state=42),
    LogisticRegression(max_iter=1000, random_state=42),
)

pipe.fit(X_train, y_train)

y_pred = pipe.predict(X_test)

print(classification_report(y_test, y_pred))

In [None]:
# 1. Define an objective function to be maximized.
def objective(trial: "optuna.Trial") -> float:
    """Build, fit and score one candidate pipeline for an Optuna trial.

    The trial picks a feature subset, a classifier together with its
    hyper-parameters, and an optional scaling / PCA step.  The resulting
    pipeline is fitted on ``X_train`` / ``y_train`` and scored on
    ``X_test`` / ``y_test``.

    NOTE(review): every trial is scored on the same held-out split that is
    used to report results, so the best trial's accuracy is optimistically
    biased.  Consider scoring on a separate validation split or with
    cross-validation on the training data.

    Args:
        trial: Optuna trial used to sample the configuration.

    Returns:
        Accuracy of the fitted pipeline on the held-out split.

    Raises:
        ValueError: If the trial yields an unknown classifier or scaler name.
    """
    feature_subset = trial.suggest_categorical(
        "feature_subset",
        ["tp_features", "tp_bert_features", "lm_features", "tm_features"],
    )
    # The "lm" and "tm" options are cumulative: each one adds its own columns
    # on top of the previous groups.
    feature_dict = {
        "tp_bert_features": tp_bert_features,
        "tp_features": tp_features,
        "lm_features": tp_features + lm_features,
        "tm_features": tp_features + lm_features + tm_features,
    }
    selected_columns = feature_dict[feature_subset]
    feature_selector = ColumnTransformer(
        transformers=[("selector", "passthrough", selected_columns)],
        remainder="drop",
    )

    classifier_name = trial.suggest_categorical(
        "classifier", ["LogisticRegression", "XGBoost"]
    )
    if classifier_name == "LogisticRegression":
        lr_c = trial.suggest_float("lr_c", 1e-10, 1e10, log=True)
        # NOTE(review): with penalty=None scikit-learn presumably ignores C;
        # confirm against the installed version.
        lr_penalty = trial.suggest_categorical("lr_penalty", [None, "l1", "l2"])
        classifier_obj = LogisticRegression(
            C=lr_c,
            penalty=lr_penalty,
            solver="saga",
            max_iter=1000,
            # saga is a stochastic solver; seed it so trials are repeatable.
            random_state=42,
        )
    elif classifier_name == "XGBoost":
        xgb_max_depth = trial.suggest_int("xgb_max_depth", 2, 32, log=True)
        xgb_n_estimators = trial.suggest_int("xgb_n_estimators", 10, 500, log=True)
        classifier_obj = XGBClassifier(
            max_depth=xgb_max_depth,
            n_estimators=xgb_n_estimators,
            random_state=42,
        )
    else:
        raise ValueError(f"Unknown classifier: {classifier_name!r}")

    scaler_name = trial.suggest_categorical(
        "scaler", ["None", "StandardScaler", "StandardScaler+PCA"]
    )
    if scaler_name == "None":
        preprocessor = Pipeline([("preprocessor", "passthrough")])
    elif scaler_name == "StandardScaler":
        preprocessor = Pipeline([("scaler", StandardScaler())])
    elif scaler_name == "StandardScaler+PCA":
        pca_n_components = trial.suggest_int("pca_n_components", 25, 250, log=True)
        # PCA rejects n_components larger than min(n_samples, n_features).
        # A small feature subset would therefore raise and abort the whole
        # study, so cap the sampled value at what the data supports.
        max_components = min(len(selected_columns), len(X_train))
        effective_n_components = min(pca_n_components, max_components)
        # Keep the value actually used alongside the sampled parameter.
        trial.set_user_attr("pca_effective_n_components", effective_n_components)
        preprocessor = Pipeline(
            [
                ("scaler", StandardScaler()),
                ("pca", PCA(n_components=effective_n_components, random_state=42)),
            ]
        )
    else:
        raise ValueError(f"Unknown scaler: {scaler_name!r}")

    model = Pipeline(
        [
            ("selector", feature_selector),
            ("preprocessor", preprocessor),
            ("classifier", classifier_obj),
        ]
    )
    model.fit(X_train, y_train)

    return accuracy_score(y_test, model.predict(X_test))

In [None]:
# 3. Create a study object and optimize the objective function.
# The value returned by objective() is maximised over 100 trials; n_jobs=-1
# asks Optuna to run trials in parallel.
study = optuna.create_study(direction="maximize")
study.optimize(
    objective,
    n_trials=100,
    n_jobs=-1,
    show_progress_bar=True,
)