# First models

In [2]:
import numpy as np
import optuna
import pandas as pd
import sklearn
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler
from xgboost import XGBClassifier

from nlp_adversarial_attacks.reactdetect.utils.magic_vars import PRIMARY_KEY_FIELDS

In [3]:
# --- Experiment configuration: every value below is read by a later cell ---

# Input data: pickled feature DataFrame (path relative to the notebook).
path_to_pickle = "../data_tcab/whole_feature_dataset_with_canine.pickle"

# Task and model selection.
objective = "binary"  # "binary" -> clean/attack label; anything else -> `attack_name`
model_type = "xgboost"  # "lr" or "xgboost" (see objective_function)
feature_set = "tlc"  # key into the `feature_sets` dict built after loading

# Preprocessing.
scaler = "StandardScaler"  # "None", "StandardScaler" or "MinMaxScaler"
pca = False  # True adds a PCA step whose n_components is tuned

# Optuna search budget (both are forwarded to study.optimize).
n_trials = 10
n_jobs = -1

In [4]:
# Load the feature table, then discard rows with missing values and exact
# duplicate rows.
# NOTE(review): unpickling can execute arbitrary code; only read pickles that
# come from a trusted source.
df = (
    pd.read_pickle(path_to_pickle)
    .dropna()
    .drop_duplicates()
)

# Group the feature columns by their name prefix
def _columns_with_prefix(frame, prefix):
    """Return the names of the columns of `frame` that start with `prefix`."""
    return frame.columns[frame.columns.str.startswith(prefix)].tolist()


tp_features = _columns_with_prefix(df, "tp_")  # also matches the "tp_bert_" columns
tp_bert_features = _columns_with_prefix(df, "tp_bert_")
lm_features = _columns_with_prefix(df, "lm_")
tm_features = _columns_with_prefix(df, "tm_")
canine_features = _columns_with_prefix(df, "canine_tp_bert_")

# Named feature sets, built incrementally; `feature_set` picks the active one
_tl_features = tp_features + lm_features
_tlc_features = _tl_features + tm_features
feature_sets = {
    "bert": tp_bert_features,
    "t": tp_features,
    "tl": _tl_features,
    "tlc": _tlc_features,
    "canine": canine_features,
    "tlc_canine": _tlc_features + canine_features,
}
features = feature_sets[feature_set]

# Split the data into the identifier/label table and the feature matrix
id_vars = ["unique_id"] + PRIMARY_KEY_FIELDS
index_df = df.loc[:, id_vars]
# Binary target derived from `attack_name` (which must therefore be one of
# `id_vars`): "clean" rows keep that label, every other row is an "attack".
index_df = index_df.assign(
    label=np.where(index_df["attack_name"] == "clean", "clean", "attack")
)
var_df = df.loc[:, features]

# The full frame is no longer needed once both views have been extracted
del df

# Split train and test sets
# The binary objective predicts the derived "label" column; any other objective
# predicts the raw attack name.
label_var = "label" if objective == "binary" else "attack_name"
train_idx, test_idx = train_test_split(
    var_df.index,
    test_size=0.2,
    random_state=42,
    # Stratify on the target so both splits keep the same class proportions and
    # every class seen at test time is also present when the label encoder is
    # fitted on the training labels. `var_df` and `index_df` come from the same
    # frame, so their rows are positionally aligned.
    # Note: scikit-learn raises ValueError if a class has fewer than 2 samples.
    stratify=index_df[label_var],
)
X_train, X_test = var_df.loc[train_idx], var_df.loc[test_idx]
y_train, y_test = index_df.loc[train_idx, label_var], index_df.loc[test_idx, label_var]

# Free the full feature matrix; only the two splits are used from here on
del var_df

# Encode the labels as integers; the encoder is fitted on the training labels only
le = LabelEncoder()
y_train_enc = le.fit_transform(y_train)
y_test_enc = le.transform(y_test)

# Reverse lookup: encoded integer -> original label
int_to_label = dict(enumerate(le.classes_))

In [5]:
# Define the objective function
def objective_function(trial):
    """Train one candidate pipeline and return its accuracy for Optuna to maximize.

    The pipeline layout is driven by the notebook-level settings ``model_type``,
    ``scaler`` and ``pca``; the hyperparameters are sampled from ``trial``.

    NOTE(review): the score is computed on ``X_test``, so the best trial's
    accuracy is an optimistic estimate; consider tuning on a separate
    validation split.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the classifier (and optional PCA) hyperparameters.

    Returns
    -------
    float
        Accuracy of the fitted pipeline on ``X_test`` / ``y_test_enc``.

    Raises
    ------
    ValueError
        If ``model_type`` or ``scaler`` is not one of the supported values.
    """
    # Fail fast on configuration typos: previously an unknown model type only
    # surfaced later as a NameError, and an unknown scaler silently trained
    # without any scaling.
    supported_models = ("lr", "xgboost")
    if model_type not in supported_models:
        raise ValueError(
            f"Unsupported model_type {model_type!r}; expected one of {supported_models}"
        )
    supported_scalers = ("None", "StandardScaler", "MinMaxScaler")
    if scaler not in supported_scalers:
        raise ValueError(
            f"Unsupported scaler {scaler!r}; expected one of {supported_scalers}"
        )

    # Classifier and its search space
    if model_type == "lr":
        lr_c = trial.suggest_float("lr_c", 1e-5, 1e5, log=True)
        classifier_obj = LogisticRegression(C=lr_c, max_iter=100)
    else:  # "xgboost"
        xgb_max_depth = trial.suggest_int("xgb_max_depth", 2, 16, log=True)
        xgb_n_estimators = trial.suggest_int("xgb_n_estimators", 10, 100, log=True)
        classifier_obj = XGBClassifier(
            max_depth=xgb_max_depth, n_estimators=xgb_n_estimators
        )

    # Preprocessing steps (scaler and PCA)
    print("-- Building pipeline")
    steps = []
    if scaler == "None":
        steps.append(("scaler", "passthrough"))
    elif scaler == "StandardScaler":
        steps.append(("scaler", StandardScaler()))
    else:  # "MinMaxScaler"
        steps.append(("scaler", MinMaxScaler()))

    if pca:
        pca_n_components = trial.suggest_int("pca_n_components", 25, 250, log=True)
        steps.append(("pca", PCA(n_components=pca_n_components)))

    steps.append(("classifier", classifier_obj))
    pipe = Pipeline(steps)
    print(pipe)

    print("-- Training")
    pipe.fit(X_train, y_train_enc)

    print("-- Evaluating")
    y_pred = pipe.predict(X_test)
    accuracy = accuracy_score(y_test_enc, y_pred)

    return accuracy

In [6]:
# Name the study after the configuration so each setup gets its own SQLite file
study_name = f"{objective}_{model_type}_{feature_set}_{scaler.lower()}_{'pca' if pca else 'nopca'}"
storage_name = f"sqlite:///{study_name}.db"

# Reuse the stored study when one with this name already exists, otherwise
# create it. (For a throwaway in-memory study, omit `storage` and
# `load_if_exists`.)
study = optuna.create_study(
    direction="maximize",
    study_name=study_name,
    storage=storage_name,
    load_if_exists=True,
)

# Run the search with the budget set in the configuration cell
study.optimize(
    objective_function,
    n_trials=n_trials,
    n_jobs=n_jobs,
    show_progress_bar=True,
)

[32m[I 2023-03-15 20:09:55,359][0m A new study created in RDB with name: binary_xgboost_tlc_standardscaler_nopca[0m
  self._init_valid()


  0%|          | 0/10 [00:00<?, ?it/s]

-- Building pipeline
Pipeline(steps=[('scaler', StandardScaler()),
                ('classifier',
                 XGBClassifier(base_score=None, booster=None, callbacks=None,
                               colsample_bylevel=None, colsample_bynode=None,
                               colsample_bytree=None,
                               early_stopping_rounds=None,
                               enable_categorical=False, eval_metric=None,
                               feature_types=None, gamma=None, gpu_id=None,
                               grow_policy=None, importance_type=None,
                               interaction_constraints=None, learning_rate=None,
                               max_bin=None, max_cat_threshold=None,
                               max_cat_to_onehot=None, max_delta_step=None,
                               max_depth=8, max_leaves=None,
                               min_child_weight=None, missing=nan,
                               monotone_constraints=None,

In [None]:
# Reload a feature table and rebind `df`, which the data-preparation cell
# deleted. This is a different pickle from the one named by `path_to_pickle`.
# NOTE(review): unpickling can execute arbitrary code; only read trusted files.
df = pd.read_pickle("../data_tcab/whole_feature_dataset.pickle")

In [None]:
# Column names grouped by prefix, in the same order as the prefixes below
# ("tp_" also matches the "tp_bert_" columns).
tp_features, tp_bert_features, lm_features, tm_features = (
    df.columns[df.columns.str.startswith(prefix)].tolist()
    for prefix in ("tp_", "tp_bert_", "lm_", "tm_")
)

In [None]:
# Identifier / metadata columns, kept apart from the model features
id_vars = [
    "unique_id",
    "attack_name",
    "attack_toolchain",
    "attack_id",
    "scenario",
    "target_model",
    "target_model_dataset",
    "attack_id_bis",
]

# Feature matrix: every column that is not an identifier
var_df = df.drop(columns=id_vars)

# Identifier table plus the binary target: 0 for "clean" rows, 1 otherwise
index_df = df.loc[:, id_vars].assign(
    label=lambda frame: np.where(frame["attack_name"] == "clean", 0, 1)
)

In [None]:
# Stratified 80/20 split: the row index is split together with the labels, so
# the features can be looked up by index afterwards.
train_idx, test_idx, y_train, y_test = train_test_split(
    var_df.index,
    index_df["label"],
    stratify=index_df["label"],
    test_size=0.2,
    random_state=42,
)
X_train, X_test = (var_df.loc[idx] for idx in (train_idx, test_idx))

In [None]:
# Baseline: standardize, project onto 100 principal components, then fit a
# logistic regression.
pipe = make_pipeline(
    StandardScaler(),
    PCA(n_components=100),
    LogisticRegression(max_iter=1000, random_state=42),
).fit(X_train, y_train)

# Classification report on the held-out test set
y_pred = pipe.predict(X_test)
print(classification_report(y_test, y_pred))

In [None]:
# 1. Define an objective function to be maximized.
def objective(trial):
    """Sample a feature subset, a classifier and a preprocessing, then score them.

    NOTE(review): this function is named ``objective``, the same name the
    configuration cell binds to a string (``objective = "binary"``). Defining it
    rebinds that name, so cells that read the string must not be re-run
    afterwards; consider renaming one of the two.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample every choice and hyperparameter.

    Returns
    -------
    float
        Accuracy of the fitted pipeline on ``X_test`` / ``y_test``.
    """
    # --- Feature subset ---------------------------------------------------
    # "lm_features" and "tm_features" are cumulative: they also contain the
    # groups listed before them.
    feature_dict = {
        "tp_bert_features": tp_bert_features,
        "tp_features": tp_features,
        "lm_features": tp_features + lm_features,
        "tm_features": tp_features + lm_features + tm_features,
    }
    feature_subset = trial.suggest_categorical(
        "feature_subset",
        ["tp_features", "tp_bert_features", "lm_features", "tm_features"],
    )
    selected_columns = feature_dict[feature_subset]
    feature_selector = ColumnTransformer(
        transformers=[("selector", "passthrough", selected_columns)],
        remainder="drop",
    )

    # --- Classifier -------------------------------------------------------
    classifier_name = trial.suggest_categorical(
        "classifier", ["LogisticRegression", "XGBoost"]
    )
    if classifier_name == "XGBoost":
        classifier_obj = XGBClassifier(
            max_depth=trial.suggest_int("xgb_max_depth", 2, 32, log=True),
            n_estimators=trial.suggest_int("xgb_n_estimators", 10, 500, log=True),
        )
    elif classifier_name == "LogisticRegression":
        classifier_obj = LogisticRegression(
            C=trial.suggest_float("lr_c", 1e-10, 1e10, log=True),
            penalty=trial.suggest_categorical("lr_penalty", [None, "l1", "l2"]),
            solver="saga",
            max_iter=1000,
        )

    # --- Preprocessing ----------------------------------------------------
    scaler_name = trial.suggest_categorical(
        "scaler", ["None", "StandardScaler", "StandardScaler+PCA"]
    )
    if scaler_name == "None":
        preprocessor_steps = [("preprocessor", "passthrough")]
    elif scaler_name == "StandardScaler":
        preprocessor_steps = [("scaler", StandardScaler())]
    elif scaler_name == "StandardScaler+PCA":
        n_components = trial.suggest_int("pca_n_components", 25, 250, log=True)
        preprocessor_steps = [
            ("scaler", StandardScaler()),
            ("pca", PCA(n_components=n_components)),
        ]
    preprocessor = Pipeline(preprocessor_steps)

    # --- Fit and score ----------------------------------------------------
    pipe = Pipeline(
        [
            ("selector", feature_selector),
            ("preprocessor", preprocessor),
            ("classifier", classifier_obj),
        ]
    )
    pipe.fit(X_train, y_train)

    return accuracy_score(y_test, pipe.predict(X_test))

In [None]:
# 3. Create a study object and optimize the objective function.
# No storage is passed here, so unlike the SQLite-backed study created earlier
# the trials of this study are not written to a database file.
# NOTE: this rebinds `study`, replacing the study created earlier in the notebook.
study = optuna.create_study(direction="maximize")
# 100 trials; n_jobs=-1 lets Optuna run the trials in parallel.
study.optimize(objective, n_trials=100, show_progress_bar=True, n_jobs=-1)