## **Optuna Tuning**

In [1]:
import optuna 
from optuna import Trial

import pandas as pd 
import numpy as np 

**Data**

In [2]:
data = pd.read_csv("data_v/04_log_scaled_data.csv") 

In [3]:
data.Label.info()

<class 'pandas.core.series.Series'>
RangeIndex: 3189766 entries, 0 to 3189765
Series name: Label
Non-Null Count    Dtype  
--------------    -----  
3189766 non-null  float64
dtypes: float64(1)
memory usage: 24.3 MB


Taking sample of data

In [4]:
debug = False
# Only using 10000 data,,, for fast computation for debugging.
train_df = data.sample(10000)

In [5]:
# Split features and label
X = train_df.drop('Label', axis=1)
y = train_df['Label']

In [6]:
# Split train and test data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [7]:
# Print shapes of each 
print("Shape of X:", X.shape)
print("Shape of y:", y.shape)
print("Shape of X_train:", X_train.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of X_test:", X_test.shape)
print("Shape of y_test:", y_test.shape)

Shape of X: (10000, 86)
Shape of y: (10000,)
Shape of X_train: (8000, 86)
Shape of y_train: (8000,)
Shape of X_test: (2000, 86)
Shape of y_test: (2000,)


In [8]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, classification_report

model = RandomForestClassifier(random_state=32)

model.fit(X_train,y_train)
preds = model.predict(X_test)

print(classification_report(y_test, preds))
print(f"Global F1: {f1_score(y_test, preds, average='weighted')}")

              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00      1832
         1.0       1.00      1.00      1.00        55
         2.0       1.00      1.00      1.00       104
         7.0       0.00      0.00      0.00         1
        11.0       1.00      0.88      0.93         8

    accuracy                           1.00      2000
   macro avg       0.80      0.78      0.79      2000
weighted avg       1.00      1.00      1.00      2000

Global F1: 0.9987336061102019


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


**Define Optuna Tuner**

In [9]:
import optuna

class FeatureSelectionOptuna:
    """
    This class implements feature selection using Optuna optimization framework.

    Parameters:

    - model (object): The predictive model to evaluate; this should be any object that implements fit() and predict() methods.
    - loss_fn (function): The loss function to use for evaluating the model performance. This function should take the true labels and the
                          predictions as inputs and return a loss value.
    - features (list of str): A list containing the names of all possible features that can be selected for the model.
    - X (DataFrame): The complete set of feature data (pandas DataFrame) from which subsets will be selected for training the model.
    - y (Series): The target variable associated with the X data (pandas Series).
    - splits (list of tuples): A list of tuples where each tuple contains two elements, the train indices and the validation indices.
    - penalty (float, optional): A factor used to penalize the objective function based on the number of features used.
    """

    def __init__(self,
                 model,
                 loss_fn,
                 features,
                 X,
                 y,
                 splits,
                 penalty=0):

        self.model = model
        self.loss_fn = loss_fn
        self.features = features
        self.X = X
        self.y = y
        self.splits = splits
        self.penalty = penalty

    def __call__(self,
                 trial: optuna.trial.Trial):

        # Select True / False for each feature
        selected_features = [trial.suggest_categorical(name, [True, False]) for name in self.features]

        # List with names of selected features
        selected_feature_names = [name for name, selected in zip(self.features, selected_features) if selected]

        # Optional: adds a penalty for the amount of features used
        n_used = len(selected_feature_names)
        total_penalty = n_used * self.penalty

        loss = 0

        for split in self.splits:
          train_idx = split[0]
          valid_idx = split[1]

          X_train = self.X.iloc[train_idx].copy()
          y_train = self.y.iloc[train_idx].copy()
          X_valid = self.X.iloc[valid_idx].copy()
          y_valid = self.y.iloc[valid_idx].copy()

          X_train_selected = X_train[selected_feature_names].copy()
          X_valid_selected = X_valid[selected_feature_names].copy()

          # Train model, get predictions and accumulate loss
          self.model.fit(X_train_selected, y_train)
          pred = self.model.predict(X_valid_selected)

          loss += self.loss_fn(y_valid, pred)

        # Take the average loss across all splits
        loss /= len(self.splits)

        # Add the penalty to the loss
        loss += total_penalty

        return loss

Loss Function (Objective Function)

In [10]:
from optuna.samplers import TPESampler

def loss_fn(y_true, y_pred):
  """
  Returns the negative F1 score, to be treated as a loss function.
  """
  res = -f1_score(y_true, y_pred, average='weighted')
  return res


In [13]:
from sklearn.model_selection import StratifiedKFold

# Splits
# Stratified kfold over the train set for cross validation
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)
splits = list(skf.split(X_train, y_train))



In [14]:
splits

[(array([   0,    1,    3, ..., 7997, 7998, 7999]),
  array([   2,   12,   23, ..., 7983, 7990, 7996])),
 (array([   0,    1,    2, ..., 7994, 7996, 7999]),
  array([   6,   18,   21, ..., 7995, 7997, 7998])),
 (array([   0,    2,    4, ..., 7996, 7997, 7998]),
  array([   1,    3,    7, ..., 7989, 7992, 7999])),
 (array([   1,    2,    3, ..., 7997, 7998, 7999]),
  array([   0,    4,   11, ..., 7988, 7993, 7994])),
 (array([   0,    1,    2, ..., 7997, 7998, 7999]),
  array([   5,   10,   15, ..., 7980, 7986, 7987]))]

Run Trial 

In [15]:
# Implementation For Feature Selection!
features = list(X_train.columns)
SEED=32
model = RandomForestClassifier(random_state=SEED)

sampler = TPESampler(seed = SEED)
study = optuna.create_study(direction="minimize",sampler=sampler)

# We first try the model using all features
default_features = {ft: True for ft in features}
study.enqueue_trial(default_features)

study.optimize(FeatureSelectionOptuna(
                         model=model,
                         loss_fn=loss_fn,
                         features=features,
                         X=X_train,
                         y=y_train,
                         splits=splits,
                         penalty = 1e-4,
                         ), n_trials=100)

[I 2024-12-03 22:00:14,332] A new study created in memory with name: no-name-9d2a3df2-53f8-438e-b91a-51c3c8aee48a


[I 2024-12-03 22:00:23,433] Trial 0 finished with value: -0.9897079868388685 and parameters: {'Flow ID': True, 'Src IP': True, 'Src Port': True, 'Dst IP': True, 'Dst Port': True, 'Protocol': True, 'Flow Duration': True, 'Total Fwd Packet': True, 'Total Bwd packets': True, 'Total Length of Fwd Packet': True, 'Total Length of Bwd Packet': True, 'Fwd Packet Length Max': True, 'Fwd Packet Length Min': True, 'Fwd Packet Length Mean': True, 'Fwd Packet Length Std': True, 'Bwd Packet Length Max': True, 'Bwd Packet Length Min': True, 'Bwd Packet Length Mean': True, 'Bwd Packet Length Std': True, 'Flow Bytes/s': True, 'Flow Packets/s': True, 'Flow IAT Mean': True, 'Flow IAT Std': True, 'Flow IAT Max': True, 'Flow IAT Min': True, 'Fwd IAT Total': True, 'Fwd IAT Mean': True, 'Fwd IAT Std': True, 'Fwd IAT Max': True, 'Fwd IAT Min': True, 'Bwd IAT Total': True, 'Bwd IAT Mean': True, 'Bwd IAT Std': True, 'Bwd IAT Max': True, 'Bwd IAT Min': True, 'Fwd PSH Flags': True, 'Bwd PSH Flags': True, 'Fwd RST

In [None]:
# get features with their scores 
results = [name for name, selected in study.best_params.items() if selected]

In [25]:
print(
    f"Best Features Selected:"
)
results

Best Features Selected:


['Dst IP',
 'Total Fwd Packet',
 'Total Bwd packets',
 'Fwd Packet Length Max',
 'Bwd Packet Length Mean',
 'Flow IAT Std',
 'Flow IAT Max',
 'Fwd IAT Total',
 'Fwd IAT Min',
 'Bwd IAT Total',
 'Bwd IAT Min',
 'Bwd PSH Flags',
 'Bwd RST Flags',
 'Fwd Header Length',
 'Fwd Packets/s',
 'Bwd Segment Size Avg',
 'Subflow Bwd Packets',
 'Subflow Bwd Bytes',
 'Bwd Init Win Bytes',
 'Active Max',
 'Idle Max',
 'Idle Min',
 'Total TCP Flow Time',
 'Month',
 'Day']