In [2]:
import pandas as pd
%load_ext autoreload
%autoreload 2

import sys
sys.path.append('..')

In [3]:
from src.config import mem

In [4]:
RANDOM_SEED = 42
N_JOBS = 24

In [5]:
import optuna
import numpy as np
import pandas as pd
from sklearn.metrics import root_mean_squared_error, roc_auc_score
from sklearn.model_selection import KFold

In [6]:
class FeatureSelectionOptuna:
    """
    This class implements feature selection using Optuna optimization framework.

    Parameters:

    - model (object): The predictive model to evaluate; this should be any object that implements fit() and predict() methods.
    - loss_fn (function): The loss function to use for evaluating the model performance. This function should take the true labels and the
                          predictions as inputs and return a loss value.
    - features (list of str): A list containing the names of all possible features that can be selected for the model.
    - X (DataFrame): The complete set of feature data (pandas DataFrame) from which subsets will be selected for training the model.
    - y (Series): The target variable associated with the X data (pandas Series).
    - splits (list of tuples): A list of tuples where each tuple contains two elements, the train indices and the validation indices.
    - feat_count_penalty (float, optional): A factor used to penalize the objective function based on the number of features used.
    """

    def __init__(self,
                 model,
                 X,
                 y,
                 features=None,
                 loss_fn=roc_auc_score,                 
                 cv=None,
                 feat_count_penalty=0,
                 ):

        self.model = model
        self.X = X
        self.y = y
        
        if features is None:
            self.features = list(X.columns)
        else:
            self.features = features
        
        self.loss_fn = loss_fn
        
        if cv is None:
            kfold = KFold(n_splits=5, shuffle=True, random_state=RANDOM_SEED)
            self.splits = list(kfold.split(X))
        else:            
            self.splits = list(cv.split(X))
        
        self.feat_count_penalty = feat_count_penalty

    def __call__(self,
                 trial: optuna.trial.Trial):

        # Select True / False for each feature
        selected_features = [trial.suggest_categorical(name, [True, False]) for name in self.features]

        # List with names of selected features
        selected_feature_names = [name for name, selected in zip(self.features, selected_features) if selected]

        # Optional: adds a feat_count_penalty for the amount of features used
        n_used = len(selected_feature_names)
        total_penalty = n_used * self.feat_count_penalty
        

        X_selected = self.X[selected_feature_names].copy()
        
        from sklearn.model_selection import cross_val_score
        
        kfold = KFold(n_splits=5, shuffle=True, random_state=RANDOM_SEED)
        cv_res = cross_val_score(self.model, X_selected, self.y, cv=kfold, scoring='roc_auc')
        score = (cv_res.mean() - cv_res.std()) - total_penalty
        return score

In [7]:
X_train = pd.read_pickle('../data/processed/X_train_2.pkl.zip')
y_train = pd.read_pickle('../data/processed/y_train_2.pkl')


feature_list = [s for s in X_train.columns if '_' in s]
np.array(feature_list)

array(['rd_BalabanJ', 'rd_EState_VSA10', 'rd_EState_VSA11', ...,
       'gin_297', 'gin_298', 'gin_299'], dtype='<U27')

In [8]:
from sklearn.model_selection import KFold
from optuna.samplers import TPESampler
from sklearn.ensemble import RandomForestClassifier

params = {
    "random_state": RANDOM_SEED,
    "n_jobs": N_JOBS,
    "verbose": False
}

model = RandomForestClassifier(**params)
sampler = TPESampler(seed=RANDOM_SEED)

study = optuna.create_study(
    storage=f"sqlite:///../data/tuning/optuna.db",
    direction="maximize", 
    sampler=sampler
)

# We first try the model using all features
default_features = {ft: True for ft in feature_list}
study.enqueue_trial(default_features)

[I 2024-09-06 15:07:00,027] A new study created in RDB with name: no-name-96b71b0a-60ee-4dc1-b8d3-381e48e91c0b


In [None]:
optuna.logging.set_verbosity(optuna.logging.WARN)

study.optimize(FeatureSelectionOptuna(
                         model=model,
                         features=feature_list,
                         X=X_train,
                         y=y_train,
                         # feat_count_penalty = 1e-4,  
                         ), n_trials=4096, show_progress_bar=True)

  0%|          | 0/4096 [00:00<?, ?it/s]

In [None]:
best_features = [k for k,v in study.best_trial.params.items() if v]
len(best_features)

In [None]:
# pd.Series(best_features).to_csv('best_features.csv', index=False)