In [13]:
# Enable IPython's autoreload extension in mode 2, so edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

import sys
# Make the parent directory importable from this notebook
# (presumably the project root — verify against the repository layout).
sys.path.append('..')

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [14]:
# Notebook-wide configuration.
# N_JOBS: degree of parallelism handed to estimators (used as ``n_jobs``).
N_JOBS = 24
# RANDOM_SEED: single seed shared by the CV splitter, the model and the sampler.
RANDOM_SEED = 42

In [15]:
import optuna
import numpy as np
import pandas as pd
from sklearn.metrics import root_mean_squared_error, roc_auc_score
from sklearn.model_selection import KFold

In [7]:
class FeatureSelectionOptuna:
    """
    Optuna objective that searches for the best subset of feature columns.

    On every trial one boolean is suggested per candidate feature; the columns
    drawn as True are kept, ``model`` is cross-validated on them and the
    objective value is::

        mean(cv_scores) - std(cv_scores) - feat_count_penalty * n_selected

    The value is meant to be MAXIMIZED (create the study with
    ``direction="maximize"``).

    Parameters:

    - model (object): The estimator to evaluate; it is handed unchanged to
                      ``sklearn.model_selection.cross_val_score``.
    - X (DataFrame): The complete set of feature data (pandas DataFrame) from which column subsets are selected.
    - y (Series): The target variable associated with the X data (pandas Series).
    - features (list of str, optional): Names of the candidate columns. Defaults to every column of X.
    - loss_fn (function or str, optional): Metric used to score each fold. A callable must take the true labels and
                      the predictions and return a value where higher is better; it is wrapped with
                      ``sklearn.metrics.make_scorer``. A string is passed through as a scikit-learn scoring name.
                      The default, ``roc_auc_score``, is mapped to the built-in ``'roc_auc'`` scorer.
    - cv (object, optional): A cross-validation splitter exposing ``split(X, y)``. Defaults to a shuffled 5-fold
                      ``KFold`` seeded with ``RANDOM_SEED``. The folds are computed once and reused by every trial.
    - feat_count_penalty (float, optional): Amount subtracted from the objective for each selected feature.
    """

    def __init__(self,
                 model,
                 X,
                 y,
                 features=None,
                 loss_fn=roc_auc_score,
                 cv=None,
                 feat_count_penalty=0,
                 ):

        self.model = model
        self.X = X
        self.y = y

        # Copy the caller's sequence so later mutation outside cannot change the search space.
        self.features = list(X.columns) if features is None else list(features)

        self.loss_fn = loss_fn

        if cv is None:
            cv = KFold(n_splits=5, shuffle=True, random_state=RANDOM_SEED)
        # Materialize the folds once so all trials are compared on identical
        # splits. ``y`` is forwarded because stratified splitters require it.
        self.splits = list(cv.split(X, y))

        self.feat_count_penalty = feat_count_penalty

    def _scoring(self):
        """Translate ``loss_fn`` into the ``scoring`` argument of ``cross_val_score``."""
        if self.loss_fn is roc_auc_score:
            # The built-in scorer ranks with continuous model outputs rather than
            # hard class labels, which is what ROC AUC needs.
            return 'roc_auc'
        if self.loss_fn is None or isinstance(self.loss_fn, str):
            # None -> estimator's default score; str -> named scikit-learn scorer.
            return self.loss_fn
        from sklearn.metrics import make_scorer
        return make_scorer(self.loss_fn)

    def __call__(self,
                 trial: optuna.trial.Trial):
        """
        Evaluate one feature subset suggested by ``trial``.

        Returns the penalized cross-validation score described in the class
        docstring. Raises ``optuna.TrialPruned`` when the trial selects no
        feature at all, so the study records the trial as pruned and carries on
        instead of aborting on a model-fitting error.
        """
        from sklearn.model_selection import cross_val_score

        # Select True / False for each feature
        selected_flags = [trial.suggest_categorical(name, [True, False]) for name in self.features]

        # List with names of selected features
        selected_feature_names = [name for name, keep in zip(self.features, selected_flags) if keep]

        if not selected_feature_names:
            raise optuna.TrialPruned("no features selected")

        # Optional: adds a feat_count_penalty for the amount of features used
        total_penalty = len(selected_feature_names) * self.feat_count_penalty

        # Selecting with a list of labels already yields a new DataFrame.
        X_selected = self.X[selected_feature_names]

        cv_res = cross_val_score(
            self.model,
            X_selected,
            self.y,
            cv=self.splits,
            scoring=self._scoring(),
        )

        # Subtracting the std favours subsets that score consistently across folds.
        return (cv_res.mean() - cv_res.std()) - total_penalty

In [30]:
# Load the preprocessed training features and target from disk.
# NOTE(review): read_pickle executes arbitrary code on load; only use trusted files.
X_train = pd.read_pickle('../data/processed/X_train.pkl.zip')
y_train = pd.read_pickle('../data/processed/y_train.pkl')

# Columns listed here are left out of the candidate list below.
fixed_feats = ['prop_1', 'prop_2', 'prop_3']

# Candidate features: every column whose name contains an underscore,
# except the fixed ones.
feature_list = [
    column
    for column in X_train.columns
    if '_' in column and column not in fixed_feats
]
len(feature_list)

1036

In [32]:
from sklearn.model_selection import KFold
from optuna.samplers import TPESampler
from sklearn.ensemble import RandomForestClassifier

# Constructor arguments for the random forest evaluated in every trial.
params = {
    "random_state": RANDOM_SEED,
    "n_jobs": N_JOBS,
    "verbose": 0,  # scikit-learn documents this as an int; 0 keeps fitting silent
}

model = RandomForestClassifier(**params)
sampler = TPESampler(seed=RANDOM_SEED)

# Trials are persisted in a SQLite database and the objective is maximized.
# The URL is a plain string: it has no placeholders, so no f-string is needed.
study = optuna.create_study(
    storage="sqlite:///../data/tuning/optuna.db",
    direction="maximize",
    sampler=sampler,
)

# We first try the model using all features
default_features = dict.fromkeys(feature_list, True)
study.enqueue_trial(default_features)

In [33]:
# Only warnings and above are logged, so per-trial messages do not flood the output.
optuna.logging.set_verbosity(optuna.logging.WARN)

# Run the feature-subset search over the candidate columns.
study.optimize(
    FeatureSelectionOptuna(
        model=model,
        X=X_train,
        y=y_train,
        features=feature_list,
        # feat_count_penalty is left at its default of 0 (1e-4 was tried before).
    ),
    n_trials=4096,
    show_progress_bar=True,
)

  0%|          | 0/4096 [00:00<?, ?it/s]

[W 2024-09-06 17:05:37,256] Trial 0 failed with parameters: {'rd_BalabanJ': True, 'rd_BertzCT': True, 'rd_Chi0': True, 'rd_Chi0n': True, 'rd_Chi0v': True, 'rd_Chi1': True, 'rd_Chi1n': True, 'rd_Chi1v': True, 'rd_Chi2n': True, 'rd_Chi2v': True, 'rd_Chi3n': True, 'rd_Chi3v': True, 'rd_Chi4n': True, 'rd_Chi4v': True, 'rd_EState_VSA1': True, 'rd_EState_VSA10': True, 'rd_EState_VSA11': True, 'rd_EState_VSA2': True, 'rd_EState_VSA3': True, 'rd_EState_VSA4': True, 'rd_EState_VSA5': True, 'rd_EState_VSA6': True, 'rd_EState_VSA7': True, 'rd_EState_VSA8': True, 'rd_EState_VSA9': True, 'rd_ExactMolWt': True, 'rd_FpDensityMorgan1': True, 'rd_FpDensityMorgan2': True, 'rd_FpDensityMorgan3': True, 'rd_FractionCSP3': True, 'rd_HallKierAlpha': True, 'rd_HeavyAtomCount': True, 'rd_HeavyAtomMolWt': True, 'rd_Ipc': True, 'rd_Kappa1': True, 'rd_Kappa2': True, 'rd_Kappa3': True, 'rd_LabuteASA': True, 'rd_MaxAbsEStateIndex': True, 'rd_MaxEStateIndex': True, 'rd_MinAbsEStateIndex': True, 'rd_MinEStateIndex': 

KeyboardInterrupt: 

In [None]:
# Names of the features switched on in the best trial found by the study.
best_features = [
    feature_name
    for feature_name, is_selected in study.best_trial.params.items()
    if is_selected
]
len(best_features)

In [None]:
# pd.Series(best_features).to_csv('best_features.csv', index=False)