In [1]:
# Enable IPython's autoreload extension so that edits to imported project
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

import sys
# Put the parent directory on the import path; the `src` package imported in
# the next cell is presumably located there (assumes the notebook is run from
# a sub-folder of the project root -- TODO confirm).
sys.path.append('..')

In [2]:
import pandas as pd
import numpy as np
from src.utils import get_fps_offset, get_fps_cols, OffsetScaler
from sklearn.model_selection import train_test_split

# Single seed reused for the train/validation split, the K-fold splitter,
# the CatBoost model and the Optuna sampler.
RANDOM_SEED = 42
# Number of threads handed to CatBoost through `thread_count`.
N_JOBS = 12

def preprocess(df):
    """
    Drop uninformative columns from a feature table.

    Removes every column that contains at least one missing value and then
    every column that holds a single distinct value (constant columns).

    Parameters:

    - df (DataFrame): The raw feature table. It is left unmodified.

    Returns:

    - DataFrame: A new frame with only the complete, non-constant columns.
    """
    # Non-inplace drop: `inplace=True` would silently alter the caller's frame.
    complete = df.dropna(axis=1)
    return complete.loc[:, complete.nunique() > 1].copy()

# Load the training features and drop columns with missing values or a single value.
X_train = preprocess(pd.read_csv('../data/processed/X_train.csv'))
cols_to_keep = X_train.columns.tolist()
# The test set is restricted to the columns kept for training.
X_test = pd.read_csv('../data/processed/X_test.csv')[cols_to_keep]
y_train = pd.read_csv('../data/processed/y_train.csv').target


fps_cols = get_fps_cols(X_train)
# Candidate features for the selection study: every column not listed in fps_cols.
# The list is built in column order rather than through a set difference, because
# the iteration order of a set of strings changes between interpreter runs (hash
# randomisation). That would change the order in which the Optuna parameters are
# suggested and the column order given to the model, making the seeded study
# non-reproducible.
fps_col_set = set(fps_cols)
features = [col for col in X_train.columns if col not in fps_col_set]
# NOTE(review): OffsetScaler is a project helper; it is only told how many
# fps columns there are -- confirm how it uses that count.
scaler = OffsetScaler(len(fps_cols))

# Fit the scaler on the training data only, then rebuild a labelled frame.
X_train = pd.DataFrame(
    scaler.fit_transform(X_train.values),
    index=X_train.index,
    columns=X_train.columns
)

# The test data is transformed with the scaler fitted on the training data.
X_test = pd.DataFrame(
    scaler.transform(X_test.values),
    index=X_test.index,
    columns=X_test.columns
)

In [3]:
# Hold out 20% of the rows as a validation set, using the shared seed
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=RANDOM_SEED,
)
# Shapes of the four resulting pieces
tuple(part.shape for part in (X_train, y_train, X_valid, y_valid))

((808, 3448), (808,), (203, 3448), (203,))

In [4]:
# Total count of missing values in the training features, validation features and validation target
tuple(data.isna().sum().sum() for data in (X_train, X_valid, y_valid))

(0, 0, 0)

In [5]:
import optuna
from sklearn.metrics import root_mean_squared_error
from sklearn.model_selection import KFold

class FeatureSelectionOptuna:
    """
    Optuna objective that searches for a good subset of features.

    An instance is callable with an Optuna trial and is meant to be passed to
    ``study.optimize``. Each trial switches every candidate feature on or off,
    fits the model on the selected columns for every cross-validation split and
    returns a loss to be minimized.

    Parameters:

    - model (object): The predictive model to evaluate; this should be any object that implements fit() and predict() methods.
                      The same instance is refit for every split of every trial.
    - X (DataFrame): The complete set of feature data (pandas DataFrame) from which subsets will be selected for training the model.
    - y (Series): The target variable associated with the X data (pandas Series).
    - features (list of str, optional): The names of all features that may be selected. Defaults to every column of X.
    - loss_fn (function, optional): The loss function used to evaluate the model. It is called as loss_fn(y_true, y_pred)
                                    and must return a loss value. Defaults to root_mean_squared_error.
    - splits (list of tuples, optional): A list of tuples where each tuple contains two elements, the train indices and the
                                         validation indices (positional, they are used with .iloc). Defaults to a shuffled
                                         5-fold KFold seeded with RANDOM_SEED.
    - penalty (float, optional): Amount added to the loss for every selected feature. Defaults to 0 (no penalty).
    """

    def __init__(self,
                 model,
                 X,
                 y,
                 features=None,
                 loss_fn=root_mean_squared_error,
                 splits=None,
                 penalty=0,
                 ):

        self.model = model
        self.X = X
        self.y = y

        # Without an explicit list every column of X is a candidate
        self.features = list(X.columns) if features is None else features

        self.loss_fn = loss_fn

        if splits is None:
            # Materialize the folds once so that every trial is scored on the same splits
            kfold = KFold(n_splits=5, shuffle=True, random_state=RANDOM_SEED)
            self.splits = list(kfold.split(X))
        else:
            self.splits = splits

        self.penalty = penalty

    def __call__(self,
                 trial: optuna.trial.Trial):
        """
        Evaluate one feature subset.

        Parameters:

        - trial (optuna.trial.Trial): The trial used to draw one True / False choice per candidate feature.

        Returns:

        - float: Mean plus standard deviation of the per-split losses, plus penalty times the number of selected features.

        Raises:

        - optuna.TrialPruned: If the trial selects no feature at all, since a model cannot be fitted on zero columns.
        """

        # Select True / False for each feature (one categorical parameter per feature name)
        selected_features = [trial.suggest_categorical(name, [True, False]) for name in self.features]

        # List with names of selected features
        selected_feature_names = [name for name, selected in zip(self.features, selected_features) if selected]

        # An empty subset cannot be evaluated; prune the trial instead of letting
        # the model raise and abort the whole study.
        if not selected_feature_names:
            raise optuna.TrialPruned("No feature selected in this trial.")

        # Optional: adds a penalty for the amount of features used
        n_used = len(selected_feature_names)
        total_penalty = n_used * self.penalty

        # Restrict to the selected columns once, instead of copying the full frame for every split
        X_selected = self.X[selected_feature_names]

        cv_res = []

        for split in self.splits:
            train_idx, valid_idx = split[0], split[1]

            X_train_selected = X_selected.iloc[train_idx]
            y_train = self.y.iloc[train_idx]
            X_valid_selected = X_selected.iloc[valid_idx]
            y_valid = self.y.iloc[valid_idx]

            # Train model, get predictions and accumulate loss
            self.model.fit(X_train_selected, y_train)
            pred = self.model.predict(X_valid_selected)

            cv_res.append(self.loss_fn(y_valid, pred))

        # Take the average loss across all splits plus standard deviation to have more stable models
        loss = np.mean(cv_res) + np.std(cv_res)

        # Add the penalty to the loss
        loss += total_penalty

        return loss

In [6]:
from sklearn.model_selection import KFold
from optuna.samplers import TPESampler
from catboost import CatBoostRegressor

# CatBoost hyper-parameters shared by every fit performed during the search
params = {
    "iterations": 100,
    "colsample_bylevel": 0.1,
    "random_seed": RANDOM_SEED,
    "thread_count": N_JOBS,
    "verbose": False,
}

model = CatBoostRegressor(**params)

# Seeded TPE sampler driving a study whose objective is minimized
sampler = TPESampler(seed=RANDOM_SEED)
study = optuna.create_study(sampler=sampler, direction="minimize")

# Queue a baseline trial in which every candidate feature is switched on
default_features = dict.fromkeys(features, True)
study.enqueue_trial(default_features)

[I 2024-08-30 11:25:12,902] A new study created in memory with name: no-name-5103d0d0-a8d7-49cc-b2bf-c8c0293b5989


In [7]:
# Restrict Optuna's logging to warnings and above
optuna.logging.set_verbosity(optuna.logging.WARN)

# Run the feature-selection search on the training portion of the data
study.optimize(
    FeatureSelectionOptuna(
        model=model,
        X=X_train,
        y=y_train,
        features=features,
        # penalty = 1e-4,
    ),
    show_progress_bar=True,
    n_trials=4096,
)

  0%|          | 0/4096 [00:00<?, ?it/s]

[W 2024-08-30 11:27:23,675] Trial 24 failed with parameters: {'SeaN2N3aa': True, 'rd_MolMR': True, 'rd_fr_urea': False, 'Se1O1Cl3d': True, 'SsssB': False, 'md_SlogP_VSA10': True, 'rd_fr_nitrile': True, 'md_SMR_VSA9': True, 'md_ATS3i': True, 'md_nSpiro': False, 'SeaC2N3aa': False, 'md_SsCl': True, 'rd_BertzCT': True, 'rd_FpDensityMorgan3': False, 'md_nG12HRing': False, 'md_ATS8dv': True, 'SaaCH': False, 'Se1C2N3ss': False, 'md_nFaHRing': True, 'Se1C3O1a': False, 'rd_fr_Nhpyrrole': True, 'md_ATSC0d': True, 'Se1C3Hg2as': False, 'rd_PEOE_VSA5': False, 'rd_fr_sulfone': True, 'SssNH(ar)': True, 'Se1N2O2ds': False, 'md_ATS0Z': False, 'Se2C3S2ss': False, 'rd_fr_hdrzine': True, 'md_EState_VSA7': True, 'SsssP': False, 'md_ATS7pe': True, 'SdsssP': False, 'rd_Chi1v': False, 'md_nI': True, 'rd_fr_Ar_NH': False, 'md_PEOE_VSA2': False, 'SssNH': False, 'md_SsOH': False, 'Se1C2C2dd': True, 'Se1C3N1d': True, 'Se2N3O1a': True, 'md_EState_VSA10': True, 'md_StsC': False, 'md_ATSC7v': False, 'rd_SMR_VSA5': 

KeyboardInterrupt: 

In [None]:
# Names of the features that are switched on in the best trial of the study
best_features = [
    feature
    for feature, is_selected in study.best_trial.params.items()
    if is_selected
]
len(best_features)

In [None]:
# pd.Series(best_features).to_csv('best_features.csv', index=False)