In [60]:
%load_ext autoreload
%autoreload 2

import sys
# Put the parent directory on the import path so the `src.*` imports in the next cell resolve.
sys.path.append('../')

In [55]:
import pandas as pd
import numpy as np

import lightgbm as lgb
from sklearn.preprocessing import OneHotEncoder

from src.corr import non_corr_features
from src.maplight_gnn import get_representation
from src.utils import drop_nans_non_unique, OffsetScaler, eval_model

# Value passed as `n_jobs` to the models built in this notebook.
N_JOBS = 24
# Seed passed as `random_state` to the models built in this notebook.
RANDOM_SEED = 42

# LightGBM classifier with only seed, parallelism and verbosity set; scored with eval_model further down.
clf = lgb.LGBMClassifier(random_state=RANDOM_SEED, n_jobs=N_JOBS, verbose=-1)


In [3]:
# Load the processed splits; `index_col=0` makes the first CSV column the DataFrame index.
train = pd.read_csv('../data/processed/train.csv', index_col=0)
test = pd.read_csv('../data/processed/test.csv', index_col=0)

In [29]:
def prepare_data(**params):
    """Build the train/test feature matrices for the given representation flags.

    Reads the notebook-level ``train`` and ``test`` frames: column ``smi`` is
    passed to ``get_representation``, column ``prop`` is one-hot encoded and
    appended, and ``train.target`` is returned as the label vector.

    Parameters
    ----------
    **params
        Keyword flags forwarded unchanged to ``get_representation``. The keys
        ``morgan_fps``, ``avalon_fps`` and ``erg_fps`` must be present because
        they are also used here to compute the scaler offset.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_test)``. ``X_test`` is restricted to the
        columns kept for ``X_train`` before scaling; both matrices end with
        the one-hot ``prop`` columns fitted on the training split.

    Raises
    ------
    KeyError
        If one of the three fingerprint flags is missing from ``params``.
    """
    X_train = get_representation(train.smi, **params)
    X_test = get_representation(test.smi, **params)

    # Whatever columns drop_nans_non_unique keeps for the training split are
    # the ones selected from the test split.
    X_train = drop_nans_non_unique(X_train)
    X_test = X_test[X_train.columns]

    # Sum of the hard-coded widths of the enabled fingerprint blocks
    # (a boolean flag multiplies as 0 or 1).
    # NOTE(review): presumably OffsetScaler treats this many leading columns
    # differently from the rest -- confirm in src.utils, and confirm the
    # widths still hold if drop_nans_non_unique removed fingerprint columns.
    fps_offset = (1024 * params['morgan_fps']
                  + 1024 * params['avalon_fps']
                  + 315 * params['erg_fps'])

    scaler = OffsetScaler(fps_offset)
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # The encoder is fitted on the training split only and reused for test.
    ohe = OneHotEncoder(sparse_output=False)
    ohe.fit(train[['prop']])
    prop_columns = ohe.get_feature_names_out()

    def _append_prop(features, frame):
        # Build the one-hot block on the feature frame's own index. With a
        # default RangeIndex, concat(axis=1) aligns on labels and pads with
        # NaN rows whenever `features` is not indexed 0..n-1.
        encoded = pd.DataFrame(
            ohe.transform(frame[['prop']]),
            columns=prop_columns,
            index=features.index,
        )
        return pd.concat([features, encoded], axis=1)

    X_train = _append_prop(X_train, train)
    X_test = _append_prop(X_test, test)
    return X_train, train.target, X_test

# Representation flags handed to prepare_data; every block is switched on.
params = dict(
    morgan_fps=True,
    avalon_fps=True,
    erg_fps=True,
    rdkit_feats=True,
    mord_feats=True,
    gin_gnn=True,
)

X_train, y_train, X_test = prepare_data(**params)

# Filter the training columns with the project's non_corr_features helper
# (threshold 0.99), then keep exactly the surviving columns in the test matrix.
X_train = non_corr_features(X_train, y_train, threshold=0.99)
X_test = X_test.loc[:, X_train.columns]

X_train.shape, X_test.shape

((7939, 3346), (1221, 3346))

In [64]:
# Cache the filtered training matrix; main() in the feature-selection cell reads this file back.
# NOTE(review): main() also reads '../data/processed/y_train.pkl', which no cell visible here writes.
X_train.to_pickle('../data/processed/X_train_full.pkl.zip')

In [28]:
# Number of columns whose name contains each representation tag.
tuple(
    X_train.columns.str.contains(tag).sum()
    for tag in ('morgan_', 'erg_', 'avalon_', 'gin_')
)

(1024, 295, 1024, 300)

In [26]:
# Duplicate of the per-tag column count in the cell above. The saved outputs differ
# (avalon: 1018 here vs 1024 above) and the execution counters are 26 vs 28, so the
# two cells were run against different states of X_train.
X_train.columns.str.contains('morgan_').sum(), X_train.columns.str.contains('erg_').sum(), X_train.columns.str.contains('avalon_').sum(), X_train.columns.str.contains('gin_').sum()

(1024, 295, 1018, 300)

In [56]:
# Score the LightGBM classifier defined in the configuration cell with the project's
# eval_model helper; the trailing semicolon suppresses the cell's return-value display.
eval_model('LGB', clf, X_train, y_train);

    LGB: 0.8985    (0.904 ± 0.005)    24.6s


In [69]:
import optuna
from optuna.samplers import TPESampler

import pandas as pd
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold, RepeatedKFold

# Same value as the RANDOM_SEED set in the configuration cell; used below by the
# cross-validation splitter, the classifier and the Optuna sampler.
RANDOM_SEED = 42

class FeatureSelectionOptuna:
    """
    Optuna objective that searches for a feature subset with the best cross-validated ROC AUC.

    Each trial switches whole feature groups and individual features on or off,
    evaluates the model on the selected columns with repeated 5-fold
    cross-validation (3 repeats) and returns ``mean - std`` of the fold scores.

    Parameters:

    - model (object): Estimator handed to ``cross_val_score``.
    - X (DataFrame): The complete feature data from which column subsets are selected.
    - y (Series): The target variable associated with ``X``.
    - features (list of str, optional): Column names that are toggled individually.
                          ``None`` is treated as "no individual features".
    - group_feats (dict, optional): Maps a group name to the list of column names that are
                          toggled together. ``None`` is treated as "no groups".
    - loss_fn (function, optional): Stored on the instance but not used for scoring;
                          the cross-validation scorer is fixed to ``'roc_auc'``.
    """

    def __init__(self,
                 model,
                 X,
                 y,
                 features=None,
                 group_feats=None,
                 loss_fn=roc_auc_score,
                 ):

        self.model = model
        self.X = X
        self.y = y
        # Fall back to empty containers so __call__ can iterate when a caller
        # relies on the None defaults.
        self.features = features if features is not None else []
        self.group_feats = group_feats if group_feats is not None else {}
        self.loss_fn = loss_fn

    def __call__(self,
                 trial: optuna.trial.Trial):

        """
        Evaluate one trial and return ``mean - std`` of the ROC AUC fold scores.

        One boolean parameter is suggested per group name and per individual
        feature name. ``self.group_feats`` has the form

        self.group_feats = {
            'group_1': ['feat_1', 'feat_2', ... ],
            ...
        }

        Raises ``optuna.TrialPruned`` when the trial switches everything off.
        """

        # Groups are suggested before individual features, in dict order.
        group_columns = []
        for group_name, columns in self.group_feats.items():
            if trial.suggest_categorical(group_name, [True, False]):
                group_columns.extend(columns)

        single_columns = [
            name for name in self.features
            if trial.suggest_categorical(name, [True, False])
        ]

        # Individual features come first in the matrix, then the group columns.
        selected = single_columns + group_columns
        if not selected:
            # Nothing to fit on: drop this trial instead of letting the
            # estimator fail on an empty matrix and abort the study.
            raise optuna.TrialPruned('no features selected')

        X_selected = self.X[selected]

        kfold = RepeatedKFold(n_splits=5, n_repeats=3, random_state=RANDOM_SEED)

        cv_res = cross_val_score(
            self.model,
            X_selected, self.y,
            cv=kfold,
            scoring='roc_auc')
        # Subtracting the spread favours subsets that score consistently across folds.
        return cv_res.mean() - cv_res.std()





def main():
    """Run the Optuna feature-selection study and return it.

    Loads the pickled training matrix and target, splits the columns into the
    four tagged groups (toggled as a whole) and the remaining individual
    features, and maximises the ``FeatureSelectionOptuna`` objective with a
    seeded TPE sampler for 10 trials. The first trial is enqueued with every
    group and feature switched on.

    Returns
    -------
    optuna.study.Study
        The finished in-memory study, so the caller can inspect
        ``best_params`` / ``best_value`` instead of parsing the log.
    """
    # No import cell visible in this notebook brings `xgb` into scope, so a
    # fresh kernel would fail with NameError; import it where it is used.
    import xgboost as xgb

    X_train = pd.read_pickle('../data/processed/X_train_full.pkl.zip')
    y_train = pd.read_pickle('../data/processed/y_train.pkl')

    groups = {name: [] for name in ('morgan', 'avalon', 'erg', 'gin')}
    features = []

    for col in X_train.columns:
        # The first group (in the order above) whose '<name>_' tag occurs in
        # the column name takes the column; untagged columns are tuned one by one.
        owner = next((name for name in groups if f'{name}_' in col), None)
        if owner is None:
            features.append(col)
        else:
            groups[owner].append(col)

    # NOTE(review): 'gpu_hist', 'predictor' and 'gpu_id' are deprecated in
    # XGBoost 2.x in favour of `device=`; left unchanged because the installed
    # version is not visible here.
    clf = xgb.XGBClassifier(random_state=RANDOM_SEED, n_jobs=N_JOBS, verbosity=0,
                            tree_method='gpu_hist', predictor='gpu_predictor', gpu_id=1)

    sampler = TPESampler(seed=RANDOM_SEED)
    study = optuna.create_study(
        study_name='feat_sel_study_rf_ds02',
        # storage=f"sqlite:///../data/tuning/optuna.db",
        direction="maximize",
        sampler=sampler,
        load_if_exists=True,
    )

    # Start from the full feature set: every group and feature set to True.
    default_params = dict.fromkeys([*groups, *features], True)
    study.enqueue_trial(default_params)
    optuna.logging.set_verbosity(optuna.logging.INFO)

    study.optimize(
        FeatureSelectionOptuna(
            model=clf,
            features=features,
            group_feats=groups,
            X=X_train,
            y=y_train,
        ),
        n_trials=10,
        n_jobs=1,
        show_progress_bar=True)

    return study

if __name__ == '__main__':
    # Keep the study in the notebook namespace so results can be inspected afterwards.
    study = main()

[I 2024-09-07 18:52:54,013] A new study created in memory with name: feat_sel_study_rf_ds02


  0%|          | 0/10 [00:00<?, ?it/s]

[I 2024-09-07 18:54:11,992] Trial 0 finished with value: 0.8987944667036418 and parameters: {'morgan': True, 'avalon': True, 'erg': True, 'gin': True, 'rd_BalabanJ': True, 'rd_Chi2n': True, 'rd_Chi2v': True, 'rd_Chi3n': True, 'rd_Chi4n': True, 'rd_Chi4v': True, 'rd_EState_VSA1': True, 'rd_EState_VSA10': True, 'rd_EState_VSA11': True, 'rd_EState_VSA2': True, 'rd_EState_VSA4': True, 'rd_EState_VSA6': True, 'rd_FpDensityMorgan1': True, 'rd_FpDensityMorgan2': True, 'rd_FpDensityMorgan3': True, 'rd_HallKierAlpha': True, 'rd_Ipc': True, 'rd_Kappa1': True, 'rd_Kappa2': True, 'rd_Kappa3': True, 'rd_MaxEStateIndex': True, 'rd_MinAbsEStateIndex': True, 'rd_MinEStateIndex': True, 'rd_MolMR': True, 'rd_NHOHCount': True, 'rd_NOCount': True, 'rd_NumAliphaticCarbocycles': True, 'rd_NumAromaticHeterocycles': True, 'rd_NumRadicalElectrons': True, 'rd_NumSaturatedCarbocycles': True, 'rd_NumSaturatedHeterocycles': True, 'rd_NumSaturatedRings': True, 'rd_PEOE_VSA1': True, 'rd_PEOE_VSA10': True, 'rd_PEOE_V

KeyboardInterrupt: 