In [1]:
%load_ext autoreload
%autoreload 2

import sys
sys.path.append('../')

In [2]:
import time
from tqdm import tqdm

import numpy as np
import pandas as pd

from sklearn.linear_model import RidgeClassifier, LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import  KFold
from src.utils import scale, eval_model
import xgboost as xgb
import catboost as cb
import lightgbm as lgb
import seaborn as sns
from src.corr import non_corr_features
from src.maplight_gnn import get_representation
from src.utils import get_fps_cols, scale, OffsetScaler, to_submit, drop_nans_non_unique, BlendingClassifier

def get_models_list(names: list):
    return [(k,v) for k,v in models.items() if k in names]

mae = 'neg_mean_absolute_error'
mse = 'neg_mean_squared_error'
rmse = 'neg_root_mean_squared_error'
roc_auc = 'neg_roc_auc_score'

N_JOBS = 24
RANDOM_SEED = 42

# prepare models
models = {}


models['LR'] = LogisticRegression()
models['Ridge'] = RidgeClassifier()
models['SVC'] = SVC()
models['KNN'] = KNeighborsClassifier(n_jobs=N_JOBS)

models['RF'] = RandomForestClassifier(random_state=RANDOM_SEED, n_jobs=N_JOBS)

models['HistGB'] = HistGradientBoostingClassifier(random_state=RANDOM_SEED)

models['XGB'] = xgb.XGBClassifier(random_state=RANDOM_SEED, n_jobs=N_JOBS, verbosity=0,)

models['XGB_GPU'] = xgb.XGBClassifier(random_state=RANDOM_SEED, n_jobs=N_JOBS, verbosity=0, 
                                  tree_method='gpu_hist', predictor='gpu_predictor', gpu_id=1)

models['CB'] = cb.CatBoostClassifier(iterations=100, random_seed=RANDOM_SEED, thread_count=N_JOBS, verbose=False)

models['CB_GPU'] = cb.CatBoostClassifier(iterations=100, random_seed=RANDOM_SEED, thread_count=N_JOBS, verbose=False, task_type="GPU")

models['LGB'] = lgb.LGBMClassifier(random_state=RANDOM_SEED, n_jobs=N_JOBS, verbose=-1)

In [3]:
train = pd.read_csv('../data/processed/train.csv', index_col=0)
test = pd.read_csv('../data/processed/test.csv', index_col=0)

# Maplight

In [70]:
from sklearn.preprocessing import OneHotEncoder

def prepare_data(**params):
    X_train = get_representation(train.smi, **params)
    X_test = get_representation(test.smi, **params)
    
    X_train = drop_nans_non_unique(X_train)
    X_test = X_test[X_train.columns]
    
    fps_offset = 1024 * params['morgan_fps'] + \
                 1024 * params['avalon_fps'] + \
                 315 * params['erg_fps']
    
    scaler = OffsetScaler(fps_offset)
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)    
    
    ohe = OneHotEncoder(sparse_output=False)
    prop = ohe.fit_transform(train[['prop']])
    X_train = pd.concat([X_train, pd.DataFrame(prop, columns=ohe.get_feature_names_out())], axis=1)
    
    prop = ohe.transform(test[['prop']])
    X_test = pd.concat([X_test, pd.DataFrame(prop, columns=ohe.get_feature_names_out())], axis=1)
    return X_train, train.target, X_test

params = {
    'morgan_fps':True,
    'avalon_fps':True,
    'erg_fps':True,
    'rdkit_feats':True,
    'mord_feats': False,
    'gin_gnn': False,
}

X_train, y_train, X_test = prepare_data(**params)
X_train.shape[1]

2538

In [71]:
X_train.columns.str.contains('rd_|md_').sum()

192

In [41]:
print('=========== Maplight ===========')
for name, model in models.items():
    eval_model(name, model, X_train, y_train)

     RF: 0.8858    (0.892 ± 0.006)    3.7s
 HistGB: 0.9001    (0.907 ± 0.007)    31.7s
    XGB: 0.8968    (0.903 ± 0.006)    16.7s
XGB_GPU: 0.8922    (0.900 ± 0.008)    8.7s
     CB: 0.8836    (0.892 ± 0.008)    35.5s
 CB_GPU: 0.8809    (0.888 ± 0.007)    39.8s
    LGB: 0.8976    (0.906 ± 0.008)    12.3s


In [None]:
from sklearn.model_selection import RepeatedStratifiedKFold
print('=========== Maplight ===========')
for name, model in models.items():
    eval_model(name, model, X_train, y_train, cv=RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=RANDOM_SEED))

In [143]:
clf = BlendingClassifier(
    get_models_list(
        ['HistGB', 'XGB', 'XGB_GPU', 'LGB'],
    ),
    weights=None
)

eval_model('blend', clf, X_train, y_train);

  blend: 0.9025    (0.910 ± 0.007)    54.6s


In [49]:
from src.corr import non_corr_features

X_train_uncor = non_corr_features(X_train, y_train)
X_test_uncor = X_test[X_train_uncor.columns]

In [181]:
X_train_uncor.shape, X_test_uncor.shape

((7939, 2477), (1221, 2477))

In [170]:
X_train_uncor.shape[1]

2477

In [176]:
print('=========== Maplight uncorrelated ===========')
for name, model in models.items():
    eval_model(name, model, X_train_uncor, y_train)

     LR: 0.7899    (0.802 ± 0.012)    12.1s
  Ridge: 0.8125    (0.821 ± 0.009)    3.5s
    SVC: 0.7193    (0.723 ± 0.004)    231.3s
    KNN: 0.7463    (0.754 ± 0.008)    1.7s
     RF: 0.8886    (0.894 ± 0.006)    4.2s
 HistGB: 0.8974    (0.904 ± 0.007)    30.7s
    XGB: 0.8976    (0.904 ± 0.007)    20.0s
XGB_GPU: 0.8939    (0.901 ± 0.007)    6.8s
     CB: 0.8812    (0.889 ± 0.008)    29.7s
 CB_GPU: 0.8807    (0.888 ± 0.007)    35.1s
    LGB: 0.8982    (0.906 ± 0.008)    10.6s


In [180]:
clf = BlendingClassifier(get_models_list(
    ['HistGB', 'XGB', 'XGB_GPU', 'LGB']),
    weights=None)
eval_model('blend', clf, X_train_uncor, y_train);

  blend: 0.9026    (0.910 ± 0.007)    66.8s


In [185]:
from sklearn.model_selection import RepeatedKFold

clf = BlendingClassifier(get_models_list(
    ['HistGB', 'XGB', 'XGB_GPU', 'LGB']),
    weights=None)
eval_model('blend', clf, X_train_uncor, y_train, cv=RepeatedKFold(n_splits=5, n_repeats=3, random_state=RANDOM_SEED));

  blend: 0.9019    (0.909 ± 0.008)    186.2s


In [51]:
clf = BlendingClassifier(get_models_list(
    ['HistGB', 'XGB', 'XGB_GPU', 'LGB']),
    weights=None)

probs = clf.fit(X_train_uncor, y_train).predict_proba(X_test_uncor)
# to_submit(probs).to_csv('../data/preds/maplight_uncorr_blend_histgb_xgb_lgb.csv')

In [179]:
clf = BlendingClassifier(get_models_list(
    ['HistGB', 'XGB', 'XGB_GPU', 'LGB']),
    weights=[1, 0.5, 0.5, 1],
)

eval_model('blend', clf, X_train_uncor, y_train);
# probs = clf.fit(X_train_uncor, y_train).predict_proba(X_test_uncor)
# to_submit(probs).to_csv('../data/preds/maplight_uncorr_blend_histgb_xgb_lgb.csv')

  blend: 0.9022    (0.910 ± 0.007)    74.9s


In [178]:
eval_model('blend', clf, X_train_uncor, y_train, random_seed=0);
eval_model('blend', clf, X_train_uncor, y_train, random_seed=1);
eval_model('blend', clf, X_train_uncor, y_train, random_seed=2);
eval_model('blend', clf, X_train_uncor, y_train, random_seed=3);

  blend: 0.8992    (0.907 ± 0.008)    75.6s
  blend: 0.9034    (0.912 ± 0.008)    55.6s
  blend: 0.9040    (0.910 ± 0.006)    54.8s
  blend: 0.9046    (0.911 ± 0.006)    54.6s


In [174]:
clf = BlendingClassifier(get_models_list(
    ['HistGB', 'XGB', 'XGB_GPU', 'LGB']),
    weights=[1, 0.2, 0.2, 1],
)

eval_model('blend', clf, X_train, y_train);

  blend: 0.9021    (0.909 ± 0.007)    74.7s


# Maplight Mordred

In [72]:
params = {
    'morgan_fps':True,
    'avalon_fps':True,
    'erg_fps':True,
    'rdkit_feats':True,
    'mord_feats': True,
    'gin_gnn':False,
}

X_train, y_train, X_test = prepare_data(**params)
X_train.shape

(7939, 3465)

In [152]:
print('=========== Maplight Mord ===========')
for name, model in models.items():
    eval_model(name, model, X_train, y_train)

     RF: 0.8749    (0.881 ± 0.006)    7.1s
 HistGB: 0.8952    (0.903 ± 0.008)    56.9s
    XGB: 0.8940    (0.901 ± 0.007)    34.6s
XGB_GPU: 0.8944    (0.901 ± 0.007)    14.2s
     CB: 0.8779    (0.889 ± 0.011)    56.4s
 CB_GPU: 0.8782    (0.886 ± 0.008)    51.2s
    LGB: 0.9001    (0.906 ± 0.006)    31.2s


In [155]:
clf = BlendingClassifier(get_models_list(
    ['HistGB', 'XGB', 'XGB_GPU', 'LGB']),
    weights=None)
eval_model('blend', clf, X_train, y_train);

  blend: 0.9025    (0.909 ± 0.006)    120.0s


In [75]:
X_train_uncor = non_corr_features(X_train, y_train, threshold=0.90)
X_test_uncor = X_test[X_train_uncor.columns]
X_train_uncor.shape[0], X_train_uncor.columns.str.contains('rd_|md_').sum()

(7939, 449)

In [76]:
eval_model('hgb', models['HistGB'], X_train_uncor, y_train);

    hgb: 0.8985    (0.904 ± 0.006)    36.2s


array([0.89581326, 0.90613727, 0.90330033, 0.91351211, 0.9023322 ])

In [154]:
print('=========== Maplight Mord uncorrelated ===========')
for name, model in models.items():
    eval_model(name, model, X_train_uncor, y_train)

     RF: 0.8814    (0.886 ± 0.004)    4.6s
 HistGB: 0.8990    (0.903 ± 0.004)    38.3s
    XGB: 0.8950    (0.901 ± 0.006)    15.0s
XGB_GPU: 0.8951    (0.900 ± 0.005)    9.6s
     CB: 0.8831    (0.891 ± 0.008)    39.3s
 CB_GPU: 0.8769    (0.885 ± 0.008)    40.6s
    LGB: 0.8990    (0.905 ± 0.006)    15.0s


In [156]:
clf = BlendingClassifier(get_models_list(
    ['HistGB', 'XGB', 'XGB_GPU', 'LGB']),
    weights=None)
eval_model('blend', clf, X_train_uncor, y_train);

  blend: 0.9037    (0.909 ± 0.005)    78.0s


In [66]:
from sklearn.model_selection import ShuffleSplit, StratifiedKFold, StratifiedShuffleSplit
clf = BlendingClassifier(get_models_list(
    ['HistGB', 'XGB', 'XGB_GPU', 'LGB']),
    weights=None)

eval_model('hGB', clf, X_train_uncor, y_train, cv=StratifiedKFold(random_state=RANDOM_SEED, shuffle=True));
eval_model('hGB', clf, X_train_uncor, y_train, cv=ShuffleSplit(random_state=RANDOM_SEED));
eval_model('hGB', clf, X_train_uncor, y_train, cv=StratifiedShuffleSplit(random_state=RANDOM_SEED));

    hGB: 0.9062    (0.911 ± 0.005)    88.5s
    hGB: 0.9075    (0.912 ± 0.004)    185.2s


KeyboardInterrupt: 

In [160]:
probs = clf.fit(X_train_uncor, y_train).predict_proba(X_test_uncor)

In [166]:
to_submit(probs).to_csv('../data/preds/maplight_mord_uncorr_blend_hgb_xgb_xgb_lgb.csv')

# Maplight GNN

In [None]:
params = {
    'morgan_fps':True,
    'avalon_fps':True,
    'erg_fps':True,
    'rdkit_feats':True,
    'mord_feats': False,
    'gin_gnn':True,
}

X_train, y_train, X_test = prepare_data(**params)

In [51]:
print('=========== Maplight GNN ===========')
for name, model in models.items():
    eval_model(name, model, X_train, y_train)

     RF: 0.8735    (0.879 ± 0.005)    4.5s
 HistGB: 0.8977    (0.903 ± 0.006)    42.6s
    XGB: 0.8926    (0.902 ± 0.009)    22.8s
XGB_GPU: 0.8926    (0.900 ± 0.007)    13.1s
     CB: 0.8804    (0.889 ± 0.009)    59.9s
 CB_GPU: 0.8734    (0.883 ± 0.009)    42.3s
    LGB: 0.8951    (0.903 ± 0.008)    24.8s


In [29]:
# clf = models['HistGB']
# clf.fit(X_train, y_train)
# pred = clf.predict_proba(X_test)

In [30]:
# arr_to_submit(pred[:, 1]).to_csv('../submits/maplight_gnn_histgb.csv')

In [None]:
from sklearn.model_selection import ShuffleSplit, StratifiedKFold, StratifiedShuffleSplit

eval_model('hGB', models['HistGB'], X_train, y_train, cv=StratifiedKFold(random_state=RANDOM_SEED, shuffle=True));
eval_model('hGB', models['HistGB'], X_train, y_train, cv=ShuffleSplit(random_state=RANDOM_SEED));
eval_model('hGB', models['HistGB'], X_train, y_train, cv=StratifiedShuffleSplit(random_state=RANDOM_SEED));

In [52]:
from src.corr import non_corr_features

X_train_uncor = non_corr_features(X_train, y_train)
X_test_uncor = X_test[X_train_uncor.columns]

________________________________________________________________________________
[Memory] Calling src.corr.get_corr...
get_corr(      morgan_0  morgan_1  morgan_2  morgan_3  morgan_4  morgan_5  morgan_6  \
0          0.0       0.0       0.0       0.0       1.0       0.0       0.0   
1          0.0       0.0       0.0       0.0       0.0       0.0       0.0   
2          0.0       0.0       0.0       0.0       0.0       0.0       0.0   
3          0.0       0.0       0.0       0.0       0.0       0.0       0.0   
4          0.0       1.0       0.0       0.0       0.0       0.0       0.0   
...        ...       ...       ...       ...       ...       ...       ...   
7934       0.0       0.0       0.0       0.0       0.0       0.0       0.0   
7935       0.0       1.0       0.0       0.0       0.0       0.0       0.0  ...)
________________________________________________________get_corr - 88.0s, 1.5min


In [53]:
print('=========== MaplightGNN uncorrelated ===========')

for name, model in models.items():
    eval_model(name, model, X_train_uncor, y_train)

     RF: 0.8757    (0.881 ± 0.006)    4.5s
 HistGB: 0.8959    (0.903 ± 0.007)    41.8s
    XGB: 0.8927    (0.899 ± 0.006)    20.9s
XGB_GPU: 0.8920    (0.901 ± 0.009)    12.7s
     CB: 0.8781    (0.887 ± 0.009)    60.5s
 CB_GPU: 0.8781    (0.884 ± 0.006)    42.2s
    LGB: 0.8954    (0.903 ± 0.008)    24.4s


In [None]:
# X_train = pd.read_pickle('../data/processed/X_train.pkl.zip')
# X_test = pd.read_pickle('../data/processed/X_test.pkl.zip')
# 
# y_train = pd.read_pickle('../data/processed/y_train.pkl')

# Maplight GNN + Mordred

In [60]:
from sklearn.preprocessing import OneHotEncoder

params = {
    'morgan_fps':True,
    'avalon_fps':True,
    'erg_fps':True,
    'rdkit_feats':True,
    'mord_feats': True,
    'gin_gnn':True,
}

X_train, y_train, X_test = prepare_data(**params)

In [55]:
print('============= Maplight GNN + Mordred ============')
for name, model in models.items():
    eval_model(name, model, X_train, y_train)

     RF: 0.8650    (0.871 ± 0.006)    9.4s
 HistGB: 0.8995    (0.905 ± 0.006)    89.7s
    XGB: 0.8943    (0.900 ± 0.006)    73.5s
XGB_GPU: 0.8927    (0.900 ± 0.007)    25.1s
     CB: 0.8799    (0.887 ± 0.007)    87.0s
 CB_GPU: 0.8754    (0.884 ± 0.009)    69.9s
    LGB: 0.8967    (0.904 ± 0.007)    48.1s


In [137]:
from src.utils import BlendingClassifier

clf = BlendingClassifier(
    get_models_list(
        ['HistGB', 'XGB', 'XGB_GPU', 'LGB'],
    ),
    weights=None
)

eval_model('blend', clf, X_train, y_train);
# arr_to_submit(y_pred).to_csv('../submits/maplight_gnn_mord_hgb.csv')

  blend: 0.9018    (0.908 ± 0.007)    182.9s


array([0.90241797, 0.91132905, 0.90049903, 0.91926353, 0.90894378])

In [140]:
clf = BlendingClassifier(
    get_models_list(
        ['HistGB', 'XGB', 'XGB_GPU', 'LGB'],
    ),
    weights=[1, 0.5, 0.2, 1],
)

eval_model('blend', clf, X_train, y_train);

  blend: 0.9014    (0.908 ± 0.007)    158.0s


In [61]:
X_train_uncor = non_corr_features(X_train, y_train, threshold=0.95)
X_test_uncor = X_test[X_train.columns]
X_train_uncor.shape

(7939, 3140)

In [57]:
print('============= Maplight GNN + Mordred uncorrelated ============')

for name, model in models.items():
    eval_model(name, model, X_train_uncor, y_train)

     RF: 0.8730    (0.877 ± 0.004)    6.9s
 HistGB: 0.8952    (0.902 ± 0.007)    67.1s
    XGB: 0.8946    (0.898 ± 0.004)    45.4s
XGB_GPU: 0.8935    (0.902 ± 0.008)    18.7s
     CB: 0.8781    (0.885 ± 0.007)    67.5s
 CB_GPU: 0.8778    (0.887 ± 0.009)    56.8s
    LGB: 0.8979    (0.905 ± 0.007)    27.5s


In [139]:
clf = BlendingClassifier(
    get_models_list(
        ['HistGB', 'XGB', 'XGB_GPU', 'LGB'],
    ),
    weights=None
)

eval_model('blend', clf, X_train_uncor, y_train)

  blend: 0.9014    (0.909 ± 0.007)    123.9s


array([0.90263829, 0.90781323, 0.90090324, 0.92194256, 0.91115503])

In [141]:
clf = BlendingClassifier(
    get_models_list(
        ['HistGB', 'XGB', 'XGB_GPU', 'LGB'],
    ),
    weights=[1, 0.5, 0.2, 1],
)

eval_model('blend', clf, X_train_uncor, y_train)

  blend: 0.9015    (0.908 ± 0.007)    119.4s


array([0.90147517, 0.90769868, 0.90195553, 0.92058225, 0.91056388])

# Selected Features

In [58]:
top_10_trial_feats = [
    'gin_101',
 'gin_116',
 'gin_124',
 'gin_125',
 'gin_127',
 'gin_128',
 'gin_130',
 'gin_132',
 'gin_134',
 'gin_135',
 'gin_142',
 'gin_147',
 'gin_149',
 'gin_152',
 'gin_156',
 'gin_159',
 'gin_162',
 'gin_164',
 'gin_167',
 'gin_172',
 'gin_177',
 'gin_179',
 'gin_183',
 'gin_189',
 'gin_201',
 'gin_206',
 'gin_208',
 'gin_217',
 'gin_222',
 'gin_223',
 'gin_226',
 'gin_227',
 'gin_23',
 'gin_231',
 'gin_232',
 'gin_237',
 'gin_243',
 'gin_256',
 'gin_262',
 'gin_263',
 'gin_264',
 'gin_266',
 'gin_268',
 'gin_276',
 'gin_278',
 'gin_28',
 'gin_282',
 'gin_285',
 'gin_288',
 'gin_290',
 'gin_293',
 'gin_297',
 'gin_35',
 'gin_36',
 'gin_46',
 'gin_50',
 'gin_51',
 'gin_52',
 'gin_53',
 'gin_60',
 'gin_67',
 'gin_72',
 'gin_90',
 'gin_93',
 'gin_94',
 'gin_95',
 'gin_96',
 'gin_99',
 'md_AATS0p',
 'md_AATS1s',
 'md_AATS2d',
 'md_AATS3i',
 'md_AATS3m',
 'md_AATS3s',
 'md_AATS4i',
 'md_AATS5Z',
 'md_AATS5are',
 'md_AATS5dv',
 'md_AATS8are',
 'md_AATS8d',
 'md_AATS8i',
 'md_AATS8v',
 'md_AATSC0dv',
 'md_AATSC0s',
 'md_AATSC1are',
 'md_AATSC1dv',
 'md_AATSC3i',
 'md_AATSC3pe',
 'md_AATSC3s',
 'md_AATSC4m',
 'md_AATSC4s',
 'md_AATSC5are',
 'md_AATSC5c',
 'md_AATSC5d',
 'md_AATSC5i',
 'md_AATSC5m',
 'md_AATSC6d',
 'md_AATSC8i',
 'md_AATSC8m',
 'md_AETA_beta',
 'md_AETA_beta_ns_d',
 'md_AETA_eta',
 'md_AETA_eta_F',
 'md_AMID_O',
 'md_ATS0Z',
 'md_ATS5s',
 'md_ATSC0are',
 'md_ATSC0dv',
 'md_ATSC0m',
 'md_ATSC1i',
 'md_ATSC3Z',
 'md_ATSC3c',
 'md_ATSC3d',
 'md_ATSC3dv',
 'md_ATSC3i',
 'md_ATSC3p',
 'md_ATSC4d',
 'md_ATSC5m',
 'md_ATSC6are',
 'md_ATSC6c',
 'md_ATSC6m',
 'md_ATSC6p',
 'md_ATSC6s',
 'md_ATSC7c',
 'md_ATSC7d',
 'md_ATSC7i',
 'md_ATSC8c',
 'md_ATSC8v',
 'md_BCUTc-1h',
 'md_BCUTd-1h',
 'md_ETA_beta_ns_d',
 'md_ETA_dEpsilon_D',
 'md_ETA_dPsi_A',
 'md_ETA_epsilon_4',
 'md_ETA_epsilon_5',
 'md_ETA_eta_L',
 'md_ETA_shape_x',
 'md_GATS1m',
 'md_GATS2c',
 'md_GATS2i',
 'md_GATS2m',
 'md_GATS3v',
 'md_GATS4are',
 'md_GATS4dv',
 'md_GATS4i',
 'md_GATS4m',
 'md_GATS5p',
 'md_GATS5s',
 'md_GATS6p',
 'md_GATS8c',
 'md_GGI3',
 'md_IC0',
 'md_IC1',
 'md_IC2',
 'md_JGI1',
 'md_JGI5',
 'md_JGI6',
 'md_JGI7',
 'md_MATS1c',
 'md_MATS1s',
 'md_MATS1v',
 'md_MATS2s',
 'md_MATS4are',
 'md_MATS4i',
 'md_MATS5c',
 'md_MATS5p',
 'md_MATS5s',
 'md_MATS6s',
 'md_MATS8c',
 'md_MATS8s',
 'md_MW',
 'md_Mi',
 'md_NaaNH',
 'md_NaasC',
 'md_NdssC',
 'md_NsCH3',
 'md_NssCH2',
 'md_PEOE_VSA7',
 'md_Radius',
 'md_SaaaC',
 'md_SlogP_VSA3',
 'md_SlogP_VSA5',
 'md_SlogP_VSA8',
 'md_SpDiam_A',
 'md_SpMAD_A',
 'md_SpMAD_Dzp',
 'md_SsF',
 'md_SssssC',
 'md_TSRW10',
 'md_VE1_A',
 'md_VE2_A',
 'md_VR2_A',
 'md_VSA_EState5',
 'md_WPath',
 'md_Xc-5d',
 'md_Xp-2dv',
 'md_ZMIC2',
 'md_fMF',
 'md_mZagreb2',
 'md_n10FHRing',
 'md_n10FRing',
 'md_n10FaRing',
 'md_n6Ring',
 'md_n7AHRing',
 'md_n7aRing',
 'md_n8ARing',
 'md_n8FHRing',
 'md_n9ARing',
 'md_nAHRing',
 'md_nFRing',
 'md_nFaRing',
 'md_nG12FaHRing',
 'md_nHBDon',
 'rd_FpDensityMorgan2',
 'rd_Kappa3',
 'rd_MaxPartialCharge',
 'rd_NOCount',
 'rd_NumSaturatedHeterocycles',
 'rd_PEOE_VSA1',
 'rd_PEOE_VSA10',
 'rd_SMR_VSA4',
 'rd_SMR_VSA6',
 'rd_VSA_EState10',
 'rd_VSA_EState2',
 'rd_VSA_EState8',
 'rd_fr_Ar_COO',
 'rd_fr_Ar_N',
 'rd_fr_C_S',
 'rd_fr_HOCCN',
 'rd_fr_Imine',
 'rd_fr_NH1',
 'rd_fr_furan',
 'rd_fr_hdrzine',
 'rd_fr_imidazole',
 'rd_fr_ketone',
 'rd_fr_ketone_Topliss',
 'rd_fr_methoxy',
 'rd_fr_piperdine',
 'rd_fr_pyridine',
 'rd_fr_sulfone',
 'rd_fr_unbrch_alkane',
 'rd_qed'
]

In [59]:
params = {
    'morgan_fps':True,
    'avalon_fps':True,
    'erg_fps':True,
    'rdkit_feats':True,
    'mord_feats': True,
    'gin_gnn':True,
}

X_train, y_train, X_test = prepare_data(**params)
fps_offset = 1024 + 1024 + 315
fps_cols = X_train.columns[:fps_offset].tolist()

In [60]:
X_train = X_train[fps_cols + top_10_trial_feats]
X_test = X_test[fps_cols + top_10_trial_feats]

print('============= Top feats from 10 trials ============')

for name, model in models.items():
    eval_model(name, model, X_train, y_train)

     RF: 0.8554    (0.863 ± 0.007)    3.9s
 HistGB: 0.8560    (0.864 ± 0.008)    36.2s
    XGB: 0.8473    (0.856 ± 0.008)    24.4s
XGB_GPU: 0.8492    (0.858 ± 0.008)    10.5s
     CB: 0.8427    (0.851 ± 0.008)    37.5s
 CB_GPU: 0.8363    (0.846 ± 0.009)    39.1s
    LGB: 0.8572    (0.865 ± 0.008)    26.1s


In [61]:
top_feats = ['rd_FpDensityMorgan2', 'rd_Kappa3', 'rd_MaxPartialCharge',
       'rd_NOCount', 'rd_NumSaturatedHeterocycles', 'rd_PEOE_VSA1',
       'rd_PEOE_VSA10', 'rd_SMR_VSA4', 'rd_SMR_VSA6', 'rd_VSA_EState10',
       'rd_VSA_EState2', 'rd_VSA_EState8', 'rd_fr_Ar_COO', 'rd_fr_Ar_N',
       'rd_fr_C_S', 'rd_fr_HOCCN', 'rd_fr_hdrzine', 'rd_fr_imidazole',
       'rd_fr_ketone', 'rd_fr_ketone_Topliss', 'rd_fr_methoxy',
       'rd_fr_piperdine', 'rd_fr_pyridine', 'rd_fr_sulfone',
       'rd_fr_unbrch_alkane', 'md_SpDiam_A', 'md_SpMAD_A', 'md_VE1_A',
       'md_VE2_A', 'md_ATS5s', 'md_ATS0Z', 'md_AATS5dv', 'md_AATS2d',
       'md_AATS8d', 'md_AATS1s', 'md_AATS3s', 'md_AATS5Z', 'md_AATS8v',
       'md_AATS5are', 'md_AATS8are', 'md_AATS0p', 'md_AATS3i',
       'md_AATS4i', 'md_ATSC3c', 'md_ATSC6c', 'md_ATSC8c', 'md_ATSC0dv',
       'md_ATSC3dv', 'md_ATSC3d', 'md_ATSC4d', 'md_ATSC7d', 'md_ATSC6s',
       'md_ATSC3Z', 'md_ATSC0m', 'md_ATSC5m', 'md_ATSC6m', 'md_ATSC8v',
       'md_ATSC0are', 'md_ATSC6are', 'md_ATSC6p', 'md_ATSC1i',
       'md_ATSC3i', 'md_AATSC5c', 'md_AATSC0dv', 'md_AATSC1dv',
       'md_AATSC6d', 'md_AATSC0s', 'md_AATSC3s', 'md_AATSC4m',
       'md_AATSC8m', 'md_AATSC3i', 'md_AATSC5i', 'md_AATSC8i',
       'md_MATS1c', 'md_MATS5c', 'md_MATS8c', 'md_MATS1s', 'md_MATS5s',
       'md_MATS6s', 'md_MATS8s', 'md_MATS1v', 'md_MATS4are', 'md_MATS5p',
       'md_MATS4i', 'md_GATS2c', 'md_GATS8c', 'md_GATS5s', 'md_GATS1m',
       'md_GATS2m', 'md_GATS4are', 'md_GATS5p', 'md_GATS6p', 'md_GATS2i',
       'md_GATS4i', 'md_BCUTc-1h', 'md_BCUTd-1h', 'md_SpMAD_Dzp',
       'md_Xc-5d', 'md_Xp-2dv', 'md_Mi', 'md_NsCH3', 'md_NssCH2',
       'md_NdssC', 'md_NaasC', 'md_NaaNH', 'md_SaaaC', 'md_ETA_shape_x',
       'md_AETA_beta', 'md_ETA_beta_ns_d', 'md_AETA_beta_ns_d',
       'md_AETA_eta', 'md_ETA_eta_L', 'md_AETA_eta_F', 'md_ETA_epsilon_4',
       'md_ETA_epsilon_5', 'md_ETA_dEpsilon_D', 'md_ETA_dPsi_A', 'md_fMF',
       'md_IC1', 'md_IC2', 'md_ZMIC2', 'md_PEOE_VSA7', 'md_SlogP_VSA3',
       'md_SlogP_VSA8', 'md_VSA_EState5', 'md_AMID_O', 'md_n6Ring',
       'md_n7aRing', 'md_n8ARing', 'md_n9ARing', 'md_nAHRing',
       'md_n7AHRing', 'md_nFRing', 'md_n10FRing', 'md_n8FHRing',
       'md_n10FHRing', 'md_nFaRing', 'md_nG12FaHRing', 'md_GGI3',
       'md_JGI1', 'md_JGI5', 'md_JGI7', 'md_Radius', 'md_TSRW10', 'md_MW',
       'md_WPath', 'gin_23', 'gin_28', 'gin_35', 'gin_36', 'gin_46',
       'gin_50', 'gin_53', 'gin_60', 'gin_67', 'gin_72', 'gin_90',
       'gin_94', 'gin_95', 'gin_96', 'gin_99', 'gin_101', 'gin_116',
       'gin_124', 'gin_125', 'gin_127', 'gin_128', 'gin_130', 'gin_132',
       'gin_134', 'gin_135', 'gin_142', 'gin_149', 'gin_152', 'gin_159',
       'gin_162', 'gin_164', 'gin_172', 'gin_183', 'gin_189', 'gin_201',
       'gin_206', 'gin_222', 'gin_223', 'gin_232', 'gin_237', 'gin_256',
       'gin_263', 'gin_264', 'gin_268', 'gin_276', 'gin_282', 'gin_288',
       'gin_290', 'gin_293', 'gin_297']

In [62]:
X_train, y_train, X_test = prepare_data(**params)

X_train = X_train[fps_cols + top_feats]
X_test = X_test[fps_cols + top_feats]

print('============= Top 1 feats  ============')

for name, model in models.items():
    eval_model(name, model, X_train, y_train)

     RF: 0.8587    (0.864 ± 0.006)    4.0s
 HistGB: 0.8565    (0.865 ± 0.008)    35.6s
    XGB: 0.8502    (0.859 ± 0.008)    19.6s
XGB_GPU: 0.8505    (0.860 ± 0.009)    9.7s
     CB: 0.8426    (0.852 ± 0.009)    35.6s
 CB_GPU: 0.8357    (0.844 ± 0.009)    38.3s
    LGB: 0.8566    (0.864 ± 0.007)    23.8s


## Blending

In [120]:



models_list = [
    ('RF', RandomForestClassifier()),
    ('LGB', lgb.LGBMClassifier()),
]

blend_clf = BlendingClassifier(models_list)
    
check_estimator(blend_clf)    

AssertionError: Estimator BlendingClassifier doesn't check for NaN and inf in fit.

In [115]:
params = {
    'morgan_fps':False,
    'avalon_fps':False,
    'erg_fps':False,
    'rdkit_feats':True,
    'mord_feats': False,
    'gin_gnn':False,
}

X_train, y_train, X_test = prepare_data(**params)

for name, model in models.items():
    eval_model(name, model, X_train, y_train)

     RF: 0.8908    (0.895 ± 0.004)    2.7s
 HistGB: 0.8913    (0.897 ± 0.005)    3.5s
    XGB: 0.8901    (0.896 ± 0.006)    9.5s
XGB_GPU: 0.8859    (0.893 ± 0.007)    3.2s
     CB: 0.8793    (0.885 ± 0.006)    4.1s
 CB_GPU: 0.8809    (0.883 ± 0.002)    8.8s
    LGB: 0.8902    (0.896 ± 0.006)    3.0s


In [121]:
blend_clf = BlendingClassifier([(k,v) for k,v in models.items()])

eval_model('Blend', blend_clf, X_train, y_train);

  Blend: 0.8986    (0.903 ± 0.005)    52.0s


In [130]:
blend_clf = BlendingClassifier(get_models_list(['RF', 'HistGB', 'XGB', 'LGB']))
eval_model('Blend', blend_clf, X_train, y_train);

  Blend: 0.8986    (0.904 ± 0.005)    22.3s


In [131]:
blend_clf = BlendingClassifier(get_models_list(['RF', 'HistGB', 'XGB', 'LGB']), weights=[0.5, 1, 0.2, 0.2])
eval_model('Blend', blend_clf, X_train, y_train);

  Blend: 0.8978    (0.903 ± 0.005)    18.2s


In [134]:
blend_clf = BlendingClassifier(get_models_list(['RF', 'HistGB', 'XGB', 'LGB']), weights=[0.5, 1, 0.2, 0.2])
eval_model('Blend', blend_clf, X_train, y_train);

  Blend: 0.8978    (0.903 ± 0.005)    26.8s
