In [173]:
%load_ext autoreload
%autoreload 2

import sys
sys.path.append('../')

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [219]:
import time
from tqdm import tqdm

import numpy as np
import pandas as pd

from sklearn.linear_model import RidgeClassifier, LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import  KFold
from src.utils import scale, eval_model
import xgboost as xgb
import catboost as cb
import lightgbm as lgb
import seaborn as sns
from src.corr import non_corr_features
from src.maplight_gnn import get_fingerprints
from src.utils import get_fps_cols, scale, OffsetScaler, arr_to_submit, drop_nans_non_unique


# Scorer names in the form accepted by scikit-learn's `scoring=` argument.
mae = 'neg_mean_absolute_error'
mse = 'neg_mean_squared_error'
rmse = 'neg_root_mean_squared_error'
# ROC AUC is a greater-is-better metric, so scikit-learn registers it as plain
# 'roc_auc'; 'neg_roc_auc_score' is not a known scorer name.
roc_auc = 'roc_auc'

N_JOBS = 24          # worker threads handed to every model that accepts them
RANDOM_SEED = 42     # shared seed so runs are repeatable

# prepare models: name -> unfitted classifier, iterated by the evaluation cells
models = {}

models['RF'] = RandomForestClassifier(random_state=RANDOM_SEED, n_jobs=N_JOBS)

models['HistGB'] = HistGradientBoostingClassifier(random_state=RANDOM_SEED)

models['XGB'] = xgb.XGBClassifier(random_state=RANDOM_SEED, n_jobs=N_JOBS, verbosity=0,)

# NOTE(review): tree_method='gpu_hist', predictor and gpu_id are deprecated in
# XGBoost >= 2.0 in favour of device='cuda:<id>' -- confirm the installed
# version before migrating these arguments.
models['XGB_GPU'] = xgb.XGBClassifier(random_state=RANDOM_SEED, n_jobs=N_JOBS, verbosity=0, 
                                  tree_method='gpu_hist', predictor='gpu_predictor', gpu_id=1)

models['CB'] = cb.CatBoostClassifier(iterations=100, random_seed=RANDOM_SEED, thread_count=N_JOBS, verbose=False)

models['CB_GPU'] = cb.CatBoostClassifier(iterations=100, random_seed=RANDOM_SEED, thread_count=N_JOBS, verbose=False, task_type="GPU")

models['LGB'] = lgb.LGBMClassifier(random_state=RANDOM_SEED, n_jobs=N_JOBS, verbose=-1)

In [220]:
# Read the processed train / test tables; the first CSV column becomes the index.
train, test = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ('../data/processed/train.csv', '../data/processed/test.csv')
)

# Maplight

In [239]:
from sklearn.preprocessing import OneHotEncoder

# Feature recipe for this section: Morgan, Avalon, ErG and RDKit blocks on;
# Mordred and GIN blocks off.
params = dict(
    morgan_fps=True,
    avalon_fps=True,
    erg_fps=True,
    rdkit_feats=True,
    mord_feats=False,
    gin_gnn=False,
)
y_train = pd.read_pickle('../data/processed/y_train.pkl')
X_train, X_test = (get_fingerprints(frame.smi, **params) for frame in (train, test))

# NOTE(review): 1024 + 1024 + 315 presumably equals the combined width of the
# Morgan, Avalon and ErG fingerprint blocks -- confirm against get_fingerprints.
fps_offset = 1024 + 1024 + 315

# The scaler is fitted on train only and then reused for test.
scaler = OffsetScaler(fps_offset)
X_train, X_test = scaler.fit_transform(X_train), scaler.transform(X_test)

# Append the one-hot encoded `prop` column; the encoder is fitted on train only.
ohe = OneHotEncoder(sparse_output=False)
prop = ohe.fit_transform(train[['prop']])
X_train = np.concatenate((X_train, prop), axis=1)

prop = ohe.transform(test[['prop']])
X_test = np.concatenate((X_test, prop), axis=1)

print(X_train.shape)

# drop nans and duplicated, then keep exactly the surviving columns in test
X_train = drop_nans_non_unique(pd.DataFrame(X_train))
X_test = pd.DataFrame(X_test)[X_train.columns]

print(X_train.shape)

(7939, 2566)
(7939, 2455)


In [240]:
# The features evaluated here were built with gin_gnn=False, so the banner is
# "Maplight"; the previous "Maplight GNN" text mislabelled these results.
print('=========== Maplight ===========')
for name, model in models.items():
    # to_scale=False: the matrix was already passed through OffsetScaler.
    eval_model(name, model, X_train, y_train, to_scale=False)

     RF: 0.8874    (0.893 ± 0.006)    3.1s
 HistGB: 0.8956    (0.904 ± 0.008)    49.6s
    XGB: 0.8951    (0.901 ± 0.006)    20.1s
XGB_GPU: 0.8974    (0.902 ± 0.005)    16.9s
     CB: 0.8802    (0.890 ± 0.010)    14.8s
 CB_GPU: 0.8848    (0.893 ± 0.008)    11.0s
    LGB: 0.8989    (0.905 ± 0.007)    9.3s


In [245]:
# NOTE(review): X_train_uncor is created by the cell that follows this one
# (execution count 244 vs. 245 here), so on a fresh kernel this cell only works
# when it is run after that cell.
print(X_train_uncor.shape)

(7939, 2400)


In [244]:
from src.corr import non_corr_features

# Keep the columns returned by non_corr_features and select the same columns
# from the test frame so both matrices stay aligned.
X_train_uncor = non_corr_features(X_train, y_train)
X_test_uncor = X_test[X_train_uncor.columns]

# These features come from the gin_gnn=False recipe, so the banner says
# "Maplight"; the previous "Maplight GNN" text mislabelled these results.
print('=========== Maplight uncorrelated ===========')
for name, model in models.items():
    eval_model(name, model, X_train_uncor, y_train, to_scale=False)

     RF: 0.8873    (0.892 ± 0.005)    3.2s
 HistGB: 0.8972    (0.904 ± 0.007)    46.6s
    XGB: 0.8935    (0.899 ± 0.005)    19.5s
XGB_GPU: 0.8964    (0.904 ± 0.008)    16.1s
     CB: 0.8797    (0.888 ± 0.008)    13.8s
 CB_GPU: 0.8848    (0.891 ± 0.006)    9.4s
    LGB: 0.8980    (0.906 ± 0.008)    8.4s


# Maplight GNN

In [221]:
# Feature recipe with the `gin_gnn` block switched on; Mordred stays off.
params = dict(
    morgan_fps=True,
    avalon_fps=True,
    erg_fps=True,
    rdkit_feats=True,
    mord_feats=False,
    gin_gnn=True,
)
y_train = pd.read_pickle('../data/processed/y_train.pkl')
X_train, X_test = (get_fingerprints(frame.smi, **params) for frame in (train, test))

X_train.shape

(7939, 2863)

In [222]:
# NOTE(review): 1024 + 1024 + 315 presumably equals the combined width of the
# Morgan, Avalon and ErG fingerprint blocks -- confirm against get_fingerprints.
fps_offset = 1024 + 1024 + 315

# Fit the scaler on train, then apply the fitted scaler to test.
scaler = OffsetScaler(fps_offset)
X_train, X_test = scaler.fit_transform(X_train), scaler.transform(X_test)

In [223]:
from sklearn.preprocessing import OneHotEncoder    

# One-hot encode the `prop` column (categories learned from train only) and
# append the encoded columns to the right of each feature matrix.
ohe = OneHotEncoder(sparse_output=False)
prop = ohe.fit_transform(train[['prop']])
X_train = np.concatenate((X_train, prop), axis=1)

prop = ohe.transform(test[['prop']])
X_test = np.concatenate((X_test, prop), axis=1)

(X_train.shape, X_test.shape)

((7939, 2866), (1221, 2866))

In [224]:
# Run eval_model for every entry of `models` on the current train matrix.
print('=========== Maplight GNN ===========')
for name in models:
    model = models[name]
    eval_model(name, model, X_train, y_train, to_scale=False)

     RF: 0.8733    (0.880 ± 0.007)    4.6s
 HistGB: 0.8975    (0.902 ± 0.005)    66.7s
    XGB: 0.8901    (0.897 ± 0.007)    24.6s
XGB_GPU: 0.8903    (0.900 ± 0.010)    14.5s
     CB: 0.8758    (0.884 ± 0.008)    47.5s
 CB_GPU: 0.8767    (0.886 ± 0.009)    39.4s
    LGB: 0.8916    (0.901 ± 0.010)    13.6s


In [29]:
# Fit the HistGB entry on the full train matrix (this fits the object stored
# in `models` itself) and get class probabilities for the test rows.
clf = models['HistGB'].fit(X_train, y_train)
pred = clf.predict_proba(X_test)

In [30]:
# Write the second predict_proba column out as this model's submission file.
positive_proba = pred[:, 1]
arr_to_submit(positive_proba).to_csv('../submits/maplight_gnn_histgb.csv')

### unique feats only

In [225]:
# Wrap both matrices in DataFrames, filter the train columns with
# drop_nans_non_unique, and keep exactly the surviving columns in test.
X_train = drop_nans_non_unique(pd.DataFrame(X_train))
X_test = pd.DataFrame(X_test)[X_train.columns]

In [226]:
# Run eval_model for every entry of `models` on the filtered train matrix.
print('=========== Maplight GNN unique feats only ===========')
for name in models:
    model = models[name]
    eval_model(name, model, X_train, y_train, to_scale=False)

     RF: 0.8718    (0.877 ± 0.005)    4.0s
 HistGB: 0.8921    (0.900 ± 0.008)    60.1s
    XGB: 0.8922    (0.899 ± 0.007)    38.2s
XGB_GPU: 0.8898    (0.897 ± 0.008)    23.0s
     CB: 0.8762    (0.886 ± 0.010)    20.6s
 CB_GPU: 0.8747    (0.886 ± 0.011)    11.4s
    LGB: 0.8932    (0.902 ± 0.009)    14.0s


In [218]:
# Fit the HistGB entry on the filtered train matrix (this fits the object
# stored in `models` itself) and get class probabilities for the test rows.
clf = models['HistGB'].fit(X_train, y_train)
pred = clf.predict_proba(X_test)

# Write the second predict_proba column out as this variant's submission file.
arr_to_submit(pred[:, 1]).to_csv('../submits/maplight_gnn_histgb_unique_feats.csv')

### non-correlated feats only

In [227]:
from src.corr import non_corr_features

# Keep the train columns returned by non_corr_features and select the same
# column labels from the test frame.
X_train_uncor = non_corr_features(X_train, y_train)
X_test_uncor = X_test.loc[:, X_train_uncor.columns]

________________________________________________________________________________
[Memory] Calling src.corr.get_corr...
get_corr(        0    1    2    3    4    5    6    7    8    9  ...      2857  \
0     0.0  0.0  0.0  0.0  2.0  0.0  0.0  0.0  0.0  0.0  ... -0.761662   
1     0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ... -0.824082   
2     0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.722339   
3     0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  1.0  ...  1.415517   
4     0.0  1.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  1.0  ...  1.042417   
...   ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...       ...   
7934  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.557920   
7935  0.0  1.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  1.0  ...  0.856924   
7936  0.0  1.0  1.0  0.0  0.0  0.0  0.0  0....)
________________________________________________________get_corr - 82.8s, 1.4min


In [228]:
# Show (train shape, test shape) to confirm both frames kept the same columns.
tuple(frame.shape for frame in (X_train_uncor, X_test_uncor))

((7939, 2700), (1221, 2700))

In [229]:
# Run eval_model for every entry of `models` on the uncorrelated train matrix.
print('=========== MaplightGNN uncorrelated ===========')

for name in models:
    model = models[name]
    eval_model(name, model, X_train_uncor, y_train, to_scale=False)

     RF: 0.8733    (0.880 ± 0.007)    4.1s
 HistGB: 0.8959    (0.903 ± 0.007)    64.2s
    XGB: 0.8916    (0.899 ± 0.007)    36.6s
XGB_GPU: 0.8905    (0.898 ± 0.007)    22.3s
     CB: 0.8762    (0.886 ± 0.009)    21.0s
 CB_GPU: 0.8789    (0.888 ± 0.009)    12.3s
    LGB: 0.8912    (0.899 ± 0.008)    16.2s


In [None]:
# X_train = pd.read_pickle('../data/processed/X_train.pkl.zip')
# X_test = pd.read_pickle('../data/processed/X_test.pkl.zip')
# 
# y_train = pd.read_pickle('../data/processed/y_train.pkl')

# Maplight GNN + Mordred

In [232]:
from sklearn.preprocessing import OneHotEncoder

# Feature recipe with every block switched on, including Mordred and GIN.
params = dict(
    morgan_fps=True,
    avalon_fps=True,
    erg_fps=True,
    rdkit_feats=True,
    mord_feats=True,
    gin_gnn=True,
)
y_train = pd.read_pickle('../data/processed/y_train.pkl')
X_train, X_test = (get_fingerprints(frame.smi, **params) for frame in (train, test))

# NOTE(review): 1024 + 1024 + 315 presumably equals the combined width of the
# Morgan, Avalon and ErG fingerprint blocks -- confirm against get_fingerprints.
fps_offset = 1024 + 1024 + 315

# The scaler is fitted on train only and then reused for test.
scaler = OffsetScaler(fps_offset)
X_train, X_test = scaler.fit_transform(X_train), scaler.transform(X_test)

# Append the one-hot encoded `prop` column; the encoder is fitted on train only.
ohe = OneHotEncoder(sparse_output=False)
prop = ohe.fit_transform(train[['prop']])
X_train = np.concatenate((X_train, prop), axis=1)

prop = ohe.transform(test[['prop']])
X_test = np.concatenate((X_test, prop), axis=1)

X_train.shape

(7939, 4692)

In [233]:
# Wrap both matrices in DataFrames, filter the train columns with
# drop_nans_non_unique, and keep exactly the surviving columns in test.
X_train = drop_nans_non_unique(pd.DataFrame(X_train))
X_test = pd.DataFrame(X_test)[X_train.columns]

X_train.shape

(7939, 3682)

In [234]:
# Run eval_model for every entry of `models` on the Mordred-augmented matrix.
print('============= Maplight GNN + Mordred ============')
for name in models:
    model = models[name]
    eval_model(name, model, X_train, y_train, to_scale=False)

     RF: 0.8720    (0.878 ± 0.006)    6.9s
 HistGB: 0.8959    (0.903 ± 0.007)    92.1s
    XGB: 0.8944    (0.900 ± 0.006)    65.2s
XGB_GPU: 0.8913    (0.898 ± 0.006)    34.2s
     CB: 0.8797    (0.888 ± 0.008)    38.9s
 CB_GPU: 0.8819    (0.890 ± 0.008)    16.4s
    LGB: 0.8967    (0.904 ± 0.007)    24.7s


In [237]:
# Filter the train columns with non_corr_features (threshold 0.95) and select
# the same column labels from test.
# NOTE(review): X_train / X_test are overwritten here, so re-running this cell
# filters the already-filtered frames instead of the original ones.
X_train = non_corr_features(X_train, y_train, threshold=0.95)
X_test = X_test.loc[:, X_train.columns]
X_train.shape

________________________________________________________________________________
[Memory] Calling src.corr.get_corr...
get_corr(        0    1    2    3    4    5    6    7    8    9  ...      4683  \
0     0.0  0.0  0.0  0.0  2.0  0.0  0.0  0.0  0.0  0.0  ... -0.761662   
1     0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ... -0.824082   
2     0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.722339   
3     0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  1.0  ...  1.415517   
4     0.0  1.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  1.0  ...  1.042417   
...   ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...       ...   
7934  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.557920   
7935  0.0  1.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  1.0  ...  0.856924   
7936  0.0  1.0  1.0  0.0  0.0  0.0  0.0  0....)
________________________________________________________get_corr - 99.1s, 1.7min


(7939, 3063)

In [238]:
# Run eval_model for every entry of `models` on the filtered Mordred matrix.
print('============= Maplight GNN + Mordred uncorrelated ============')

for name in models:
    model = models[name]
    eval_model(name, model, X_train, y_train, to_scale=False)

     RF: 0.8748    (0.879 ± 0.004)    4.7s
 HistGB: 0.8969    (0.903 ± 0.006)    71.2s
    XGB: 0.8920    (0.899 ± 0.007)    46.7s
XGB_GPU: 0.8912    (0.898 ± 0.007)    26.5s
     CB: 0.8821    (0.889 ± 0.007)    27.4s
 CB_GPU: 0.8774    (0.886 ± 0.009)    12.7s
    LGB: 0.8939    (0.903 ± 0.009)    17.9s
