In [1]:
import sys

from xgboost import XGBRegressor

# Put the parent directory on the import path so the `src.*` imports resolve.
# Guarded so that re-running this cell does not append duplicate entries.
if '../' not in sys.path:
    sys.path.append('../')

In [28]:
import pandas as pd
import numpy as np
import seaborn as sns

from src.representation import get_representation, get_representation_from_series
from src.config import mem
from src.utils import get_fps_offset, OffsetScaler

# Notebook-wide settings: value passed as `n_jobs` to estimators, and the shared random seed.
N_JOBS, RANDOM_SEED = 12, 42

# RDKit log silencing, currently disabled:
# from rdkit import RDLogger
# RDLogger.DisableLog('rdApp.*')


def _load_indexed_csv(path):
    """Read a CSV file, using its first column as the row index."""
    return pd.read_csv(path, index_col=0)


# Training split.
X_train = _load_indexed_csv('../data/processed/X_train.csv')
y_train = _load_indexed_csv('../data/processed/y_train.csv')

# Test split.
X_test = _load_indexed_csv('../data/processed/X_test.csv')
y_test = _load_indexed_csv('../data/processed/y_test.csv')

In [24]:
# Offset computed by the project helper from the training column labels.
# NOTE(review): presumably the number of leading fingerprint columns - confirm in src.utils.
FPS_OFFSET = get_fps_offset(X_train.columns)
scaler = OffsetScaler(offset=FPS_OFFSET)

# Fit on the training values only, then apply the fitted scaler to the test values.
train_scaled_values = scaler.fit_transform(X_train.values)
test_scaled_values = scaler.transform(X_test.values)

# Wrap the transformed arrays back into DataFrames carrying the original labels.
X_train_scale = pd.DataFrame(train_scaled_values, index=X_train.index, columns=X_train.columns)
X_test_scale = pd.DataFrame(test_scaled_values, index=X_test.index, columns=X_test.columns)

In [83]:
# Load the stored feature selection; the names are kept in the column labelled '0'.
best_features_frame = pd.read_csv('../data/tuning/best_features.csv')
best_features = best_features_frame['0'].tolist()

['rd_BalabanJ',
 'rd_Chi2v',
 'rd_EState_VSA4',
 'rd_EState_VSA5',
 'rd_EState_VSA6',
 'rd_EState_VSA8',
 'rd_FractionCSP3',
 'rd_Kappa2',
 'rd_Kappa3',
 'rd_MaxEStateIndex',
 'rd_MinAbsEStateIndex',
 'rd_MinEStateIndex',
 'rd_NumRadicalElectrons',
 'rd_PEOE_VSA10',
 'rd_PEOE_VSA14',
 'rd_PEOE_VSA2',
 'rd_PEOE_VSA4',
 'rd_RingCount',
 'rd_SMR_VSA1',
 'rd_SMR_VSA9',
 'rd_SlogP_VSA1',
 'rd_SlogP_VSA5',
 'rd_VSA_EState1',
 'rd_VSA_EState5',
 'rd_fr_Al_OH_noTert',
 'rd_fr_ArN',
 'rd_fr_Ar_COO',
 'rd_fr_COO2',
 'rd_fr_C_O',
 'rd_fr_C_O_noCOO',
 'rd_fr_C_S',
 'rd_fr_Imine',
 'rd_fr_NH0',
 'rd_fr_NH1',
 'rd_fr_Ndealkylation2',
 'rd_fr_aldehyde',
 'rd_fr_allylic_oxid',
 'rd_fr_azide',
 'rd_fr_barbitur',
 'rd_fr_benzodiazepine',
 'rd_fr_ester',
 'rd_fr_guanido',
 'rd_fr_halogen',
 'rd_fr_hdrzine',
 'rd_fr_imide',
 'rd_fr_ketone_Topliss',
 'rd_fr_lactone',
 'rd_fr_nitro_arom_nonortho',
 'rd_fr_nitroso',
 'rd_fr_para_hydroxylation',
 'rd_fr_phenol_noOrthoHbond',
 'rd_fr_phos_ester',
 'rd_fr_piper

In [85]:
# Column labels '0' .. str(FPS_OFFSET - 1), followed by the selected feature names.
fps_cols = list(map(str, range(FPS_OFFSET)))
selected_cols = fps_cols + best_features

# Same column subset taken from the scaled training and test frames.
X_train_best = X_train_scale[selected_cols]
X_test_best = X_test_scale[selected_cols]

In [11]:
import time

import numpy as np
import pandas as pd

from sklearn.linear_model import Lasso, Ridge, LinearRegression
from sklearn.cross_decomposition import PLSRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import cross_val_score, KFold
from src.utils import OffsetScaler, get_fps_offset
import xgboost as xgb
import seaborn as sns


# scikit-learn scorer names; the "neg_" variants report negated errors.
mae, mse, rmse = (
    'neg_mean_absolute_error',
    'neg_mean_squared_error',
    'neg_root_mean_squared_error',
)
N_JOBS, RANDOM_SEED = 12, 42

# Candidate regressors keyed by a short display name. Dict insertion order is
# the order in which they are later iterated.
models = {
    'Ridge': Ridge(),
    'PLS': PLSRegression(),
    'DT': DecisionTreeRegressor(random_state=RANDOM_SEED),
    'Lasso': Lasso(),
    'KNN': KNeighborsRegressor(n_jobs=N_JOBS),
    'SVR': SVR(),
    'RF': RandomForestRegressor(n_jobs=N_JOBS, random_state=RANDOM_SEED),
    'XGB': xgb.XGBRegressor(n_jobs=N_JOBS, random_state=RANDOM_SEED, verbosity=0),
}

In [86]:
# Shuffled 10-fold splitter shared by every model. With an integer random_state
# each split() call yields the same folds, so building it once is equivalent to
# rebuilding it per model.
kfold = KFold(n_splits=10, shuffle=True, random_state=RANDOM_SEED)

# Per-model array of fold scores, collected under a name of its own so that
# `results` only ever refers to the final DataFrame.
cv_scores = {}

for name, model in models.items():
    tic = time.perf_counter()  # monotonic clock meant for measuring elapsed time
    cv_scores[name] = cross_val_score(model, X_train_best, y_train, cv=kfold, scoring=rmse)
    toc = time.perf_counter()

    # The scorer is 'neg_root_mean_squared_error', so the printed means are negative.
    print("%5s: %3.3f ± %3.3f    %.1fs" % (name, cv_scores[name].mean(), cv_scores[name].std(), toc - tic))

# One column per model, one row per fold.
results = pd.DataFrame(cv_scores)

# Pass the frame by keyword: the meaning of the first positional argument of
# seaborn.boxplot differs between seaborn releases.
sns.boxplot(data=results);

Ridge: -42.928 ± 17.981    2.3s
  PLS: -28.467 ± 2.461    1.0s
   DT: -33.241 ± 2.626    1.9s
Lasso: -27.944 ± 3.453    1.8s
  KNN: -26.900 ± 1.753    0.4s
  SVR: -34.578 ± 2.445    5.6s
   RF: -22.998 ± 1.374    17.4s
  XGB: -23.917 ± 1.264    15.6s


In [91]:
def evaluate_model(model, X, y=None, n_splits=10, scoring=None):
    """Cross-validate ``model`` on ``X`` and print a one-line score summary.

    Parameters
    ----------
    model : estimator
        Estimator handed to ``cross_val_score``.
    X : DataFrame or array
        Feature matrix to evaluate on.
    y : optional
        Targets aligned with ``X``. Defaults to the notebook-level ``y_train``.
    n_splits : int, default 10
        Number of folds for the shuffled ``KFold`` splitter.
    scoring : str, optional
        scikit-learn scorer name. Defaults to the notebook-level ``rmse``
        ('neg_root_mean_squared_error'), for which the printed mean is negative.

    Returns
    -------
    None
        The summary "mean ± std    elapsed-seconds" is printed, not returned.
    """
    # Resolve the defaults at call time so existing two-argument calls keep
    # using the notebook globals exactly as before.
    if y is None:
        y = y_train
    if scoring is None:
        scoring = rmse

    tic = time.perf_counter()  # monotonic clock meant for measuring elapsed time

    kfold = KFold(n_splits=n_splits, shuffle=True, random_state=RANDOM_SEED)
    scores = cross_val_score(model, X, y, cv=kfold, scoring=scoring)

    toc = time.perf_counter()
    print("%3.3f ± %3.3f    %.1fs" % (scores.mean(), scores.std(), toc - tic))

In [92]:
# Default-parameter tree ensembles, seeded with the notebook seed.
rf = RandomForestRegressor(random_state=RANDOM_SEED, n_jobs=N_JOBS)
xgr = xgb.XGBRegressor(verbosity=0, n_jobs=N_JOBS, random_state=RANDOM_SEED)

# Random forest on three column selections (all columns, best_features only,
# fps_cols + best_features), each evaluated on the unscaled frame and then the
# scaled frame.
for column_subset in (None, best_features, fps_cols + best_features):
    for frame in (X_train, X_train_scale):
        evaluate_model(rf, frame if column_subset is None else frame[column_subset])

-22.965 ± 1.311    29.6s
-22.909 ± 1.345    29.5s
-23.396 ± 1.086    8.8s
-23.363 ± 1.049    9.0s
-22.981 ± 1.370    21.4s
-22.998 ± 1.374    21.4s


In [93]:
# XGBoost on three column selections (all columns, best_features only,
# fps_cols + best_features), each evaluated on the unscaled frame and then the
# scaled frame.
for column_subset in (None, best_features, fps_cols + best_features):
    for frame in (X_train, X_train_scale):
        evaluate_model(xgr, frame if column_subset is None else frame[column_subset])

-23.757 ± 1.502    17.1s
-23.654 ± 1.312    16.9s
-23.228 ± 1.262    5.2s
-22.508 ± 1.238    5.6s
-24.020 ± 1.383    15.1s
-23.917 ± 1.264    13.5s


In [96]:
# Fit the random forest on every scaled training column.
# y_train is loaded as a DataFrame; scikit-learn emits a DataConversionWarning
# when a column-vector target is passed where a 1-D array is expected, so hand
# it a flat array instead.
# Assumes y_train has a single target column (the predictions are later wrapped
# in a 1-D Series, which requires a single-output model).
rf.fit(X_train_scale, y_train.to_numpy().ravel())

In [97]:
# Predict with the fitted random forest on the scaled test features.
y_pred = rf.predict(X=X_test_scale)

In [100]:
# Write the predictions as a single 'Results' column, without the index.
predictions = pd.Series(y_pred, name='Results')
predictions.to_csv('../predictions/fps_rd_md_descs_rf.csv', index=False)