In [1]:
import os
import sys
import numpy as np
import pandas as pd
import pickle
import sys
sys.path.append('./utils/qsar')
from Data_integration import Data_Integration
from Data_preprocess import Data_preprocess
from Rescale import rescale
from Feature_selection import feature_selection_pipeline
from Feature_engineering import feature_engineering
from Model_selection import model_selection
from Meta_analysis import statistic_data, statistic_test
from Posthoc import statical_test

# NOTE(review): the wildcard import binds every public name of utils.vaeutils in the
# notebook namespace and can silently shadow names imported above.
from utils.vaeutils import *
# seed_everything is not imported by name in this cell; presumably it is provided by
# the wildcard import above — TODO confirm. It is called with the fixed seed 42.
seed_everything(42)

In [2]:
# --- Run configuration -------------------------------------------------------
RUN = 'VEGFR2'   # sub-folder of ./raw_data_features/ used for this run
fps = 'RDK7'     # base name of the feature CSV inside that sub-folder
data_path = f'./raw_data_features/{RUN}/{fps}.csv'
# Folder holding the persisted pipeline artefacts. The trailing '/' is kept on
# purpose so code that concatenates file names onto this string keeps working.
SAVE_FITTED_PIPELINE = f'./raw_data_features/{RUN}/pipeline/'
activity_col = 'pIC50'
smiles_col = 'rdkit_SMILES'
task_type = 'R'
var_thresh = 0.05
# NOTE: the modelling cells further down rebind `scoring` to a list of metrics.
scoring = 'r2'

# exist_ok=True replaces the racy "check, then create" pattern: the call succeeds
# whether or not the directory already exists.
os.makedirs(SAVE_FITTED_PIPELINE, exist_ok=True)

# Load the raw feature table and preview its first rows (last expression = cell output).
data = pd.read_csv(data_path)
data.head()

Unnamed: 0,rdkit_SMILES,pIC50,0,1,2,3,4,5,6,7,...,4086,4087,4088,4089,4090,4091,4092,4093,4094,4095
0,CNC(=O)c1c(C)oc2cc(Oc3ccnc4cc(C(=O)N5CCC(OC)C5...,9.853872,0,1,0,0,0,1,1,0,...,0,1,0,1,1,0,0,0,0,1
1,CNC(=O)Nc1ccc(Oc2ncnc3cc(OCCCN4CCCCC4)c(OC)cc2...,9.69897,0,0,0,0,0,1,0,0,...,1,1,0,1,0,0,0,0,0,0
2,CNC(=O)c1c(C)sc2cc(Oc3ccnc4cc(-c5nccn5C)sc34)c...,9.677781,1,1,0,0,0,1,0,1,...,0,1,0,0,1,0,1,0,0,1
3,Cc1c(C(=O)NC2CC2)c2ccc(Oc3ccnc4cc(-c5nccn5C)sc...,9.66354,0,1,0,0,0,1,0,1,...,0,1,0,0,1,0,1,0,0,1
4,O=C1Nc2ccccc2C1=CNc1ccc(OCCCCN2CCOCC2)cc1,9.619789,0,0,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [3]:
# Load the persisted train/test split and the target scaler from SAVE_FITTED_PIPELINE.
# os.path.join is used instead of string concatenation: the folder string already ends
# with '/', so concatenating '/name' produced a doubled path separator.
X_train = np.load(os.path.join(SAVE_FITTED_PIPELINE, 'X_train.npy'))
X_test  = np.load(os.path.join(SAVE_FITTED_PIPELINE, 'X_test.npy'))
# Only the first CSV column is kept as the target.
# NOTE(review): assumes the y_*.csv files were written without an index column,
# otherwise column 0 would be the saved index — TODO confirm against the writer.
y_train = pd.read_csv(os.path.join(SAVE_FITTED_PIPELINE, 'y_train.csv')).iloc[:, 0]
y_test  = pd.read_csv(os.path.join(SAVE_FITTED_PIPELINE, 'y_test.csv')).iloc[:, 0]
smiles_train = np.load(os.path.join(SAVE_FITTED_PIPELINE, 'smiles_train.npy'))
smiles_test = np.load(os.path.join(SAVE_FITTED_PIPELINE, 'smiles_test.npy'))
# SECURITY: pickle.load can execute arbitrary code while deserialising; only load
# rescale_y.pkl when it was produced by a trusted run of this project.
with open(os.path.join(SAVE_FITTED_PIPELINE, 'rescale_y.pkl'), 'rb') as f:
    scale_y = pickle.load(f)

In [4]:
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error, mean_absolute_percentage_error
from sklearn.model_selection import cross_validate, RepeatedKFold
from catboost import CatBoostRegressor
from sklearn.compose import TransformedTargetRegressor
from Scaffold_split import RepeatedStratifiedScaffoldKFold, get_scaffold_groups

# CatBoost baseline (library defaults, fixed seed). The regressor is wrapped in a
# TransformedTargetRegressor so it is trained on targets transformed by `scale_y`
# and its predictions are mapped back to the original target scale.
model = CatBoostRegressor(verbose = 0, random_state = 42)
trans_model = TransformedTargetRegressor(regressor = model, transformer = scale_y)
trans_model.fit(X_train, y_train)

# Cross-validation on the training set: 10 splits x 3 repeats, with groups computed
# from the training SMILES by the project helper get_scaffold_groups.
# NOTE: this rebinds the `scoring` string set in the configuration cell to a list.
scoring = ['r2', 'neg_root_mean_squared_error', 'neg_mean_absolute_error']
groups = get_scaffold_groups(smiles_train)
cv = RepeatedStratifiedScaffoldKFold(n_splits=10, n_repeats=3, random_state=42, scaff_based='median')
scores = cross_validate(trans_model, X_train, y_train, groups=groups, cv=cv, scoring=scoring, n_jobs = -1)

# The error scorers are negated ("neg_*"), so the mean is sign-flipped for display;
# the standard deviation does not depend on the sign.
print("R2 cross validation %.3f ± %.3f" % (scores['test_r2'].mean(),scores['test_r2'].std()))
print("RMSE cross validation: %.3f ± %.3f" % (-scores['test_neg_root_mean_squared_error'].mean(),scores['test_neg_root_mean_squared_error'].std()))
print("MAE cross validation: %.3f ± %.3f" % (-scores['test_neg_mean_absolute_error'].mean(),scores['test_neg_mean_absolute_error'].std()))

# External validation on the held-out test split with the model fitted above.
y_pred_test = trans_model.predict(X_test)

print('*** External Validation ***')
print("R2 = ",r2_score(y_test,y_pred_test))
# RMSE is computed as sqrt(MSE): the `squared=False` argument of mean_squared_error
# was deprecated in scikit-learn 1.4 and removed in 1.6.
print("RMSE = ",np.sqrt(mean_squared_error(y_test,y_pred_test)))
print("MAE = ",mean_absolute_error(y_test,y_pred_test))

R2 cross validation 0.792 ± 0.075
RMSE cross validation: 0.569 ± 0.099
MAE cross validation: 0.307 ± 0.068
*** External Validation ***
R2 =  0.8592851873534305
RMSE =  0.5046151139908822
MAE =  0.26998141971502143


In [5]:
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error, mean_absolute_percentage_error
from sklearn.model_selection import cross_validate, RepeatedKFold
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.compose import TransformedTargetRegressor
from Scaffold_split import RepeatedStratifiedScaffoldKFold, get_scaffold_groups

# HistGradientBoosting baseline (library defaults, fixed seed). The regressor is
# wrapped in a TransformedTargetRegressor so it is trained on targets transformed by
# `scale_y` and its predictions are mapped back to the original target scale.
model = HistGradientBoostingRegressor(random_state = 42, verbose = 0)
trans_model = TransformedTargetRegressor(regressor = model, transformer = scale_y)
trans_model.fit(X_train, y_train)

# Cross-validation on the training set: 10 splits x 3 repeats, with groups computed
# from the training SMILES by the project helper get_scaffold_groups.
# NOTE: this rebinds the `scoring` string set in the configuration cell to a list.
scoring = ['r2', 'neg_root_mean_squared_error', 'neg_mean_absolute_error']
groups = get_scaffold_groups(smiles_train)
cv = RepeatedStratifiedScaffoldKFold(n_splits=10, n_repeats=3, random_state=42, scaff_based='median')
scores = cross_validate(trans_model, X_train, y_train, groups=groups, cv=cv, scoring=scoring, n_jobs = -1)

# The error scorers are negated ("neg_*"), so the mean is sign-flipped for display;
# the standard deviation does not depend on the sign.
print("R2 cross validation %.3f ± %.3f" % (scores['test_r2'].mean(),scores['test_r2'].std()))
print("RMSE cross validation: %.3f ± %.3f" % (-scores['test_neg_root_mean_squared_error'].mean(),scores['test_neg_root_mean_squared_error'].std()))
print("MAE cross validation: %.3f ± %.3f" % (-scores['test_neg_mean_absolute_error'].mean(),scores['test_neg_mean_absolute_error'].std()))

# External validation on the held-out test split with the model fitted above.
y_pred_test = trans_model.predict(X_test)

print('*** External Validation ***')
print("R2 = ",r2_score(y_test,y_pred_test))
# RMSE is computed as sqrt(MSE): the `squared=False` argument of mean_squared_error
# was deprecated in scikit-learn 1.4 and removed in 1.6.
print("RMSE = ",np.sqrt(mean_squared_error(y_test,y_pred_test)))
print("MAE = ",mean_absolute_error(y_test,y_pred_test))

R2 cross validation 0.776 ± 0.080
RMSE cross validation: 0.591 ± 0.104
MAE cross validation: 0.322 ± 0.075
*** External Validation ***
R2 =  0.858678651710933
RMSE =  0.5057014884048164
MAE =  0.27776427532462783


In [6]:
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error, mean_absolute_percentage_error
from sklearn.model_selection import cross_validate, RepeatedKFold
from xgboost import XGBRegressor
from sklearn.compose import TransformedTargetRegressor
from Scaffold_split import RepeatedStratifiedScaffoldKFold, get_scaffold_groups

# XGBoost baseline (library defaults, fixed seed).
# The former `eval_metrics='logloss'` keyword was dropped: the parameter is spelled
# `eval_metric`, log-loss is a classification metric, and no eval_set is passed to
# fit() here, so an evaluation metric would not be used anyway.
model = XGBRegressor(random_state = 42, verbosity=0)
# The regressor is wrapped in a TransformedTargetRegressor so it is trained on targets
# transformed by `scale_y` and its predictions are mapped back to the original scale.
trans_model = TransformedTargetRegressor(regressor = model, transformer = scale_y)
trans_model.fit(X_train, y_train)

# Cross-validation on the training set: 10 splits x 3 repeats, with groups computed
# from the training SMILES by the project helper get_scaffold_groups.
# NOTE: this rebinds the `scoring` string set in the configuration cell to a list.
scoring = ['r2', 'neg_root_mean_squared_error', 'neg_mean_absolute_error']
groups = get_scaffold_groups(smiles_train)
cv = RepeatedStratifiedScaffoldKFold(n_splits=10, n_repeats=3, random_state=42, scaff_based='median')
scores = cross_validate(trans_model, X_train, y_train, groups=groups, cv=cv, scoring=scoring, n_jobs = -1)

# The error scorers are negated ("neg_*"), so the mean is sign-flipped for display;
# the standard deviation does not depend on the sign.
print("R2 cross validation %.3f ± %.3f" % (scores['test_r2'].mean(),scores['test_r2'].std()))
print("RMSE cross validation: %.3f ± %.3f" % (-scores['test_neg_root_mean_squared_error'].mean(),scores['test_neg_root_mean_squared_error'].std()))
print("MAE cross validation: %.3f ± %.3f" % (-scores['test_neg_mean_absolute_error'].mean(),scores['test_neg_mean_absolute_error'].std()))

# External validation on the held-out test split with the model fitted above.
y_pred_test = trans_model.predict(X_test)

print('*** External Validation ***')
print("R2 = ",r2_score(y_test,y_pred_test))
# RMSE is computed as sqrt(MSE): the `squared=False` argument of mean_squared_error
# was deprecated in scikit-learn 1.4 and removed in 1.6.
print("RMSE = ",np.sqrt(mean_squared_error(y_test,y_pred_test)))
print("MAE = ",mean_absolute_error(y_test,y_pred_test))

R2 cross validation 0.758 ± 0.072
RMSE cross validation: 0.616 ± 0.098
MAE cross validation: 0.332 ± 0.070
*** External Validation ***
R2 =  0.832561899030857
RMSE =  0.550449511484071
MAE =  0.3010627529430302


In [7]:
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error, mean_absolute_percentage_error
from sklearn.model_selection import cross_validate, RepeatedKFold
from sklearn.ensemble import RandomForestRegressor
from sklearn.compose import TransformedTargetRegressor
from Scaffold_split import RepeatedStratifiedScaffoldKFold, get_scaffold_groups

# Random-forest baseline (library defaults, fixed seed). The regressor is wrapped in
# a TransformedTargetRegressor so it is trained on targets transformed by `scale_y`
# and its predictions are mapped back to the original target scale.
model = RandomForestRegressor(random_state=42)
trans_model = TransformedTargetRegressor(regressor = model, transformer = scale_y)
trans_model.fit(X_train, y_train)

# Cross-validation on the training set: 10 splits x 3 repeats, with groups computed
# from the training SMILES by the project helper get_scaffold_groups.
# NOTE: this rebinds the `scoring` string set in the configuration cell to a list.
scoring = ['r2', 'neg_root_mean_squared_error', 'neg_mean_absolute_error']
groups = get_scaffold_groups(smiles_train)
cv = RepeatedStratifiedScaffoldKFold(n_splits=10, n_repeats=3, random_state=42, scaff_based='median')
scores = cross_validate(trans_model, X_train, y_train, groups=groups, cv=cv, scoring=scoring, n_jobs = -1)

# The error scorers are negated ("neg_*"), so the mean is sign-flipped for display;
# the standard deviation does not depend on the sign.
print("R2 cross validation %.3f ± %.3f" % (scores['test_r2'].mean(),scores['test_r2'].std()))
print("RMSE cross validation: %.3f ± %.3f" % (-scores['test_neg_root_mean_squared_error'].mean(),scores['test_neg_root_mean_squared_error'].std()))
print("MAE cross validation: %.3f ± %.3f" % (-scores['test_neg_mean_absolute_error'].mean(),scores['test_neg_mean_absolute_error'].std()))

# External validation on the held-out test split with the model fitted above.
y_pred_test = trans_model.predict(X_test)

print('*** External Validation ***')
print("R2 = ",r2_score(y_test,y_pred_test))
# RMSE is computed as sqrt(MSE): the `squared=False` argument of mean_squared_error
# was deprecated in scikit-learn 1.4 and removed in 1.6.
print("RMSE = ",np.sqrt(mean_squared_error(y_test,y_pred_test)))
print("MAE = ",mean_absolute_error(y_test,y_pred_test))

R2 cross validation 0.767 ± 0.082
RMSE cross validation: 0.603 ± 0.112
MAE cross validation: 0.312 ± 0.082
*** External Validation ***
R2 =  0.852206207258598
RMSE =  0.517152279044744
MAE =  0.264572711657122


In [8]:
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error, mean_absolute_percentage_error
from sklearn.model_selection import cross_validate, RepeatedKFold
from sklearn.neighbors import KNeighborsRegressor
from sklearn.compose import TransformedTargetRegressor
from Scaffold_split import RepeatedStratifiedScaffoldKFold, get_scaffold_groups

# k-nearest-neighbours baseline (library defaults). The regressor is wrapped in a
# TransformedTargetRegressor so it is trained on targets transformed by `scale_y`
# and its predictions are mapped back to the original target scale.
model = KNeighborsRegressor()
trans_model = TransformedTargetRegressor(regressor = model, transformer = scale_y)
trans_model.fit(X_train, y_train)

# Cross-validation on the training set: 10 splits x 3 repeats, with groups computed
# from the training SMILES by the project helper get_scaffold_groups.
# NOTE: this rebinds the `scoring` string set in the configuration cell to a list.
scoring = ['r2', 'neg_root_mean_squared_error', 'neg_mean_absolute_error']
groups = get_scaffold_groups(smiles_train)
cv = RepeatedStratifiedScaffoldKFold(n_splits=10, n_repeats=3, random_state=42, scaff_based='median')
scores = cross_validate(trans_model, X_train, y_train, groups=groups, cv=cv, scoring=scoring, n_jobs = -1)

# The error scorers are negated ("neg_*"), so the mean is sign-flipped for display;
# the standard deviation does not depend on the sign.
print("R2 cross validation %.3f ± %.3f" % (scores['test_r2'].mean(),scores['test_r2'].std()))
print("RMSE cross validation: %.3f ± %.3f" % (-scores['test_neg_root_mean_squared_error'].mean(),scores['test_neg_root_mean_squared_error'].std()))
print("MAE cross validation: %.3f ± %.3f" % (-scores['test_neg_mean_absolute_error'].mean(),scores['test_neg_mean_absolute_error'].std()))

# External validation on the held-out test split with the model fitted above.
y_pred_test = trans_model.predict(X_test)

print('*** External Validation ***')
print("R2 = ",r2_score(y_test,y_pred_test))
# RMSE is computed as sqrt(MSE): the `squared=False` argument of mean_squared_error
# was deprecated in scikit-learn 1.4 and removed in 1.6.
print("RMSE = ",np.sqrt(mean_squared_error(y_test,y_pred_test)))
print("MAE = ",mean_absolute_error(y_test,y_pred_test))

R2 cross validation 0.738 ± 0.107
RMSE cross validation: 0.638 ± 0.149
MAE cross validation: 0.287 ± 0.108
*** External Validation ***
R2 =  0.8488891688399447
RMSE =  0.5229234802892113
MAE =  0.23214547094494767
