# Mo_Ma

In [37]:
from sklearn.metrics import roc_auc_score, precision_recall_curve
from sklearn.metrics import auc as calculate_auc
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score
from tqdm import tqdm
from sklearn.utils import shuffle 
from joblib import load, dump
import numpy as np
import pandas as pd
import os,sys,json
from argparse import ArgumentParser
from scipy.stats.stats import pearsonr
import time
import xgboost

In [38]:
def rmse(y_true, y_pred):
    """Return the root-mean-squared error between targets and predictions.

    Args:
        y_true: Ground-truth target values.
        y_pred: Predicted values, aligned element-wise with ``y_true``.

    Returns:
        The square root of ``mean_squared_error(y_true, y_pred)``.
    """
    return np.sqrt(mean_squared_error(y_true, y_pred))

In [39]:
# Load the activity table; its 'PActivity' column is used as the regression
# target and its length drives the train/valid/test split.
df=pd.read_csv('CB2_Chembl_R6227.csv')

In [40]:
# Regression target: the 'PActivity' column as a float NumPy array.
Y0 = df['PActivity'].astype('float').values
# Display the number of target values as the cell output.
len(Y0)

6227

In [41]:
# Load the feature table for the Mo_Ma section.
# NOTE(review): rows are presumably in the same order as `df` — confirm, since
# the two tables are only ever paired by position.
X0=pd.read_csv('Mo_ma_r.csv')
# Display the row count so it can be compared with len(Y0).
len(X0)

6227

In [42]:
def random_split(df, random_state, split_size = [0.8, 0.1, 0.1]):
    """Shuffle row positions and cut them into train / valid / test index arrays.

    Args:
        df: Any sized object; only ``len(df)`` is used.
        random_state: Seed forwarded to ``shuffle`` so the split is repeatable.
        split_size: Fractions in the order [train, valid, test]. Only the valid
            and test fractions are read; train receives whatever remains.

    Returns:
        Tuple ``(train_idx, valid_idx, test_idx)`` of shuffled row positions.
        The three sizes are also printed.
    """
    n_rows = len(df)
    order = shuffle(np.arange(n_rows), random_state=random_state)
    n_test = int(n_rows * split_size[2])
    n_valid = int(n_rows * split_size[1])
    # Test rows come first, then validation rows; the tail is the training set.
    test_idx = order[:n_test]
    valid_idx = order[n_test:n_test + n_valid]
    train_idx = order[n_test + n_valid:]
    print(len(train_idx), len(valid_idx), len(test_idx))
    return train_idx, valid_idx, test_idx

In [7]:
# Single-seed split of the Mo_Ma features into train / valid / test arrays.
seed = 1
# Convert to a NumPy array only while X0 is still a DataFrame, so re-running
# this cell does not fail on a missing `.values` attribute.
if isinstance(X0, pd.DataFrame):
    X0 = X0.values
# Features and targets are paired purely by row position.
assert len(X0) == len(Y0), "feature matrix and target vector must have the same number of rows"
# random_split only yields positions in range(len(df)), so no bounds filtering is needed.
train_idx, valid_idx, test_idx = random_split(df, random_state=seed)
print(len(train_idx), len(valid_idx), len(test_idx))
X, y = X0[train_idx], Y0[train_idx]
X_valid, y_valid = X0[valid_idx], Y0[valid_idx]
X_test, y_test = X0[test_idx], Y0[test_idx]

4983 622 622
4983 622 622


In [43]:
# Load the saved hyper-parameter set. `json` is already imported in the
# notebook's import cell, and the encoding is pinned so the read does not
# depend on the platform's locale.
with open('params_regress', 'r', encoding='utf-8') as f:
    best_param = json.load(f)

In [44]:
# Build the parameter dict passed to XGBRegressor as a filtered copy, so that
# best_param is left untouched and this cell can be re-run safely.
# NOTE(review): the dropped keys look like device/backend settings from the
# machine the parameters were tuned on — confirm.
_DROPPED_KEYS = ('gpu_id', 'tree_method', 'missing')
use_param = {
    key: value
    for key, value in best_param['params'].items()
    if key not in _DROPPED_KEYS
}
# -1 is forwarded to XGBoost as the `n_jobs` setting.
use_param['n_jobs'] = -1

In [14]:
# Display the final parameter dict that will be passed to XGBRegressor.
use_param

{'base_score': 0.5,
 'booster': 'gbtree',
 'colsample_bylevel': 1,
 'colsample_bynode': 1,
 'colsample_bytree': 0.3,
 'gamma': 0.02,
 'importance_type': 'gain',
 'learning_rate': 0.05,
 'max_delta_step': 0,
 'max_depth': 6,
 'min_child_weight': 4,
 'n_estimators': 10000,
 'n_jobs': -1,
 'nthread': None,
 'objective': 'reg:squarederror',
 'random_state': 0,
 'reg_alpha': 4.0,
 'reg_lambda': 1.2,
 'scale_pos_weight': 1,
 'seed': 123,
 'silent': None,
 'subsample': 0.6,
 'verbosity': 1}

In [15]:
# Instantiate the regressor from the filtered hyper-parameters in use_param.
clf=xgboost.XGBRegressor(**use_param)
#clf=xgboost.XGBRegressor(**best_param['params'])

In [16]:
# Fit on the training split and record how long it takes.
# perf_counter is a monotonic clock, so the interval is immune to
# system clock adjustments during the (multi-minute) fit.
time1 = time.perf_counter()
model = clf.fit(X, y)
time2 = time.perf_counter()
time_fit = time2 - time1
print(f"fit time is: {time_fit}")

fit time is: 284.61198806762695


In [17]:
# Predict once per split and reuse the predictions for both metrics.
valid_pred = clf.predict(X_valid)
test_pred = clf.predict(X_test)
# NOTE: the "r2" values are squared Pearson correlations between targets and
# predictions, not the coefficient of determination.
valid_r2 = pearsonr(y_valid, valid_pred)[0]**2
valid_rmse = rmse(y_valid, valid_pred)
test_r2 = pearsonr(y_test, test_pred)[0]**2
test_rmse = rmse(y_test, test_pred)

In [18]:
# Collect the seed, the validation/test metrics and the fit time in one record.
results = dict(
    seed=seed,
    valid_rmse=valid_rmse,
    valid_r2=valid_r2,
    test_rmse=test_rmse,
    test_r2=test_r2,
    time=time_fit,
)
print('results = ',results)

results =  {'seed': 1, 'valid_rmse': 0.6798157868645288, 'valid_r2': 0.6447870215443375, 'test_rmse': 0.6911373059397857, 'test_r2': 0.6676731547491025, 'time': 284.61198806762695}


In [45]:
# Repeat the split / fit / evaluate experiment on the Mo_Ma features for
# several split seeds.
seeds = [1,2,4,8,16,32,64,128,256,512]
# Convert to a NumPy array only while X0 is still a DataFrame: an earlier cell
# (or a previous run of this one) may already have converted it, in which case
# an unconditional `.values` would raise AttributeError.
if isinstance(X0, pd.DataFrame):
    X0 = X0.values
# Features and targets are paired purely by row position.
assert len(X0) == len(Y0), "feature matrix and target vector must have the same number of rows"
mo_ma_results = []  # one metrics dict per seed, kept for later aggregation
for seed in seeds:
    # random_split only yields positions in range(len(df)), so no bounds filtering is needed.
    train_idx, valid_idx, test_idx = random_split(df, random_state=seed)
    X, y = X0[train_idx], Y0[train_idx]
    X_valid, y_valid = X0[valid_idx], Y0[valid_idx]
    X_test, y_test = X0[test_idx], Y0[test_idx]
    # A fresh estimator per seed so no fitted state carries over.
    clf = xgboost.XGBRegressor(**use_param)
    # perf_counter is a monotonic clock suited to measuring intervals.
    time1 = time.perf_counter()
    model = clf.fit(X, y)
    time2 = time.perf_counter()
    time_fit = time2 - time1
    print(f"fit time is: {time_fit}")
    # Predict once per split and reuse the predictions for both metrics.
    valid_pred = clf.predict(X_valid)
    test_pred = clf.predict(X_test)
    # NOTE: "r2" here is the squared Pearson correlation, not the coefficient of determination.
    valid_r2 = pearsonr(y_valid, valid_pred)[0]**2
    valid_rmse = rmse(y_valid, valid_pred)
    test_r2 = pearsonr(y_test, test_pred)[0]**2
    test_rmse = rmse(y_test, test_pred)
    results = {"seed":seed, 'valid_rmse':valid_rmse,
    'valid_r2':valid_r2,"test_rmse":test_rmse, "test_r2": test_r2,"time":time_fit}
    print('results = ',results)
    mo_ma_results.append(results)

4983 622 622
fit time is: 320.9758150577545
results =  {'seed': 1, 'valid_rmse': 0.6798157868645288, 'valid_r2': 0.6447870215443375, 'test_rmse': 0.6911373059397857, 'test_r2': 0.6676731547491025, 'time': 320.9758150577545}
4983 622 622
fit time is: 311.7244403362274
results =  {'seed': 2, 'valid_rmse': 0.6543562161268149, 'valid_r2': 0.6737183322613969, 'test_rmse': 0.6689389862852707, 'test_r2': 0.6629938726767428, 'time': 311.7244403362274}
4983 622 622
fit time is: 309.7731773853302
results =  {'seed': 4, 'valid_rmse': 0.6923714735972474, 'valid_r2': 0.6558362656082992, 'test_rmse': 0.6836250802623381, 'test_r2': 0.6558790924498649, 'time': 309.7731773853302}
4983 622 622
fit time is: 313.3846046924591
results =  {'seed': 8, 'valid_rmse': 0.6529832097959148, 'valid_r2': 0.6575121072768183, 'test_rmse': 0.6787772439350613, 'test_r2': 0.67443593126759, 'time': 313.3846046924591}
4983 622 622
fit time is: 319.0431785583496
results =  {'seed': 16, 'valid_rmse': 0.6734491423637432, 'val

# Mo_Pu

In [19]:
# Load the Mo_Pu feature table; this rebinds X0, replacing the previous
# section's features.
X0=pd.read_csv('Mo_Pu_r.csv')
# Display the row count so it can be compared with len(Y0).
len(X0)

6227

In [20]:
def random_split(df, random_state, split_size = [0.8, 0.1, 0.1]):
    """Shuffle row positions and cut them into train / valid / test index arrays.

    This re-defines the helper of the same name with unchanged behavior.

    Args:
        df: Any sized object; only ``len(df)`` is used.
        random_state: Seed forwarded to ``shuffle`` so the split is repeatable.
        split_size: Fractions in the order [train, valid, test]. Only the valid
            and test fractions are read; train receives whatever remains.

    Returns:
        Tuple ``(train_idx, valid_idx, test_idx)`` of shuffled row positions.
        The three sizes are also printed.
    """
    n_rows = len(df)
    order = shuffle(np.arange(n_rows), random_state=random_state)
    n_test = int(n_rows * split_size[2])
    n_valid = int(n_rows * split_size[1])
    # Test rows come first, then validation rows; the tail is the training set.
    test_idx = order[:n_test]
    valid_idx = order[n_test:n_test + n_valid]
    train_idx = order[n_test + n_valid:]
    print(len(train_idx), len(valid_idx), len(test_idx))
    return train_idx, valid_idx, test_idx

In [21]:
# Single-seed split of the Mo_Pu features into train / valid / test arrays.
seed = 1
# Convert to a NumPy array only while X0 is still a DataFrame, so re-running
# this cell does not fail on a missing `.values` attribute.
if isinstance(X0, pd.DataFrame):
    X0 = X0.values
# Features and targets are paired purely by row position.
assert len(X0) == len(Y0), "feature matrix and target vector must have the same number of rows"
# random_split only yields positions in range(len(df)), so no bounds filtering is needed.
train_idx, valid_idx, test_idx = random_split(df, random_state=seed)
print(len(train_idx), len(valid_idx), len(test_idx))
X, y = X0[train_idx], Y0[train_idx]
X_valid, y_valid = X0[valid_idx], Y0[valid_idx]
X_test, y_test = X0[test_idx], Y0[test_idx]

4983 622 622
4983 622 622


In [22]:
# Build the regressor from the filtered hyper-parameters and time one fit on
# the Mo_Pu training split.
clf = xgboost.XGBRegressor(**use_param)
# perf_counter is a monotonic clock, so the interval is immune to
# system clock adjustments during the (multi-minute) fit.
time1 = time.perf_counter()
model = clf.fit(X, y)
time2 = time.perf_counter()
time_fit = time2 - time1
print(f"fit time is: {time_fit}")

fit time is: 472.180992603302


In [23]:
# Predict once per split and reuse the predictions for both metrics.
valid_pred = clf.predict(X_valid)
test_pred = clf.predict(X_test)
# NOTE: the "r2" values are squared Pearson correlations between targets and
# predictions, not the coefficient of determination.
valid_r2 = pearsonr(y_valid, valid_pred)[0]**2
valid_rmse = rmse(y_valid, valid_pred)
test_r2 = pearsonr(y_test, test_pred)[0]**2
test_rmse = rmse(y_test, test_pred)

In [24]:
# Collect the seed, the validation/test metrics and the fit time in one record.
results = dict(
    seed=seed,
    valid_rmse=valid_rmse,
    valid_r2=valid_r2,
    test_rmse=test_rmse,
    test_r2=test_r2,
    time=time_fit,
)
print('results = ',results)

results =  {'seed': 1, 'valid_rmse': 0.6711811063276028, 'valid_r2': 0.6532283220067221, 'test_rmse': 0.69679825687605, 'test_r2': 0.6623770979676231, 'time': 472.180992603302}


In [46]:
# Reload the Mo_Pu feature table as a DataFrame: X0 was replaced by its NumPy
# array earlier in this section, and the next cell performs the
# DataFrame-to-array conversion again.
X0=pd.read_csv('Mo_Pu_r.csv')
# Display the row count so it can be compared with len(Y0).
len(X0)

6227

In [47]:
# Repeat the split / fit / evaluate experiment on the Mo_Pu features for
# several split seeds.
seeds = [1,2,4,8,16,32,64,128,256,512]
# Convert to a NumPy array only while X0 is still a DataFrame, so the cell
# also works when X0 was already converted (e.g. on a re-run).
if isinstance(X0, pd.DataFrame):
    X0 = X0.values
# Features and targets are paired purely by row position.
assert len(X0) == len(Y0), "feature matrix and target vector must have the same number of rows"
mo_pu_results = []  # one metrics dict per seed, kept for later aggregation
for seed in seeds:
    # random_split only yields positions in range(len(df)), so no bounds filtering is needed.
    train_idx, valid_idx, test_idx = random_split(df, random_state=seed)
    X, y = X0[train_idx], Y0[train_idx]
    X_valid, y_valid = X0[valid_idx], Y0[valid_idx]
    X_test, y_test = X0[test_idx], Y0[test_idx]
    # A fresh estimator per seed so no fitted state carries over.
    clf = xgboost.XGBRegressor(**use_param)
    # perf_counter is a monotonic clock suited to measuring intervals.
    time1 = time.perf_counter()
    model = clf.fit(X, y)
    time2 = time.perf_counter()
    time_fit = time2 - time1
    print(f"fit time is: {time_fit}")
    # Predict once per split and reuse the predictions for both metrics.
    valid_pred = clf.predict(X_valid)
    test_pred = clf.predict(X_test)
    # NOTE: "r2" here is the squared Pearson correlation, not the coefficient of determination.
    valid_r2 = pearsonr(y_valid, valid_pred)[0]**2
    valid_rmse = rmse(y_valid, valid_pred)
    test_r2 = pearsonr(y_test, test_pred)[0]**2
    test_rmse = rmse(y_test, test_pred)
    results = {"seed":seed, 'valid_rmse':valid_rmse,
    'valid_r2':valid_r2,"test_rmse":test_rmse, "test_r2": test_r2,"time":time_fit}
    print('results = ',results)
    mo_pu_results.append(results)

4983 622 622
fit time is: 431.97073793411255
results =  {'seed': 1, 'valid_rmse': 0.6711811063276028, 'valid_r2': 0.6532283220067221, 'test_rmse': 0.69679825687605, 'test_r2': 0.6623770979676231, 'time': 431.97073793411255}
4983 622 622
fit time is: 440.61969661712646
results =  {'seed': 2, 'valid_rmse': 0.6729565395997779, 'valid_r2': 0.6551882860146439, 'test_rmse': 0.6564554628405899, 'test_r2': 0.6744588581924806, 'time': 440.61969661712646}
4983 622 622
fit time is: 465.20979046821594
results =  {'seed': 4, 'valid_rmse': 0.6959260099384523, 'valid_r2': 0.6522567046342864, 'test_rmse': 0.6851162303804512, 'test_r2': 0.6562262350360132, 'time': 465.20979046821594}
4983 622 622
fit time is: 460.72197580337524
results =  {'seed': 8, 'valid_rmse': 0.6704020677973551, 'valid_r2': 0.6396740097750736, 'test_rmse': 0.658991617840344, 'test_r2': 0.6929851676396932, 'time': 460.72197580337524}
4983 622 622
fit time is: 464.2941038608551
results =  {'seed': 16, 'valid_rmse': 0.675423203639491

# Mo_Pu_Ma

In [25]:
# Load the Mo_Pu_Ma feature table; this rebinds X0, replacing the previous
# section's features.
X0=pd.read_csv('Mo_Pu_ma_r.csv')
# Display the row count so it can be compared with len(Y0).
len(X0)

6227

In [26]:
def random_split(df, random_state, split_size = [0.8, 0.1, 0.1]):
    """Shuffle row positions and cut them into train / valid / test index arrays.

    This re-defines the helper of the same name with unchanged behavior.

    Args:
        df: Any sized object; only ``len(df)`` is used.
        random_state: Seed forwarded to ``shuffle`` so the split is repeatable.
        split_size: Fractions in the order [train, valid, test]. Only the valid
            and test fractions are read; train receives whatever remains.

    Returns:
        Tuple ``(train_idx, valid_idx, test_idx)`` of shuffled row positions.
        The three sizes are also printed.
    """
    n_rows = len(df)
    order = shuffle(np.arange(n_rows), random_state=random_state)
    n_test = int(n_rows * split_size[2])
    n_valid = int(n_rows * split_size[1])
    # Test rows come first, then validation rows; the tail is the training set.
    test_idx = order[:n_test]
    valid_idx = order[n_test:n_test + n_valid]
    train_idx = order[n_test + n_valid:]
    print(len(train_idx), len(valid_idx), len(test_idx))
    return train_idx, valid_idx, test_idx

In [27]:
# Single-seed split of the Mo_Pu_Ma features into train / valid / test arrays.
seed = 1
# Convert to a NumPy array only while X0 is still a DataFrame, so re-running
# this cell does not fail on a missing `.values` attribute.
if isinstance(X0, pd.DataFrame):
    X0 = X0.values
# Features and targets are paired purely by row position.
assert len(X0) == len(Y0), "feature matrix and target vector must have the same number of rows"
# random_split only yields positions in range(len(df)), so no bounds filtering is needed.
train_idx, valid_idx, test_idx = random_split(df, random_state=seed)
print(len(train_idx), len(valid_idx), len(test_idx))
X, y = X0[train_idx], Y0[train_idx]
X_valid, y_valid = X0[valid_idx], Y0[valid_idx]
X_test, y_test = X0[test_idx], Y0[test_idx]

4983 622 622
4983 622 622


In [28]:
# Build the regressor from the filtered hyper-parameters and time one fit on
# the Mo_Pu_Ma training split.
clf = xgboost.XGBRegressor(**use_param)
# perf_counter is a monotonic clock, so the interval is immune to
# system clock adjustments during the (multi-minute) fit.
time1 = time.perf_counter()
model = clf.fit(X, y)
time2 = time.perf_counter()
time_fit = time2 - time1
print(f"fit time is: {time_fit}")

fit time is: 491.01512122154236


In [29]:
# Predict once per split and reuse the predictions for both metrics.
valid_pred = clf.predict(X_valid)
test_pred = clf.predict(X_test)
# NOTE: the "r2" values are squared Pearson correlations between targets and
# predictions, not the coefficient of determination.
valid_r2 = pearsonr(y_valid, valid_pred)[0]**2
valid_rmse = rmse(y_valid, valid_pred)
test_r2 = pearsonr(y_test, test_pred)[0]**2
test_rmse = rmse(y_test, test_pred)

In [48]:
# Reload the Mo_Pu_Ma feature table as a DataFrame: X0 was replaced by its
# NumPy array earlier in this section, and the next cell performs the
# DataFrame-to-array conversion again.
X0=pd.read_csv('Mo_Pu_ma_r.csv')
# Display the row count so it can be compared with len(Y0).
len(X0)

6227

In [49]:
# Repeat the split / fit / evaluate experiment on the Mo_Pu_Ma features for
# several split seeds.
seeds = [1,2,4,8,16,32,64,128,256,512]
# Convert to a NumPy array only while X0 is still a DataFrame, so the cell
# also works when X0 was already converted (e.g. on a re-run).
if isinstance(X0, pd.DataFrame):
    X0 = X0.values
# Features and targets are paired purely by row position.
assert len(X0) == len(Y0), "feature matrix and target vector must have the same number of rows"
mo_pu_ma_results = []  # one metrics dict per seed, kept for later aggregation
for seed in seeds:
    # random_split only yields positions in range(len(df)), so no bounds filtering is needed.
    train_idx, valid_idx, test_idx = random_split(df, random_state=seed)
    X, y = X0[train_idx], Y0[train_idx]
    X_valid, y_valid = X0[valid_idx], Y0[valid_idx]
    X_test, y_test = X0[test_idx], Y0[test_idx]
    # A fresh estimator per seed so no fitted state carries over.
    clf = xgboost.XGBRegressor(**use_param)
    # perf_counter is a monotonic clock suited to measuring intervals.
    time1 = time.perf_counter()
    model = clf.fit(X, y)
    time2 = time.perf_counter()
    time_fit = time2 - time1
    print(f"fit time is: {time_fit}")
    # Predict once per split and reuse the predictions for both metrics.
    valid_pred = clf.predict(X_valid)
    test_pred = clf.predict(X_test)
    # NOTE: "r2" here is the squared Pearson correlation, not the coefficient of determination.
    valid_r2 = pearsonr(y_valid, valid_pred)[0]**2
    valid_rmse = rmse(y_valid, valid_pred)
    test_r2 = pearsonr(y_test, test_pred)[0]**2
    test_rmse = rmse(y_test, test_pred)
    results = {"seed":seed, 'valid_rmse':valid_rmse,
    'valid_r2':valid_r2,"test_rmse":test_rmse, "test_r2": test_r2,"time":time_fit}
    print('results = ',results)
    mo_pu_ma_results.append(results)

4983 622 622
fit time is: 497.4952654838562
results =  {'seed': 1, 'valid_rmse': 0.6667246818171246, 'valid_r2': 0.6573453183120247, 'test_rmse': 0.692634045285413, 'test_r2': 0.6664387248685751, 'time': 497.4952654838562}
4983 622 622
fit time is: 493.70739579200745
results =  {'seed': 2, 'valid_rmse': 0.6678567563963242, 'valid_r2': 0.6607150285266088, 'test_rmse': 0.6590632158659078, 'test_r2': 0.6726734582366762, 'time': 493.70739579200745}
4983 622 622
fit time is: 496.8907034397125
results =  {'seed': 4, 'valid_rmse': 0.6891249862268267, 'valid_r2': 0.6592194729684671, 'test_rmse': 0.68693854387341, 'test_r2': 0.6533429724588469, 'time': 496.8907034397125}
4983 622 622
fit time is: 501.3080139160156
results =  {'seed': 8, 'valid_rmse': 0.6708232507302224, 'valid_r2': 0.6396508358132494, 'test_rmse': 0.6608768849247725, 'test_r2': 0.6916200237761172, 'time': 501.3080139160156}
4983 622 622
fit time is: 495.37905645370483
results =  {'seed': 16, 'valid_rmse': 0.6837498577491549, 'v

# Pu_Ma

In [31]:
# Load the Pu_Ma feature table; this rebinds X0, replacing the previous
# section's features.
X0=pd.read_csv('Pu_ma_r.csv')
# Display the row count so it can be compared with len(Y0).
len(X0)

6227

In [32]:
def random_split(df, random_state, split_size = [0.8, 0.1, 0.1]):
    """Shuffle row positions and cut them into train / valid / test index arrays.

    This re-defines the helper of the same name with unchanged behavior.

    Args:
        df: Any sized object; only ``len(df)`` is used.
        random_state: Seed forwarded to ``shuffle`` so the split is repeatable.
        split_size: Fractions in the order [train, valid, test]. Only the valid
            and test fractions are read; train receives whatever remains.

    Returns:
        Tuple ``(train_idx, valid_idx, test_idx)`` of shuffled row positions.
        The three sizes are also printed.
    """
    n_rows = len(df)
    order = shuffle(np.arange(n_rows), random_state=random_state)
    n_test = int(n_rows * split_size[2])
    n_valid = int(n_rows * split_size[1])
    # Test rows come first, then validation rows; the tail is the training set.
    test_idx = order[:n_test]
    valid_idx = order[n_test:n_test + n_valid]
    train_idx = order[n_test + n_valid:]
    print(len(train_idx), len(valid_idx), len(test_idx))
    return train_idx, valid_idx, test_idx

In [33]:
# Single-seed split of the Pu_Ma features into train / valid / test arrays.
seed = 1
# Convert to a NumPy array only while X0 is still a DataFrame, so re-running
# this cell does not fail on a missing `.values` attribute.
if isinstance(X0, pd.DataFrame):
    X0 = X0.values
# Features and targets are paired purely by row position.
assert len(X0) == len(Y0), "feature matrix and target vector must have the same number of rows"
# random_split only yields positions in range(len(df)), so no bounds filtering is needed.
train_idx, valid_idx, test_idx = random_split(df, random_state=seed)
print(len(train_idx), len(valid_idx), len(test_idx))
X, y = X0[train_idx], Y0[train_idx]
X_valid, y_valid = X0[valid_idx], Y0[valid_idx]
X_test, y_test = X0[test_idx], Y0[test_idx]

4983 622 622
4983 622 622


In [34]:
# Build the regressor from the filtered hyper-parameters and time one fit on
# the Pu_Ma training split.
clf = xgboost.XGBRegressor(**use_param)
# perf_counter is a monotonic clock, so the interval is immune to
# system clock adjustments during the (multi-minute) fit.
time1 = time.perf_counter()
model = clf.fit(X, y)
time2 = time.perf_counter()
time_fit = time2 - time1
print(f"fit time is: {time_fit}")

fit time is: 150.3189480304718


In [35]:
# Predict once per split and reuse the predictions for both metrics.
valid_pred = clf.predict(X_valid)
test_pred = clf.predict(X_test)
# NOTE: the "r2" values are squared Pearson correlations between targets and
# predictions, not the coefficient of determination.
valid_r2 = pearsonr(y_valid, valid_pred)[0]**2
valid_rmse = rmse(y_valid, valid_pred)
test_r2 = pearsonr(y_test, test_pred)[0]**2
test_rmse = rmse(y_test, test_pred)

In [36]:
# Collect the seed, the validation/test metrics and the fit time in one record.
results = dict(
    seed=seed,
    valid_rmse=valid_rmse,
    valid_r2=valid_r2,
    test_rmse=test_rmse,
    test_r2=test_r2,
    time=time_fit,
)
print('results = ',results)

results =  {'seed': 1, 'valid_rmse': 0.6985504138985598, 'valid_r2': 0.626711782068792, 'test_rmse': 0.7485138593968138, 'test_r2': 0.6117082047411144, 'time': 150.3189480304718}


In [50]:
# Reload the Pu_Ma feature table as a DataFrame: X0 was replaced by its NumPy
# array earlier in this section, and the next cell performs the
# DataFrame-to-array conversion again.
X0=pd.read_csv('Pu_ma_r.csv')
# Display the row count so it can be compared with len(Y0).
len(X0)

6227

In [51]:
# Repeat the split / fit / evaluate experiment on the Pu_Ma features for
# several split seeds.
seeds = [1,2,4,8,16,32,64,128,256,512]
# Convert to a NumPy array only while X0 is still a DataFrame, so the cell
# also works when X0 was already converted (e.g. on a re-run).
if isinstance(X0, pd.DataFrame):
    X0 = X0.values
# Features and targets are paired purely by row position.
assert len(X0) == len(Y0), "feature matrix and target vector must have the same number of rows"
pu_ma_results = []  # one metrics dict per seed, kept for later aggregation
for seed in seeds:
    # random_split only yields positions in range(len(df)), so no bounds filtering is needed.
    train_idx, valid_idx, test_idx = random_split(df, random_state=seed)
    X, y = X0[train_idx], Y0[train_idx]
    X_valid, y_valid = X0[valid_idx], Y0[valid_idx]
    X_test, y_test = X0[test_idx], Y0[test_idx]
    # A fresh estimator per seed so no fitted state carries over.
    clf = xgboost.XGBRegressor(**use_param)
    # perf_counter is a monotonic clock suited to measuring intervals.
    time1 = time.perf_counter()
    model = clf.fit(X, y)
    time2 = time.perf_counter()
    time_fit = time2 - time1
    print(f"fit time is: {time_fit}")
    # Predict once per split and reuse the predictions for both metrics.
    valid_pred = clf.predict(X_valid)
    test_pred = clf.predict(X_test)
    # NOTE: "r2" here is the squared Pearson correlation, not the coefficient of determination.
    valid_r2 = pearsonr(y_valid, valid_pred)[0]**2
    valid_rmse = rmse(y_valid, valid_pred)
    test_r2 = pearsonr(y_test, test_pred)[0]**2
    test_rmse = rmse(y_test, test_pred)
    results = {"seed":seed, 'valid_rmse':valid_rmse,
    'valid_r2':valid_r2,"test_rmse":test_rmse, "test_r2": test_r2,"time":time_fit}
    print('results = ',results)
    pu_ma_results.append(results)

4983 622 622
fit time is: 151.6936957836151
results =  {'seed': 1, 'valid_rmse': 0.6985504138985598, 'valid_r2': 0.626711782068792, 'test_rmse': 0.7485138593968138, 'test_r2': 0.6117082047411144, 'time': 151.6936957836151}
4983 622 622
fit time is: 174.0218939781189
results =  {'seed': 2, 'valid_rmse': 0.7020107346011382, 'valid_r2': 0.6259067753111449, 'test_rmse': 0.7007953515109926, 'test_r2': 0.6326316300747151, 'time': 174.0218939781189}
4983 622 622
fit time is: 162.60093021392822
results =  {'seed': 4, 'valid_rmse': 0.7174314869231403, 'valid_r2': 0.6300475882171156, 'test_rmse': 0.7293910183876001, 'test_r2': 0.6099713981820866, 'time': 162.60093021392822}
4983 622 622
fit time is: 158.28595852851868
results =  {'seed': 8, 'valid_rmse': 0.7162957922034536, 'valid_r2': 0.591351035366535, 'test_rmse': 0.7093725494992837, 'test_r2': 0.6475090409073956, 'time': 158.28595852851868}
4983 622 622
fit time is: 158.29596209526062
results =  {'seed': 16, 'valid_rmse': 0.7253224630570546,