In [22]:
from IPython.display import SVG
import numpy as np
import pandas as pd
import os

from rdkit import Chem
from rdkit.Chem import AllChem, DataStructs
from rdkit.Chem.Draw import DrawMorganBit, DrawMorganBits,DrawMorganEnv, IPythonConsole

from sklearn.ensemble import RandomForestRegressor as rfr
from sklearn.neighbors import KNeighborsRegressor as knn
from sklearn.svm import SVR as svr
from sklearn.linear_model import LinearRegression as LR
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.decomposition import PCA
from sklearn.feature_selection import VarianceThreshold
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error as MSE
from sklearn.linear_model import Lasso as lasso
from sklearn.linear_model import BayesianRidge
import xgboost as xgb

import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D 
import matplotlib.cm as cm

In [5]:
# Global configuration: RNG seed and the data-set name every path below is built from.
seed = 1
data = "IGC50"

mols_train = []
mols_test = []
# Both path slots take the data-set name (./data/<data>/<data>_training.sdf).
# The previous "{}/{}" template was formatted with a single argument and raised IndexError.
suppl_train = Chem.SDMolSupplier("./data/{0}/{0}_training.sdf".format(data))
suppl_pre = Chem.SDMolSupplier("./data/{0}/{0}_prediction.sdf".format(data))
for mol in suppl_train:
    mols_train.append(mol)
for mol in suppl_pre:
    mols_test.append(mol)
print(len(mols_train),len(mols_test))
if len(suppl_train)+len(suppl_pre) == len(mols_train)+len(mols_test):
    print("mols ready")

# Per-molecule SMILES, identifier (the "CAS" property) and target value (the "Tox" property).
my_smiles_train=np.array([Chem.MolToSmiles(submol) for submol in mols_train])
my_smiles_test=np.array([Chem.MolToSmiles(submol) for submol in mols_test])
chembl_ids_train=np.array([m.GetProp("CAS") for m in mols_train])
chembl_ids_test=np.array([m.GetProp("CAS") for m in mols_test])
activities_train =np.array([float(m.GetProp("Tox")) for m in mols_train])
activities_test =np.array([float(m.GetProp("Tox")) for m in mols_test])

# Shuffled row positions of the training set; the tuning functions slice this
# array into cross-validation folds.
# NOTE(review): the seed + shuffle pair runs twice. It looks like a copy-paste
# leftover, but it is kept because the resulting permutation fixes the fold assignment.
base_indices = np.arange(0,len(activities_train))
np.random.seed(seed)
np.random.shuffle(base_indices)
np.random.seed(seed)
np.random.shuffle(base_indices)

# Output directories. The result paths are now formatted with the data-set name;
# previously the literal "{}" was created and the real result directories were missing.
os.makedirs("./models/",exist_ok=True)
os.makedirs("./models/ML",exist_ok=True)
os.makedirs("./results/{}/ML/train/csv".format(data),exist_ok=True)
os.makedirs("./results/{}/ML/train/figure".format(data),exist_ok=True)
os.makedirs("./results/{}/ML/test/csv".format(data),exist_ok=True)
os.makedirs("./results/{}/ML/test/figure".format(data),exist_ok=True)

In [24]:
def mols_to_FP(mols, radius=3, nBits=1024, useFeatures=False):
    """Build a Morgan-fingerprint bit table for a sequence of molecules.

    Every molecule is passed to ``AllChem.GetMorganFingerprintAsBitVect`` and
    the resulting bit string is written into one row of an integer matrix.

    Args:
        mols: indexable sequence of molecules (``len(mols)`` and ``mols[i]`` are used).
        radius: radius forwarded to the RDKit fingerprint call.
        nBits: fingerprint length, i.e. the number of columns of the result.
        useFeatures: forwarded to RDKit's ``useFeatures`` keyword.

    Returns:
        ``pandas.DataFrame`` with ``len(mols)`` rows and ``nBits`` integer columns.
    """
    n_mols = len(mols)
    bit_table = np.zeros((n_mols, nBits), dtype='int')
    for row in range(n_mols):
        bit_vect = AllChem.GetMorganFingerprintAsBitVect(
            mols[row], radius, nBits, useFeatures=useFeatures
        )
        # The bit string is a run of '0'/'1' characters; numpy casts them on assignment.
        bit_chars = list(bit_vect.ToBitString())
        bit_table[row, :] = np.array(bit_chars)

    return pd.DataFrame(bit_table)

# Morgan fingerprint tables: useFeatures=False for ECFP_*, useFeatures=True for FCFP_*.
ECFP_train, ECFP_test = (mols_to_FP(subset, useFeatures=False) for subset in (mols_train, mols_test))
FCFP_train, FCFP_test = (mols_to_FP(subset, useFeatures=True) for subset in (mols_train, mols_test))

def mols_to_MACCS(mols):
    """Build a 167-column MACCS-key table for a sequence of molecules.

    Args:
        mols: sequence of molecules accepted by ``AllChem.GetMACCSKeysFingerprint``.

    Returns:
        ``pandas.DataFrame`` with one row per molecule and 167 integer columns,
        column ``j`` holding bit ``j`` of that molecule's fingerprint.
    """
    n_keys = 167
    key_table = np.zeros((len(mols), n_keys), int)
    fingerprints = [AllChem.GetMACCSKeysFingerprint(mol) for mol in mols]
    for row in range(len(mols)):
        fp = fingerprints[row]
        # Copy the fingerprint bit by bit into the matrix row.
        key_table[row] = [fp[col] for col in range(n_keys)]
    return pd.DataFrame(key_table)

# MACCS key tables for the training molecules and the external prediction molecules.
MACCS_train, MACCS_test = (mols_to_MACCS(subset) for subset in (mols_train, mols_test))

# des
# Pre-computed descriptor tables. The first five CSV columns are dropped and the
# remaining columns are used as features (presumably the dropped ones are
# identifiers/targets -- confirm against the CSV header).
# Both path slots take the data-set name; the previous "{}/{}" template was
# formatted with a single argument and raised IndexError.
des_train_raw = pd.read_csv("./data/{0}/{0}_training.csv".format(data)).iloc[:,5:]
des_test_raw = pd.read_csv("./data/{0}/{0}_prediction.csv".format(data)).iloc[:,5:]
label = des_train_raw.columns
# Align the test columns to the training columns by name before scaling.
des_test_raw = des_test_raw.reindex(columns=label)
# Fit the scaler on the training rows only and reuse it for the test rows.
# Fitting on train+test (as before) leaks test-set statistics into the features.
scaler = StandardScaler()
des_train = pd.DataFrame(scaler.fit_transform(des_train_raw))
des_test = pd.DataFrame(scaler.transform(des_test_raw))
# `des` keeps its previous meaning: scaled training rows followed by scaled test rows.
des = np.concatenate([des_train.values, des_test.values], axis=0)

In [26]:
def show(name, model, y_test, pred, r2 ,mse):
    """Plot true vs. predicted values, save the figure, and dump the numbers to CSV.

    The module-level ``data`` name is used in the title and in both output
    paths (under ``./results/<data>/ML/train/``).

    Args:
        name: feature-set tag used in the title and file names.
        model: model tag used in the title and file names.
        y_test: true target values.
        pred: predicted values; the axis limit is derived from ``max(pred)``.
        r2: r^2 value printed in the figure and stored in the CSV.
        mse: MSE value printed in the figure and stored in the CSV.

    Returns:
        The ``pandas.DataFrame`` written to CSV (columns ``prediction``,
        ``true``, ``r^2``, ``MSE``; the two score columns are filled in the
        first row only).
    """
    figure_path = f"./results/{data}/ML/train/figure/{data}_{name}_{model}_contrast.png"
    csv_path = f"./results/{data}/ML/train/csv/{data}_{name}_{model}_pred_and_score.csv"

    plt.figure(figsize=[20,20])
    plt.grid()
    plt.title(
        'Model: {} {} {}'.format(data, name, model),
        fontsize=40,
        verticalalignment='baseline',
        horizontalalignment='center',
    )
    # Upper axis bound: integer part of the largest prediction plus a margin of 2.
    axis_max = int(max(pred))+2
    axes = plt.gca()
    plt.text(
        0.72, 0.05, "$r^2$: {}\nMSE: {}".format(r2,mse),
        fontsize=20,
        bbox=dict(facecolor='g', alpha=0.2),
        transform=axes.transAxes,
    )
    plt.scatter(y_test, pred, c='orange', s=100, alpha=0.6)
    # Dashed identity line: perfect predictions fall on it.
    diagonal = range(0,axis_max+1)
    plt.plot(diagonal, diagonal, c='g', linewidth=5, linestyle="--")
    plt.xlabel('True', fontsize=30)
    plt.ylabel('Prediction', fontsize=30)
    plt.xticks(fontsize=20)
    tick_positions = range(0,axis_max)
    # The y label at the origin is left blank.
    y_tick_labels = [""]+[str(tick) for tick in range(1,axis_max)]
    plt.yticks(ticks=tick_positions, labels=y_tick_labels, fontsize=20)
    plt.xticks(ticks=tick_positions, labels=tick_positions, fontsize=20)
    plt.xlim(0,axis_max)
    plt.ylim(0,axis_max)
    plt.savefig(figure_path)
    plt.show()

    parts = [
        pd.DataFrame(pred, columns=["prediction"]),
        pd.DataFrame(y_test, columns=['true']),
        pd.DataFrame([r2], columns=["r^2"]),
        pd.DataFrame([mse], columns=['MSE']),
    ]
    datalist = pd.concat(parts, axis=1)
    datalist.to_csv(csv_path, index=None)
    return datalist

class cross_val():
    """Manual k-fold cross-validation over a caller-supplied ordering of row positions.

    ``index`` is cut into ``cv`` consecutive slices of ``int(len(y)/cv)``
    positions; the last slice also takes whatever remains. For every slice the
    model is fitted on the other positions and evaluated on the slice.

    Results are exposed as attributes:
        r2, mse: per-fold r^2 / MSE lists.
        pred_all: out-of-fold prediction for every position that was validated.
        r2_mean, mse_mean: averages of the per-fold scores.
    """
    def __init__(self, model, X, y, index, cv):
        super(cross_val, self).__init__()
        fold_size = int(len(y)/cv)
        self.r2 = []
        self.mse = []
        self.pred_all = np.zeros((len(y)), dtype=float)
        for fold in range(cv):
            start = fold*fold_size
            if fold == cv-1:
                # Last fold: validate on everything from `start` to the end.
                held_out = index[start:]
                fitted_on = index[0:start]
            else:
                stop = (fold+1)*fold_size
                held_out = index[start:stop]
                fitted_on = np.concatenate([index[:start], index[stop:]], axis=0)

            X_fit, y_fit = X.iloc[fitted_on], y[fitted_on]
            X_held, y_held = X.iloc[held_out], y[held_out]

            # The same estimator instance is refitted for every fold.
            fold_pred = model.fit(X_fit, y_fit).predict(X_held)
            self.pred_all[held_out] = fold_pred
            self.r2.append(r2_score(y_held, fold_pred))
            self.mse.append(MSE(y_held, fold_pred))

        self.r2_mean = np.array(self.r2).mean()
        self.mse_mean = np.array(self.mse).mean()


In [27]:
# rfr
def rfr_auto(name, X, y, index, cv):
    """Tune a random-forest regressor one hyper-parameter at a time with manual CV.

    The parameters are searched sequentially (``random_state``,
    ``n_estimators``, ``max_depth``, ``max_features``,
    ``min_impurity_decrease``, ``max_samples``). Each stage keeps the values
    chosen by the previous stages, scores every grid value with ``cross_val``
    and keeps the first value with the highest mean r^2. Every stage plots its
    r^2 curve. The chosen values are written to
    ``./models/ML/<data>_<name>_RF.csv`` and the out-of-fold predictions of the
    final configuration are passed to ``show``.

    Args:
        name: feature-set tag used in log lines and file names.
        X: feature table (indexed with ``.iloc``).
        y: target array.
        index: ordering of row positions that ``cross_val`` slices into folds.
        cv: number of folds.

    Returns:
        A new, unfitted random-forest regressor configured with the selected values.
    """
    print("*"*50,'\n',"*"*50)
    print(name, "Model:RF")

    # One CV run supplies both baseline numbers. Running it twice on an unseeded
    # forest (as before) reported r^2 and MSE of two different fits.
    baseline = cross_val(rfr(n_jobs=-1), X, y, index, cv)
    before_r2 = baseline.r2_mean
    before_mse = baseline.mse_mean

    print("before r^2", before_r2)
    print("before mse", before_mse)

    # Keyword arguments selected so far; each sweep adds one entry.
    tuned = {"n_jobs": -1}

    def _sweep(key, label, grid):
        """Score every value of `grid` for parameter `key`, plot the curve, keep the best."""
        scores = [
            cross_val(rfr(**dict(tuned, **{key: value})), X, y, index, cv).r2_mean
            for value in grid
        ]
        plt.figure()
        plt.plot(grid, scores)
        plt.show()
        best = grid[scores.index(max(scores))]
        print(label, max(scores), best)
        tuned[key] = best
        return best

    # Random seed
    random_state = _sweep("random_state", "random_state:", range(0,200))
    # Number of trees
    n_estimators = _sweep("n_estimators", "n_estimators:", range(1,200))
    # Maximum depth
    max_depth = _sweep("max_depth", "max_depth", range(1,200))
    # Maximum number of features per split, from sqrt(n_features) up to n_features - 1
    max_features = _sweep("max_features", "max_features",
                          range(int(X.shape[1]**0.5),X.shape[1]))
    # Minimum impurity decrease
    min_impurity_decrease = _sweep("min_impurity_decrease", "min_impurity_decrease:",
                                   np.linspace(0,0.5,20))
    # Maximum bootstrap sample count; the upper bound is the size of the
    # smallest training fold, (cv-1) * fold size.
    len_train = int(len(y)/cv)*(cv-1)+1
    max_samples = _sweep("max_samples", "max samples", range(1,len_train))

    ####################################
    score = cross_val(rfr(**tuned), X, y, index, cv)
    after_r2 = score.r2_mean
    after_mse = score.mse_mean

    print("r^2:", before_r2, "->", after_r2)
    print("MSE:", before_mse, "->", after_mse)

    print("n_estimators=",n_estimators,","
          , "random_state=",random_state,","
          , "max_depth=",max_depth,","
          , "max_features=",max_features,","
          , "min_impurity_decrease=",min_impurity_decrease,","
          , "max_samples=",max_samples)

    pd.DataFrame([n_estimators, random_state, max_depth, max_features, min_impurity_decrease, max_samples]).to_csv("./models/ML/{}_{}_RF.csv".format(data, name),index=0)

    show(name, "RF", y, score.pred_all, after_r2, after_mse)

    # Hand back a fresh estimator so the caller decides what to fit it on.
    return rfr(**tuned)

In [28]:
# svr
def svr_auto(name, X, y, index, cv):
    """Tune an SVR (kernel, then C, then gamma) with manual cross-validation.

    Every stage scores its grid with ``cross_val`` and keeps the first value
    with the highest mean r^2. The best numeric gamma is finally compared with
    ``gamma="scale"`` and the better of the two is kept. The chosen values are
    written to ``./models/ML/<data>_<name>_SVR.csv`` and the out-of-fold
    predictions of the final configuration are passed to ``show``.

    Args:
        name: feature-set tag used in log lines and file names.
        X: feature table (indexed with ``.iloc``).
        y: target array.
        index: ordering of row positions that ``cross_val`` slices into folds.
        cv: number of folds.

    Returns:
        A new, unfitted SVR configured with the selected kernel, gamma and C.
    """
    print("*"*50,'\n',"*"*50)
    print(name, "Model:SVR")

    # A single CV run supplies both baseline numbers.
    baseline = cross_val(svr(), X, y, index, cv)
    before_r2 = baseline.r2_mean
    before_mse = baseline.mse_mean

    print("before r^2", before_r2)
    print("before mse", before_mse)

    def _r2(**params):
        """Mean cross-validated r^2 of an SVR built with `params`."""
        return cross_val(svr(**params), X, y, index, cv).r2_mean

    # kernel
    kernels = ["rbf", "poly", "sigmoid"]
    scores = [_r2(kernel=candidate) for candidate in kernels]

    plt.figure()
    plt.bar(kernels,scores)
    plt.xticks(ticks=kernels)
    plt.show()

    kernel = kernels[scores.index(max(scores))]
    print(max(scores),kernel)

    # C -- searched with the kernel selected above. The sweep used to hard-code
    # "rbf", which is inconsistent with the gamma stage and the final model.
    c_grid = np.arange(1,50,0.05)
    scores = [_r2(kernel=kernel, C=candidate) for candidate in c_grid]

    plt.figure()
    plt.plot(c_grid,scores)
    plt.show()

    C = c_grid[scores.index(max(scores))]
    print(max(scores),C)

    # gamma
    gamma_grid = np.linspace(0.0001,100/X.shape[1],200)
    scores = [_r2(kernel=kernel, gamma=candidate, C=C) for candidate in gamma_grid]

    plt.figure()
    plt.plot(gamma_grid,scores)
    plt.show()

    best_gamma_score = max(scores)
    gamma = gamma_grid[scores.index(best_gamma_score)]
    # Compare the BEST grid score with gamma="scale". The old code compared the
    # score of the last grid point, so the decision ignored the tuned value.
    scale_score = _r2(kernel=kernel, gamma="scale", C=C)
    if not best_gamma_score > scale_score:
        gamma = 'scale'
        best_gamma_score = scale_score
    print(best_gamma_score,gamma)

    ####################################
    score = cross_val(svr(kernel=kernel,gamma=gamma, C=C), X, y, index, cv)
    after_r2 = score.r2_mean
    after_mse = score.mse_mean

    print("r^2:", before_r2, "->", after_r2)
    print("MSE:", before_mse, "->", after_mse)

    print("kernel=", kernel, ","
          , "C=", C, ","
          , "gamma=", gamma, ",")

    pd.DataFrame([kernel, C, gamma]).to_csv("./models/ML/{}_{}_SVR.csv".format(data, name),index=0)
    show(name, "SVR", y, score.pred_all, after_r2, after_mse)

    # Hand back a fresh estimator so the caller decides what to fit it on.
    return svr(kernel=kernel,gamma=gamma, C=C)


In [29]:
# knn
def knn_auto(name, X, y, index, cv):
    """Tune a k-nearest-neighbours regressor one parameter at a time with manual CV.

    Stages, in order: ``algorithm``, ``weights``, ``n_neighbors``, ``p``,
    ``leaf_size``. Each stage keeps the earlier choices, scores its grid with
    ``cross_val`` and keeps the first value with the highest mean r^2. The
    chosen values are written to ``./models/ML/<data>_<name>_KNN.csv`` and the
    out-of-fold predictions of the final configuration are passed to ``show``.

    Args:
        name: feature-set tag used in log lines and file names.
        X: feature table (indexed with ``.iloc``).
        y: target array.
        index: ordering of row positions that ``cross_val`` slices into folds.
        cv: number of folds.

    Returns:
        A new, unfitted KNN regressor configured with the selected values.
    """
    print("*"*50,'\n',"*"*50)
    print(name, "Model:KNN")

    # A single CV run supplies both baseline numbers.
    baseline = cross_val(knn(n_jobs=-1), X, y, index, cv)
    before_r2 = baseline.r2_mean
    before_mse = baseline.mse_mean

    print("before r^2", before_r2)
    print("before mse", before_mse)

    # Keyword arguments selected so far; each sweep adds one entry.
    tuned = {"n_jobs": -1}

    def _sweep(key, grid, draw):
        """Score every value of `grid` for parameter `key`, draw the scores, keep the best.

        Building the estimator here, from `key`, guarantees that the candidate
        value is the one being scored. The leaf_size loop used to assign to a
        misspelled variable and re-scored the previous model for every value.
        """
        scores = [
            cross_val(knn(**dict(tuned, **{key: value})), X, y, index, cv).r2_mean
            for value in grid
        ]
        plt.figure()
        draw(grid, scores)
        plt.show()
        best = grid[scores.index(max(scores))]
        print(max(scores), best)
        tuned[key] = best
        return best

    # algorithm
    algorithm = _sweep("algorithm", ['ball_tree', 'kd_tree', 'brute'], plt.bar)
    # weights
    weights = _sweep("weights", ['uniform', 'distance'], plt.bar)
    # n_neighbors
    n_neighbors = _sweep("n_neighbors", range(1,50), plt.plot)
    # p
    p = _sweep("p", [1, 2], plt.bar)
    # leaf_size
    leaf_size = _sweep("leaf_size", range(1,100), plt.plot)

    ####################################
    # Score the final configuration once and reuse that run for the plot, so the
    # plotted predictions belong to the model whose scores are reported.
    score = cross_val(knn(**tuned), X, y, index, cv)
    after_r2 = score.r2_mean
    after_mse = score.mse_mean

    print("r^2:", before_r2, "->", after_r2)
    print("MSE:", before_mse, "->", after_mse)

    print( "algorithm=", algorithm, ","
          , "weights=", weights, ","
          , "n_neighbors=", n_neighbors, ","
          , "p=", p, ","
          , "leaf_size=", leaf_size, ",")

    # Same directory as the other tuners; "./models/<data>/ML" is never created.
    pd.DataFrame([algorithm, weights, n_neighbors, p, leaf_size]).to_csv("./models/ML/{}_{}_KNN.csv".format(data, name),index=0)
    show(name, "KNN", y, score.pred_all, after_r2, after_mse)

    # Hand back a fresh estimator so the caller decides what to fit it on.
    return knn(**tuned)

In [None]:
# BayesianRidge
from sklearn.linear_model import BayesianRidge
def br_auto(name, X, y, index, cv):
    """Tune a Bayesian ridge regressor (iteration cap, alpha_1, alpha_2) with manual CV.

    Each stage keeps the earlier choices, scores its grid with ``cross_val``
    and keeps the first value with the highest mean r^2. ``lambda_1`` and
    ``lambda_2`` are fixed at 1e-06. The out-of-fold predictions of the final
    configuration are passed to ``show`` and the chosen values are written to
    ``./models/ML/<data>_<name>_BR.csv``.

    Args:
        name: feature-set tag used in log lines and file names.
        X: feature table (indexed with ``.iloc``).
        y: target array.
        index: ordering of row positions that ``cross_val`` slices into folds.
        cv: number of folds.

    Returns:
        A new, unfitted ``BayesianRidge`` configured with the selected values.
    """
    print("*"*50,'\n',"*"*50)
    print(name, "Model:BR")

    # A single CV run supplies both baseline numbers.
    baseline = cross_val(BayesianRidge(compute_score=True), X, y, index, cv)
    before_r2 = baseline.r2_mean
    before_mse = baseline.mse_mean

    print("before r^2", before_r2)
    print("before mse", before_mse)

    # scikit-learn renamed `n_iter` to `max_iter`; use whichever keyword the
    # installed estimator exposes so the function works on old and new releases.
    iter_kw = "max_iter" if "max_iter" in BayesianRidge().get_params() else "n_iter"

    # Keyword arguments selected so far; each sweep adds one entry.
    tuned = {"compute_score": True}

    def _sweep(key, label, grid, x_axis):
        """Score every value of `grid` for parameter `key`, plot against `x_axis`, keep the best."""
        scores = [
            cross_val(BayesianRidge(**dict(tuned, **{key: value})), X, y, index, cv).r2_mean
            for value in grid
        ]
        plt.figure()
        plt.plot(x_axis, scores)
        plt.show()
        best = grid[scores.index(max(scores))]
        print(label, max(scores), best)
        tuned[key] = best
        return best

    # n_iter -- the grid starts at 1 because the estimator rejects an iteration
    # count below 1, so the former range(0, 301) failed on its first value.
    iter_grid = range(1,301)
    n_iter = _sweep(iter_kw, "n_iter:", iter_grid, iter_grid)

    alpha_grid = [1e-04, 1e-05, 1e-06, 1e-07, 1e-08]
    # alpha_1 (plotted against the grid position, not the value)
    alpha_1 = _sweep("alpha_1", "alpha_1:", alpha_grid, range(0,5))
    # alpha_2
    alpha_2 = _sweep("alpha_2", "alpha_2:", alpha_grid, range(0,5))

    # The lambda hyper-parameters are not searched.
    lambda_1 = 1e-06
    lambda_2 = 1e-06
    tuned["lambda_1"] = lambda_1
    tuned["lambda_2"] = lambda_2
    ####################################
    score = cross_val(BayesianRidge(**tuned), X, y, index, cv)
    after_r2 = score.r2_mean
    after_mse = score.mse_mean

    print("r^2:", before_r2, "->", after_r2)
    print("MSE:", before_mse, "->", after_mse)

    show(name, "BR", y, score.pred_all, after_r2, after_mse)
    # NOTE(review): lambda_1 is written twice. This looks like a typo, but the row
    # layout is kept because code reading this CSV is not visible here.
    pd.DataFrame([n_iter, alpha_1, alpha_2, lambda_1, lambda_1, lambda_2]).to_csv("./models/ML/{}_{}_BR.csv".format(data, name),index=0)

    # Hand back a fresh estimator so the caller decides what to fit it on.
    return BayesianRidge(**tuned)

In [30]:
# xgb

def xgb_auto(name, X, y, index, cv):
    """Tune an XGBoost regressor with manual cross-validation and train the final booster.

    Search order: a joint grid over ``num_boost_round`` (1..100) and ``eta``
    (0.01..0.49), then ``max_depth``, ``gamma``, ``alpha`` and ``lambda`` one
    at a time, each stage keeping the earlier choices and selecting the first
    value with the highest mean r^2. The chosen values are written to
    ``./models/ML/<data>_<name>_XGB.csv`` and the out-of-fold predictions of the
    final configuration are passed to ``show``.

    Args:
        name: feature-set tag used in log lines and file names.
        X: feature table (indexed with ``.iloc``).
        y: target array.
        index: ordering of row positions that is sliced into folds.
        cv: number of folds.

    Returns:
        A booster trained on all of ``X``/``y`` with the selected parameters.
    """
    print("*"*50,'\n',"*"*50)
    print(name, "Model:XGB")

    # Build the folds (and their DMatrix objects) once; they do not depend on
    # the hyper-parameters, so rebuilding them per grid point was wasted work.
    step = int(len(y)/cv)
    folds = []
    for c in range(cv):
        if c < cv-1:
            index_train = np.concatenate([index[:c*step],index[(c+1)*step:]], axis=0)
            index_val = index[c*step:(c+1)*step]
        else:
            # The last fold also takes the remainder.
            index_train = index[0:c*step]
            index_val = index[c*step:]
        y_val = y[index_val]
        folds.append((xgb.DMatrix(X.iloc[index_train], y[index_train]),
                      xgb.DMatrix(X.iloc[index_val], y_val),
                      y_val,
                      index_val))

    def _cv_eval(param, **train_kwargs):
        """Run the k-fold CV once; return (mean r^2, mean MSE, out-of-fold predictions)."""
        r2_sum = 0
        mse_sum = 0
        pred_all = np.zeros((len(y)), dtype=float)
        for dtrain, dval, y_val, index_val in folds:
            # Train once per fold and reuse the prediction for both metrics.
            fold_pred = xgb.train(param, dtrain, **train_kwargs).predict(dval)
            r2_sum += r2_score(y_val, fold_pred)
            mse_sum += MSE(y_val, fold_pred)
            pred_all[index_val] = fold_pred
        # Average over the actual number of folds (was a hard-coded 5).
        return r2_sum/cv, mse_sum/cv, pred_all

    # Baseline: default parameters and the library's default number of rounds.
    before_r2, before_mse, _ = _cv_eval({'objective':'reg:squarederror'})
    print("before r^2", before_r2)
    print("before mse", before_mse)

    # num_boost and eta (joint grid)
    round_grid = range(1,101)
    eta_grid = np.arange(0.01,0.5,0.01)
    scores = np.zeros((len(round_grid),len(eta_grid)),dtype =np.float64)
    for i,rounds in enumerate(round_grid):
        for j,eta_value in enumerate(eta_grid):
            scores[i][j] = _cv_eval({'objective':'reg:squarederror'
                                    ,'eta':eta_value}
                                    ,num_boost_round=rounds)[0]

    # First maximum in row-major order. Unlike the former "start from 0" scan,
    # this also picks the true best when every score is negative.
    imax, jmax = (int(pos) for pos in np.unravel_index(np.argmax(scores), scores.shape))
    score = scores[imax][jmax]

    fig=plt.figure(figsize=[10,10])
    # add_subplot attaches the 3D axes to the figure; constructing Axes3D(fig)
    # directly no longer does so on current matplotlib.
    ax3d=fig.add_subplot(111, projection='3d')
    X_ax,y_ax=np.meshgrid(round_grid,eta_grid)
    ax3d.plot_surface(X_ax, y_ax, np.array(scores.T), linewidth=0, antialiased=False, shade = True, alpha = 0.5, cmap='rainbow')
    plt.show()

    num_boost_round = round_grid[imax]
    eta = eta_grid[jmax]
    print(score,num_boost_round, eta)

    # Parameters selected so far; each sweep adds one entry.
    param = {'objective':'reg:squarederror'
            ,'eta':eta}

    def _sweep(key, grid):
        """Score every value of `grid` for parameter `key`, plot the curve, keep the best.

        The selected value is read from the same grid that was scored. The
        gamma stage used to score a step-0.5 grid but look up a step-0.05 grid.
        """
        sweep_scores = [
            _cv_eval(dict(param, **{key: value}), num_boost_round=num_boost_round)[0]
            for value in grid
        ]
        plt.figure()
        plt.plot(grid, sweep_scores)
        plt.show()
        best = grid[sweep_scores.index(max(sweep_scores))]
        print(max(sweep_scores), best)
        param[key] = best
        return best

    # max_depth
    max_depth = _sweep('max_depth', range(1,50))
    # gamma
    gamma = _sweep('gamma', np.arange(0,5,0.5))
    # alpha
    alpha = _sweep('alpha', np.arange(0,5,0.05))
    # lambda_
    lambda_ = _sweep('lambda', np.arange(0,5,0.05))

    # Final cross-validation with every selected parameter.
    after_r2, after_mse, pred_all = _cv_eval(param, num_boost_round=num_boost_round)

    print("r^2:", before_r2, "->", after_r2)
    print("MSE", before_mse, "->", after_mse)

    print("objective","reg:squarederror"
          , "num_boost_round=", num_boost_round, ","
          ,"eta=", eta, ","
          , "max_depth=", max_depth, ","
          , "gamma=", gamma, ","
          , "alpha=", alpha, ","
          , "lambda=", lambda_, ",")
    pd.DataFrame(["reg:squarederror", num_boost_round, eta, max_depth, gamma, alpha, lambda_]).to_csv("./models/ML/{}_{}_XGB.csv".format(data, name),index=0)
    ##############################################
    # Same figure and CSV as the other tuners, via the shared helper.
    show(name, "XGB", y, pred_all, after_r2, after_mse)

    return xgb.train(param,xgb.DMatrix(X, y),num_boost_round=num_boost_round)

In [31]:
def sklearn_score(name, model_name, model, X_trian, y_train, X_test, y_test):
    """Fit ``model`` on the training data, score it on the test data, plot and save.

    The module-level ``data`` name is used in the title and in both output
    paths (under ``./results/<data>/ML/test/``). Nothing is returned.

    Args:
        name: feature-set tag used in the title and file names.
        model_name: model tag used in the title and file names.
        model: estimator exposing ``fit`` (returning the fitted estimator) and ``predict``.
        X_trian: training features (parameter name kept as spelled for callers).
        y_train: training targets.
        X_test: test features.
        y_test: test targets.
    """
    fitted = model.fit(X_trian,y_train)
    pred = fitted.predict(X_test)
    r2 = r2_score(y_test, pred)
    mse = MSE(y_test, pred)

    plt.figure(figsize=[20,20])
    plt.grid()
    plt.title(
        'Model: {} {} {}'.format(data, name, model_name),
        fontsize=40,
        verticalalignment='baseline',
        horizontalalignment='center',
    )
    # Upper axis bound: integer part of the largest prediction plus a margin of 2.
    axis_max = int(max(pred))+2
    axes = plt.gca()
    plt.text(
        0.72, 0.05, "$r^2$: {}\nMSE: {}".format(r2,mse),
        fontsize=20,
        bbox=dict(facecolor='g', alpha=0.2),
        transform=axes.transAxes,
    )
    plt.scatter(y_test, pred, c='orange', s=100, alpha=0.6)
    # Dashed identity line: perfect predictions fall on it.
    diagonal = range(0,axis_max+1)
    plt.plot(diagonal, diagonal, c='g', linewidth=5, linestyle="--")
    plt.xlabel('True', fontsize=30)
    plt.ylabel('Prediction', fontsize=30)
    plt.xticks(fontsize=20)
    tick_positions = range(0,axis_max)
    # The y label at the origin is left blank.
    y_tick_labels = [""]+[str(tick) for tick in range(1,axis_max)]
    plt.yticks(ticks=tick_positions, labels=y_tick_labels, fontsize=20)
    plt.xticks(ticks=tick_positions, labels=tick_positions, fontsize=20)
    plt.xlim(0,axis_max)
    plt.ylim(0,axis_max)
    plt.savefig("./results/{}/ML/test/figure/{}_{}_{}_contrast.png".format(data, data, name, model_name))
    plt.show()

    parts = [
        pd.DataFrame(pred, columns=["prediction"]),
        pd.DataFrame(y_test, columns=['true']),
        pd.DataFrame([r2], columns=["r^2"]),
        pd.DataFrame([mse], columns=['MSE']),
    ]
    datalist = pd.concat(parts, axis=1)
    datalist.to_csv("./results/{}/ML/test/csv/{}_{}_{}_pred_and_score.csv".format(data, data, name, model_name), index=None)
    
def xgboost_score(name, model_name, model, X_test, y_test):

    dtest = xgb.DMatrix(X_test, y_test)    
    pred = model.predict(dtest)
    r2 = r2_score(y_test, pred)
    mse = MSE(y_test, pred)
    
    plt.figure(figsize=[20,20])
    plt.grid()
    plt.title('Model: {} {} {}'.format(data, name, model_name),fontsize=40
              ,verticalalignment= 'baseline'
              ,horizontalalignment='center')
    xmax = int(max(pred))+2
    ax = plt.gca()
    plt.text(0.72,0.05, "$r^2$: {}\nMSE: {}".format(r2,mse)
             ,fontsize=20
             ,bbox=dict(facecolor='g', alpha=0.2)
             ,transform = ax.transAxes)
    plt.scatter(y_test, pred,c='orange', s=100 ,alpha=0.6)
    plt.plot(range(0,xmax+1),range(0,xmax+1),c='g',linewidth=5, linestyle="--")
    plt.xlabel('True',fontsize=30)
    plt.ylabel('Prediction',fontsize=30)
    plt.xticks(fontsize=20)
    labels = [""]+[str(i) for i in range(1,xmax)]
    plt.yticks(ticks=range(0,xmax), labels=labels, fontsize=20)
    plt.xticks(ticks=range(0,xmax), labels=range(0,xmax), fontsize=20)
    plt.xlim(0,xmax)
    plt.ylim(0,xmax)
    plt.savefig("./results/{}/ML/test/figure/{}_{}_{}_contrast.png".format(data, data, name, model_name))
    plt.show()
    
    datalist = pd.concat([pd.DataFrame(pred,columns=["prediction"]),pd.DataFrame(y_test,columns=['true']),pd.DataFrame([r2],columns=["r^2"]),pd.DataFrame([mse],columns=['MSE'])],axis =1)
    datalist.to_csv("./results/{}/ML/test/csv/{}_{}_{}_pred_and_score.csv".format(data, data, name, model_name), index=None)

def importance(label, model):
    """Rank features by importance, save the ranking to CSV and return it.

    Parameters
    ----------
    label : sequence
        Feature names, indexable by integer position; ``label[i]`` names the
        feature whose score is ``model.feature_importances_[i]``.
    model : object
        Fitted model exposing ``feature_importances_``.

    Returns
    -------
    list of (label, importance) tuples
        One entry per value in ``model.feature_importances_``, ordered from
        most to least important. Features with equal importance keep their
        original relative order.

    Notes
    -----
    The ranking is written to ``./results/{data}/ML/{data}_importance.csv``
    using the notebook-level ``data`` string.
    """
    scores = np.asarray(model.feature_importances_)
    # Sort positions, not values: looking a value up with list.index() maps
    # every tied score to the first matching feature, which repeats one label
    # and drops the others. A stable sort on the negated scores gives
    # descending order with ties in original feature order.
    order = np.argsort(-scores, kind="stable")
    # Iterating over every position also includes the least important
    # feature, which the previous range(1, len(label)) loop never reached.
    top = [(label[i], scores[i]) for i in order]

    out_path = f"./results/{data}/ML/{data}_importance.csv"
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    pd.DataFrame(top).to_csv(out_path, index=False)
    return top

In [1]:
# Random forest: build one model per molecular representation with rfr_auto
# (defined in an earlier cell), then score it on the held-out test set.
rf_inputs = [
    ('ECFP', ECFP_train, ECFP_test),
    ('FCFP', FCFP_train, FCFP_test),
    ('MACCS', MACCS_train, MACCS_test),
    ('Descriptors', des_train, des_test),
]
rf_models = {}
for rep_name, rep_train, rep_test in rf_inputs:
    rf_models[rep_name] = rfr_auto(rep_name, rep_train, activities_train, base_indices, 5)
    sklearn_score(rep_name, 'RF', rf_models[rep_name], rep_train, activities_train, rep_test, activities_test)

# Keep the per-representation names available for any later cell.
rfr_ECFP = rf_models['ECFP']
rfr_FCFP = rf_models['FCFP']
rfr_MACCS = rf_models['MACCS']
rfr_des = rf_models['Descriptors']

# sklearn_score has already fitted rfr_des in place on
# (des_train, activities_train). Reusing that fit avoids a redundant retrain
# and guarantees the importances belong to the model that was scored.
rfr_des_importance = importance(label, rfr_des)
print(rfr_des_importance)

In [2]:
# Support vector regression: build one model per molecular representation
# with svr_auto (defined in an earlier cell), then score it on the test set.
svr_inputs = [
    ('ECFP', ECFP_train, ECFP_test),
    ('FCFP', FCFP_train, FCFP_test),
    ('MACCS', MACCS_train, MACCS_test),
    ('Descriptors', des_train, des_test),
]
svr_models = {}
for rep_name, rep_train, rep_test in svr_inputs:
    svr_models[rep_name] = svr_auto(rep_name, rep_train, activities_train, base_indices, 5)
    sklearn_score(rep_name, 'SVR', svr_models[rep_name], rep_train, activities_train, rep_test, activities_test)

# Keep the per-representation names available for any later cell.
svr_ECFP = svr_models['ECFP']
svr_FCFP = svr_models['FCFP']
svr_MACCS = svr_models['MACCS']
svr_des = svr_models['Descriptors']

In [3]:
# XGBoost: xgb_auto (defined in an earlier cell) returns a trained model for
# each molecular representation; xgboost_score evaluates it on the test set.
xgb_inputs = [
    ('ECFP', ECFP_train, ECFP_test),
    ('FCFP', FCFP_train, FCFP_test),
    ('MACCS', MACCS_train, MACCS_test),
    ('Descriptors', des_train, des_test),
]
xgb_models = {}
for rep_name, rep_train, rep_test in xgb_inputs:
    xgb_models[rep_name] = xgb_auto(rep_name, rep_train, activities_train, base_indices, 5)
    xgboost_score(rep_name, 'XGB', xgb_models[rep_name], rep_test, activities_test)

# Keep the per-representation names available for any later cell.
xgb_ECFP = xgb_models['ECFP']
xgb_FCFP = xgb_models['FCFP']
xgb_MACCS = xgb_models['MACCS']
xgb_des = xgb_models['Descriptors']

In [4]:
# Bayesian ridge: build one model per molecular representation with br_auto
# (defined in an earlier cell), then score it on the test set.
br_inputs = [
    ('ECFP', ECFP_train, ECFP_test),
    ('FCFP', FCFP_train, FCFP_test),
    ('MACCS', MACCS_train, MACCS_test),
    ('Descriptors', des_train, des_test),
]
br_models = {}
for rep_name, rep_train, rep_test in br_inputs:
    br_models[rep_name] = br_auto(rep_name, rep_train, activities_train, base_indices, 5)
    sklearn_score(rep_name, 'BR', br_models[rep_name], rep_train, activities_train, rep_test, activities_test)

# Keep the per-representation names available for any later cell.
br_ECFP = br_models['ECFP']
br_FCFP = br_models['FCFP']
br_MACCS = br_models['MACCS']
br_des = br_models['Descriptors']