In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from imblearn.metrics import sensitivity_score
from imblearn.metrics import specificity_score
from sklearn.metrics import precision_score

In [5]:
# Single random seed shared by the CV splitter and every model in this notebook.
seed = 3_645_641

In [6]:
# Stratified 5-fold cross-validation repeated 3 times (15 splits per run),
# with the shuffling fixed by `seed`.
sp = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=seed)

In [7]:
def _read_dataset(csv_path):
    """Read one dataset CSV and return ``(frame, features, labels)``.

    The column at position 1 is returned as the label Series and every
    column from position 2 onward as the feature frame.
    """
    frame = pd.read_csv(csv_path)
    return frame, frame.iloc[:, 2:], frame.iloc[:, 1]


# One (raw frame, X, y) triple per dataset; file names follow 'data_<accession>.csv'.
GSE12649, X_GSE12649, y_GSE12649 = _read_dataset('data_GSE12649.csv')
GSE17612, X_GSE17612, y_GSE17612 = _read_dataset('data_GSE17612.csv')
GSE21138, X_GSE21138, y_GSE21138 = _read_dataset('data_GSE21138.csv')
GSE35977, X_GSE35977, y_GSE35977 = _read_dataset('data_GSE35977.csv')
GSE53987, X_GSE53987, y_GSE53987 = _read_dataset('data_GSE53987.csv')
GSE62191, X_GSE62191, y_GSE62191 = _read_dataset('data_GSE62191.csv')
GSE78936, X_GSE78936, y_GSE78936 = _read_dataset('data_GSE78936.csv')
GSE87194, X_GSE87194, y_GSE87194 = _read_dataset('data_GSE87194.csv')
GSE87610, X_GSE87610, y_GSE87610 = _read_dataset('data_GSE87610.csv')
GSE107638, X_GSE107638, y_GSE107638 = _read_dataset('data_GSE107638.csv')
GSE112523, X_GSE112523, y_GSE112523 = _read_dataset('data_GSE112523.csv')
GSE120340, X_GSE120340, y_GSE120340 = _read_dataset('data_GSE120340.csv')

In [8]:
import hyperopt
from hyperopt import hp

In [9]:
def objective(params):
    """Hyperopt loss: negated mean cross-validated ROC AUC on GSE53987.

    Parameters
    ----------
    params : dict
        Sampled hyperparameters with the keys ``'n_estimators'``,
        ``'max_depth'``, ``'min_samples_split'`` and ``'min_samples_leaf'``.

    Returns
    -------
    float
        ``-mean(AUC)`` over every split yielded by ``sp``. The sign is flipped
        because ``hyperopt.fmin`` minimises the objective.
    """
    aucs = []
    for train_index, test_index in sp.split(X_GSE53987, y_GSE53987):
        X_train = X_GSE53987.iloc[train_index, :]
        X_vali = X_GSE53987.iloc[test_index, :]
        # split() yields positional indices, so the labels must be selected by
        # position as well; plain [] on a Series is label-based and only works
        # by accident when the index is the default 0..n-1 range.
        y_train = y_GSE53987.iloc[train_index]
        y_vali = y_GSE53987.iloc[test_index]
        # NOTE: validation_fraction is only used by scikit-learn when
        # n_iter_no_change is set; it is kept here unchanged but has no effect.
        model = GradientBoostingClassifier(random_state=seed,
                                           learning_rate=0.1,
                                           validation_fraction=0.2,
                                           n_estimators=params['n_estimators'],
                                           max_depth=params['max_depth'],
                                           min_samples_split=params['min_samples_split'],
                                           min_samples_leaf=params['min_samples_leaf'])
        model.fit(X_train, y_train)
        # Probability of the positive class (second column) drives the AUC.
        pro_vali = model.predict_proba(X_vali)[:, 1]
        aucs.append(roc_auc_score(y_vali, pro_vali))
    return -np.mean(aucs)

In [10]:
# Candidate integer values for each tuned hyperparameter.
_grids = {
    'n_estimators': range(2, 50),
    'max_depth': range(1, 5),
    'min_samples_split': range(2, 50),
    'min_samples_leaf': range(2, 50),
}

# Each dimension is a categorical choice over its grid, labelled with the
# same name as its key.
space = {label: hp.choice(label, grid) for label, grid in _grids.items()}

In [11]:
# Seed the TPE sampler so the search (and therefore best_param) is reproducible
# on a fresh kernel; the CV splitter and the models already use `seed`.
# For hp.choice dimensions fmin reports the index of the selected option,
# not the option value itself.
best_param = hyperopt.fmin(objective, space, hyperopt.tpe.suggest, max_evals=100,
                           rstate=np.random.default_rng(seed))

100%|██████████| 100/100 [00:46<00:00,  2.16trial/s, best loss: -0.9416161616161616]


In [12]:
# The search space is built from hp.choice, so each value shown here is an
# index into that parameter's option list (e.g. n_estimators -> range(2,50)[i]),
# not the hyperparameter setting itself.
best_param

{'max_depth': np.int64(3),
 'min_samples_leaf': np.int64(9),
 'min_samples_split': np.int64(15),
 'n_estimators': np.int64(45)}

In [8]:
# Hyperparameters selected by the search above. Each one is written as
# <option grid>[<index reported in best_param>] so the link to the search
# space stays visible.
_tuned = {
    'n_estimators': range(2, 50)[45],
    'max_depth': range(1, 5)[3],
    'min_samples_split': range(2, 50)[15],
    'min_samples_leaf': range(2, 50)[9],
}

# Final classifier, refit on the whole of GSE53987 with the tuned settings.
model = GradientBoostingClassifier(
    random_state=seed,
    learning_rate=0.1,
    validation_fraction=0.2,
    **_tuned,
)
model.fit(X_GSE53987, y_GSE53987)

In [9]:
def _score(features):
    """Return (positive-class probability, predicted label) from `model`."""
    return model.predict_proba(features)[:, 1], model.predict(features)


# pro_* holds the probability of the second class, pre_* the hard prediction.
pro_GSE12649, pre_GSE12649 = _score(X_GSE12649)
pro_GSE17612, pre_GSE17612 = _score(X_GSE17612)
pro_GSE21138, pre_GSE21138 = _score(X_GSE21138)
pro_GSE35977, pre_GSE35977 = _score(X_GSE35977)
pro_GSE53987, pre_GSE53987 = _score(X_GSE53987)
pro_GSE62191, pre_GSE62191 = _score(X_GSE62191)
pro_GSE78936, pre_GSE78936 = _score(X_GSE78936)
pro_GSE87194, pre_GSE87194 = _score(X_GSE87194)
pro_GSE87610, pre_GSE87610 = _score(X_GSE87610)
pro_GSE107638, pre_GSE107638 = _score(X_GSE107638)
pro_GSE112523, pre_GSE112523 = _score(X_GSE112523)
pro_GSE120340, pre_GSE120340 = _score(X_GSE120340)

In [10]:
# Output file -> (true labels, predicted labels). GSE53987, the dataset the
# model was fit on, is not exported.
_exports = {
    'GBDT_GSE12649.csv': (y_GSE12649, pre_GSE12649),
    'GBDT_GSE17612.csv': (y_GSE17612, pre_GSE17612),
    'GBDT_GSE21138.csv': (y_GSE21138, pre_GSE21138),
    'GBDT_GSE35977.csv': (y_GSE35977, pre_GSE35977),
    'GBDT_GSE62191.csv': (y_GSE62191, pre_GSE62191),
    'GBDT_GSE78936.csv': (y_GSE78936, pre_GSE78936),
    'GBDT_GSE87194.csv': (y_GSE87194, pre_GSE87194),
    'GBDT_GSE87610.csv': (y_GSE87610, pre_GSE87610),
    'GBDT_GSE107638.csv': (y_GSE107638, pre_GSE107638),
    'GBDT_GSE112523.csv': (y_GSE112523, pre_GSE112523),
    'GBDT_GSE120340.csv': (y_GSE120340, pre_GSE120340),
}

# Write one two-column CSV ('True', 'Pre') per dataset, without the index.
for _out_path, (_truth, _predicted) in _exports.items():
    pd.DataFrame({
        'True': _truth,
        'Pre': _predicted,
    }).to_csv(_out_path, index=False)

In [15]:
# GSE53987 is the dataset the model was fit on, so these are training-set
# (resubstitution) scores. AUC uses probabilities; the rest use hard labels.
for _template, _metric, _scores in (
    ('AUC = {:.3f}', roc_auc_score, pro_GSE53987),
    ('Accuracy = {:.3f}', accuracy_score, pre_GSE53987),
    ('Sensitivity = {:.3f}', sensitivity_score, pre_GSE53987),
    ('Specificity = {:.3f}', specificity_score, pre_GSE53987),
    ('Precision = {:.3f}', precision_score, pre_GSE53987),
    ('F1 = {:.3f}', f1_score, pre_GSE53987),
):
    print(_template.format(_metric(y_GSE53987, _scores)))

AUC = 1.000
Accuracy = 1.000
Sensitivity = 1.000
Specificity = 1.000
Precision = 1.000
F1 = 1.000


In [16]:
# Scores on GSE12649 (not the dataset the model was fit on).
# AUC uses probabilities; the remaining metrics use hard labels.
for _template, _metric, _scores in (
    ('AUC = {:.3f}', roc_auc_score, pro_GSE12649),
    ('Accuracy = {:.3f}', accuracy_score, pre_GSE12649),
    ('Sensitivity = {:.3f}', sensitivity_score, pre_GSE12649),
    ('Specificity = {:.3f}', specificity_score, pre_GSE12649),
    ('Precision = {:.3f}', precision_score, pre_GSE12649),
    ('F1 = {:.3f}', f1_score, pre_GSE12649),
):
    print(_template.format(_metric(y_GSE12649, _scores)))

AUC = 0.487
Accuracy = 0.464
Sensitivity = 0.514
Specificity = 0.412
Precision = 0.474
F1 = 0.493


In [17]:
# Scores on GSE17612 (not the dataset the model was fit on).
# AUC uses probabilities; the remaining metrics use hard labels.
for _template, _metric, _scores in (
    ('AUC = {:.3f}', roc_auc_score, pro_GSE17612),
    ('Accuracy = {:.3f}', accuracy_score, pre_GSE17612),
    ('Sensitivity = {:.3f}', sensitivity_score, pre_GSE17612),
    ('Specificity = {:.3f}', specificity_score, pre_GSE17612),
    ('Precision = {:.3f}', precision_score, pre_GSE17612),
    ('F1 = {:.3f}', f1_score, pre_GSE17612),
):
    print(_template.format(_metric(y_GSE17612, _scores)))

AUC = 0.640
Accuracy = 0.569
Sensitivity = 0.571
Specificity = 0.565
Precision = 0.615
F1 = 0.593


In [18]:
# Scores on GSE21138 (not the dataset the model was fit on).
# AUC uses probabilities; the remaining metrics use hard labels.
for _template, _metric, _scores in (
    ('AUC = {:.3f}', roc_auc_score, pro_GSE21138),
    ('Accuracy = {:.3f}', accuracy_score, pre_GSE21138),
    ('Sensitivity = {:.3f}', sensitivity_score, pre_GSE21138),
    ('Specificity = {:.3f}', specificity_score, pre_GSE21138),
    ('Precision = {:.3f}', precision_score, pre_GSE21138),
    ('F1 = {:.3f}', f1_score, pre_GSE21138),
):
    print(_template.format(_metric(y_GSE21138, _scores)))

AUC = 0.492
Accuracy = 0.475
Sensitivity = 0.500
Specificity = 0.448
Precision = 0.484
F1 = 0.492


In [19]:
# Scores on GSE35977 (not the dataset the model was fit on).
# AUC uses probabilities; the remaining metrics use hard labels.
for _template, _metric, _scores in (
    ('AUC = {:.3f}', roc_auc_score, pro_GSE35977),
    ('Accuracy = {:.3f}', accuracy_score, pre_GSE35977),
    ('Sensitivity = {:.3f}', sensitivity_score, pre_GSE35977),
    ('Specificity = {:.3f}', specificity_score, pre_GSE35977),
    ('Precision = {:.3f}', precision_score, pre_GSE35977),
    ('F1 = {:.3f}', f1_score, pre_GSE35977),
):
    print(_template.format(_metric(y_GSE35977, _scores)))

AUC = 0.825
Accuracy = 0.752
Sensitivity = 0.784
Specificity = 0.720
Precision = 0.741
F1 = 0.762


In [20]:
# Scores on GSE62191 (not the dataset the model was fit on).
# AUC uses probabilities; the remaining metrics use hard labels.
for _template, _metric, _scores in (
    ('AUC = {:.3f}', roc_auc_score, pro_GSE62191),
    ('Accuracy = {:.3f}', accuracy_score, pre_GSE62191),
    ('Sensitivity = {:.3f}', sensitivity_score, pre_GSE62191),
    ('Specificity = {:.3f}', specificity_score, pre_GSE62191),
    ('Precision = {:.3f}', precision_score, pre_GSE62191),
    ('F1 = {:.3f}', f1_score, pre_GSE62191),
):
    print(_template.format(_metric(y_GSE62191, _scores)))

AUC = 0.654
Accuracy = 0.585
Sensitivity = 0.607
Specificity = 0.560
Precision = 0.607
F1 = 0.607


In [21]:
# Scores on GSE78936 (not the dataset the model was fit on).
# AUC uses probabilities; the remaining metrics use hard labels.
for _template, _metric, _scores in (
    ('AUC = {:.3f}', roc_auc_score, pro_GSE78936),
    ('Accuracy = {:.3f}', accuracy_score, pre_GSE78936),
    ('Sensitivity = {:.3f}', sensitivity_score, pre_GSE78936),
    ('Specificity = {:.3f}', specificity_score, pre_GSE78936),
    ('Precision = {:.3f}', precision_score, pre_GSE78936),
    ('F1 = {:.3f}', f1_score, pre_GSE78936),
):
    print(_template.format(_metric(y_GSE78936, _scores)))

AUC = 0.810
Accuracy = 0.769
Sensitivity = 0.750
Specificity = 0.792
Precision = 0.808
F1 = 0.778


In [22]:
# Scores on GSE87194 (not the dataset the model was fit on).
# AUC uses probabilities; the remaining metrics use hard labels.
for _template, _metric, _scores in (
    ('AUC = {:.3f}', roc_auc_score, pro_GSE87194),
    ('Accuracy = {:.3f}', accuracy_score, pre_GSE87194),
    ('Sensitivity = {:.3f}', sensitivity_score, pre_GSE87194),
    ('Specificity = {:.3f}', specificity_score, pre_GSE87194),
    ('Precision = {:.3f}', precision_score, pre_GSE87194),
    ('F1 = {:.3f}', f1_score, pre_GSE87194),
):
    print(_template.format(_metric(y_GSE87194, _scores)))

AUC = 0.601
Accuracy = 0.553
Sensitivity = 0.526
Specificity = 0.579
Precision = 0.556
F1 = 0.541


In [23]:
# Scores on GSE87610 (not the dataset the model was fit on).
# AUC uses probabilities; the remaining metrics use hard labels.
for _template, _metric, _scores in (
    ('AUC = {:.3f}', roc_auc_score, pro_GSE87610),
    ('Accuracy = {:.3f}', accuracy_score, pre_GSE87610),
    ('Sensitivity = {:.3f}', sensitivity_score, pre_GSE87610),
    ('Specificity = {:.3f}', specificity_score, pre_GSE87610),
    ('Precision = {:.3f}', precision_score, pre_GSE87610),
    ('F1 = {:.3f}', f1_score, pre_GSE87610),
):
    print(_template.format(_metric(y_GSE87610, _scores)))

AUC = 0.421
Accuracy = 0.453
Sensitivity = 0.446
Specificity = 0.458
Precision = 0.426
F1 = 0.436


In [24]:
# Scores on GSE107638 (not the dataset the model was fit on).
# AUC uses probabilities; the remaining metrics use hard labels.
for _template, _metric, _scores in (
    ('AUC = {:.3f}', roc_auc_score, pro_GSE107638),
    ('Accuracy = {:.3f}', accuracy_score, pre_GSE107638),
    ('Sensitivity = {:.3f}', sensitivity_score, pre_GSE107638),
    ('Specificity = {:.3f}', specificity_score, pre_GSE107638),
    ('Precision = {:.3f}', precision_score, pre_GSE107638),
    ('F1 = {:.3f}', f1_score, pre_GSE107638),
):
    print(_template.format(_metric(y_GSE107638, _scores)))

AUC = 0.482
Accuracy = 0.486
Sensitivity = 0.472
Specificity = 0.500
Precision = 0.472
F1 = 0.472


In [25]:
# Scores on GSE112523 (not the dataset the model was fit on).
# AUC uses probabilities; the remaining metrics use hard labels.
for _template, _metric, _scores in (
    ('AUC = {:.3f}', roc_auc_score, pro_GSE112523),
    ('Accuracy = {:.3f}', accuracy_score, pre_GSE112523),
    ('Sensitivity = {:.3f}', sensitivity_score, pre_GSE112523),
    ('Specificity = {:.3f}', specificity_score, pre_GSE112523),
    ('Precision = {:.3f}', precision_score, pre_GSE112523),
    ('F1 = {:.3f}', f1_score, pre_GSE112523),
):
    print(_template.format(_metric(y_GSE112523, _scores)))

AUC = 0.756
Accuracy = 0.708
Sensitivity = 0.857
Specificity = 0.647
Precision = 0.500
F1 = 0.632


In [26]:
# Scores on GSE120340 (not the dataset the model was fit on).
# AUC uses probabilities; the remaining metrics use hard labels.
for _template, _metric, _scores in (
    ('AUC = {:.3f}', roc_auc_score, pro_GSE120340),
    ('Accuracy = {:.3f}', accuracy_score, pre_GSE120340),
    ('Sensitivity = {:.3f}', sensitivity_score, pre_GSE120340),
    ('Specificity = {:.3f}', specificity_score, pre_GSE120340),
    ('Precision = {:.3f}', precision_score, pre_GSE120340),
    ('F1 = {:.3f}', f1_score, pre_GSE120340),
):
    print(_template.format(_metric(y_GSE120340, _scores)))

AUC = 0.690
Accuracy = 0.500
Sensitivity = 0.500
Specificity = 0.500
Precision = 0.500
F1 = 0.500
