In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from imblearn.metrics import sensitivity_score
from imblearn.metrics import specificity_score
from sklearn.metrics import precision_score

In [2]:
# Single random seed shared by the CV splitter and every GradientBoostingClassifier in this notebook.
seed = 74_356_631

In [3]:
# Stratified 5-fold cross-validation repeated 3 times (15 train/validation splits),
# seeded so the same folds are produced on every run.
sp = RepeatedStratifiedKFold(
    n_splits=5,
    n_repeats=3,
    random_state=seed,
)

In [4]:
def load_dataset(csv_path):
    """Read one dataset CSV and split it into features and labels.

    Parameters
    ----------
    csv_path : str
        Path to a CSV file. The column at position 1 is taken as the class
        label and every column from position 2 onward as a feature.
        Column 0 is not used as a feature or label (presumably a sample
        identifier -- confirm against the files).

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame, pandas.Series)
        The full table as read, the feature matrix ``iloc[:, 2:]`` and the
        label column ``iloc[:, 1]``.
    """
    table = pd.read_csv(csv_path)
    return table, table.iloc[:, 2:], table.iloc[:, 1]


# GSE89408 is the dataset the model is tuned and fitted on; the remaining
# datasets are only ever passed to predict / predict_proba.
GSE89408, X_GSE89408, y_GSE89408 = load_dataset('data_GSE89408.csv')
GSE55235, X_GSE55235, y_GSE55235 = load_dataset('data_GSE55235.csv')
GSE55457, X_GSE55457, y_GSE55457 = load_dataset('data_GSE55457.csv')
GSE82107, X_GSE82107, y_GSE82107 = load_dataset('data_GSE82107.csv')
GSE114007, X_GSE114007, y_GSE114007 = load_dataset('data_GSE114007.csv')
GSE117999, X_GSE117999, y_GSE117999 = load_dataset('data_GSE117999.csv')
GSE169077, X_GSE169077, y_GSE169077 = load_dataset('data_GSE169077.csv')
GSE206848, X_GSE206848, y_GSE206848 = load_dataset('data_GSE206848.csv')
GSE254682, X_GSE254682, y_GSE254682 = load_dataset('data_GSE254682.csv')
GSE283079, X_GSE283079, y_GSE283079 = load_dataset('data_GSE283079.csv')
GSE285234, X_GSE285234, y_GSE285234 = load_dataset('data_GSE285234.csv')
GSE286154, X_GSE286154, y_GSE286154 = load_dataset('data_GSE286154.csv')

In [26]:
import hyperopt
from hyperopt import hp

In [27]:
def objective(params):
    """Hyperopt loss: negative mean cross-validated ROC AUC on GSE89408.

    Parameters
    ----------
    params : dict
        Must contain 'n_estimators', 'max_depth', 'min_samples_split' and
        'min_samples_leaf'; each is passed unchanged to
        GradientBoostingClassifier.

    Returns
    -------
    float
        ``-mean(AUC)`` over every split yielded by ``sp``. The sign is
        flipped because hyperopt minimises the objective.
    """
    fold_aucs = []
    for train_index, test_index in sp.split(X_GSE89408, y_GSE89408):
        # The splitter yields positional row numbers, so both X and y are
        # selected with .iloc. Plain y[...] would look the numbers up as
        # index labels, which only matches while y has a default RangeIndex.
        X_train = X_GSE89408.iloc[train_index, :]
        X_vali = X_GSE89408.iloc[test_index, :]
        y_train = y_GSE89408.iloc[train_index]
        y_vali = y_GSE89408.iloc[test_index]
        # validation_fraction is only consulted when n_iter_no_change is set,
        # so it has no effect here; kept so this matches the final model's
        # configuration.
        model = GradientBoostingClassifier(random_state=seed,
                                           learning_rate=0.1,
                                           validation_fraction=0.2,
                                           n_estimators=params['n_estimators'],
                                           max_depth=params['max_depth'],
                                           min_samples_split=params['min_samples_split'],
                                           min_samples_leaf=params['min_samples_leaf'])
        model.fit(X_train, y_train)
        # Column 1 is the probability of the second entry of model.classes_
        # (assumed to be the positive class -- confirm the label encoding).
        pro_vali = model.predict_proba(X_vali)[:, 1]
        fold_aucs.append(roc_auc_score(y_vali, pro_vali))
    return -np.mean(fold_aucs)

In [28]:
# Hyperparameter search space. Each entry is an hp.choice over an integer range;
# the hyperopt label is deliberately the same string as the dict key.
space = {
    name: hp.choice(name, options)
    for name, options in (
        ('n_estimators', range(2, 50)),
        ('max_depth', range(1, 5)),
        ('min_samples_split', range(2, 50)),
        ('min_samples_leaf', range(2, 50)),
    )
}

In [29]:
# Run 100 TPE trials minimising `objective` (i.e. maximising mean CV AUC).
# The search is seeded with the notebook-wide seed so the selected
# hyperparameters are reproducible on Restart & Run All; without `rstate`
# every run draws different trials. (hyperopt >= 0.2.7 expects a numpy
# Generator here; older releases expect np.random.RandomState(seed).)
# NOTE: for hp.choice parameters the returned dict holds the *index* of the
# chosen option, not the option value.
best_param = hyperopt.fmin(
    objective,
    space,
    algo=hyperopt.tpe.suggest,
    max_evals=100,
    rstate=np.random.default_rng(seed),
)

100%|██████████| 100/100 [00:38<00:00,  2.58trial/s, best loss: -0.9466111111111111]


In [30]:
# Show the result of the search. These are hp.choice indices into the option
# ranges of `space`, not hyperparameter values (e.g. index 3 of range(1, 5)
# is max_depth 4).
best_param

{'max_depth': np.int64(3),
 'min_samples_leaf': np.int64(18),
 'min_samples_split': np.int64(20),
 'n_estimators': np.int64(43)}

In [5]:
# Final classifier, fitted on the whole of GSE89408 with the tuned settings.
# Each tuned value is the hp.choice index from the recorded search decoded by
# position, i.e. value = range(start, stop)[index]; the source index is noted
# beside it.
model = GradientBoostingClassifier(
    random_state=seed,
    learning_rate=0.1,
    validation_fraction=0.2,
    n_estimators=45,        # range(2, 50)[43]
    max_depth=4,            # range(1, 5)[3]
    min_samples_leaf=20,    # range(2, 50)[18]
    min_samples_split=22,   # range(2, 50)[20]
)
model.fit(X_GSE89408, y_GSE89408)

In [6]:
def score_dataset(estimator, X):
    """Score one feature matrix with a fitted classifier.

    Parameters
    ----------
    estimator : fitted classifier exposing ``predict_proba`` and ``predict``
    X : feature matrix accepted by the estimator

    Returns
    -------
    tuple
        ``(probabilities, labels)`` where ``probabilities`` is column 1 of
        ``predict_proba(X)`` (the second entry of ``estimator.classes_``) and
        ``labels`` is ``predict(X)``.
    """
    return estimator.predict_proba(X)[:, 1], estimator.predict(X)


# pro_* = class-1 probability (used for AUC), pre_* = hard label (used for the
# threshold metrics and the exported CSVs). GSE89408 is the training set.
pro_GSE89408, pre_GSE89408 = score_dataset(model, X_GSE89408)
pro_GSE55235, pre_GSE55235 = score_dataset(model, X_GSE55235)
pro_GSE55457, pre_GSE55457 = score_dataset(model, X_GSE55457)
pro_GSE82107, pre_GSE82107 = score_dataset(model, X_GSE82107)
pro_GSE114007, pre_GSE114007 = score_dataset(model, X_GSE114007)
pro_GSE117999, pre_GSE117999 = score_dataset(model, X_GSE117999)
pro_GSE169077, pre_GSE169077 = score_dataset(model, X_GSE169077)
pro_GSE206848, pre_GSE206848 = score_dataset(model, X_GSE206848)
pro_GSE254682, pre_GSE254682 = score_dataset(model, X_GSE254682)
pro_GSE283079, pre_GSE283079 = score_dataset(model, X_GSE283079)
pro_GSE285234, pre_GSE285234 = score_dataset(model, X_GSE285234)
pro_GSE286154, pre_GSE286154 = score_dataset(model, X_GSE286154)

In [7]:
def save_predictions(csv_path, y_true, y_pred):
    """Write true and predicted labels side by side to a CSV file.

    Parameters
    ----------
    csv_path : str
        Output file path.
    y_true : array-like
        Reference labels, stored in column 'True'.
    y_pred : array-like
        Predicted labels, stored in column 'Pre'.

    The file is written without the index column.
    """
    pd.DataFrame({
        'True': y_true,
        'Pre': y_pred
    }).to_csv(csv_path, index=False)


# Export the hard predictions for every dataset except the training set
# (GSE89408), one file per dataset.
save_predictions('GBDT_GSE55235.csv', y_GSE55235, pre_GSE55235)
save_predictions('GBDT_GSE55457.csv', y_GSE55457, pre_GSE55457)
save_predictions('GBDT_GSE82107.csv', y_GSE82107, pre_GSE82107)
save_predictions('GBDT_GSE114007.csv', y_GSE114007, pre_GSE114007)
save_predictions('GBDT_GSE117999.csv', y_GSE117999, pre_GSE117999)
save_predictions('GBDT_GSE169077.csv', y_GSE169077, pre_GSE169077)
save_predictions('GBDT_GSE206848.csv', y_GSE206848, pre_GSE206848)
save_predictions('GBDT_GSE254682.csv', y_GSE254682, pre_GSE254682)
save_predictions('GBDT_GSE283079.csv', y_GSE283079, pre_GSE283079)
save_predictions('GBDT_GSE285234.csv', y_GSE285234, pre_GSE285234)
save_predictions('GBDT_GSE286154.csv', y_GSE286154, pre_GSE286154)

In [33]:
# Metrics on GSE89408. This is the dataset the model was fitted on, so these
# are training-set (resubstitution) scores, not a generalisation estimate.
# AUC is computed from probabilities; the other metrics from hard labels.
for template, metric, y_hat in (
    ('AUC = {:.3f}', roc_auc_score, pro_GSE89408),
    ('Accuracy = {:.3f}', accuracy_score, pre_GSE89408),
    ('Sensitivity = {:.3f}', sensitivity_score, pre_GSE89408),
    ('Specificity = {:.3f}', specificity_score, pre_GSE89408),
    ('Precision = {:.3f}', precision_score, pre_GSE89408),
    ('F1 = {:.3f}', f1_score, pre_GSE89408),
):
    print(template.format(metric(y_GSE89408, y_hat)))

AUC = 0.995
Accuracy = 0.980
Sensitivity = 0.955
Specificity = 1.000
Precision = 1.000
F1 = 0.977


In [34]:
# Metrics on GSE55235 (not used for fitting).
# AUC is computed from probabilities; the other metrics from hard labels.
for template, metric, y_hat in (
    ('AUC = {:.3f}', roc_auc_score, pro_GSE55235),
    ('Accuracy = {:.3f}', accuracy_score, pre_GSE55235),
    ('Sensitivity = {:.3f}', sensitivity_score, pre_GSE55235),
    ('Specificity = {:.3f}', specificity_score, pre_GSE55235),
    ('Precision = {:.3f}', precision_score, pre_GSE55235),
    ('F1 = {:.3f}', f1_score, pre_GSE55235),
):
    print(template.format(metric(y_GSE55235, y_hat)))

AUC = 0.930
Accuracy = 0.850
Sensitivity = 0.900
Specificity = 0.800
Precision = 0.818
F1 = 0.857


In [35]:
# Metrics on GSE55457 (not used for fitting).
# AUC is computed from probabilities; the other metrics from hard labels.
for template, metric, y_hat in (
    ('AUC = {:.3f}', roc_auc_score, pro_GSE55457),
    ('Accuracy = {:.3f}', accuracy_score, pre_GSE55457),
    ('Sensitivity = {:.3f}', sensitivity_score, pre_GSE55457),
    ('Specificity = {:.3f}', specificity_score, pre_GSE55457),
    ('Precision = {:.3f}', precision_score, pre_GSE55457),
    ('F1 = {:.3f}', f1_score, pre_GSE55457),
):
    print(template.format(metric(y_GSE55457, y_hat)))

AUC = 0.690
Accuracy = 0.600
Sensitivity = 0.800
Specificity = 0.400
Precision = 0.571
F1 = 0.667


In [36]:
# Metrics on GSE82107 (not used for fitting).
# AUC is computed from probabilities; the other metrics from hard labels.
for template, metric, y_hat in (
    ('AUC = {:.3f}', roc_auc_score, pro_GSE82107),
    ('Accuracy = {:.3f}', accuracy_score, pre_GSE82107),
    ('Sensitivity = {:.3f}', sensitivity_score, pre_GSE82107),
    ('Specificity = {:.3f}', specificity_score, pre_GSE82107),
    ('Precision = {:.3f}', precision_score, pre_GSE82107),
    ('F1 = {:.3f}', f1_score, pre_GSE82107),
):
    print(template.format(metric(y_GSE82107, y_hat)))

AUC = 0.843
Accuracy = 0.824
Sensitivity = 0.700
Specificity = 1.000
Precision = 1.000
F1 = 0.824


In [37]:
# Metrics on GSE114007 (not used for fitting).
# AUC is computed from probabilities; the other metrics from hard labels.
for template, metric, y_hat in (
    ('AUC = {:.3f}', roc_auc_score, pro_GSE114007),
    ('Accuracy = {:.3f}', accuracy_score, pre_GSE114007),
    ('Sensitivity = {:.3f}', sensitivity_score, pre_GSE114007),
    ('Specificity = {:.3f}', specificity_score, pre_GSE114007),
    ('Precision = {:.3f}', precision_score, pre_GSE114007),
    ('F1 = {:.3f}', f1_score, pre_GSE114007),
):
    print(template.format(metric(y_GSE114007, y_hat)))

AUC = 0.911
Accuracy = 0.816
Sensitivity = 0.850
Specificity = 0.778
Precision = 0.810
F1 = 0.829


In [38]:
# Metrics on GSE117999 (not used for fitting).
# AUC is computed from probabilities; the other metrics from hard labels.
# NOTE(review): the recorded run shows AUC = 0.250, well below chance --
# worth checking label encoding / platform differences for this dataset.
for template, metric, y_hat in (
    ('AUC = {:.3f}', roc_auc_score, pro_GSE117999),
    ('Accuracy = {:.3f}', accuracy_score, pre_GSE117999),
    ('Sensitivity = {:.3f}', sensitivity_score, pre_GSE117999),
    ('Specificity = {:.3f}', specificity_score, pre_GSE117999),
    ('Precision = {:.3f}', precision_score, pre_GSE117999),
    ('F1 = {:.3f}', f1_score, pre_GSE117999),
):
    print(template.format(metric(y_GSE117999, y_hat)))

AUC = 0.250
Accuracy = 0.400
Sensitivity = 0.500
Specificity = 0.300
Precision = 0.417
F1 = 0.455


In [39]:
# Metrics on GSE169077 (not used for fitting).
# AUC is computed from probabilities; the other metrics from hard labels.
for template, metric, y_hat in (
    ('AUC = {:.3f}', roc_auc_score, pro_GSE169077),
    ('Accuracy = {:.3f}', accuracy_score, pre_GSE169077),
    ('Sensitivity = {:.3f}', sensitivity_score, pre_GSE169077),
    ('Specificity = {:.3f}', specificity_score, pre_GSE169077),
    ('Precision = {:.3f}', precision_score, pre_GSE169077),
    ('F1 = {:.3f}', f1_score, pre_GSE169077),
):
    print(template.format(metric(y_GSE169077, y_hat)))

AUC = 1.000
Accuracy = 0.909
Sensitivity = 1.000
Specificity = 0.800
Precision = 0.857
F1 = 0.923


In [40]:
# Metrics on GSE206848 (not used for fitting).
# AUC is computed from probabilities; the other metrics from hard labels.
for template, metric, y_hat in (
    ('AUC = {:.3f}', roc_auc_score, pro_GSE206848),
    ('Accuracy = {:.3f}', accuracy_score, pre_GSE206848),
    ('Sensitivity = {:.3f}', sensitivity_score, pre_GSE206848),
    ('Specificity = {:.3f}', specificity_score, pre_GSE206848),
    ('Precision = {:.3f}', precision_score, pre_GSE206848),
    ('F1 = {:.3f}', f1_score, pre_GSE206848),
):
    print(template.format(metric(y_GSE206848, y_hat)))

AUC = 0.571
Accuracy = 0.500
Sensitivity = 0.714
Specificity = 0.286
Precision = 0.500
F1 = 0.588


In [41]:
# Metrics on GSE254682 (not used for fitting).
# AUC is computed from probabilities; the other metrics from hard labels.
# NOTE(review): the recorded run shows AUC = 0.429, below chance.
for template, metric, y_hat in (
    ('AUC = {:.3f}', roc_auc_score, pro_GSE254682),
    ('Accuracy = {:.3f}', accuracy_score, pre_GSE254682),
    ('Sensitivity = {:.3f}', sensitivity_score, pre_GSE254682),
    ('Specificity = {:.3f}', specificity_score, pre_GSE254682),
    ('Precision = {:.3f}', precision_score, pre_GSE254682),
    ('F1 = {:.3f}', f1_score, pre_GSE254682),
):
    print(template.format(metric(y_GSE254682, y_hat)))

AUC = 0.429
Accuracy = 0.533
Sensitivity = 0.625
Specificity = 0.429
Precision = 0.556
F1 = 0.588


In [42]:
# Metrics on GSE283079 (not used for fitting).
# AUC is computed from probabilities; the other metrics from hard labels.
for template, metric, y_hat in (
    ('AUC = {:.3f}', roc_auc_score, pro_GSE283079),
    ('Accuracy = {:.3f}', accuracy_score, pre_GSE283079),
    ('Sensitivity = {:.3f}', sensitivity_score, pre_GSE283079),
    ('Specificity = {:.3f}', specificity_score, pre_GSE283079),
    ('Precision = {:.3f}', precision_score, pre_GSE283079),
    ('F1 = {:.3f}', f1_score, pre_GSE283079),
):
    print(template.format(metric(y_GSE283079, y_hat)))

AUC = 0.756
Accuracy = 0.732
Sensitivity = 0.750
Specificity = 0.600
Precision = 0.931
F1 = 0.831


In [43]:
# Metrics on GSE285234 (not used for fitting).
# AUC is computed from probabilities; the other metrics from hard labels.
# NOTE(review): the recorded run shows AUC = 0.306, well below chance --
# worth checking label encoding / platform differences for this dataset.
for template, metric, y_hat in (
    ('AUC = {:.3f}', roc_auc_score, pro_GSE285234),
    ('Accuracy = {:.3f}', accuracy_score, pre_GSE285234),
    ('Sensitivity = {:.3f}', sensitivity_score, pre_GSE285234),
    ('Specificity = {:.3f}', specificity_score, pre_GSE285234),
    ('Precision = {:.3f}', precision_score, pre_GSE285234),
    ('F1 = {:.3f}', f1_score, pre_GSE285234),
):
    print(template.format(metric(y_GSE285234, y_hat)))

AUC = 0.306
Accuracy = 0.333
Sensitivity = 0.167
Specificity = 0.500
Precision = 0.250
F1 = 0.200


In [44]:
# Metrics on GSE286154 (not used for fitting).
# AUC is computed from probabilities; the other metrics from hard labels.
# NOTE(review): the recorded run shows AUC = 0.453, below chance.
for template, metric, y_hat in (
    ('AUC = {:.3f}', roc_auc_score, pro_GSE286154),
    ('Accuracy = {:.3f}', accuracy_score, pre_GSE286154),
    ('Sensitivity = {:.3f}', sensitivity_score, pre_GSE286154),
    ('Specificity = {:.3f}', specificity_score, pre_GSE286154),
    ('Precision = {:.3f}', precision_score, pre_GSE286154),
    ('F1 = {:.3f}', f1_score, pre_GSE286154),
):
    print(template.format(metric(y_GSE286154, y_hat)))

AUC = 0.453
Accuracy = 0.562
Sensitivity = 0.500
Specificity = 0.625
Precision = 0.571
F1 = 0.533
