In [1]:
import warnings
# Silence every warning for the rest of the session: the filter is installed
# with no category, message or module restriction.
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from imblearn.metrics import sensitivity_score
from imblearn.metrics import specificity_score
from sklearn.metrics import precision_score

In [3]:
# Shared random seed: passed as random_state to the cross-validation splitter
# and to every RandomForestClassifier built in this notebook.
seed = 874356351

In [4]:
def _load_dataset(accession):
    """Read ``data_<accession>.csv`` and split it into features and labels.

    Parameters
    ----------
    accession : str
        Dataset identifier used in the file name, e.g. ``'GSE54456'``.

    Returns
    -------
    tuple
        ``(frame, X, y)`` where ``frame`` is the full table as read from disk,
        ``X`` holds every column from position 2 onward and ``y`` is the
        column at position 1.
    """
    frame = pd.read_csv('data_{}.csv'.format(accession))
    # Column 0 is not used anywhere in this notebook
    # (presumably a sample identifier -- TODO confirm).
    return frame, frame.iloc[:, 2:], frame.iloc[:, 1]


# GSE54456 is the cohort the model is tuned and fitted on; every other cohort
# is only ever passed to predict / predict_proba further down.
GSE54456, X_GSE54456, y_GSE54456 = _load_dataset('GSE54456')
GSE13355, X_GSE13355, y_GSE13355 = _load_dataset('GSE13355')
GSE14905, X_GSE14905, y_GSE14905 = _load_dataset('GSE14905')
GSE16161, X_GSE16161, y_GSE16161 = _load_dataset('GSE16161')
GSE66511, X_GSE66511, y_GSE66511 = _load_dataset('GSE66511')
GSE75890, X_GSE75890, y_GSE75890 = _load_dataset('GSE75890')
GSE78097, X_GSE78097, y_GSE78097 = _load_dataset('GSE78097')
GSE79704, X_GSE79704, y_GSE79704 = _load_dataset('GSE79704')
GSE83582, X_GSE83582, y_GSE83582 = _load_dataset('GSE83582')
GSE109248, X_GSE109248, y_GSE109248 = _load_dataset('GSE109248')
GSE114286, X_GSE114286, y_GSE114286 = _load_dataset('GSE114286')
GSE121212, X_GSE121212, y_GSE121212 = _load_dataset('GSE121212')
GSE182740, X_GSE182740, y_GSE182740 = _load_dataset('GSE182740')

In [5]:
# Stratified 5-fold cross-validation repeated 3 times, seeded with `seed`.
sp = RepeatedStratifiedKFold(random_state=seed,n_repeats=3,n_splits=5)

In [55]:
import hyperopt
from hyperopt import hp

In [56]:
def objective(params):
    """Hyperopt loss: negative mean validation AUC of a random forest on GSE54456.

    Parameters
    ----------
    params : dict
        Must provide ``n_estimators``, ``max_depth``, ``min_samples_split``
        and ``min_samples_leaf``; each is forwarded unchanged to
        ``RandomForestClassifier``.

    Returns
    -------
    float
        ``-mean(AUC)`` over every train/validation split produced by ``sp``,
        so that minimising the loss maximises the AUC.
    """
    aucs = []
    for train_index, test_index in sp.split(X_GSE54456, y_GSE54456):
        X_train = X_GSE54456.iloc[train_index, :]
        X_vali = X_GSE54456.iloc[test_index, :]
        # split() yields positional indices, so the labels are selected with
        # .iloc just like the features. Plain y[...] is label-based and only
        # matches positions when the Series has a default 0..n-1 index.
        y_train = y_GSE54456.iloc[train_index]
        y_vali = y_GSE54456.iloc[test_index]
        model = RandomForestClassifier(random_state=seed,
                                       n_estimators=params['n_estimators'],
                                       max_depth=params['max_depth'],
                                       min_samples_split=params['min_samples_split'],
                                       min_samples_leaf=params['min_samples_leaf'])
        model.fit(X_train, y_train)
        # Second probability column is scored against the validation labels.
        pro_vali = model.predict_proba(X_vali)[:, 1]
        aucs.append(roc_auc_score(y_vali, pro_vali))
    return -np.mean(aucs)

In [57]:
# Search space: every hyperparameter is an hp.choice over an integer range.
# NOTE: the final-model cell decodes the search result by indexing these same
# ranges, so the two must be kept in sync.
space = {
    name: hp.choice(name, options)
    for name, options in (
        ('n_estimators', range(2, 100)),
        ('max_depth', range(1, 10)),
        ('min_samples_split', range(2, 100)),
        ('min_samples_leaf', range(2, 100)),
    )
}

In [58]:
# Seed the TPE sampler so the search (and therefore best_param) is repeatable;
# without rstate every run explores a different sequence of trials.
# NOTE(review): hyperopt >= 0.2.7 expects a numpy Generator for rstate; older
# releases expected np.random.RandomState(seed) -- confirm against the
# installed version.
best_param = hyperopt.fmin(objective, space, hyperopt.tpe.suggest, max_evals=100,
                           rstate=np.random.default_rng(seed))

100%|██████████| 100/100 [01:47<00:00,  1.07s/trial, best loss: -1.0]


In [59]:
# Display the search result. The final-model cell uses these numbers as
# positions inside the ranges given to hp.choice, not as the hyperparameter
# values themselves.
best_param

{'max_depth': np.int64(3),
 'min_samples_leaf': np.int64(40),
 'min_samples_split': np.int64(85),
 'n_estimators': np.int64(27)}

In [6]:
# Final hyperparameters: each hard-coded position is looked up in the same
# integer range that the search space uses for that parameter.
rf_params = {
    'n_estimators': range(2, 100)[27],
    'max_depth': range(1, 10)[3],
    'min_samples_split': range(2, 100)[85],
    'min_samples_leaf': range(2, 100)[40],
}
model = RandomForestClassifier(random_state=seed, **rf_params)
# Fit on the complete GSE54456 cohort (no hold-out split at this stage).
model.fit(X_GSE54456, y_GSE54456)

In [7]:
def _predict_both(features):
    """Return ``(second-column probabilities, predicted labels)`` from ``model``."""
    return model.predict_proba(features)[:, 1], model.predict(features)


# pro_* holds the probability column used for AUC, pre_* the predicted labels.
pro_GSE54456, pre_GSE54456 = _predict_both(X_GSE54456)
pro_GSE13355, pre_GSE13355 = _predict_both(X_GSE13355)
pro_GSE14905, pre_GSE14905 = _predict_both(X_GSE14905)
pro_GSE16161, pre_GSE16161 = _predict_both(X_GSE16161)
pro_GSE66511, pre_GSE66511 = _predict_both(X_GSE66511)
pro_GSE75890, pre_GSE75890 = _predict_both(X_GSE75890)
pro_GSE78097, pre_GSE78097 = _predict_both(X_GSE78097)
pro_GSE79704, pre_GSE79704 = _predict_both(X_GSE79704)
pro_GSE83582, pre_GSE83582 = _predict_both(X_GSE83582)
pro_GSE109248, pre_GSE109248 = _predict_both(X_GSE109248)
pro_GSE114286, pre_GSE114286 = _predict_both(X_GSE114286)
pro_GSE121212, pre_GSE121212 = _predict_both(X_GSE121212)
pro_GSE182740, pre_GSE182740 = _predict_both(X_GSE182740)

In [8]:
# Write true vs. predicted labels for each cohort other than GSE54456
# (the cohort the model was fitted on is not exported).
for out_path, y_true, y_pred in (
    ('RF_GSE13355.csv', y_GSE13355, pre_GSE13355),
    ('RF_GSE14905.csv', y_GSE14905, pre_GSE14905),
    ('RF_GSE16161.csv', y_GSE16161, pre_GSE16161),
    ('RF_GSE66511.csv', y_GSE66511, pre_GSE66511),
    ('RF_GSE75890.csv', y_GSE75890, pre_GSE75890),
    ('RF_GSE78097.csv', y_GSE78097, pre_GSE78097),
    ('RF_GSE79704.csv', y_GSE79704, pre_GSE79704),
    ('RF_GSE83582.csv', y_GSE83582, pre_GSE83582),
    ('RF_GSE109248.csv', y_GSE109248, pre_GSE109248),
    ('RF_GSE114286.csv', y_GSE114286, pre_GSE114286),
    ('RF_GSE121212.csv', y_GSE121212, pre_GSE121212),
    ('RF_GSE182740.csv', y_GSE182740, pre_GSE182740),
):
    pd.DataFrame({'True': y_true, 'Pre': y_pred}).to_csv(out_path, index=False)

In [62]:
# GSE54456 is the cohort the model was fitted on, so these are
# resubstitution scores rather than an independent evaluation.
# AUC is computed from probabilities; the other metrics use predicted labels.
for template, metric, scores in (
    ('AUC = {:.3f}', roc_auc_score, pro_GSE54456),
    ('Accuracy = {:.3f}', accuracy_score, pre_GSE54456),
    ('Sensitivity = {:.3f}', sensitivity_score, pre_GSE54456),
    ('Specificity = {:.3f}', specificity_score, pre_GSE54456),
    ('Precision = {:.3f}', precision_score, pre_GSE54456),
    ('F1 = {:.3f}', f1_score, pre_GSE54456),
):
    print(template.format(metric(y_GSE54456, scores)))

AUC = 1.000
Accuracy = 1.000
Sensitivity = 1.000
Specificity = 1.000
Precision = 1.000
F1 = 1.000


In [63]:
# Metrics for GSE13355: AUC from probabilities, the rest from predicted labels.
for template, metric, scores in (
    ('AUC = {:.3f}', roc_auc_score, pro_GSE13355),
    ('Accuracy = {:.3f}', accuracy_score, pre_GSE13355),
    ('Sensitivity = {:.3f}', sensitivity_score, pre_GSE13355),
    ('Specificity = {:.3f}', specificity_score, pre_GSE13355),
    ('Precision = {:.3f}', precision_score, pre_GSE13355),
    ('F1 = {:.3f}', f1_score, pre_GSE13355),
):
    print(template.format(metric(y_GSE13355, scores)))

AUC = 0.925
Accuracy = 0.869
Sensitivity = 0.793
Specificity = 0.938
Precision = 0.920
F1 = 0.852


In [64]:
# Metrics for GSE14905: AUC from probabilities, the rest from predicted labels.
for template, metric, scores in (
    ('AUC = {:.3f}', roc_auc_score, pro_GSE14905),
    ('Accuracy = {:.3f}', accuracy_score, pre_GSE14905),
    ('Sensitivity = {:.3f}', sensitivity_score, pre_GSE14905),
    ('Specificity = {:.3f}', specificity_score, pre_GSE14905),
    ('Precision = {:.3f}', precision_score, pre_GSE14905),
    ('F1 = {:.3f}', f1_score, pre_GSE14905),
):
    print(template.format(metric(y_GSE14905, scores)))

AUC = 0.968
Accuracy = 0.963
Sensitivity = 0.970
Specificity = 0.952
Precision = 0.970
F1 = 0.970


In [65]:
# Metrics for GSE16161: AUC from probabilities, the rest from predicted labels.
for template, metric, scores in (
    ('AUC = {:.3f}', roc_auc_score, pro_GSE16161),
    ('Accuracy = {:.3f}', accuracy_score, pre_GSE16161),
    ('Sensitivity = {:.3f}', sensitivity_score, pre_GSE16161),
    ('Specificity = {:.3f}', specificity_score, pre_GSE16161),
    ('Precision = {:.3f}', precision_score, pre_GSE16161),
    ('F1 = {:.3f}', f1_score, pre_GSE16161),
):
    print(template.format(metric(y_GSE16161, scores)))

AUC = 1.000
Accuracy = 0.750
Sensitivity = 1.000
Specificity = 0.333
Precision = 0.714
F1 = 0.833


In [66]:
# Metrics for GSE66511: AUC from probabilities, the rest from predicted labels.
for template, metric, scores in (
    ('AUC = {:.3f}', roc_auc_score, pro_GSE66511),
    ('Accuracy = {:.3f}', accuracy_score, pre_GSE66511),
    ('Sensitivity = {:.3f}', sensitivity_score, pre_GSE66511),
    ('Specificity = {:.3f}', specificity_score, pre_GSE66511),
    ('Precision = {:.3f}', precision_score, pre_GSE66511),
    ('F1 = {:.3f}', f1_score, pre_GSE66511),
):
    print(template.format(metric(y_GSE66511, scores)))

AUC = 1.000
Accuracy = 1.000
Sensitivity = 1.000
Specificity = 1.000
Precision = 1.000
F1 = 1.000


In [67]:
# Metrics for GSE75890: AUC from probabilities, the rest from predicted labels.
for template, metric, scores in (
    ('AUC = {:.3f}', roc_auc_score, pro_GSE75890),
    ('Accuracy = {:.3f}', accuracy_score, pre_GSE75890),
    ('Sensitivity = {:.3f}', sensitivity_score, pre_GSE75890),
    ('Specificity = {:.3f}', specificity_score, pre_GSE75890),
    ('Precision = {:.3f}', precision_score, pre_GSE75890),
    ('F1 = {:.3f}', f1_score, pre_GSE75890),
):
    print(template.format(metric(y_GSE75890, scores)))

AUC = 1.000
Accuracy = 1.000
Sensitivity = 1.000
Specificity = 1.000
Precision = 1.000
F1 = 1.000


In [68]:
# Metrics for GSE78097: AUC from probabilities, the rest from predicted labels.
for template, metric, scores in (
    ('AUC = {:.3f}', roc_auc_score, pro_GSE78097),
    ('Accuracy = {:.3f}', accuracy_score, pre_GSE78097),
    ('Sensitivity = {:.3f}', sensitivity_score, pre_GSE78097),
    ('Specificity = {:.3f}', specificity_score, pre_GSE78097),
    ('Precision = {:.3f}', precision_score, pre_GSE78097),
    ('F1 = {:.3f}', f1_score, pre_GSE78097),
):
    print(template.format(metric(y_GSE78097, scores)))

AUC = 1.000
Accuracy = 0.970
Sensitivity = 1.000
Specificity = 0.833
Precision = 0.964
F1 = 0.982


In [69]:
# Metrics for GSE79704: AUC from probabilities, the rest from predicted labels.
for template, metric, scores in (
    ('AUC = {:.3f}', roc_auc_score, pro_GSE79704),
    ('Accuracy = {:.3f}', accuracy_score, pre_GSE79704),
    ('Sensitivity = {:.3f}', sensitivity_score, pre_GSE79704),
    ('Specificity = {:.3f}', specificity_score, pre_GSE79704),
    ('Precision = {:.3f}', precision_score, pre_GSE79704),
    ('F1 = {:.3f}', f1_score, pre_GSE79704),
):
    print(template.format(metric(y_GSE79704, scores)))

AUC = 1.000
Accuracy = 0.969
Sensitivity = 1.000
Specificity = 0.950
Precision = 0.923
F1 = 0.960


In [70]:
# Metrics for GSE83582: AUC from probabilities, the rest from predicted labels.
for template, metric, scores in (
    ('AUC = {:.3f}', roc_auc_score, pro_GSE83582),
    ('Accuracy = {:.3f}', accuracy_score, pre_GSE83582),
    ('Sensitivity = {:.3f}', sensitivity_score, pre_GSE83582),
    ('Specificity = {:.3f}', specificity_score, pre_GSE83582),
    ('Precision = {:.3f}', precision_score, pre_GSE83582),
    ('F1 = {:.3f}', f1_score, pre_GSE83582),
):
    print(template.format(metric(y_GSE83582, scores)))

AUC = 0.996
Accuracy = 0.938
Sensitivity = 0.917
Specificity = 0.950
Precision = 0.917
F1 = 0.917


In [71]:
# Metrics for GSE109248: AUC from probabilities, the rest from predicted labels.
for template, metric, scores in (
    ('AUC = {:.3f}', roc_auc_score, pro_GSE109248),
    ('Accuracy = {:.3f}', accuracy_score, pre_GSE109248),
    ('Sensitivity = {:.3f}', sensitivity_score, pre_GSE109248),
    ('Specificity = {:.3f}', specificity_score, pre_GSE109248),
    ('Precision = {:.3f}', precision_score, pre_GSE109248),
    ('F1 = {:.3f}', f1_score, pre_GSE109248),
):
    print(template.format(metric(y_GSE109248, scores)))

AUC = 0.998
Accuracy = 0.968
Sensitivity = 1.000
Specificity = 0.929
Precision = 0.944
F1 = 0.971


In [72]:
# Metrics for GSE114286: AUC from probabilities, the rest from predicted labels.
for template, metric, scores in (
    ('AUC = {:.3f}', roc_auc_score, pro_GSE114286),
    ('Accuracy = {:.3f}', accuracy_score, pre_GSE114286),
    ('Sensitivity = {:.3f}', sensitivity_score, pre_GSE114286),
    ('Specificity = {:.3f}', specificity_score, pre_GSE114286),
    ('Precision = {:.3f}', precision_score, pre_GSE114286),
    ('F1 = {:.3f}', f1_score, pre_GSE114286),
):
    print(template.format(metric(y_GSE114286, scores)))

AUC = 1.000
Accuracy = 1.000
Sensitivity = 1.000
Specificity = 1.000
Precision = 1.000
F1 = 1.000


In [73]:
# Metrics for GSE121212: AUC from probabilities, the rest from predicted labels.
for template, metric, scores in (
    ('AUC = {:.3f}', roc_auc_score, pro_GSE121212),
    ('Accuracy = {:.3f}', accuracy_score, pre_GSE121212),
    ('Sensitivity = {:.3f}', sensitivity_score, pre_GSE121212),
    ('Specificity = {:.3f}', specificity_score, pre_GSE121212),
    ('Precision = {:.3f}', precision_score, pre_GSE121212),
    ('F1 = {:.3f}', f1_score, pre_GSE121212),
):
    print(template.format(metric(y_GSE121212, scores)))

AUC = 0.980
Accuracy = 0.970
Sensitivity = 0.964
Specificity = 0.974
Precision = 0.964
F1 = 0.964


In [74]:
# Metrics for GSE182740: AUC from probabilities, the rest from predicted labels.
for template, metric, scores in (
    ('AUC = {:.3f}', roc_auc_score, pro_GSE182740),
    ('Accuracy = {:.3f}', accuracy_score, pre_GSE182740),
    ('Sensitivity = {:.3f}', sensitivity_score, pre_GSE182740),
    ('Specificity = {:.3f}', specificity_score, pre_GSE182740),
    ('Precision = {:.3f}', precision_score, pre_GSE182740),
    ('F1 = {:.3f}', f1_score, pre_GSE182740),
):
    print(template.format(metric(y_GSE182740, scores)))

AUC = 0.986
Accuracy = 0.952
Sensitivity = 1.000
Specificity = 0.667
Precision = 0.947
F1 = 0.973
