In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from imblearn.metrics import sensitivity_score
from imblearn.metrics import specificity_score
from sklearn.metrics import precision_score

In [4]:
# Single RNG seed shared by the CV splitter and the logistic-regression models below.
seed = 3_653_458_742

In [5]:
def _read_cohort(csv_path):
    """Read one cohort CSV and split it into ``(frame, features, labels)``.

    Column 1 becomes the label vector and columns 2 onward the feature
    matrix. Column 0 is left out of both (presumably a sample identifier --
    TODO confirm against the CSV files).
    """
    frame = pd.read_csv(csv_path)
    return frame, frame.iloc[:, 2:], frame.iloc[:, 1]


# Each line binds the raw frame, its feature matrix and its label vector.
TCGA, X_TCGA, y_TCGA = _read_cohort('data_TCGA.csv')
GSE12236, X_GSE12236, y_GSE12236 = _read_cohort('data_GSE12236.csv')
GSE12428, X_GSE12428, y_GSE12428 = _read_cohort('data_GSE12428.csv')
GSE18842, X_GSE18842, y_GSE18842 = _read_cohort('data_GSE18842.csv')
GSE29250, X_GSE29250, y_GSE29250 = _read_cohort('data_GSE29250.csv')
GSE31210, X_GSE31210, y_GSE31210 = _read_cohort('data_GSE31210.csv')
GSE31446, X_GSE31446, y_GSE31446 = _read_cohort('data_GSE31446.csv')
GSE32863, X_GSE32863, y_GSE32863 = _read_cohort('data_GSE32863.csv')
GSE33532, X_GSE33532, y_GSE33532 = _read_cohort('data_GSE33532.csv')
GSE63459, X_GSE63459, y_GSE63459 = _read_cohort('data_GSE63459.csv')
GSE75037, X_GSE75037, y_GSE75037 = _read_cohort('data_GSE75037.csv')
GSE85716, X_GSE85716, y_GSE85716 = _read_cohort('data_GSE85716.csv')
GSE85841, X_GSE85841, y_GSE85841 = _read_cohort('data_GSE85841.csv')
GSE101929, X_GSE101929, y_GSE101929 = _read_cohort('data_GSE101929.csv')
GSE115002, X_GSE115002, y_GSE115002 = _read_cohort('data_GSE115002.csv')
GSE134381, X_GSE134381, y_GSE134381 = _read_cohort('data_GSE134381.csv')

In [6]:
# Stratified 5-fold CV repeated 3 times; seeded so the folds are the same on every run.
sp = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=seed)

In [44]:
import hyperopt
from hyperopt import hp

In [45]:
def objective(params):
    """Hyperopt loss: negative mean cross-validated ROC AUC on the TCGA data.

    For every train/validation split produced by the module-level splitter
    ``sp``, a ``LogisticRegression`` is fitted on the training rows of
    ``X_TCGA``/``y_TCGA`` and scored by ROC AUC on the validation rows.

    Parameters
    ----------
    params : dict
        Sampled hyperparameters. Only ``params['C']`` is read; it is passed
        straight through as the ``C`` argument of ``LogisticRegression``.

    Returns
    -------
    float
        ``-mean(AUC)`` over all splits. The sign is flipped because
        ``hyperopt.fmin`` minimises its objective.
    """
    aucs = []
    for train_index, test_index in sp.split(X_TCGA, y_TCGA):
        X_train = X_TCGA.iloc[train_index, :]
        X_vali = X_TCGA.iloc[test_index, :]
        # The splitter yields positional indices, so select labels by
        # position too. Plain ``y_TCGA[...]`` is a label lookup and is only
        # equivalent while the Series keeps a default 0..n-1 index.
        y_train = y_TCGA.iloc[train_index]
        y_vali = y_TCGA.iloc[test_index]
        model = LogisticRegression(random_state=seed,
                                   C=params['C'])
        model.fit(X_train, y_train)
        # Column 1 of predict_proba is the probability of the second class.
        pro_vali = model.predict_proba(X_vali)[:, 1]
        aucs.append(roc_auc_score(y_vali, pro_vali))
    return -np.mean(aucs)

In [46]:
# Hyperopt search space: the value stored under 'C' is drawn uniformly
# between 0 and 10 and is read by objective() as params['C'].
space = dict(C=hp.uniform('C', 0, 10))

In [47]:
# Run 100 TPE trials minimising objective() over `space`.
# The sampler is seeded with the notebook-wide `seed` so the selected
# hyperparameters are repeatable across kernel restarts.
# NOTE(review): recent hyperopt releases expect a numpy Generator for
# `rstate`; older ones expected np.random.RandomState -- confirm against the
# installed version.
best_param = hyperopt.fmin(
    fn=objective,
    space=space,
    algo=hyperopt.tpe.suggest,
    max_evals=100,
    rstate=np.random.default_rng(seed),
)

100%|██████████| 100/100 [00:27<00:00,  3.58trial/s, best loss: -1.0]


In [48]:
# Display the hyperparameters returned by hyperopt.fmin (a dict keyed by the search-space labels).
best_param

{'C': np.float64(9.888187960446661)}

In [7]:
# Final classifier: refit on the complete TCGA table with C hard-coded to the
# value reported by the hyperopt search, so that search need not be re-run.
model = LogisticRegression(C=9.888187960446661, random_state=seed)
model.fit(X_TCGA, y_TCGA)

In [8]:
def _proba_and_label(features):
    """Score one feature matrix with the fitted ``model``.

    Returns a pair: column 1 of ``model.predict_proba(features)`` and the
    hard labels from ``model.predict(features)``.
    """
    return model.predict_proba(features)[:, 1], model.predict(features)


# pro_* holds the probabilities, pre_* the predicted labels, one pair per dataset.
pro_TCGA, pre_TCGA = _proba_and_label(X_TCGA)
pro_GSE12236, pre_GSE12236 = _proba_and_label(X_GSE12236)
pro_GSE12428, pre_GSE12428 = _proba_and_label(X_GSE12428)
pro_GSE18842, pre_GSE18842 = _proba_and_label(X_GSE18842)
pro_GSE29250, pre_GSE29250 = _proba_and_label(X_GSE29250)
pro_GSE31210, pre_GSE31210 = _proba_and_label(X_GSE31210)
pro_GSE31446, pre_GSE31446 = _proba_and_label(X_GSE31446)
pro_GSE32863, pre_GSE32863 = _proba_and_label(X_GSE32863)
pro_GSE33532, pre_GSE33532 = _proba_and_label(X_GSE33532)
pro_GSE63459, pre_GSE63459 = _proba_and_label(X_GSE63459)
pro_GSE75037, pre_GSE75037 = _proba_and_label(X_GSE75037)
pro_GSE85716, pre_GSE85716 = _proba_and_label(X_GSE85716)
pro_GSE85841, pre_GSE85841 = _proba_and_label(X_GSE85841)
pro_GSE101929, pre_GSE101929 = _proba_and_label(X_GSE101929)
pro_GSE115002, pre_GSE115002 = _proba_and_label(X_GSE115002)
pro_GSE134381, pre_GSE134381 = _proba_and_label(X_GSE134381)

In [9]:
def _save_predictions(csv_path, y_true, y_pred):
    """Write a two-column CSV ('True', 'Pre') of labels vs. predictions, without the index."""
    pd.DataFrame({'True': y_true, 'Pre': y_pred}).to_csv(csv_path, index=False)


# One output file per GSE dataset; the TCGA predictions are not exported.
_save_predictions('Logistic_GSE12236.csv', y_GSE12236, pre_GSE12236)
_save_predictions('Logistic_GSE12428.csv', y_GSE12428, pre_GSE12428)
_save_predictions('Logistic_GSE18842.csv', y_GSE18842, pre_GSE18842)
_save_predictions('Logistic_GSE29250.csv', y_GSE29250, pre_GSE29250)
_save_predictions('Logistic_GSE31210.csv', y_GSE31210, pre_GSE31210)
_save_predictions('Logistic_GSE31446.csv', y_GSE31446, pre_GSE31446)
_save_predictions('Logistic_GSE32863.csv', y_GSE32863, pre_GSE32863)
_save_predictions('Logistic_GSE33532.csv', y_GSE33532, pre_GSE33532)
_save_predictions('Logistic_GSE63459.csv', y_GSE63459, pre_GSE63459)
_save_predictions('Logistic_GSE75037.csv', y_GSE75037, pre_GSE75037)
_save_predictions('Logistic_GSE85716.csv', y_GSE85716, pre_GSE85716)
_save_predictions('Logistic_GSE85841.csv', y_GSE85841, pre_GSE85841)
_save_predictions('Logistic_GSE101929.csv', y_GSE101929, pre_GSE101929)
_save_predictions('Logistic_GSE115002.csv', y_GSE115002, pre_GSE115002)
_save_predictions('Logistic_GSE134381.csv', y_GSE134381, pre_GSE134381)

In [51]:
# TCGA metrics. The model was fitted on these same rows, so these are
# in-sample figures. AUC uses the probabilities; the rest use hard labels.
print(
    'AUC = {:.3f}'.format(roc_auc_score(y_TCGA, pro_TCGA)),
    'Accuracy = {:.3f}'.format(accuracy_score(y_TCGA, pre_TCGA)),
    'Sensitivity = {:.3f}'.format(sensitivity_score(y_TCGA, pre_TCGA)),
    'Specificity = {:.3f}'.format(specificity_score(y_TCGA, pre_TCGA)),
    'Precision = {:.3f}'.format(precision_score(y_TCGA, pre_TCGA)),
    'F1 = {:.3f}'.format(f1_score(y_TCGA, pre_TCGA)),
    sep='\n',
)

AUC = 1.000
Accuracy = 1.000
Sensitivity = 1.000
Specificity = 1.000
Precision = 1.000
F1 = 1.000


In [52]:
# GSE12236 metrics (dataset not used for fitting). AUC uses the
# probabilities; the remaining scores use the hard labels.
print(
    'AUC = {:.3f}'.format(roc_auc_score(y_GSE12236, pro_GSE12236)),
    'Accuracy = {:.3f}'.format(accuracy_score(y_GSE12236, pre_GSE12236)),
    'Sensitivity = {:.3f}'.format(sensitivity_score(y_GSE12236, pre_GSE12236)),
    'Specificity = {:.3f}'.format(specificity_score(y_GSE12236, pre_GSE12236)),
    'Precision = {:.3f}'.format(precision_score(y_GSE12236, pre_GSE12236)),
    'F1 = {:.3f}'.format(f1_score(y_GSE12236, pre_GSE12236)),
    sep='\n',
)

AUC = 1.000
Accuracy = 0.825
Sensitivity = 1.000
Specificity = 0.650
Precision = 0.741
F1 = 0.851


In [53]:
# GSE12428 metrics (dataset not used for fitting). AUC uses the
# probabilities; the remaining scores use the hard labels.
print(
    'AUC = {:.3f}'.format(roc_auc_score(y_GSE12428, pro_GSE12428)),
    'Accuracy = {:.3f}'.format(accuracy_score(y_GSE12428, pre_GSE12428)),
    'Sensitivity = {:.3f}'.format(sensitivity_score(y_GSE12428, pre_GSE12428)),
    'Specificity = {:.3f}'.format(specificity_score(y_GSE12428, pre_GSE12428)),
    'Precision = {:.3f}'.format(precision_score(y_GSE12428, pre_GSE12428)),
    'F1 = {:.3f}'.format(f1_score(y_GSE12428, pre_GSE12428)),
    sep='\n',
)

AUC = 0.807
Accuracy = 0.548
Sensitivity = 1.000
Specificity = 0.000
Precision = 0.548
F1 = 0.708


In [54]:
# GSE18842 metrics (dataset not used for fitting). AUC uses the
# probabilities; the remaining scores use the hard labels.
print(
    'AUC = {:.3f}'.format(roc_auc_score(y_GSE18842, pro_GSE18842)),
    'Accuracy = {:.3f}'.format(accuracy_score(y_GSE18842, pre_GSE18842)),
    'Sensitivity = {:.3f}'.format(sensitivity_score(y_GSE18842, pre_GSE18842)),
    'Specificity = {:.3f}'.format(specificity_score(y_GSE18842, pre_GSE18842)),
    'Precision = {:.3f}'.format(precision_score(y_GSE18842, pre_GSE18842)),
    'F1 = {:.3f}'.format(f1_score(y_GSE18842, pre_GSE18842)),
    sep='\n',
)

AUC = 1.000
Accuracy = 0.857
Sensitivity = 1.000
Specificity = 0.711
Precision = 0.780
F1 = 0.876


In [55]:
# GSE29250 metrics (dataset not used for fitting). AUC uses the
# probabilities; the remaining scores use the hard labels.
print(
    'AUC = {:.3f}'.format(roc_auc_score(y_GSE29250, pro_GSE29250)),
    'Accuracy = {:.3f}'.format(accuracy_score(y_GSE29250, pre_GSE29250)),
    'Sensitivity = {:.3f}'.format(sensitivity_score(y_GSE29250, pre_GSE29250)),
    'Specificity = {:.3f}'.format(specificity_score(y_GSE29250, pre_GSE29250)),
    'Precision = {:.3f}'.format(precision_score(y_GSE29250, pre_GSE29250)),
    'F1 = {:.3f}'.format(f1_score(y_GSE29250, pre_GSE29250)),
    sep='\n',
)

AUC = 0.944
Accuracy = 0.750
Sensitivity = 0.833
Specificity = 0.667
Precision = 0.714
F1 = 0.769


In [56]:
# GSE31210 metrics (dataset not used for fitting). AUC uses the
# probabilities; the remaining scores use the hard labels.
print(
    'AUC = {:.3f}'.format(roc_auc_score(y_GSE31210, pro_GSE31210)),
    'Accuracy = {:.3f}'.format(accuracy_score(y_GSE31210, pre_GSE31210)),
    'Sensitivity = {:.3f}'.format(sensitivity_score(y_GSE31210, pre_GSE31210)),
    'Specificity = {:.3f}'.format(specificity_score(y_GSE31210, pre_GSE31210)),
    'Precision = {:.3f}'.format(precision_score(y_GSE31210, pre_GSE31210)),
    'F1 = {:.3f}'.format(f1_score(y_GSE31210, pre_GSE31210)),
    sep='\n',
)

AUC = 0.973
Accuracy = 0.947
Sensitivity = 0.991
Specificity = 0.450
Precision = 0.953
F1 = 0.972


In [57]:
# GSE31446 metrics (dataset not used for fitting). AUC uses the
# probabilities; the remaining scores use the hard labels.
print(
    'AUC = {:.3f}'.format(roc_auc_score(y_GSE31446, pro_GSE31446)),
    'Accuracy = {:.3f}'.format(accuracy_score(y_GSE31446, pre_GSE31446)),
    'Sensitivity = {:.3f}'.format(sensitivity_score(y_GSE31446, pre_GSE31446)),
    'Specificity = {:.3f}'.format(specificity_score(y_GSE31446, pre_GSE31446)),
    'Precision = {:.3f}'.format(precision_score(y_GSE31446, pre_GSE31446)),
    'F1 = {:.3f}'.format(f1_score(y_GSE31446, pre_GSE31446)),
    sep='\n',
)

AUC = 0.971
Accuracy = 0.734
Sensitivity = 1.000
Specificity = 0.433
Precision = 0.667
F1 = 0.800


In [58]:
# GSE32863 metrics (dataset not used for fitting). AUC uses the
# probabilities; the remaining scores use the hard labels.
print(
    'AUC = {:.3f}'.format(roc_auc_score(y_GSE32863, pro_GSE32863)),
    'Accuracy = {:.3f}'.format(accuracy_score(y_GSE32863, pre_GSE32863)),
    'Sensitivity = {:.3f}'.format(sensitivity_score(y_GSE32863, pre_GSE32863)),
    'Specificity = {:.3f}'.format(specificity_score(y_GSE32863, pre_GSE32863)),
    'Precision = {:.3f}'.format(precision_score(y_GSE32863, pre_GSE32863)),
    'F1 = {:.3f}'.format(f1_score(y_GSE32863, pre_GSE32863)),
    sep='\n',
)

AUC = 0.996
Accuracy = 0.879
Sensitivity = 1.000
Specificity = 0.759
Precision = 0.806
F1 = 0.892


In [59]:
# GSE33532 metrics (dataset not used for fitting). AUC uses the
# probabilities; the remaining scores use the hard labels.
print(
    'AUC = {:.3f}'.format(roc_auc_score(y_GSE33532, pro_GSE33532)),
    'Accuracy = {:.3f}'.format(accuracy_score(y_GSE33532, pre_GSE33532)),
    'Sensitivity = {:.3f}'.format(sensitivity_score(y_GSE33532, pre_GSE33532)),
    'Specificity = {:.3f}'.format(specificity_score(y_GSE33532, pre_GSE33532)),
    'Precision = {:.3f}'.format(precision_score(y_GSE33532, pre_GSE33532)),
    'F1 = {:.3f}'.format(f1_score(y_GSE33532, pre_GSE33532)),
    sep='\n',
)

AUC = 0.986
Accuracy = 0.920
Sensitivity = 1.000
Specificity = 0.600
Precision = 0.909
F1 = 0.952


In [60]:
# GSE63459 metrics (dataset not used for fitting). AUC uses the
# probabilities; the remaining scores use the hard labels.
print(
    'AUC = {:.3f}'.format(roc_auc_score(y_GSE63459, pro_GSE63459)),
    'Accuracy = {:.3f}'.format(accuracy_score(y_GSE63459, pre_GSE63459)),
    'Sensitivity = {:.3f}'.format(sensitivity_score(y_GSE63459, pre_GSE63459)),
    'Specificity = {:.3f}'.format(specificity_score(y_GSE63459, pre_GSE63459)),
    'Precision = {:.3f}'.format(precision_score(y_GSE63459, pre_GSE63459)),
    'F1 = {:.3f}'.format(f1_score(y_GSE63459, pre_GSE63459)),
    sep='\n',
)

AUC = 0.914
Accuracy = 0.554
Sensitivity = 0.970
Specificity = 0.125
Precision = 0.533
F1 = 0.688


In [61]:
# GSE75037 metrics (dataset not used for fitting). AUC uses the
# probabilities; the remaining scores use the hard labels.
print(
    'AUC = {:.3f}'.format(roc_auc_score(y_GSE75037, pro_GSE75037)),
    'Accuracy = {:.3f}'.format(accuracy_score(y_GSE75037, pre_GSE75037)),
    'Sensitivity = {:.3f}'.format(sensitivity_score(y_GSE75037, pre_GSE75037)),
    'Specificity = {:.3f}'.format(specificity_score(y_GSE75037, pre_GSE75037)),
    'Precision = {:.3f}'.format(precision_score(y_GSE75037, pre_GSE75037)),
    'F1 = {:.3f}'.format(f1_score(y_GSE75037, pre_GSE75037)),
    sep='\n',
)

AUC = 1.000
Accuracy = 0.964
Sensitivity = 1.000
Specificity = 0.928
Precision = 0.933
F1 = 0.965


In [62]:
# GSE85716 metrics (dataset not used for fitting). AUC uses the
# probabilities; the remaining scores use the hard labels.
print(
    'AUC = {:.3f}'.format(roc_auc_score(y_GSE85716, pro_GSE85716)),
    'Accuracy = {:.3f}'.format(accuracy_score(y_GSE85716, pre_GSE85716)),
    'Sensitivity = {:.3f}'.format(sensitivity_score(y_GSE85716, pre_GSE85716)),
    'Specificity = {:.3f}'.format(specificity_score(y_GSE85716, pre_GSE85716)),
    'Precision = {:.3f}'.format(precision_score(y_GSE85716, pre_GSE85716)),
    'F1 = {:.3f}'.format(f1_score(y_GSE85716, pre_GSE85716)),
    sep='\n',
)

AUC = 1.000
Accuracy = 0.833
Sensitivity = 1.000
Specificity = 0.667
Precision = 0.750
F1 = 0.857


In [63]:
# GSE85841 metrics (dataset not used for fitting). AUC uses the
# probabilities; the remaining scores use the hard labels.
print(
    'AUC = {:.3f}'.format(roc_auc_score(y_GSE85841, pro_GSE85841)),
    'Accuracy = {:.3f}'.format(accuracy_score(y_GSE85841, pre_GSE85841)),
    'Sensitivity = {:.3f}'.format(sensitivity_score(y_GSE85841, pre_GSE85841)),
    'Specificity = {:.3f}'.format(specificity_score(y_GSE85841, pre_GSE85841)),
    'Precision = {:.3f}'.format(precision_score(y_GSE85841, pre_GSE85841)),
    'F1 = {:.3f}'.format(f1_score(y_GSE85841, pre_GSE85841)),
    sep='\n',
)

AUC = 1.000
Accuracy = 0.938
Sensitivity = 1.000
Specificity = 0.875
Precision = 0.889
F1 = 0.941


In [64]:
# GSE101929 metrics (dataset not used for fitting). AUC uses the
# probabilities; the remaining scores use the hard labels.
print(
    'AUC = {:.3f}'.format(roc_auc_score(y_GSE101929, pro_GSE101929)),
    'Accuracy = {:.3f}'.format(accuracy_score(y_GSE101929, pre_GSE101929)),
    'Sensitivity = {:.3f}'.format(sensitivity_score(y_GSE101929, pre_GSE101929)),
    'Specificity = {:.3f}'.format(specificity_score(y_GSE101929, pre_GSE101929)),
    'Precision = {:.3f}'.format(precision_score(y_GSE101929, pre_GSE101929)),
    'F1 = {:.3f}'.format(f1_score(y_GSE101929, pre_GSE101929)),
    sep='\n',
)

AUC = 0.991
Accuracy = 0.864
Sensitivity = 1.000
Specificity = 0.735
Precision = 0.780
F1 = 0.877


In [65]:
# GSE115002 metrics (dataset not used for fitting). AUC uses the
# probabilities; the remaining scores use the hard labels.
print(
    'AUC = {:.3f}'.format(roc_auc_score(y_GSE115002, pro_GSE115002)),
    'Accuracy = {:.3f}'.format(accuracy_score(y_GSE115002, pre_GSE115002)),
    'Sensitivity = {:.3f}'.format(sensitivity_score(y_GSE115002, pre_GSE115002)),
    'Specificity = {:.3f}'.format(specificity_score(y_GSE115002, pre_GSE115002)),
    'Precision = {:.3f}'.format(precision_score(y_GSE115002, pre_GSE115002)),
    'F1 = {:.3f}'.format(f1_score(y_GSE115002, pre_GSE115002)),
    sep='\n',
)

AUC = 0.991
Accuracy = 0.904
Sensitivity = 0.981
Specificity = 0.827
Precision = 0.850
F1 = 0.911


In [66]:
# GSE134381 metrics (dataset not used for fitting). AUC uses the
# probabilities; the remaining scores use the hard labels.
print(
    'AUC = {:.3f}'.format(roc_auc_score(y_GSE134381, pro_GSE134381)),
    'Accuracy = {:.3f}'.format(accuracy_score(y_GSE134381, pre_GSE134381)),
    'Sensitivity = {:.3f}'.format(sensitivity_score(y_GSE134381, pre_GSE134381)),
    'Specificity = {:.3f}'.format(specificity_score(y_GSE134381, pre_GSE134381)),
    'Precision = {:.3f}'.format(precision_score(y_GSE134381, pre_GSE134381)),
    'F1 = {:.3f}'.format(f1_score(y_GSE134381, pre_GSE134381)),
    sep='\n',
)

AUC = 0.750
Accuracy = 0.676
Sensitivity = 0.919
Specificity = 0.432
Precision = 0.618
F1 = 0.739
