Compare difference Model results

In [None]:
"""
Compare the predictive performance of different models:
· Support Vector Machine (SVM)
· Random Forest (RF)
· EXtreme Gradient Boosting (XGBoost)

"""

In [1]:
import sklearn
from sklearn import feature_selection as fs
 
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn import tree
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB, CategoricalNB, ComplementNB
from sklearn.svm import SVC, NuSVC, LinearSVC
from sklearn.neighbors import NearestNeighbors,KDTree,BallTree
from sklearn.ensemble import GradientBoostingClassifier

from catboost import CatBoostClassifier
import xgboost as xgb

In [3]:
from sklearn.metrics import confusion_matrix, accuracy_score, roc_curve, auc, r2_score
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV
from sklearn.metrics import precision_recall_curve, precision_score, average_precision_score, recall_score, f1_score

In [4]:
from imblearn.over_sampling import RandomOverSampler
from collections import Counter
from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.under_sampling import ClusterCentroids, EditedNearestNeighbours
from imblearn.under_sampling import RandomUnderSampler

load data

In [5]:
######### TCGA discovery features #############
data_all = pd.read_csv('../Data/radiomic_feature/radio_discovery.csv',index_col=0)

label_caf = np.array(data_all['CAF'])
label_tnbc = np.array(data_all['tnbc'])

data_feature = data_all.drop(['CAF', 'tnbc'], axis=1)
feature_name = data_feature.columns

print(data_feature.shape)
print(label_caf.shape)
print(label_tnbc.shape)

(518, 36)
(518,)
(518,)


In [6]:
##################### Radiogenomics Validation features #####################
data_tcga = pd.read_csv('../Data/radiomic_feature/radio_val.csv',index_col=0)

data_tcga1 = data_tcga[data_tcga['tnbc'] != 2]
label_tcga = np.array(data_tcga1['tnbc'])
data_feature_tcga = data_tcga1.drop(['CAF', 'tnbc'], axis=1)
data_feature_tcga.shape

(178, 36)

In [7]:
######################## Radiomics Features (ISPY) #########################
data_ispy = pd.read_csv('../Data/radiomic_feature/radio_ispy.csv',index_col=0)

label_ispy = np.array(data_ispy['tnbc'])
data_feature_ispy = data_ispy.drop(['tnbc'], axis=1)

data_feature_ispy.shape

(1252, 36)

In [8]:
##################### Radiomics Features (UCSF) #################################

data_ucsf = pd.read_csv('../Data/radiomic_feature/radio_ucsf.csv',index_col=0)
label_ucsf = np.array(data_ucsf['tnbc'])
data_feature_ucsf = data_ucsf.drop(['tnbc'], axis=1)

print(data_feature_ucsf.shape)
print(label_ucsf.shape)

(445, 36)
(445,)


In [9]:
####################   Clinical   #####################

data_sysucc = pd.read_csv('../Data/radiomic_feature/radio_sysucc.csv',index_col=0)

label_sysucc = np.array(data_sysucc['tnbc'])

data_feature_sysucc = data_sysucc.drop(['tnbc'], axis=1)
data_feature_sysucc.shape

(1571, 36)

data processing

In [10]:
# radiogenomics discovery cohort 

X_train_1, X_test_1,  y_train_1, y_test_1 = train_test_split(
        data_feature, label_tnbc, test_size=0.2, stratify=label_tnbc, random_state=42
    )

print(Counter(y_train_1),Counter(y_test_1))
data_feature.shape, X_train_1.shape, X_test_1.shape

Counter({0: 362, 1: 52}) Counter({0: 91, 1: 13})


((518, 36), (414, 36), (104, 36))

In [11]:
# radiogenomics discovery cohort 

X_train_2,X_test_2,y_train_2,y_test_2 = train_test_split(
        data_feature_tcga, label_tcga, test_size=0.2, stratify=label_tcga, random_state=42
    )

print(Counter(y_train_2),Counter(y_test_2))
data_feature_tcga.shape, X_train_2.shape, X_test_2.shape

Counter({0: 125, 1: 17}) Counter({0: 32, 1: 4})


((178, 36), (142, 36), (36, 36))

In [12]:
# radiomics : ISPY

X_train_3,X_test_3,y_train_3,y_test_3 = train_test_split(
        data_feature_ispy, label_ispy, test_size=0.2, stratify=label_ispy, random_state=0
    )

print(Counter(y_train_3), Counter(y_test_3))
data_feature_ispy.shape, X_train_3.shape, X_test_3.shape

Counter({0: 744, 1: 257}) Counter({0: 187, 1: 64})


((1252, 36), (1001, 36), (251, 36))

In [13]:
# radiomics : UCSF

X_train_4,X_test_4,y_train_4,y_test_4 = train_test_split(
        data_feature_ucsf, label_ucsf, test_size=0.2, stratify=label_ucsf, random_state=42
    )

print(Counter(y_train_4),Counter(y_test_4))
data_feature_tcga.shape, X_train_4.shape, X_test_4.shape

Counter({0: 232, 1: 124}) Counter({0: 58, 1: 31})


((178, 36), (356, 36), (89, 36))

In [14]:
####################   Clinical   #####################

X_train_5,X_test_5,y_train_5,y_test_5 = train_test_split(
        data_feature_sysucc, label_sysucc, test_size=0.2, stratify=label_sysucc, random_state=0
    )

print(Counter(y_train_5),Counter(y_test_5))
data_feature_sysucc.shape, X_train_5.shape, X_test_5.shape

Counter({0: 740, 1: 516}) Counter({0: 186, 1: 129})


((1571, 36), (1256, 36), (315, 36))

Train model

radiogenomic discovery cohort

In [15]:
############### Discovery #######################
discovery_params = {
    'n_estimators': 10, 
    'learning_rate': 0.05, 
    'max_depth': 5, 
    'min_child_weight': 1, 
    'gamma': 0, 
    'subsample': 1, 
    'colsample_bytree': 0.6, 
    'reg_alpha': 0.1, 
    'reg_lambda': 2, 
    'scale_pos_weight': 6.9, 
    'objective': 'binary:logistic', 
    'random_state': 42,
    'max_delta_step': 0
}
xgb_model = xgb.XGBClassifier(**discovery_params)
xgb_model.fit(X_train_1, y_train_1)

In [16]:
rf_model = RandomForestClassifier()
rf_model.fit(X_train_1, y_train_1)

In [17]:
svm_model = SVC(probability=True)
svm_model.fit(X_train_1, y_train_1)

In [18]:
y_pred_train = xgb_model.predict(X_train_1)
acc_train = accuracy_score(y_pred=y_pred_train, y_true=y_train_1)

pred_prob = xgb_model.predict_proba(X_test_1)
fpr, tpr, thresholds = roc_curve(y_score=pred_prob[:, 1], y_true=y_test_1)
roc_auc = auc(fpr, tpr)
y_pred = (pred_prob[:, 1] >= 0.5).astype(int)
acc_con1 = confusion_matrix(y_pred=y_pred, y_true=y_test_1)
acc_pred1 = accuracy_score(y_pred=y_pred, y_true=y_test_1)

tn, fp, fn, tp = acc_con1.ravel()
# Specificity
spec_score = tn / (tn + fp) if (tn + fp) > 0 else 0 

precision, recall, thresholds = precision_recall_curve(y_pred, y_test_1)
pr_auc = auc(recall, precision)

preci_scores = precision_score(y_pred=y_pred, y_true=y_test_1)
recall_scores = recall_score(y_pred=y_pred, y_true=y_test_1)


print("Accuracy Train: ", acc_train)
print("Accuracy Test: ", acc_pred1)
print("Confusion Matrix: ")
print(acc_con1) 
print("ROC AUC: ", roc_auc)
print("PR AUC: ", pr_auc)

print('Recall / Sensitivity: ', recall_scores)
print('Specificity: ', spec_score)


Accuracy Train:  0.9347826086956522
Accuracy Test:  0.8942307692307693
Confusion Matrix: 
[[83  8]
 [ 3 10]]
ROC AUC:  0.7709213863060017
PR AUC:  0.7008547008547008
Recall / Sensitivity:  0.7692307692307693
Specificity:  0.9120879120879121


In [19]:
y_pred_train = rf_model.predict(X_train_1)
acc_train = accuracy_score(y_pred=y_pred_train, y_true=y_train_1)

pred_prob = rf_model.predict_proba(X_test_1)
fpr, tpr, thresholds = roc_curve(y_score=pred_prob[:, 1], y_true=y_test_1)
roc_auc = auc(fpr, tpr)
y_pred = (pred_prob[:, 1] > 0.5).astype(int)
acc_con1 = confusion_matrix(y_pred=y_pred, y_true=y_test_1)
acc_pred1 = accuracy_score(y_pred=y_pred, y_true=y_test_1)

tn, fp, fn, tp = acc_con1.ravel()
# Specificity
spec_score = tn / (tn + fp) if (tn + fp) > 0 else 0 

precision, recall, thresholds = precision_recall_curve(y_pred, y_test_1)
pr_auc = auc(recall, precision)

preci_scores = precision_score(y_pred=y_pred, y_true=y_test_1)
recall_scores = recall_score(y_pred=y_pred, y_true=y_test_1)


print("Accuracy Train: ", acc_train)
print("Accuracy Test: ", acc_pred1)
print("Confusion Matrix: ")
print(acc_con1) 
print("ROC AUC: ", roc_auc)
print("PR AUC: ", pr_auc)

print('Recall / Sensitivity: ', recall_scores)
print('Specificity: ', spec_score)


Accuracy Train:  1.0
Accuracy Test:  0.875
Confusion Matrix: 
[[91  0]
 [13  0]]
ROC AUC:  0.7121724429416737
PR AUC:  0.5
Recall / Sensitivity:  0.0
Specificity:  1.0


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [20]:
y_pred_train = svm_model.predict(X_train_1)
acc_train = accuracy_score(y_pred=y_pred_train, y_true=y_train_1)

pred_prob = svm_model.predict_proba(X_test_1)
fpr, tpr, thresholds = roc_curve(y_score=pred_prob[:, 1], y_true=y_test_1)
roc_auc = auc(fpr, tpr)
y_pred = (pred_prob[:, 1] > 0.5).astype(int)
acc_con1 = confusion_matrix(y_pred=y_pred, y_true=y_test_1)
acc_pred1 = accuracy_score(y_pred=y_pred, y_true=y_test_1)

tn, fp, fn, tp = acc_con1.ravel()
# Specificity
spec_score = tn / (tn + fp) if (tn + fp) > 0 else 0 

precision, recall, thresholds = precision_recall_curve(y_pred, y_test_1)
pr_auc = auc(recall, precision)

preci_scores = precision_score(y_pred=y_pred, y_true=y_test_1)
recall_scores = recall_score(y_pred=y_pred, y_true=y_test_1)


print("Accuracy Train: ", acc_train)
print("Accuracy Test: ", acc_pred1)
print("Confusion Matrix: ")
print(acc_con1) 
print("ROC AUC: ", roc_auc)
print("PR AUC: ", pr_auc)

print('Recall / Sensitivity: ', recall_scores)
print('Specificity: ', spec_score)


Accuracy Train:  0.8743961352657005
Accuracy Test:  0.875
Confusion Matrix: 
[[91  0]
 [13  0]]
ROC AUC:  0.661876584953508
PR AUC:  0.5
Recall / Sensitivity:  0.0
Specificity:  1.0


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


radiogenomic validation cohort

In [21]:
############### Val #######################

val_params = {
    'n_estimators': 55, 
    'learning_rate': 0.91, 
    'max_depth': 3, 
    'min_child_weight': 4, 
    'gamma': 0, 
    'subsample': 0.5, 
    'colsample_bytree': 0.2, 
    'reg_alpha': 0.6, 
    'reg_lambda': 0.6, 
    'scale_pos_weight': 8, 
    'objective': 'binary:logistic', 
    'eval_metric': 'aucpr', 
    'random_state': 42
}

xgb_model = xgb.XGBClassifier(**val_params)
xgb_model.fit(X_train_2, y_train_2)

In [22]:
rf_model = RandomForestClassifier()
rf_model.fit(X_train_2, y_train_2)

In [23]:
svm_model = SVC(probability=True)
svm_model.fit(X_train_2, y_train_2)

In [24]:
y_pred_train = xgb_model.predict(X_train_2)
acc_train = accuracy_score(y_pred=y_pred_train, y_true=y_train_2)

pred_prob = xgb_model.predict_proba(X_test_2)
fpr, tpr, thresholds = roc_curve(y_score=pred_prob[:, 1], y_true=y_test_2)
roc_auc = auc(fpr, tpr)
y_pred = (pred_prob[:, 1] > 0.5).astype(int)
acc_con1 = confusion_matrix(y_pred=y_pred, y_true=y_test_2)
acc_pred1 = accuracy_score(y_pred=y_pred, y_true=y_test_2)

tn, fp, fn, tp = acc_con1.ravel()
# Specificity
spec_score = tn / (tn + fp) if (tn + fp) > 0 else 0 

precision, recall, thresholds = precision_recall_curve(y_pred, y_test_2)
pr_auc = auc(recall, precision)

preci_scores = precision_score(y_pred=y_pred, y_true=y_test_2)
recall_scores = recall_score(y_pred=y_pred, y_true=y_test_2)


print("Accuracy Train: ", acc_train)
print("Accuracy Test: ", acc_pred1)
print("Confusion Matrix: ")
print(acc_con1) 
print("ROC AUC: ", roc_auc)
print("PR AUC: ", pr_auc)

print('Recall / Sensitivity: ', recall_scores)
print('Specificity: ', spec_score)


Accuracy Train:  0.9436619718309859
Accuracy Test:  0.9444444444444444
Confusion Matrix: 
[[31  1]
 [ 1  3]]
ROC AUC:  0.9375
PR AUC:  0.7638888888888888
Recall / Sensitivity:  0.75
Specificity:  0.96875


In [25]:
y_pred_train = rf_model.predict(X_train_2)
acc_train = accuracy_score(y_pred=y_pred_train, y_true=y_train_2)

pred_prob = rf_model.predict_proba(X_test_2)
fpr, tpr, thresholds = roc_curve(y_score=pred_prob[:, 1], y_true=y_test_2)
roc_auc = auc(fpr, tpr)
y_pred = (pred_prob[:, 1] > 0.5).astype(int)
acc_con1 = confusion_matrix(y_pred=y_pred, y_true=y_test_2)
acc_pred1 = accuracy_score(y_pred=y_pred, y_true=y_test_2)

tn, fp, fn, tp = acc_con1.ravel()
# Specificity
spec_score = tn / (tn + fp) if (tn + fp) > 0 else 0 

precision, recall, thresholds = precision_recall_curve(y_pred, y_test_2)
pr_auc = auc(recall, precision)

preci_scores = precision_score(y_pred=y_pred, y_true=y_test_2)
recall_scores = recall_score(y_pred=y_pred, y_true=y_test_2)


print("Accuracy Train: ", acc_train)
print("Accuracy Test: ", acc_pred1)
print("Confusion Matrix: ")
print(acc_con1) 
print("ROC AUC: ", roc_auc)
print("PR AUC: ", pr_auc)

print('Recall / Sensitivity: ', recall_scores)
print('Specificity: ', spec_score)


Accuracy Train:  1.0
Accuracy Test:  0.8888888888888888
Confusion Matrix: 
[[32  0]
 [ 4  0]]
ROC AUC:  0.84375
PR AUC:  0.5
Recall / Sensitivity:  0.0
Specificity:  1.0


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [26]:
y_pred_train = svm_model.predict(X_train_2)
acc_train = accuracy_score(y_pred=y_pred_train, y_true=y_train_2)

pred_prob = svm_model.predict_proba(X_test_2)
fpr, tpr, thresholds = roc_curve(y_score=pred_prob[:, 1], y_true=y_test_2)
roc_auc = auc(fpr, tpr)
y_pred = (pred_prob[:, 1] > 0.5).astype(int)
acc_con1 = confusion_matrix(y_pred=y_pred, y_true=y_test_2)
acc_pred1 = accuracy_score(y_pred=y_pred, y_true=y_test_2)

tn, fp, fn, tp = acc_con1.ravel()
# Specificity
spec_score = tn / (tn + fp) if (tn + fp) > 0 else 0 

precision, recall, thresholds = precision_recall_curve(y_pred, y_test_2)
pr_auc = auc(recall, precision)

preci_scores = precision_score(y_pred=y_pred, y_true=y_test_2)
recall_scores = recall_score(y_pred=y_pred, y_true=y_test_2)


print("Accuracy Train: ", acc_train)
print("Accuracy Test: ", acc_pred1)
print("Confusion Matrix: ")
print(acc_con1) 
print("ROC AUC: ", roc_auc)
print("PR AUC: ", pr_auc)

print('Recall / Sensitivity: ', recall_scores)
print('Specificity: ', spec_score)


Accuracy Train:  0.8802816901408451
Accuracy Test:  0.8888888888888888
Confusion Matrix: 
[[32  0]
 [ 4  0]]
ROC AUC:  0.59375
PR AUC:  0.5
Recall / Sensitivity:  0.0
Specificity:  1.0


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


radiomic cohort: ISPY

In [27]:
############### ISPY #######################

ispy_params = {
    'n_estimators': 70, 
    'learning_rate': 0.1, 
    'max_depth': 6, 
    'min_child_weight': 1, 
    'gamma': 0.4, 
    'subsample': 0.9, 
    'colsample_bytree': 0.6, 
    'reg_alpha': 1.06, 
    'reg_lambda': 4.48, 
    'scale_pos_weight': 7.5, 
    'objective': 'binary:logistic', 
    'eval_metric': 'aucpr', 
    'random_state': 42,
    'max_delta_step': 2,
    'tree_method': 'hist'
}
xgb_model = xgb.XGBClassifier(**ispy_params)
xgb_model.fit(X_train_3, y_train_3)

In [28]:
rf_model = RandomForestClassifier()
rf_model.fit(X_train_3, y_train_3)

In [29]:
svm_model = SVC(probability=True)
svm_model.fit(X_train_3, y_train_3)

In [30]:
y_pred_train = xgb_model.predict(X_train_3)
acc_train = accuracy_score(y_pred=y_pred_train, y_true=y_train_3)

pred_prob = xgb_model.predict_proba(X_test_3)
fpr, tpr, thresholds = roc_curve(y_score=pred_prob[:, 1], y_true=y_test_3)
roc_auc = auc(fpr, tpr)
y_pred = (pred_prob[:, 1] > 0.509).astype(int)
acc_con1 = confusion_matrix(y_pred=y_pred, y_true=y_test_3)
acc_pred1 = accuracy_score(y_pred=y_pred, y_true=y_test_3)

tn, fp, fn, tp = acc_con1.ravel()
# Specificity
spec_score = tn / (tn + fp) if (tn + fp) > 0 else 0 

precision, recall, thresholds = precision_recall_curve(y_pred, y_test_3)
pr_auc = auc(recall, precision)

preci_scores = precision_score(y_pred=y_pred, y_true=y_test_3)
recall_scores = recall_score(y_pred=y_pred, y_true=y_test_3)


print("Accuracy Train: ", acc_train)
print("Accuracy Test: ", acc_pred1)
print("Confusion Matrix: ")
print(acc_con1) 
print("ROC AUC: ", roc_auc)
print("PR AUC: ", pr_auc)

print('Recall / Sensitivity: ', recall_scores)
print('Specificity: ', spec_score)


Accuracy Train:  0.961038961038961
Accuracy Test:  0.7330677290836654
Confusion Matrix: 
[[137  50]
 [ 17  47]]
ROC AUC:  0.7263536096256685
PR AUC:  0.7090571348626113
Recall / Sensitivity:  0.734375
Specificity:  0.732620320855615


In [31]:
y_pred_train = rf_model.predict(X_train_3)
acc_train = accuracy_score(y_pred=y_pred_train, y_true=y_train_3)

pred_prob = rf_model.predict_proba(X_test_3)
fpr, tpr, thresholds = roc_curve(y_score=pred_prob[:, 1], y_true=y_test_3)
roc_auc = auc(fpr, tpr)
y_pred = (pred_prob[:, 1] > 0.5).astype(int)
acc_con1 = confusion_matrix(y_pred=y_pred, y_true=y_test_3)
acc_pred1 = accuracy_score(y_pred=y_pred, y_true=y_test_3)

tn, fp, fn, tp = acc_con1.ravel()
# Specificity
spec_score = tn / (tn + fp) if (tn + fp) > 0 else 0 

precision, recall, thresholds = precision_recall_curve(y_pred, y_test_3)
pr_auc = auc(recall, precision)

preci_scores = precision_score(y_pred=y_pred, y_true=y_test_3)
recall_scores = recall_score(y_pred=y_pred, y_true=y_test_3)


print("Accuracy Train: ", acc_train)
print("Accuracy Test: ", acc_pred1)
print("Confusion Matrix: ")
print(acc_con1) 
print("ROC AUC: ", roc_auc)
print("PR AUC: ", pr_auc)

print('Recall / Sensitivity: ', recall_scores)
print('Specificity: ', spec_score)


Accuracy Train:  1.0
Accuracy Test:  0.7410358565737052
Confusion Matrix: 
[[180   7]
 [ 58   6]]
ROC AUC:  0.6382436497326204
PR AUC:  0.2915884538768005
Recall / Sensitivity:  0.09375
Specificity:  0.9625668449197861


In [32]:
y_pred_train = svm_model.predict(X_train_3)
acc_train = accuracy_score(y_pred=y_pred_train, y_true=y_train_3)

pred_prob = svm_model.predict_proba(X_test_3)
fpr, tpr, thresholds = roc_curve(y_score=pred_prob[:, 1], y_true=y_test_3)
roc_auc = auc(fpr, tpr)
y_pred = (pred_prob[:, 1] > 0.5).astype(int)
acc_con1 = confusion_matrix(y_pred=y_pred, y_true=y_test_3)
acc_pred1 = accuracy_score(y_pred=y_pred, y_true=y_test_3)

tn, fp, fn, tp = acc_con1.ravel()
# Specificity
spec_score = tn / (tn + fp) if (tn + fp) > 0 else 0 

precision, recall, thresholds = precision_recall_curve(y_pred, y_test_3)
pr_auc = auc(recall, precision)

preci_scores = precision_score(y_pred=y_pred, y_true=y_test_3)
recall_scores = recall_score(y_pred=y_pred, y_true=y_test_3)


print("Accuracy Train: ", acc_train)
print("Accuracy Test: ", acc_pred1)
print("Confusion Matrix: ")
print(acc_con1) 
print("ROC AUC: ", roc_auc)
print("PR AUC: ", pr_auc)

print('Recall / Sensitivity: ', recall_scores)
print('Specificity: ', spec_score)


Accuracy Train:  0.7432567432567433
Accuracy Test:  0.7529880478087649
Confusion Matrix: 
[[187   0]
 [ 62   2]]
ROC AUC:  0.6226604278074866
PR AUC:  0.515625
Recall / Sensitivity:  0.03125
Specificity:  1.0


radiomic cohort: NACT-Pilot

In [33]:

ucsf_params = {
 'n_estimators': 2,
 'learning_rate': 1.8,
 'max_depth': 8,
 'min_child_weight': 1,
 'gamma': 0.3,
 'subsample': 1.0,
 'colsample_bytree': 1.0,
 'reg_alpha': 1,
 'reg_lambda': 0.9,
 'scale_pos_weight': 1.7,
 'objective': 'binary:logistic',
 'eval_metric': 'aucpr',
 'random_state': 42,
 'max_delta_step': 1}

xgb_model = xgb.XGBClassifier(**ucsf_params)
xgb_model.fit(X_train_4, y_train_4)

In [34]:
rf_model = RandomForestClassifier()
rf_model.fit(X_train_4, y_train_4)

In [35]:
svm_model = SVC(probability=True)
svm_model.fit(X_train_4, y_train_4)

In [37]:
y_pred_train = xgb_model.predict(X_train_4)
acc_train = accuracy_score(y_pred=y_pred_train, y_true=y_train_4)

pred_prob = xgb_model.predict_proba(X_test_4)
fpr, tpr, thresholds = roc_curve(y_score=pred_prob[:, 1], y_true=y_test_4)
roc_auc = auc(fpr, tpr)
y_pred = (pred_prob[:, 1] > 0.5).astype(int)
acc_con1 = confusion_matrix(y_pred=y_pred, y_true=y_test_4)
acc_pred1 = accuracy_score(y_pred=y_pred, y_true=y_test_4)

tn, fp, fn, tp = acc_con1.ravel()
# Specificity
spec_score = tn / (tn + fp) if (tn + fp) > 0 else 0 

precision, recall, thresholds = precision_recall_curve(y_pred, y_test_4)
pr_auc = auc(recall, precision)

preci_scores = precision_score(y_pred=y_pred, y_true=y_test_4)
recall_scores = recall_score(y_pred=y_pred, y_true=y_test_4)


print("Accuracy Train: ", acc_train)
print("Accuracy Test: ", acc_pred1)
print("Confusion Matrix: ")
print(acc_con1) 
print("ROC AUC: ", roc_auc)
print("PR AUC: ", pr_auc)

print('Recall / Sensitivity: ', recall_scores)
print('Specificity: ', spec_score)


Accuracy Train:  0.9466292134831461
Accuracy Test:  0.8651685393258427
Confusion Matrix: 
[[52  6]
 [ 6 25]]
ROC AUC:  0.85706340378198
PR AUC:  0.8401594780717652
Recall / Sensitivity:  0.8064516129032258
Specificity:  0.896551724137931


In [38]:
y_pred_train = rf_model.predict(X_train_4)
acc_train = accuracy_score(y_pred=y_pred_train, y_true=y_train_4)

pred_prob = rf_model.predict_proba(X_test_4)
fpr, tpr, thresholds = roc_curve(y_score=pred_prob[:, 1], y_true=y_test_4)
roc_auc = auc(fpr, tpr)
y_pred = (pred_prob[:, 1] > 0.5).astype(int)
acc_con1 = confusion_matrix(y_pred=y_pred, y_true=y_test_4)
acc_pred1 = accuracy_score(y_pred=y_pred, y_true=y_test_4)

tn, fp, fn, tp = acc_con1.ravel()
# Specificity
spec_score = tn / (tn + fp) if (tn + fp) > 0 else 0 

precision, recall, thresholds = precision_recall_curve(y_pred, y_test_4)
pr_auc = auc(recall, precision)

preci_scores = precision_score(y_pred=y_pred, y_true=y_test_4)
recall_scores = recall_score(y_pred=y_pred, y_true=y_test_4)


print("Accuracy Train: ", acc_train)
print("Accuracy Test: ", acc_pred1)
print("Confusion Matrix: ")
print(acc_con1) 
print("ROC AUC: ", roc_auc)
print("PR AUC: ", pr_auc)

print('Recall / Sensitivity: ', recall_scores)
print('Specificity: ', spec_score)


Accuracy Train:  1.0
Accuracy Test:  0.7078651685393258
Confusion Matrix: 
[[52  6]
 [20 11]]
ROC AUC:  0.7766963292547274
PR AUC:  0.534656631771955
Recall / Sensitivity:  0.3548387096774194
Specificity:  0.896551724137931


In [39]:
y_pred_train = svm_model.predict(X_train_4)
acc_train = accuracy_score(y_pred=y_pred_train, y_true=y_train_4)

pred_prob = svm_model.predict_proba(X_test_4)
fpr, tpr, thresholds = roc_curve(y_score=pred_prob[:, 1], y_true=y_test_4)
roc_auc = auc(fpr, tpr)
y_pred = (pred_prob[:, 1] > 0.5).astype(int)
acc_con1 = confusion_matrix(y_pred=y_pred, y_true=y_test_4)
acc_pred1 = accuracy_score(y_pred=y_pred, y_true=y_test_4)

tn, fp, fn, tp = acc_con1.ravel()
# Specificity
spec_score = tn / (tn + fp) if (tn + fp) > 0 else 0 

precision, recall, thresholds = precision_recall_curve(y_pred, y_test_4)
pr_auc = auc(recall, precision)

preci_scores = precision_score(y_pred=y_pred, y_true=y_test_4)
recall_scores = recall_score(y_pred=y_pred, y_true=y_test_4)


print("Accuracy Train: ", acc_train)
print("Accuracy Test: ", acc_pred1)
print("Confusion Matrix: ")
print(acc_con1) 
print("ROC AUC: ", roc_auc)
print("PR AUC: ", pr_auc)

print('Recall / Sensitivity: ', recall_scores)
print('Specificity: ', spec_score)


Accuracy Train:  0.7106741573033708
Accuracy Test:  0.651685393258427
Confusion Matrix: 
[[52  6]
 [25  6]]
ROC AUC:  0.6629588431590656
PR AUC:  0.3804820587169264
Recall / Sensitivity:  0.1935483870967742
Specificity:  0.896551724137931


clinical cohort

In [40]:
sysucc_params = {
    'n_estimators': 109, 
    'learning_rate': 0.15, 
    'max_depth': 8, 
    'min_child_weight': 2, 
    'gamma': 0.9, 
    'subsample': 0.4, 
    'colsample_bytree': 0.8, 
    'reg_alpha': 3.10, 
    'reg_lambda': 0.28, 
    'scale_pos_weight': 2.9, 
    'objective': 'binary:logistic', 
    'eval_metric': 'aucpr', 
    'max_delta_step' : 0,
    'tree_method': 'hist'
    
}

xgb_model = xgb.XGBClassifier(**sysucc_params)
xgb_model.fit(X_train_5, y_train_5)

In [41]:
rf_model = RandomForestClassifier()
rf_model.fit(X_train_5, y_train_5)

In [42]:
svm_model = SVC(probability=True)
svm_model.fit(X_train_5, y_train_5)

In [44]:
y_pred_train = xgb_model.predict(X_train_5)
acc_train = accuracy_score(y_pred=y_pred_train, y_true=y_train_5)

pred_prob = xgb_model.predict_proba(X_test_5)
fpr, tpr, thresholds = roc_curve(y_score=pred_prob[:, 1], y_true=y_test_5)
roc_auc = auc(fpr, tpr)
y_pred = (pred_prob[:, 1] > 0.494).astype(int)
acc_con1 = confusion_matrix(y_pred=y_pred, y_true=y_test_5)
acc_pred1 = accuracy_score(y_pred=y_pred, y_true=y_test_5)

tn, fp, fn, tp = acc_con1.ravel()
# Specificity
spec_score = tn / (tn + fp) if (tn + fp) > 0 else 0 

precision, recall, thresholds = precision_recall_curve(y_pred, y_test_5)
pr_auc = auc(recall, precision)

preci_scores = precision_score(y_pred=y_pred, y_true=y_test_5)
recall_scores = recall_score(y_pred=y_pred, y_true=y_test_5)


print("Accuracy Train: ", acc_train)
print("Accuracy Test: ", acc_pred1)
print("Confusion Matrix: ")
print(acc_con1) 
print("ROC AUC: ", roc_auc)
print("PR AUC: ", pr_auc)

print('Recall / Sensitivity: ', recall_scores)
print('Specificity: ', spec_score)


Accuracy Train:  0.9514331210191083
Accuracy Test:  0.7015873015873015
Confusion Matrix: 
[[117  69]
 [ 25 104]]
ROC AUC:  0.7379761607068434
PR AUC:  0.8132026193996889
Recall / Sensitivity:  0.8062015503875969
Specificity:  0.6290322580645161


In [45]:
y_pred_train = rf_model.predict(X_train_5)
acc_train = accuracy_score(y_pred=y_pred_train, y_true=y_train_5)

pred_prob = rf_model.predict_proba(X_test_5)
fpr, tpr, thresholds = roc_curve(y_score=pred_prob[:, 1], y_true=y_test_5)
roc_auc = auc(fpr, tpr)
y_pred = (pred_prob[:, 1] > 0.494).astype(int)
acc_con1 = confusion_matrix(y_pred=y_pred, y_true=y_test_5)
acc_pred1 = accuracy_score(y_pred=y_pred, y_true=y_test_5)

tn, fp, fn, tp = acc_con1.ravel()
# Specificity
spec_score = tn / (tn + fp) if (tn + fp) > 0 else 0 

precision, recall, thresholds = precision_recall_curve(y_pred, y_test_5)
pr_auc = auc(recall, precision)

preci_scores = precision_score(y_pred=y_pred, y_true=y_test_5)
recall_scores = recall_score(y_pred=y_pred, y_true=y_test_5)


print("Accuracy Train: ", acc_train)
print("Accuracy Test: ", acc_pred1)
print("Confusion Matrix: ")
print(acc_con1) 
print("ROC AUC: ", roc_auc)
print("PR AUC: ", pr_auc)

print('Recall / Sensitivity: ', recall_scores)
print('Specificity: ', spec_score)


Accuracy Train:  1.0
Accuracy Test:  0.6857142857142857
Confusion Matrix: 
[[153  33]
 [ 66  63]]
ROC AUC:  0.6850045844794531
PR AUC:  0.6246919988925803
Recall / Sensitivity:  0.4883720930232558
Specificity:  0.8225806451612904


In [46]:
y_pred_train = svm_model.predict(X_train_5)
acc_train = accuracy_score(y_pred=y_pred_train, y_true=y_train_5)

pred_prob = svm_model.predict_proba(X_test_5)
fpr, tpr, thresholds = roc_curve(y_score=pred_prob[:, 1], y_true=y_test_5)
roc_auc = auc(fpr, tpr)
y_pred = (pred_prob[:, 1] > 0.494).astype(int)
acc_con1 = confusion_matrix(y_pred=y_pred, y_true=y_test_5)
acc_pred1 = accuracy_score(y_pred=y_pred, y_true=y_test_5)

tn, fp, fn, tp = acc_con1.ravel()
# Specificity
spec_score = tn / (tn + fp) if (tn + fp) > 0 else 0 

precision, recall, thresholds = precision_recall_curve(y_pred, y_test_5)
pr_auc = auc(recall, precision)

preci_scores = precision_score(y_pred=y_pred, y_true=y_test_5)
recall_scores = recall_score(y_pred=y_pred, y_true=y_test_5)


print("Accuracy Train: ", acc_train)
print("Accuracy Test: ", acc_pred1)
print("Confusion Matrix: ")
print(acc_con1) 
print("ROC AUC: ", roc_auc)
print("PR AUC: ", pr_auc)

print('Recall / Sensitivity: ', recall_scores)
print('Specificity: ', spec_score)


Accuracy Train:  0.6281847133757962
Accuracy Test:  0.6603174603174603
Confusion Matrix: 
[[173  13]
 [ 94  35]]
ROC AUC:  0.644161040260065
PR AUC:  0.5208771686969361
Recall / Sensitivity:  0.2713178294573643
Specificity:  0.9301075268817204


important features

In [33]:
# importance_type: gain, weight, cover
gain_scores = xgb_model.get_booster().get_score(importance_type='weight')

# DataFrame of important features (weight score)

importance_df = pd.DataFrame({
    'feature': list(gain_scores.keys()),
    'weight': list(gain_scores.values())
}).sort_values('weight', ascending=False)
print(importance_df.head(10))

                           feature    weight
17         shape2D_MajorAxisLength  3.890729
16         shape2D_MaximumDiameter  3.851686
15  shape2D_SphericalDisproportion  3.658034
1     firstorder_StandardDeviation  3.507150
8    firstorder_InterquartileRange  3.498747
0                  firstorder_Mean  3.489721
35            glcm_InverseVariance  3.463172
12               shape2D_Perimeter  3.394161
26         glcm_DifferenceVariance  3.389233
3              firstorder_Kurtosis  3.360221


In [34]:
importance_df.to_csv('../Results/sysucc_weight_xgb.csv',index=True)