Compare difference Model results

In [None]:
"""
Compare the predictive performance of different models:
· Support Vector Machine (SVM)
· Random Forest (RF)
· EXtreme Gradient Boosting (XGBoost)

"""

In [2]:
import sklearn
from sklearn import feature_selection as fs
 
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [1]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn import tree
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB, CategoricalNB, ComplementNB
from sklearn.svm import SVC, NuSVC, LinearSVC
from sklearn.neighbors import NearestNeighbors,KDTree,BallTree
from sklearn.ensemble import GradientBoostingClassifier

from catboost import CatBoostClassifier
import xgboost as xgb

In [87]:
from sklearn.metrics import confusion_matrix, accuracy_score, roc_curve, auc, r2_score
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV
from sklearn.metrics import precision_recall_curve, precision_score, average_precision_score, recall_score, f1_score

In [4]:
from imblearn.over_sampling import RandomOverSampler
from collections import Counter
from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.under_sampling import ClusterCentroids, EditedNearestNeighbours
from imblearn.under_sampling import RandomUnderSampler



load data

In [25]:
######### TCGA discovery features #############
data_all = pd.read_csv('./Data/radiomic_feature/radio_discovery.csv',index_col=0)

label_caf = np.array(data_all['CAF'])
label_tnbc = np.array(data_all['tnbc'])

data_feature = data_all.drop(['CAF', 'tnbc'], axis=1)
feature_name = data_feature.columns

print(data_feature.shape)
print(label_caf.shape)
print(label_tnbc.shape)

(518, 36)
(518,)
(518,)


In [26]:
##################### Radiogenomics Validation features #####################
data_tcga = pd.read_csv('./Data/radiomic_feature/radio_val.csv',index_col=0)

data_tcga1 = data_tcga[data_tcga['tnbc'] != 2]
label_tcga = np.array(data_tcga1['tnbc'])
data_feature_tcga = data_tcga1.drop(['CAF', 'tnbc'], axis=1)
data_feature_tcga.shape

(178, 36)

In [27]:
######################## Radiomics Features (ISPY) #########################
data_ispy = pd.read_csv('./Data/radiomic_feature/radio_ispy.csv',index_col=0)

label_ispy = np.array(data_ispy['tnbc'])
data_feature_ispy = data_ispy.drop(['tnbc'], axis=1)

data_feature_ispy.shape

(1252, 36)

In [28]:
##################### Radiomics Features (UCSF) #################################

data_ucsf = pd.read_csv('./Data/radiomic_feature/radio_ucsf.csv',index_col=0)
label_ucsf = np.array(data_ucsf['tnbc'])
data_feature_ucsf = data_ucsf.drop(['tnbc'], axis=1)

print(data_feature_ucsf.shape)
print(label_ucsf.shape)

(445, 36)
(445,)


In [29]:
####################   Clinical   #####################

data_sysucc = pd.read_csv('./Data/radiomic_feature/radio_sysucc.csv',index_col=0)

label_sysucc = np.array(data_sysucc['tnbc'])

data_feature_sysucc = data_sysucc.drop(['tnbc'], axis=1)
data_feature_sysucc.shape

(1571, 36)

data processing

In [30]:
# radiogenomics discovery cohort 

X_train_1,X_test_1,y_train_1,y_test_1 = train_test_split(data_feature, label_tnbc, test_size=0.2,random_state=42)
print(X_train_1.shape, X_test_1.shape, y_train_1.shape, y_test_1.shape)

# oversample
ros = RandomOverSampler(sampling_strategy=1,random_state=0)
X_resampled_1, y_resampled_1 = ros.fit_resample(X_train_1, y_train_1)
print(X_resampled_1.shape, y_resampled_1.shape)
print(Counter(y_resampled_1))

(414, 36) (104, 36) (414,) (104,)
(726, 36) (726,)
Counter({1: 363, 0: 363})


In [31]:
# radiogenomics validation
X_train_2,X_test_2,y_train_2,y_test_2 = train_test_split(data_feature_tcga, label_tcga, test_size=0.2,random_state=42)
print(X_train_2.shape, X_test_2.shape, y_train_2.shape, y_test_2.shape)

# oversample
ros = RandomOverSampler(sampling_strategy=1,random_state=0)
X_resampled_2, y_resampled_2 = ros.fit_resample(X_train_2, y_train_2)
print(X_resampled_2.shape, y_resampled_2.shape)
print(Counter(y_resampled_2))

(142, 36) (36, 36) (142,) (36,)
(248, 36) (248,)
Counter({0: 124, 1: 124})


In [32]:
# radiomics : ISPY
X_train_3,X_test_3,y_train_3,y_test_3 = train_test_split(data_feature_ispy, label_ispy, test_size=0.2,random_state=42)
print(X_train_3.shape, X_test_3.shape, y_train_3.shape, y_test_3.shape)

# oversample
ros = RandomOverSampler(sampling_strategy=1,random_state=0)
X_resampled_3, y_resampled_3 = ros.fit_resample(X_train_3, y_train_3)
print(X_resampled_3.shape, y_resampled_3.shape)
print(Counter(y_resampled_3))

(1001, 36) (251, 36) (1001,) (251,)
(1486, 36) (1486,)
Counter({1: 743, 0: 743})


In [33]:
# radiomics : UCSF
X_train_4,X_test_4,y_train_4,y_test_4 = train_test_split(data_feature_ucsf, label_ucsf, test_size=0.2,random_state=42)
print(X_train_4.shape, X_test_4.shape, y_train_4.shape, y_test_4.shape)

# oversample
ros = RandomOverSampler(sampling_strategy=1,random_state=0)
X_resampled_4, y_resampled_4 = ros.fit_resample(X_train_4, y_train_4)
print(X_resampled_4.shape, y_resampled_4.shape)
print(Counter(y_resampled_4))

(356, 36) (89, 36) (356,) (89,)
(448, 36) (448,)
Counter({1: 224, 0: 224})


In [34]:
# clinical : SYSUCC
X_train_5,X_test_5,y_train_5,y_test_5 = train_test_split(data_feature_sysucc, label_sysucc, test_size=0.2,random_state=0)
print(X_train_5.shape, X_test_5.shape, y_train_5.shape, y_test_5.shape)

# oversample
ros = RandomOverSampler(sampling_strategy=1,random_state=0)
X_resampled_5, y_resampled_5 = ros.fit_resample(X_train_5, y_train_5)
print(X_resampled_5.shape, y_resampled_5.shape)
print(Counter(y_resampled_5))

(1256, 36) (315, 36) (1256,) (315,)
(1470, 36) (1470,)
Counter({1: 735, 0: 735})


Train model

radiogenomic discovery cohort

In [274]:
# train model
xgb_model = xgb.XGBClassifier(learning_rate=0.1,scale_pos_weight=3,subsample=0.6,n_estimators=150) 
#xgb_model = xgb.XGBClassifier()

xgb_model.fit(X_resampled_1, y_resampled_1)

In [246]:
rf_model = RandomForestClassifier()
rf_model.fit(X_resampled_1, y_resampled_1)

In [139]:
svm_model = SVC(probability=True)
svm_model.fit(X_resampled_1, y_resampled_1)

In [275]:
y_pred = xgb_model.predict(X_test_1)
y_pred_train = xgb_model.predict(X_train_1)
acc_train = accuracy_score(y_pred=y_pred_train, y_true=y_train_1)

pred_prob = xgb_model.predict_proba(X_test_1)
fpr, tpr, thresholds = roc_curve(y_score=pred_prob[:, 1], y_true=y_test_1)
roc_auc = auc(fpr, tpr)
acc_con1 = confusion_matrix(y_pred=y_pred, y_true=y_test_1)
acc_pred1 = accuracy_score(y_pred=y_pred, y_true=y_test_1)

precision, recall, thresholds = precision_recall_curve(y_pred, y_test_1)
pr_auc = auc(recall, precision)

ap_score = average_precision_score(y_pred, y_test_1)

preci_scores = precision_score(y_pred=y_pred, y_true=y_test_1)
recall_scores = recall_score(y_pred=y_pred, y_true=y_test_1)
f1_scores = f1_score(y_pred=y_pred, y_true=y_test_1)


print("Accuracy Train: ", acc_train)
print("Accuracy Test: ", acc_pred1)
print("Confusion Matrix: ")
print(acc_con1) 
print("ROC AUC: ", roc_auc)
print("PR AUC: ", pr_auc)
print('F1 score: ', f1_scores)
print('Precision: ', preci_scores)
print('Recall: ', recall_scores)
print("AP scores: ", ap_score)

Accuracy Train:  1.0
Accuracy Test:  0.8461538461538461
Confusion Matrix: 
[[ 0 14]
 [ 2 88]]
ROC AUC:  0.7103174603174603
PR AUC:  0.9875691302161891
F1 score:  0.9166666666666666
Precision:  0.8627450980392157
Recall:  0.9777777777777777
AP scores:  0.9781883693648399


In [247]:
y_pred = rf_model.predict(X_test_1)
y_pred_train = rf_model.predict(X_train_1)
acc_train = accuracy_score(y_pred=y_pred_train, y_true=y_train_1)

pred_prob = rf_model.predict_proba(X_test_1)
fpr, tpr, thresholds = roc_curve(y_score=pred_prob[:, 1], y_true=y_test_1)
roc_auc = auc(fpr, tpr)
acc_con1 = confusion_matrix(y_pred=y_pred, y_true=y_test_1)
acc_pred1 = accuracy_score(y_pred=y_pred, y_true=y_test_1)

precision, recall, thresholds = precision_recall_curve(y_pred, y_test_1)
pr_auc = auc(recall, precision)

ap_score = average_precision_score(y_pred, y_test_1)

preci_scores = precision_score(y_pred=y_pred, y_true=y_test_1)
recall_scores = recall_score(y_pred=y_pred, y_true=y_test_1)
f1_scores = f1_score(y_pred=y_pred, y_true=y_test_1)


print("Accuracy Train: ", acc_train)
print("Accuracy Test: ", acc_pred1)
print("Confusion Matrix: ")
print(acc_con1) 
print("ROC AUC: ", roc_auc)
print("PR AUC: ", pr_auc)
print('F1 score: ', f1_scores)
print('Precision: ', preci_scores)
print('Recall: ', recall_scores)
print("AP scores: ", ap_score)

Accuracy Train:  1.0
Accuracy Test:  0.8557692307692307
Confusion Matrix: 
[[ 1 13]
 [ 2 88]]
ROC AUC:  0.6646825396825397
PR AUC:  0.9870324532453245
F1 score:  0.9214659685863874
Precision:  0.8712871287128713
Recall:  0.9777777777777777
AP scores:  0.9769251925192519


In [140]:
y_pred = svm_model.predict(X_test_1)
y_pred_train = svm_model.predict(X_train_1)
acc_train = accuracy_score(y_pred=y_pred_train, y_true=y_train_1)

pred_prob = svm_model.predict_proba(X_test_1)
fpr, tpr, thresholds = roc_curve(y_score=pred_prob[:, 1], y_true=y_test_1)
roc_auc = auc(fpr, tpr)
acc_con1 = confusion_matrix(y_pred=y_pred, y_true=y_test_1)
acc_pred1 = accuracy_score(y_pred=y_pred, y_true=y_test_1)

precision, recall, thresholds = precision_recall_curve(y_pred, y_test_1)
pr_auc = auc(recall, precision)

ap_score = average_precision_score(y_pred, y_test_1)

preci_scores = precision_score(y_pred=y_pred, y_true=y_test_1)
recall_scores = recall_score(y_pred=y_pred, y_true=y_test_1)
f1_scores = f1_score(y_pred=y_pred, y_true=y_test_1)


print("Accuracy Train: ", acc_train)
print("Accuracy Test: ", acc_pred1)
print("Confusion Matrix: ")
print(acc_con1) 
print("ROC AUC: ", roc_auc)
print("PR AUC: ", pr_auc)
print('F1 score: ', f1_scores)
print('Precision: ', preci_scores)
print('Recall: ', recall_scores)
print("AP scores: ", ap_score)

Accuracy Train:  0.6980676328502415
Accuracy Test:  0.6634615384615384
Confusion Matrix: 
[[ 7  7]
 [28 62]]
ROC AUC:  0.7063492063492064
PR AUC:  0.8273736529171312
F1 score:  0.779874213836478
Precision:  0.8985507246376812
Recall:  0.6888888888888889
AP scores:  0.6863093026136504


radiogenomic validation cohort

In [283]:
# train model
#xgb_model = xgb.XGBClassifier(learning_rate=0.1,scale_pos_weight=3,subsample=0.6,n_estimators=150) 
xgb_model = xgb.XGBClassifier()

xgb_model.fit(X_resampled_2, y_resampled_2)

In [271]:
rf_model = RandomForestClassifier()
rf_model.fit(X_resampled_2, y_resampled_2)

In [224]:
svm_model = SVC(probability=True)
svm_model.fit(X_resampled_2, y_resampled_2)

In [284]:
y_pred = xgb_model.predict(X_test_2)
y_pred_train = xgb_model.predict(X_train_2)
acc_train = accuracy_score(y_pred=y_pred_train, y_true=y_train_2)

pred_prob = xgb_model.predict_proba(X_test_2)
fpr, tpr, thresholds = roc_curve(y_score=pred_prob[:, 1], y_true=y_test_2)
roc_auc = auc(fpr, tpr)
acc_con1 = confusion_matrix(y_pred=y_pred, y_true=y_test_2)
acc_pred1 = accuracy_score(y_pred=y_pred, y_true=y_test_2)

precision, recall, thresholds = precision_recall_curve(y_pred, y_test_2)
pr_auc = auc(recall, precision)

ap_score = average_precision_score(y_pred, y_test_2)

preci_scores = precision_score(y_pred=y_pred, y_true=y_test_2)
recall_scores = recall_score(y_pred=y_pred, y_true=y_test_2)
f1_scores = f1_score(y_pred=y_pred, y_true=y_test_2)


print("Accuracy Train: ", acc_train)
print("Accuracy Test: ", acc_pred1)
print("Confusion Matrix: ")
print(acc_con1) 
print("ROC AUC: ", roc_auc)
print("PR AUC: ", pr_auc)
print('F1 score: ', f1_scores)
print('Precision: ', preci_scores)
print('Recall: ', recall_scores)
print("AP scores: ", ap_score)

Accuracy Train:  1.0
Accuracy Test:  0.9166666666666666
Confusion Matrix: 
[[ 1  2]
 [ 1 32]]
ROC AUC:  0.7171717171717172
PR AUC:  0.9832144979203803
F1 score:  0.9552238805970149
Precision:  0.9411764705882353
Recall:  0.9696969696969697
AP scores:  0.9682115270350565


In [273]:
y_pred = rf_model.predict(X_test_2)
y_pred_train = rf_model.predict(X_train_2)
acc_train = accuracy_score(y_pred=y_pred_train, y_true=y_train_2)

pred_prob = rf_model.predict_proba(X_test_2)
fpr, tpr, thresholds = roc_curve(y_score=pred_prob[:, 1], y_true=y_test_2)
roc_auc = auc(fpr, tpr)
acc_con1 = confusion_matrix(y_pred=y_pred, y_true=y_test_2)
acc_pred1 = accuracy_score(y_pred=y_pred, y_true=y_test_2)

precision, recall, thresholds = precision_recall_curve(y_pred, y_test_2)
pr_auc = auc(recall, precision)

ap_score = average_precision_score(y_pred, y_test_2)

preci_scores = precision_score(y_pred=y_pred, y_true=y_test_2)
recall_scores = recall_score(y_pred=y_pred, y_true=y_test_2)
f1_scores = f1_score(y_pred=y_pred, y_true=y_test_2)


print("Accuracy Train: ", acc_train)
print("Accuracy Test: ", acc_pred1)
print("Confusion Matrix: ")
print(acc_con1) 
print("ROC AUC: ", roc_auc)
print("PR AUC: ", pr_auc)
print('F1 score: ', f1_scores)
print('Precision: ', preci_scores)
print('Recall: ', recall_scores)
print("AP scores: ", ap_score)

Accuracy Train:  1.0
Accuracy Test:  0.9166666666666666
Confusion Matrix: 
[[ 1  2]
 [ 1 32]]
ROC AUC:  0.6818181818181819
PR AUC:  0.9832144979203803
F1 score:  0.9552238805970149
Precision:  0.9411764705882353
Recall:  0.9696969696969697
AP scores:  0.9682115270350565


In [225]:
y_pred = svm_model.predict(X_test_2)
y_pred_train = svm_model.predict(X_train_2)
acc_train = accuracy_score(y_pred=y_pred_train, y_true=y_train_2)

pred_prob = svm_model.predict_proba(X_test_2)
fpr, tpr, thresholds = roc_curve(y_score=pred_prob[:, 1], y_true=y_test_2)
roc_auc = auc(fpr, tpr)
acc_con1 = confusion_matrix(y_pred=y_pred, y_true=y_test_2)
acc_pred1 = accuracy_score(y_pred=y_pred, y_true=y_test_2)

precision, recall, thresholds = precision_recall_curve(y_pred, y_test_2)
pr_auc = auc(recall, precision)

ap_score = average_precision_score(y_pred, y_test_2)

preci_scores = precision_score(y_pred=y_pred, y_true=y_test_2)
recall_scores = recall_score(y_pred=y_pred, y_true=y_test_2)
f1_scores = f1_score(y_pred=y_pred, y_true=y_test_2)


print("Accuracy Train: ", acc_train)
print("Accuracy Test: ", acc_pred1)
print("Confusion Matrix: ")
print(acc_con1) 
print("ROC AUC: ", roc_auc)
print("PR AUC: ", pr_auc)
print('F1 score: ', f1_scores)
print('Precision: ', preci_scores)
print('Recall: ', recall_scores)
print("AP scores: ", ap_score)

Accuracy Train:  0.8450704225352113
Accuracy Test:  0.8888888888888888
Confusion Matrix: 
[[ 1  2]
 [ 2 31]]
ROC AUC:  0.42424242424242425
PR AUC:  0.9671717171717171
F1 score:  0.9393939393939394
Precision:  0.9393939393939394
Recall:  0.9393939393939394
AP scores:  0.9380165289256198


radiomic cohort: ISPY

In [291]:
# train model
xgb_model = xgb.XGBClassifier(learning_rate=0.1,scale_pos_weight=3,subsample=0.6,n_estimators=150) 
#xgb_model = xgb.XGBClassifier()

xgb_model.fit(X_resampled_3, y_resampled_3)

In [190]:
rf_model = RandomForestClassifier()
rf_model.fit(X_resampled_3, y_resampled_3)

In [192]:
svm_model = SVC(probability=True)
svm_model.fit(X_resampled_3, y_resampled_3)

In [292]:
y_pred = xgb_model.predict(X_test_3)
y_pred_train = xgb_model.predict(X_train_3)
acc_train = accuracy_score(y_pred=y_pred_train, y_true=y_train_3)

pred_prob = xgb_model.predict_proba(X_test_3)
fpr, tpr, thresholds = roc_curve(y_score=pred_prob[:, 1], y_true=y_test_3)
roc_auc = auc(fpr, tpr)
acc_con1 = confusion_matrix(y_pred=y_pred, y_true=y_test_3)
acc_pred1 = accuracy_score(y_pred=y_pred, y_true=y_test_3)

precision, recall, thresholds = precision_recall_curve(y_pred, y_test_3)
pr_auc = auc(recall, precision)

ap_score = average_precision_score(y_pred, y_test_3)

preci_scores = precision_score(y_pred=y_pred, y_true=y_test_3)
recall_scores = recall_score(y_pred=y_pred, y_true=y_test_3)
f1_scores = f1_score(y_pred=y_pred, y_true=y_test_3)


print("Accuracy Train: ", acc_train)
print("Accuracy Test: ", acc_pred1)
print("Confusion Matrix: ")
print(acc_con1) 
print("ROC AUC: ", roc_auc)
print("PR AUC: ", pr_auc)
print('F1 score: ', f1_scores)
print('Precision: ', preci_scores)
print('Recall: ', recall_scores)
print("AP scores: ", ap_score)

Accuracy Train:  0.999000999000999
Accuracy Test:  0.7649402390438247
Confusion Matrix: 
[[ 17  46]
 [ 13 175]]
ROC AUC:  0.7310874704491725
PR AUC:  0.9529865998603065
F1 score:  0.8557457212713936
Precision:  0.7918552036199095
Recall:  0.9308510638297872
AP scores:  0.9203661909596619


In [191]:
y_pred = rf_model.predict(X_test_3)
y_pred_train = rf_model.predict(X_train_3)
acc_train = accuracy_score(y_pred=y_pred_train, y_true=y_train_3)

pred_prob = rf_model.predict_proba(X_test_3)
fpr, tpr, thresholds = roc_curve(y_score=pred_prob[:, 1], y_true=y_test_3)
roc_auc = auc(fpr, tpr)
acc_con1 = confusion_matrix(y_pred=y_pred, y_true=y_test_3)
acc_pred1 = accuracy_score(y_pred=y_pred, y_true=y_test_3)

precision, recall, thresholds = precision_recall_curve(y_pred, y_test_3)
pr_auc = auc(recall, precision)

ap_score = average_precision_score(y_pred, y_test_3)

preci_scores = precision_score(y_pred=y_pred, y_true=y_test_3)
recall_scores = recall_score(y_pred=y_pred, y_true=y_test_3)
f1_scores = f1_score(y_pred=y_pred, y_true=y_test_3)


print("Accuracy Train: ", acc_train)
print("Accuracy Test: ", acc_pred1)
print("Confusion Matrix: ")
print(acc_con1) 
print("ROC AUC: ", roc_auc)
print("PR AUC: ", pr_auc)
print('F1 score: ', f1_scores)
print('Precision: ', preci_scores)
print('Recall: ', recall_scores)
print("AP scores: ", ap_score)

Accuracy Train:  1.0
Accuracy Test:  0.7529880478087649
Confusion Matrix: 
[[ 20  43]
 [ 19 169]]
ROC AUC:  0.7254728132387706
PR AUC:  0.9337103612846887
F1 score:  0.845
Precision:  0.7971698113207547
Recall:  0.898936170212766
AP scores:  0.8879195182337691


In [193]:
y_pred = svm_model.predict(X_test_3)
y_pred_train = svm_model.predict(X_train_3)
acc_train = accuracy_score(y_pred=y_pred_train, y_true=y_train_3)

pred_prob = svm_model.predict_proba(X_test_3)
fpr, tpr, thresholds = roc_curve(y_score=pred_prob[:, 1], y_true=y_test_3)
roc_auc = auc(fpr, tpr)
acc_con1 = confusion_matrix(y_pred=y_pred, y_true=y_test_3)
acc_pred1 = accuracy_score(y_pred=y_pred, y_true=y_test_3)

precision, recall, thresholds = precision_recall_curve(y_pred, y_test_3)
pr_auc = auc(recall, precision)

ap_score = average_precision_score(y_pred, y_test_3)

preci_scores = precision_score(y_pred=y_pred, y_true=y_test_3)
recall_scores = recall_score(y_pred=y_pred, y_true=y_test_3)
f1_scores = f1_score(y_pred=y_pred, y_true=y_test_3)


print("Accuracy Train: ", acc_train)
print("Accuracy Test: ", acc_pred1)
print("Confusion Matrix: ")
print(acc_con1) 
print("ROC AUC: ", roc_auc)
print("PR AUC: ", pr_auc)
print('F1 score: ', f1_scores)
print('Precision: ', preci_scores)
print('Recall: ', recall_scores)
print("AP scores: ", ap_score)

Accuracy Train:  0.5864135864135864
Accuracy Test:  0.5338645418326693
Confusion Matrix: 
[[41 22]
 [95 93]]
ROC AUC:  0.6483451536643026
PR AUC:  0.6955129528140905
F1 score:  0.6138613861386139
Precision:  0.808695652173913
Recall:  0.4946808510638298
AP scores:  0.4876956558594484


radiomic cohort: UCSF

In [297]:
# train model
xgb_model = xgb.XGBClassifier(learning_rate=0.1,scale_pos_weight=3,subsample=0.6,n_estimators=150) 
#xgb_model = xgb.XGBClassifier()

xgb_model.fit(X_resampled_4, y_resampled_4)

In [195]:
rf_model = RandomForestClassifier()
rf_model.fit(X_resampled_4, y_resampled_4)

In [203]:
svm_model = SVC(probability=True)
svm_model.fit(X_resampled_4, y_resampled_4)

In [298]:
y_pred = xgb_model.predict(X_test_4)
y_pred_train = xgb_model.predict(X_train_4)
acc_train = accuracy_score(y_pred=y_pred_train, y_true=y_train_4)

pred_prob = xgb_model.predict_proba(X_test_4)
fpr, tpr, thresholds = roc_curve(y_score=pred_prob[:, 1], y_true=y_test_4)
roc_auc = auc(fpr, tpr)
acc_con1 = confusion_matrix(y_pred=y_pred, y_true=y_test_4)
acc_pred1 = accuracy_score(y_pred=y_pred, y_true=y_test_4)

precision, recall, thresholds = precision_recall_curve(y_pred, y_test_4)
pr_auc = auc(recall, precision)

ap_score = average_precision_score(y_pred, y_test_4)

preci_scores = precision_score(y_pred=y_pred, y_true=y_test_4)
recall_scores = recall_score(y_pred=y_pred, y_true=y_test_4)
f1_scores = f1_score(y_pred=y_pred, y_true=y_test_4)


print("Accuracy Train: ", acc_train)
print("Accuracy Test: ", acc_pred1)
print("Confusion Matrix: ")
print(acc_con1) 
print("ROC AUC: ", roc_auc)
print("PR AUC: ", pr_auc)
print('F1 score: ', f1_scores)
print('Precision: ', preci_scores)
print('Recall: ', recall_scores)
print("AP scores: ", ap_score)

Accuracy Train:  1.0
Accuracy Test:  0.8876404494382022
Confusion Matrix: 
[[14  9]
 [ 1 65]]
ROC AUC:  0.9018445322793149
PR AUC:  0.9821752293662406
F1 score:  0.9285714285714286
Precision:  0.8783783783783784
Recall:  0.9848484848484849
AP scores:  0.9661932105752331


In [198]:
y_pred = rf_model.predict(X_test_4)
y_pred_train = rf_model.predict(X_train_4)
acc_train = accuracy_score(y_pred=y_pred_train, y_true=y_train_4)

pred_prob = rf_model.predict_proba(X_test_4)
fpr, tpr, thresholds = roc_curve(y_score=pred_prob[:, 1], y_true=y_test_4)
roc_auc = auc(fpr, tpr)
acc_con1 = confusion_matrix(y_pred=y_pred, y_true=y_test_4)
acc_pred1 = accuracy_score(y_pred=y_pred, y_true=y_test_4)

precision, recall, thresholds = precision_recall_curve(y_pred, y_test_4)
pr_auc = auc(recall, precision)

ap_score = average_precision_score(y_pred, y_test_4)

preci_scores = precision_score(y_pred=y_pred, y_true=y_test_4)
recall_scores = recall_score(y_pred=y_pred, y_true=y_test_4)
f1_scores = f1_score(y_pred=y_pred, y_true=y_test_4)


print("Accuracy Train: ", acc_train)
print("Accuracy Test: ", acc_pred1)
print("Confusion Matrix: ")
print(acc_con1) 
print("ROC AUC: ", roc_auc)
print("PR AUC: ", pr_auc)
print('F1 score: ', f1_scores)
print('Precision: ', preci_scores)
print('Recall: ', recall_scores)
print("AP scores: ", ap_score)

Accuracy Train:  1.0
Accuracy Test:  0.8426966292134831
Confusion Matrix: 
[[13 10]
 [ 4 62]]
ROC AUC:  0.8573781291172595
PR AUC:  0.9564323005334242
F1 score:  0.8985507246376812
Precision:  0.8611111111111112
Recall:  0.9393939393939394
AP scores:  0.9212821094843567


In [204]:
y_pred = svm_model.predict(X_test_4)
y_pred_train = svm_model.predict(X_train_4)
acc_train = accuracy_score(y_pred=y_pred_train, y_true=y_train_4)

pred_prob = svm_model.predict_proba(X_test_4)
fpr, tpr, thresholds = roc_curve(y_score=pred_prob[:, 1], y_true=y_test_4)
roc_auc = auc(fpr, tpr)
acc_con1 = confusion_matrix(y_pred=y_pred, y_true=y_test_4)
acc_pred1 = accuracy_score(y_pred=y_pred, y_true=y_test_4)

precision, recall, thresholds = precision_recall_curve(y_pred, y_test_4)
pr_auc = auc(recall, precision)

ap_score = average_precision_score(y_pred, y_test_4)

preci_scores = precision_score(y_pred=y_pred, y_true=y_test_4)
recall_scores = recall_score(y_pred=y_pred, y_true=y_test_4)
f1_scores = f1_score(y_pred=y_pred, y_true=y_test_4)


print("Accuracy Train: ", acc_train)
print("Accuracy Test: ", acc_pred1)
print("Confusion Matrix: ")
print(acc_con1) 
print("ROC AUC: ", roc_auc)
print("PR AUC: ", pr_auc)
print('F1 score: ', f1_scores)
print('Precision: ', preci_scores)
print('Recall: ', recall_scores)
print("AP scores: ", ap_score)

Accuracy Train:  0.6292134831460674
Accuracy Test:  0.7528089887640449
Confusion Matrix: 
[[15  8]
 [14 52]]
ROC AUC:  0.7239789196310936
PR AUC:  0.8722165474974464
F1 score:  0.8253968253968254
Precision:  0.8666666666666667
Recall:  0.7878787878787878
AP scores:  0.772715923277721


clinical cohort

In [303]:
# train model
xgb_model = xgb.XGBClassifier(learning_rate=0.1,scale_pos_weight=3,subsample=0.6,n_estimators=150) 
#xgb_model = xgb.XGBClassifier()

xgb_model.fit(X_resampled_5, y_resampled_5)

In [206]:
rf_model = RandomForestClassifier()
rf_model.fit(X_resampled_5, y_resampled_5)

In [207]:
svm_model = SVC(probability=True)
svm_model.fit(X_resampled_5, y_resampled_5)

In [304]:
y_pred = xgb_model.predict(X_test_5)
y_pred_train = xgb_model.predict(X_train_5)
acc_train = accuracy_score(y_pred=y_pred_train, y_true=y_train_5)

pred_prob = xgb_model.predict_proba(X_test_5)
fpr, tpr, thresholds = roc_curve(y_score=pred_prob[:, 1], y_true=y_test_5)
roc_auc = auc(fpr, tpr)
acc_con1 = confusion_matrix(y_pred=y_pred, y_true=y_test_5)
acc_pred1 = accuracy_score(y_pred=y_pred, y_true=y_test_5)

precision, recall, thresholds = precision_recall_curve(y_pred, y_test_5)
pr_auc = auc(recall, precision)

ap_score = average_precision_score(y_pred, y_test_5)

preci_scores = precision_score(y_pred=y_pred, y_true=y_test_5)
recall_scores = recall_score(y_pred=y_pred, y_true=y_test_5)
f1_scores = f1_score(y_pred=y_pred, y_true=y_test_5)


print("Accuracy Train: ", acc_train)
print("Accuracy Test: ", acc_pred1)
print("Confusion Matrix: ")
print(acc_con1) 
print("ROC AUC: ", roc_auc)
print("PR AUC: ", pr_auc)
print('F1 score: ', f1_scores)
print('Precision: ', preci_scores)
print('Recall: ', recall_scores)
print("AP scores: ", ap_score)

Accuracy Train:  0.9968152866242038
Accuracy Test:  0.7142857142857143
Confusion Matrix: 
[[ 59  65]
 [ 25 166]]
ROC AUC:  0.6900861340989698
PR AUC:  0.8970369363039521
F1 score:  0.7867298578199052
Precision:  0.7186147186147186
Recall:  0.8691099476439791
AP scores:  0.8309044068206373


In [129]:
y_pred = rf_model.predict(X_test_5)
y_pred_train = rf_model.predict(X_train_5)
acc_train = accuracy_score(y_pred=y_pred_train, y_true=y_train_5)

pred_prob = rf_model.predict_proba(X_test_5)
fpr, tpr, thresholds = roc_curve(y_score=pred_prob[:, 1], y_true=y_test_5)
roc_auc = auc(fpr, tpr)
acc_con1 = confusion_matrix(y_pred=y_pred, y_true=y_test_5)
acc_pred1 = accuracy_score(y_pred=y_pred, y_true=y_test_5)

precision, recall, thresholds = precision_recall_curve(y_pred, y_test_5)
pr_auc = auc(recall, precision)

ap_score = average_precision_score(y_pred, y_test_5)

preci_scores = precision_score(y_pred=y_pred, y_true=y_test_5)
recall_scores = recall_score(y_pred=y_pred, y_true=y_test_5)
f1_scores = f1_score(y_pred=y_pred, y_true=y_test_5)


print("Accuracy Train: ", acc_train)
print("Accuracy Test: ", acc_pred1)
print("Confusion Matrix: ")
print(acc_con1) 
print("ROC AUC: ", roc_auc)
print("PR AUC: ", pr_auc)
print('F1 score: ', f1_scores)
print('Precision: ', preci_scores)
print('Recall: ', recall_scores)
print("AP scores: ", ap_score)

Accuracy Train:  1.0
Accuracy Test:  0.6730158730158731
Confusion Matrix: 
[[ 63  61]
 [ 42 149]]
ROC AUC:  0.6941817260597872
PR AUC:  0.8416396576082439
F1 score:  0.743142144638404
Precision:  0.7095238095238096
Recall:  0.7801047120418848
AP scores:  0.7471536607662262


In [213]:
y_pred = svm_model.predict(X_test_5)
y_pred_train = svm_model.predict(X_train_5)
acc_train = accuracy_score(y_pred=y_pred_train, y_true=y_train_5)

pred_prob = svm_model.predict_proba(X_test_5)
fpr, tpr, thresholds = roc_curve(y_score=pred_prob[:, 1], y_true=y_test_5)
roc_auc = auc(fpr, tpr)
acc_con1 = confusion_matrix(y_pred=y_pred, y_true=y_test_5)
acc_pred1 = accuracy_score(y_pred=y_pred, y_true=y_test_5)

precision, recall, thresholds = precision_recall_curve(y_pred, y_test_5)
pr_auc = auc(recall, precision)

ap_score = average_precision_score(y_pred, y_test_5)

preci_scores = precision_score(y_pred=y_pred, y_true=y_test_5)
recall_scores = recall_score(y_pred=y_pred, y_true=y_test_5)
f1_scores = f1_score(y_pred=y_pred, y_true=y_test_5)


print("Accuracy Train: ", acc_train)
print("Accuracy Test: ", acc_pred1)
print("Confusion Matrix: ")
print(acc_con1) 
print("ROC AUC: ", roc_auc)
print("PR AUC: ", pr_auc)
print('F1 score: ', f1_scores)
print('Precision: ', preci_scores)
print('Recall: ', recall_scores)
print("AP scores: ", ap_score)

Accuracy Train:  0.6377388535031847
Accuracy Test:  0.6666666666666666
Confusion Matrix: 
[[ 63  61]
 [ 44 147]]
ROC AUC:  0.6769337949670664
PR AUC:  0.8350075353674831
F1 score:  0.7368421052631579
Precision:  0.7067307692307693
Recall:  0.7696335078534031
AP scores:  0.7375744746818045


important features

In [307]:
# importance_type: gain, weight, cover
gain_scores = xgb_model.get_booster().get_score(importance_type='gain')

# DataFrame of important features (weight score)

importance_df = pd.DataFrame({
    'feature': list(gain_scores.keys()),
    'weight': list(gain_scores.values())
}).sort_values('weight', ascending=False)
print(importance_df.head(10))

                           feature      gain
16         shape2D_MaximumDiameter  4.923330
8    firstorder_InterquartileRange  4.388739
15  shape2D_SphericalDisproportion  4.248492
2              firstorder_Skewness  3.985197
25               glcm_JointEntropy  3.941970
6               firstorder_Maximum  3.861290
14              shape2D_Sphericity  3.792946
10             shape2D_MeshSurface  3.787179
34         glcm_MaximumProbability  3.657549
11            shape2D_PixelSurface  3.652588


In [306]:
importance_df.to_csv('./Results/sysucc_weight_xgb.csv',index=True)

In [82]:
from xgboost import plot_importance

In [None]:
############# Feature Importance #############
# feature importance
print(xgb_model.feature_importances_)

# plot feature importance
plot_importance(xgb_model, max_num_features=10, grid=False,color="#77AAD7")

#plt.savefig('../Results/feature_importance_sysucc.png', dpi=1200, bbox_inches="tight", pad_inches=0)
plt.show()