# Metagenomic-based Diagnostic for Sepsis (External Validation)

In [7]:
# Import Statements
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold, cross_val_score
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path

cwd = Path.cwd()
datasets = cwd / 'datasets'
results = cwd / 'results'

In [8]:
list(datasets.glob('*'))

[PosixPath('/home/csctan/git_repos/Polymicrobial-Signature-of-Sepsis/datasets/kapusta_genus_raw.csv'),
 PosixPath('/home/csctan/git_repos/Polymicrobial-Signature-of-Sepsis/datasets/karius_genus_raw.csv'),
 PosixPath('/home/csctan/git_repos/Polymicrobial-Signature-of-Sepsis/datasets/karius_genus_pathogens.csv')]

## Data Preprocessing
Since we are using stratified kfold, a validation split is not necesssary.

### Load data

In [12]:
raw_df = pd.read_csv(datasets / 'kapusta_genus_raw.csv')

# Remove NTCs
raw_df = raw_df.loc[raw_df.y != 'ntc', :]
display(raw_df)

X = raw_df.iloc[:, 1:].copy()
y = raw_df.iloc[:, 0].copy()

Unnamed: 0,y,Bifidobacterium,Aeriscardovia,Alloscardovia,Arthrobacter,Kocuria,Glutamicibacter,Citricoccus,Enteractinococcus,Micrococcus,...,Gallicola,Dethiosulfatibacter,Bilophila,Guyparkeria,Sinobaca,Cryptanaerobacter,Marinitoga,Candidatus Endolissoclinum,Luteimicrobium,Paeniclostridium
0,healthy,52711.0,1.0,1.0,199.0,3.0,2.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,healthy,29182.0,0.0,0.0,35.0,6.0,3.0,0.0,1.0,81.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,septic,33.0,0.0,0.0,215.0,82.0,59.0,1.0,3.0,144.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,septic,3.0,0.0,0.0,35.0,23.0,13.0,0.0,0.0,17.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,septic,2.0,0.0,0.0,299.0,78.0,87.0,0.0,4.0,87.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
76,septic,35.0,1.0,0.0,1285.0,295.0,275.0,7.0,11.0,382.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
77,septic,128.0,0.0,0.0,1745.0,132.0,120.0,3.0,7.0,142.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
78,septic,14.0,0.0,0.0,87.0,30.0,18.0,0.0,3.0,38.0,...,60.0,3.0,32.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
79,septic,811.0,0.0,0.0,642.0,159.0,122.0,5.0,7.0,188.0,...,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0


In [13]:
# Binary encode y
y.loc[y == 'septic'] = 1
y.loc[y == 'healthy'] = 0
y = y.astype('int')

# Relative abundance
X_RA = X.apply(func=lambda x: x / x.sum(), axis=1)

In [14]:
n_splits = 5

pos = len(y[y == 1])
neg = len(y[y == 0])
split_sizes = pd.DataFrame({'Septic': [pos - int(pos / n_splits), int(pos / n_splits)], 
                           'Healthy': [neg - int(neg / n_splits), int(neg / n_splits)]}, index=['Train fold', 'Test fold'])

display(split_sizes)

# Get negative to positive ratio
ratio = sum(y == 0) / sum(y == 1)

Unnamed: 0,Septic,Healthy
Train fold,44,19
Test fold,11,4


## Nested CV for hyperparameter optimisation

In [5]:
# Metrics
from imblearn.metrics import sensitivity_score, specificity_score
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_validate

In [6]:
def optimise(X, y, param_dict=False):
    np.random.seed(66)
    
    # Hyperparemeter Optimisation using grid search (F1)
    model = XGBClassifier()
    n_estimators = range(50, 500, 1)
    max_depth = range(1, 10)
    gamma = np.linspace(0, 5, 20)
    subsample = np.linspace(0.1, 1, 20)
    colsample_bytree = np.linspace(0.1, 1, 20)
    
    param_grid = dict(max_depth=max_depth, 
                      n_estimators=n_estimators, 
                      colsample_bytree=colsample_bytree,
                      gamma=gamma,
                      subsample=subsample,
                      scale_pos_weight=[ratio])
    
    inner_cv = StratifiedKFold(n_splits=5, shuffle=True)
    outer_cv = StratifiedKFold(n_splits=5, shuffle=True)
    
    if not param_dict:
        # Inner CV
        model = RandomizedSearchCV(model, 
                                   param_grid, 
                                   scoring="roc_auc",
                                   n_iter=1000,
                                   n_jobs=10, 
                                   cv=inner_cv, 
                                   verbose=1)

        model.fit(X, y)
        best_params = model.best_params_
        print(best_params)
                
    else:
        model = XGBClassifier(**param_dict)

    # Custom metrics
    sensitivity = make_scorer(sensitivity_score, average='binary')
    specificity = make_scorer(specificity_score, average='binary')
    scoring = {'sensitivity': sensitivity, 
               'specificity': specificity, 
               'AUROC': 'roc_auc'}
    
    # Outer CV
    outer_results = cross_validate(model, X=X, y=y, cv=outer_cv, scoring=scoring)
    outer_results = pd.DataFrame(outer_results).mean()[['test_specificity', 'test_sensitivity', 'test_AUROC']]
    
    return model, outer_results, best_params

### Optimise Model using Neat Data

In [None]:
raw_model, raw_results, raw_params = optimise(X, y)
# raw_params = {'subsample': 0.8578947368421053, 'scale_pos_weight': 1.4273504273504274, 'n_estimators': 348, 'max_depth': 7, 'gamma': 2.894736842105263, 'colsample_bytree': 0.24210526315789474}
# raw_model, raw_results = optimise(X, y, raw_params)
RA_model, RA_results, RA_params = optimise(X_RA, y)
# RA_params = {'subsample': 0.7157894736842105, 'scale_pos_weight': 1.4273504273504274, 'n_estimators': 456, 'max_depth': 6, 'gamma': 0.5263157894736842, 'colsample_bytree': 0.33684210526315794}
# RA_model, RA_results = optimise(X_RA, y, RA_params)

Fitting 5 folds for each of 1000 candidates, totalling 5000 fits


[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed:    6.1s
[Parallel(n_jobs=10)]: Done 180 tasks      | elapsed:   24.1s
[Parallel(n_jobs=10)]: Done 430 tasks      | elapsed:   52.7s
[Parallel(n_jobs=10)]: Done 780 tasks      | elapsed:  1.6min
[Parallel(n_jobs=10)]: Done 1230 tasks      | elapsed:  2.5min
[Parallel(n_jobs=10)]: Done 1780 tasks      | elapsed:  3.9min
[Parallel(n_jobs=10)]: Done 2430 tasks      | elapsed:  5.2min
[Parallel(n_jobs=10)]: Done 3180 tasks      | elapsed:  6.9min
[Parallel(n_jobs=10)]: Done 4030 tasks      | elapsed:  8.9min
[Parallel(n_jobs=10)]: Done 4980 tasks      | elapsed: 11.1min
[Parallel(n_jobs=10)]: Done 5000 out of 5000 | elapsed: 11.2min finished


{'subsample': 0.5263157894736842, 'scale_pos_weight': 1.4273504273504274, 'n_estimators': 96, 'max_depth': 2, 'gamma': 1.8421052631578947, 'colsample_bytree': 0.19473684210526315}
Fitting 5 folds for each of 1000 candidates, totalling 5000 fits


[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed:    4.6s
[Parallel(n_jobs=10)]: Done 180 tasks      | elapsed:   18.3s
[Parallel(n_jobs=10)]: Done 430 tasks      | elapsed:   48.2s
[Parallel(n_jobs=10)]: Done 780 tasks      | elapsed:  1.4min
[Parallel(n_jobs=10)]: Done 1230 tasks      | elapsed:  2.1min
[Parallel(n_jobs=10)]: Done 1780 tasks      | elapsed:  3.1min
[Parallel(n_jobs=10)]: Done 2430 tasks      | elapsed:  4.3min
[Parallel(n_jobs=10)]: Done 3180 tasks      | elapsed:  5.7min
[Parallel(n_jobs=10)]: Done 4030 tasks      | elapsed:  7.2min
[Parallel(n_jobs=10)]: Done 4980 tasks      | elapsed:  9.0min
[Parallel(n_jobs=10)]: Done 5000 out of 5000 | elapsed:  9.1min finished


Fitting 5 folds for each of 1000 candidates, totalling 5000 fits


[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed:    4.1s
[Parallel(n_jobs=10)]: Done 180 tasks      | elapsed:   20.2s
[Parallel(n_jobs=10)]: Done 430 tasks      | elapsed:   51.5s
[Parallel(n_jobs=10)]: Done 780 tasks      | elapsed:  1.5min
[Parallel(n_jobs=10)]: Done 1230 tasks      | elapsed:  2.3min
[Parallel(n_jobs=10)]: Done 1780 tasks      | elapsed:  3.2min
[Parallel(n_jobs=10)]: Done 2430 tasks      | elapsed:  4.4min
[Parallel(n_jobs=10)]: Done 3180 tasks      | elapsed:  5.7min
[Parallel(n_jobs=10)]: Done 4030 tasks      | elapsed:  7.4min
[Parallel(n_jobs=10)]: Done 4980 tasks      | elapsed:  9.1min
[Parallel(n_jobs=10)]: Done 5000 out of 5000 | elapsed:  9.2min finished


Fitting 5 folds for each of 1000 candidates, totalling 5000 fits


[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed:    2.7s
[Parallel(n_jobs=10)]: Done 180 tasks      | elapsed:   19.8s
[Parallel(n_jobs=10)]: Done 430 tasks      | elapsed:   45.1s
[Parallel(n_jobs=10)]: Done 780 tasks      | elapsed:  1.4min
[Parallel(n_jobs=10)]: Done 1230 tasks      | elapsed:  2.3min


## Estimates of test error

In [None]:
metric_df = pd.DataFrame({'Raw': raw_results, 'RA': RA_results}).round(3).T
display(metric_df)

### Remove Contaminants based on SHAP values

In [None]:
import math
from scipy.stats import spearmanr
import shap


def decontam(X_train, y_train, params):
    model = XGBClassifier(**params)
    model.fit(X=X_train, y=y_train)

    explainer = shap.TreeExplainer(model, feature_pertubation='interventional', model_output='probability', data=X_train)
    shap_val = explainer.shap_values(X_train)

    to_retain = np.array([True] * X_train.shape[1])
    corrs = np.zeros(X_train.shape[1])

    for i in range(X_train.shape[1]):
        rho = spearmanr(X_train.iloc[:, i], shap_val[:, i])[0]
        p = spearmanr(X_train.iloc[:, i], shap_val[:, i])[1]
        if rho < 0 and p < 0.05:
            to_retain[i] = False

        if math.isnan(rho):
            corrs[i] = 2
        else:
            corrs[i] = rho

    to_retain = np.logical_and(corrs > 0, corrs != 2)
    to_retain = X_train.columns[to_retain]
    print(to_retain.shape, to_retain)
    
    return to_retain

In [None]:
# Decontam using raw_params
genera_new = X.columns

for _ in range(10):
    genera_new = decontam(X.loc[:, genera_new], y, raw_params)

### Remove non-human associated pathogens

In [None]:
to_retain = list(set(genera_new).intersection(set(X_pathogens.columns)))
print(to_retain)

In [None]:
# Decontam + pathogens
raw_CR = X[to_retain]


In [None]:
# Get SHAP summary before removing Cellulomonas and Agrobacterium
pre_model = XGBClassifier(**raw_params)
pre_model.fit(X=raw_CR, y=y)

pre_explainer = shap.TreeExplainer(pre_model, feature_pertubation='interventional', model_output='probability', data=raw_CR)
shap_pre = pre_explainer.shap_values(raw_CR)

shap.summary_plot(shap_pre, raw_CR, show=False, plot_size=(4, 5), color_bar_label='Unique k-mer Count', max_display=25)
fig, ax = plt.gcf(), plt.gca()
ax.set_xlabel('SHAP Value')
plt.savefig(results / 'pre_shap.png', dpi=600, format='png', bbox_inches='tight')


### Drop features

In [None]:
# Normalise Datasets
RA_CR = raw_CR.apply(func=lambda x: x / x.sum(), axis=1)

### Number of Features

In [None]:
print('Neat', X.shape)
print('CR', raw_CR.shape)

### Optimise decontaminated models

#### Pathogens

In [None]:
raw_CR_model, raw_CR_results, raw_CR_params = optimise(raw_CR, y)
# raw_params = {'subsample': 0.8578947368421053, 'scale_pos_weight': 1.4273504273504274, 'n_estimators': 348, 'max_depth': 7, 'gamma': 2.894736842105263, 'colsample_bytree': 0.24210526315789474}
# raw_model, raw_results = optimise(X, y, raw_params)
RA_CR_model, RA_CR_results, RA_CR_params = optimise(RA_CR, y)
# RA_params = {'subsample': 0.7157894736842105, 'scale_pos_weight': 1.4273504273504274, 'n_estimators': 456, 'max_depth': 6, 'gamma': 0.5263157894736842, 'colsample_bytree': 0.33684210526315794}
# RA_model, RA_results = optimise(X_RA, y, RA_params)

In [None]:
300620_STOPPPPP

In [None]:
# RA_SS_params = optimise(RA_SS_train, y_train)
# RA_CR_params = {'colsample_bytree': 0.2, 'gamma': 1, 'max_depth': 4, 'n_estimators': 70, 'scale_pos_weight': 1.4273504273504274, 'subsample': 0.6}

## Fit optimised models

In [None]:
# Fit optimised model on all training data

# Decontam
raw_SS_model = XGBClassifier(**raw_SS_params)
raw_SS_model.fit(X=raw_SS_train, y=y_train)

RA_SS_model = XGBClassifier(**RA_SS_params)
RA_SS_model.fit(X=RA_SS_train, y=y_train)

## Evaluate model

In [None]:
raw_SS_metric = evaluate(raw_SS_model, raw_SS_test, y_test)
RA_SS_metric = evaluate(RA_SS_model, RA_SS_test, y_test)

raw_SS_metric = evaluate(raw_SS_model, raw_SS_test, y_test)
RA_SS_metric = evaluate(RA_SS_model, RA_SS_test, y_test)

metric_df = pd.concat([raw_metric,
                       RA_metric,
                       raw_SS_metric,
                       RA_SS_metric], axis=0)
metric_df.index = ['Raw', 'RA', 'Raw SS', 'RA SS']
display(metric_df.round(3))

### Confidence Intervals (non-parametric boostrap estimates)

Bootstrap with 1001 iterations, 95% CI

In [None]:
def get_percentiles(x, alpha=0.05):
    low = np.percentile(x, alpha / 2 * 100)
    high = np.percentile(x, (1 - alpha / 2) * 100)
    
    return low, high


np.random.seed(66)
from sklearn.utils import resample


def get_confint(model, X_test, y_test, n_iter=1001):
    boot_df = pd.DataFrame({'F1': [0], 'Sensitivity': [0], 'Specificity': [0], 'AUROC' : [0]})
    
    for _ in range(n_iter):
        boot_X, boot_y = resample(X_test, y_test, n_samples=len(y_test), replace=True, stratify=y_test)
        y_pred = model.predict(boot_X)
        y_score = model.predict_proba(boot_X)[:, 1]

        sensitivity, specificity, _ = sensitivity_specificity_support(y_true=boot_y, y_pred=y_pred, average='binary')
        precision, recall, f1, _ = precision_recall_fscore_support(y_true=boot_y, y_pred=y_pred, average='binary')
        auc = roc_auc_score(y_true=boot_y, y_score=y_score)
        temp_df = pd.DataFrame({'F1': [f1], 'Sensitivity': [sensitivity], 
                                'Specificity': [specificity], 'AUROC' : [auc]})
        
        boot_df = pd.concat([boot_df, temp_df], axis=0)
    
    boot_df = boot_df.iloc[1:, :]
    
    confints = [get_percentiles(boot_df[col]) for col in boot_df.columns]
    display(pd.DataFrame(confints, 
                         columns=['2.5%', '97.5%'], 
                         index=boot_df.columns).transpose().round(3))
    

In [None]:
print('Raw:', end='')
get_confint(raw_model, raw_test, y_test)
print('RA:', end='')
get_confint(RA_model, RA_test, y_test)

print('Raw SS:', end='')
get_confint(raw_SS_model, raw_SS_test, y_test)
print('RA SS:', end='')
get_confint(RA_SS_model, RA_SS_test, y_test)

## Interpreting model using SHAP values

### Feature importance
This is a plot of mean absolute SHAP values per feature

### Plot of SHAP values per Feature

In [None]:
import matplotlib.pyplot as plt
explainer_SS = shap.TreeExplainer(raw_SS_model, feature_pertubation='interventional', model_output='probability', data=raw_SS_train)
shap_SS = explainer_SS.shap_values(raw_SS_test)

explainer_raw = shap.TreeExplainer(raw_model, feature_pertubation='interventional', model_output='probability', data=raw_train)
shap_raw = explainer_raw.shap_values(raw_test)

In [None]:
shap.summary_plot(shap_SS, raw_SS_test, show=False, plot_size=(4, 5), color_bar_label='Unique k-mer Count', max_display=35)
fig, ax = plt.gcf(), plt.gca()
ax.set_xlabel('SHAP Value')
plt.savefig(results / 'SS_shap.png', dpi=1200, format='png', bbox_inches='tight')

In [None]:
shap.summary_plot(shap_raw, raw_test, show=False, plot_size=(4, 5), color_bar_label='Unique k-mer Count', max_display=23)
fig, ax = plt.gcf(), plt.gca()
ax.set_xlabel('SHAP Value')
plt.savefig(results / 'raw_shap.png', dpi=1200, format='png', bbox_inches='tight')

* Features are ranked by importance from top to botttom
* feature values are the kmer counts for each genus
* SHAP values are the average marginal contributions to probability

### Force plot for healthy patient

In [None]:
j = 72
print(f'Actual Classification {y_test[j]}')
print(raw_SS_test.index[j])
shap.force_plot(explainer_SS.expected_value, 
                shap_SS[j,:], 
                raw_SS_test.iloc[j,:],
                show=False,
                matplotlib=True)
plt.savefig(results / 'SS_force_plot.png', dpi=1200, format='png', bbox_inches='tight')

## How much does Escherichia drive predictions?

In [None]:
escherichia_idx = raw_SS_test.columns.get_loc('Escherichia')

In [None]:
y_score = raw_SS_model.predict_proba(raw_SS_test)[:, 1]
old_auc = roc_auc_score(y_true=y_test, y_score=y_score)
new_auc = roc_auc_score(y_true=y_test, y_score=y_score - shap_SS[:, escherichia_idx])
print(f"Before Removing Escherichia = {old_auc}\nAfter Removing Escherichia = {new_auc}")