In [None]:
###############################################################
# Import dependencies
# Install any of the required AWS helper packages that are not yet present in
# the current library paths. Packages that are already available are skipped.
extra_packages_reqd <- c("aws.s3", "aws.ec2metadata")
packages_available <- library()$results[, "Package"]
for (package_current in setdiff(extra_packages_reqd, packages_available)) {
    install.packages(package_current)
}

In [None]:
###############################################################
# Read the participant meta data (with its train/val/test/long split labels)
# from S3 and derive a 0/1 version of the test result

setwd("/home/ec2-user/SageMaker/jbc-cough-in-a-box")
# config.RDS holds the root location from which all S3 object keys are built
dir_root <- readRDS("notebooks/matching/config.RDS")
Sys.setenv(AWS_DEFAULT_REGION = "eu-west-2")
meta_object <- file.path(dir_root, "BAMstudy2022-prep/meta_data_with_splits_old_format.csv")
meta <- aws.s3::s3read_using(read.csv, object = meta_object)
# 1 where the recorded result is "Positive", 0 otherwise
is_positive <- meta$test_result == "Positive"
meta$test_result_binary <- ifelse(is_positive, 1, 0)

In [None]:
##############################################################################
# Load audio classifier predictions
input_files <- c(
    "ss_predicts_standard_train_test.csv",
    "ss_predicts_standard_train_long_test.csv",
    "ss_predicts_standard_train_train.csv",
    "ss_predicts_standard_train_val.csv"
)
# Read each prediction table from S3, then stack them row-wise in file order
res_parts <- lapply(input_files, function(input_file) {
    aws.s3::s3read_using(
        read.csv,
        object = file.path(dir_root, "audio_sentences_for_matching", input_file)
    )
})
res <- do.call(rbind, res_parts)
# Look up each participant's prediction by audio_sentence; rows of meta with
# no matching prediction receive NA
res_row <- match(meta$audio_sentence, res$audio_sentence)
meta$audio_based_test_result <- res[res_row, "Positive"]

In [None]:
##############################################################################
# Fit RF classifier to symptoms
# The forest is trained on the rows whose split is "train" or "val"
is_training_row <- meta$split %in% c("train", "val")
meta_train <- meta[is_training_row, ]

# Self-reported symptom and respiratory-condition columns
all_symptoms_fields <- c(
    "symptom_none", "symptom_cough_any",
    "symptom_new_continuous_cough", "symptom_runny_or_blocked_nose",
    "symptom_shortness_of_breath", "symptom_sore_throat",
    "symptom_abdominal_pain", "symptom_diarrhoea",
    "symptom_fatigue", "symptom_fever_high_temperature",
    "symptom_headache", "symptom_change_to_sense_of_smell_or_taste",
    "symptom_loss_of_taste", "symptom_other",
    "respiratory_condition_asthma", "respiratory_condition_copd_or_emphysema",
    "respiratory_condition_other"
)

# Not referenced again within this section of the script
symptoms_fields_poss_detectable_by_acoustics <- c(
    "Sore.throat", "Runny.or.blocked.nose",
    "A.new.continuous.cough", "Cough..any.",
    "Shortness.of.breath", "COPD.or.Emphysema",
    "Other.respiratory.condition", "Asthma"
)

other_fields_poss_detect_by_acoustics <- c("age", "gender", "smoker_status", "ethnicity")

# Choose the covariate set by position: the second option is the active one
covariate_set_options <- c("symptoms_only", "symptoms_with_audible_confounders")
covariate_set <- covariate_set_options[2]
if (covariate_set == "symptoms_only") {
    rf_covariates <- c(all_symptoms_fields)
} else if (covariate_set == "symptoms_with_audible_confounders") {
    rf_covariates <- c(all_symptoms_fields, other_fields_poss_detect_by_acoustics)
}

# Outcome is the binary test result treated as a factor (classification forest)
rf_rhs <- paste(rf_covariates, collapse = " + ")
rf_formula <- as.formula(paste0("as.factor(test_result_binary) ~ ", rf_rhs))
rf_to_predict <- randomForest::randomForest(formula = rf_formula, data = meta_train)

# Second column of the class-probability matrix is stored as the prediction
rf_class_probs <- predict(object = rf_to_predict, newdata = meta, type = "prob")
meta$rf_predicted_test_result <- rf_class_probs[, 2]


In [None]:
##############################################################################
# Fit RF classifier to symptoms + audio
# Same covariates as the symptoms forest plus the audio classifier's score
rf_audio_covariates <- c(rf_covariates, "audio_based_test_result")
rf_audio_rhs <- paste(rf_audio_covariates, collapse = " + ")
rf_audio_formula <- as.formula(paste0("as.factor(test_result_binary) ~ ", rf_audio_rhs))
rf_audio_to_predict <- randomForest::randomForest(
    formula = rf_audio_formula,
    data = meta_train
)
# Second column of the class-probability matrix is stored as the prediction
rf_audio_class_probs <- predict(object = rf_audio_to_predict, newdata = meta, type = "prob")
meta$rf_audio_predicted_test_result <- rf_audio_class_probs[, 2]



In [None]:
##############################################################################
# Generate subpopulation for assessing utility by subsampling entries in data
# so they have correct symptomatic proportions for covid+/covid-
#
# For each value of prob_symptoms_covidneg, one COVID+ and one COVID- row are
# drawn repeatedly from the same gender/age-bin stratum (so each pair is matched
# on gender and age bin). The resulting data frames are stored in
# meta_util_resam_list, keyed by as.character(prob_symptoms_covidneg).

# Rows eligible for the utility analysis: split is "long" or "test" and the
# audio_sentence has a row in the audio prediction table res
meta$available_for_utility <- meta$split %in% c("long", "test")[1:2] & 
                                meta$audio_sentence %in% res$audio_sentence
meta_util <- meta[meta$available_for_utility, ]
# Stratum key of the form "<gender>_<age_binned>"
meta_util$match_string <- paste0(meta_util$gender, "_", meta_util$age_binned)
# We fix prob_symptoms_covidpos, the probability of a COVID+ being symptomatic, at 0.65
# in alignment with literature (described in our results paper)
prob_symptoms_covidpos <- 0.65
# Initialised here but not otherwise used in this block
pROC_obj_list <- list()
meta_util_resam_list <- list()
# We vary prob_symptoms_covidneg, the probability of a COVID- being symptomatic, 
# as part of the expected utility function (described in our results paper)
for (prob_symptoms_covidneg in c(0.1, .2, .3)) {
    # Accumulators: all sampled rows, sampled COVID+ rows, sampled COVID- rows
    meta_util_resam<-meta_util_resam_pos<-meta_util_resam_neg <- data.frame()
    age_bins_unique <- unique(meta$age_binned)
    # The RNG is re-seeded for every value of prob_symptoms_covidneg
    set.seed(1)
    for (age_curr in age_bins_unique) {
        sampling_possible <- TRUE
        # Keep drawing pairs for this age bin until a stratum can no longer
        # supply every test-result x symptom-status combination
        while (sampling_possible) {
            gender_tab <- table(meta_util_resam_pos$gender)
            if (length(gender_tab) == 2) {
                # Draw next from the gender with the fewest sampled COVID+ rows
                sex_curr <- names(which.min(gender_tab)) # gender_tab is table(meta_util_resam_pos$gender)
            } else {
                # Fewer than two genders sampled so far: "Female" when nothing
                # has been sampled yet, otherwise "Male"
                sex_curr <- ifelse(nrow(meta_util_resam_pos) == 0, "Female", "Male")
            }
            # Rows in the current gender/age stratum that have not been sampled yet
            meta_match_curr <- meta_util[
                which(meta_util$match_string == paste0(sex_curr, "_", age_curr) & 
                !meta_util$audio_sentence %in% meta_util_resam$audio_sentence), ]
            any_matched <- FALSE
            # Proceed only when the cross-tab of test result by no_symptoms has
            # exactly four non-empty cells, so both draws below have candidates
            if (sum(table(meta_match_curr$test_result_binary, meta_match_curr$no_symptoms) > 0) == 4) {
                any_matched <- TRUE
                # Randomly decide the symptom status of the COVID+ and COVID- draw;
                # the labels are compared against the no_symptoms column below
                covidpos_symptoms <- ifelse(runif(1) < prob_symptoms_covidpos, "Symptoms", "No symptoms")
                covidneg_symptoms <- ifelse(runif(1) < prob_symptoms_covidneg, "Symptoms", "No symptoms")
                # NOTE(review): sample(x, 1) treats a length-one numeric x as 1:x;
                # this is only safe if audio_sentence is not numeric - confirm
                covidpos_id_chosen <- sample(meta_match_curr$audio_sentence[
                    meta_match_curr$test_result_binary == 1 & meta_match_curr$no_symptoms == covidpos_symptoms], 1)
                covidneg_id_chosen <- sample(meta_match_curr$audio_sentence[
                    meta_match_curr$test_result_binary == 0 & meta_match_curr$no_symptoms == covidneg_symptoms], 1)
                meta_util_resam_pos <- rbind(meta_util_resam_pos, meta_util[meta_util$audio_sentence == covidpos_id_chosen, ])
                meta_util_resam_neg <- rbind(meta_util_resam_neg, meta_util[meta_util$audio_sentence == covidneg_id_chosen, ])
                # Combined sample is rebuilt as all positives followed by all negatives
                meta_util_resam <- rbind(meta_util_resam_pos, meta_util_resam_neg)
            }
            # No pair could be drawn for the chosen gender: move on to the next age bin
            if (!any_matched) {
                sampling_possible <- FALSE
            }
        }
    }
    meta_util_resam_list[[as.character(prob_symptoms_covidneg)]] <- meta_util_resam
}


In [None]:
##############################################################################
# Calculate ROC curves

# Result layout: pROC_obj_list[[<prob_symptoms_covidneg>]][[<prediction field>]]
pROC_obj_list <- list()
prediction_fields <- c("rf_predicted_test_result", "rf_audio_predicted_test_result", "audio_based_test_result")
# We vary prob_symptoms_covidneg, the probability of a COVID- being symptomatic, 
# as part of the expected utility function (described in our results paper)
for (prob_symptoms_covidneg in c(0.1, .2, .3)) {
    prob_key <- as.character(prob_symptoms_covidneg)
    resampled_curr <- meta_util_resam_list[[prob_key]]
    roc_by_field <- list()
    for (prediction_field in prediction_fields) {
        # NOTE(review): `smoothed`, `ci.alpha` and `stratified` do not look like
        # formal arguments of pROC::roc (its docs use `smooth`, `conf.level`,
        # `boot.stratified`); they appear to fall through `...` - confirm intent
        roc_curr <- pROC::roc(
            resampled_curr$test_result_binary,
            resampled_curr[, prediction_field],
            smoothed = TRUE,
            # arguments for ci
            ci = TRUE, ci.alpha = 0.95, stratified = FALSE,
            # arguments for plot
            plot = FALSE, auc.polygon = TRUE, max.auc.polygon = TRUE, grid = TRUE,
            print.auc = TRUE, show.thres = TRUE, quiet = TRUE
        )
        # Sensitivity confidence intervals, kept alongside the ROC object
        roc_curr$ci_output <- pROC::ci.se(roc_curr)
        roc_by_field[[prediction_field]] <- roc_curr
    }
    pROC_obj_list[[prob_key]] <- roc_by_field
}

In [None]:
##############################################################################
# Set up plot output folder
setwd('/home/ec2-user/SageMaker/jbc-cough-in-a-box')
# Guard against running from the wrong place: the working directory path must
# end in the repository folder name
if (!endsWith(getwd(), "jbc-cough-in-a-box")) {
        stop("Use setwd(<path to your jbc-cough-in-a-box folder>)")
}
plot_dir <- file.path("plotting", "audio_added_utility")
# recursive = TRUE also creates the parent "plotting" folder when it is missing;
# without it the call fails silently (warnings are suppressed) and the later
# pdf() call has no folder to write into
dir.create(plot_dir, showWarnings = FALSE, recursive = TRUE)
Sys.setenv("AWS_DEFAULT_REGION" = "eu-west-2")

In [None]:
##############################################################################
# Create plot of ROC curves and utility functions
#
# One PDF is written per value of delta. Each PDF has two rows of three panels:
# the top row shows ROC curves with sensitivity confidence bars, the bottom row
# shows maximum expected utility against prevalence; columns correspond to the
# values of prob_symptoms_covidneg.

# We vary delta as part of the expected utility function (described in our results paper).
# delta has to enter the utility formula as a scalar: passing the whole vector
# would recycle it element-wise over the prevalence x threshold grid and mix
# the delta values within one curve. Hence one figure per delta value.
delta <- c(0, 0.25)
colv <- c(rf = "black", rf_audio = "blue", audio = "red")
classif_names <- c(rf = "Symptoms", audio = "Audio", rf_audio = "Symptoms+Audio")
col_prediction_field <- colv[c("rf", "rf_audio", "audio")]
names(col_prediction_field) <- c("rf_predicted_test_result", "rf_audio_predicted_test_result", "audio_based_test_result")
# Classifier order used for the curves and for the columns of the utility matrix
prediction_fields <- names(col_prediction_field)
# Order in which classifiers are listed in the legends
legend_order <- c("audio", "rf", "rf_audio")
legcex <- .9
letter_num <- 0

#' Add a panel letter to the current plot
#'
#' Writes "(a)", "(b)", ... just outside the top-left corner of the current
#' plot region. The letter is taken from the global counter `letter_num`,
#' which is incremented on every call.
#' @param ploteps An object of class "numeric". Scales distance of letter from plot
#' @param letcex An object of class "numeric". Character expansion for letter
#' @return Returns an object of class "NULL" (invisibly).
add_letter <- function(ploteps = .05, letcex = 1.4) {
    coords <- par("usr")
    # The counter lives in the calling script so it persists between panels
    letter_num <<- letter_num + 1
    # Allow drawing outside the plot region, restoring the old setting on exit
    old_par <- par(xpd = NA)
    on.exit(par(old_par), add = TRUE)
    text(x = coords[1] - diff(coords[1:2]) * ploteps,
         y = coords[4] + diff(coords[3:4]) * ploteps,
         labels = paste0("(", letters[letter_num], ")"),
         cex = letcex)
    invisible(NULL)
}

# We vary prob_symptoms_covidneg, the probability of a COVID- being symptomatic,
# as part of the expected utility function (described in our results paper)
prob_symptoms_covidneg_values <- c(0.1, .2, .3)
# We vary epsilon (epsv), reproduction number Rt (Rtv), and prevalence (prev_seq)
# as part of the expected utility function (described in our results paper)
epsv <- c(0.02, .2)
Rtv <- c(1.5, 1)
prev_seq <- seq(0, .05, by = .001)
do_tab <- expand.grid(Rt = Rtv, eps = epsv)
ltyv <- c(1, 2, 4, 5)
# Half-width of the horizontal caps on the confidence bars
ci_cap_halfwidth <- .01
cexax <- .7

for (delta_curr in delta) {
    pdf(file.path(plot_dir, paste0("roc_and_utility_", covariate_set, "_delta_", delta_curr, ".pdf")), 12, 8)
    par(mfrow = c(2, 3), mar = c(4, 4, 3, 3), oma = c(1, 1, 3, 1))
    # Panel letters restart at (a) in every figure
    letter_num <- 0

    # Top row: ROC curves
    for (prob_symptoms_covidneg in prob_symptoms_covidneg_values) {
        roc_list_curr <- pROC_obj_list[[as.character(prob_symptoms_covidneg)]]
        # x axis is minus specificity so that specificity runs from 1 down to 0
        plot(0, type = "n", xlim = c(-1, 0), ylim = c(0, 1.2), xlab = "Specificity", ylab = "Sensitivity",
             xaxt = "n", yaxt = "n", yaxs = "r")
        add_letter()
        axis(side = 1, at = seq(-1, 0, by = .2), labels = formatC(seq(1, 0, by = -.2), format = "f", digits = 1))
        axis(side = 2, at = seq(0, 1, by = .2), labels = formatC(seq(0, 1, by = .2), format = "f", digits = 1))
        for (prediction_field in prediction_fields) {
            col_curr <- col_prediction_field[prediction_field]
            roc_curr <- roc_list_curr[[prediction_field]]
            lines(x = -roc_curr$specificities, y = roc_curr$sensitivities, col = col_curr)
            # Confidence bars: columns 1 and 3 of ci_output are used as the
            # lower and upper sensitivity limits at each listed specificity
            ci_out <- roc_curr$ci_output
            spec_ats <- attr(ci_out, "specificities")
            for (i in seq_along(spec_ats)) {
                lines(x = rep(-spec_ats[i], 2), y = ci_out[i, c(1, 3)], col = col_curr)
                lines(x = -spec_ats[i] + c(-1, 1) * ci_cap_halfwidth, y = ci_out[i, c(1, 1)], col = col_curr)
                lines(x = -spec_ats[i] + c(-1, 1) * ci_cap_halfwidth, y = ci_out[i, c(3, 3)], col = col_curr)
            }
        }
        auc_ci_rf <- roc_list_curr[["rf_predicted_test_result"]]$ci
        auc_ci_audio <- roc_list_curr[["audio_based_test_result"]]$ci
        auc_ci_rf_audio <- roc_list_curr[["rf_audio_predicted_test_result"]]$ci
        # Legend text: element 2 of each interval is shown as the AUC, elements
        # 1 and 3 as its limits
        leg_lab <- paste0(
            "ROC-AUC: ", formatC(c(auc_ci_audio[2], auc_ci_rf[2], auc_ci_rf_audio[2]), format = "f", digits = 3),
            " (", formatC(c(auc_ci_audio[1], auc_ci_rf[1], auc_ci_rf_audio[1]), format = "f", digits = 3), "-",
            formatC(c(auc_ci_audio[3], auc_ci_rf[3], auc_ci_rf_audio[3]), format = "f", digits = 3), ") - ",
            classif_names[legend_order])
        # Diagonal reference line in the (-specificity, sensitivity) coordinates
        abline(1, 1, col = "grey")
        legend(x = "topleft", legend = leg_lab, text.col = colv[legend_order],
               bg = "white", cex = legcex, xjust = 0, lty = 1, col = colv[legend_order])
        mtext(side = 3, line = 1.5, text = paste0(100 * prob_symptoms_covidneg, "% of COVID negatives symptomatic"))
    }

    # Bottom row: maximum expected utility against prevalence
    for (prob_symptoms_covidneg in prob_symptoms_covidneg_values) {
        roc_list_curr <- pROC_obj_list[[as.character(prob_symptoms_covidneg)]]
        # One column per (scenario, classifier), classifiers varying fastest
        max_eu_mat <- matrix(NA_real_, nrow = length(prev_seq),
                             ncol = nrow(do_tab) * length(prediction_fields))
        col_idx <- 0
        for (scen in seq_len(nrow(do_tab))) {
            Rt <- do_tab[scen, "Rt"]
            eps <- do_tab[scen, "eps"]
            for (prediction_field in prediction_fields) {
                obj_curr <- roc_list_curr[[prediction_field]]
                sens <- obj_curr$sensitivities
                spec <- obj_curr$specificities
                #' Calculate expected utility
                #'
                #' Evaluates the expected utility at prevalence `pi` for the
                #' operating point `i` of the current ROC curve, using the
                #' current scalar values of Rt, eps and delta_curr.
                #' @param pi An object of class "numeric". Prevalence proportion
                #' @param i An object of class "numeric". Index to extract entries from sens and spec vectors
                #' @return Returns an object of class "numeric". Expected utility
                f2 <- function(pi, i) {
                    pi * ((Rt - eps + delta_curr) * sens[i] - delta_curr) +
                        (1 - pi) * (eps * spec[i] - eps)
                }
                # Rows: prevalence values; columns: operating points of the curve
                temp_mat <- outer(prev_seq, seq_along(sens), f2)
                col_idx <- col_idx + 1
                # Best achievable utility over all operating points
                max_eu_mat[, col_idx] <- apply(temp_mat, 1, max)
            }
        }
        matplot(100 * prev_seq, max_eu_mat, type = "l",
                col = rep(unname(col_prediction_field[prediction_fields]), times = nrow(do_tab)),
                lty = rep(ltyv, each = length(prediction_fields)),
                xlab = "Prevalence (%)", ylab = "", yaxs = "r")
        add_letter()
        mtext(side = 2, text = "Maximum expected utility", line = 3, cex = cexax)
        mtext(side = 2, text = "(# infections prevented per test)", line = 2.25, cex = cexax * .75)
        leglab <- paste0("Rt = ", do_tab$Rt, ", eps = ", do_tab$eps)
        legend(x = "topright", legend = leglab,
               lty = ltyv, cex = legcex, bg = "white")
        legend(x = "topleft", legend = classif_names[legend_order],
               col = colv[legend_order],
               cex = legcex, bg = "white", text.col = colv[legend_order])
    }
    dev.off()
}

In [None]:
##############################################################################
# Calculate p-values for comparing ROC curves
# We vary prob_symptoms_covidneg, the probability of a COVID- being symptomatic, 
# as part of the expected utility function (described in our results paper)
prob_keys <- as.character(c(0.1, .2, .3))
names(prob_keys) <- prob_keys

# DeLong test p-value comparing two stored ROC curves for one value of
# prob_symptoms_covidneg (given as the character key used in pROC_obj_list)
delong_p_value <- function(prob_key, field_a, field_b) {
    roc_pair <- pROC_obj_list[[prob_key]]
    pROC::roc.test(roc_pair[[field_a]], roc_pair[[field_b]], method = "delong")$p.value
}

# Audio classifier alone versus the symptoms forest
pval_audio_vs_sym <- vapply(
    prob_keys,
    function(prob_key) {
        delong_p_value(prob_key, "audio_based_test_result", "rf_predicted_test_result")
    },
    numeric(1)
)
# Symptoms + audio forest versus the symptoms forest
pval_add_audio_to_sym <- vapply(
    prob_keys,
    function(prob_key) {
        delong_p_value(prob_key, "rf_audio_predicted_test_result", "rf_predicted_test_result")
    },
    numeric(1)
)
pval_add_audio_to_sym
pval_audio_vs_sym
pval_out <- list(add_audio_to_sym = pval_add_audio_to_sym,
                 audio_vs_sym = pval_audio_vs_sym)
saveRDS(pval_out, file = file.path(plot_dir, paste0("pvals_", covariate_set, ".RDS")))

In [None]:
# Display the list of p-values that was saved to disk (auto-printed)
pval_out
