In [None]:
###############################################################
# Import dependencies
rm(list = ls())
library(reticulate)
sagemaker <- import("sagemaker")
pd <- import("pandas")

# Install whichever of the extra packages are not yet present in the library
extra_packages_reqd <- c("aws.s3", "aws.ec2metadata")
packages_available <- library()$results[, "Package"]
for (package_current in setdiff(extra_packages_reqd, packages_available)) {
    install.packages(package_current)
}

In [None]:
###############################################################
# Load meta data files to be populated with matched test/train splits
Sys.setenv("AWS_DEFAULT_REGION" = "eu-west-2")
setwd("/home/ec2-user/SageMaker/jbc-cough-in-a-box")
dir_root <- readRDS(file.path("notebooks/matching/config.RDS"))

# Read one CSV object located below dir_root via aws.s3 into a data.frame
read_s3_csv <- function(relative_key) {
    aws.s3::s3read_using(read.csv, object = file.path(dir_root, relative_key))
}

meta_splits <- read_s3_csv("BAMstudy2022-prep/train_test_splits_stage2.csv")
meta_part <- read_s3_csv("BAMstudy2022-prep/participant_metadata_220922.csv")
meta_audio <- read_s3_csv("BAMstudy2022-prep/audio_metadata_220922.csv")



In [None]:
###############################################################
# Create combined meta file

# Append the audio and split metadata columns to the participant metadata,
# aligning rows on participant_identifier (first match in each table)
meta_new <- meta_part
rows_in_audio <- match(meta_new$participant_identifier,
                       meta_audio$participant_identifier)
meta_new <- cbind(meta_new, meta_audio[rows_in_audio, ])
rows_in_splits <- match(meta_new$participant_identifier,
                        meta_splits$participant_identifier)
meta_new <- cbind(meta_new, meta_splits[rows_in_splits, ])

# Mapping from the column names present in the loaded files ("new") to the
# old-format names used downstream ("old"); binary_to_yesno marks the columns
# whose value 1 is recoded to "Yes" and anything else to "No"
name_map <- data.frame(
    old = c("Cough..any.", "test_result",
            "Asthma",
            "COPD.or.Emphysema",
            "age", "gender", "smoker_status",
            "Runny.or.blocked.nose",
            "Shortness.of.breath",
            "Sore.throat",
            "audio_sentence"),
    new = c("symptom_cough_any", "covid_test_result",
            "respiratory_condition_asthma",
            "respiratory_condition_copd_or_emphysema",
            "age", "gender", "smoker_status",
            "symptom_runny_or_blocked_nose",
            "symptom_shortness_of_breath",
            "symptom_sore_throat",
            "sentence_file_name"),
    binary_to_yesno = c(TRUE, FALSE,
                        TRUE,
                        TRUE,
                        FALSE, FALSE, FALSE,
                        TRUE,
                        TRUE,
                        TRUE,
                        FALSE))

# Fill every old-format column from its new-format source, recoding the
# binary ones on the way
for (i in seq_len(nrow(name_map))) {
    column_values <- meta_new[, name_map$new[i]]
    if (name_map$binary_to_yesno[i]) {
        column_values <- ifelse(column_values == 1, "Yes", "No")
    }
    meta_new[, name_map$old[i]] <- column_values
}

# Ten-year age bins: [0,9], [10,19], ..., [100,109]
age_lower <- seq(0, 100, by = 10)
age_df <- data.frame(l = age_lower, u = age_lower + 9)
age_df$age_binned <- paste0("[", age_df$l, ",", age_df$u, "]")

# The top-coded age "94+" is treated as 94 so that it can be binned
is_top_coded_age <- meta_new$age == "94+"
meta_new$age[is_top_coded_age] <- 94
age_bin_index <- findInterval(as.numeric(meta_new$age), age_df$l)
meta_new$age_binned <- age_df$age_binned[age_bin_index]

# Derived grouping labels used in the matching and summary tables
meta_new$react_or_tt <- ifelse(meta_new$recruitment_source == "Test and Trace",
                               "T+T", "REACT")
meta_new$cough <- ifelse(meta_new$Cough..any. == "Yes", "Cough", "No cough")
meta_new$no_symptoms <- ifelse(meta_new$symptom_none, "No symptoms", "Symptoms")

print("Splits")
table(meta_new$splits)
print("Original splits")
table(meta_new$original_splits)

In [None]:
###############################################################
# Explore differences in numbers between train and longit test

# Row selectors for the two groups being compared
in_long_test <- meta_new$splits == "long"
in_train_or_val <- meta_new$splits %in% c("train", "val")

# Test result against symptom status
print("Longitudinal test")
table(meta_new[in_long_test, c("test_result", "no_symptoms")])
print("Train")
table(meta_new[in_train_or_val, c("test_result", "no_symptoms")])

# Test result against recruitment route
print("Longitudinal test")
table(meta_new[in_long_test, c("test_result", "react_or_tt")])
print("Train")
table(meta_new[in_train_or_val, c("test_result", "react_or_tt")])

In [None]:
###############################################################
# Perform matching analysis where we ensure we have 1:1 ratio of covid+/covid- 
# at each level of metadata covariates

meta <- meta_new

# These are the variables we match on in the test set
matched_vars_test_set <- c("react_or_tt", 
                            "gender", 
                            "age_binned", 
                            "cough", 
                            "no_symptoms",
                            "Sore.throat",
                            "Asthma",
                            "Shortness.of.breath",
                            "Runny.or.blocked.nose")
# These are the variables we match on in the train set
matched_vars_train_set <- c("gender", 
                            "age_binned",
                            "cough", 
                            "Sore.throat", 
                            "Asthma",
                            "Shortness.of.breath", 
                            "Runny.or.blocked.nose", 
                            "COPD.or.Emphysema", 
                            "smoker_status")

# One entry per matched set: the column holding the split labels, the labels
# that make up the set, and the covariates to match on. The entry names become
# the suffixes of the in_matched_* / stratum_matched_* columns added to meta.
matched_set_specs <- list(
    original_test = list(split_field = "original_splits",
                         split_labels = c("test"),
                         matched_vars = matched_vars_test_set),
    rebalanced_test = list(split_field = "splits",
                           split_labels = c("test"),
                           matched_vars = matched_vars_test_set),
    original_long_test = list(split_field = "original_splits",
                              split_labels = c("long"),
                              matched_vars = matched_vars_test_set),
    rebalanced_long_test = list(split_field = "splits",
                                split_labels = c("long"),
                                matched_vars = matched_vars_test_set),
    original_train = list(split_field = "original_splits",
                          split_labels = c("train", "val"),
                          matched_vars = matched_vars_train_set),
    rebalanced_train = list(split_field = "splits",
                            split_labels = c("train", "val"),
                            matched_vars = matched_vars_train_set)
)

# Select a 1:1 Positive/Negative matched subset of the rows of `meta_set`.
#
# Rows are grouped into strata by pasting together their values of
# `matched_vars`. Within each stratum holding at least one "Positive" and one
# "Negative" test_result, the first n rows of each class (in audio_sentence
# order) are kept, where n is the smaller of the two class counts.
#
# Arguments:
#   meta_set     data.frame with columns audio_sentence, test_result and all
#                of `matched_vars`
#   matched_vars character vector of column names defining the strata
# Returns:
#   data.frame with columns audio_sentence and stratum, one row per selected
#   recording; zero rows if nothing can be matched.
# Stops if the same audio_sentence would be selected in two different strata.
select_matched_pairs <- function(meta_set, matched_vars) {
    no_matches <- data.frame(audio_sentence = character(0),
                             stratum = character(0),
                             stringsAsFactors = FALSE)
    if (nrow(meta_set) == 0) {
        return(no_matches)
    }

    # Order by audio_sentence here for reproducibility. Selection below is
    # purely "first n in this order", so no random seed is involved.
    # NOTE(review): order() on character vectors depends on the collation
    # locale -- confirm the locale is fixed wherever this is re-run.
    meta_set <- meta_set[order(meta_set$audio_sentence), ]
    match_string <- apply(meta_set[, matched_vars, drop = FALSE], 1,
                          paste, collapse = "_")

    # Fixing the factor levels guarantees both count columns exist (and keeps
    # a two-dimensional table) even with a single stratum or an absent class
    result_levels <- c("Negative", "Positive")
    match_tab <- table(match_string,
                       factor(meta_set$test_result, levels = result_levels))
    strata <- rownames(match_tab)
    n_from_each_of_neg_pos <- pmin(as.vector(match_tab[, "Negative"]),
                                   as.vector(match_tab[, "Positive"]))

    is_positive <- meta_set$test_result %in% "Positive"
    is_negative <- meta_set$test_result %in% "Negative"

    selected <- vector("list", length(strata))
    used_audio_sentences <- c()
    for (j in seq_along(strata)) {
        n_take <- n_from_each_of_neg_pos[j]
        if (n_take == 0) {
            next
        }
        in_stratum <- match_string == strata[j]
        take <- seq_len(n_take)
        new_audio_sentences <- c(
            meta_set$audio_sentence[in_stratum & is_positive][take],
            meta_set$audio_sentence[in_stratum & is_negative][take])
        # Sanity check: a recording must never be selected in two strata
        reused <- new_audio_sentences %in% used_audio_sentences
        if (any(reused)) {
            stop("audio_sentence selected in more than one stratum (stratum ",
                 j, ": '", strata[j], "'): ",
                 paste(unique(new_audio_sentences[reused]), collapse = ", "),
                 call. = FALSE)
        }
        used_audio_sentences <- c(used_audio_sentences, new_audio_sentences)
        selected[[j]] <- data.frame(audio_sentence = new_audio_sentences,
                                    stratum = strata[j],
                                    stringsAsFactors = FALSE)
    }

    selected <- Filter(Negate(is.null), selected)
    if (length(selected) == 0) {
        return(no_matches)
    }
    do.call(rbind, selected)
}

# Perform matching for each relevant set in turn
for (set_curr in names(matched_set_specs)) {
    spec_curr <- matched_set_specs[[set_curr]]
    in_set <- which(meta[[spec_curr$split_field]] %in% spec_curr$split_labels)
    meta_curr <- meta[in_set, ]
    stratum_df <- select_matched_pairs(meta_curr, spec_curr$matched_vars)

    in_field_name <- paste0("in_matched_", set_curr)
    stratum_field_name <- paste0("stratum_matched_", set_curr)
    meta[, in_field_name] <- FALSE
    meta[, stratum_field_name] <- NA_character_
    if (nrow(stratum_df) > 0) {
        # Flag the selected recordings in the full table and record the
        # stratum each one was matched in
        rows_in_meta <- match(stratum_df$audio_sentence, meta$audio_sentence)
        meta[rows_in_meta, in_field_name] <- TRUE
        meta[rows_in_meta, stratum_field_name] <- stratum_df$stratum
    }
    # Values are not auto-printed inside a loop, so print explicitly
    print(head(meta[meta[, in_field_name], c(in_field_name, stratum_field_name)]))
}
sum(meta$in_matched_rebalanced_test)
sum(meta$in_matched_original_test)
sum(meta$in_matched_rebalanced_long_test)
sum(meta$in_matched_original_long_test)
sum(meta$in_matched_rebalanced_train)
sum(meta$in_matched_original_train)

In [None]:
###############################################################
# Write new meta file (not over-writing imported data file)
new_splits_fields <- c("in_matched_rebalanced_test", "in_matched_original_test",
                       "in_matched_rebalanced_long_test", "in_matched_original_long_test",
                       "in_matched_rebalanced_train", "in_matched_original_train")

# Copy the matched-set flags onto the splits table, aligning rows on
# participant_identifier
rows_in_meta <- match(meta_splits$participant_identifier,
                      meta$participant_identifier)
meta_splits[, new_splits_fields] <- meta[rows_in_meta, new_splits_fields]

# Write a data.frame below dir_root via aws.s3 as comma-separated text with a
# header row and no row names
write_s3_csv <- function(x, relative_key) {
    aws.s3::s3write_using(x = x, FUN = write.table, sep = ",",
                          row.names = FALSE,
                          col.names = TRUE,
                          object = file.path(dir_root, relative_key))
}

write_s3_csv(meta_splits, "BAMstudy2022-prep/train_test_splits_stage3.csv")
write_s3_csv(meta, "BAMstudy2022-prep/meta_data_with_splits_old_format.csv")
