# Select batch control data for samples

## Load packages

In [1]:
# Attach a package while hiding its startup messages.
# All arguments are forwarded unchanged to library().
quiet_library <- function(...) {
  suppressPackageStartupMessages(
    library(...)
  )
}
quiet_library(hise)
quiet_library(dplyr)
quiet_library(purrr)

In [2]:
# Make sure the directory for final outputs exists.
out_dir <- "output"
if (!dir.exists(out_dir)) {
  dir.create(out_dir)
}

In [3]:
# Make sure the staging directories for control data and QC reports exist.
for (dir_name in c("batch_control_h5", "batch_reports")) {
  if (!dir.exists(dir_name)) {
    dir.create(dir_name)
  }
}

In [4]:
# Return the local path of a cached HISE file, fetching it first if needed.
#
# Files are looked up under "cache/<uuid>/" relative to the working directory.
# When that directory does not exist, cacheFiles() is called for the UUID
# (presumably it populates that directory - confirm against the hise docs).
#
# @param uuid A single HISE file UUID (character).
# @return Path of the first file found in the cache directory, or
#   NA_character_ when the directory contains no files.
cache_uuid_path <- function(uuid) {
    cache_dir <- paste0("cache/", uuid)
    if (!dir.exists(cache_dir)) {
        cacheFiles(list(uuid))
    }
    # R vectors are 1-indexed: `[0]` always yields character(0), so the first
    # file has to be selected with `[1]`.
    cache_path <- list.files(cache_dir, full.names = TRUE)[1]
    cache_path
}

## Retrieve sample metadata

In an earlier step, we assembled and stored sample metadata in HISE. We'll pull this file and use it to retrieve the files for our labeling process.

In [5]:
# HISE file UUID of the sample metadata that was stored in an earlier step
sample_meta_uuid <- "d82c5c42-ae5f-4e67-956e-cd3b7bf88105"

In [6]:
# Fetch the sample metadata file from HISE into the local cache.
res <- cacheFiles(list(sample_meta_uuid))
# `pattern` is a regular expression: escape the dot and anchor the match so
# that only file names ending in ".csv" are selected.
sample_meta_file <- list.files(
    paste0("cache/", sample_meta_uuid),
    pattern = "\\.csv$",
    full.names = TRUE
)
# read.csv() needs exactly one path; fail here with a clear message instead of
# a cryptic error further down.
if (length(sample_meta_file) != 1) {
    stop(
        "Expected exactly one .csv file in cache/", sample_meta_uuid,
        " but found ", length(sample_meta_file),
        call. = FALSE
    )
}

Failed to download files:



In [7]:
# Load the cached sample metadata CSV into a data frame
hise_meta <- read.csv(sample_meta_file)

In [8]:
# Preview the pool IDs stored in the metadata.
# NOTE(review): file.pool is recomputed from file.name further down; if the CSV
# has no such column yet, this prints NULL - confirm the column exists.
head(hise_meta$file.pool)

## Locate batch control data

Batch control data are generated using the same subject across all batches. We can locate these datasets using the set of unique Batch IDs in our sample metadata, and select our control subject.subjectGuid values of "HMN169517" and "HMN200910". There should be one batch control sample for each pool of cells, which means there can be either 1 or 2 control datasets for each batch.

In [9]:
# Distinct batch IDs present in the sample metadata
batch_ids <- unique(hise_meta$file.batchID)
# Preview the first few
head(batch_ids)

In [10]:
# Number of distinct batches in the sample metadata
length(batch_ids)

In [11]:
# Query HISE for "scRNA-seq-labeled" files belonging to our batches and
# convert the returned descriptors to a data frame in one step.
batch_desc <- fileDescToDataframe(
  getFileDescriptors(
    fileType = "scRNA-seq-labeled",
    filter = list(file.batchID = batch_ids)
  )
)

In [12]:
# subject.subjectGuid values of the batch control subjects
control_subjects <- c("HMN169517", "HMN200910")

In [13]:
# Keep only the descriptors that belong to one of the control subjects.
control_meta <- filter(
  batch_desc,
  subject.subjectGuid %in% control_subjects
)

In [14]:
# Number of file descriptors belonging to the control subjects
nrow(control_meta)

Are all batches represented?

In [15]:
# Count batch IDs without any control dataset; 0 means every batch is covered
length(setdiff(batch_ids, control_meta$file.batchID))

### Refine from batch to pool

We can use file names to refine our controls to just the specific pools within our batches that were used for our full dataset.

In [16]:
# The pool ID is the part of the file's base name before the first underscore.
sample_pools <- sub("_.+", "", basename(hise_meta$file.name))
hise_meta$file.pool <- sample_pools
pool_ids <- unique(sample_pools)

In [17]:
# Number of distinct pools in the sample metadata
length(pool_ids)

In [18]:
# Derive each control's pool ID from its file name (text before the first
# underscore), then keep only controls from pools used in our dataset.
control_meta <- control_meta %>%
  mutate(file.pool = sub("_.+", "", basename(file.name))) %>%
  filter(file.pool %in% pool_ids)

In [19]:
# Number of control datasets left after restricting to our pools
nrow(control_meta)

In [20]:
# Count pools that have no matching control dataset
length(setdiff(pool_ids, control_meta$file.pool))

In [21]:
# List the pools that have no matching control dataset
setdiff(pool_ids, control_meta$file.pool)

Investigated this batch - it looks like P1 and P2 were both labeled as P1. This should be OK for our purposes, but we may need to provide a description of this discrepancy.

## Retrieve batch control data

In [22]:
# Make sure every control file is present in the local cache.
res <- map(control_meta$file.id, cache_uuid_path)

## Assemble batch control data

In [23]:
# Copy each cached batch control file into batch_control_h5/, named by pool.
walk2(
    control_meta$file.id, control_meta$file.pool,
    function(file_id, pool) {
        control_file <- list.files(paste0("cache/", file_id), full.names = TRUE)
        out_file <- paste0("batch_control_h5/", pool, "_batch_control.h5")
        # file.copy() reports failure only through its return value: FALSE for
        # a missing source or an already existing destination (overwrite
        # defaults to FALSE), and logical(0) when no source file was found.
        # Surface that instead of silently dropping the file.
        copied <- file.copy(control_file, out_file)
        if (length(copied) == 0 || !all(copied)) {
            warning(
                "Could not copy cached file for pool ", pool,
                " (file.id ", file_id, ") to ", out_file,
                call. = FALSE
            )
        }
    }
)

In [24]:
# Bundle the staged batch control files into a dated tar archive in output/.
control_data_tar <- paste0("output/diha_batch_control_h5_", Sys.Date(), ".tar")
# The trailing "/*" glob is expanded by the shell that system() invokes.
tar_call <- paste("tar -cf", control_data_tar, "batch_control_h5/*")
# system() returns the command's exit status; stop on failure so a missing or
# partial archive is never uploaded later.
tar_status <- system(tar_call)
if (tar_status != 0) {
    stop(
        "tar failed with exit status ", tar_status, ": ", tar_call,
        call. = FALSE
    )
}

## Locate Pool QC reports

QC reports are also generated for every pool we process. To assist in assessing our data quality, we'll retrieve these and bundle them along with our batch controls.

In [25]:
# Query HISE for all cell hashing main reports and convert the returned
# descriptors to a data frame in one step.
report_desc <- fileDescToDataframe(
  getFileDescriptors(fileType = "scRNA-seq-CellHashing-Main-report")
)

In [26]:
# Keep reports for the pools in our dataset, then drop exact duplicate rows.
pool_reports <- filter(report_desc, file.pool %in% pool_ids)
report_meta <- unique(pool_reports)

In [27]:
# Number of report rows matching our pools
nrow(report_meta)

Some pools are duplicated - let's get the latest version for each pool by sorting on the file.name, which includes a timestamp.

In [28]:
# Sort newest first (file names include a timestamp, so a descending sort on
# file.name puts the latest report on top), then keep the first row per pool.
reports_newest_first <- arrange(report_meta, desc(file.name))
report_meta <- reports_newest_first %>%
  group_by(file.pool) %>%
  slice_head(n = 1) %>%
  ungroup()

In [29]:
# Number of reports after keeping one per pool
nrow(report_meta)

In [30]:
# Count pools that have no QC report
length(setdiff(pool_ids, report_meta$file.pool))

## Retrieve pool reports

In [31]:
# Make sure every selected QC report is present in the local cache.
res <- map(report_meta$file.id, cache_uuid_path)

## Assemble batch reports

In [32]:
# Copy each cached QC report into batch_reports/, named by pool.
walk2(
    report_meta$file.id, report_meta$file.pool,
    function(file_id, pool) {
        report_file <- list.files(paste0("cache/", file_id), full.names = TRUE)
        out_file <- paste0("batch_reports/", pool, "_qc_report.html")
        # file.copy() reports failure only through its return value: FALSE for
        # a missing source or an already existing destination (overwrite
        # defaults to FALSE), and logical(0) when no source file was found.
        # Surface that instead of silently dropping the report.
        copied <- file.copy(report_file, out_file)
        if (length(copied) == 0 || !all(copied)) {
            warning(
                "Could not copy cached report for pool ", pool,
                " (file.id ", file_id, ") to ", out_file,
                call. = FALSE
            )
        }
    }
)

In [33]:
# Bundle the staged QC reports into a dated tar archive in output/.
report_tar <- paste0("output/diha_batch_report_html_", Sys.Date(), ".tar")
# The trailing "/*" glob is expanded by the shell that system() invokes.
tar_call <- paste("tar -cf", report_tar, "batch_reports/*")
# system() returns the command's exit status; stop on failure so a missing or
# partial archive is never uploaded later.
tar_status <- system(tar_call)
if (tar_status != 0) {
    stop(
        "tar failed with exit status ", tar_status, ": ", tar_call,
        call. = FALSE
    )
}

## Upload data to HISE

In [34]:
# UUID passed to uploadFiles() as studySpaceId
study_space_uuid <- "de025812-5e73-4b3c-9c3b-6d0eac412f2a"
# Upload title, suffixed with today's date
title <- paste("Batch Control Data and QC Reports", Sys.Date())

In [35]:
# Generate a random three-word proquint identifier; it is passed to
# uploadFiles() as `destination`
search_id <- ids::proquint(n_words = 3)
# Print it so the identifier is recorded in the notebook output
search_id

In [36]:
# UUIDs of every file that went into the outputs: the sample metadata, the
# batch control files, and the QC reports.
input_uuids <- c(
  sample_meta_uuid,
  control_meta$file.id,
  report_meta$file.id
)
in_list <- as.list(input_uuids)

In [37]:
# Paths of the two tar archives to upload
out_list <- list(control_data_tar, report_tar)

In [38]:
# Show the archive paths that will be uploaded
out_list

In [41]:
# Upload the two tar archives in out_list to HISE. As the recorded output of
# this cell shows, the call prompts for a (y/n) confirmation before uploading.
uploadFiles(
    files = out_list,
    studySpaceId = study_space_uuid,
    title = title,
    # UUIDs of the metadata, control, and report files collected in in_list
    inputFileIds = in_list,
    store = "project",
    # the random proquint identifier generated earlier
    destination = search_id
)

You are trying to upload the following files:  output/diha_batch_control_h5_2024-08-09.tar output/diha_batch_report_html_2024-08-09.tar



(y/n) y


[1] "Authorization token invalid or expired."
[1] "Retrying..."


In [40]:
# Record the R version, platform, and package versions used for this run
sessionInfo()

R version 4.3.2 (2023-10-31)
Platform: x86_64-conda-linux-gnu (64-bit)
Running under: Ubuntu 20.04.6 LTS

Matrix products: default
BLAS/LAPACK: /opt/conda/lib/libopenblasp-r0.3.25.so;  LAPACK version 3.11.0

locale:
 [1] LC_CTYPE=C.UTF-8       LC_NUMERIC=C           LC_TIME=C.UTF-8       
 [4] LC_COLLATE=C.UTF-8     LC_MONETARY=C.UTF-8    LC_MESSAGES=C.UTF-8   
 [7] LC_PAPER=C.UTF-8       LC_NAME=C              LC_ADDRESS=C          
[10] LC_TELEPHONE=C         LC_MEASUREMENT=C.UTF-8 LC_IDENTIFICATION=C   

time zone: Etc/UTC
tzcode source: system (glibc)

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
[1] purrr_1.0.2 dplyr_1.1.4 hise_2.16.0

loaded via a namespace (and not attached):
 [1] ids_1.0.1         crayon_1.5.2      vctrs_0.6.5       httr_1.4.7       
 [5] cli_3.6.3         rlang_1.1.4       stringi_1.8.3     generics_0.1.3   
 [9] assertthat_0.2.1  jsonlite_1.8.8    glue_1.7.0        RCurl_1.98-1.14 