# Select batch control data for samples

## Load packages

In [1]:
# Attach a package while silencing its startup messages.
# All arguments are forwarded unchanged to library().
quiet_library <- function(...) {
    suppressPackageStartupMessages(library(...))
}
quiet_library(hise)
quiet_library(dplyr)
quiet_library(purrr)

In [2]:
# Make sure the directory that receives the final tar archives exists
if (!dir.exists("output")) dir.create("output")

In [3]:
# Working directories for the renamed control .h5 files and QC report copies
for (work_dir in c("batch_control_h5", "batch_reports")) {
    if (!dir.exists(work_dir)) {
        dir.create(work_dir)
    }
}

## Retrieve sample metadata

In an earlier step, we assembled and stored sample metadata in HISE. We'll pull this file, and use it to retrieve files for our labeling process.

In [4]:
# HISE file ID of the sample metadata CSV assembled in an earlier step
sample_meta_uuid <- "2da66a1a-17cc-498b-9129-6858cf639caf"

In [5]:
# Download the sample metadata file into the local cache/ directory
res <- cacheFiles(list(sample_meta_uuid))

# Locate the cached CSV. `pattern` is a regular expression, so the dot is
# escaped and the match is anchored to the end of the file name; a bare
# ".csv" would also match any name containing "csv" after any character.
sample_meta_file <- list.files(
    paste0("cache/", sample_meta_uuid),
    pattern = "\\.csv$",
    full.names = TRUE
)

# read.csv() below needs a single path; fail here with a clear message instead
if (length(sample_meta_file) != 1) {
    stop(
        "Expected exactly 1 .csv file in cache/", sample_meta_uuid,
        " but found ", length(sample_meta_file),
        call. = FALSE
    )
}

[1] "Initiating file download for ref_h5_meta_data_2024-02-18.csv"
[1] "Download successful."


In [6]:
# Load the sample metadata table from the cached CSV
hise_meta <- read.csv(sample_meta_file)

In [7]:
# Preview the pool column of the sample metadata.
# NOTE(review): file.pool is (re)derived from file.name further down in this
# notebook - confirm the CSV already provides this column at this point.
head(hise_meta$file.pool)

## Locate batch control data

Batch control data are generated using the same subject across all batches. We can locate these datasets by querying for the set of unique Batch IDs in our sample metadata and keeping the files whose subject.subjectGuid is one of our control values, "HMN169517" or "HMN200910". There should be one batch control sample for each pool of cells, which means there can be either 1 or 2 control datasets for each batch.

In [8]:
# Distinct batch IDs present in the sample metadata, with a preview
batch_ids <- unique(hise_meta$file.batchID)
head(batch_ids)

In [9]:
# Number of distinct batches
length(batch_ids)

In [10]:
# Query HISE for labeled scRNA-seq files belonging to any of our batches and
# pass the descriptors through fileDescToDataframe() for filtering below.
batch_desc <- fileDescToDataframe(
    getFileDescriptors(
        fileType = "scRNA-seq-labeled",
        filter = list(file.batchID = batch_ids)
    )
)

In [11]:
# subject.subjectGuid values that identify batch control samples
control_subjects <- c("HMN169517","HMN200910")

In [12]:
# Keep only the files that come from a control subject
control_meta <- filter(
    batch_desc,
    subject.subjectGuid %in% control_subjects
)

In [13]:
# Number of control files found across all batches
nrow(control_meta)

Are all batches represented?

In [14]:
# Count of batch IDs with no control file; 0 means every batch is represented
length(setdiff(batch_ids, control_meta$file.batchID))

### Refine from batch to pool

We can use file names to refine our controls to just the specific pools within our batches that were used for our full dataset.

In [15]:
# Derive the pool ID from each file's base name by removing the first
# underscore and everything after it, then collect the distinct pools.
hise_meta$file.pool <- sub(
    pattern = "_.+",
    replacement = "",
    x = basename(hise_meta$file.name)
)
pool_ids <- unique(hise_meta$file.pool)

In [16]:
# Number of distinct pools in the sample metadata
length(pool_ids)

In [17]:
# Derive the pool ID for each control file the same way as for the samples,
# then keep only controls from pools that appear in our dataset.
control_meta <- control_meta %>%
  mutate(file.pool = sub("_.+", "", basename(file.name))) %>%
  filter(file.pool %in% pool_ids)

In [18]:
# Number of control files remaining after restricting to our pools
nrow(control_meta)

In [19]:
# Count of pools with no control file; 0 means every pool has a control
length(setdiff(pool_ids, control_meta$file.pool))

## Retrieve batch control data

In [20]:
# Download the control files one file ID at a time
res <- map(
    control_meta$file.id,
    \(file_id) cacheFiles(list(file_id))
)

[1] "Initiating file download for B001-P1_IMM19_709_labeled.h5"
[1] "Download successful."
[1] "Initiating file download for B002-P2_IMM19_698_2023-11-17T21:38:04.103392546Z_labeled.h5"
[1] "Download successful."
[1] "Initiating file download for B002-P1_IMM19_698_2023-11-17T21:36:51.794326181Z_labeled.h5"
[1] "Download successful."
[1] "Initiating file download for B078-P1_IMM19_447_2021-08-15T20:26:02.548957446Z_labeled.h5"
[1] "Download successful."
[1] "Initiating file download for B078-P2_IMM19_447_2021-08-19T17:09:29.934849811Z_labeled.h5"
[1] "Download successful."
[1] "Initiating file download for B007-P1_IMM19_389_2020-06-21T16:49:04.135845892Z_labeled.h5"
[1] "Download successful."
[1] "Initiating file download for B010-P1_IMM19_394_2021-11-09T17:30:04.059664207Z_labeled.h5"
[1] "Download successful."
[1] "Initiating file download for B039-P2_IMM19_412_2020-12-14T06:30:12.885244281Z_labeled.h5"
[1] "Download successful."
[1] "Initiating file download for B039-P1_IMM19_412_202

## Assemble batch control data

In [21]:
# Copy each cached control file into batch_control_h5/ under a pool-based name
walk2(
    control_meta$file.id, control_meta$file.pool,
    function(file_id, pool) {
        control_file <- list.files(paste0("cache/", file_id), full.names = TRUE)
        # Exactly one cached file is expected per file ID: zero files would be
        # skipped silently by file.copy(), and several files cannot be copied
        # to a single destination file.
        if (length(control_file) != 1) {
            stop(
                "Expected exactly 1 cached file for ", file_id,
                " but found ", length(control_file),
                call. = FALSE
            )
        }
        out_file <- paste0("batch_control_h5/", pool, "_batch_control.h5")
        # file.copy() reports failure by returning FALSE rather than raising
        # an error (for example when out_file already exists, because
        # overwrite defaults to FALSE), so surface it as a warning.
        if (!file.copy(control_file, out_file)) {
            warning(
                "Could not copy ", control_file, " to ", out_file,
                call. = FALSE
            )
        }
    }
)

In [22]:
# Bundle the copied control files into a single dated tar archive
control_data_tar <- paste0("output/ref_pbmc_batch_control_h5_", Sys.Date(), ".tar")
tar_call <- paste("tar -cf", control_data_tar, "batch_control_h5/*")

# system() returns the command's exit status; stop on a non-zero status so a
# missing or incomplete archive is not passed on to the upload step.
tar_status <- system(tar_call)
if (tar_status != 0) {
    stop("tar failed with exit status ", tar_status, ": ", tar_call, call. = FALSE)
}

## Locate Pool QC reports

QC reports are also generated for every pool we process. To assist in assessing our data quality, we'll retrieve these and bundle them along with our batch controls.

In [23]:
# Query HISE for cell hashing main reports and pass the descriptors through
# fileDescToDataframe() for filtering below.
report_desc <- fileDescToDataframe(
    getFileDescriptors(fileType = "scRNA-seq-CellHashing-Main-report")
)

In [24]:
# Keep reports for our pools and drop fully duplicated rows
report_meta <- unique(
    filter(report_desc, file.pool %in% pool_ids)
)

In [25]:
# Number of report files found for our pools (may include repeats per pool)
nrow(report_meta)

Some pools are duplicated - let's get the latest version for each pool by sorting on the file.name, which includes a timestamp.

In [26]:
# Sort by file.name in descending order, then keep the first row of each pool
# so that exactly one report per pool remains.
report_meta <- report_meta %>%
  arrange(desc(file.name)) %>%
  group_by(file.pool) %>%
  slice_head(n = 1) %>%
  ungroup()

In [27]:
# Number of reports after keeping one per pool
nrow(report_meta)

In [28]:
# Count of pools with no QC report; 0 means every pool has a report
length(setdiff(pool_ids, report_meta$file.pool))

## Retrieve pool reports

In [29]:
# Download the QC reports one file ID at a time
res <- map(
    report_meta$file.id,
    \(file_id) cacheFiles(list(file_id))
)

[1] "Initiating file download for B001_hto_merge_summary_report.html"
[1] "Download successful."
[1] "Initiating file download for B002_hto_merge_summary_report.html"
[1] "Download successful."
[1] "Initiating file download for B002_hto_merge_summary_report.html"
[1] "Download successful."
[1] "Initiating file download for B007_hto_merge_summary_report.html"
[1] "Download successful."
[1] "Initiating file download for B010_hto_merge_summary_report.html"
[1] "Download successful."
[1] "Initiating file download for B036_hto_merge_summary_report.html"
[1] "Download successful."
[1] "Initiating file download for B039_hto_merge_summary_report.html"
[1] "Download successful."
[1] "Initiating file download for B039_hto_merge_summary_report.html"
[1] "Download successful."
[1] "Initiating file download for B040_hto_merge_summary_report.html"
[1] "Download successful."
[1] "Initiating file download for B040_hto_merge_summary_report.html"
[1] "Download successful."
[1] "Initiating file download 

## Assemble batch reports

In [30]:
# Copy each cached QC report into batch_reports/ under a pool-based name
walk2(
    report_meta$file.id, report_meta$file.pool,
    function(file_id, pool) {
        report_file <- list.files(paste0("cache/", file_id), full.names = TRUE)
        # Exactly one cached file is expected per file ID: zero files would be
        # skipped silently by file.copy(), and several files cannot be copied
        # to a single destination file.
        if (length(report_file) != 1) {
            stop(
                "Expected exactly 1 cached file for ", file_id,
                " but found ", length(report_file),
                call. = FALSE
            )
        }
        out_file <- paste0("batch_reports/", pool, "_qc_report.html")
        # file.copy() reports failure by returning FALSE rather than raising
        # an error (for example when out_file already exists, because
        # overwrite defaults to FALSE), so surface it as a warning.
        if (!file.copy(report_file, out_file)) {
            warning(
                "Could not copy ", report_file, " to ", out_file,
                call. = FALSE
            )
        }
    }
)

In [31]:
# Bundle the copied QC reports into a single dated tar archive
report_tar <- paste0("output/ref_pbmc_batch_report_html_", Sys.Date(), ".tar")
tar_call <- paste("tar -cf", report_tar, "batch_reports/*")

# system() returns the command's exit status; stop on a non-zero status so a
# missing or incomplete archive is not passed on to the upload step.
tar_status <- system(tar_call)
if (tar_status != 0) {
    stop("tar failed with exit status ", tar_status, ": ", tar_call, call. = FALSE)
}

## Upload data to HISE

In [32]:
# Study space ID and dated title passed to uploadFiles() below
study_space_uuid <- "64097865-486d-43b3-8f94-74994e0a72e0"
title <- paste("PBMC Ref. Batch Control Data and QC Reports", Sys.Date())

In [33]:
# Randomly generated three-word proquint, used as the upload destination below
search_id <- ids::proquint(n_words = 3)

In [34]:
# IDs of every file used here (sample metadata, control files, QC reports),
# passed to uploadFiles() as inputFileIds
in_list <- as.list(c(sample_meta_uuid, control_meta$file.id, report_meta$file.id))

In [35]:
# The two tar archives produced above are the files to upload
out_list <- list(control_data_tar, report_tar)

In [36]:
# Upload both archives to the study space, recording the input file IDs.
# The captured output below shows that this call prompts for y/n confirmation.
uploadFiles(
    files = out_list,
    studySpaceId = study_space_uuid,
    title = title,
    inputFileIds = in_list,
    store = "project",
    destination = search_id
)

You are trying to upload the following files:  output/ref_pbmc_batch_control_h5_2024-05-07.tar output/ref_pbmc_batch_report_html_2024-05-07.tar



(y/n) y


[1] "Authorization token invalid or expired."
[1] "Retrying..."


In [37]:
# Print the R version and the attached/loaded package versions for this run
sessionInfo()

R version 4.3.2 (2023-10-31)
Platform: x86_64-conda-linux-gnu (64-bit)
Running under: Ubuntu 20.04.6 LTS

Matrix products: default
BLAS/LAPACK: /opt/conda/lib/libopenblasp-r0.3.25.so;  LAPACK version 3.11.0

locale:
 [1] LC_CTYPE=C.UTF-8       LC_NUMERIC=C           LC_TIME=C.UTF-8       
 [4] LC_COLLATE=C.UTF-8     LC_MONETARY=C.UTF-8    LC_MESSAGES=C.UTF-8   
 [7] LC_PAPER=C.UTF-8       LC_NAME=C              LC_ADDRESS=C          
[10] LC_TELEPHONE=C         LC_MEASUREMENT=C.UTF-8 LC_IDENTIFICATION=C   

time zone: Etc/UTC
tzcode source: system (glibc)

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
[1] purrr_1.0.2 dplyr_1.1.4 hise_2.16.0

loaded via a namespace (and not attached):
 [1] ids_1.0.1        crayon_1.5.2     vctrs_0.6.5      httr_1.4.7      
 [5] cli_3.6.2        rlang_1.1.3      stringi_1.8.3    generics_0.1.3  
 [9] assertthat_0.2.1 jsonlite_1.8.8   glue_1.7.0       RCurl_1.98-1.14 
[13] plyr_