## Install dependencies
If BPCells isn't installed, install it from GitHub.

In [1]:
# Install BPCells from GitHub only when it is not already available.
# system.file() returns "" for a package that is not installed; this avoids
# building the full installed.packages() table just to look up one name.
if (!nzchar(system.file(package = "BPCells"))) {
    devtools::install_github("bnprks/BPCells", upgrade = "never", quiet = TRUE)
}

## Load packages

In [2]:
# Attach a package without printing its startup messages.
# Arguments are forwarded unchanged to library().
quiet_library <- function(...) {
    suppressPackageStartupMessages(
        library(...)
    )
}
quiet_library(BPCells)
quiet_library(dplyr)
quiet_library(furrr)
quiet_library(hise)
quiet_library(purrr)
quiet_library(H5weaver)

In [3]:
# Make sure the output directory exists before results are written to it.
if (!dir.exists("output")) dir.create("output")

In [4]:
# Run furrr calls on forked workers. Use at most 32 workers, but never more
# than the number of cores actually available to this session, so smaller
# machines are not oversubscribed.
plan("multicore", workers = min(32L, availableCores()))

## Helper functions

In [5]:
#' Read the per-cell metadata (`/obs`) of an .h5ad file into a data.frame
#'
#' Every entry directly under `/obs` is read, except `/obs/__categories`,
#' `/obs/_index`, and any entry whose path contains "Unnamed". Entries stored
#' as a categories/codes pair are decoded back to their category values;
#' negative codes become `NA`.
#'
#' @param h5ad_file Path to the .h5ad file.
#' @return A data.frame with one column per retained `/obs` entry, named by
#'   the last component of that entry's HDF5 path.
read_h5ad_cell_meta <- function(h5ad_file) {
    h5ad_contents <- H5weaver::h5ls(h5ad_file)
    obs_locs <- h5ad_contents$full_name[h5ad_contents$group == "/obs"]
    obs_locs <- obs_locs[!obs_locs %in% c("/obs/__categories", "/obs/_index")]
    obs_locs <- obs_locs[!grepl("Unnamed", obs_locs)]

    h5ad <- H5Fopen(h5ad_file)
    # Release the HDF5 handle even if one of the reads below fails.
    on.exit(H5Fclose(h5ad), add = TRUE)

    # Turn one /obs entry into a plain vector.
    decode_obs <- function(obs) {
        # Detect categoricals by structure rather than by length, so that a
        # plain column with exactly two values is not mistaken for one.
        is_categorical <- is.list(obs) &&
            all(c("categories", "codes") %in% names(obs))
        if (!is_categorical) {
            return(as.vector(obs))
        }

        # Codes are 0-based positions into `categories`; negative codes
        # (-1 in practice) mark missing values. Indexing with NA yields NA.
        idx <- as.vector(obs$codes) + 1L
        idx[idx < 1L] <- NA
        as.vector(obs$categories)[idx]
    }

    obs_list <- lapply(obs_locs, function(loc) decode_obs(h5read(h5ad, loc)))
    names(obs_list) <- sub(".+/", "", obs_locs)

    as.data.frame(obs_list)
}

## Retrieve data from HISE

In [6]:
# Return the path of the first file cached for a HISE file UUID.
# If cache/<uuid> does not exist yet, the file is fetched with cacheFiles()
# first. The result is NA when the cache directory contains no files.
cache_uuid_path <- function(uuid) {
    uuid_dir <- paste0("cache/", uuid)

    already_cached <- dir.exists(uuid_dir)
    if (!already_cached) {
        cacheFiles(list(uuid))
    }

    cached_files <- list.files(uuid_dir, full.names = TRUE)
    cached_files[1]
}

### Sample metadata

In [7]:
# HISE file UUID of the sample metadata CSV.
meta_uuid <- "d82c5c42-ae5f-4e67-956e-cd3b7bf88105"

In [8]:
# Fetch the metadata file into cache/ (if not already there) and get its path.
meta_file <- cache_uuid_path(meta_uuid)

In [9]:
# Sample-level metadata table; joined to the h5ad file paths further down.
sample_meta <- read.csv(meta_file)

### Sample data

In [10]:
# Project-store folder to search, and a flat id/name table of the files
# listed for the "cohorts" project store.
search_id <- "cerium-cerium-chromium"
ps_list <- listFilesInProjectStores(list("cohorts"))
ps_df <- list_rbind(
  map(
    ps_list[[2]],
    function(entry) {
      data.frame(id = entry$id, name = entry$name)
    }
  )
)

In [11]:
# Keep only the .tar archives whose name matches the search id.
# The dot is escaped so that only a literal ".tar" suffix matches
# (the unescaped pattern would also accept names such as "xtar").
tar_file_df <- ps_df %>%
  filter(
    grepl(search_id, name),
    grepl("\\.tar$", name)
  )

In [12]:
# Show the tar archives that matched the search.
tar_file_df

id,name
<chr>,<chr>
a2102174-1ad8-422e-829e-1d30a4d0b34f,cerium-cerium-chromium/diha_BR1_Female_Negative_h5ads_2024-04-25.tar
75a5e2f2-55be-47cb-87f8-dc7d127f409c,cerium-cerium-chromium/diha_BR1_Female_Positive_h5ads_2024-04-25.tar
ec826f03-e24e-4a61-bed5-1a0ad923bd3e,cerium-cerium-chromium/diha_BR1_Male_Negative_h5ads_2024-04-25.tar
98ebd57f-d498-4426-9866-fdff89bfcca8,cerium-cerium-chromium/diha_BR1_Male_Positive_h5ads_2024-04-25.tar
03c43cf1-fa52-49f5-b64c-2a7faf723972,cerium-cerium-chromium/diha_BR2_Female_Negative_h5ads_2024-04-25.tar
7844526c-65ce-48b3-8e30-e39948e8389d,cerium-cerium-chromium/diha_BR2_Female_Positive_h5ads_2024-04-25.tar
162925c8-3619-4d31-a633-5dde2728159c,cerium-cerium-chromium/diha_BR2_Male_Negative_h5ads_2024-04-25.tar
e2f8a806-bf5b-4801-bee0-15be6a670830,cerium-cerium-chromium/diha_BR2_Male_Positive_h5ads_2024-04-25.tar


### Cache and unpack sample files

In [13]:
# Fetch and unpack every sample tar archive, unless sample_h5ad/ is already
# present from an earlier run. Each archive is extracted into the working
# directory and its cached copy is removed afterwards to save disk space.
# NOTE(review): assumes the archives unpack into sample_h5ad/ — confirm.
if (!dir.exists("sample_h5ad")) {
    walk(
        tar_file_df$id,
        function(uuid) {
            tar_file <- cache_uuid_path(uuid)

            # Quote the path for the shell and fail loudly if extraction
            # fails, so the cached archive is not deleted after a bad unpack.
            untar_call <- paste("tar -xf", shQuote(tar_file))
            untar_status <- system(untar_call)
            if (untar_status != 0) {
                stop("Failed to extract ", tar_file, call. = FALSE)
            }

            unlink(paste0("cache/", uuid), recursive = TRUE)
        }
    )
}

In [14]:
# Table of unpacked h5ad files. The sample id is the part of the file name
# before the first underscore.
sample_files <- list.files("sample_h5ad", full.names = TRUE)
h5ad_df <- data.frame(h5ad_path = sample_files)
h5ad_df$pbmc_sample_id <- sub("_.+", "", basename(h5ad_df$h5ad_path))

### Assemble BPCells subsets

To generate a sketched atlas of cells, we'll start by subsetting the data into sets of samples that we can manipulate fairly easily.

We can do this by building BPCells datasets per subject to break things down into reasonable sizes.

We'll then sample from these datasets to generate the initial sketch, and then integrate the full dataset to generate a complete picture of all of the data.

In [15]:
# Directory that will hold one BPCells matrix directory per subject.
if (!dir.exists("sample_bpcells")) dir.create("sample_bpcells")

In [16]:
# Attach each sample's h5ad file path to its metadata row. The join key is
# picked automatically from the columns the two tables share.
subset_meta <- left_join(sample_meta, h5ad_df)

[1m[22mJoining with `by = join_by(pbmc_sample_id)`


In [17]:
# One data frame of samples per subject, named by subject GUID.
subset_list <- split(subset_meta, subset_meta$subject.subjectGuid)

In [18]:
# Build one BPCells matrix directory per subject, in parallel.
#
# For each subject, the count matrices of all of its samples are read,
# column-bound, converted to unsigned 32-bit integers and written to
# sample_bpcells/<subject>. The per-cell metadata of the same files is saved
# alongside as cell_meta.rds. The result is a list of the subject directories.
subset_dirs <- future_map2(
    subset_list, names(subset_list),
    function(subset_df, subset_name) {
        subset_dir <- paste0("sample_bpcells/", subset_name)
        cell_meta_path <- file.path(subset_dir, "cell_meta.rds")

        # cell_meta.rds is written last, so its presence marks a subject as
        # complete. Checking only for the directory would permanently skip a
        # subject whose earlier run was interrupted before the metadata was
        # saved; write_matrix_dir(overwrite = TRUE) replaces any partial data.
        if (!file.exists(cell_meta_path)) {
            if (anyNA(subset_df$h5ad_path)) {
                stop(
                    "No h5ad file found for one or more samples of subject ",
                    subset_name,
                    call. = FALSE
                )
            }

            mat_list <- map(subset_df$h5ad_path, read_h5ad_dgCMatrix, feature_names = "_index")
            mat <- do.call(cbind, mat_list)
            mat@x <- as.integer(mat@x)

            mat <- convert_matrix_type(mat, type = "uint32_t")

            write_matrix_dir(mat, subset_dir, overwrite = TRUE)

            meta <- map(subset_df$h5ad_path, read_h5ad_cell_meta) %>%
              list_rbind()

            saveRDS(meta, cell_meta_path)
        }
        subset_dir
    },
    .options = furrr_options(seed = 3030)
)

In [19]:
# Bundle all per-subject BPCells directories into a dated tar archive.
out_tar <- paste0("output/diha_bpcells_per_subject_", Sys.Date(), ".tar")
# The glob is left unquoted on purpose so the shell expands it.
tar_call <- paste("tar -cf", out_tar, "sample_bpcells/*")
tar_status <- system(tar_call)
# Stop here rather than go on with a missing or truncated archive.
if (tar_status != 0) {
    stop("tar failed with exit status ", tar_status, call. = FALSE)
}

## Upload results to HISE

In [20]:
# Study space id and title used for the upload below; the title carries
# today's date.
study_space_uuid <- "de025812-5e73-4b3c-9c3b-6d0eac412f2a"
title <- paste("DIHA BPCells Per Subject", Sys.Date())

In [21]:
# File ids of the source tar archives, as a list (given to uploadFiles() as
# inputFileIds).
in_list <- as.list(tar_file_df$id)
in_list

In [22]:
# Files to upload: the tar archive of per-subject BPCells directories.
out_list <- list(out_tar)
out_list

In [23]:
# Upload the archive to HISE, listing the source tar files as its inputs.
# NOTE(review): store/destination appear to place the result in the same
# project-store folder the inputs came from (search_id), and doPrompt = FALSE
# presumably skips interactive confirmation — confirm against the hise docs.
uploadFiles(
    files = out_list,
    studySpaceId = study_space_uuid,
    title = title,
    inputFileIds = in_list,
    store = "project",
    destination = search_id,
    doPrompt = FALSE
)

In [24]:
# Record the R version and attached packages used for this run.
sessionInfo()

R version 4.3.2 (2023-10-31)
Platform: x86_64-conda-linux-gnu (64-bit)
Running under: Ubuntu 20.04.6 LTS

Matrix products: default
BLAS/LAPACK: /opt/conda/lib/libopenblasp-r0.3.25.so;  LAPACK version 3.11.0

Random number generation:
 RNG:     L'Ecuyer-CMRG 
 Normal:  Inversion 
 Sample:  Rejection 
 
locale:
 [1] LC_CTYPE=C.UTF-8       LC_NUMERIC=C           LC_TIME=C.UTF-8       
 [4] LC_COLLATE=C.UTF-8     LC_MONETARY=C.UTF-8    LC_MESSAGES=C.UTF-8   
 [7] LC_PAPER=C.UTF-8       LC_NAME=C              LC_ADDRESS=C          
[10] LC_TELEPHONE=C         LC_MEASUREMENT=C.UTF-8 LC_IDENTIFICATION=C   

time zone: Etc/UTC
tzcode source: system (glibc)

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
 [1] H5weaver_1.2.0    rhdf5_2.46.1      Matrix_1.6-4      data.table_1.15.4
 [5] purrr_1.0.2       hise_2.16.0       furrr_0.3.1       future_1.33.1    
 [9] dplyr_1.1.4       BPCells_0.1.0    

loaded via a namespace