# Convert to Pseudobulk by count sums

In this notebook, we sum UMI counts either before or after LogNormalize in Seurat to generate pseudobulk values for each cell type per sample.

### Output structure

This notebook generates 3 matrices, as well as sample x cell type metadata:

`agg_mat`: Raw count aggregates generated as the sum of UMIs per gene within each sample x cell type group  
`norm_mat`: Normalized aggregates generated by performing `LogNormalize()` in Seurat prior to computing sums  
`detect_mat`: Counts of gene detection frequency within each group, which can be used for feature selection  

Sample metadata (`sample_meta`) has the following columns:

`cohort.cohortGuid`: Cohort ID  
`subject.ageAtFirstDraw`: Subject Age at first on-study blood draw  
`subject.biologicalSex`: Subject's biological sex (Female or Male)  
`subject.birthYear`: Subject's year of birth  
`subject.bmi`: Subject's BMI, rounded to integer  
`subject.cmv`: Subject's CMV status (Negative or Positive)  
`subject.ethnicity`: Subject's self-reported ethnicity  
`subject.race`: Subject's self-reported race  
`subject.subjectGuid`: Unique Subject ID  
`sample.drawDate`: Sample collection date (Year and Month, YYYY-MM)  
`sample.sampleKitGuid`: Unique Sample ID  
`sample.subjectAgeAtDraw`: Age of subject at time of sample draw  
`sample.visitName`: Name of sample collection visit  
`specimen.specimenGuid`: Unique ID of specific aliquot used to generate data  
`batch_id`: Batch ID for quality control  
`pool_id`: Pool ID for quality control (usually 2 sample pools per batch)  
`AIFI_L1`: Broad cell type label  
`AIFI_L2`: Intermediate resolution cell type label  
`AIFI_L3`: High resolution cell type label  
`n_cells`: Number of cells used to generate pseudobulk counts  
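`barcodes`: Unique identifier for this pseudobulk population, with the structure `{subject.subjectGuid}_{sample.visitName}_{AIFI_L3}`, where spaces in the visit name are replaced with `-` and the cell type label is formatted for use in names (`+` becomes `pos`, `-` becomes `neg`, and spaces become `-`).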
- `barcodes` matches the column names of the matrices

## Load packages

In [1]:
# Attach a package without printing its startup messages.
# `...` is forwarded unchanged to library(), so bare package names work.
quiet_library <- function(...) { suppressPackageStartupMessages(library(...)) }

quiet_library(dplyr)
quiet_library(hise)
quiet_library(H5weaver)
# fwrite() is used below to write the .csv outputs; attach data.table
# explicitly rather than relying on another package attaching it for us.
quiet_library(data.table)
quiet_library(purrr)
quiet_library(furrr)
quiet_library(Seurat)
quiet_library(tidyr)

In [2]:
# Register the future plan used by future_map() below:
# multicore backend with 48 workers.
plan(multicore, workers = 48)

In [3]:
# Regular expressions for gene symbols to drop from pseudobulk matrices.
# "^MT-" (with the hyphen) targets mitochondrial genes only; a bare "^MT"
# would also match nuclear genes such as MTOR.
exclude <- c("^LINC", "^MT-", "^RP")

In [4]:
# Make sure both output directories exist before anything is written to them
invisible(lapply(
    c("output", "pseudobulk_l3"),
    function(out_dir) {
        if(!dir.exists(out_dir)) {
            dir.create(out_dir)
        }
    }
))
# Create the per-cell-type output directory on first run
pseudobulk_dir <- "pseudobulk_l3"
if(!dir.exists(pseudobulk_dir)) dir.create(pseudobulk_dir)
rm(pseudobulk_dir)

In [5]:
# Accumulator for the paths of every output file written by this notebook
# (starts out empty; c() with no arguments is NULL)
out_files <- NULL

## Helper functions

This function formats cell types for use in filenames

In [6]:
# Make a cell type label safe for use in file and column names.
# Substitutions are applied in order: "+" -> "pos", "-" -> "neg", " " -> "-".
# The hyphen replacement must run before spaces are turned into hyphens.
format_cell_type <- function(cell_type) {
    plus_replaced <- gsub("+", "pos", cell_type, fixed = TRUE)
    minus_replaced <- gsub("-", "neg", plus_replaced, fixed = TRUE)
    gsub(" ", "-", minus_replaced, fixed = TRUE)
}

This function reads cell metadata directly from .h5ad files into R

In [7]:
# Read the per-cell metadata stored under /obs of an .h5ad file.
#
# h5ad_file: path to the .h5ad file.
#
# Returns a data.frame with one column per /obs entry (excluding
# "/obs/__categories", "/obs/_index", and any entry with "Unnamed" in its
# name). Entries stored as a codes/categories pair are decoded to their
# category values, with negative codes becoming NA.
read_h5ad_cell_meta <- function(h5ad_file) {
    # Find everything stored directly under /obs, minus bookkeeping entries
    h5ad_contents <- H5weaver::h5ls(h5ad_file)
    obs_locs <- h5ad_contents$full_name[h5ad_contents$group == "/obs"]
    obs_locs <- obs_locs[!obs_locs %in% c("/obs/__categories", "/obs/_index")]
    obs_locs <- obs_locs[!grepl("Unnamed", obs_locs)]

    h5ad <- H5Fopen(h5ad_file)
    # Release the file handle even if reading or decoding fails part-way
    on.exit(H5Fclose(h5ad), add = TRUE)

    obs_list <- lapply(
        obs_locs,
        function(loc) {
            obs <- h5read(h5ad, loc)

            # Identify categorical entries by structure rather than by
            # length, so a plain two-element column is not mistaken for one.
            is_categorical <- is.list(obs) &&
                all(c("codes", "categories") %in% names(obs))

            if(is_categorical) {
                codes <- as.vector(obs$codes)
                categories <- as.vector(obs$categories)
                # Codes are 0-based indices into categories;
                # negative codes mark missing values.
                idx <- codes + 1
                idx[codes < 0] <- NA
                categories[idx]
            } else {
                as.vector(obs)
            }
        }
    )
    names(obs_list) <- sub(".+/", "", obs_locs)

    as.data.frame(obs_list)
}

This function converts from .h5ad expression values to pseudobulk

In [8]:
# Bind a list of equal-length per-gene sum vectors into a matrix with one
# column per list element.
#
# sum_list:  list of numeric vectors, one per aggregate
# col_names: column names to assign (one per list element)
# row_names: row names to assign (one per gene)
sum_list_to_matrix <- function(sum_list, col_names, row_names) {
    n_aggregates <- length(sum_list)
    sums <- matrix(unlist(sum_list, use.names = FALSE), ncol = n_aggregates)
    dimnames(sums) <- list(row_names, col_names)
    sums
}

In [9]:
# Convert one sample's .h5ad file into AIFI_L3 pseudobulk matrices.
#
# sample_h5ad_file: path to a single-sample .h5ad file.
# exclude: NULL (default) to keep all genes, or a character vector of regular
#          expressions; genes matching any of them are removed. Any other
#          non-NULL value falls back to the legacy pattern "^RP|^MT-|^LINC".
#
# Returns a list with:
#   agg_mat:     gene x cell-type matrix of summed raw counts
#   norm_mat:    gene x cell-type matrix of summed LogNormalize values
#   detect_mat:  gene x cell-type matrix of the number of stored non-zero
#                entries per gene
#   sample_meta: one row of metadata per cell type; its `barcodes` column
#                matches the matrix column names
sample_h5ad_to_l3_pseudobulk <- function(sample_h5ad_file, exclude = NULL) {
    # Read cell metadata; subject and visit are taken from the first cell
    meta <- read_h5ad_cell_meta(sample_h5ad_file)
    subject <- meta$subject.subjectGuid[1]
    visit <- meta$sample.visitName[1]

    sample_prefix <- paste0(subject, "_", gsub(" ", "-", visit), "_")

    # Format cell type so they can be used as names
    meta <- meta %>%
      mutate(format_AIFI_L3 = format_cell_type(AIFI_L3))

    # Read counts
    mat <- read_h5ad_dgCMatrix(sample_h5ad_file, feature_names = "_index")

    # Filter genes if needed, honoring the caller's patterns
    genes <- rownames(mat)
    if(is.null(exclude)) {
        keep_genes <- genes
    } else {
        if(is.character(exclude) && length(exclude) > 0) {
            exclude_pattern <- paste(exclude, collapse = "|")
        } else {
            # Legacy behavior for non-character, non-NULL values
            exclude_pattern <- "^RP|^MT-|^LINC"
        }
        keep_genes <- genes[!grepl(exclude_pattern, genes)]
    }

    # Filter for selected genes and ensure order matches metadata
    mat <- mat[keep_genes, meta$barcodes]

    # Split metadata and matrices by AIFI_L3 type
    split_meta <- split(meta, meta$format_AIFI_L3)
    split_mats <- map(
        split_meta,
        function(type_meta) {
            # Transpose so each gene is a column
            t(mat[, type_meta$barcodes, drop = FALSE])
        }
    )

    # Sum counts and detection for each gene.
    # After transposition each column is a gene, so differences of the
    # column pointer slot give the number of stored entries per gene.
    agg_sums <- map(split_mats, function(type_mat) { colSums(type_mat) })
    detect_sums <- map(split_mats, function(type_mat) { diff(type_mat@p) })

    # Normalize with Seurat
    so <- CreateSeuratObject(
        counts = mat,
        meta.data = meta
    )
    so <- NormalizeData(
        so,
        normalization.method = "LogNormalize",
        scale.factor = 1e4,
        verbose = FALSE
    )

    # Extract the normalized layer once, rather than once per cell type
    norm_data <- so[["RNA"]]@layers$data
    colnames(norm_data) <- so@meta.data$barcodes

    # Subset per cell type and transpose; drop = FALSE keeps a matrix
    # even when a cell type has a single cell
    norm_mats <- map(
        split_meta,
        function(type_meta) {
            t(norm_data[, type_meta$barcodes, drop = FALSE])
        }
    )

    # Sum normalized counts
    norm_sums <- map(norm_mats, function(type_mat) { colSums(type_mat) })

    # Assemble matrices from sums
    aggregate_names <- paste0(sample_prefix, names(split_meta))

    agg_mat <- sum_list_to_matrix(agg_sums, aggregate_names, keep_genes)
    norm_mat <- sum_list_to_matrix(norm_sums, aggregate_names, keep_genes)
    detect_mat <- sum_list_to_matrix(detect_sums, aggregate_names, keep_genes)

    # Generate aggregate metadata: one row per cell type with its cell count
    type_columns <- c("AIFI_L1", "AIFI_L2", "AIFI_L3")

    meta <- meta %>%
      select(cohort.cohortGuid,
             starts_with("subject"),
             starts_with("sample"),
             starts_with("specimen"),
             "batch_id", "pool_id",
             one_of(type_columns)) %>%
      group_by(AIFI_L3) %>%
      mutate(n_cells = n()) %>%
      ungroup() %>%
      unique() %>%
      mutate(barcodes = paste0(sample_prefix, format_cell_type(AIFI_L3))) %>%
      arrange(AIFI_L3)

    # Reorder columns to match the metadata rows; drop = FALSE keeps the
    # results as matrices when the sample contains only one cell type
    agg_mat <- agg_mat[, meta$barcodes, drop = FALSE]
    norm_mat <- norm_mat[, meta$barcodes, drop = FALSE]
    detect_mat <- detect_mat[, meta$barcodes, drop = FALSE]

    list(
        agg_mat = agg_mat,
        norm_mat = norm_mat,
        detect_mat = detect_mat,
        sample_meta = meta
    )
}

In [10]:
# Subset a pseudobulk result list to the aggregates matching a filter.
#
# pb_data: list with sample_meta, agg_mat, norm_mat, and detect_mat, where
#          sample_meta$barcodes matches the matrix column names.
# ...:     filter conditions forwarded to dplyr::filter() on sample_meta.
#
# Returns pb_data with sample_meta filtered and each matrix restricted to the
# remaining barcodes, in metadata order.
select_pseudobulk_samples <- function(pb_data, ...) {
    pb_data$sample_meta <- pb_data$sample_meta %>%
      filter( ... )

    keep_barcodes <- pb_data$sample_meta$barcodes

    # drop = FALSE keeps each result a matrix (with its column name) when
    # only a single aggregate is selected
    pb_data$agg_mat <- pb_data$agg_mat[, keep_barcodes, drop = FALSE]
    pb_data$norm_mat <- pb_data$norm_mat[, keep_barcodes, drop = FALSE]
    pb_data$detect_mat <- pb_data$detect_mat[, keep_barcodes, drop = FALSE]

    pb_data
}

## Retrieve files to process in HISE

We'll retrieve our clean, non-normalized .h5ad datasets for each sample from HISE

In [11]:
# Pattern matched against project store file names to find the input files
search_id <- "cerium-cerium-chromium"

In [12]:
# List files in the "cohorts" project store and flatten the listing into a
# data.frame with one row per file, keeping only its id and name
ps_files <- listFilesInProjectStores(list("cohorts"))$files %>%
  map(\(file_entry) as.data.frame(file_entry[c("id", "name")])) %>%
  list_rbind()

In [13]:
# Keep files whose name matches the search ID and ends in ".tar".
# The dot is escaped so that only a literal ".tar" extension matches.
tar_files <- ps_files %>%
  filter(grepl(search_id, name)) %>%
  filter(grepl("\\.tar$", name))

## Retrieve and unpack sample files

In [14]:
# Download (if not already cached) and unpack each input archive.
# Skipped entirely when sample_h5ad/ already exists.
if(!dir.exists("sample_h5ad")) {
    walk(tar_files$id,
         function(uuid) {
            cache_dir <- paste0("cache/", uuid)
            if(!dir.exists(cache_dir)) {
                hise_res <- cacheFiles(list(uuid))
            }

            # Unpack every cached file separately, quoting each path so
            # unusual characters cannot break the command
            cached_files <- list.files(cache_dir, full.names = TRUE)
            for(tar_file in cached_files) {
                status <- system2("tar", c("-xf", shQuote(tar_file)))
                if(status != 0) {
                    warning(
                        "tar exited with status ", status, " for ", tar_file,
                        call. = FALSE
                    )
                }
            }
         }
    )
}

## Convert to pseudobulk for each sample

Now, we'll iterate through each file and apply our pseudobulk function to each in parallel.

In [15]:
# Wrap the converter so that a failing sample yields NULL (with the error
# reported) instead of aborting the whole run
possibly_convert <- possibly(
    .f = sample_h5ad_to_l3_pseudobulk,
    otherwise = NULL,
    quiet = FALSE
)

In [16]:
# Convert every unpacked sample file using the registered future plan.
# NOTE(review): the `exclude` patterns defined earlier are not passed here,
# so the converter runs with exclude = NULL (no gene filtering) - confirm
# this is intended.
sample_h5ads <- list.files(path = "sample_h5ad", full.names = TRUE)
results_list <- future_map(.x = sample_h5ads, .f = possibly_convert)

In [17]:
# Number of samples processed (one result per input file)
length(results_list)

In [18]:
# Count failed conversions. is.null() must be applied to each element:
# applied to the list itself it is a single FALSE, so the sum is always 0.
sum(map_lgl(results_list, is.null))

### Restructure and combine results

In [19]:
# Failed conversions are NULL; drop them so the result names are never taken
# from a failed sample and only real results are combined
successful_results <- compact(results_list)
if(length(successful_results) == 0) {
    stop("No samples were converted successfully; nothing to combine.")
}

result_names <- names(successful_results[[1]])

# Turn the per-sample list of results into a per-result list of samples
pb_data <- map(
    result_names,
    function(result_name) {
        map(successful_results, result_name)
    }
)

names(pb_data) <- result_names

In [20]:
# Join per-sample matrices column-wise (one column per sample x cell type)
pb_data[c("agg_mat", "norm_mat", "detect_mat")] <- lapply(
    pb_data[c("agg_mat", "norm_mat", "detect_mat")],
    function(sample_mats) {
        do.call(cbind, sample_mats)
    }
)
# Stack per-sample metadata row-wise
pb_data$sample_meta <- do.call(rbind, pb_data$sample_meta)

## Save to .rds for later use in R

In [21]:
# Save the full pseudobulk list (matrices + metadata) as a dated .rds file
out_rds <- file.path(
    "output",
    paste0("diha_AIFI_L3_pseudobulk_list_", Sys.Date(), ".rds")
)
saveRDS(object = pb_data, file = out_rds)

In [22]:
# Track the .rds file for upload
out_files <- append(out_files, out_rds)

## Save per cell type

In [23]:
# Write one .rds per AIFI_L3 cell type, holding only that type's aggregates
cell_types <- unique(pb_data$sample_meta$AIFI_L3)
walk(
    cell_types,
    function(cell_type) {
        type_file <- paste0(
            "pseudobulk_l3/diha_",
            format_cell_type(cell_type),
            "_pseudobulk.rds"
        )
        saveRDS(
            select_pseudobulk_samples(pb_data, AIFI_L3 == cell_type),
            type_file
        )
    }
)

In [24]:
# Bundle all per-cell-type .rds files into a single dated tar archive
out_tar <- file.path(
    "output",
    paste0("diha_AIFI_L3_pseudobulk_per_type_", Sys.Date(), ".tar")
)
tar_call <- sprintf("tar -cf %s %s", out_tar, "pseudobulk_l3/*.rds")
system(tar_call)

In [25]:
# Track the per-type archive for upload
out_files <- append(out_files, out_tar)

## Save to .csv for flexible downstream usage

In [26]:
agg_csv <- paste0("output/diha_AIFI_L3_pseudobulk_agg_", Sys.Date(), ".csv")
# fwrite() coerces a matrix with as.data.table(), which drops row names.
# Convert explicitly so gene names are written as a leading "gene" column.
fwrite(as.data.table(pb_data$agg_mat, keep.rownames = "gene"), agg_csv)

x being coerced from class: matrix to data.table



In [27]:
# Track the raw aggregate .csv for upload
out_files <- append(out_files, agg_csv)

In [28]:
norm_csv <- paste0("output/diha_AIFI_L3_pseudobulk_norm_", Sys.Date(), ".csv")
# fwrite() coerces a matrix with as.data.table(), which drops row names.
# Convert explicitly so gene names are written as a leading "gene" column.
fwrite(as.data.table(pb_data$norm_mat, keep.rownames = "gene"), norm_csv)

x being coerced from class: matrix to data.table



In [29]:
# Track the normalized aggregate .csv for upload
out_files <- append(out_files, norm_csv)

In [30]:
detect_csv <- paste0("output/diha_AIFI_L3_pseudobulk_detect_", Sys.Date(), ".csv")
# fwrite() coerces a matrix with as.data.table(), which drops row names.
# Convert explicitly so gene names are written as a leading "gene" column.
fwrite(as.data.table(pb_data$detect_mat, keep.rownames = "gene"), detect_csv)

x being coerced from class: matrix to data.table



In [31]:
# Track the detection count .csv for upload
out_files <- append(out_files, detect_csv)

In [32]:
# Write the sample x cell type metadata table as a dated .csv
meta_csv <- file.path(
    "output",
    paste0("diha_AIFI_L3_pseudobulk_meta_", Sys.Date(), ".csv")
)
fwrite(x = pb_data$sample_meta, file = meta_csv)

In [33]:
# Track the metadata .csv for upload
out_files <- append(out_files, meta_csv)

## Upload results to HISE

In [43]:
# Study space ID and dated title passed to uploadFiles() below
study_space_uuid <- "de025812-5e73-4b3c-9c3b-6d0eac412f2a"
title <- paste("DIHA scRNA L3 Pseudobulk", Sys.Date())

In [44]:
# Generate a new three-word ID, used as the upload destination below.
# NOTE(review): this overwrites the `search_id` that was used earlier to
# select the input files; re-running the input selection after this cell
# would search for the new ID instead.
search_id <- ids::proquint(n_words = 3)
search_id

In [45]:
# IDs of the input archives, passed as inputFileIds for the upload
in_list <- as.list(tar_files$id)
in_list

In [46]:
# Paths of all output files collected above, as a list for the upload
out_list <- as.list(out_files)
out_list

In [47]:
# Upload all outputs to the project store under the generated destination
# ID, recording the input archive IDs alongside them.
# The call asks for interactive (y/n) confirmation, as shown in the output.
uploadFiles(
    files = out_list,
    studySpaceId = study_space_uuid,
    title = title,
    inputFileIds = in_list,
    store = "project",
    destination = search_id
)

You are trying to upload the following files:  output/diha_AIFI_L3_pseudobulk_list_2024-04-27.rds output/diha_AIFI_L3_pseudobulk_per_type_2024-04-27.tar output/diha_AIFI_L3_pseudobulk_agg_2024-04-27.csv output/diha_AIFI_L3_pseudobulk_norm_2024-04-27.csv output/diha_AIFI_L3_pseudobulk_detect_2024-04-27.csv output/diha_AIFI_L3_pseudobulk_meta_2024-04-27.csv



(y/n) y


In [48]:
# Print the R version, platform, and package versions used for this run
sessionInfo()

R version 4.3.2 (2023-10-31)
Platform: x86_64-conda-linux-gnu (64-bit)
Running under: Ubuntu 20.04.6 LTS

Matrix products: default
BLAS/LAPACK: /opt/conda/lib/libopenblasp-r0.3.25.so;  LAPACK version 3.11.0

locale:
 [1] LC_CTYPE=C.UTF-8       LC_NUMERIC=C           LC_TIME=C.UTF-8       
 [4] LC_COLLATE=C.UTF-8     LC_MONETARY=C.UTF-8    LC_MESSAGES=C.UTF-8   
 [7] LC_PAPER=C.UTF-8       LC_NAME=C              LC_ADDRESS=C          
[10] LC_TELEPHONE=C         LC_MEASUREMENT=C.UTF-8 LC_IDENTIFICATION=C   

time zone: Etc/UTC
tzcode source: system (glibc)

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
 [1] tidyr_1.3.0        Seurat_5.0.1       SeuratObject_5.0.1 sp_2.1-2          
 [5] furrr_0.3.1        future_1.33.1      purrr_1.0.2        H5weaver_1.2.0    
 [9] rhdf5_2.46.1       Matrix_1.6-4       data.table_1.15.4  hise_2.16.0       
[13] dplyr_1.1.4       

loaded via a namespace (and not attached):
  