# Convert to Pseudobulk by count sums

In this notebook, we sum UMI counts, and compute means after LogNormalize in Seurat to generate pseudobulk values for each cell type per sample.

### Output structure

This notebook generates 3 matrices, as well as sample x cell type metadata:

`agg_mat`: Raw count aggregates generated as the sum of UMIs per gene within each sample x cell type group  
`mean_mat`: Mean of normalized expression per gene within each sample x cell type group, computed after applying Seurat's `NormalizeData()` with the `LogNormalize` method  
`detect_mat`: Counts of gene detection frequency within each group, which can be used for feature selection  

Sample metadata (`sample_meta`) has the following columns:

`cohort.cohortGuid`: Cohort ID  
`subject.biologicalSex`: Subject's biological sex (Female or Male)  
`subject.birthYear`: Subject's year of birth  
`subject.subjectGuid`: Unique Subject ID  
`sample.sampleKitGuid`: Unique Sample ID  
`sample.visitName`: Name of sample collection visit  
`specimen.specimenGuid`: Unique ID of specific aliquot used to generate data  
`batch_id`: Batch ID for quality control  
`pool_id`: Pool ID for quality control (usually 2 sample pools per batch)  
`aifi_cell_type`: T cell type 
`n_cells`: Number of cells used to generate pseudobulk counts  
`barcodes`: Unique identifier for this pseudobulk population, with the structure `{subject.subjectGuid}_{aifi_cell_type}`, where the cell type is formatted for safe use in names.
- `barcodes` matches the column names of the matrices

## Load packages

In [1]:
# Attach a package while hiding its startup messages.
# Everything in `...` is forwarded unchanged to library().
quiet_library <- function(...) {
    suppressPackageStartupMessages(
        library(...)
    )
}

quiet_library(dplyr)
quiet_library(hise)
quiet_library(H5weaver)
quiet_library(purrr)
quiet_library(furrr)
quiet_library(Seurat)
quiet_library(tidyr)

In [2]:
# Run furrr/future calls on 8 workers using the multicore backend
plan(multicore, workers = 8)
# Raise the size limit on globals exported to a future to 4000 * 1024^2 bytes
options(future.globals.maxSize = 4000 * 1024 ^ 2) # 4GB per worker

In [3]:
# Make sure both working directories exist before anything is written
invisible(lapply(
    c("output", "pseudobulk"),
    function(dir_name) {
        if (!dir.exists(dir_name)) {
            dir.create(dir_name)
        }
    }
))

In [4]:
# Accumulates the paths of every output file so they can be uploaded together
out_files <- c()

## Helper functions

In [5]:
# Resolve a HISE file UUID to the path of its locally cached file.
#
# Arguments:
#   uuid: HISE file UUID (character). Files are looked up under `cache/<uuid>`.
#
# Returns the path (relative to the working directory) of the first file in
# the UUID's cache directory. If the directory does not exist yet,
# cacheFiles() is called first to retrieve it.
#
# Stops with an error when the cache directory contains no files, rather than
# silently returning NA and failing later in a reader.
read_path_uuid <- function(uuid) {
    uuid_path <- file.path("cache", uuid)
    if (!dir.exists(uuid_path)) {
        cacheFiles(list(uuid))
    }

    cached_files <- list.files(uuid_path, full.names = TRUE)
    if (length(cached_files) == 0) {
        stop(
            "No cached file found for UUID ", uuid, " in ", uuid_path,
            call. = FALSE
        )
    }

    cached_files[1]
}

In [6]:
# Read the cached CSV file for a HISE file UUID into a data.frame.
# The path is resolved with read_path_uuid().
read_csv_uuid <- function(uuid) {
    read.csv(read_path_uuid(uuid))
}

This function formats cell types for use in filenames

In [7]:
# Make cell type labels safe for use in file and column names.
# Substitutions are applied in order: "+" becomes "pos", "-" becomes "neg",
# and then spaces become "-".
format_cell_type <- function(cell_type) {
    substitutions <- list(
        c("\\+", "pos"),
        c("-", "neg"),
        c(" ", "-")
    )
    for (rule in substitutions) {
        cell_type <- gsub(rule[1], rule[2], cell_type)
    }
    cell_type
}

These functions convert the expression values in a Seurat object to pseudobulk matrices

In [8]:
# Bind a list of equal-length per-group vectors into a matrix.
# Each list element becomes one column; `col_names` and `row_names` label
# the columns and rows of the result.
sum_list_to_matrix <- function(sum_list, col_names, row_names) {
    n_groups <- length(sum_list)
    values <- unlist(sum_list)
    out <- matrix(values, ncol = n_groups)
    dimnames(out) <- list(row_names, col_names)
    out
}

In [9]:
# Build per-cell-type pseudobulk matrices from one Seurat object.
#
# Arguments:
#   sample_so: Seurat object with an "RNA" assay holding a `counts` layer.
#              Its meta.data must provide `barcodes`, `subject.subjectGuid`,
#              `aifi_cell_type`, `cohort.cohortGuid`, `batch_id` and `pool_id`.
#              Only the first `subject.subjectGuid` value is used to name the
#              output columns, so the object should hold a single subject.
#   exclude:   Optional regular expression; genes whose names match it are
#              removed before aggregation. NULL (default) keeps every gene.
#
# Returns a list with:
#   agg_mat:     genes x cell types, sum of raw counts
#   mean_mat:    genes x cell types, mean of LogNormalize'd values
#   detect_mat:  genes x cell types, number of stored (non-zero) entries
#   sample_meta: one row per metadata combination and cell type, with
#                `n_cells` and a `barcodes` column matching the matrix columns
#
# Cells whose `aifi_cell_type` is NA are dropped with a warning; they cannot
# be assigned to a pseudobulk group.
sample_so_to_type_pseudobulk <- function(sample_so, exclude = NULL) {
    # Read cell metadata
    meta <- sample_so@meta.data
    subject <- meta$subject.subjectGuid[1]

    # Prefix shared by every pseudobulk column generated from this object
    sample_prefix <- paste0(subject, "_")

    # Format cell type so they can be used as names
    meta <- meta %>%
      mutate(format_type = format_cell_type(aifi_cell_type))

    # Read counts. Gene names are taken from the object passed in, so the
    # function does not depend on any global Seurat object.
    mat <- sample_so[["RNA"]]@layers$counts
    rownames(mat) <- rownames(sample_so)
    colnames(mat) <- meta$barcodes

    # Unlabeled cells would be silently skipped by split() below but would
    # still create an NA metadata row with no matching matrix column.
    is_labeled <- !is.na(meta$aifi_cell_type)
    if (!all(is_labeled)) {
        warning(
            "Dropping ", sum(!is_labeled),
            " cell(s) without an aifi_cell_type label for subject ", subject,
            call. = FALSE
        )
        meta <- meta[is_labeled, , drop = FALSE]
    }

    # Filter genes if needed
    genes <- rownames(mat)
    if (!is.null(exclude)) {
        keep_genes <- genes[!grepl(exclude, genes)]
    } else {
        keep_genes <- genes
    }

    # Filter for selected genes and ensure order matches metadata
    mat <- mat[keep_genes, meta$barcodes, drop = FALSE]

    # Split metadata and matrices by formatted cell type
    split_meta <- split(meta, meta$format_type)
    split_mats <- map(
        split_meta,
        function(type_meta) {
            # Transpose so each gene is a column
            t(mat[, type_meta$barcodes, drop = FALSE])
        }
    )

    # Sum counts and detection for each gene.
    # diff() of the column pointer gives the number of stored entries per
    # column; this assumes the transposed matrix is column-compressed sparse.
    agg_sums <- map(split_mats, function(type_mat) { colSums(type_mat) })
    detect_sums <- map(split_mats, function(type_mat) { diff(type_mat@p) })

    # Normalize with Seurat
    so <- CreateSeuratObject(
        counts = mat,
        meta.data = meta
    )
    so <- NormalizeData(
        so,
        normalization.method = "LogNormalize",
        scale.factor = 1e4,
        verbose = FALSE
    )

    # Extract the normalized layer once, then split and transpose per type
    norm_mat <- so[["RNA"]]@layers$data
    colnames(norm_mat) <- so@meta.data$barcodes
    norm_mats <- map(
        split_meta,
        function(type_meta) {
            t(norm_mat[, type_meta$barcodes, drop = FALSE])
        }
    )

    # Mean of normalized counts
    norm_means <- map(
        norm_mats,
        function(type_mat) {
            colSums(type_mat) / nrow(type_mat)
        }
    )

    # Assemble matrices from sums
    aggregate_names <- paste0(sample_prefix, names(split_meta))

    agg_mat <- sum_list_to_matrix(agg_sums, aggregate_names, keep_genes)
    mean_mat <- sum_list_to_matrix(norm_means, aggregate_names, keep_genes)
    detect_mat <- sum_list_to_matrix(detect_sums, aggregate_names, keep_genes)

    # Generate aggregate metadata
    type_columns <- c("aifi_cell_type")

    meta <- meta %>%
      select(cohort.cohortGuid,
             starts_with("subject"),
             starts_with("sample"),
             starts_with("specimen"),
             "batch_id", "pool_id",
             all_of(type_columns)) %>%
      group_by(aifi_cell_type) %>%
      mutate(n_cells = n()) %>%
      ungroup() %>%
      unique() %>%
      mutate(barcodes = paste0(sample_prefix, format_cell_type(aifi_cell_type))) %>%
      arrange(aifi_cell_type)

    # Reorder matrix columns to match the metadata rows
    agg_mat <- agg_mat[, meta$barcodes, drop = FALSE]
    mean_mat <- mean_mat[, meta$barcodes, drop = FALSE]
    detect_mat <- detect_mat[, meta$barcodes, drop = FALSE]

    list(
        agg_mat = agg_mat,
        mean_mat = mean_mat,
        detect_mat = detect_mat,
        sample_meta = meta
    )
}

In [10]:
# Subset a pseudobulk result list to the groups matching a metadata filter.
#
# Arguments:
#   pb_data: list with `sample_meta`, `agg_mat`, `mean_mat` and `detect_mat`,
#            where the matrix column names match `sample_meta$barcodes`.
#   ...:     filter conditions forwarded to dplyr::filter() on `sample_meta`.
#
# Returns `pb_data` with the metadata filtered and each matrix restricted to
# the remaining barcodes. The matrices stay two-dimensional even when only
# one (or no) pseudobulk column is kept.
select_pseudobulk_samples <- function(pb_data, ...) {
    pb_data$sample_meta <- pb_data$sample_meta %>%
      filter( ... )

    keep_barcodes <- pb_data$sample_meta$barcodes

    pb_data$agg_mat <- pb_data$agg_mat[, keep_barcodes, drop = FALSE]
    pb_data$mean_mat <- pb_data$mean_mat[, keep_barcodes, drop = FALSE]
    pb_data$detect_mat <- pb_data$detect_mat[, keep_barcodes, drop = FALSE]

    pb_data
}

## Retrieve files

Now, we'll use the HISE SDK package to retrieve the Seurat objects and cell type labels based on file UUIDs. These will be placed in the `cache/` subdirectory by default.

In [11]:
# HISE file UUIDs for the inputs of this notebook, keyed by a short name
file_uuids <- list(
    cd4_so = "50756d6d-8216-4eba-8245-0a41d4f44e7a", # CD4 T cell Seurat object
    cd8_so = "057a9e09-1f9d-4684-b00f-7739209ffe17", # CD8 T cell Seurat object
    cd4_labels = "56da30a4-3ac1-4c2b-8ca8-5e1d58cc6986", # CD4 type labels
    cd8_labels = "d369cedf-18c3-4579-83ea-a0daab116e3c"  # CD8 type labels
)

In [12]:
# Resolve every UUID to a local file path (caching the file first if needed)
file_paths <- map(file_uuids, read_path_uuid)

## Read CD4 and CD8 data, and split per sample

In [13]:
# Read both label tables and stack them into one; rbind() requires the two
# tables to have the same columns
cd4_labels <- read.csv(file_paths$cd4_labels)
cd8_labels <- read.csv(file_paths$cd8_labels)
all_labels <- rbind(cd4_labels, cd8_labels)

In [14]:
# Preview the combined label table
head(all_labels)

Unnamed: 0_level_0,barcodes,sample.sampleKitGuid,predicted.celltype.l1.score,predicted.celltype.l1,predicted.celltype.l2.score,predicted.celltype.l2,predicted.celltype.l3.score,predicted.celltype.l3,aifi_cell_type
Unnamed: 0_level_1,<chr>,<chr>,<dbl>,<chr>,<dbl>,<chr>,<dbl>,<chr>,<chr>
1,dc6d9d6831b011ef80e742c13d66f8da,KT00395,1,CD4 T,0.8999748,CD4 TCM,0.3633358,CD4 TCM_3,t_cd4_em
2,dc6f966831b011ef80e742c13d66f8da,KT00395,1,CD4 T,0.8149228,CD4 Naive,0.8149228,CD4 Naive,t_cd4_naive
3,dc8f1c7c31b011ef80e742c13d66f8da,KT00395,1,CD4 T,0.4632712,CD4 TCM,0.3341503,CD4 Naive,t_cd4_naive
4,dca1879a31b011ef80e742c13d66f8da,KT00395,1,CD4 T,0.9009319,CD4 Naive,0.9009319,CD4 Naive,t_cd4_naive
5,dcb6a9f431b011ef80e742c13d66f8da,KT00395,1,CD4 T,0.5730879,CD4 TCM,0.3885105,CD4 TCM_2,t_cd4_cm
6,dcb8a98431b011ef80e742c13d66f8da,KT00395,1,CD4 T,0.8962496,CD4 TCM,0.5954403,CD4 TCM_1,t_cd4_naive


In [15]:
# Load the CD4 and CD8 T cell Seurat objects from their cached .rds files
cd4_so <- readRDS(file_paths$cd4_so)
cd8_so <- readRDS(file_paths$cd8_so)

In [16]:
# Combine both objects into one, then join the "counts" layers
# NOTE(review): presumably merge() leaves one counts layer per input object
# and JoinLayers() collapses them into a single layer - confirm
all_so <- merge(cd4_so, cd8_so)
all_so <- JoinLayers(all_so, layers = c("counts"))

In [17]:
# Attach cell type labels by barcode; cells with no entry in all_labels get NA
all_so@meta.data$aifi_cell_type <- all_labels$aifi_cell_type[match(all_so@meta.data$barcodes, all_labels$barcodes)]

In [18]:
# Split the combined object into one Seurat object per subject
subjects <- unique(all_so@meta.data$subject.subjectGuid)
split_so <- map(
    subjects,
    function(subject) {
        # Keep only the cells (columns) that belong to this subject
        all_so[,all_so$subject.subjectGuid == subject]
    }
)

In [19]:
# Compute pseudobulk results for each per-subject object via furrr, using the
# future plan configured earlier in the notebook
results_list <- future_map(
    split_so, sample_so_to_type_pseudobulk
)

In [20]:
# Regroup the per-subject results by result type: pb_data gets one element
# per result name, each holding that element from every subject's result
result_names <- names(results_list[[1]])

pb_data <- map(
    setNames(result_names, result_names),
    function(result_name) {
        map(results_list, result_name)
    }
)

In [21]:
# Combine per-subject pieces: matrices are bound column-wise (one column per
# pseudobulk group) and metadata tables are stacked row-wise
pb_data$agg_mat <- do.call(cbind, pb_data$agg_mat)
pb_data$mean_mat <- do.call(cbind, pb_data$mean_mat)
pb_data$detect_mat <- do.call(cbind, pb_data$detect_mat)
pb_data$sample_meta <- do.call(rbind, pb_data$sample_meta)

## Save to .rds for later use in R

In [22]:
# Save the full result list (all matrices plus metadata) to a dated .rds file
out_rds <- paste0("output/ped_sr_tea_pseudobulk_list_", Sys.Date(), ".rds")
saveRDS(pb_data, out_rds)

In [23]:
# Register the .rds file for upload
out_files <- c(out_files, out_rds)

## Save to .csv for flexible downstream usage

In [24]:
# Export the summed-count matrix as CSV with gene names in the first column
agg_csv <- paste0("output/ped_sr_tea_pseudobulk_agg_", Sys.Date(), ".csv")
agg_df <- as.data.frame(pb_data$agg_mat) %>%
  mutate(gene = rownames(pb_data$agg_mat)) %>%
  select(gene, everything())
# Namespace-qualified: data.table is not attached explicitly by this notebook
data.table::fwrite(agg_df, agg_csv)

In [25]:
# Register the aggregate CSV for upload
out_files <- c(out_files, agg_csv)

In [26]:
# Export the mean normalized matrix as CSV with gene names in the first column
mean_csv <- paste0("output/ped_sr_tea_pseudobulk_mean_", Sys.Date(), ".csv")
mean_df <- as.data.frame(pb_data$mean_mat) %>%
  mutate(gene = rownames(pb_data$mean_mat)) %>%
  select(gene, everything())
# Namespace-qualified: data.table is not attached explicitly by this notebook
data.table::fwrite(mean_df, mean_csv)

In [27]:
# Register the mean CSV for upload
out_files <- c(out_files, mean_csv)

In [28]:
# Export the detection-count matrix as CSV with gene names in the first column
detect_csv <- paste0("output/ped_sr_tea_pseudobulk_detect_", Sys.Date(), ".csv")
detect_df <- as.data.frame(pb_data$detect_mat) %>%
  mutate(gene = rownames(pb_data$detect_mat)) %>%
  select(gene, everything())
# Namespace-qualified: data.table is not attached explicitly by this notebook
data.table::fwrite(detect_df, detect_csv)

In [29]:
# Register the detection CSV for upload
out_files <- c(out_files, detect_csv)

In [30]:
# Export the pseudobulk metadata table as CSV
meta_csv <- paste0("output/ped_sr_tea_pseudobulk_meta_", Sys.Date(), ".csv")
# Namespace-qualified: data.table is not attached explicitly by this notebook
data.table::fwrite(pb_data$sample_meta, meta_csv)

In [31]:
# Register the metadata CSV for upload
out_files <- c(out_files, meta_csv)

## Save to .h5 for use in visualization tools

For visualization, we'll use just the matrix of normalized data

In [32]:
# Assemble the annotations for the .h5 file: `obs` holds each metadata column
# as a list element, `var` holds the gene names (rows of mean_mat)
h5_list <- list(
    obs = as.list(pb_data$sample_meta),
    var = list(genes = rownames(pb_data$mean_mat))
)

In [52]:
# Preview the pseudobulk metadata
head(pb_data$sample_meta)

cohort.cohortGuid,subject.subjectGuid,sample.sampleKitGuid,Sample,batch_id,pool_id,aifi_cell_type,n_cells,barcodes
<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<int>,<chr>
BR2,BR2005,KT00395,B065-P1_PB00395-02,B065,B065-P1,t_cd4_cm,2687,BR2005_t_cd4_cm
BR2,BR2005,KT00395,B065-P1_PB00395-02,B065,B065-P1,t_cd4_em,2471,BR2005_t_cd4_em
BR2,BR2005,KT00395,B065-P1_PB00395-02,B065,B065-P1,t_cd4_naive,4435,BR2005_t_cd4_naive
BR2,BR2005,KT00395,B065-P1_PB00395-02,B065,B065-P1,t_cd4_treg,621,BR2005_t_cd4_treg
BR2,BR2005,KT00395,B065-P1_PB00395-02,B065,B065-P1,t_cd8_memory,3334,BR2005_t_cd8_memory
BR2,BR2005,KT00395,B065-P1_PB00395-02,B065,B065-P1,t_cd8_naive,1269,BR2005_t_cd8_naive


In [33]:
# Dated output path for the .h5 file
out_h5 <- paste0("output/ped_sr_tea_pseudobulk_mean_", Sys.Date(), ".h5")

In [35]:
# Write the obs/var annotations to the .h5 file
# NOTE(review): presumably this call creates the file that the later
# rhdf5 calls add the "data" dataset to - confirm
write_h5_list(
    h5_list,
    out_h5
)

In [36]:
# Inspect the dimensions and dimnames of the matrix about to be written
str(pb_data$mean_mat)

 num [1:36601, 1:48] 0 0 0 0.00841 0 ...
 - attr(*, "dimnames")=List of 2
  ..$ : chr [1:36601] "MIR1302-2HG" "FAM138A" "OR4F5" "AL627309.1" ...
  ..$ : chr [1:48] "BR2005_t_cd4_cm" "BR2005_t_cd4_em" "BR2005_t_cd4_naive" "BR2005_t_cd4_treg" ...


In [37]:
# Create a double-precision "data" dataset with the same dimensions as
# mean_mat, chunked so that each chunk holds one full column
rhdf5::h5createDataset(
    file = out_h5, 
    dataset = "data",
    dims = dim(pb_data$mean_mat), 
    storage.mode = "double", 
    chunk = c(nrow(pb_data$mean_mat),1), 
    level = 4)

In [38]:
# Write the mean normalized matrix into the "data" dataset
rhdf5::h5write(
    pb_data$mean_mat,
    out_h5,
    name = "data"
)

In [39]:
# Register the .h5 file for upload
out_files <- c(out_files, out_h5)

## Upload results to HISE

In [40]:
# Destination study space and a dated title for the upload
study_space_uuid <- "00a53fa5-18da-4333-84cb-3cc0b0761201"
title <- paste("TEA-seq demo pseudobulk data", Sys.Date())

In [47]:
# Generate a random adjective-animal ID, passed as the upload destination
search_id <- ids::adjective_animal()
search_id

In [48]:
# Input file UUIDs, passed to the upload as inputFileIds
in_list <- file_uuids
in_list

In [49]:
# Output file paths collected above, as a list for uploadFiles()
out_list <- as.list(out_files)
out_list

In [50]:
# Upload all output files to the study space, passing the input file UUIDs
# as inputFileIds; the captured output below shows that the call prompts
# interactively
uploadFiles(
    files = out_list,
    studySpaceId = study_space_uuid,
    title = title,
    inputFileIds = in_list,
    destination = search_id
)

[1] "Cannot determine the current notebook."
[1] "1) /home/jupyter/certpro-workflow-demos/adult_vs_pediatric_teaseq/08-R_assemble_pseudobulk_data.ipynb"
[1] "2) /home/jupyter/data-apps-vis/datasets/dynamics_imm_health/06-Python_assemble_all_long-viewer_data.ipynb"
[1] "3) /home/jupyter/data-apps-vis/datasets/dynamics_imm_health/05-Python_assemble_deg_freq_data_for_long-viewer.ipynb"


Please select (1-3)  1


You are trying to upload the following files:  output/ped_sr_tea_pseudobulk_list_2024-07-01.rds output/ped_sr_tea_pseudobulk_agg_2024-07-01.csv output/ped_sr_tea_pseudobulk_mean_2024-07-01.csv output/ped_sr_tea_pseudobulk_detect_2024-07-01.csv output/ped_sr_tea_pseudobulk_meta_2024-07-01.csv output/ped_sr_tea_pseudobulk_mean_2024-07-01.h5



(y/n) y


In [51]:
# Record R and package versions for reproducibility
sessionInfo()

R version 4.3.2 (2023-10-31)
Platform: x86_64-conda-linux-gnu (64-bit)
Running under: Ubuntu 20.04.6 LTS

Matrix products: default
BLAS/LAPACK: /opt/conda/lib/libopenblasp-r0.3.25.so;  LAPACK version 3.11.0

locale:
 [1] LC_CTYPE=C.UTF-8       LC_NUMERIC=C           LC_TIME=C.UTF-8       
 [4] LC_COLLATE=C.UTF-8     LC_MONETARY=C.UTF-8    LC_MESSAGES=C.UTF-8   
 [7] LC_PAPER=C.UTF-8       LC_NAME=C              LC_ADDRESS=C          
[10] LC_TELEPHONE=C         LC_MEASUREMENT=C.UTF-8 LC_IDENTIFICATION=C   

time zone: Etc/UTC
tzcode source: system (glibc)

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
 [1] tidyr_1.3.0        Seurat_5.0.1       SeuratObject_5.0.1 sp_2.1-2          
 [5] furrr_0.3.1        future_1.33.1      purrr_1.0.2        H5weaver_1.2.0    
 [9] rhdf5_2.46.1       Matrix_1.6-4       data.table_1.15.4  hise_2.16.0       
[13] dplyr_1.1.4       

loaded via a namespace (and not attached):
  