# Assemble pseudobulk data per sample

To simplify downstream analysis, we'll split our pseudobulk data into separate files, one per sample.

## Load libraries

In [1]:
# Attach a package while muting its startup messages.
# All arguments are forwarded unchanged to library().
quiet_library <- function(...) {
    suppressPackageStartupMessages(
        library(...)
    )
}

quiet_library(data.table)
quiet_library(dplyr)
quiet_library(hise)
quiet_library(purrr)
quiet_library(furrr)

In [2]:
# Make sure the folder that receives the uploadable archives exists.
if (!dir.exists("output")) dir.create("output")

## Retrieve pseudobulk data

In [3]:
# File UUID of the pseudobulk data to retrieve
pb_uuid <- "f0d841f9-1526-4189-af21-c4a9db1800fc"
# Fetch the file with cacheFiles() (presumably from hise); the next cell
# reads it back from cache/<uuid>
res <- cacheFiles(list(pb_uuid))

In [4]:
# Locate the cached file for this UUID and load it.
pb_file <- list.files(file.path("cache", pb_uuid), full.names = TRUE)
# readRDS() needs exactly one path; fail with a clear message if the cache
# directory is empty (download failed) or holds more than one file.
if (length(pb_file) != 1) {
    stop(
        "Expected exactly one cached file for ", pb_uuid,
        ", found ", length(pb_file),
        call. = FALSE
    )
}
pb_data <- readRDS(pb_file)

## Split metadata by specimen

In [5]:
# One metadata table per specimen, keyed by the specimen GUID column.
meta <- pb_data$sample_meta
split_meta <- split(
    x = meta,
    f = meta$specimen.specimenGuid
)

## Split Sums of count values

In [6]:
# Staging folder for the per-specimen sum CSVs.
if (!dir.exists("sample_pseudobulk_sum")) dir.create("sample_pseudobulk_sum")

In [7]:
# Build one table per specimen from pb_data$agg_mat.
# Result: a list named like split_meta; each element is a data.frame whose
# first column, `gene`, holds the matrix row names, followed by one column
# per barcode listed in that specimen's metadata.
split_agg <- map(
    split_meta,
    function(specimen_meta) {
        # drop = FALSE keeps the subset two-dimensional even when a specimen
        # has a single barcode; without it the subset collapses to a vector,
        # rownames() returns NULL, and select(gene, ...) fails.
        mat <- pb_data$agg_mat[, specimen_meta$barcodes, drop = FALSE]
        df <- as.data.frame(mat)
        df$gene <- rownames(mat)
        # Move the gene column to the front
        select(df, gene, everything())
    }
)

In [8]:
# Write each specimen's table to sample_pseudobulk_sum/<specimen id>.csv
iwalk(
    split_agg,
    function(df, specimen_id) {
        csv_path <- file.path(
            "sample_pseudobulk_sum",
            paste0(specimen_id, ".csv")
        )
        fwrite(df, csv_path)
    }
)

### Assemble .tar for sums

In [9]:
# Bundle the per-specimen sum CSVs into a dated .tar under output/
sum_tar_file <- paste0("output/diha_pseudobulk_sum_per_sample_", Sys.Date(), ".tar")
tar_call <- paste("tar -cf", sum_tar_file, "sample_pseudobulk_sum/*")
# system() returns the command's exit status; stop on failure so a missing
# or partial archive is never uploaded further down.
tar_status <- system(tar_call)
if (tar_status != 0) {
    stop(
        "tar failed with exit status ", tar_status, ": ", tar_call,
        call. = FALSE
    )
}

## Split Mean values

In [10]:
# Staging folder for the per-specimen mean CSVs.
if (!dir.exists("sample_pseudobulk_mean")) dir.create("sample_pseudobulk_mean")

In [11]:
# Build one table per specimen from pb_data$mean_mat.
# Result: a list named like split_meta; each element is a data.frame whose
# first column, `gene`, holds the matrix row names, followed by one column
# per barcode listed in that specimen's metadata.
split_mean <- map(
    split_meta,
    function(specimen_meta) {
        # drop = FALSE keeps the subset two-dimensional even when a specimen
        # has a single barcode; without it the subset collapses to a vector,
        # rownames() returns NULL, and select(gene, ...) fails.
        mat <- pb_data$mean_mat[, specimen_meta$barcodes, drop = FALSE]
        df <- as.data.frame(mat)
        df$gene <- rownames(mat)
        # Move the gene column to the front
        select(df, gene, everything())
    }
)

In [12]:
# Write each specimen's table to sample_pseudobulk_mean/<specimen id>.csv
iwalk(
    split_mean,
    function(df, specimen_id) {
        csv_path <- file.path(
            "sample_pseudobulk_mean",
            paste0(specimen_id, ".csv")
        )
        fwrite(df, csv_path)
    }
)

### Assemble .tar for means

In [13]:
# Bundle the per-specimen mean CSVs into a dated .tar under output/
mean_tar_file <- paste0("output/diha_pseudobulk_mean_per_sample_", Sys.Date(), ".tar")
tar_call <- paste("tar -cf", mean_tar_file, "sample_pseudobulk_mean/*")
# system() returns the command's exit status; stop on failure so a missing
# or partial archive is never uploaded further down.
tar_status <- system(tar_call)
if (tar_status != 0) {
    stop(
        "tar failed with exit status ", tar_status, ": ", tar_call,
        call. = FALSE
    )
}

## Split Detection values

In [14]:
# Staging folder for the per-specimen detection CSVs.
if (!dir.exists("sample_pseudobulk_detect")) dir.create("sample_pseudobulk_detect")

In [15]:
# Build one table per specimen from pb_data$detect_mat.
# Result: a list named like split_meta; each element is a data.frame whose
# first column, `gene`, holds the matrix row names, followed by one column
# per barcode listed in that specimen's metadata.
split_detect <- map(
    split_meta,
    function(specimen_meta) {
        # drop = FALSE keeps the subset two-dimensional even when a specimen
        # has a single barcode; without it the subset collapses to a vector,
        # rownames() returns NULL, and select(gene, ...) fails.
        mat <- pb_data$detect_mat[, specimen_meta$barcodes, drop = FALSE]
        df <- as.data.frame(mat)
        df$gene <- rownames(mat)
        # Move the gene column to the front
        select(df, gene, everything())
    }
)

In [16]:
# Write each specimen's table to sample_pseudobulk_detect/<specimen id>.csv
iwalk(
    split_detect,
    function(df, specimen_id) {
        csv_path <- file.path(
            "sample_pseudobulk_detect",
            paste0(specimen_id, ".csv")
        )
        fwrite(df, csv_path)
    }
)

### Assemble .tar for detection

In [17]:
# Bundle the per-specimen detection CSVs into a dated .tar under output/
detect_tar_file <- paste0("output/diha_pseudobulk_detect_per_sample_", Sys.Date(), ".tar")
tar_call <- paste("tar -cf", detect_tar_file, "sample_pseudobulk_detect/*")
# system() returns the command's exit status; stop on failure so a missing
# or partial archive is never uploaded further down.
tar_status <- system(tar_call)
if (tar_status != 0) {
    stop(
        "tar failed with exit status ", tar_status, ": ", tar_call,
        call. = FALSE
    )
}

In [18]:
# Archives to upload, in sum / mean / detect order
out_files <- c(
    sum_tar_file,
    mean_tar_file,
    detect_tar_file
)

## Upload results to HISE

In [19]:
# Target study space and a dated title for the upload
study_space_uuid <- "de025812-5e73-4b3c-9c3b-6d0eac412f2a"
title <- paste(
    "DIHA scRNA L3 Pseudobulk per Sample",
    Sys.Date()
)

In [20]:
# Generate a 3-word proquint identifier; it is passed as `destination`
# to uploadFiles() later in this notebook
search_id <- ids::proquint(n_words = 3)
# Echo the identifier so it is recorded in the notebook output
search_id

In [21]:
# Input file IDs as a list (supplied as inputFileIds to uploadFiles())
in_list <- as.list(pb_uuid)
# Echo for the notebook record
in_list

In [22]:
# Output archive paths as a list (supplied as files to uploadFiles())
out_list <- as.list(out_files)
# Echo for the notebook record
out_list

In [23]:
# Upload the three .tar archives, recording the source pseudobulk file as
# their input. The call prompts for (y/n) confirmation (see output below).
uploadFiles(
    files = out_list,
    studySpaceId = study_space_uuid,
    title = title,
    inputFileIds = in_list,
    store = "project",
    # proquint identifier generated earlier in this notebook
    destination = search_id
)

You are trying to upload the following files:  output/diha_pseudobulk_sum_per_sample_2024-05-06.tar output/diha_pseudobulk_mean_per_sample_2024-05-06.tar output/diha_pseudobulk_detect_per_sample_2024-05-06.tar



(y/n) y


In [24]:
# Record the R version and attached/loaded package versions for this run
sessionInfo()

R version 4.3.2 (2023-10-31)
Platform: x86_64-conda-linux-gnu (64-bit)
Running under: Ubuntu 20.04.6 LTS

Matrix products: default
BLAS/LAPACK: /opt/conda/lib/libopenblasp-r0.3.25.so;  LAPACK version 3.11.0

locale:
 [1] LC_CTYPE=C.UTF-8       LC_NUMERIC=C           LC_TIME=C.UTF-8       
 [4] LC_COLLATE=C.UTF-8     LC_MONETARY=C.UTF-8    LC_MESSAGES=C.UTF-8   
 [7] LC_PAPER=C.UTF-8       LC_NAME=C              LC_ADDRESS=C          
[10] LC_TELEPHONE=C         LC_MEASUREMENT=C.UTF-8 LC_IDENTIFICATION=C   

time zone: Etc/UTC
tzcode source: system (glibc)

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
[1] furrr_0.3.1       future_1.33.1     purrr_1.0.2       hise_2.16.0      
[5] dplyr_1.1.4       data.table_1.15.4

loaded via a namespace (and not attached):
 [1] jsonlite_1.8.8    compiler_4.3.2    crayon_1.5.2      tidyselect_1.2.0 
 [5] IRdisplay_1.1     stringr_1.5.1     bitops_1.0-7      parallel_4.3.2   