# Assemble Pseudobulk data for download

To make our pseudobulk data available for re-use, we'll assemble data for each sample x AIFI_L3 cell type. Because this yields a very large array of values (~50,000 x 33,000), we'll split the data based on sample metadata to make some smaller sets that should be more amenable to import by other users.

We'll also put the data in a "tidy" orientation - that is, each row will represent a single observation (sample x AIFI_L3 cell type), while each column will represent a feature (gene).

We'll supply two kinds of pseudobulk values: count sums per group, which we use for DESeq2 analysis, and mean(log(normalized data)) values, which we use for visualization or effect size comparisons.

To save on file size, we'll round the mean(log(normalized data)) values to 6 digits after the decimal place, which should provide adequate precision for most applications. Those requiring the full precision of 32-bit float values may want to recompute the pseudobulk values.

In [1]:
quiet_library <- function(...) { suppressPackageStartupMessages(library(...)) }
quiet_library(data.table)
quiet_library(dplyr)
quiet_library(H5weaver)
quiet_library(hise)
quiet_library(purrr)
quiet_library(furrr)

In [2]:
# Configure the future backend used by the future_map*() calls below:
# the "multicore" strategy with 8 workers.
plan("multicore", workers = 8)

In [3]:
# Ensure the "output" directory exists before anything is written to it.
if (!dir.exists("output")) dir.create("output")

## Helper functions

In [4]:
# Resolve the local path(s) of a cached HISE file, fetching it on first use.
#
# Files are looked up under "cache/<uuid>/". If that directory does not exist
# yet, cacheFiles() is called for the UUID (presumably it downloads into that
# directory -- confirm against the hise package documentation).
#
# @param uuid Character scalar, the HISE file UUID.
# @return Character vector with the path of every file found in the cache
#   directory for `uuid`.
#
# Stops with an error when no file is found, so that a failed or incomplete
# download is reported here instead of surfacing later as an obscure error
# in a downstream reader.
cache_uuid_path <- function(uuid) {
    cache_dir <- file.path("cache", uuid)
    if (!dir.exists(cache_dir)) {
        cacheFiles(list(uuid))
    }
    cache_path <- list.files(cache_dir, full.names = TRUE)
    if (length(cache_path) == 0) {
        stop(
            "No cached file found for UUID ", uuid, " in ", cache_dir,
            call. = FALSE
        )
    }
    cache_path
}

In [5]:
# Transpose one pseudobulk table from genes x cell types to
# cell types x genes.
#
# @param df data.frame with gene names in column `V1` (assumed to be the
#   first column, since column 1 is dropped) and one numeric column per
#   cell type.
# @param round_to Optional number of decimal places to round values to;
#   NULL (the default) leaves values unrounded.
# @return data.frame with one row per cell type (row names are the original
#   column names) and one column per gene, columns sorted by gene name.
transpose_pb <- function(df, round_to = NULL) {
    genes <- df$V1
    rownames(df) <- genes
    # drop = FALSE keeps a data.frame even when only one cell type column
    # remains; without it the result collapses to an unnamed vector.
    df <- df[, -1, drop = FALSE]
    mat <- t(df)
    # drop = FALSE likewise keeps a matrix when there is a single gene.
    mat <- mat[, sort(colnames(mat)), drop = FALSE]
    if (!is.null(round_to)) {
        mat <- round(mat, round_to)
    }
    as.data.frame(mat)
}

In [6]:
# Attach identifying columns to a transposed pseudobulk table.
#
# @param df data.frame whose row names are the AIFI_L3 labels.
# @param id Sample identifier stored in `specimen.specimenGuid` on every row.
# @return `df` with `specimen.specimenGuid` and `AIFI_L3` appended as the
#   last two columns and its row names reset to the default sequence.
add_sample_id <- function(df, id) {
    # Capture the labels up front; the row names are cleared below
    cell_types <- rownames(df)

    df$specimen.specimenGuid <- id
    df$AIFI_L3 <- cell_types

    rownames(df) <- NULL
    df
}

## Download sample metadata

In [7]:
# UUID of the sample metadata file; cache_uuid_path() resolves it to the
# path(s) of the locally cached copy.
meta_uuid <- "af25e3e7-25c1-4476-afb4-926bd201db8f"
meta_file <- cache_uuid_path(meta_uuid)

In [8]:
# Load the sample metadata; the first CSV column is used as row names.
meta <- read.csv(meta_file, row.names = 1)

## Download and unpack pseudobulk data

In [9]:
# UUIDs of the two pseudobulk inputs: mean values and summed values.
mean_pb_uuid <- "00478a17-3721-4cf3-b6fe-a3473a67e575"
sum_pb_uuid <- "d6be049e-4ed5-4abc-a172-f48aefb7825d"

In [10]:
# Resolve (and fetch if not yet cached) the local paths of both inputs;
# they are treated as tar archives in the extraction step that follows.
mean_pb_tar <- cache_uuid_path(mean_pb_uuid)
sum_pb_tar <- cache_uuid_path(sum_pb_uuid)

In [11]:
# Unpack each pseudobulk archive into the working directory, skipping any
# archive whose expected directory already exists.
#
# The archive path is passed through shQuote() so that paths containing
# spaces or shell metacharacters are handled, and the exit status of tar is
# checked so that a failed extraction stops the run instead of leaving an
# empty file list for the later steps.
mean_dir <- "Average_LogNormalized_Expression_of_Celltypes_by_Sample_AIFI_L3"
if (!dir.exists(mean_dir)) {
    tar_call <- paste("tar -xf", shQuote(mean_pb_tar))
    if (system(tar_call) != 0) {
        stop("Failed to extract ", mean_pb_tar, call. = FALSE)
    }
}
sum_dir <- "Aggregated_Raw_Expression_of_Celltypes_by_Sample_AIFI_L3"
if (!dir.exists(sum_dir)) {
    tar_call <- paste("tar -xf", shQuote(sum_pb_tar))
    if (system(tar_call) != 0) {
        stop("Failed to extract ", sum_pb_tar, call. = FALSE)
    }
}

In [12]:
# List every file in each extracted directory, with the directory prefix
# included in the returned paths.
mean_pb_files <- list.files(mean_dir, full.names = TRUE)
sum_pb_files <- list.files(sum_dir, full.names = TRUE)

In [13]:
# Sample IDs are the file names without the ".csv" extension.
# The pattern is escaped and anchored so that only a literal trailing ".csv"
# is removed; an unescaped "." matches any character, and an unanchored
# pattern could strip text from the middle of a name.
mean_sample_ids <- sub("\\.csv$", "", basename(mean_pb_files))
sum_sample_ids <- sub("\\.csv$", "", basename(sum_pb_files))

## Read and restructure data

#### Mean values

In [14]:
# Read every mean-value file in parallel and label each table with its
# sample ID.
mean_pb_list <- setNames(
    future_map(mean_pb_files, fread),
    mean_sample_ids
)

In [15]:
# Convert each table to a plain data.frame, then transpose it with values
# rounded to 6 decimal places.
mean_pb_list <- mean_pb_list %>%
  future_map(as.data.frame) %>%
  future_map(transpose_pb, round_to = 6)

# Tag each table with its sample ID, taken from the list names.
mean_pb_list <- future_map2(
    mean_pb_list,
    names(mean_pb_list),
    add_sample_id
)

In [16]:
# Stack the per-sample tables into one data frame.
mean_pb_df <- list_rbind(mean_pb_list)

In [17]:
# Show the row and column counts of the combined mean table.
dim(mean_pb_df)

In [18]:
# Move the two identifier columns to the front; all other columns keep
# their existing order.
mean_pb_df <- relocate(mean_pb_df, specimen.specimenGuid, AIFI_L3)

#### Sum values

In [19]:
# Read every sum-value file in parallel and label each table with its
# sample ID.
sum_pb_list <- setNames(
    future_map(sum_pb_files, fread),
    sum_sample_ids
)

In [20]:
# Convert each table to a plain data.frame, then transpose it. No rounding
# is applied to the summed values.
sum_pb_list <- sum_pb_list %>%
  future_map(as.data.frame) %>%
  future_map(transpose_pb)

# Tag each table with its sample ID, taken from the list names.
sum_pb_list <- future_map2(
    sum_pb_list,
    names(sum_pb_list),
    add_sample_id
)

In [21]:
# Stack the per-sample tables into one data frame.
sum_pb_df <- list_rbind(sum_pb_list)

In [22]:
# Show the row and column counts of the combined sum table.
dim(sum_pb_df)

In [23]:
# Move the two identifier columns to the front; all other columns keep
# their existing order.
sum_pb_df <- relocate(sum_pb_df, specimen.specimenGuid, AIFI_L3)

### Subset by cohort, sex, and CMV

In [24]:
# Build a grouping key of the form "<cohort>_<sex>_<cmv>" for every sample,
# then split the metadata into one data frame per key.
meta <- mutate(
    meta,
    pb_group = paste0(
        cohort.cohortGuid, "_", subject.biologicalSex, "_", subject.cmv
    )
)
split_meta <- split(meta, meta[["pb_group"]])

### Write mean values

In [25]:
# Directory that receives the per-group mean CSV files; create it if needed.
mean_out_dir <- "sound-life_pseudobulk_mean-log-norm"
if (!dir.exists(mean_out_dir)) dir.create(mean_out_dir)

In [27]:
# For each metadata group, write one metadata CSV and one CSV holding the
# mean pseudobulk rows of the samples in that group.
walk(
    split_meta,
    function(group_meta) {
        # Every row of a split shares one pb_group value; use it as file prefix
        group_id <- group_meta$pb_group[1]
        file_prefix <- paste0(mean_out_dir, "/", group_id)

        # The grouping key is only a helper column and is not exported
        group_meta <- select(group_meta, -pb_group)

        group_mean <- filter(
            mean_pb_df,
            specimen.specimenGuid %in% group_meta$specimen.specimenGuid
        )

        fwrite(group_meta, paste0(file_prefix, "_pseudobulk_meta.csv"))
        fwrite(group_mean, paste0(file_prefix, "_pseudobulk_mean-log-norm.csv"))
    }
)

### Write sum values

In [28]:
# Directory that receives the per-group sum CSV files; create it if needed.
sum_out_dir <- "sound-life_pseudobulk_sum"
if (!dir.exists(sum_out_dir)) dir.create(sum_out_dir)

In [29]:
# For each metadata group, write one metadata CSV and one CSV holding the
# summed pseudobulk rows of the samples in that group.
walk(
    split_meta,
    function(group_meta) {
        # Every row of a split shares one pb_group value; use it as file prefix
        group_id <- group_meta$pb_group[1]
        file_prefix <- paste0(sum_out_dir, "/", group_id)

        # The grouping key is only a helper column and is not exported
        group_meta <- select(group_meta, -pb_group)

        group_sum <- filter(
            sum_pb_df,
            specimen.specimenGuid %in% group_meta$specimen.specimenGuid
        )

        fwrite(group_meta, paste0(file_prefix, "_pseudobulk_meta.csv"))
        fwrite(group_sum, paste0(file_prefix, "_pseudobulk_sum.csv"))
    }
)

## Tar and compress data for storage and download

In [31]:
# Bundle the per-group mean CSV directory into a dated, gzip-compressed
# tarball under "output/".
mean_out_file <- paste0("output/", mean_out_dir, "_", Sys.Date(), ".tar.gz")
# shQuote() keeps the command valid if a path contains spaces or shell
# metacharacters.
mean_call <- paste(
    "tar -czf",
    shQuote(mean_out_file),
    shQuote(mean_out_dir)
)
# Stop on failure so that a missing or partial archive is never uploaded.
if (system(mean_call) != 0) {
    stop("tar failed while creating ", mean_out_file, call. = FALSE)
}

In [32]:
# Bundle the per-group sum CSV directory into a dated, gzip-compressed
# tarball under "output/".
sum_out_file <- paste0("output/", sum_out_dir, "_", Sys.Date(), ".tar.gz")
# shQuote() keeps the command valid if a path contains spaces or shell
# metacharacters.
sum_call <- paste(
    "tar -czf",
    shQuote(sum_out_file),
    shQuote(sum_out_dir)
)
# Stop on failure so that a missing or partial archive is never uploaded.
if (system(sum_call) != 0) {
    stop("tar failed while creating ", sum_out_file, call. = FALSE)
}

## Upload data to HISE

In [33]:
# Study space UUID and a dated title, both passed to uploadFiles() below.
study_space_uuid <- "de025812-5e73-4b3c-9c3b-6d0eac412f2a"
title <- paste("DIHA Pseudobulk Datasets", Sys.Date())

In [34]:
# Generate a 3-word proquint identifier and display it; it is passed as the
# `destination` argument of the upload call.
search_id <- ids::proquint(n_words = 3)
search_id

In [35]:
# UUIDs of the three input files, one list element per UUID.
in_list <- list(meta_uuid, mean_pb_uuid, sum_pb_uuid)

In [36]:
# Paths of the two archives to upload.
out_list <- list(mean_out_file, sum_out_file)

In [37]:
# Display the upload paths for a visual check before uploading.
out_list

In [38]:
# Upload both archives to the study space, passing the input file UUIDs as
# `inputFileIds`.
# NOTE(review): the exact meaning of `store` and `destination` is defined by
# the hise package -- confirm against its documentation.
uploadFiles(
    files = out_list,
    studySpaceId = study_space_uuid,
    title = title,
    inputFileIds = in_list,
    store = "project",
    destination = search_id
)

You are trying to upload the following files:  output/sound-life_pseudobulk_mean-log-norm_2024-08-14.tar.gz output/sound-life_pseudobulk_sum_2024-08-14.tar.gz



(y/n) y


In [39]:
# Record the R version and package versions used for this run.
sessionInfo()

R version 4.3.2 (2023-10-31)
Platform: x86_64-conda-linux-gnu (64-bit)
Running under: Ubuntu 20.04.6 LTS

Matrix products: default
BLAS/LAPACK: /opt/conda/lib/libopenblasp-r0.3.25.so;  LAPACK version 3.11.0

locale:
 [1] LC_CTYPE=C.UTF-8       LC_NUMERIC=C           LC_TIME=C.UTF-8       
 [4] LC_COLLATE=C.UTF-8     LC_MONETARY=C.UTF-8    LC_MESSAGES=C.UTF-8   
 [7] LC_PAPER=C.UTF-8       LC_NAME=C              LC_ADDRESS=C          
[10] LC_TELEPHONE=C         LC_MEASUREMENT=C.UTF-8 LC_IDENTIFICATION=C   

time zone: Etc/UTC
tzcode source: system (glibc)

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
[1] furrr_0.3.1       future_1.33.1     purrr_1.0.2       hise_2.16.0      
[5] H5weaver_1.2.0    rhdf5_2.46.1      Matrix_1.6-4      dplyr_1.1.4      
[9] data.table_1.15.4

loaded via a namespace (and not attached):
 [1] jsonlite_1.8.8      compiler_4.3.2      crayon_1.5.2       
 [4] tidyselect_1.2.0    IRdis