# Select data for analysis from HISE

## Load libraries

In [1]:
# Attach a package while suppressing its startup messages.
# Everything passed in `...` is forwarded unchanged to library().
quiet_library <- function(...) {
  suppressPackageStartupMessages(
    library(...)
  )
}
quiet_library(hise)
quiet_library(dplyr)
quiet_library(purrr)

## Find files in Project Store

In [2]:
# Name of the HISE Project Store to search
project_store <- "PedvsSenior"

# List the contents of that store. toDF = TRUE is passed so the listing can be
# filtered with dplyr verbs in the following cells.
project_files <- listFilesInProjectStores(
  toDF = TRUE,
  storesToList = list(project_store)
)

In [3]:
# Keep only files whose name contains "B065"
project_files <- project_files %>%
  filter(grepl("B065", name))

# .h5 files, excluding any with "coverage" in the name.
# The dot is escaped so the pattern matches the literal ".h5" extension
# rather than any character followed by "h5".
h5_files <- project_files %>%
  filter(grepl("\\.h5$", name)) %>%
  filter(!grepl("coverage", name))

# .arrow files, excluding any with "UPenn" in the name.
# Same escaping as above for the literal ".arrow" extension.
arrow_files <- project_files %>%
  filter(grepl("\\.arrow$", name)) %>%
  filter(!grepl("UPenn", name))

In [4]:
# Derive specimen and sample-kit identifiers from each .h5 file name and keep
# only the columns needed to describe the file.
h5_files <- h5_files %>%
  # Text between "P1_" and the ".h5" extension. The dot is escaped and the
  # pattern anchored at the end so only the literal extension is stripped.
  mutate(specimen.specimenGuid = sub(".+P1_(.+)\\.h5$", "\\1", name)) %>%
  # Drop entries whose extracted identifier contains "IMM"
  filter(!grepl("IMM", specimen.specimenGuid)) %>%
  # "PB<digits>-<suffix>" becomes "KT<digits>"
  mutate(
    sample.sampleKitGuid = sub("PB([0-9]+)-.+", "KT\\1", specimen.specimenGuid)
  ) %>%
  rename(h5_file = name, h5_uuid = id) %>%
  select(sample.sampleKitGuid, specimen.specimenGuid, h5_file, h5_uuid)

In [5]:
# Derive specimen and sample-kit identifiers from each .arrow file name and
# keep only the columns needed to describe the file.
arrow_files <- arrow_files %>%
  # Text between "P1_" and the ".arrow" extension. The dot is escaped and the
  # pattern anchored at the end so only the literal extension is stripped.
  mutate(specimen.specimenGuid = sub(".+P1_(.+)\\.arrow$", "\\1", name)) %>%
  # Drop entries whose extracted identifier contains "IMM"
  filter(!grepl("IMM", specimen.specimenGuid)) %>%
  # "PB<digits>-<suffix>" becomes "KT<digits>"
  mutate(
    sample.sampleKitGuid = sub("PB([0-9]+)-.+", "KT\\1", specimen.specimenGuid)
  ) %>%
  rename(arrow_file = name, arrow_uuid = id) %>%
  select(sample.sampleKitGuid, specimen.specimenGuid, arrow_file, arrow_uuid)

In [6]:
# Attach the matching .arrow file (if any) to each .h5 file. The join keys are
# stated explicitly so the join cannot silently change if the two tables later
# come to share another column name.
file_meta <- h5_files %>%
  left_join(
    arrow_files,
    by = c("sample.sampleKitGuid", "specimen.specimenGuid")
  )

[1m[22mJoining with `by = join_by(sample.sampleKitGuid, specimen.specimenGuid)`


## Get sample descriptors from HISE

In [7]:
# Request "scRNA-seq-labeled" file descriptors from HISE for the sample kits
# listed in file_meta, then pass the result through fileDescToDataframe().
desc <- getFileDescriptors(
  fileType = "scRNA-seq-labeled",
  filter = list(
    sample.sampleKitGuid = as.list(file_meta[["sample.sampleKitGuid"]])
  )
) %>%
  fileDescToDataframe()

In [8]:
# Keep only the kit, subject, and cohort identifiers, one row per distinct
# combination. The descriptor table can hold several rows for the same kit;
# without de-duplication here, the later join on sample.sampleKitGuid would
# repeat each file row once per descriptor row.
desc <- desc %>%
  select(sample.sampleKitGuid, subject.subjectGuid, cohort.cohortGuid) %>%
  distinct()

In [9]:
# Add subject and cohort identifiers to each file row. The key is stated
# explicitly so the join does not depend on which column names the two tables
# happen to share.
file_meta <- file_meta %>%
  left_join(desc, by = "sample.sampleKitGuid")

[1m[22mJoining with `by = join_by(sample.sampleKitGuid)`


In [10]:
# Drop exact duplicate rows, keeping the first occurrence of each
file_meta <- file_meta[!duplicated(file_meta), , drop = FALSE]

In [11]:
# Display the assembled file metadata table for inspection
file_meta

Unnamed: 0_level_0,sample.sampleKitGuid,specimen.specimenGuid,h5_file,h5_uuid,arrow_file,arrow_uuid,subject.subjectGuid,cohort.cohortGuid
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
1,KT00173,PB00173-02,AIFI-2021-07-30T16:42:27.518704481Z/B065-P1_PB00173-02.h5,7039b8ef-3a7c-454f-b49c-34ef30c99cf4,AIFI-2021-08-02T18:25:15.596486061Z/B065-P1_PB00173-02.arrow,23fb03d0-e059-4cfe-999a-6e0c55cddc03,UP1006,UP1
3,KT00192,PB00192-02,AIFI-2021-07-30T16:42:35.831864645Z/B065-P1_PB00192-02.h5,af026062-42d4-488a-a98f-d44af095dd5e,AIFI-2021-08-02T17:32:31.316293195Z/B065-P1_PB00192-02.arrow,bc30186c-2a66-4155-b862-aff510d5db64,UP1007,UP1
5,KT00197,PB00197-02,AIFI-2021-07-30T16:42:37.174285768Z/B065-P1_PB00197-02.h5,6786fe4f-c3c5-4472-b261-6372ca6156ee,AIFI-2021-08-02T17:31:07.836247339Z/B065-P1_PB00197-02.arrow,0edf63a2-f2f6-4117-9791-ddcf79131397,UP1010,UP1
6,KT00396,PB00396-03,AIFI-2021-07-30T16:43:42.188225625Z/B065-P1_PB00396-03.h5,827b5e1e-4f81-4837-ad08-53d4ed949048,AIFI-2021-08-02T18:18:01.30995505Z/B065-P1_PB00396-03.arrow,b7d197bf-0c7b-4f97-b78d-d8e34a92b01b,BR2015,BR2
7,KT00199,PB00199-02,AIFI-2021-07-30T16:43:47.370402584Z/B065-P1_PB00199-02.h5,77e30e6a-5b61-41c4-89cb-44d5ec5363b4,AIFI-2021-08-02T17:26:12.202501522Z/B065-P1_PB00199-02.arrow,cdb98f87-3e6d-421d-957f-e7e3141cb221,UP1001,UP1
8,KT00395,PB00395-02,AIFI-2021-07-30T16:44:28.930119166Z/B065-P1_PB00395-02.h5,592005f1-79ec-41e2-8336-e3129ef211ab,AIFI-2021-08-02T19:30:36.861837569Z/B065-P1_PB00395-02.arrow,d127a0b0-0582-4289-b91b-9fc97083a20b,BR2005,BR2
9,KT00559,PB00559-02,AIFI-2021-07-30T16:44:41.799313949Z/B065-P1_PB00559-02.h5,f8399812-35b8-47ed-a6f2-181e768c4fd3,AIFI-2021-08-02T19:03:36.434101708Z/B065-P1_PB00559-02.arrow,4b219032-fbeb-4d37-a7b0-4ffe0ac04894,BR2042,BR2
10,KT00593,PB00593-04,AIFI-2021-07-30T16:44:45.65518697Z/B065-P1_PB00593-04.h5,d0bff33a-a145-4373-8495-31b7f3ed47df,AIFI-2021-08-02T18:16:00.681553771Z/B065-P1_PB00593-04.arrow,ad1e4bb7-739f-4ec9-9c93-3fc0cb3f06bc,BR2002,BR2


## Save the file metadata table to CSV

In [12]:
# Write the file metadata table to sample_meta.csv in the working directory,
# without a row-name column.
# NOTE(review): quote = FALSE leaves every field unquoted, so a value that
# contains a comma would break the CSV layout — confirm none can occur.
write.csv(
  x = file_meta,
  file = "sample_meta.csv",
  quote = FALSE,
  row.names = FALSE
)

In [13]:
# Print the R version, platform, and attached/loaded package versions
sessionInfo()

R version 4.3.2 (2023-10-31)
Platform: x86_64-conda-linux-gnu (64-bit)
Running under: Ubuntu 20.04.6 LTS

Matrix products: default
BLAS/LAPACK: /opt/conda/lib/libopenblasp-r0.3.25.so;  LAPACK version 3.11.0

locale:
 [1] LC_CTYPE=C.UTF-8       LC_NUMERIC=C           LC_TIME=C.UTF-8       
 [4] LC_COLLATE=C.UTF-8     LC_MONETARY=C.UTF-8    LC_MESSAGES=C.UTF-8   
 [7] LC_PAPER=C.UTF-8       LC_NAME=C              LC_ADDRESS=C          
[10] LC_TELEPHONE=C         LC_MEASUREMENT=C.UTF-8 LC_IDENTIFICATION=C   

time zone: Etc/UTC
tzcode source: system (glibc)

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
[1] purrr_1.0.2 dplyr_1.1.4 hise_2.16.0

loaded via a namespace (and not attached):
 [1] crayon_1.5.2     vctrs_0.6.5      httr_1.4.7       cli_3.6.2       
 [5] rlang_1.1.3      stringi_1.8.3    generics_0.1.3   assertthat_0.2.1
 [9] jsonlite_1.8.8   glue_1.7.0       RCurl_1.98-1.14  plyr_1.8.9      
[13] htmlt