# Retrieve ATAC Metadata

To begin our analysis, we'll retrieve the .arrow files that contain ATAC data and metadata after our TEA-seq QC and demultiplexing pipeline. We'll then extract the metadata for cells to use for cell filtering and QC plots.

## Load packages

hise: The Human Immune System Explorer R SDK package  
ArchR: .arrow file handling  
purrr: Functional programming tools  


In [1]:
quiet_library <- function(...) { suppressPackageStartupMessages(library(...)) }
quiet_library(hise)
quiet_library(ArchR)
quiet_library(purrr)


                                                   / |
                                                 /    \
            .                                  /      |.
            \\\                              /        |.
              \\\                          /           `|.
                \\\                      /              |.
                  \                    /                |\
                  \\#####\           /                  ||
                ==###########>      /                   ||
                 \\##==......\    /                     ||
            ______ =       =|__ /__                     ||      \\\
       \               '        ##_______ _____ ,--,__,=##,__   ///
        ,    __==    ___,-,__,--'#'  ==='      `-'    | ##,-/
        -,____,---'       \\####\\________________,--\\_##,/
           ___      .______        ______  __    __  .______      
          /   \     |   _  \      /      ||  |  |  | |   _  \     
         /  ^  \    |  |_) 

In [2]:
read_path_uuid <- function(uuid) {
    uuid_path <- paste0("cache/", uuid)
    if(!dir.exists(uuid_path)) {
        cacheFiles(list(uuid))
    }
    list.files(uuid_path, full.names = TRUE)[1]
}

In [3]:
read_csv_uuid <- function(uuid) {
    filename <- read_path_uuid(uuid)
    read.csv(filename)
}

## Get file metadata stored in HISE

In [4]:
meta_uuid <- "5e3115d4-9207-4020-8e3a-3792dd28ea6b"
sample_meta <- read_csv_uuid(meta_uuid)

## Retrieve files

Now, we'll use the HISE SDK package to retrieve the TEA-seq .arrow file outputs based on their file UUIDs. These will be placed in the `cache/` subdirectory by default.

In [5]:
arrow_files <- map_chr(
    sample_meta$atac_file.id,
    read_path_uuid
)

## Assemble metadata

Here, we list each of the files in `cache/` and assemble an ArchR Project to read cell metadata using the ArchR function `getCellColData()`.

In [6]:
addArchRGenome("hg38")

Setting default genome to Hg38.



In [7]:
proj <- ArchRProject(
    ArrowFiles = arrow_files,
    copyArrows = FALSE
)

Using GeneAnnotation set by addArchRGenome(Hg38)!

Using GeneAnnotation set by addArchRGenome(Hg38)!

Validating Arrows...

Getting SampleNames...

1 
2 
3 
4 
5 
6 
7 
8 


Getting Cell Metadata...

1 
2 
3 
4 
5 
6 
7 
8 


Merging Cell Metadata...

Initializing ArchRProject...


                                                   / |
                                                 /    \
            .                                  /      |.
            \\\                              /        |.
              \\\                          /           `|.
                \\\                      /              |.
                  \                    /                |\
                  \\#####\           /                  ||
                ==###########>      /                   ||
                 \\##==......\    /                     ||
            ______ =       =|__ /__                     ||      \\\
       \               '        ##_______ _____ ,--,__,=##,__   ///
  

In [8]:
all_metadata <- getCellColData(proj)
all_metadata <- as.data.frame(all_metadata)

In [9]:
all_metadata$archr_name <- rownames(all_metadata)

In [10]:
head(all_metadata)

Unnamed: 0_level_0,Sample,well_id,TSSEnrichment,tss_frac,tss_count,singlet,ReadsInTSS,ReadsInPromoter,ReadsInBlacklist,PromoterRatio,⋯,DoubletScore,DoubletEnrichment,chip_id,cell_name,BlacklistRatio,batch_id,barcodes,altius_frac,altius_count,archr_name
Unnamed: 0_level_1,<chr>,<chr>,<dbl>,<dbl>,<int>,<lgl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<chr>,<chr>,<dbl>,<chr>,<chr>,<dbl>,<int>,<chr>
B065-P1_PB00395-02#dc9f7b1231b011ef80e742c13d66f8da,B065-P1_PB00395-02,B065-AP1C1W1,23.217,0.6123411,18835,True,31047,28818,545,0.4688141,⋯,0.0,1.5,B065-AP1C1,unfearful_frisky_bighorn,0.008866114,B065,dc9f7b1231b011ef80e742c13d66f8da,0.9265256,28499,B065-P1_PB00395-02#dc9f7b1231b011ef80e742c13d66f8da
B065-P1_PB00395-02#bc84979a31ab11efb3acae799efb27c2,B065-P1_PB00395-02,B065-AP1C1W4,18.701,0.3886059,11064,True,17046,15403,380,0.2705223,⋯,2.048716,1.7666667,B065-AP1C1,expedient_flattering_gemsbok,0.006673926,B065,bc84979a31ab11efb3acae799efb27c2,0.7831478,22297,B065-P1_PB00395-02#bc84979a31ab11efb3acae799efb27c2
B065-P1_PB00395-02#5e619c6431b311efacc2825d2c0fee52,B065-P1_PB00395-02,B065-AP1C2W1,22.989,0.6343741,17189,True,28374,26201,433,0.4836631,⋯,0.0,1.4,B065-AP1C2,pernicious_shy_turtle,0.007993059,B065,5e619c6431b311efacc2825d2c0fee52,0.9335326,25295,B065-P1_PB00395-02#5e619c6431b311efacc2825d2c0fee52
B065-P1_PB00395-02#ffbeb28631ae11efa2195e333e12fe90,B065-P1_PB00395-02,B065-AP1C2W4,23.47,0.6011946,15803,True,26300,24211,411,0.4606179,⋯,0.0,0.4333333,B065-AP1C2,magic_infamous_dugong,0.007819337,B065,ffbeb28631ae11efa2195e333e12fe90,0.9345659,24566,B065-P1_PB00395-02#ffbeb28631ae11efa2195e333e12fe90
B065-P1_PB00395-02#0f242c8831af11ef83c91ae18f80cdcc,B065-P1_PB00395-02,B065-AP1C2W3,17.284,0.5764697,15062,True,22170,21143,359,0.4047282,⋯,0.0,0.7142857,B065-AP1C2,communist_cretaceous_arcticwolf,0.006872129,B065,0f242c8831af11ef83c91ae18f80cdcc,0.9043938,23630,B065-P1_PB00395-02#0f242c8831af11ef83c91ae18f80cdcc
B065-P1_PB00395-02#aa659c3e31b111ef92f7ea48c9c3293e,B065-P1_PB00395-02,B065-AP1C2W2,21.06,0.5645044,14733,True,22983,21731,369,0.4164622,⋯,0.0,0.8,B065-AP1C2,hypodermal_surprising_aphid,0.007071675,B065,aa659c3e31b111ef92f7ea48c9c3293e,0.9124104,23813,B065-P1_PB00395-02#aa659c3e31b111ef92f7ea48c9c3293e


## Write output file

Write the metadata as a .csv for later use. We remove `row.names` and set `quote = FALSE` to simplify the outputs and increase compatibility with other tools.

In [11]:
dir.create("output")

“'output' already exists”


In [12]:
write.csv(
    all_metadata,
    "output/atac_cell_metadata.csv",
    row.names = FALSE,
    quote = FALSE
)

## Store results in HISE

Finally, we store the output file in our Collaboration Space for later retrieval and use. We need to provide the UUID for our Collaboration Space (aka `studySpaceId`), as well as a title for this step in our analysis process.

The hise function `uploadFiles()` also requires the FileIDs from the original fileset for reference, which we assembled above when files were retrieved (`input_file_uuids`)

In [13]:
study_space_uuid <- "00a53fa5-18da-4333-84cb-3cc0b0761201"
title <- "TEA-seq demo unfiltered A cell metadata"

In [15]:
search_id <- ids::adjective_animal()
search_id

In [17]:
in_list <- as.list(sample_meta$atac_file.id)

In [18]:
in_list

In [19]:
out_list <- list("output/atac_cell_metadata.csv")

In [20]:
out_list

In [21]:
uploadFiles(
    files = out_list,
    studySpaceId = study_space_uuid,
    title = title,
    inputFileIds = in_list,
    destination = search_id
)

[1] "Cannot determine the current notebook."
[1] "1) /home/jupyter/certpro-workflow-demos/adult_vs_pediatric_teaseq/02-R_get_arrow_metadata.ipynb"
[1] "2) /home/jupyter/certpro-workflow-demos/adult_vs_pediatric_teaseq/01-R_get_h5_metadata.ipynb"
[1] "3) /home/jupyter/certpro-workflow-demos/adult_vs_pediatric_teaseq/00-R_select_samples.ipynb"


Please select (1-3)  1


You are trying to upload the following files:  output/atac_cell_metadata.csv



(y/n) y


In [22]:
sessionInfo()

R version 4.3.2 (2023-10-31)
Platform: x86_64-conda-linux-gnu (64-bit)
Running under: Ubuntu 20.04.6 LTS

Matrix products: default
BLAS/LAPACK: /opt/conda/lib/libopenblasp-r0.3.25.so;  LAPACK version 3.11.0

locale:
 [1] LC_CTYPE=C.UTF-8       LC_NUMERIC=C           LC_TIME=C.UTF-8       
 [4] LC_COLLATE=C.UTF-8     LC_MONETARY=C.UTF-8    LC_MESSAGES=C.UTF-8   
 [7] LC_PAPER=C.UTF-8       LC_NAME=C              LC_ADDRESS=C          
[10] LC_TELEPHONE=C         LC_MEASUREMENT=C.UTF-8 LC_IDENTIFICATION=C   

time zone: Etc/UTC
tzcode source: system (glibc)

attached base packages:
[1] stats4    grid      stats     graphics  grDevices utils     datasets 
[8] methods   base     

other attached packages:
 [1] BSgenome.Hsapiens.UCSC.hg38_1.4.5 BSgenome_1.70.1                  
 [3] rtracklayer_1.62.0                BiocIO_1.12.0                    
 [5] Biostrings_2.70.1                 XVector_0.42.0                   
 [7] purrr_1.0.2                       rhdf5_2.46.1                   