In [1]:
# Project root; the relative paths used further down are resolved against it
# because the working directory is switched here.
project_dir <- "~/ben_lustre/current_projects/kidney_glomTI_response"
setwd(project_dir)

# Shared single-cell helper functions, loaded quietly.
# NOTE(review): this sources whatever is currently on the `main` branch, so the
# notebook is not pinned to a fixed version of the helpers - consider pinning
# to a commit.
suppressPackageStartupMessages(source("https://raw.githubusercontent.com/bjstewart1/helpful_singlecell/main/functions.R"))

# Handles to the Python modules used below.
# (`import()` is presumably reticulate's, attached by the sourced helpers - confirm.)
ad <- import("anndata")
sc <- import("scanpy")
scv <- import("scvelo")
sp <- import("scipy")

“[1m[22mThe `size` argument of `element_rect()` is deprecated as of ggplot2 3.4.0.
[36mℹ[39m Please use the `linewidth` argument instead.”


In [2]:
# Print the Python interpreter, libpython and numpy that this R session is bound to.
py_config()

python:         /home/jovyan/my-conda-envs/myenv/bin/python
libpython:      /home/jovyan/my-conda-envs/myenv/lib/libpython3.8.so
pythonhome:     /home/jovyan/my-conda-envs/myenv:/home/jovyan/my-conda-envs/myenv
version:        3.8.8 | packaged by conda-forge | (default, Feb 20 2021, 16:22:27)  [GCC 9.3.0]
numpy:          /home/jovyan/my-conda-envs/myenv/lib/python3.8/site-packages/numpy
numpy_version:  1.20.3

NOTE: Python version was forced by RETICULATE_PYTHON

In [3]:
tenx_dir <- "./data/seq_data/10X_3p"
sequencing_inventory <- read.csv("/nfs/team297/bs16/sample_donor_data/sequencing_inventory.csv", check.names = FALSE)

# Keep only 10X 3' v3 channels that are flagged cellranger_SOC == TRUE and are
# not flagged as poor QC. `%in%` is used (rather than `==`) so that missing
# values simply count as "no match".
keep_rows <- sequencing_inventory$experiment %in% c("10X_3'_V3") &
    sequencing_inventory$cellranger_SOC %in% 'TRUE' &
    sequencing_inventory$poor_QC %in% "FALSE"
sequencing_inventory <- sequencing_inventory[keep_rows, ]

# Drop fetal tissue - these are one per channel and are analysed elsewhere.
fetal_tissues <- grep("fetal", sequencing_inventory$tissue, value = TRUE)
sequencing_inventory <- sequencing_inventory[!sequencing_inventory$tissue %in% fetal_tissues, ]

# Drop the perfusion samples, identified by their dissociation/processing batch.
perfusion_batches <- c("Batch_9", "Batch_12", "Batch_14")
in_perfusion_batch <- sequencing_inventory$tissue_dissociation_processing_batch %in% perfusion_batches
sequencing_inventory <- sequencing_inventory[!in_perfusion_batch, ]

# Strip stray spaces from the iRODS IDs.
sequencing_inventory$irods_ID <- gsub(" ", "", sequencing_inventory$irods_ID)

# One row per channel: the channel name (sanger_ID) and the name its files are
# stored under (irods_ID).
fn_in <- data.frame(channel = sequencing_inventory$sanger_ID,
                    filename = sequencing_inventory$irods_ID)

# Per-cell genotype assignments, used when loading each channel.
cell_assignments <- readRDS(file.path(project_dir, "data/genotypes/cell_assignments_cells.RDS"))

In [4]:
#now read it all in
dir <- "/lustre/scratch126/cellgen/team297/bs16/current_projects/kidney_glomTI_response/data/seq_data/10X_3p"

# Lookup tables that do not change between samples are built once here rather
# than on every iteration of the loop below.
# Per-channel sequencing metadata, keyed by sanger_ID.
channel_metadata <- sequencing_inventory
rownames(channel_metadata) <- channel_metadata$sanger_ID
# Per-donor demographics, keyed by the first column of the CSV.
donor_demographics <- read.csv("~/ben_nfs/sample_donor_data/donor_demographics.csv", row.names = 1)

# Build one AnnData per row of `fn_in`. For each sample this:
#   1. reads the scAR-denoised h5ad,
#   2. attaches genotype assignments, channel metadata and donor demographics
#      to the per-cell table,
#   3. merges in the velocyto loom for the same sample.
# seq_len() is used so that an empty inventory yields an empty list instead of
# iterating over c(1, 0).
adata_list <- lapply(seq_len(nrow(fn_in)), function(x){
    message("***====================***")
    message(paste0(x/nrow(fn_in)*100, "%"))
    start_time <- Sys.time()
    f <- fn_in[x, "filename"]
    ch <- fn_in[x, "channel"]
    message("sample ", ch)

    #read in adata
    message("reading in adata")
    adata <- sc$read_h5ad(file.path(dir, f, "scAR_denoised.h5ad"))
    adata$var_names_make_unique()
    adata$obs$barcode <- adata$obs_names$values
    adata$obs$channel <- ch #add the channel name
    #make the denoised counts a sparse layer.
    adata$layers = list("denoised_counts" = sp$sparse$csr_matrix(adata$layers['denoised_counts']),
                        "raw_counts" = adata$layers['raw_counts']
                       ) #this will help with memory probably

    #add genotyping assignments
    message("adding genotyping assignments")
    assignments <- cell_assignments[cell_assignments$channel %in% ch, c(2,3, 5,6)] #these are ordered as per the barcode file
    # cbind() pairs rows purely by position and can silently recycle a shorter
    # table, so fail loudly if the number of cells does not match.
    # NOTE(review): this only checks the count, not that the barcodes agree.
    if (nrow(assignments) != adata$n_obs) {
        stop("channel ", ch, ": ", nrow(assignments), " genotype assignments for ",
             adata$n_obs, " cells")
    }
    adata$obs <- cbind(adata$obs, assignments)

    #sort out the metadata
    message("adding metadata")
    # as.character() guards against the column coming back as a factor, which
    # would index rows by integer code rather than by name.
    adata$obs <- cbind(adata$obs, channel_metadata[as.character(adata$obs$channel), ])

    #add in some donor demographics ####
    message("adding demographics")
    cell_demographics <- donor_demographics[as.character(adata$obs$genotype), ]
    adata$obs <- cbind(adata$obs, cell_demographics) #bind this to adata$obs - this will result in NAs but we will remove the doublets in due course

    #add the velocyto data
    message("adding velocyto")
    ldata <- scv$read(file.path(dir, f, 'velocyto', paste0(f, '.loom')))
    #fix up the loom names: drop the sample prefix and ":" (matched literally,
    #not as regular expressions) and turn the trailing "x" into "-1"
    loom_names <- gsub(f, "", ldata$obs_names$values, fixed = TRUE)
    loom_names <- gsub(":", "", loom_names, fixed = TRUE)
    loom_names <- sub("x$", "-1", loom_names)
    ldata$obs_names = paste0(loom_names, "_",  f)
    #merge
    adata = scv$utils$merge(adata, ldata)
    rm(ldata)
    gc() #clean

    #return the adata
    message("sample ", ch, " done")
    message("====================")
    #time - force seconds; a bare difftime switches to minutes above 60 s,
    #which made the "seconds" label wrong for slow samples
    end_time <- Sys.time()
    elapsed <- difftime(end_time, start_time, units = "secs")
    message(paste(as.numeric(elapsed), 'seconds'))
    message("***====================***")

    return(adata)
})
# The per-sample objects are concatenated in the following cell, so that
# (memory-hungry) step is not repeated here.



2.08333333333333%

sample CG_SB_NB8768284

reading in adata

adding genotyping assignments

adding metadata

adding demographics

adding velocyto

sample CG_SB_NB8768284 done


20.0863261222839 seconds



4.16666666666667%

sample WSSS_A_KID9369817

reading in adata

adding genotyping assignments

adding metadata

adding demographics

adding velocyto

sample WSSS_A_KID9369817 done


58.5618648529053 seconds



6.25%

sample WSSS_A_KID9369818

reading in adata

adding genotyping assignments

adding metadata

adding demographics

adding velocyto

sample WSSS_A_KID9369818 done


53.1012592315674 seconds



8.33333333333333%

sample WSSS_A_KID9369819

reading in adata

adding genotyping assignments

adding metadata

adding demographics

adding velocyto

sample WSSS_A_KID9369819 done


59.6246421337128 seconds



10.4166666666667%

sample WSSS_A_KID9369820

reading in adata

adding genotyping assignments

adding metadata

adding demographics

adding velocyto

sample WSSS_A_KID9369820 done


In [None]:
# Keep the per-gene table of the first sample so gene annotations can be
# restored afterwards.
# (presumably needed because concat does not carry the per-gene columns over
# by default - confirm against the anndata docs)
vr <- adata_list[[1]]$var

# Stack all per-sample objects into a single AnnData.
adata <- sc$concat(adata_list)

In [None]:
#sort out symbol etc..
# `vr` is the per-gene table of the first sample only, so look its rows up by
# gene name instead of assuming it has the same genes in the same order as the
# concatenated object. When the two already agree the result is unchanged.
gene_names <- as.character(adata$var_names$values)
stopifnot(all(gene_names %in% rownames(vr)))
adata$var$ID <- vr[gene_names, "gene_ids"]
adata$var$Symbol <- gene_names

In [None]:
# Write the combined, pre-QC object. The path is relative to the working
# directory; create the target folder first so the write does not fail on a
# fresh checkout.
out_file <- "data/raw/10X_3p/healthy_raw_pre_QC.h5ad"
dir.create(dirname(out_file), recursive = TRUE, showWarnings = FALSE)
adata$write_h5ad(out_file)

In [None]:
# Record the R version, platform and attached packages used for this run.
sessionInfo()

R version 4.0.4 (2021-02-15)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Ubuntu 20.04.1 LTS

Matrix products: default
BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.9.0
LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.9.0

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
 [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
 [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       

attached base packages:
[1] parallel  stats4    stats     graphics  grDevices utils     datasets 
[8] methods   base     

other attached packages:
 [1] paletteer_1.4.1             pbapply_1.7-0              
 [3] SoupX_1.5.2                 igraph_1.2.6               
 [5] DropletUtils_1.10.3         pheatmap_1.0.12            
 [7] cowplot_1.1.1               scater_1.18.6              
