In [1]:
# this script processes the full data matrix from h5ad to SingleCellExperiment R object
# Note there are aspects that are hard-coded, so be careful generalising
# last updated 27 March 2020
# from Shila

library(rhdf5)
library(Matrix)
library(SingleCellExperiment)

Loading required package: SummarizedExperiment
Loading required package: GenomicRanges
Loading required package: stats4
Loading required package: BiocGenerics
Loading required package: parallel

Attaching package: ‘BiocGenerics’

The following objects are masked from ‘package:parallel’:

    clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
    clusterExport, clusterMap, parApply, parCapply, parLapply,
    parLapplyLB, parRapply, parSapply, parSapplyLB

The following object is masked from ‘package:Matrix’:

    which

The following objects are masked from ‘package:stats’:

    IQR, mad, sd, var, xtabs

The following objects are masked from ‘package:base’:

    anyDuplicated, append, as.data.frame, basename, cbind, colnames,
    dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
    grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
    order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
    rbind, Reduce, rownames, sapply, setdiff, sort, 

In [2]:
# Base directory used to build the input and output file paths below.
mydir <- "/hps/nobackup/stegle/users/acuomo/all_scripts/covid/"

In [3]:
# Full path of the h5ad file to convert (mydir already ends with "/").
filename <- paste0(
  mydir,
  "full_counts_Krasnow_Kropski_Lafyatis_Meyer_Misharin_Nawijn_Regev-2.h5ad"
)

In [4]:
# Display the group/name layout of the h5ad file as a two-column matrix.
# The file is listed once and the result reused for both columns, instead of
# scanning the file twice. local() keeps the temporary out of the workspace
# while still returning the matrix visibly so the notebook prints it.
local({
  h5_listing <- h5ls(filename)
  cbind(h5_listing$group, h5_listing$name)
})

0,1
/,X
/X,data
/X,indices
/X,indptr
/,obs
/obs,__categories
/obs/__categories,age
/obs/__categories,anatomical_region
/obs/__categories,ann_level_1
/obs/__categories,ann_level_2


In [5]:
# Read the obs index (one entry per column of the final matrix) and the
# whole "var" group from the h5ad file.
ind <- h5read(file = filename, name = "obs/_index")
var <- h5read(file = filename, name = "var") # gene names

In [6]:
# Load the "X" group and keep its three sparse-matrix components as integer
# vectors; the group itself is dropped afterwards to free memory.
X <- h5read(file = filename, name = "X")
X_data <- as.integer(X[["data"]])
X_indptr <- as.integer(X[["indptr"]])
X_indices <- as.integer(X[["indices"]])
rm(X)

In [None]:
# Assemble the sparse count matrix.
# X_indptr is used as the column pointer and X_indices as the 0-based row
# index, so rows correspond to var[[1]] and columns to ind.
# NOTE: the position of the gene names inside `var` (first element) is
# hard-coded; the dimensions themselves are derived from the data.
sMat <- sparseMatrix(
  i = X_indices,
  p = X_indptr,
  x = X_data,
  dims = c(length(var[[1]]), length(ind)),
  index1 = FALSE
)
dimnames(sMat) <- list(as.character(var[[1]]), as.character(ind))

In [None]:
# Output path for the sparse count matrix.
R_filename <- paste0(mydir, "full_counts_sMat_v2.Rds")

In [None]:
# Write the sparse count matrix to disk.
saveRDS(object = sMat, file = R_filename)

In [None]:
# now the column data information
# Build the "<group>/<name>" path of every entry in the h5ad file.
# The file is listed once (instead of twice) and the two columns are joined
# with a single vectorised paste() rather than a row-wise apply().
colinfo_all <- with(h5ls(filename), paste(group, name, sep = "/"))

In [None]:
# Keep only the datasets that sit under the "/obs" group.
# The match is anchored to "^/obs/" so that other paths that merely contain
# "obs" (for example "/obsm/..." or "/obsp/...") are not picked up as cell
# annotations. Category lookup tables ("categories") and top-level entries
# (paths containing "//") are excluded as before.
is_obs_entry <- grepl("^/obs/", colinfo_all)
is_excluded <- grepl("categories|//", colinfo_all)
colinfo <- colinfo_all[is_obs_entry & !is_excluded]

In [None]:
# Read every annotation listed in `colinfo` from the h5ad file into `outList`,
# keyed by the last path component (with "_index" rewritten to "index").
# Annotations whose name occurs more than once in the file listing are treated
# as categorical: their stored values are 0-based codes that are translated
# into labels via the matching "obs/__categories/..." table.

# The file listing does not change while looping, so read it once.
h5_entry_names <- h5ls(filename)$name

outList <- list()
for (colname in colinfo) {
  print(colname)
  colname_clean <- gsub("_index", "index", rev(unlist(strsplit(colname, "/")))[1])
  out <- h5read(file = filename, colname)
  if (length(out) == 1) {
    out <- out[[1]]
  }

  if (sum(h5_entry_names %in% colname_clean) > 1) {
    # then need to match category names
    colname_categ <- paste0("obs/__categories/", colname_clean)

    # hard-coded special case: the "PI" categories live under last_author
    if (colname_clean == "PI") {
      colname_categ <- paste0("obs/__categories/last_author/", colname_clean)
    }

    out_categ <- h5read(file = filename, colname_categ)

    # Codes are zero-indexed. Negative codes (pandas writes -1 for a missing
    # category) must become NA explicitly: -1 + 1 would give index 0, which R
    # silently drops, shortening the column and misaligning every later cell.
    codes <- as.integer(out)
    codes[!is.na(codes) & codes < 0L] <- NA_integer_
    out <- out_categ[codes + 1L]
  }

  print(length(out))

  outList[[colname_clean]] <- out
}

In [None]:
# Combine the per-cell annotations into one table, one column per annotation.
# A data.frame is built directly rather than cbind()-ing into a matrix:
# cbind() would coerce every column to a single type (character as soon as one
# annotation is text), losing numeric columns, and would silently recycle
# columns of unequal length. data.frame() keeps each column's own type and
# stops with an error if the lengths disagree.
# as.vector() drops the 1-d array attributes of the values collected above;
# check.names = FALSE keeps the annotation names exactly as collected.
cData_raw <- data.frame(
  lapply(outList, as.vector),
  stringsAsFactors = FALSE,
  check.names = FALSE
)
rownames(cData_raw) <- cData_raw[["index"]]

In [None]:
# Convert the annotation table to a data.frame and write it to disk.
cData_R_filename <- paste0(mydir, "full_counts_cData_v2.Rds")
cData <- as.data.frame(cData_raw)
saveRDS(object = cData, file = cData_R_filename)

In [None]:
# Assemble the SingleCellExperiment from the count matrix (as the "counts"
# assay) and the per-cell annotation table, then display its summary.
sce <- SingleCellExperiment(assays = list(counts = sMat), colData = cData)
sce

In [None]:
# Write the assembled SingleCellExperiment object to disk.
sce_R_filename <- paste0(mydir, "full_counts_sce_v2.Rds")
saveRDS(object = sce, file = sce_R_filename)