In [30]:
library(rhdf5)
library(Matrix)
library(SingleCellExperiment)

In [31]:
# Directory holding the input .h5ad file and receiving the .Rds outputs.
# The trailing slash matters: paths below are built with paste0().
mydir <- "/hps/nobackup/stegle/users/acuomo/all_scripts/covid/"

In [32]:
# Full path of the AnnData (.h5ad) file that every later cell reads from.
filename <- paste0(
  mydir,
  "full_counts_Kropski_Meyer_Misharin_Nawijn.h5ad"
)

In [33]:
# Show the layout of the HDF5 file as a two-column matrix (group, name).
# The listing is taken once and reused instead of scanning the file twice.
h5_listing <- h5ls(filename)
cbind(h5_listing$group, h5_listing$name)

0,1
/,X
/X,data
/X,indices
/X,indptr
/,obs
/obs,__categories
/obs/__categories,age
/obs/__categories,anatomical_region
/obs/__categories,ann_level_1
/obs/__categories,ann_level_2


In [34]:
# Per-observation index; used further down as the column names of the matrix.
ind <- h5read(file = filename, name = "obs/_index")
# Contents of the "var" group (gene names); its first element is used further
# down as the row names of the matrix. Note that `var` masks stats::var() here.
var <- h5read(file = filename, name = "var")

In [35]:
# Read the "X" group and keep its three components (values, pointer vector,
# indices) as separate integer vectors.
X <- h5read(file = filename, name = "X")
X_indices <- as.integer(X[["indices"]])
X_indptr <- as.integer(X[["indptr"]])
X_data <- as.integer(X[["data"]])
# Drop the list copy so the arrays are not held in memory twice.
rm(X)

In [36]:
# Assemble the sparse count matrix from the three "X" components.
# Rows are named from var[[1]] and columns from `ind`, so the dimensions are
# taken from those vectors rather than hard-coded; the script then works for
# files of any size.
n_genes <- length(var[[1]])
n_cells <- length(ind)

# The pointer vector needs exactly one entry per column plus a final end
# marker; fail early with a clear message if the file disagrees.
stopifnot(length(X_indptr) == n_cells + 1L)

sMat <- sparseMatrix(
  i = X_indices,
  p = X_indptr,
  x = X_data,
  dims = c(n_genes, n_cells),
  index1 = FALSE # indices in the file are zero-based
)
rownames(sMat) <- as.character(var[[1]])
colnames(sMat) <- as.character(ind)

In [37]:
# Destination for the serialized sparse count matrix.
R_filename <- paste0(
  mydir,
  "full_counts_sMat.Rds"
)

In [39]:
# saveRDS(sMat, file = R_filename)

In [40]:
# now the column data information
colinfo_all = apply(cbind(h5ls(filename)$group,
    h5ls(filename)$name)
    ,1,paste0, collapse = "/")

In [41]:
# Keep the paths that mention "obs", then discard the category lookup tables
# and the root-level entries (whose joined path contains "//").
obs_paths <- grep("obs", colinfo_all, value = TRUE)
colinfo <- grep("categories|//", obs_paths, invert = TRUE, value = TRUE)

In [42]:
# Read every selected obs entry into a named list, one element per column.
# Columns that have a matching entry under obs/__categories are stored as
# zero-based integer codes and are translated back to their labels here.

# The file listing does not change while looping, so fetch it once instead of
# re-scanning the file on every iteration.
h5_names <- h5ls(filename)$name

outList <- list()
for (colname in colinfo) {
  print(colname)

  # Last path component, with "_index" renamed to "index".
  colname_clean <- gsub("_index", "index", rev(unlist(strsplit(colname, "/")))[1])

  out <- h5read(file = filename, colname)
  # A length-one result is unwrapped to its single element.
  # NOTE(review): "/obs/last_author" looks like a group that contains "PI"
  # rather than a plain dataset; confirm the column produced for it is
  # meaningful.
  if (length(out) == 1) {
    out <- out[[1]]
  }

  # The same name occurring more than once in the file means a category table
  # exists for this column.
  if (sum(h5_names %in% colname_clean) > 1) {
    # then need to match category names
    colname_categ <- paste0("obs/__categories/", colname_clean)

    # "PI" sits one level deeper, under last_author.
    if (colname_clean == "PI") {
      colname_categ <- paste0("obs/__categories/last_author/", colname_clean)
    }

    out_categ <- h5read(file = filename, colname_categ)

    codes <- as.integer(out)
    # A negative code would become index 0 below, which R drops silently and
    # which would shorten (and misalign) the column. Map such codes to NA
    # instead. Presumably negative codes mark missing values - TODO confirm.
    codes[which(codes < 0L)] <- NA_integer_

    out <- out_categ[codes + 1L] # zero indexing
  }

  print(length(out))

  outList[[colname_clean]] <- out
}

[1] "/obs/_index"
[1] 141243
[1] "/obs/age"
[1] 141243
[1] "/obs/anatomical_region"
[1] 141243
[1] "/obs/ann_level_1"
[1] 141243
[1] "/obs/ann_level_2"
[1] 141243
[1] "/obs/ann_level_3"
[1] 141243
[1] "/obs/ann_level_4"
[1] 141243
[1] "/obs/ann_level_5"
[1] 141243
[1] "/obs/dataset"
[1] 141243
[1] "/obs/donor"
[1] 141243
[1] "/obs/last_author"
[1] 141243
[1] "/obs/last_author/PI"
[1] 141243
[1] "/obs/lung_vs_nasal"
[1] 141243
[1] "/obs/sample"
[1] 141243
[1] "/obs/sample_last_author_name"
[1] 141243
[1] "/obs/sex"
[1] 141243
[1] "/obs/smoking"
[1] 141243
[1] "/obs/total_counts"
[1] 141243


In [43]:
# Bind the per-column vectors side by side into one matrix (one row per
# observation). cbind() forces all columns to a common type.
cData_raw <- do.call(what = cbind, args = outList)
# Label the rows with the observation index column.
rownames(cData_raw) <- cData_raw[, "index"]

In [44]:
# Convert the column-data matrix to a data frame and save it next to the
# input file.
cData_R_filename <- paste0(mydir, "full_counts_cData.Rds")
cData <- as.data.frame(cData_raw)
saveRDS(object = cData, file = cData_R_filename)

In [45]:
# build the sce object
# Counts go in as the only assay; the obs table becomes the colData.
sce <- SingleCellExperiment(assays = list(counts = sMat), colData = cData)
# Print the object summary.
sce

class: SingleCellExperiment 
dim: 45065 141243 
metadata(0):
assays(1): counts
rownames(45065): RP11-34P13.3 FAM138A ... C11orf71.1 LINC01481.1
rowData names(0):
colnames(141243): TCTATTGCACGGTTTA-1-HCATisStab7659969_368C_Madissoon
  CATATGGTCGTCTGAA-1-HCATisStab7646034_367C_Madissoon ...
  CTCGTCAAGTAAGTAC_Donor_02_Reyfman
  TTAGGACCAGCGTTCG-1-HCATisStab7646035_367C_Madissoon
colData names(18): index age ... smoking total_counts
reducedDimNames(0):
spikeNames(0):
altExpNames(0):

In [46]:
# Save the assembled SingleCellExperiment object.
sce_R_filename <- paste0(mydir, "full_counts_sce.Rds")
saveRDS(object = sce, file = sce_R_filename)