In [1]:
# This script converts the partial data matrix stored in an h5ad file into a SingleCellExperiment R object.
# Note: several aspects are hard-coded, so take care when generalising it to other files.
# last updated 25 March 2020
# from Shila

In [5]:
library(rhdf5)
library(Matrix)
library(SingleCellExperiment)

In [3]:
# Working directory: the input h5ad file is read from here and the .Rds
# outputs are written here (trailing slash required, paths are built with paste0)
mydir <- "/hps/nobackup/stegle/users/acuomo/all_scripts/covid/"

In [4]:
# Full path of the input h5ad file
filename <- paste0(
  mydir,
  "cov19_w_regev_tsankov_whitsett_kaminski_krasnow_misharin_new_schultze_schultze_falk_rawlins_xavier_meyer_new_spence_barbry_seibold_shalek_nawijnNasal.h5ad"
)

In [6]:
# Show the contents of the file as an unnamed two-column matrix:
# first column is the group of each object, second column is its name.
# local() keeps the temporary listing out of the global environment.
local({
  h5_contents <- h5ls(filename)
  cbind(h5_contents$group, h5_contents$name)
})

0,1
/,X
/X,data
/X,indices
/X,indptr
/,obs
/obs,__categories
/obs/__categories,anatomical_region
/obs/__categories,ann_level_1
/obs/__categories,ann_level_2
/obs/__categories,ann_level_3


In [7]:
# Read the obs index (used later as the cell/column names) and the "var" entry
ind <- h5read(file = filename, name = "obs/_index")
var <- h5read(file = filename, name = "var") # gene names

In [8]:
# Read the "X" group and keep its three components (values, column pointers,
# row indices) as integer vectors; the full list is removed afterwards.
X <- h5read(file = filename, name = "X")
X_data <- as.integer(X[["data"]])
X_indptr <- as.integer(X[["indptr"]])
X_indices <- as.integer(X[["indices"]])
rm(X)

In [9]:
# Assemble the sparse count matrix from the compressed components read from X.
# X_indices (row indices) and X_indptr (column pointers) are zero-based, hence
# index1 = FALSE.
# The number of rows is derived from the gene annotation rather than being
# hard-coded (it used to be fixed at 3), so the same code works for any number
# of genes. Columns correspond to the entries of `ind`.
gene_names <- as.character(var[[1]])
cell_names <- as.character(ind)
sMat <- sparseMatrix(
  i = X_indices,
  p = X_indptr,
  x = X_data,
  dims = c(length(gene_names), length(cell_names)),
  index1 = FALSE
)
rownames(sMat) <- gene_names
colnames(sMat) <- cell_names

In [11]:
# Output path for the sparse count matrix
sMat_R_filename <- paste0(mydir, "partial_counts_sMat.Rds")

In [12]:
# Write the sparse count matrix to disk
saveRDS(object = sMat, file = sMat_R_filename)

In [13]:
# now the column data information
# Full path of every object in the file: its group and its name joined by "/"
# (objects directly under the root group therefore start with "//").
colinfo_all <- with(h5ls(filename), paste(group, name, sep = "/"))

In [14]:
# Keep only the per-cell annotation columns stored directly under /obs.
# The match is anchored to the "/obs/" prefix: an unanchored "obs" pattern
# would also pick up other paths that merely contain "obs" (e.g. /obsm or
# /obsp groups), which are not annotation columns.
# Paths containing "categories" (the category label tables) or "//" are
# excluded, as before.
is_obs_column <- startsWith(colinfo_all, "/obs/") &
  !grepl("categories|//", colinfo_all)
colinfo <- colinfo_all[is_obs_column]

In [15]:
# Read every annotation column listed in `colinfo` into the named list
# `outList` (one element per column, named after the column).
# Columns whose name occurs more than once in the file listing are treated as
# categorical: their values are zero-based integer codes that are translated
# into the labels stored under obs/__categories/<column>.

# The file listing does not change inside the loop, so read it once here
# instead of once per column.
h5_names <- h5ls(filename)$name

outList <- list()
for (colname in colinfo) {
  print(colname)

  # Last component of the path, with "_index" renamed to "index"
  colname_clean <- gsub("_index", "index", rev(unlist(strsplit(colname, "/")))[1])

  out <- h5read(file = filename, colname)
  if (length(out) == 1) {
    # unwrap a length-one result to its single element
    out <- out[[1]]
  }

  if (sum(h5_names %in% colname_clean) > 1) {
    # then need to match category names
    colname_categ <- paste0("obs/__categories/", colname_clean)

    if (colname_clean == "PI") {
      # hard-coded special case: labels for "PI" are nested under last_author
      colname_categ <- paste0("obs/__categories/last_author/", colname_clean)
    }

    out_categ <- h5read(file = filename, colname_categ)

    # Codes are zero-based, hence the + 1 below. A negative code would become
    # index 0 and R silently drops zero indices, which would shorten the
    # column and misalign it with the cells; map negative codes to NA instead
    # so the column keeps its length.
    # NOTE(review): negative codes are presumably how missing categorical
    # values are stored (-1) - confirm against the file format.
    codes <- as.integer(out)
    codes[which(codes < 0L)] <- NA_integer_
    out_proper <- out_categ[codes + 1L]

    out <- out_proper
  }

  print(length(out))

  outList[[colname_clean]] <- out
}

[1] "/obs/_index"
[1] 1227421
[1] "/obs/age"
[1] 1227421
[1] "/obs/anatomical_region"
[1] 1227421
[1] "/obs/ann_level_1"
[1] 1227421
[1] "/obs/ann_level_2"
[1] 1227421
[1] "/obs/ann_level_3"
[1] 1227421
[1] "/obs/ann_level_4"
[1] 1227421
[1] "/obs/ann_level_5"
[1] 1227421
[1] "/obs/dataset"
[1] 1227421
[1] "/obs/donor"
[1] 1227421
[1] "/obs/last_author"
[1] 1227421
[1] "/obs/lung_vs_nasal"
[1] 1227421
[1] "/obs/pack_years"
[1] 1227421
[1] "/obs/sample"
[1] 1227421
[1] "/obs/sample_last_author_name"
[1] 1227421
[1] "/obs/sex"
[1] 1227421
[1] "/obs/smoking"
[1] 1227421
[1] "/obs/smoking_status"
[1] 1227421
[1] "/obs/total_counts"
[1] 1227421


In [16]:
# Bind all annotation columns side by side into a single matrix and label its
# rows with the values of the "index" column.
cData_raw <- do.call(what = "cbind", args = outList)
rownames(cData_raw) <- cData_raw[, "index", drop = TRUE]

In [17]:
# Build the annotation data frame and save it.
# The data frame is assembled from the list of columns rather than from the
# cbind()-ed matrix: a matrix holds a single type, so as soon as one column
# contains strings every column (including numeric ones) is turned into text.
# Building from the list keeps each column's own type.
# Row names are taken from cData_raw; they must be unique, as required for
# data frame row names.
column_values <- lapply(outList, as.vector) # drop any array dimensions
cData <- data.frame(column_values, check.names = FALSE)
rownames(cData) <- rownames(cData_raw)
cData_R_filename <- paste0(mydir, "partial_counts_cData.Rds")
saveRDS(cData, file = cData_R_filename)

In [18]:
# build the sce object
# The sparse matrix becomes the "counts" assay and the annotation data frame
# becomes the column data; the object is then printed as a summary.
sce <- SingleCellExperiment(
  assays = list(counts = sMat),
  colData = cData
)
sce

class: SingleCellExperiment 
dim: 3 1227421 
metadata(0):
assays(1): counts
rownames(3): ACE2 TMPRSS2 CTSL
rowData names(0):
colnames(1227421):
  TCTATTGCACGGTTTA-1-HCATisStab7659969_368C_Madissoon____
  CATATGGTCGTCTGAA-1-HCATisStab7646034_367C_Madissoon____ ...
  4951STDY7487593GAATAAGCAGTGGAGT_ARMS054_nasal_nawijn_nasal
  4951STDY7487593GCTCTGTAGACTAGAT_ARMS054_nasal_nawijn_nasal
colData names(19): index age ... smoking_status total_counts
reducedDimNames(0):
spikeNames(0):
altExpNames(0):

In [19]:
# Output path for the SingleCellExperiment object, then write it to disk
sce_R_filename <- paste0(mydir, "partial_counts_sce.Rds")
saveRDS(object = sce, file = sce_R_filename)