In [1]:
import pandas as pd
import numpy as np
import os
import h5py
import anndata

from collections import defaultdict
from joblib import Parallel, delayed

**Step 1. Load the genome annotation file (provides the gene lengths needed to compute RPKM)**

In [2]:
# Parse the GENCODE GTF and keep one record per gene.
# GTF columns are unnamed here: 2 = feature type, 3 = start, 4 = end,
# 8 = attribute string (gene_id "..."; gene_name "..."; ...).
gtf = pd.read_csv("data/raw/annotation/gencode.v32.annotation.gtf",
                  sep="\t",
                  comment="#",
                  header=None)
gene_rows = gtf.loc[gtf[2] == "gene"]
gene_attrs = gene_rows[8]

genes = pd.DataFrame(
    {
        # Ensembl ID without the ".version" suffix.
        'gene_ensembl': gene_attrs.str.extract(r'gene_id "([^.]+)\.')[0].values,
        'gene_symbol': gene_attrs.str.extract(r'gene_name "([^"]+)"')[0].values,
        # GTF coordinates are 1-based and inclusive on both ends, so the
        # length is end - start + 1 (a single-base feature has length 1, not 0).
        'gene_length': (gene_rows[4] - gene_rows[3] + 1).values
    }
)

# Keep exactly one row per Ensembl ID so that label-based lookups such as
# `anno.loc[ids]` return one row per requested ID.
genes = genes.drop_duplicates(subset="gene_ensembl")
genes.index = genes["gene_ensembl"]

# Gene lengths, indexed by Ensembl ID (used for the RPKM normalisation).
anno = genes[["gene_ensembl", "gene_length"]]

# Ensembl ID -> gene symbol table. The save cells at the end of the notebook
# look symbols up via `ensembl_to_symbol.loc[ids, "gene_symbol"]`, but no other
# cell defines it, so it is derived here from the same annotation rows.
ensembl_to_symbol = genes[["gene_symbol"]]

This cell shows sample IDs for scRNA-seq data. Download the datasets with the corresponding IDs from the HuBMAP data portal: https://portal.hubmapconsortium.org/. Alternatively, you may perform a batch download using the command-line tool as described in https://software.docs.hubmapconsortium.org/clt/index.html. This script assumes you have downloaded the data into the folder **data/raw/rna/**, with one subfolder per tissue and, inside each tissue folder, one subfolder per sample named by its sample ID. For example, for a Liver sample with the ID "0eb5e457b4855ce28531bc97147196b6", its expression matrix (secondary_analysis.h5ad) and velocity matrix (scvelo_annotated.h5ad) are expected inside data/raw/rna/Liver/0eb5e457b4855ce28531bc97147196b6/.

In [8]:
import pdb  # NOTE(review): not used in this cell; kept in case other cells rely on the name
rootdir = "data/raw/rna/"


def find_velocity_samples(root, tissue):
    """Return the sample folder names under `root/tissue` that contain a velocity file.

    A sample is reported when `root/tissue/<sample>/scvelo_annotated.h5ad`
    exists. Returns an empty list when the tissue folder itself is missing.
    The order is whatever `os.listdir` yields.
    """
    tissue_dir = os.path.join(root, tissue)
    if not os.path.isdir(tissue_dir):
        return []
    return [
        sample_id
        for sample_id in os.listdir(tissue_dir)
        if os.path.exists(os.path.join(tissue_dir, sample_id, "scvelo_annotated.h5ad"))
    ]


# This cell only reports which samples are available. It deliberately does not
# create or reset `v_mat` / `e_mat`, so re-running it cannot wipe matrices that
# were already computed.
for tissue in ["Liver","Heart","Kidney (Left)","Large Intestine","Spleen","Lung (Right)"]:
    if os.path.isdir(os.path.join(rootdir, tissue)):
        print(tissue)
        for sample_id in find_velocity_samples(rootdir, tissue):
            print("{}, {}".format(tissue, sample_id))

Liver
Liver, 0eb5e457b4855ce28531bc97147196b6
Liver, 37acb2625dbd65733a8d95069ec00f95
Liver, 7277b2460f9c548004496508684a90ef
Liver, 43c43f3fd7fbe61b9d8fed131d956101
Liver, 40f7670890ae22c74ba4cf95a3ed6c6b
Heart
Heart, d6ec9ff2219e8cb74e807ded7e35c25e
Heart, d4e85897ce6037c09b7d3a4de9a7205a
Heart, b513c806bf3ee0d43d7ee6080d49913d
Heart, f6b6f4b613ef675eae82e9a00684228a
Heart, 600325157b07f4ec34ab98bcec1f1867
Heart, 61407ec0b68d82c1f05861501ffefe9a
Heart, ae8847ddd48cd1d9c96dc1e6f6c259d1
Heart, c75ed9c6ddceb575abdcdb35d1d1515b
Heart, 0bb7731a301114bf2d9336d4979ed31f
Heart, e39662494e4dc66a3decc52e412b6552
Heart, 3c97dbd44ba989c41c29cd2687311366
Heart, 59cbc103a6ab0e9d55a8b4b47e3e25b0
Heart, 92436cee1b8037a1b191a01546c3802c
Heart, d3a399c31a0f9d8bad4294691d2c8f35
Kidney (Left)
Kidney (Left), 9fe24e50c8b8b77d86900ee4beecef69
Kidney (Left), 7646a8a89555a123a56446b66c183d58
Kidney (Left), 0d1eb3d774a694b79e844987f771b183
Kidney (Left), 6cd63d0ee2c67c3be41e4be1522d9c07
Kidney (Left), 7017034

**Step 2. Read the velocity and expression matrices. Compute RPKM values from the expression counts**

In [None]:
import pdb  # NOTE(review): not used in this cell; kept in case other cells rely on the name
rootdir = "data/raw/rna/"

# Velocity matrix: tissue -> list of per-sample frames, replaced by one
# concatenated frame per tissue at the end of the loop.
v_mat = defaultdict(list)

# Expression (RPKM) matrix, same layout as `v_mat`.
e_mat = defaultdict(list)


def _to_dense(matrix):
    """Return `matrix` as a plain ndarray (densifies objects exposing `.toarray()`)."""
    if hasattr(matrix, "toarray"):
        matrix = matrix.toarray()
    return np.asarray(matrix)


# Iterate over tissues (every sub-directory of `rootdir`)
for tissue in os.listdir(rootdir):
    tissue_dir = os.path.join(rootdir, tissue)
    if not os.path.isdir(tissue_dir):
        # Stray files in the root folder are not tissues.
        continue
    print(tissue)

    for sample_id in os.listdir(tissue_dir):
        velo_path = os.path.join(tissue_dir, sample_id, "scvelo_annotated.h5ad")
        expr_path = os.path.join(tissue_dir, sample_id, "secondary_analysis.h5ad")
        # Both files are read below, so both must be present.
        if not (os.path.exists(velo_path) and os.path.exists(expr_path)):
            continue

        v = anndata.read_h5ad(velo_path)
        expr = anndata.read_h5ad(expr_path)

        # Cells present in both objects. Rows are selected by position in the
        # order of `cells` for BOTH matrices, so the row labels assigned below
        # match the data even when the two files order their cells differently.
        # Assumes cell names are unique within each file.
        cells = v.obs.index.intersection(expr.obs.index)
        v_rows = v.obs.index.get_indexer(cells)
        e_rows = expr.obs.index.get_indexer(cells)

        # Get gene ids used for velocities (strip the '.XX' version suffix)
        is_velocity_gene = (v.var["velocity_genes"] == True).values
        v_genes = [gene.split('.')[0] for gene in v.var.index[is_velocity_gene]]

        # Expression genes: strip the version suffix and keep only the genes
        # that have length info available.
        expr_ids = pd.Index([gene.split('.')[0] for gene in expr.var.index])
        has_length = expr_ids.isin(anno.index)
        e_genes = expr_ids[has_length]

        # Spliced counts for the shared cells and the annotated genes.
        # Subsetting the rows first is safe because the library size is
        # computed per cell.
        counts = _to_dense(expr.layers['spliced'][e_rows, :][:, has_length]).astype(np.float64)

        # Normalize expression data to RPKM:
        #   counts * 1e9 / (library size of the cell * gene length in bp).
        # A cell with zero total counts yields NaN, as in a plain division.
        lib_size = counts.sum(axis=1, keepdims=True)
        gene_length = anno.loc[e_genes, "gene_length"].values.reshape(1, -1)
        e_select = counts * 1e9 / (lib_size * gene_length)

        # Velocities for the shared cells, restricted to velocity genes.
        v_select = _to_dense(v.layers['velocity'][v_rows, :][:, is_velocity_gene])

        v_mat[tissue].append(pd.DataFrame(v_select, index=cells, columns=v_genes))
        e_mat[tissue].append(pd.DataFrame(e_select, index=cells, columns=e_genes))

    if tissue in v_mat:
        # Velocity genes differ between samples, so keep the union of genes
        # (missing entries become NaN); expression keeps only genes shared by
        # all samples of the tissue.
        v_mat[tissue] = pd.concat(v_mat[tissue])
        e_mat[tissue] = pd.concat(e_mat[tissue], join='inner')

**step3. Save velocity and RPKM matrices**

In [14]:
import re

# Write one HDF5 group per tissue holding the RPKM matrix and its labels.
# NOTE(review): `ensembl_to_symbol` is not defined in this cell. It has to be a
# DataFrame indexed by Ensembl gene ID with a `gene_symbol` column, and must
# exist in the kernel before this cell runs.
rpkm_path = "data/processed/rpkm/rpkm.hdf5"
# h5py does not create missing parent folders.
os.makedirs(os.path.dirname(rpkm_path), exist_ok=True)

with h5py.File(rpkm_path, "w") as f:
    for tissue, mat in e_mat.items():
        # Group name: spaces -> underscores, parentheses removed
        # (e.g. "Kidney (Left)" -> "Kidney_Left").
        t_name = re.sub(r"[()]", "", tissue.replace(" ", "_"))
        symbols = ensembl_to_symbol.loc[mat.columns, "gene_symbol"]

        f.create_dataset("{}/exp".format(t_name), data = mat.values)
        # Labels are stored as fixed-width byte strings.
        f.create_dataset("{}/symbol".format(t_name), data = symbols.values.astype('S'))
        f.create_dataset("{}/ensembl".format(t_name), data = mat.columns.values.astype('S'))
        f.create_dataset("{}/barcode".format(t_name), data = mat.index.values.astype('S'))

In [15]:
# Write one HDF5 group per tissue holding the velocity matrix and its labels.
# NOTE(review): `ensembl_to_symbol` is not defined in this cell. It has to be a
# DataFrame indexed by Ensembl gene ID with a `gene_symbol` column, and must
# exist in the kernel before this cell runs.
velo_path = "data/processed/velo/velo.hdf5"
# h5py does not create missing parent folders.
os.makedirs(os.path.dirname(velo_path), exist_ok=True)

with h5py.File(velo_path, "w") as f:
    for tissue, mat in v_mat.items():
        # Group name: spaces -> underscores, parentheses removed
        # (e.g. "Kidney (Left)" -> "Kidney_Left"). Plain string methods are
        # used so the cell does not depend on `re` being imported elsewhere.
        t_name = tissue.replace(" ", "_").replace("(", "").replace(")", "")
        symbols = ensembl_to_symbol.loc[mat.columns, "gene_symbol"]

        f.create_dataset("{}/velo".format(t_name), data = mat.values)
        # Labels are stored as fixed-width byte strings.
        f.create_dataset("{}/symbol".format(t_name), data = symbols.values.astype('S'))
        f.create_dataset("{}/ensembl".format(t_name), data = mat.columns.values.astype('S'))
        f.create_dataset("{}/barcode".format(t_name), data = mat.index.values.astype('S'))