In [4]:
import sklearn
import pandas as pd
import numpy as np
import os
import h5py

from collections import defaultdict
from sklearn import linear_model
from sklearn.model_selection import cross_val_score
from scipy.sparse import csr_matrix
from joblib import Parallel, delayed

### Step 1. Load the list of TF names (Ensembl IDs)

In [5]:
# TF list table; the first CSV column is used as the row index.
tfs = pd.read_csv("data/raw/tf/tf_list.csv", index_col=0)

### Step 2. Read Ensembl ID to gene symbol mappings

In [6]:
# Load the Ensembl-ID -> gene-symbol table and key its rows by Ensembl ID.
# drop=False keeps "ensembl_id" available as an ordinary column as well.
ensembl_to_symbol = (
    pd.read_csv("data/raw/id_mapping/ensembl_to_symbol.csv", index_col=0)
    .set_index("ensembl_id", drop=False)
)

# When an Ensembl ID occurs more than once, keep only its first row.
ensembl_to_symbol = ensembl_to_symbol.loc[~ensembl_to_symbol.index.duplicated()]

### Step 3. Read velocity and expression matrices

In [7]:
import h5py
import numpy as np
import pandas as pd

def _read_tissue_frames(path, value_key, transform=None):
    """Read one DataFrame per top-level group of an HDF5 file.

    Each group is expected to hold three datasets: ``value_key`` (the matrix),
    ``'barcode'`` (row labels) and ``'ensembl'`` (column labels).

    Parameters
    ----------
    path : str
        Location of the HDF5 file.
    value_key : str
        Name of the dataset holding the matrix values inside each group.
    transform : callable, optional
        Applied to the raw value array before it is wrapped in a DataFrame.

    Returns
    -------
    dict
        Maps each group name to its DataFrame.
    """
    frames = {}
    with h5py.File(path, 'r') as handle:
        for name in handle.keys():
            group = handle[name]
            values = np.array(group[value_key])
            if transform is not None:
                values = transform(values)
            frames[name] = pd.DataFrame(
                values,
                index=np.array(group['barcode']).astype(str),
                columns=np.array(group['ensembl']).astype(str),
            )
    return frames


# Expression is stored on a log10(x + 1) scale.
rpkm = _read_tissue_frames('data/processed/rpkm/rpkm.hdf5', 'exp',
                           transform=lambda raw: np.log10(raw + 1))

# Velocities are used as stored.
velo = _read_tissue_frames('data/processed/velo/velo.hdf5', 'velo')

### Step 4. Train an L1 (Lasso) model for each gene in each tissue, using TF expression to predict gene velocity

In [10]:
import itertools
import pdb

# Train model by 5 fold cv and return R2 score
def test_model(X, X_col, Y, Y_col):
    
    r2 = []
    selected_tfs = []
    X = csr_matrix(X)
    for i in range(Y.shape[1]):
        model = linear_model.Lasso(alpha=0.1)
        current_gene = Y_col[i]
        
        # Exclude the current gene to predict from the TF-cell expression matrix
        X_run = X.copy()
        X_run[:,X_col == current_gene] = 0
        
        # exclude nan values
        select = ~np.isnan(Y[:,i])
        X_select = X_run[select,]
        y_select = Y[select,i]
        
        # Compute R2 by 5 fold-cross-validation
        r2.append(np.mean(cross_val_score(model, X_select, y_select, cv=5)))
    
        # Get list of selected tfs by training on whole dataset
        model = linear_model.Lasso(alpha=0.1)
        model.fit(X_select, y_select)
        selected_tfs.append(np.where(model.coef_ != 0)[0])
    return([r2, selected_tfs])

# Per-tissue results, appended in the iteration order of rpkm.keys().
r2_all = []
selected_tfs_all = []
res_all = []
n_jobs = 14 # This is number of CPU cores to use. Modify it to suit your own machine.

for tissue in rpkm.keys():

    # NOTE(review): assumes the rows of rpkm[tissue] and velo[tissue] describe
    # the same cells in the same order -- confirm against the HDF5 writers.
    X = rpkm[tissue].values
    Y = velo[tissue].values

    X_cols = rpkm[tissue].columns
    Y_cols = velo[tissue].columns.values

    # Split the target genes into one chunk per worker. The value matrix and
    # the gene names must be split with the SAME chunk count, otherwise zip()
    # pairs columns with the wrong names (or silently drops chunks).
    Y_splits = np.array_split(Y, n_jobs, axis = 1)
    Y_col_splits = np.array_split(Y_cols, n_jobs)

    res = Parallel(n_jobs=n_jobs)(delayed(test_model)(X, X_cols, Y_split, Y_col_split)
                              for Y_split,Y_col_split in zip(Y_splits,Y_col_splits))

    # Flatten the per-chunk results back into one list per tissue, in gene order.
    r2 = list(itertools.chain.from_iterable(each[0] for each in res))
    selected_tfs = list(itertools.chain.from_iterable(each[1] for each in res))
    r2_all.append(r2)
    selected_tfs_all.append(selected_tfs)
    print("{} finished".format(tissue))

Heart finished




Kidney_Left finished
Large_Intestine finished
Liver finished
Lung_Right finished
Lymph_Node finished
Spleen finished
Thymus finished


### Step 5. Save L1 model results
Ensembl IDs are mapped to HGNC symbols.

In [33]:
# Build one results table per tissue, concatenate them and write a CSV.
# Ensembl IDs are translated to gene symbols through ensembl_to_symbol.
# reindex() is used for the lookup so that an ID missing from the mapping
# yields NaN instead of raising KeyError.
gene_symbols = ensembl_to_symbol["gene_symbol"]

tissue_tables = []
for r2,tfs_idx,tissue in zip(r2_all,selected_tfs_all,rpkm.keys()):

    target_genes = velo[tissue].columns
    predictor_genes = rpkm[tissue].columns

    # For every target gene, a space-separated list of the selected TFs,
    # once as Ensembl IDs and once as gene symbols ("" when none selected).
    tf_ensembl = []
    tf_hgnc = []
    for idx in tfs_idx:
        if idx.shape[0] > 0:
            ensembl_ids = predictor_genes[idx]
            # astype(str) turns missing symbols into "nan" so join() works.
            hgnc_ids = gene_symbols.reindex(ensembl_ids).values.astype(str)

            tf_ensembl.append(" ".join(ensembl_ids))
            tf_hgnc.append(" ".join(hgnc_ids))
        else:
            tf_ensembl.append("")
            tf_hgnc.append("")

    tissue_tables.append(pd.DataFrame({'tissue':tissue,
                                       'r2':r2,
                                       'gene_ensembl':target_genes,
                                       'gene_hgnc':gene_symbols.reindex(target_genes).values,
                                       'tf_ensembl':tf_ensembl,
                                       'tf_hgnc':tf_hgnc
                                      })
                         )

l1_res = pd.concat(tissue_tables)
l1_res = l1_res.sort_values(by = ["tissue","r2"], ascending = False)

# Make sure the output directory exists before writing.
os.makedirs("results/r2", exist_ok=True)
l1_res.to_csv("results/r2/lasso_r2.csv")