### Read TF list and ensembl to symbol mapping file

In [6]:
import pandas as pd
# Transcription-factor table; the first CSV column becomes the index.
tf_list = pd.read_csv("data/raw/tf/tf_list.csv", index_col=0)

# Ensembl-ID -> gene-symbol table: keep the first row seen for each Ensembl ID
# and index the table by that ID (the "ensembl_id" column itself is retained).
ensembl_to_symbol = (
    pd.read_csv("data/raw/id_mapping/ensembl_to_symbol.csv", index_col=0)
    .drop_duplicates(subset="ensembl_id", keep="first")
    .set_index("ensembl_id", drop=False)
)

### Load expression data and TF activity scores

Keep only one tissue, and only the TF columns of the expression matrix.

In [7]:
import h5py
import numpy as np
import pandas as pd

tissue = 'Lung_Right'

# Expression matrix: locate the TF columns first, then read only those columns
# and store them as log10(x + 1).
with h5py.File('data/processed/rpkm/rpkm.hdf5', 'r') as f:
    tissue_grp = f[tissue]
    col_names = np.array(tissue_grp['ensembl']).astype(str)
    tf_idx = np.where(np.isin(col_names, tf_list["Ensembl ID"].values))[0]

    tf_expression = np.array(tissue_grp['exp'][:, tf_idx])
    cell_barcodes = np.array(tissue_grp['barcode']).astype(str)
    rpkm = pd.DataFrame(
        np.log10(tf_expression + 1),
        index=cell_barcodes,
        columns=col_names[tf_idx],
    )

# Velocity matrix: every gene column is kept.
with h5py.File('data/processed/velo/velo.hdf5', 'r') as f:
    tissue_grp = f[tissue]
    velo = pd.DataFrame(
        np.array(tissue_grp['velo']),
        index=np.array(tissue_grp['barcode']).astype(str),
        columns=np.array(tissue_grp['ensembl']).astype(str),
    )

Read the TF activity score matrices (mean and sum variants). Missing values are treated as zeros.  
Only the matrices for the selected tissue are loaded.

In [8]:
import os
import pandas as pd

folder = "data/tf_activity/"

# Read each matrix, transpose it (after the transpose, rows are looked up by
# gene symbol and columns by TF symbol), replace missing scores with 0 and
# rescale with log2(x + 1).
tf_gene_mat_mean = np.log2(
    pd.read_csv(folder + "mean_tf/" + tissue + "_tfGeneMat.csv", index_col=0)
    .transpose()
    .fillna(0)
    + 1
)
tf_gene_mat_sum = np.log2(
    pd.read_csv(folder + "sum_tf/" + tissue + "_tfGeneMat.csv", index_col=0)
    .transpose()
    .fillna(0)
    + 1
)

### Reformat the data matrices so that TFs and genes are consistent
Make the rpkm matrix and the TF activity matrix share the same TFs.  
Make the velo matrix and the TF activity matrix share the same genes.

In [9]:
# --- TFs present in the symbol table, the TF-gene matrix columns and rpkm ---
tfmat_cols = pd.DataFrame(tf_gene_mat_mean.columns.values, columns=['tfmat_cols'])
comm_tfs = ensembl_to_symbol.merge(tfmat_cols,
                                   how='left',
                                   left_on='gene_symbol',
                                   right_on='tfmat_cols')
tf_has_match = ~comm_tfs['tfmat_cols'].isna()
tf_in_rpkm = comm_tfs['ensembl_id'].isin(rpkm.columns)
comm_tfs = comm_tfs.loc[tf_has_match & tf_in_rpkm,]

# --- (velocity) genes present in the symbol table, the TF-gene matrix rows and velo ---
tfmat_rows = pd.DataFrame(tf_gene_mat_mean.index.values, columns=['tfmat_rows'])
comm_genes = ensembl_to_symbol.merge(tfmat_rows,
                                     how='left',
                                     left_on='gene_symbol',
                                     right_on='tfmat_rows')
gene_has_match = ~comm_genes['tfmat_rows'].isna()
gene_in_velo = comm_genes['ensembl_id'].isin(velo.columns)
comm_genes = comm_genes.loc[gene_has_match & gene_in_velo,]

# Reorder the rpkm and velo matrices so that the shared TFs / genes come first.
# The TF-gene activity matrices follow the same tf_order and gene_order, so the
# upper-left corner holds the TFs found in both rpkm and the TF activity matrix
# (columns) and the genes found in both velo and the TF activity matrix (rows);
# everything outside that corner stays zero.
#           TF
#        _______ __
#       | common|  |
# Genes |_______|  |
#       |__________|
remaining_tfs = pd.Series(np.setdiff1d(rpkm.columns, comm_tfs['ensembl_id']))
remaining_genes = pd.Series(np.setdiff1d(velo.columns, comm_genes['ensembl_id']))
tf_order = pd.concat([comm_tfs['ensembl_id'], remaining_tfs])
gene_order = pd.concat([comm_genes['ensembl_id'], remaining_genes])

rpkm = rpkm.loc[:, tf_order]
velo = velo.loc[:, gene_order]

# Zero-initialised (genes x TFs) arrays; only the shared corner is filled in.
n_comm_genes = comm_genes.shape[0]
n_comm_tfs = comm_tfs.shape[0]
act_mat_sum = np.zeros((velo.shape[1], rpkm.shape[1]))
act_mat_mean = np.zeros((velo.shape[1], rpkm.shape[1]))

act_mat_sum[:n_comm_genes, :n_comm_tfs] = tf_gene_mat_sum.loc[comm_genes['tfmat_rows'], comm_tfs['tfmat_cols']]
act_mat_mean[:n_comm_genes, :n_comm_tfs] = tf_gene_mat_mean.loc[comm_genes['tfmat_rows'], comm_tfs['tfmat_cols']]

act_mat_sum = pd.DataFrame(act_mat_sum, index=gene_order, columns=tf_order)
act_mat_mean = pd.DataFrame(act_mat_mean, index=gene_order, columns=tf_order)

# The symbol-indexed source matrices are no longer needed.
del tf_gene_mat_mean, tf_gene_mat_sum

### Select genes
- For liver, we get genes with more than 1000 cells for which the velocity values are available.
- For other tissues, we get genes with more than 5000 cells for which the velocity values are available.

In [10]:
# Per gene, count the cells that have a (non-missing) velocity value.
cell_counts = (~velo.isna()).sum(axis=0)

# Liver uses a lower cut-off than every other tissue.
min_cells = 1000 if tissue == "Liver" else 5000
selected_genes = cell_counts.index[cell_counts > min_cells].values

n_selected = selected_genes.shape[0]
print("{} has {} selected genes.".format(tissue, str(n_selected)))

Lung_Right has 2380 selected genes.


### Standardize data

In [11]:
# Z-score every TF column of the expression matrix.
tf_mean = rpkm.mean()
tf_std = rpkm.std()

# Entries that come out as NaN after standardization are replaced with zeros.
rpkm = ((rpkm - tf_mean) / tf_std).fillna(value=0)

### Compute shap values from trained models

Define model architecture

In [12]:
from tensorflow.keras.layers import Concatenate
from tensorflow.keras.layers import Dense,Input
from tensorflow.keras import regularizers
from tensorflow.keras.layers import concatenate
from tensorflow.keras import Model
import tensorflow as tf

def build_model(num_tf, num_feature_type):
    """Build and compile the per-gene network.

    Each TF gets its own input of width ``num_feature_type`` that is reduced
    to a single ReLU unit; the per-TF units are concatenated and passed
    through ReLU layers of 64, 32 and 16 units and a final linear unit.
    All ReLU layers carry an L1 kernel penalty of 0.01.

    Parameters
    ----------
    num_tf : int
        Number of TF inputs.
    num_feature_type : int
        Number of features per TF input.

    Returns
    -------
    Model
        The Keras model, compiled with default arguments.
    """
    # All inputs are created before any Dense layer, as in the original code.
    inputs = [Input(shape=(num_feature_type,)) for _ in range(num_tf)]

    # One single-unit ReLU layer per TF input.
    per_tf_units = []
    for tf_input in inputs:
        tf_unit = Dense(1, activation="relu",
                        kernel_regularizer=tf.keras.regularizers.l1(0.01))
        per_tf_units.append(tf_unit(tf_input))

    hidden = concatenate(per_tf_units)
    for width in (64, 32, 16):
        hidden = Dense(width, activation="relu",
                       kernel_regularizer=tf.keras.regularizers.l1(0.01))(hidden)
    out = Dense(1, activation="linear")(hidden)

    model = Model(inputs=inputs, outputs=out)
    model.compile()
    return model

Load trained model weights and compute SHAP values

In [None]:
from time import time
import shap
from joblib import Parallel,delayed
# Tissue whose trained models are explained; re-assigned here with the same
# value the data-loading cell uses.
tissue = 'Lung_Right'
# Number of features per TF input; run_shap fills three per-TF columns
# (expression, mean activity signal, sum activity signal).
num_feature_type = 3

def run_shap(tissue, gene, rep):
    """Compute SHAP-based TF rankings for one trained (gene, replicate) model.

    Rebuilds the network with ``build_model``, loads the trained weights from
    ``results/full_model/``, explains every cell that has a velocity value for
    ``gene`` against an all-zero reference sample, and appends the resulting
    TF ranking to ``results/ranking/shap-val-<tissue>.csv``.

    Reads the notebook-level ``rpkm``, ``velo``, ``act_mat_mean``,
    ``act_mat_sum`` and ``num_feature_type``.

    Parameters
    ----------
    tissue : str
        Tissue name; used in the weight-file name and the output CSV name.
    gene : str
        Column label in ``velo`` and row label in the activity matrices.
    rep : int or str
        Replicate identifier; used in the weight-file name and stored in the
        ``rep`` column of the output.

    Returns
    -------
    None
        Results are written to disk only.
    """
    begin = time()

    # Build, compile the model, and load up weights from trained model.
    model = build_model(num_tf = rpkm.shape[1], num_feature_type = num_feature_type)
    model.load_weights("results/full_model/{}-{}-rep{}".format(gene,tissue,rep))

    # Keep only the cells that have a velocity value for this gene.
    velo_train = velo[gene]
    select = ~pd.isna(velo_train)
    rpkm_train = rpkm.loc[select,:]

    # One (cells x num_feature_type) array per TF, in rpkm column order.
    X_train = []
    for TF in rpkm_train.columns:
        tf_features = np.empty((rpkm_train.shape[0], num_feature_type))

        # Column 0: expression of the current TF in each cell.
        tf_features[:,0] = rpkm_train[TF].values

        # Columns 1-2: TF mean / sum signals for this gene, constant over cells.
        tf_features[:,1] = act_mat_mean.loc[gene,TF]
        tf_features[:,2] = act_mat_sum.loc[gene,TF]

        X_train.append(tf_features)

    # Zeros as reference samples. The width follows num_feature_type so the
    # reference always has the same shape as the model inputs.
    background = [np.zeros((1, num_feature_type)) for _ in rpkm.columns]

    # Compute shap values for all samples
    e = shap.DeepExplainer(model, background)
    shap_values = e.shap_values(X_train)

    # Per TF: sum SHAP values over cells, take the absolute value per feature,
    # then add up the features.
    tf_shap_values = np.array([np.abs(tf_shap.sum(axis = 0)).sum() for tf_shap in shap_values[0]])
    idx = tf_shap_values.argsort()[::-1] # Descending order
    tf_shap_values = tf_shap_values[idx]
    tf_ensembl = rpkm.columns[idx]
    ranking = pd.DataFrame({"gene_ensembl":gene,"tf_ensembl":tf_ensembl,"sum_shap_value":tf_shap_values,"rep":rep})

    # Save TF ranking results (appended without a header row).
    # NOTE(review): every parallel worker appends to the same per-tissue file;
    # confirm that concurrent appends cannot interleave rows on the target
    # filesystem, or collect the rankings in the parent process instead.
    ranking.to_csv("results/ranking/shap-val-{}.csv".format(tissue),mode = "a", header = False)

    end = time()
    print("Completed {}s used".format(end-begin))
    return None

# Run shap computation in parallel: one run_shap call per (gene, rep) pair,
# spread over 16 joblib workers. run_shap returns None, so `res` is a list of
# None values; the actual results are the CSV rows it writes.
# NOTE(review): `left_genes` and `left_reps` are not defined in any cell shown
# in this notebook — presumably the gene/replicate pairs still to be
# processed. They must be defined before this cell can run on a fresh kernel.
res = Parallel(n_jobs = 16)(delayed(run_shap)(tissue, gene, rep)
                              for gene,rep in zip(left_genes, left_reps))