### Step 1. Define the basic NN architecture

In [3]:
from tensorflow.keras.layers import Concatenate
from tensorflow.keras.layers import Dense,Input
from tensorflow.keras import regularizers
from tensorflow.keras.layers import concatenate
from tensorflow.keras.layers import Lambda
from tensorflow.keras import Model
import tensorflow as tf

def build_model(num_tf, num_feature_type):
    """Assemble the uncompiled per-gene Keras model.

    One input of width ``num_feature_type`` is created for each of the ``num_tf``
    TFs. Every input passes through its own single-unit ReLU Dense layer; the
    resulting units are concatenated and fed through ReLU Dense layers of width
    64, 32 and 16 (all with an L1 kernel penalty of 0.01) and a final linear unit.

    Returns the ``Model``; compilation is intentionally skipped.
    """
    def l1_penalty():
        # A fresh regularizer object per layer.
        return tf.keras.regularizers.l1(0.01)

    # All inputs are created first, then the TF-specific layers, so that the
    # layer/weight creation order stays: per-TF layers, then 64, 32, 16, 1.
    tf_inputs = []
    for _ in range(num_tf):
        tf_inputs.append(Input(shape = (num_feature_type,)))

    per_tf_units = []
    for tf_input in tf_inputs:
        tf_layer = Dense(1, activation = "relu", kernel_regularizer = l1_penalty())
        per_tf_units.append(tf_layer(tf_input))

    hidden = concatenate(per_tf_units)
    for width in (64, 32, 16):
        hidden = Dense(width, activation = "relu", kernel_regularizer = l1_penalty())(hidden)
    prediction = Dense(1, activation = "linear")(hidden)

    # Skip model compilation step
    return Model(inputs = tf_inputs, outputs = prediction)

### Step 2. Load expression, TF activity scores, and Ensembl-to-symbol mappings

First, read RPKM values and velocity values and then scale them

In [4]:
import h5py
import numpy as np
import pandas as pd

# Expression per tissue: log10(exp + 1), labelled with the file's 'barcode' (rows)
# and 'ensembl' (columns) datasets.
rpkm = {}
with h5py.File('data/processed/rpkm/rpkm.hdf5', 'r') as f:
    for tissue in f.keys():
        tissue_grp = f[tissue]

        # scale the rpkm by log10
        log_exp = np.log10(np.array(tissue_grp['exp'])+1)
        rpkm[tissue] = pd.DataFrame(
            log_exp,
            index = np.array(tissue_grp['barcode']).astype(str),
            columns = np.array(tissue_grp['ensembl']).astype(str),
        )

# Velocity per tissue: stored values are used as-is, labelled the same way.
velo = {}
with h5py.File('data/processed/velo/velo.hdf5', 'r') as f:
    for tissue in f.keys():
        tissue_grp = f[tissue]
        velo[tissue] = pd.DataFrame(
            np.array(tissue_grp['velo']),
            index = np.array(tissue_grp['barcode']).astype(str),
            columns = np.array(tissue_grp['ensembl']).astype(str),
        )

Read TF activity score matrices. Missing values are treated as zeros.

In [5]:
import os
import pandas as pd

folder = "data/tf_activity/"

# Mean TF activity: one transposed table per file, missing values replaced by 0.
tf_gene_mat_mean = {}
for filename in os.listdir(folder + "mean_tf/"):
    # Tissue name = everything before the last "_" of the file name.
    tissue = filename.rpartition("_")[0]
    tf_gene_mat_mean[tissue] = (
        pd.read_csv("{}/mean_tf/{}".format(folder,filename),index_col = 0)
        .transpose()
        .fillna(0)
    )

# Summed TF activity: same layout, additionally scaled as log2(x + 1).
tf_gene_mat_sum = {}
for filename in os.listdir(folder + "sum_tf/"):
    tissue = filename.rpartition("_")[0]
    sum_filled = (
        pd.read_csv("{}/sum_tf/{}".format(folder,filename),index_col = 0)
        .transpose()
        .fillna(0)
    )

    # scale the value by log2
    tf_gene_mat_sum[tissue] = np.log2(sum_filled+1)

Read the Ensembl-to-HGNC symbol mapping table

In [7]:
# Mapping table with one row per Ensembl ID (first occurrence kept), indexed by
# that ID while the 'ensembl_id' column itself is retained.
ensembl_to_symbol = (
    pd.read_csv("data/raw/id_mapping/ensembl_to_symbol.csv",index_col = 0)
    .drop_duplicates(subset = "ensembl_id", keep = "first")
    .set_index("ensembl_id", drop = False)
)

### Step 3. Select genes for testing

Select 500 genes in each tissue for testing

In [None]:

import pandas as pd
import re

l1_res = pd.read_csv("results/r2/lasso_r2.csv", index_col=0)

num_genes = 500             # genes drawn per tissue
num_samples = 4000          # a gene needs more than this many cells with a velocity value
liver_num_samples = 1000    # lower threshold for Liver, which has fewer samples

# Remove parentheses and replace whitespace runs with "_" in the tissue name.
# The patterns are regular expressions, so regex=True is passed explicitly: with the
# pandas >= 2.0 default (regex=False) they would be matched as literal text and the
# names would silently stay unchanged. Raw strings avoid the invalid "\s" / "\("
# escape sequences of the plain literals.
l1_res["tissue"] = l1_res["tissue"].str.replace(r"\s+", "_", regex=True)
l1_res["tissue"] = l1_res["tissue"].str.replace(r"[()]", "", regex=True)

selected_genes = dict()

for tissue,df in l1_res.groupby("tissue"):

    # NOTE(review): comm_tissues is not defined in any cell visible here (a later
    # cell refers to common_tissues instead); it has to exist in the kernel before
    # this cell runs - confirm which name is the intended one.
    if tissue not in comm_tissues:
        continue

    # Keep genes whose velocity is available in more than the required number of cells.
    sample_counts = velo[tissue].notna().sum(axis = 0)
    min_cells = liver_num_samples if tissue == "Liver" else num_samples
    genes = sample_counts.index[sample_counts > min_cells].values

    # ... and that are also columns of the expression matrix.
    genes = np.intersect1d(rpkm[tissue].columns, genes)
    df = df.loc[df["gene_ensembl"].isin(genes),]

    # Randomly select up to num_genes genes in each tissue.
    # The seed is reset for every tissue, so each draw starts from the same state.
    np.random.seed(1000)
    if num_genes < df.shape[0]:
        idx = np.random.choice(np.arange(df.shape[0]), size=num_genes, replace=False)
    else:
        # Fewer candidates than requested: take all of them.
        idx = np.arange(df.shape[0])
    selected_genes[tissue] = df.iloc[idx,]["gene_ensembl"].values
    print("{} has {} selected genes.".format(tissue, str(selected_genes[tissue].shape[0])))

### Step 4. Cluster and order genes  
Cluster the selected velocity genes by velocity values using balanced kmeans clustering

In [9]:
from k_means_constrained import KMeansConstrained
cluster_labels = {}

for tissue in velo.keys():

    # Velocity of the selected genes: rows are cells, columns are genes.
    selected_velo = velo[tissue].loc[:,selected_genes[tissue]]
    mat = selected_velo.values

    # Mean imputation: a 0/1 mask marks the missing entries, the per-gene mean
    # (ignoring NaNs) is placed only at those positions, and it is added to the
    # matrix whose NaNs were replaced by 0.
    na_mask = np.isnan(mat).astype(int)
    mean_mat = np.nanmean(mat, axis = 0).reshape(1,-1) * na_mask
    mat = pd.DataFrame(np.nan_to_num(mat,nan = 0) + mean_mat,
                       columns = selected_velo.columns)

    # Balanced k-means over genes (hence the transpose): 20 clusters whose sizes
    # are bounded by floor/ceil of n_genes / 20.
    n_genes = mat.shape[1]
    balanced_kmeans = KMeansConstrained(n_clusters=20,
                                        size_min=int(n_genes/20),
                                        size_max=np.ceil(n_genes/20))
    cluster_labels[tissue] = pd.DataFrame(balanced_kmeans.fit_predict(mat.values.T),
                                          index = mat.columns,
                                          columns = ["cluster"])

Keep only the TFs as columns for rpkm mat.

In [10]:
tf_list = pd.read_csv("data/raw/tf/tf_list.csv", index_col = 0)
tf_ensembl_ids = tf_list["Ensembl ID"]

# NOTE(review): common_tissues is not defined in any cell visible here (the gene
# selection cell uses comm_tissues) - confirm the intended name.
for tissue in common_tissues:

    # Expression columns that also appear in the TF list; all other columns are dropped.
    use_tfs = rpkm[tissue].columns.intersection(tf_ensembl_ids)
    rpkm[tissue] = rpkm[tissue].loc[:,use_tfs]

### Step 5. Reformat the data matrices to keep the consistent TFs and genes across different data matrices
Make rpkm matrix and tf activity matrix have the same tfs  
Make velo matrix and tf activity matrix have the same genes

In [11]:
# Per-tissue TF activity matrices, rebuilt below as DataFrames with one row per
# velocity-matrix column (gene) and one column per expression-matrix column (TF),
# both labelled by Ensembl ID.
act_mat_sum = dict()
act_mat_mean = dict()

for tissue in tf_gene_mat_mean.keys():
    
    # Find common TF 
    # The activity table's column labels are matched against the 'gene_symbol'
    # column of the mapping table. Rows without a match are dropped, and only
    # Ensembl IDs that are also columns of the expression matrix are kept.
    tfmat_cols = pd.DataFrame(tf_gene_mat_mean[tissue].columns.values,columns = ['tfmat_cols'])
    comm_tfs = ensembl_to_symbol.merge(how = 'left',
                                        left_on = 'gene_symbol',
                                        right_on = "tfmat_cols",
                                        right = tfmat_cols)
    comm_tfs = comm_tfs.loc[~comm_tfs['tfmat_cols'].isna(),]
    comm_tfs = comm_tfs.loc[comm_tfs["ensembl_id"].isin(rpkm[tissue].columns),]
    
    # Find common (velocity) genes
    # Same procedure for the activity table's row labels, restricted to Ensembl
    # IDs that are columns of the velocity matrix.
    tfmat_rows = pd.DataFrame(tf_gene_mat_mean[tissue].index.values,columns = ['tfmat_rows'])
    comm_genes = ensembl_to_symbol.merge(how = 'left',
                                            left_on = 'gene_symbol',
                                            right_on = "tfmat_rows",
                                            right = tfmat_rows)
    comm_genes = comm_genes.loc[~comm_genes['tfmat_rows'].isna(),]
    comm_genes = comm_genes.loc[comm_genes["ensembl_id"].isin(velo[tissue].columns),]
    
    '''
    Reorder rpkm mat, velo mat, and cluster label vectors. The TF-Gene matrix follow the same tf_order and gene_order. In this order, the columns in upper left
    corner are the tfs commonly found in rpkm matrix and tf activity matrix and the rows represent the genes commonly found in velo matrix
    and tf activity matrix.
              TF
           _______ __
          | common|  |
    Genes |_______|  |
          |__________|
    '''
    # Common IDs first, followed by the remaining columns of the expression /
    # velocity matrix (np.setdiff1d returns them sorted).
    tf_order = pd.concat([comm_tfs['ensembl_id'], pd.Series(np.setdiff1d(rpkm[tissue].columns, comm_tfs['ensembl_id']))])
    gene_order = pd.concat([comm_genes['ensembl_id'], pd.Series(np.setdiff1d(velo[tissue].columns, comm_genes['ensembl_id']))])
    
    # The global rpkm / velo frames are replaced by their column-reordered versions.
    rpkm[tissue] = rpkm[tissue].loc[:,tf_order]
    velo[tissue] = velo[tissue].loc[:,gene_order]
    
    # Start from all-zero (genes x TFs) arrays; entries outside the common block stay 0.
    act_mat_sum[tissue] = np.zeros((velo[tissue].shape[1], rpkm[tissue].shape[1]))
    act_mat_mean[tissue] = np.zeros((velo[tissue].shape[1], rpkm[tissue].shape[1]))
    
    # Fill the upper-left (common genes x common TFs) block. The chained basic
    # slices are NumPy views, so the assignment writes into the zero arrays.
    act_mat_sum[tissue][:comm_genes.shape[0],:][:,:comm_tfs.shape[0]] = tf_gene_mat_sum[tissue].loc[comm_genes['tfmat_rows'],comm_tfs['tfmat_cols']]
    act_mat_mean[tissue][:comm_genes.shape[0],:][:,:comm_tfs.shape[0]] = tf_gene_mat_mean[tissue].loc[comm_genes['tfmat_rows'],comm_tfs['tfmat_cols']]
    
    # Wrap the arrays as DataFrames labelled with the Ensembl-ID orders built above.
    act_mat_sum[tissue] = pd.DataFrame(act_mat_sum[tissue], index = gene_order, columns = tf_order)
    act_mat_mean[tissue] = pd.DataFrame(act_mat_mean[tissue], index = gene_order, columns = tf_order)

# The symbol-labelled source tables are removed here; re-running the loop above
# afterwards raises NameError until they are loaded again.
del tf_gene_mat_mean
del tf_gene_mat_sum

### Step 6. Define trace norm gradient (for loss function)

Define nuclear norm, (sub)gradient of nuclear norm, and tensor trace norm, and tensor unfold function

In [12]:
import tensorflow as tf

# Define nuclear norm operation
@tf.custom_gradient
def nuclear_norm(x):
    """Nuclear norm of ``x`` (sum of its singular values) with a hand-written (sub)gradient.

    Returns the pair expected by ``tf.custom_gradient``: the norm, and a function
    mapping the upstream gradient ``dy`` to ``dy * U V^T``, where ``U`` and ``V``
    are the singular vectors of ``x``.
    """
    # A single reduced SVD serves both passes: the singular values give the norm,
    # and U, V are captured by the gradient closure instead of decomposing x a
    # second time during back-propagation.
    sigma, U, V = tf.linalg.svd(x, full_matrices=False, compute_uv=True)
    norm = tf.reduce_sum(sigma)
    
    # Gradient function
    def nuclear_norm_grad(dy):
        grad = tf.matmul(U, tf.transpose(V))
        return dy * grad
    
    return norm, nuclear_norm_grad

def TensorUnfold(A, k):
    """Unfold tensor ``A`` into a matrix along axis ``k``.

    Axis ``k`` (negative values count from the end) is moved to the front and all
    remaining axes, kept in their original order, are flattened into the second
    dimension.
    """
    remaining_axes = list(range(A.shape.ndims))
    lead_axis = remaining_axes.pop(k)
    A = tf.transpose(A, [lead_axis] + remaining_axes)
    return tf.reshape(A, [A.shape[0], np.prod(A.shape[1:])])

def trace_norm(X):
    """Nuclear norm of ``X`` after unfolding it along its last axis."""
    unfolded = TensorUnfold(X, -1)
    return nuclear_norm(unfolded)

### Step 7. Train and test MTLRANK models  
This step may take a long time to complete

In [13]:
import os, psutil
import datetime
import numpy as np
import tensorflow as tf

from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from collections import defaultdict
from tensorflow import keras
from joblib import Parallel, delayed
from collections import defaultdict
from time import time

# Mean-squared-error loss and Adam optimizer (learning rate 0.01).
# Both are module-level objects that train_test reads as globals.
mse = tf.keras.losses.MeanSquaredError()
optimizer = tf.keras.optimizers.Adam(learning_rate = 0.01)

# Function for training and testing each gene
def train_test(tissue, cluster, rep, n_epochs):
    """Train one model per gene of a cluster jointly and score them on held-out cells.

    Parameters
    ----------
    tissue : key into the global per-tissue dictionaries (rpkm, velo,
        cluster_labels, act_mat_mean, act_mat_sum).
    cluster : cluster label; the genes whose label equals it are trained together.
    rep : position in the global train_idx / test_idx lists selecting the split.
    n_epochs : number of passes of the main training loop.

    Returns
    -------
    list of R^2 scores on the test cells, one per gene, in the order of the
    cluster's genes.

    Side effects: prints progress for every training step and appends one
    "tissue,gene,rep,r2" line per gene to results/r2/MTLRANK_expTFAPredVelo_r2.csv.

    Besides its arguments the function reads these globals: cluster_labels, rpkm,
    velo, act_mat_mean, act_mat_sum, train_idx, test_idx, batch_size, mse and
    optimizer.
    """
    
    # Get velocity genes for current cluster
    cluster_genes = cluster_labels[tissue].loc[cluster_labels[tissue].iloc[:,0] == cluster,].index.values
    '''
    # For each gene 1) Get train and test data.
                    2) Standardize train and test data
    '''
    # gene -> list with one (n_cells, 5) feature array per TF (X_*), or the target vector (y_*).
    X_train = defaultdict(list)
    X_test = defaultdict(list)
    y_train = defaultdict(list)
    y_test = defaultdict(list)
    
    print("Preparing features ... {}".format(datetime.datetime.now().ctime()))
    # NOTE(review): `remain` is never read or written again in this function.
    remain = dict() # Use this to keep track of which cells have been used for each gene.
    for gene in cluster_genes:
        
        # Split train and test data for each gene
        rpkm_train = rpkm[tissue].iloc[train_idx[rep],]
        rpkm_test = rpkm[tissue].iloc[test_idx[rep],]
        velo_train = velo[tissue].iloc[train_idx[rep]][gene]
        velo_test = velo[tissue].iloc[test_idx[rep]][gene]
        
        # Remove NAs.
        # Cells without a velocity value for this gene are dropped from both the
        # target and the expression rows.
        select = ~pd.isna(velo_train)
        velo_train = velo_train.loc[select]
        rpkm_train = rpkm_train.loc[select,:]

        select = ~pd.isna(velo_test)
        velo_test = velo_test.loc[select]
        rpkm_test = rpkm_test.loc[select,:]
        
        # Generate train/test inputs for current gene
        # Feature columns per TF: 0 = expression, 1 = mean activity, 2 = sum activity,
        # 3 = expression * mean activity, 4 = expression * sum activity.
        for i,TF in enumerate(rpkm_train.columns):
            
            # np.empty is uninitialised; every column is assigned below.
            X_train[gene].append(np.empty((rpkm_train.shape[0], 5)))
            X_test[gene].append(np.empty((rpkm_test.shape[0], 5)))
            
            # For current tf, get expression rpkms
            X_train[gene][-1][:,0] = rpkm_train[TF].values
            X_test[gene][-1][:,0] = rpkm_test[TF].values
            
            # Get TF mean signals and TF sum signals
            # (one scalar per gene/TF pair, broadcast to every cell)
            X_train[gene][-1][:,1] = act_mat_mean[tissue].loc[gene,TF]
            X_train[gene][-1][:,2] = act_mat_sum[tissue].loc[gene,TF]
            X_test[gene][-1][:,1] = act_mat_mean[tissue].loc[gene,TF]
            X_test[gene][-1][:,2] = act_mat_sum[tissue].loc[gene,TF]

            # The product of expressions and TF mean signals
            X_train[gene][-1][:,3] =  X_train[gene][-1][:,0] * X_train[gene][-1][:,1]
            X_test[gene][-1][:,3] =  X_test[gene][-1][:,0] * X_test[gene][-1][:,1]

            # The product of expressions and TF sum signals
            X_train[gene][-1][:,4] = X_train[gene][-1][:,0] * X_train[gene][-1][:,2]
            X_test[gene][-1][:,4] = X_test[gene][-1][:,0] * X_test[gene][-1][:,2]
        
            # Standardize the inputs
            # Only columns 0, 3 and 4 are z-scored, and only when their standard
            # deviation is non-zero; columns 1 and 2 are constant per gene/TF.
            # NOTE(review): the test columns are standardised with the test set's
            # own mean/std, not the training statistics - confirm this is intended.
            if(np.std(X_train[gene][-1][:,0]) != 0):
                X_train[gene][-1][:,0] = (X_train[gene][-1][:,0] - np.mean(X_train[gene][-1][:,0]))/np.std(X_train[gene][-1][:,0])
           
            if(np.std(X_train[gene][-1][:,3]) != 0):
                X_train[gene][-1][:,3] = (X_train[gene][-1][:,3] - np.mean(X_train[gene][-1][:,3]))/np.std(X_train[gene][-1][:,3])
            
            if(np.std(X_train[gene][-1][:,4]) != 0):
                X_train[gene][-1][:,4] = (X_train[gene][-1][:,4] - np.mean(X_train[gene][-1][:,4]))/np.std(X_train[gene][-1][:,4])
           
            if(np.std(X_test[gene][-1][:,0]) != 0):
                X_test[gene][-1][:,0] = (X_test[gene][-1][:,0] - np.mean(X_test[gene][-1][:,0]))/np.std(X_test[gene][-1][:,0])
           
            if(np.std(X_test[gene][-1][:,3]) != 0):
                X_test[gene][-1][:,3] = (X_test[gene][-1][:,3] - np.mean(X_test[gene][-1][:,3]))/np.std(X_test[gene][-1][:,3])
            
            if(np.std(X_test[gene][-1][:,4]) != 0):
                X_test[gene][-1][:,4] = (X_test[gene][-1][:,4] - np.mean(X_test[gene][-1][:,4]))/np.std(X_test[gene][-1][:,4])
           
        y_train[gene] = velo_train.values
        y_test[gene] = velo_test.values
        
        # Targets are z-scored per split as well.
        # NOTE(review): unlike the inputs there is no zero-std guard here, so a
        # constant target vector yields NaN/inf.
        y_train[gene] = (y_train[gene] - np.mean(y_train[gene]))/np.std(y_train[gene])
        y_test[gene] = (y_test[gene] - np.mean(y_test[gene]))/np.std(y_test[gene])
        
    '''
    Train by trace norm loss.
    '''
    print("Setting up models...{}".format(datetime.datetime.now().ctime()))
    
    # Build models without compilation. Each model corresponds to one gene in the cluster.
    num_tf = rpkm[tissue].shape[1]
    models = [build_model(num_tf = num_tf, num_feature_type = 5) for i in range(cluster_genes.shape[0])]
    
    # Get test batches
    # (the full test set of every gene, in cluster_genes order)
    print("Running training...{}".format(datetime.datetime.now().ctime()))
    x_test_batches = []
    y_test_batches = []
    for k,gene in enumerate(cluster_genes):
        x_test_batches.append(X_test[gene])
        y_test_batches.append(y_test[gene])
    
    # Training step for one batch of data (Each batch consists of a batch with batch_size cells from each gene)
    # So the total number of cells in a batch = batch_size * number of genes
    def train_step(x_batches, y_batches):
        """Run one joint optimisation step over all models.

        x_batches[k] / y_batches[k] are the inputs and targets for models[k].
        Returns (total loss, mean MSE over the models).
        """
        
        # Record how loss value is computed for performing automatic differential later.
        with tf.GradientTape() as tape:
            
            # Run one round of forward pass for all models
            y_pred = [models[k](x, training = True) for k,x in enumerate(x_batches)]
            
            # Compute MSE loss
            MSE = [tf.reduce_mean(mse(y, y_pred[k])) for k,y in enumerate(y_batches)]
            
            # Collect, for each selected position in trainable_weights, the tensors of
            # all models stacked along a new last axis; the trace norm of each stack
            # couples the models.
            # NOTE(review): only two positions (num_tf*2 and num_tf*2+2) are used.
            # Presumably these are the kernels of the 64- and 32-unit layers that
            # follow the 2 weights of each TF-specific layer - confirm against
            # build_model; an earlier comment spoke of "the last three layers".
            sharable_weights = []
            
            for layer in [num_tf*2, num_tf*2+2]:
                sharable_weights.append(tf.stack([model.trainable_weights[layer] for model in models], axis = -1))    
            
            # Compute tracenorm
            tracenorm = [trace_norm(sharable_weights[k]) for k in range(len(sharable_weights))]
            
            # Compute final loss. loss = MSE + lambda*tracenorm. (lambda = 0.01 for convenience)
            loss = tf.reduce_mean(MSE) + tf.math.multiply(0.01, tf.reduce_mean(tracenorm))
        
        '''
        Here gradients are calculated 
        'unconnected_gradients=tf.UnconnectedGradients.ZERO' ensures that
        gradients for other task-specific layer other than the current one
        are zeros.
        '''
        grads_all = tape.gradient(loss,
                              [model.trainable_weights for model in models],
                              unconnected_gradients=tf.UnconnectedGradients.ZERO)
        
        '''
        Run one step of gradient descent by updating
        the value of the variables to minimize the loss.
        '''
        # The single global optimizer is applied to each model's weights in turn.
        for grad,model in zip(grads_all, models):
            optimizer.apply_gradients(zip(grad, model.trainable_weights))
       
        return(loss, tf.reduce_mean(MSE))
    
    # Test step for testing one batch (Each batch consists of a batch with batch_size cells from each gene)
    def test_step(models, x_batches, y_batches):
        """Predict with each model on its own inputs and return the list of R^2 scores."""
        r2s = []
        for x,y,model in zip(x_batches,y_batches,models):
            y_pred = model.predict(x)
            r2s.append(r2_score(y,y_pred))
        return(r2s)
    
    # Main training loop
    for i in range(n_epochs):
        
        # Generate sampling indices for each batch
        # Number of batches are dependent on the gene with the most training examples (N).
        # For other genes, examples are resampled if they have smaller sample sizes.
        tmp = [(gene,sample.shape[0]) for gene,sample in y_train.items()]
        tmp = sorted(tmp, key = lambda x: x[1], reverse = True)
        max_size = tmp[0][1]
        
        # Get sample indices for each training step
        # Each gene's shuffled indices are extended to max_size (np.resize repeats
        # the array from its start) and cut into chunks of the global batch_size.
        idx_iter = dict()
        for gene,size in tmp:
            idx = np.arange(size)
            np.random.shuffle(idx)
            idx = np.resize(idx, max_size)
            idx_iter[gene] = np.split(idx, np.arange(batch_size,idx.shape[0],batch_size))
        # `gene` is the last gene of the loop above; every gene has the same number
        # of chunks because all index vectors have length max_size.
        n_iter = len(idx_iter[gene])
        
        # Training loop
        for j in range(n_iter):
            begin = time()
            
            # Get training data batches for each training step (one batch per gene).
            x_train_batches = []
            y_train_batches = []
            for k,gene in enumerate(cluster_genes):
                x_train_batches.append([tf.convert_to_tensor(feature[idx_iter[gene][j],], dtype = tf.float64) for feature in X_train[gene]])
                y_train_batches.append(tf.convert_to_tensor(y_train[gene][idx_iter[gene][j]], dtype = tf.float64)) 
                
            # Run one step of training (Compute loss and perform one step of gradient descent), make sure all input arguments are tf.Tensor or list of tf.Tensor
            loss, MSE = train_step(x_train_batches, y_train_batches)
            end = time()
            spent = np.round(end - begin, 1)
            
            print("Epoch {}/{}, Iter {}/{}, loss value: {}, MSE: {}, {}s used".format(str(i+1),
                                                                                str(n_epochs),
                                                                                str(j+1),
                                                                                str(n_iter),                                                                                          
                                                                                str(loss.numpy()),
                                                                                str(MSE.numpy()),
                                                                                str(spent)))
            
    # Score every gene on its test cells and append the results to the shared CSV
    # (opened in append mode, so repeated runs accumulate lines).
    r2s = test_step(models, x_test_batches, y_test_batches)
    with open("results/r2/MTLRANK_expTFAPredVelo_r2.csv","a") as f:
        for r2,gene in zip(r2s,cluster_genes):
            f.writelines("{},{},{},{}\n".format(tissue, gene, str(rep), str(r2)))
    
    return(r2s)

# Some hyperparameters 
n_rep = 3           # number of train/test replicates per tissue
num_cluster = 20    # number of gene clusters (matches the 20 clusters built in Step 4)
n_epochs = 10       # training epochs per cluster
batch_size = 256    # cells per gene in one training step (read as a global by train_test)
test_ratio = 0.1    # fraction of cells held out for testing
r2_tissue = dict()

for tissue in velo.keys():
    
    # Generate train and test sample indices for different replicates
    idx = np.arange(rpkm[tissue].shape[0])
    train_idx = []
    test_idx = []
    
    # Generate train and test cell indices for different replicates
    for rep in range(n_rep):
        idx1, idx2 = train_test_split(idx, test_size = test_ratio, shuffle = True)
        train_idx.append(idx1)
        test_idx.append(idx2)
    
    # Run every replicate over every cluster with the hyperparameters declared above.
    # The previous version was left in a partial-run state (only replicate index 2,
    # clusters 10-19, 3 epochs), which ignored n_rep, num_cluster and n_epochs.
    for rep in range(n_rep):
        
        # Train and test the genes of each cluster; up to 4 clusters run in parallel.
        # train_test appends its R^2 values to the results CSV itself.
        res = Parallel(n_jobs = 4)(delayed(train_test)(tissue, cluster, rep, n_epochs = n_epochs)
                                   for cluster in range(num_cluster))