### Step 1. Define the NN model (left uncompiled; it is trained with a custom loop)

In [1]:
from tensorflow.keras.layers import Concatenate
from tensorflow.keras.layers import Dense,Input
from tensorflow.keras import regularizers
from tensorflow.keras.layers import concatenate
from tensorflow.keras.layers import Lambda
from tensorflow.keras import Model
import tensorflow as tf

def build_model(num_tf, num_feature_type):
    """Assemble the per-gene regression network (not compiled).

    Args:
        num_tf: number of separate inputs; one input tensor is created per TF.
        num_feature_type: width of every input vector.

    Returns:
        A Keras ``Model`` taking a list of ``num_tf`` inputs and producing a
        single linear output.
    """
    # One input tensor per TF, each carrying `num_feature_type` features.
    tf_inputs = []
    for _ in range(num_tf):
        tf_inputs.append(Input(shape = (num_feature_type,)))

    # Each TF input is squeezed to a single ReLU unit by its own L1-regularized Dense layer.
    per_tf_units = []
    for tf_input in tf_inputs:
        squeeze = Dense(1, activation = "relu", kernel_regularizer = tf.keras.regularizers.l1(0.01))
        per_tf_units.append(squeeze(tf_input))

    # The per-TF units are joined and passed through three L1-regularized hidden layers.
    hidden = concatenate(per_tf_units)
    for width in (64, 32, 16):
        hidden = Dense(width, activation = "relu", kernel_regularizer = tf.keras.regularizers.l1(0.01))(hidden)

    # Linear output; no compile() call is made here.
    prediction = Dense(1, activation = "linear")(hidden)

    return Model(inputs = tf_inputs, outputs = prediction)

### Step 2. Load expression, TF activity scores, and ensembl ID mapping file

First, read the RPKM values and the velocity values. The RPKM values are scaled as log10(x + 1) on load; the velocity values are kept as stored.

In [2]:
import h5py
import numpy as np
import pandas as pd

# Per-tissue expression tables: rows are cell barcodes, columns are Ensembl IDs.
rpkm = {}
with h5py.File('data/processed/rpkm/rpkm_snare.hdf5', 'r') as f:
    for tissue in f.keys():
        group = f[tissue]

        # Expression is stored raw; apply log10(x + 1) scaling while loading.
        log_expression = np.log10(np.array(group['exp']) + 1)
        barcodes = np.array(group['barcode']).astype(str)
        ensembl_ids = np.array(group['ensembl']).astype(str)
        rpkm[tissue] = pd.DataFrame(log_expression, index = barcodes, columns = ensembl_ids)

# Per-tissue velocity tables with the same layout; values are used as stored.
velo = {}
with h5py.File('data/processed/velo/velo_snare.hdf5', 'r') as f:
    for tissue in f.keys():
        group = f[tissue]
        velocity = np.array(group['velo'])
        barcodes = np.array(group['barcode']).astype(str)
        ensembl_ids = np.array(group['ensembl']).astype(str)
        velo[tissue] = pd.DataFrame(velocity, index = barcodes, columns = ensembl_ids)

Keep only the TFs as columns of the RPKM matrix.

In [3]:
# Restrict every tissue's rpkm table to the columns whose Ensembl ID
# appears in the TF list.
tf_list = pd.read_csv("data/raw/tf/tf_list.csv", index_col = 0)
tf_ensembl_ids = tf_list["Ensembl ID"]

for tissue in rpkm.keys():
    expression = rpkm[tissue]
    use_tfs = expression.columns.intersection(tf_ensembl_ids)
    rpkm[tissue] = expression.loc[:, use_tfs]

### Step 3. Select genes for testing

Select 500 genes in each tissue for testing

In [4]:
import pandas as pd
import re

num_genes = 500      # genes drawn per tissue
num_samples = 4000   # a gene qualifies only with more than this many non-NA cells

selected_genes = {}

for tissue in rpkm.keys():

    # Count the cells with an available (non-NA) velocity for each gene.
    available_cells = velo[tissue].notna().sum(axis = 0)
    eligible = available_cells > num_samples
    genes = available_cells.index[eligible].values

    # The seed is reset for every tissue, so each draw starts from the same RNG state.
    np.random.seed(1000)
    selected_genes[tissue] = np.random.choice(genes, num_genes, replace = False)

    n_selected = selected_genes[tissue].shape[0]
    print("{} has {} selected genes.".format(tissue, str(n_selected)))

Kidney_Left has 500 selected genes.
Kidney_Right has 500 selected genes.
Lung_Right has 500 selected genes.


### Step 4. Cluster and order genes
1. Cluster the selected velocity genes by velocity values (balanced kmeans clustering)
2. In each cluster, order the genes by correlation

In [5]:
from k_means_constrained import KMeansConstrained
# Number of balanced gene clusters per tissue. The training cell iterates over
# `num_cluster` clusters, so the two values must stay equal.
N_GENE_CLUSTERS = 20
# Explicit seed so the clustering does not depend on whatever state the global
# NumPy RNG happens to be in when this cell is (re-)run.
CLUSTER_SEED = 1000

cluster_labels = dict()

for tissue in rpkm.keys():

    # Velocity matrix of the selected genes: rows are cells, cols are genes.
    gene_velo = velo[tissue].loc[:, selected_genes[tissue]]
    mat = gene_velo.values

    # Impute each missing velocity with that gene's mean over the available cells:
    # zero-fill the NaNs, then add the column mean only at the masked positions.
    na_mask = np.isnan(mat).astype(int)
    mean_mat = np.nanmean(mat, axis = 0).reshape(1, -1) * na_mask
    mat = np.nan_to_num(mat, nan = 0) + mean_mat

    # Balanced k-means over genes (hence the transpose). Cluster sizes are
    # bounded by floor/ceil of n_genes / N_GENE_CLUSTERS, passed as integers.
    n_genes = mat.shape[1]
    clusterer = KMeansConstrained(
        n_clusters = N_GENE_CLUSTERS,
        size_min = n_genes // N_GENE_CLUSTERS,
        size_max = int(np.ceil(n_genes / N_GENE_CLUSTERS)),
        random_state = CLUSTER_SEED,
    )
    labels = clusterer.fit_predict(mat.T)

    # One row per gene, single column "cluster".
    cluster_labels[tissue] = pd.DataFrame(labels, index = gene_velo.columns, columns = ["cluster"])

### Step 5. Define trace norm gradient (for loss function)

Define the nuclear norm, its (sub)gradient, the tensor trace norm, and the tensor unfolding function.

In [6]:
import tensorflow as tf

# Define nuclear norm operation
@tf.custom_gradient
def nuclear_norm(x):
    """Nuclear norm of ``x`` (sum of its singular values) with a custom gradient.

    Returns the pair expected by ``tf.custom_gradient``: the norm and the
    function that maps the upstream gradient ``dy`` to the gradient w.r.t. ``x``.
    """
    # Forward pass only needs the singular values.
    singular_values = tf.linalg.svd(x, full_matrices=False, compute_uv=False)

    # Gradient function
    def nuclear_norm_grad(dy):
        # The SVD is recomputed here with the singular vectors; the
        # (sub)gradient used is U V^T, scaled by the upstream gradient.
        _, left_vectors, right_vectors = tf.linalg.svd(x, full_matrices=False, compute_uv=True)
        return dy * tf.matmul(left_vectors, tf.transpose(right_vectors))

    return tf.reduce_sum(singular_values), nuclear_norm_grad

def TensorUnfold(A, k):
    """Unfold tensor ``A`` along axis ``k`` into a 2-D matrix.

    Axis ``k`` is moved to the front (the other axes keep their relative
    order) and all remaining axes are flattened into the second dimension.
    """
    axes = np.arange(A.shape.ndims)
    leading_axis = axes[k]
    other_axes = np.delete(axes, k).tolist()

    moved = tf.transpose(A, [leading_axis] + other_axes)
    n_rows = moved.shape[0]
    n_cols = np.prod(moved.shape[1:])
    return tf.reshape(moved, [n_rows, n_cols])

def trace_norm(X):
    """Nuclear norm of ``X`` unfolded along its last axis."""
    last_axis_unfolding = TensorUnfold(X, -1)
    return nuclear_norm(last_axis_unfolding)

### Step 7. Train and test MTLRANK models
This step may take a long time to complete.

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from collections import defaultdict
from tensorflow import keras
from joblib import Parallel,delayed
from time import time
import numpy as np
import tensorflow as tf
import pdb
import os
import datetime

# --- Experiment settings ---
n_rep, n_epoch = 3, 3   # replicate count; epoch setting
batch_size = 256        # samples per gene in one training step
test_ratio = 0.1        # fraction of cells held out by train_test_split
num_cluster = 20        # gene clusters iterated over per tissue

# --- Loss and optimizer ---
# Mean squared error and Adam (learning rate 0.01).
mse = tf.keras.losses.MeanSquaredError()
optimizer = tf.keras.optimizers.Adam(learning_rate = 0.01)

def _standardize_if_variable(column):
    """Z-score a 1-D array; return it unchanged when its standard deviation is zero."""
    spread = np.std(column)
    if spread != 0:
        return (column - np.mean(column)) / spread
    return column


def _load_tf_signals(path):
    """Read the TF-activity summaries stored in ``path`` (``mean.csv`` and ``sum.csv``).

    Returns:
        (mean_signal, sum_signal): ``mean_signal`` is the scalar average of the
        "weight" column of ``mean.csv``; ``sum_signal`` is the array
        ``log2(weight + 1)`` of the "weight" column of ``sum.csv``.
    """
    mean_weights = pd.read_csv(path + "mean.csv", index_col = 1)
    sum_weights = pd.read_csv(path + "sum.csv", index_col = 1)
    mean_signal = mean_weights.loc[:, "weight"].values.mean()
    # NOTE(review): this stays a per-row array (as in the previous code), so the
    # assignment into the feature matrix raises ValueError unless its length
    # equals the number of kept cells. Confirm whether an average (like the mean
    # signal) or a per-barcode alignment is intended.
    sum_signal = np.log2(sum_weights.loc[:, "weight"].values + 1)
    return mean_signal, sum_signal


def _build_tf_features(expression, keep, mean_signal, sum_signal):
    """Build the 5-column feature matrix of one TF for one data split.

    Columns: 0 expression, 1 TF mean signal, 2 TF sum signal,
    3 expression * mean signal, 4 expression * sum signal.
    Rows where ``keep`` is False are dropped before the signals are filled in.
    Columns 0, 3 and 4 are z-scored (when not constant) after the products
    have been formed from the raw expression.
    """
    features = np.zeros((expression.shape[0], 5))
    features[:, 0] = expression
    features = features[keep]
    features[:, 1] = mean_signal
    features[:, 2] = sum_signal
    features[:, 3] = features[:, 0] * features[:, 1]
    features[:, 4] = features[:, 0] * features[:, 2]
    for col in (0, 3, 4):
        features[:, col] = _standardize_if_variable(features[:, col])
    return features


def train_test(tissue, cluster, rep, n_epochs):
    """Jointly train one model per gene of a cluster and append the test R2 scores to disk.

    Reads the module-level ``cluster_labels``, ``velo``, ``rpkm``, ``train_idx``,
    ``test_idx``, ``batch_size``, ``mse`` and ``optimizer``. ``train_idx[rep]`` /
    ``test_idx[rep]`` are applied positionally to both ``velo[tissue]`` and
    ``rpkm[tissue]``, so the two tables must list the cells in the same order.

    Args:
        tissue: key into ``rpkm`` / ``velo`` / ``cluster_labels``.
        cluster: cluster label whose genes are trained together.
        rep: replicate number, selecting the train/test split.
        n_epochs: number of passes over the training data.

    Returns:
        None. One line ``tissue,gene,rep,r2`` per gene is appended to the
        results CSV.
    """
    # Get velocity genes for current cluster
    cluster_genes = cluster_labels[tissue].loc[cluster_labels[tissue].iloc[:,0] == cluster,].index.values
    if cluster_genes.shape[0] == 0:
        # No gene carries this label: nothing to train or report.
        return(None)

    # For each gene 1) get train and test data, 2) standardize train and test data.
    X_train = defaultdict(list)
    X_test = defaultdict(list)
    y_train = dict()
    y_test = dict()

    print("Preparing features ... {}".format(datetime.datetime.now().ctime()))
    for gene in cluster_genes:

        gene_velocity = velo[tissue][gene].values
        y_train[gene] = gene_velocity[train_idx[rep]]
        y_test[gene] = gene_velocity[test_idx[rep]]

        # Remove NA entries (the same masks are applied to the features below).
        select_train = ~np.isnan(y_train[gene])
        select_test = ~np.isnan(y_test[gene])
        y_train[gene] = y_train[gene][select_train]
        y_test[gene] = y_test[gene][select_test]

        # Standardize y; train and test are each scaled with their own statistics.
        y_train[gene] = (y_train[gene] - np.mean(y_train[gene]))/np.std(y_train[gene])
        y_test[gene] = (y_test[gene] - np.mean(y_test[gene]))/np.std(y_test[gene])

        # Generate train/test inputs for current gene: one feature matrix per TF.
        for TF in rpkm[tissue].columns:
            expression = rpkm[tissue][TF].values

            # Use TF activities if they exist (tf from rpkm matrix), otherwise specify them as zero.
            path = "data/tf_activity_snare/{}/{}/{}/".format(tissue,TF,gene)
            if os.path.isdir(path):
                # The averaged mean signal is used for BOTH splits; the test
                # split used to receive the raw per-cell array instead.
                mean_signal, sum_signal = _load_tf_signals(path)
            else:
                mean_signal, sum_signal = 0.0, 0.0

            X_train[gene].append(_build_tf_features(expression[train_idx[rep]], select_train, mean_signal, sum_signal))
            X_test[gene].append(_build_tf_features(expression[test_idx[rep]], select_test, mean_signal, sum_signal))

    # Train by trace norm loss.
    print("Setting up models...{}".format(datetime.datetime.now().ctime()))
    # Build models without compilation. Each model corresponds to one gene in the cluster.
    num_tf = rpkm[tissue].shape[1]
    models = [build_model(num_tf = num_tf, num_feature_type = 5) for _ in range(cluster_genes.shape[0])]

    # Fresh optimizer with the configured hyper-parameters. A new instance per
    # call keeps optimizer state from leaking between clusters/replicates and
    # avoids reusing one optimizer on variables it was not built for.
    step_optimizer = type(optimizer).from_config(optimizer.get_config())

    # Positions, inside each model's trainable_weights, of the tensors coupled
    # through the trace norm. Presumably the kernels of the first two shared
    # hidden layers (each per-TF Dense contributes a kernel and a bias first) -
    # TODO confirm against the weight ordering of the Keras version in use.
    shared_weight_positions = [num_tf*2, num_tf*2+2]

    # Get test batches
    print("Running training...{}".format(datetime.datetime.now().ctime()))
    x_test_batches = [X_test[gene] for gene in cluster_genes]
    y_test_batches = [y_test[gene] for gene in cluster_genes]

    # One step of training
    def train_step(x_batches, y_batches):

        # Record how loss value is computed for performing automatic differential later.
        with tf.GradientTape() as tape:

            # Run one round of forward pass for all models
            y_pred = [models[k](x, training = True) for k,x in enumerate(x_batches)]

            # Compute MSE loss
            MSE = [tf.reduce_mean(mse(y, y_pred[k])) for k,y in enumerate(y_batches)]

            # Stack the selected weights of all models along a new last axis and
            # compute the trace norm of each stacked tensor.
            sharable_weights = [tf.stack([model.trainable_weights[pos] for model in models], axis = -1)
                                for pos in shared_weight_positions]
            tracenorm = [trace_norm(weights) for weights in sharable_weights]

            # Compute final loss. loss = MSE + lambda*tracenorm. (lambda = 0.01 for convenience)
            loss = tf.reduce_mean(MSE) + tf.math.multiply(0.01, tf.reduce_mean(tracenorm))

        # 'unconnected_gradients=tf.UnconnectedGradients.ZERO' makes the gradient
        # of any weight that does not contribute to the loss a zero tensor
        # instead of None.
        grads_all = tape.gradient(loss,
                              [model.trainable_weights for model in models],
                              unconnected_gradients=tf.UnconnectedGradients.ZERO)

        # Run one step of gradient descent: all models are updated in a single
        # apply_gradients call.
        grads_and_vars = [(grad, weight)
                          for grads, model in zip(grads_all, models)
                          for grad, weight in zip(grads, model.trainable_weights)]
        step_optimizer.apply_gradients(grads_and_vars)

        return(loss, tf.reduce_mean(MSE))

    def test_step(models, x_batches, y_batches):
        r2s = []
        for x,y,model in zip(x_batches,y_batches,models):
            y_pred = model.predict(x)
            r2s.append(r2_score(y,y_pred))
        return(r2s)

    # Number of batches is dictated by the gene with the most training examples.
    # For other genes, examples are resampled (indices repeated) to that size.
    max_size = max(y_train[gene].shape[0] for gene in cluster_genes)
    split_points = np.arange(batch_size, max_size, batch_size)
    n_iter = split_points.shape[0] + 1

    # Train MTL models
    for i in range(n_epochs):

        # Get sample indices for each training step: a fresh shuffle per gene and epoch.
        idx_iter = dict()
        for gene in cluster_genes:
            idx = np.arange(y_train[gene].shape[0])
            np.random.shuffle(idx)
            idx_iter[gene] = np.split(np.resize(idx, max_size), split_points)

        # Training loop
        for j in range(n_iter):
            begin = time()

            # Get training data batches for each training step (one batch per gene).
            x_train_batches = []
            y_train_batches = []
            for gene in cluster_genes:
                batch_rows = idx_iter[gene][j]
                x_train_batches.append([tf.convert_to_tensor(feature[batch_rows,], dtype = tf.float64) for feature in X_train[gene]])
                y_train_batches.append(tf.convert_to_tensor(y_train[gene][batch_rows], dtype = tf.float64))

            # Run one step of training (Compute loss and perform one step of gradient descent), make sure all input arguments are tf.Tensor or list of tf.Tensor
            loss, MSE = train_step(x_train_batches, y_train_batches)
            end = time()
            spent = np.round(end - begin, 1)

            print("Epoch {}/{}, Iter {}/{}, loss value: {}, MSE: {}, {}s used".format(str(i+1),
                                                                                str(n_epochs),
                                                                                str(j+1),
                                                                                str(n_iter),
                                                                                str(loss.numpy()),
                                                                                str(MSE.numpy()),
                                                                                str(spent)))

    # Training finished, test models
    r2s = test_step(models, x_test_batches, y_test_batches)

    # Save output to file. All lines are written in one call so that output of
    # concurrently running jobs is less likely to interleave.
    lines = ["{},{},{},{}\n".format(tissue,gene,str(rep),str(r2)) for r2,gene in zip(r2s,cluster_genes)]
    with open("/results/r2/MTLRANK_expTFAPredVelo_r2_SNAREAverage.csv","a") as f:
        f.write("".join(lines))

    return(None)

for tissue in list(rpkm.keys()):

    # Positional cell indices of the current tissue.
    idx = np.arange(rpkm[tissue].shape[0])

    # train_test reads these two module-level lists, indexed by replicate.
    train_idx = []
    test_idx = []

    # Generate train and test cell indices for different replicates.
    # Seeding with the replicate number gives every replicate its own split
    # that is reproducible from run to run.
    for rep in range(n_rep):
        idx1, idx2 = train_test_split(idx, test_size = test_ratio, shuffle = True, random_state = rep)
        train_idx.append(idx1)
        test_idx.append(idx2)

    # Run this number of replicates
    for rep in range(n_rep):
        # Train and test the genes of every cluster, two clusters at a time.
        # The epoch count comes from the `n_epoch` setting; train_test returns
        # None and writes its results to disk, so nothing is collected here.
        Parallel(n_jobs = 2)(delayed(train_test)(tissue, cluster, rep, n_epochs = n_epoch)
                             for cluster in range(num_cluster))