### Read TF list and ensembl to symbol mapping file

In [1]:
import pandas as pd
# Transcription-factor table; the first CSV column is used as the index.
tf_list = pd.read_csv("data/raw/tf/tf_list.csv", index_col=0)

# Ensembl-ID -> gene-symbol table. Only the first row of each Ensembl ID is kept,
# and the table is indexed by Ensembl ID while "ensembl_id" also stays a column.
ensembl_to_symbol = (
    pd.read_csv("data/raw/id_mapping/ensembl_to_symbol.csv", index_col=0)
    .drop_duplicates(subset="ensembl_id", keep="first")
    .set_index("ensembl_id", drop=False)
)

### Load expression data and TF activity scores.

This trains models for one tissue

In [2]:
import h5py
import numpy as np
import pandas as pd

tissue = 'Lung_Right'

rpkm_path = 'data/processed/rpkm/rpkm.hdf5'
velo_path = 'data/processed/velo/velo.hdf5'

# Expression matrix: read only the gene columns whose Ensembl ID appears in tf_list.
with h5py.File(rpkm_path, 'r') as rpkm_file:
    rpkm_group = rpkm_file[tissue]
    col_names = np.array(rpkm_group['ensembl']).astype(str)
    # Positions (ascending) of the TF columns inside the stored expression matrix.
    tf_idx = np.where(np.isin(col_names, tf_list["Ensembl ID"].values))[0]
    tf_expression = np.array(rpkm_group['exp'][:, tf_idx])
    rpkm_barcodes = np.array(rpkm_group['barcode']).astype(str)

# RPKM values are transformed as log10(x + 1); rows are barcodes, columns are TF Ensembl IDs.
rpkm = pd.DataFrame(np.log10(tf_expression + 1),
                    index=rpkm_barcodes,
                    columns=col_names[tf_idx])

# Velocity matrix: every stored gene column is kept; rows are barcodes.
with h5py.File(velo_path, 'r') as velo_file:
    velo_group = velo_file[tissue]
    velo = pd.DataFrame(np.array(velo_group['velo']),
                        index=np.array(velo_group['barcode']).astype(str),
                        columns=np.array(velo_group['ensembl']).astype(str))

Read TF activity score matrices. Missing values are treated as zeros.  
This trains models for one tissue

In [3]:
import os
import pandas as pd

folder = "data/processed/tf_activity/"


def _load_tf_gene_matrix(subfolder):
    """Read one TF-gene activity CSV for `tissue` and return it transposed,
    with missing values set to zero and every value scaled as log2(x + 1)."""
    raw = pd.read_csv(folder + subfolder + tissue + "_tfGeneMat.csv", index_col=0)
    return np.log2(raw.transpose().fillna(0) + 1)


tf_gene_mat_mean = _load_tf_gene_matrix("mean_tf/")
tf_gene_mat_sum = _load_tf_gene_matrix("sum_tf/")

### Reformat the data matrices so that they share the same TFs and genes.
Make the rpkm matrix and the TF activity matrix cover the same TFs.  
Make the velo matrix and the TF activity matrix cover the same genes.

In [4]:
# Symbols used by the TF-activity matrix: its columns are matched against the
# rpkm columns (TFs) and its rows against the velo columns (velocity genes).
tfmat_cols = pd.DataFrame({'tfmat_cols': tf_gene_mat_mean.columns.values})
tfmat_rows = pd.DataFrame({'tfmat_rows': tf_gene_mat_mean.index.values})


def _shared_entries(symbol_frame, symbol_col, ensembl_ids):
    """Return the rows of ensembl_to_symbol whose gene symbol occurs in
    `symbol_frame[symbol_col]` and whose Ensembl ID occurs in `ensembl_ids`."""
    merged = ensembl_to_symbol.merge(symbol_frame,
                                     how='left',
                                     left_on='gene_symbol',
                                     right_on=symbol_col)
    symbol_found = ~merged[symbol_col].isna()
    id_available = merged["ensembl_id"].isin(ensembl_ids)
    return merged.loc[symbol_found & id_available]


# TFs present in both the rpkm matrix and the TF-activity matrix.
comm_tfs = _shared_entries(tfmat_cols, 'tfmat_cols', rpkm.columns)
# Genes present in both the velo matrix and the TF-activity matrix.
comm_genes = _shared_entries(tfmat_rows, 'tfmat_rows', velo.columns)

# Reorder the rpkm and velo columns so that the shared TFs / genes come first,
# followed by the remaining IDs (np.setdiff1d returns them sorted). The activity
# matrices built below use the same gene_order (rows) and tf_order (columns), so
# their upper-left corner holds the values taken from the TF-activity matrix and
# everything else stays zero:
#
#           TF
#        _______ __
#       | common|  |
# Genes |_______|  |
#       |__________|
tf_order = pd.concat([comm_tfs['ensembl_id'],
                      pd.Series(np.setdiff1d(rpkm.columns, comm_tfs['ensembl_id']))])
gene_order = pd.concat([comm_genes['ensembl_id'],
                        pd.Series(np.setdiff1d(velo.columns, comm_genes['ensembl_id']))])

rpkm = rpkm.loc[:, tf_order]
velo = velo.loc[:, gene_order]

n_comm_genes = comm_genes.shape[0]
n_comm_tfs = comm_tfs.shape[0]


def _padded_activity(tf_gene_mat):
    """Embed the shared block of `tf_gene_mat` into a zero matrix with one row
    per velo column and one column per rpkm column."""
    padded = np.zeros((velo.shape[1], rpkm.shape[1]))
    padded[:n_comm_genes, :n_comm_tfs] = tf_gene_mat.loc[comm_genes['tfmat_rows'],
                                                         comm_tfs['tfmat_cols']]
    return pd.DataFrame(padded, index=gene_order, columns=tf_order)


act_mat_sum = _padded_activity(tf_gene_mat_sum)
act_mat_mean = _padded_activity(tf_gene_mat_mean)

# The raw activity matrices are not needed past this point.
del tf_gene_mat_mean
del tf_gene_mat_sum

### Select genes
- For liver, we get genes with more than 1000 cells for which the velocity values are available.
- For other tissues, we get genes with more than 5000 cells for which the velocity values are available.

In [5]:
# Per gene (column of velo): number of cells with a non-missing velocity value.
cell_counts = (~velo.isna()).sum(axis=0)

# Liver uses a lower cell-count cutoff than every other tissue.
min_cells = 1000 if tissue == "Liver" else 5000
selected_genes = cell_counts.index[cell_counts > min_cells].values

print("{} has {} selected genes.".format(tissue, len(selected_genes)))

Lung_Right has 2380 selected genes.


### Cluster and order genes with balanced k-means clustering (by velocity values)
1. Cluster the selected velocity genes by velocity values (balanced kmeans clustering)
2. In each cluster, order the genes by correlation

In [None]:
from k_means_constrained import KMeansConstrained
# Velocity values of the selected genes (rows are cells, columns are genes).
selected_velo = velo.loc[:, selected_genes]
velo_values = selected_velo.values

# Impute each missing value with the mean velocity of that gene over the cells
# where it is available: zero out the NaNs, then add the gene mean only at the
# positions that were missing.
missing = np.isnan(velo_values).astype(int)
gene_means = np.nanmean(velo_values, axis=0).reshape(1, -1)
mat = pd.DataFrame(np.nan_to_num(velo_values, nan=0) + gene_means * missing,
                   columns=selected_velo.columns)

# Cluster size bounds, and the number of clusters needed at the maximum size.
c_size_min = 24
c_size_max = 25
n_clusters = int(np.ceil(selected_genes.shape[0] / c_size_max))

# Cluster the genes (one row per gene after transposing) under the size bounds.
clusterer = KMeansConstrained(n_clusters=n_clusters,
                              size_min=c_size_min,
                              size_max=c_size_max)
cluster_labels = pd.DataFrame(clusterer.fit_predict(mat.values.T),
                              index=mat.columns,
                              columns=["cluster"])

# Number of genes assigned to each cluster label.
sizes = np.unique(cluster_labels['cluster'], return_counts=True)[1]
print("Cluster sizes for {}: {}".format(tissue, sizes))

save clustering results

In [None]:
# Replicate number that goes into the output file name.
rep = 1
cluster_csv = "results/velo_clusters/{}-{}.csv".format(tissue, rep)
cluster_labels.to_csv(cluster_csv)

load clustering results and names of already trained genes

In [30]:
import os
import pandas as pd

# Replicate number that identifies which saved clustering to read back.
rep = 1
cluster_csv = "results/velo_clusters/{}-{}.csv".format(tissue, rep)
cluster_labels = pd.read_csv(cluster_csv, index_col=0)

### Standardize data

In [9]:
def _zscore(frame):
    """Subtract each column's mean and divide by that column's standard deviation."""
    return (frame - frame.mean()) / frame.std()


# Velocity: missing values are left as NaN.
velo = _zscore(velo)

# Expression: any NaN left after scaling is replaced by zero.
rpkm = _zscore(rpkm).fillna(value=0)

### Train and get models

**MTL model architecture**

In [8]:
from tensorflow.keras.layers import Concatenate
from tensorflow.keras.layers import Dense,Input
from tensorflow.keras import regularizers
from tensorflow.keras.layers import concatenate
from tensorflow.keras import Model
import tensorflow as tf

def build_mtl_model(num_tf, num_feature_type, num_genes):
    """Build one Keras model per gene, all with the same architecture.

    Each model takes `num_tf` inputs of width `num_feature_type`. Every input
    goes through its own Dense(1) ReLU layer; the results are concatenated and
    passed through Dense(64), Dense(32) and Dense(16) ReLU layers (all with an
    L1 kernel penalty of 0.01) and a final linear Dense(1) output.

    Parameters
    ----------
    num_tf : int
        Number of inputs per model.
    num_feature_type : int
        Width of each input.
    num_genes : int
        Number of models to build.

    Returns
    -------
    list
        `num_genes` separate `Model` instances (no layers are shared in the graph).
    """
    l1_strength = 0.01

    def _relu_dense(units):
        # A fresh L1 regularizer instance is created for every layer.
        return Dense(units,
                     activation="relu",
                     kernel_regularizer=tf.keras.regularizers.l1(l1_strength))

    def _single_model():
        # All inputs are created first, then one Dense(1) per input, in input order.
        tf_inputs = [Input(shape=(num_feature_type,)) for _ in range(num_tf)]
        per_tf_units = [_relu_dense(1)(tf_input) for tf_input in tf_inputs]
        hidden = concatenate(per_tf_units)
        for units in (64, 32, 16):
            hidden = _relu_dense(units)(hidden)
        prediction = Dense(1, activation="linear")(hidden)
        return Model(inputs=tf_inputs, outputs=prediction)

    return [_single_model() for _ in range(num_genes)]

**Trace norm and gradient of trace norm**  
Define the nuclear norm, its (sub)gradient, the tensor trace norm, and the tensor unfolding function.

In [9]:
import tensorflow as tf

# Define nuclear norm operation
@tf.custom_gradient
def nuclear_norm(x):
    """Return the sum of the singular values of `x`, with a hand-written gradient.

    The backward function returns the upstream gradient times U @ transpose(V),
    where U and V are the singular vectors from `tf.linalg.svd(x)`.
    """
    # Forward pass only needs the singular values.
    singular_values = tf.linalg.svd(x, full_matrices=False, compute_uv=False)

    # Gradient function
    def nuclear_norm_grad(dy):
        # The decomposition is recomputed here, this time with singular vectors.
        _, left_vectors, right_vectors = tf.linalg.svd(x, full_matrices=False, compute_uv=True)
        subgradient = tf.matmul(left_vectors, tf.transpose(right_vectors))
        return dy * subgradient

    return tf.reduce_sum(singular_values), nuclear_norm_grad

def TensorUnfold(A, k):
    """Unfold tensor `A` into a matrix along axis `k`.

    Axis `k` is moved to the front (the other axes keep their relative order)
    and all remaining axes are flattened, giving shape (A.shape[k], product of
    the other dimensions).
    """
    axes = np.arange(A.shape.ndims)
    permutation = [axes[k]] + np.delete(axes, k).tolist()
    moved = tf.transpose(A, tf.convert_to_tensor(permutation))
    matrix_shape = [moved.shape[0], np.prod(moved.shape[1:])]
    return tf.reshape(moved, tf.convert_to_tensor(matrix_shape))

def trace_norm(X):
    """Return the nuclear norm of `X` unfolded along its last axis."""
    unfolded_last_axis = TensorUnfold(X, -1)
    return nuclear_norm(unfolded_last_axis)

Train models.

In [None]:
from collections import defaultdict
from time import time
from joblib import Parallel,delayed

import datetime

def train(tissue, cluster, n_epochs, batch_size, rep):
    """Jointly train one model per gene of a velocity cluster and save the weights.

    The genes are the index entries of the module-level `cluster_labels` whose
    first column equals `cluster`. Inputs are built from the module-level
    `rpkm`, `act_mat_mean` and `act_mat_sum`; targets come from the module-level
    `velo`. The models come from `build_mtl_model` and are optimised together
    with the loss mean(per-gene MSE) + 0.01 * mean(trace norms of two weight
    tensors stacked across the models). The module-level `num_feature_type` is
    also read.

    Parameters
    ----------
    tissue : str
        Only used in the name of the saved weight files; the training data are
        taken from the module-level matrices, not selected by this argument.
    cluster : int
        Cluster label whose genes are trained.
    n_epochs : int
        Number of passes over the (resampled) training indices.
    batch_size : int
        Number of cells per gene in each training step (the last step of an
        epoch may use fewer).
    rep : int
        Only used in the name of the saved weight files.

    Returns
    -------
    None
        Weights are written to "results/full_model/{gene}-{tissue}-rep{rep}".
    """

    # Get velocity genes for current cluster
    cluster_genes = cluster_labels.loc[cluster_labels.iloc[:,0] == cluster,].index.values
    num_genes = cluster_genes.shape[0]
    num_tf = rpkm.shape[1]    

    # Prepare training data
    # X_train[gene] is a list with one (n_cells, num_feature_type) array per TF;
    # y_train[gene] is the vector of velocity targets for the same cells.
    X_train = defaultdict(list)
    y_train = defaultdict(list)

    print("Preparing features ... {}".format(datetime.datetime.now().ctime()))
    for gene in cluster_genes:

        # All cells are used for training: no held-out split is made here.
        rpkm_train = rpkm
        velo_train = velo[gene]

        # Remove NAs: keep only the cells where this gene has a velocity value.
        select = ~pd.isna(velo_train)
        velo_train = velo_train.loc[select]
        rpkm_train = rpkm_train.loc[select,:]

        # Generate training inputs for current gene (one array per TF)
        for i,TF in enumerate(rpkm_train.columns):

            # np.empty leaves the array uninitialised; columns 0-2 are filled below,
            # so num_feature_type is expected to be 3.
            X_train[gene].append(np.empty((rpkm_train.shape[0], num_feature_type)))

            # For current tf, get expression rpkms
            X_train[gene][-1][:,0] = rpkm_train[TF].values

            # Get TF mean signals and TF sum signals
            # (one scalar per gene/TF pair, broadcast to every cell)
            X_train[gene][-1][:,1] = act_mat_mean.loc[gene,TF]
            X_train[gene][-1][:,2] = act_mat_sum.loc[gene,TF]

        y_train[gene] = velo_train.values
    '''
    Train by trace norm loss.
    '''
    print("Setting up models...{}".format(datetime.datetime.now().ctime()))
    # Build MTL model
    models = build_mtl_model(num_tf = num_tf,
                             num_feature_type = num_feature_type,
                             num_genes = num_genes
                            )

    # MSE loss function and adam optimizer
    # (a single optimizer instance updates the weights of every model)
    mse = tf.keras.losses.MeanSquaredError()
    optimizer = tf.keras.optimizers.Adam(learning_rate = 0.001)

    # Input signatures for training step as a tf.function
    # Each dataset element is one gene's batch: x is (num_tf, batch, num_feature_type), y is (batch,).
    x_signatures = tf.data.DatasetSpec(tf.TensorSpec(shape = (num_tf, None, num_feature_type), dtype = tf.float32))
    y_signatures = tf.data.DatasetSpec(tf.TensorSpec(shape = (None,), dtype = tf.float32))

    # training step and MTL loss function defined here
    @tf.function(input_signature = (x_signatures, y_signatures))
    def train_step(x_batches, y_batches):

        # Record how loss value is computed for performing automatic differentiation later.
        with tf.GradientTape() as tape:

            # Run one round of forward pass for all models
            # (tf.unstack splits a gene's batch into the per-TF inputs of its model)
            y_pred = tf.TensorArray(tf.float32, size = num_genes)
            for k,X in zip(tuple(range(num_genes)), x_batches):
                y_pred = y_pred.write(k, models[k](tf.unstack(X), training = True))

            # Compute MSE loss
            MSE = tf.TensorArray(tf.float32, size = num_genes)
            for k,y in zip(tuple(range(num_genes)), y_batches):
                MSE = MSE.write(k, tf.reduce_mean(mse(y, y_pred.read(k))))
            MSE = MSE.gather(tf.range(num_genes))

            # Get the weights of the first and second FC layers. These layers are shared and their tracenorm loss will be calculated.
            # NOTE(review): `sharable_weights` is not referenced again in this function.
            sharable_weights = tf.TensorArray(tf.float32, size = 2)

            # Concat the first sharable FC layer from all models
            # NOTE(review): index num_tf*2 presumably skips a kernel and a bias for each of the
            # num_tf per-input Dense(1) layers — confirm against the weight order of build_mtl_model.
            stacked_layers1 = tf.TensorArray(tf.float32, num_genes)
            for k,model in enumerate(models):
                stacked_layers1 = stacked_layers1.write(k, model.trainable_weights[num_tf*2])
            stacked_layers1 = stacked_layers1.gather(tf.range(num_genes))

            # Concat the second sharable FC layer from all models
            # NOTE(review): the +2 presumably steps over the first layer's kernel and bias — confirm.
            stacked_layers2 = tf.TensorArray(tf.float32, num_genes)
            for k,model in enumerate(models):
                stacked_layers2 = stacked_layers2.write(k, model.trainable_weights[num_tf*2 + 2])
            stacked_layers2 = stacked_layers2.gather(tf.range(num_genes))

            # Compute tracenorm from the two concatenated layer weight matrices
            tracenorm = tf.TensorArray(tf.float32, 2)    
            tracenorm = tracenorm.write(0,trace_norm(stacked_layers1))
            tracenorm = tracenorm.write(1,trace_norm(stacked_layers2))
            tracenorm = tracenorm.gather(tf.range(2))

            # Compute final loss. loss = MSE + lambda*tracenorm. (lambda = 0.01 for convenience)
            loss = tf.reduce_mean(MSE) + tf.math.multiply(0.01, tf.reduce_mean(tracenorm))

        '''
        Here gradients are calculated 
        'unconnected_gradients=tf.UnconnectedGradients.ZERO' ensures that
        gradients for other task-specific layer other than the current one
        are zeros.
        '''
        grads_all = tape.gradient(loss,
                              [model.trainable_weights for model in models],
                              unconnected_gradients=tf.UnconnectedGradients.ZERO)

        # Run one step of gradient descent by updating the weights.
        for grad,model in zip(grads_all, models):
            optimizer.apply_gradients(zip(grad, model.trainable_weights))

        # Returns the total loss and the mean MSE across genes (without the trace-norm term).
        return(loss, tf.reduce_mean(MSE))

    # Training epochs
    for i in range(n_epochs):

        # Generate sampling indices for each batch
        # Number of batches are dependent on the gene with the most training examples (N).
        # For other genes, examples are resampled if they have smaller sample sizes.
        tmp = [(gene,sample.shape[0]) for gene,sample in y_train.items()]
        tmp = sorted(tmp, key = lambda x: x[1], reverse = True)
        max_size = tmp[0][1]

        # Get sample indices for each training step
        idx_iter = dict()
        for gene,size in tmp:
            idx = np.arange(size)
            np.random.shuffle(idx)
            # np.resize repeats the shuffled indices cyclically until there are max_size of them.
            idx = np.resize(idx, max_size)
            # Cut into consecutive chunks of batch_size; the last chunk may be shorter.
            idx_iter[gene] = np.split(idx, np.arange(batch_size,idx.shape[0],batch_size))
        # `gene` is the value left over from the loop above; every gene has the same
        # number of chunks because all index arrays were resized to max_size.
        n_iter = len(idx_iter[gene])

        # Training loop
        for j in range(n_iter):
            begin = time()

            # Get training data batches for each training step (one batch per gene).
            x_train_batches = []
            y_train_batches = []
            for k,gene in enumerate(cluster_genes):
                x_train_batches.append([tf.cast(feature[idx_iter[gene][j],], dtype = tf.float32) for feature in X_train[gene]])
                y_train_batches.append(tf.cast(y_train[gene][idx_iter[gene][j]], dtype = tf.float32)) 

            # Wrap the data using tf.data.Dataset
            # (sliced along the first axis, so each dataset element is one gene's batch)
            x_batches = tf.data.Dataset.from_tensor_slices(x_train_batches)
            y_batches = tf.data.Dataset.from_tensor_slices(y_train_batches)

            # Run one step of training
            loss, MSE = train_step(x_batches, y_batches)
            end = time()
            spent = np.round(end - begin, 1)

            print("Epoch {}/{}, Iter {}/{}, loss value: {}, MSE: {}, {}s used".format(str(i+1),
                                                                                str(n_epochs),
                                                                                str(j+1),
                                                                                str(n_iter),                                                                                          
                                                                                str(loss.numpy()),
                                                                                str(MSE.numpy()),
                                                                                str(spent)))
    '''
    Save models
    '''
    print("Saving models ... {}".format(datetime.datetime.now().ctime()))
    # One weight file per gene, named by gene, tissue and replicate.
    for k,gene in enumerate(cluster_genes):
        models[k].save_weights("results/full_model/{}-{}-rep{}".format(gene,tissue,rep))
    return(None)

# Training configuration.
n_epochs = 4
batch_size = 256
# `tissue` is only used by train() to name the saved weight files; the training
# data come from the matrices that were loaded earlier in the notebook, so this
# value must match the tissue used when loading them.
tissue = 'Lung_Right'
num_clusters = int(cluster_labels['cluster'].max() + 1)
# Width of each per-TF input; train() fills exactly three feature columns.
num_feature_type = 3
# Number of independent training repetitions per cluster.
n_reps = 5

# Train the models of every cluster, once per rep.
# The (cluster, rep) pairs are enumerated explicitly, rep-major. The earlier
# zip(np.tile(clusters, 1), np.repeat(reps, num_clusters)) paired a sequence of
# length num_clusters with one of length 5 * num_clusters; zip stops at the
# shorter one, so only rep 0 was ever trained.
jobs = [(cluster, rep) for rep in range(n_reps) for cluster in range(num_clusters)]
res = Parallel(n_jobs = 16)(delayed(train)(tissue, cluster, n_epochs, batch_size, rep)
                            for cluster, rep in jobs)