### Step 1. Define and compile NN model

In [2]:
from tensorflow.keras.layers import Concatenate
from tensorflow.keras.layers import Dense,Input
from tensorflow.keras import regularizers
from tensorflow.keras.layers import concatenate
from tensorflow.keras.layers import Lambda
from tensorflow.keras import Model
import tensorflow as tf

def build_model(num_tf,num_feature_type):
    """Build and compile the baseline multi-input regression network.

    The network takes one input vector per TF. Each input is reduced to a
    single ReLU unit, the per-TF units are concatenated, and the result is
    passed through 64-, 32- and 16-unit ReLU layers to one linear output.
    Every ReLU Dense layer carries an L1 kernel penalty of 0.01.

    Parameters
    ----------
    num_tf : int
        Number of input branches (one per TF).
    num_feature_type : int
        Length of the feature vector fed to each branch.

    Returns
    -------
    tensorflow.keras.Model
        Model compiled with mean-squared-error loss, a mean-absolute-error
        metric and the "SGD" optimizer.
    """
    l1_strength = 0.01

    def _l1_dense(units):
        # A fresh layer (and regularizer) per call so that no weights are shared.
        return Dense(units,
                     activation = "relu",
                     kernel_regularizer = tf.keras.regularizers.l1(l1_strength))

    inputs = [Input(shape = (num_feature_type,)) for _ in range(num_tf)]
    per_tf_units = [_l1_dense(1)(inp) for inp in inputs]

    # Concatenating a single tensor is rejected by some Keras versions, and is
    # a no-op anyway, so the lone branch is used directly in that case.
    if len(per_tf_units) == 1:
        merged = per_tf_units[0]
    else:
        merged = concatenate(per_tf_units)

    out = _l1_dense(64)(merged)
    out = _l1_dense(32)(out)
    out = _l1_dense(16)(out)
    out = Dense(1, activation = "linear")(out)

    model = Model(inputs = inputs, outputs = out)
    # `metrics` must be a list: recent Keras releases raise on a bare string.
    model.compile(loss = 'mean_squared_error',
                  metrics = ['mean_absolute_error'],
                  optimizer = "SGD")
    return(model)

2021-12-12 21:04:15.417435: I tensorflow/stream_executor/platform/default/dso_loader.cc:54] Successfully opened dynamic library libcudart.so.11.0


### Step 2. Load expression, TF activity scores, and the Ensembl ID mapping file

First, read the RPKM values and the velocity values. The RPKM values are scaled as log10(RPKM + 1); the velocity values are loaded unchanged.

In [3]:
import h5py
import numpy as np
import pandas as pd

# Expression per tissue: rows are indexed by barcode, columns by Ensembl ID,
# values are log10(exp + 1).
rpkm = {}
with h5py.File('data/processed/rpkm/rpkm.hdf5', 'r') as rpkm_file:
    for tissue in rpkm_file.keys():
        group = rpkm_file[tissue]
        log_expression = np.log10(np.array(group['exp']) + 1)
        barcodes = np.array(group['barcode']).astype(str)
        ensembl_ids = np.array(group['ensembl']).astype(str)
        rpkm[tissue] = pd.DataFrame(log_expression,
                                    index = barcodes,
                                    columns = ensembl_ids)

# Velocity per tissue, same layout as above, values stored as-is.
velo = {}
with h5py.File('data/processed/velo/velo.hdf5', 'r') as velo_file:
    for tissue in velo_file.keys():
        group = velo_file[tissue]
        velocities = np.array(group['velo'])
        barcodes = np.array(group['barcode']).astype(str)
        ensembl_ids = np.array(group['ensembl']).astype(str)
        velo[tissue] = pd.DataFrame(velocities,
                                    index = barcodes,
                                    columns = ensembl_ids)

Read TF activity score matrices. Missing values are treated as zeros.

In [4]:
import os
import pandas as pd

folder = "data/processed/tf_activity/"

# Mean TF activity per tissue. The tissue name is the file name with its last
# "_"-separated token removed; each table is transposed after loading and
# missing scores become 0.
tf_gene_mat_mean = {}
for fname in os.listdir(folder + "mean_tf/"):
    tissue = fname.rpartition("_")[0]
    mean_scores = pd.read_csv("{}/mean_tf/{}".format(folder,fname),index_col = 0).transpose()
    tf_gene_mat_mean[tissue] = mean_scores.fillna(0)

# Summed TF activity per tissue, loaded the same way and then scaled as
# log2(x + 1).
tf_gene_mat_sum = {}
for fname in os.listdir(folder + "sum_tf/"):
    tissue = fname.rpartition("_")[0]
    sum_scores = pd.read_csv("{}/sum_tf/{}".format(folder,fname),index_col = 0).transpose()
    tf_gene_mat_sum[tissue] = np.log2(sum_scores.fillna(0) + 1)

Load ensembl to hgnc mapping file

In [6]:
# Keep the first row for every Ensembl ID and index the table by that ID,
# leaving the "ensembl_id" column in place.
ensembl_to_symbol = (
    pd.read_csv("data/raw/id_mapping/ensembl_to_symbol.csv", index_col = 0)
    .drop_duplicates(subset = "ensembl_id", keep = "first")
    .set_index("ensembl_id", drop = False)
)

### Step 3. Select genes for testing
For each tissue, train and test the model with reps.

Select 500 genes in each tissue for testing

In [7]:
import pandas as pd
import re

l1_res = pd.read_csv("results/r2/lasso_r2.csv", index_col=0)

num_genes = 500      # maximum number of genes sampled per tissue
num_samples = 4000   # a gene needs more than this many non-NaN velocity values

# Normalise tissue names so they can be used as keys into `velo`:
# whitespace runs become "_" and parentheses are removed.
# `regex=True` is spelled out because the patterns are regular expressions;
# pandas >= 2.0 treats patterns as literal text by default, which would leave
# the names unchanged and make the `velo[tissue]` lookups below fail.
l1_res["tissue"] = (
    l1_res["tissue"]
    .str.replace(r"\s+", "_", regex=True)
    .str.replace(r"\(", "", regex=True)
    .str.replace(r"\)", "", regex=True)
)

selected_genes = dict()

for tissue,df in l1_res.groupby("tissue"):

    # Keep genes whose velocity is available (non-NaN) in more than
    # `num_samples` cells.
    sample_counts = velo[tissue].notna().sum(axis = 0)
    genes = sample_counts.index[sample_counts > num_samples].values
    df = df.loc[df["gene_ensembl"].isin(genes)]

    # Randomly pick up to `num_genes` genes. The seed is reset for every
    # tissue so each tissue's draw is reproducible on its own.
    np.random.seed(1000)
    if num_genes < df.shape[0]:
        idx = np.random.choice(np.arange(df.shape[0]), size=num_genes, replace=False)
    else:
        idx = np.arange(df.shape[0])
    selected_genes[tissue] = df.iloc[idx]["gene_ensembl"].values
    print("{} has {} selected genes.".format(tissue, str(selected_genes[tissue].shape[0])))

  l1_res["tissue"] = l1_res['tissue'].str.replace('\s+',"_")
  l1_res["tissue"] = l1_res['tissue'].str.replace('\(',"")
  l1_res["tissue"] = l1_res['tissue'].str.replace('\)',"")


Heart has 500 selected genes.
Kidney_Left has 500 selected genes.
Large_Intestine has 500 selected genes.
Liver has 500 selected genes.
Lung_Right has 500 selected genes.
Spleen has 500 selected genes.


In [8]:
# Per tissue, restrict the expression matrix to the columns whose Ensembl ID
# appears in the TF list.
tf_list = pd.read_csv("data/raw/tf/tf_list.csv", index_col = 0)

rpkm_tf = {}
for tissue in velo:
    expression = rpkm[tissue]
    use_tfs = expression.columns.intersection(tf_list["Ensembl ID"])
    rpkm_tf[tissue] = expression.loc[:, use_tfs]

### Step 4. Reformat the data matrices so that they share consistent TFs and genes
Make rpkm matrix and tf activity matrix have the same tfs  
Make velo matrix and tf activity matrix have the same genes

In [9]:
act_mat_sum = {}
act_mat_mean = {}

for tissue in tf_gene_mat_mean:

    # TFs present both in the TF activity matrix (columns, matched on gene
    # symbol) and in the TF expression matrix (columns, Ensembl IDs).
    tfmat_cols = pd.DataFrame(tf_gene_mat_mean[tissue].columns.values, columns = ['tfmat_cols'])
    comm_tfs = ensembl_to_symbol.merge(tfmat_cols,
                                       how = 'left',
                                       left_on = 'gene_symbol',
                                       right_on = 'tfmat_cols')
    tf_mask = comm_tfs['tfmat_cols'].notna() & comm_tfs['ensembl_id'].isin(rpkm_tf[tissue].columns)
    comm_tfs = comm_tfs.loc[tf_mask]

    # Genes present both in the TF activity matrix (rows, matched on gene
    # symbol) and in the velocity matrix (columns, Ensembl IDs).
    tfmat_rows = pd.DataFrame(tf_gene_mat_mean[tissue].index.values, columns = ['tfmat_rows'])
    comm_genes = ensembl_to_symbol.merge(tfmat_rows,
                                         how = 'left',
                                         left_on = 'gene_symbol',
                                         right_on = 'tfmat_rows')
    gene_mask = comm_genes['tfmat_rows'].notna() & comm_genes['ensembl_id'].isin(velo[tissue].columns)
    comm_genes = comm_genes.loc[gene_mask]

    # Reorder the rpkm and velo matrices so the common TFs / genes come first,
    # followed by the remaining ones. The TF-gene activity matrices use the
    # same tf_order and gene_order, so their upper-left corner holds the
    # (common genes x common TFs) scores and everything else stays zero:
    #
    #               TF
    #            _______ __
    #           | common|  |
    #     Genes |_______|  |
    #           |__________|
    other_tfs = pd.Series(np.setdiff1d(rpkm_tf[tissue].columns, comm_tfs['ensembl_id']))
    other_genes = pd.Series(np.setdiff1d(velo[tissue].columns, comm_genes['ensembl_id']))
    tf_order = pd.concat([comm_tfs['ensembl_id'], other_tfs])
    gene_order = pd.concat([comm_genes['ensembl_id'], other_genes])

    rpkm_tf[tissue] = rpkm_tf[tissue].loc[:,tf_order]
    velo[tissue] = velo[tissue].loc[:,gene_order]
    #cluster_labels[tissue] = cluster_labels[tissue].loc[gene_order,:]

    n_comm_genes = comm_genes.shape[0]
    n_comm_tfs = comm_tfs.shape[0]
    full_shape = (velo[tissue].shape[1], rpkm_tf[tissue].shape[1])

    sum_scores = np.zeros(full_shape)
    mean_scores = np.zeros(full_shape)
    sum_scores[:n_comm_genes, :n_comm_tfs] = tf_gene_mat_sum[tissue].loc[comm_genes['tfmat_rows'],comm_tfs['tfmat_cols']]
    mean_scores[:n_comm_genes, :n_comm_tfs] = tf_gene_mat_mean[tissue].loc[comm_genes['tfmat_rows'],comm_tfs['tfmat_cols']]

    act_mat_sum[tissue] = pd.DataFrame(sum_scores, index = gene_order, columns = tf_order)
    act_mat_mean[tissue] = pd.DataFrame(mean_scores, index = gene_order, columns = tf_order)

# The raw activity tables are no longer needed once the aligned copies exist.
del tf_gene_mat_mean, tf_gene_mat_sum

### Step 5. Train and test baseline NN models  
This step may take a long time to complete.

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from collections import defaultdict
from tensorflow import keras
from joblib import Parallel,delayed
import numpy as np
import tensorflow as tf
import pdb

# Run configuration for the baseline NN experiment.
n_rep = 3          # number of train/test splits drawn per tissue and evaluated per gene
n_epoch = 15       # epochs passed to model.fit
n_jobs = 15        # worker count passed to joblib.Parallel
batch_size = 256   # batch size passed to model.fit
test_ratio = 0.1   # test_size passed to train_test_split
r2_all = dict()    # tissue -> DataFrame of R^2 scores (rows: repetition, columns: gene)

def _standardize(values):
    """Return `values` centred on its mean and, if possible, scaled to unit std.

    When the standard deviation is zero (constant input) only the centring is
    applied, so the result is all zeros instead of NaN/inf from a 0/0 division.
    """
    values = np.asarray(values, dtype = float)
    centered = values - np.mean(values)
    scale = np.std(values)
    if scale > 0:
        return centered / scale
    return centered


def train_test_gene(tissue, gene):
    """Train and evaluate the baseline NN for one gene in one tissue.

    For each of the `n_rep` repetitions the function builds, for every TF other
    than `gene`, a 5-column feature block:

        0: TF expression (standardized)
        1: TF mean activity signal for (gene, TF)
        2: TF sum activity signal for (gene, TF)
        3: expression * mean signal (standardized)
        4: expression * sum signal (standardized)

    The products are computed from the raw expression before column 0 is
    standardized. A fresh model is then built, fit on the training rows and
    scored with R^2 on the test rows.

    Reads the notebook-level names `rpkm`, `rpkm_tf`, `act_mat_mean`,
    `act_mat_sum`, `train_idx`, `test_idx`, `n_rep`, `n_epoch` and
    `batch_size`.

    Parameters
    ----------
    tissue : str
        Key into the per-tissue dictionaries.
    gene : str
        Column of `rpkm[tissue]` used as the target; also the row label
        looked up in the activity matrices.

    Returns
    -------
    numpy.ndarray
        Array of shape (n_rep,) with the R^2 of each repetition.

    Side effects
    ------------
    Appends one line "tissue, gene, mean R^2, rep" to
    results/r2/basenn_expTFAPredExp_r2.csv.
    """
    n_feature_type = 5
    standardized_cols = (0, 3, 4)

    # To avoid self-prediction, the target gene is never used as a TF input.
    predictor_tfs = [TF for TF in rpkm_tf[tissue].columns if TF != gene]
    target_expression = rpkm[tissue][gene].values

    r2 = np.zeros((n_rep,))
    for rep in range(n_rep):
        rows_train = train_idx[rep]
        rows_test = test_idx[rep]

        # NOTE(review): train and test splits are each standardized with their
        # own statistics (as in the original code) -- confirm this is intended.
        y_train = _standardize(target_expression[rows_train])
        y_test = _standardize(target_expression[rows_test])

        X_all_train = []
        X_all_test = []

        for TF in predictor_tfs:
            tf_expression = rpkm_tf[tissue][TF].values
            mean_signal = act_mat_mean[tissue].loc[gene, TF]
            sum_signal = act_mat_sum[tissue].loc[gene, TF]

            for rows, collected in ((rows_train, X_all_train), (rows_test, X_all_test)):
                features = np.empty((rows.shape[0], n_feature_type))
                features[:,0] = tf_expression[rows]
                features[:,1] = mean_signal
                features[:,2] = sum_signal
                features[:,3] = features[:,0] * features[:,1]
                features[:,4] = features[:,0] * features[:,2]

                # Standardize the expression and the two product features.
                # Testing the std (not the sum) is what actually prevents a
                # division by zero for constant columns.
                for col in standardized_cols:
                    features[:,col] = _standardize(features[:,col])

                collected.append(features)

        # Compile model
        model = build_model(num_tf = len(X_all_train), num_feature_type = n_feature_type)

        # Train model
        model.fit(
            x = X_all_train,
            y = y_train,
            batch_size = batch_size,
            epochs = n_epoch,
            verbose = 0
        )

        # Test model
        y_pred = model.predict(x = X_all_test)
        r2[rep] = r2_score(y_test, y_pred)

    # Save output to file. `rep` is the index of the last repetition here.
    with open("results/r2/basenn_expTFAPredExp_r2.csv","a") as f:
        f.write("{}, {}, {}, {}\n".format(tissue,gene,str(np.mean(r2)),str(rep)))

    return(r2)

# `comm_tissues` is not created by any cell shown in this notebook. If it is
# missing (e.g. after Restart & Run All), fall back to the tissues that have
# every input needed below; an existing definition is left untouched.
if "comm_tissues" not in globals():
    comm_tissues = sorted(set(rpkm_tf) & set(velo) & set(act_mat_mean)
                          & set(act_mat_sum) & set(selected_genes))

# NOTE(review): the [-1:] slice restricts the run to the last tissue only --
# confirm whether all tissues should be processed.
for tissue in comm_tissues[-1:]:

    # Generate train and test sample indices. Each repetition gets its own
    # fixed seed so the splits are reproducible across runs.
    idx = np.arange(rpkm_tf[tissue].shape[0])
    train_idx = []
    test_idx = []
    for rep in range(n_rep):
        idx1, idx2 = train_test_split(idx,
                                      test_size = test_ratio,
                                      shuffle = True,
                                      random_state = 1000 + rep)
        train_idx.append(idx1)
        test_idx.append(idx2)

    # Train and test each gene. The unique gene list is computed once so the
    # job order and the result columns are guaranteed to match.
    genes = np.unique(selected_genes[tissue])
    res = Parallel(n_jobs = n_jobs)(delayed(train_test_gene)(tissue, gene)
                              for gene in genes)

    # One row per repetition, one column per gene.
    res = np.vstack(res).T
    r2_all[tissue] = pd.DataFrame(res, index = np.arange(n_rep) + 1, columns = genes)

2021-12-12 21:24:25.029141: I tensorflow/stream_executor/platform/default/dso_loader.cc:54] Successfully opened dynamic library libcudart.so.11.0
2021-12-12 21:24:25.067160: I tensorflow/stream_executor/platform/default/dso_loader.cc:54] Successfully opened dynamic library libcudart.so.11.0
2021-12-12 21:24:25.115477: I tensorflow/stream_executor/platform/default/dso_loader.cc:54] Successfully opened dynamic library libcudart.so.11.0
2021-12-12 21:24:25.146372: I tensorflow/stream_executor/platform/default/dso_loader.cc:54] Successfully opened dynamic library libcudart.so.11.0
2021-12-12 21:24:25.177896: I tensorflow/stream_executor/platform/default/dso_loader.cc:54] Successfully opened dynamic library libcudart.so.11.0
2021-12-12 21:24:25.209414: I tensorflow/stream_executor/platform/default/dso_loader.cc:54] Successfully opened dynamic library libcudart.so.11.0
2021-12-12 21:24:25.269367: I tensorflow/stream_executor/platform/default/dso_loader.cc:54] Successfully opened dynamic libr