### Step 1. Define and compile baseline NN model

In [1]:
from tensorflow.keras.layers import Concatenate
from tensorflow.keras.layers import Dense,Input
from tensorflow.keras import regularizers
from tensorflow.keras.layers import concatenate
from tensorflow.keras.layers import Lambda
from tensorflow.keras import Model
import tensorflow as tf

def build_model(num_tf,num_feature_type):
    """Build and compile the baseline feed-forward network.

    The model takes ``num_tf`` separate inputs, each a vector of
    ``num_feature_type`` values. Every input is reduced to a single value by
    its own L1-regularized, single-unit ReLU layer; the ``num_tf`` values are
    concatenated and passed through 64-, 32- and 16-unit ReLU layers to one
    linear output unit.

    Args:
        num_tf: Number of model inputs.
        num_feature_type: Length of each input vector.

    Returns:
        A compiled Keras ``Model`` using mean-squared-error loss, the
        mean-absolute-error metric and the SGD optimizer.
    """
    inputs = [Input(shape = (num_feature_type,)) for _ in range(num_tf)]

    # One single-unit layer per input, each with its own weights.
    per_input = [Dense(1, activation = "relu", kernel_regularizer = tf.keras.regularizers.l1(0.01))(inp) for inp in inputs]
    merged = concatenate(per_input)

    out = Dense(64, activation = "relu", kernel_regularizer = tf.keras.regularizers.l1(0.01))(merged)
    out = Dense(32, activation = "relu", kernel_regularizer = tf.keras.regularizers.l1(0.01))(out)
    out = Dense(16, activation = "relu", kernel_regularizer = tf.keras.regularizers.l1(0.01))(out)
    out = Dense(1, activation = "linear")(out)

    model = Model(inputs = inputs, outputs = out)
    # `metrics` is documented as a list; a bare string is rejected by newer
    # Keras releases, so wrap it.
    model.compile(loss = 'mean_squared_error',
                  metrics = ['mean_absolute_error'],
                  optimizer = "SGD")
    return model

2021-12-15 22:05:41.477873: I tensorflow/stream_executor/platform/default/dso_loader.cc:54] Successfully opened dynamic library libcudart.so.11.0


### Step 2. Load expression, TF activity scores, and Ensembl-to-symbol mappings

First, read the RPKM values and the velocity values. The RPKM values are scaled as log10(x + 1); the velocity values are loaded as they are.

In [3]:
import h5py
import numpy as np
import pandas as pd

def _read_tissue_frames(path, dataset, transform = None):
    """Read one cells-by-genes DataFrame per tissue group of an HDF5 file.

    For every top-level group, the ``dataset`` array supplies the values,
    ``barcode`` the row labels and ``ensembl`` the column labels (both cast to
    ``str``). ``transform``, when given, is applied to the value array first.
    """
    frames = dict()
    with h5py.File(path, 'r') as f:
        for tissue in f.keys():
            group = f[tissue]
            values = np.array(group[dataset])
            if transform is not None:
                values = transform(values)
            frames[tissue] = pd.DataFrame(values,
                                          index = np.array(group['barcode']).astype(str),
                                          columns = np.array(group['ensembl']).astype(str))
    return frames


# RPKM values, scaled as log10(x + 1).
rpkm = _read_tissue_frames('data/processed/rpkm/rpkm.hdf5', 'exp',
                           transform = lambda values: np.log10(values + 1))

# Velocity values, kept unscaled.
velo = _read_tissue_frames('data/processed/velo/velo.hdf5', 'velo')

Read TF activity score matrices. Missing values are treated as zeros.

In [4]:
import os
import pandas as pd

folder = "data/processed/tf_activity/"

# Mean TF activity scores: one transposed table per tissue, with missing
# values replaced by zero. The tissue name is the file name up to its last
# underscore.
tf_gene_mat_mean = {}
for filename in os.listdir(folder + "mean_tf/"):
    tissue = "_".join(filename.split("_")[:-1])
    mean_scores = pd.read_csv("{}/mean_tf/{}".format(folder,filename),index_col = 0)
    tf_gene_mat_mean[tissue] = mean_scores.transpose().fillna(0)

# Summed TF activity scores: same layout, additionally scaled as log2(x + 1).
tf_gene_mat_sum = {}
for filename in os.listdir(folder + "sum_tf/"):
    tissue = "_".join(filename.split("_")[:-1])
    sum_scores = pd.read_csv("{}/sum_tf/{}".format(folder,filename),index_col = 0)
    tf_gene_mat_sum[tissue] = np.log2(sum_scores.transpose().fillna(0) + 1)

In [6]:
# Restrict each tissue's RPKM matrix to the columns whose Ensembl ID appears
# in the TF list. Tissues are taken from the velocity dictionary.

tf_list = pd.read_csv("data/raw/tf/tf_list.csv", index_col = 0)
tf_ensembl_ids = tf_list["Ensembl ID"]

for tissue in velo:
    tissue_rpkm = rpkm[tissue]
    use_tfs = tissue_rpkm.columns.intersection(tf_ensembl_ids)
    rpkm[tissue] = tissue_rpkm.loc[:, use_tfs]

Load ensembl to hgnc mapping file

In [8]:
# Mapping table: keep the first row for each Ensembl ID and index the table by
# that ID (the "ensembl_id" column itself is retained).
ensembl_to_symbol = (
    pd.read_csv("data/raw/id_maping/ensembl_to_symbol.csv",index_col = 0)
    .drop_duplicates(subset = "ensembl_id", keep = "first")
    .set_index("ensembl_id", drop = False)
)

### Step 3. Select genes for testing
For each tissue, train and test the model with reps.

Select 500 genes in each tissue for testing

In [None]:
import pandas as pd
import re

l1_res = pd.read_csv("results/r2/lasso_r2.csv", index_col=0)

# Number of genes to draw per tissue, and the minimum number of cells with a
# velocity value that a gene must exceed to be eligible.
num_genes = 500
num_samples = 4000

# Normalize tissue names: whitespace runs become "_" and parentheses are
# removed. `regex=True` is passed explicitly because the default of
# `Series.str.replace` is literal matching on pandas >= 2.0, which would leave
# the names untouched; raw strings avoid invalid-escape warnings.
l1_res["tissue"] = (
    l1_res["tissue"]
    .str.replace(r"\s+", "_", regex=True)
    .str.replace(r"[()]", "", regex=True)
)

selected_genes = {}

for tissue, tissue_res in l1_res.groupby("tissue"):

    # Genes whose velocity is available in more than `num_samples` cells
    n_observed = velo[tissue].notna().sum(axis = 0)
    eligible = n_observed.index[n_observed > num_samples].values
    candidates = tissue_res.loc[tissue_res["gene_ensembl"].isin(eligible)]
    n_candidates = candidates.shape[0]

    # Draw up to `num_genes` candidates without replacement; the seed is reset
    # for every tissue. When there are not enough candidates, keep them all.
    np.random.seed(1000)
    if num_genes < n_candidates:
        picked = np.random.choice(np.arange(n_candidates), size=num_genes, replace=False)
    else:
        picked = np.arange(n_candidates)

    selected_genes[tissue] = candidates.iloc[picked]["gene_ensembl"].values
    print("{} has {} selected genes.".format(tissue, str(selected_genes[tissue].shape[0])))

### Step 5. Train and test baseline NN models  
This step may take a long time to complete.

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from collections import defaultdict
from tensorflow import keras
from joblib import Parallel,delayed
import numpy as np
import tensorflow as tf
import pdb

# Settings for the baseline-NN run.
n_rep, n_epoch = 3, 15   # train/test repetitions per gene; epochs per fit
n_job = 4                # worker-count setting (NOTE(review): confirm the Parallel call uses it)
batch_size = 256         # mini-batch size passed to model.fit
test_ratio = 0.1         # fraction of cells held out in each split
r2 = {}                  # container for R2 results

def _zscore_if_varying(values):
    """Standardize ``values`` to zero mean and unit standard deviation.

    Arrays whose standard deviation is not strictly positive (constant or
    empty) are returned unchanged, so the division can never yield NaN/inf.
    """
    spread = np.std(values)
    if not spread > 0:
        return values
    return (values - np.mean(values)) / spread


def _tf_feature_block(expression, mean_signal, sum_signal):
    """Build the (n_cells, 5) feature matrix of one TF for one target gene.

    Columns: 0 expression, 1 mean signal, 2 sum signal, 3 expression * mean
    signal, 4 expression * sum signal. The products are computed from the
    raw expression; columns 0, 3 and 4 are then standardized.
    """
    features = np.empty((expression.shape[0], 5))
    features[:, 0] = expression
    features[:, 1] = mean_signal
    features[:, 2] = sum_signal
    features[:, 3] = features[:, 0] * features[:, 1]
    features[:, 4] = features[:, 0] * features[:, 2]
    for col in (0, 3, 4):
        features[:, col] = _zscore_if_varying(features[:, col])
    return features


def train_parallel(tissue):
    """Train and test the baseline NN for every selected gene of one tissue.

    Reads the notebook-level ``rpkm``, ``velo``, ``tf_gene_mat_mean``,
    ``tf_gene_mat_sum``, ``ensembl_to_symbol`` and ``selected_genes`` objects
    plus the ``n_rep``, ``n_epoch``, ``batch_size`` and ``test_ratio``
    settings. For each gene the model is re-initialized and fitted on
    ``n_rep`` random train/test splits of the cells; the mean R2 of the gene
    is appended to ``results/r2/basenn_expTFAPredVelo_r2.csv``.

    Args:
        tissue: Key of the tissue in the dictionaries listed above.

    Returns:
        Tuple ``(r2_mat, tissue)`` where ``r2_mat`` has shape
        ``(n_rep, number of selected genes)``.
    """
    expr_values = rpkm[tissue].values
    tf_ids = rpkm[tissue].columns
    # NOTE(review): rows of velo[tissue] are indexed positionally with indices
    # derived from rpkm[tissue]; this assumes both share the same row order.
    velo_tissue = velo[tissue]
    mean_mat = tf_gene_mat_mean[tissue]
    sum_mat = tf_gene_mat_sum[tissue]
    genes = selected_genes[tissue]

    # Generate train and test sample indices
    all_rows = np.arange(expr_values.shape[0])
    train_idx = []
    test_idx = []
    for _ in range(n_rep):
        rows_train, rows_test = train_test_split(all_rows, test_size = test_ratio, shuffle = True)
        train_idx.append(rows_train)
        test_idx.append(rows_test)

    # Compile model and keep the initial weights
    model = build_model(num_tf = expr_values.shape[1], num_feature_type = 5)
    ini_weights = model.get_weights()

    # Ensembl ID -> gene symbol (first occurrence wins). Built once instead of
    # scanning the mapping table for every TF of every gene and repetition.
    symbol_of = (ensembl_to_symbol
                 .drop_duplicates(subset = "ensembl_id")
                 .set_index("ensembl_id")["gene_symbol"])
    tf_symbols = [symbol_of.loc[tf_id] for tf_id in tf_ids]
    tf_has_scores = [symbol in mean_mat.columns for symbol in tf_symbols]

    # Train and test each gene.
    r2_mat = np.full((n_rep, genes.shape[0]), np.nan)
    for i, gene in enumerate(genes):
        gene_symbol = symbol_of.loc[gene]
        gene_has_scores = gene_symbol in mean_mat.index
        velocity = velo_tissue[gene].values

        # TF activity signals of this gene; they do not depend on the split.
        # Use the scores if they exist, otherwise specify them as zero.
        signals = []
        for tf_symbol, has_scores in zip(tf_symbols, tf_has_scores):
            if has_scores and gene_has_scores:
                signals.append((mean_mat.loc[gene_symbol, tf_symbol],
                                sum_mat.loc[gene_symbol, tf_symbol]))
            else:
                signals.append((0, 0))

        for rep in range(n_rep):
            y_train = velocity[train_idx[rep]]
            y_test = velocity[test_idx[rep]]

            # Remove NA entries
            select_train = ~np.isnan(y_train)
            select_test = ~np.isnan(y_test)
            y_train = y_train[select_train]
            y_test = y_test[select_test]
            train_rows = train_idx[rep][select_train]
            test_rows = test_idx[rep][select_test]

            # Standardize y (train and test each with their own statistics)
            y_train = (y_train - np.mean(y_train))/np.std(y_train)
            y_test = (y_test - np.mean(y_test))/np.std(y_test)

            # Reformat data to multiple inputs: one feature matrix per TF
            X_all_train = [_tf_feature_block(expr_values[train_rows, j], mean_signal, sum_signal)
                           for j, (mean_signal, sum_signal) in enumerate(signals)]
            X_all_test = [_tf_feature_block(expr_values[test_rows, j], mean_signal, sum_signal)
                          for j, (mean_signal, sum_signal) in enumerate(signals)]

            # Reset model weights.
            model.set_weights(ini_weights)

            # Train model
            model.fit(
                x = X_all_train,
                y = y_train,
                batch_size = batch_size,
                epochs = n_epoch,
                verbose = 0
            )

            # Test model (silenced so workers do not print progress bars)
            y_pred = model.predict(x = X_all_test, verbose = 0)
            r2_mat[rep,i] = r2_score(y_test, y_pred)

        # Save r2 results to file
        with open("results/r2/basenn_expTFAPredVelo_r2.csv","a") as f:
            f.write("Tissue:{}, Gene:{} finished; r2:{}\n".format(tissue,gene,str(np.mean(r2_mat[:,i]))))

    return (r2_mat, tissue)

# Tissues for which every input read by train_parallel is available. Defined
# here so the cell runs on a fresh kernel.
comm_tissues = sorted(
    set(rpkm) & set(velo) & set(tf_gene_mat_mean) & set(tf_gene_mat_sum) & set(selected_genes)
)

# One task per tissue; the worker count comes from the `n_job` setting.
res = Parallel(n_jobs=n_job)(delayed(train_parallel)(tissue)
                              for tissue in comm_tissues)

# Collect the per-tissue R2 matrices (shape: n_rep x number of selected genes).
r2 = {tissue: r2_mat for r2_mat, tissue in res}