### Step 1. Load expression, TF activity scores, and Ensembl-to-symbol mappings

First, read the RPKM values and the velocity values. The RPKM values are scaled as log10(RPKM + 1) while loading; the velocity values are kept as stored.

In [1]:
import h5py
import numpy as np
import pandas as pd

# Expression per tissue: one DataFrame per top-level HDF5 group, with rows
# labelled by the group's 'barcode' dataset and columns by its 'ensembl' dataset.
rpkm = {}
with h5py.File('data/raw/rpkm/rpkm.hdf5', 'r') as f:
    for tissue in f.keys():
        group = f[tissue]
        barcodes = np.array(group['barcode']).astype(str)
        ensembl_ids = np.array(group['ensembl']).astype(str)

        # log10(x + 1) scaling of the raw RPKM values
        scaled_exp = np.log10(np.array(group['exp']) + 1)
        rpkm[tissue] = pd.DataFrame(scaled_exp, index = barcodes, columns = ensembl_ids)

# Velocity per tissue, laid out the same way; values are used as stored.
velo = {}
with h5py.File('data/raw/velo/velo.hdf5', 'r') as f:
    for tissue in f.keys():
        group = f[tissue]
        barcodes = np.array(group['barcode']).astype(str)
        ensembl_ids = np.array(group['ensembl']).astype(str)
        velo[tissue] = pd.DataFrame(np.array(group['velo']),
                                    index = barcodes,
                                    columns = ensembl_ids)

Read the Ensembl-to-HGNC symbol mapping table

In [3]:
# Load the mapping table, keep only the first row for each Ensembl ID, and
# index the table by Ensembl ID (the 'ensembl_id' column itself is retained).
ensembl_to_symbol = (
    pd.read_csv("data/raw/id_mapping/ensembl_to_symbol.csv", index_col = 0)
    .drop_duplicates(subset = "ensembl_id", keep = "first")
    .set_index("ensembl_id", drop = False)
)

### Step 2. Select genes for testing

Select 500 genes in each tissue for testing

In [4]:
import pandas as pd
import re

l1_res = pd.read_csv("results/r2/lasso_r2.csv", index_col=0)

# Maximum number of genes drawn per tissue.
num_genes = 500
# A gene is eligible only if it has MORE than this many non-missing velocity values.
num_samples = 4000
# Tissue-specific overrides of `num_samples` (Liver has fewer samples).
min_samples_by_tissue = {"Liver": 1000}

# Normalise tissue names: whitespace runs become "_" and parentheses are removed.
# The patterns are regular expressions, so regex=True is passed explicitly (the
# implicit default triggers a FutureWarning and is literal matching in pandas >= 2.0),
# and raw strings avoid invalid-escape warnings.
l1_res["tissue"] = (
    l1_res["tissue"]
    .str.replace(r"\s+", "_", regex=True)
    .str.replace(r"[()]", "", regex=True)
)

# NOTE(review): `comm_tissues` is not defined in any cell visible here. If an
# earlier cell already defined it, that definition is used unchanged; otherwise
# fall back to the tissues present in both `rpkm` and `velo`, which is the
# minimum the loop below needs -- confirm this matches the intended set.
if "comm_tissues" not in globals():
    comm_tissues = sorted(set(rpkm) & set(velo))

selected_genes = dict()

for tissue, tissue_res in l1_res.groupby("tissue"):

    if tissue not in comm_tissues:
        continue

    # Keep genes with enough non-missing velocity values that are also
    # present as columns of the expression matrix.
    min_samples = min_samples_by_tissue.get(tissue, num_samples)
    sample_counts = velo[tissue].notna().sum(axis = 0)
    genes = sample_counts.index[sample_counts > min_samples].values
    genes = np.intersect1d(rpkm[tissue].columns, genes)
    eligible_res = tissue_res.loc[tissue_res["gene_ensembl"].isin(genes)]

    # Randomly select up to `num_genes` rows. The seed is reset for every
    # tissue, so each tissue's draw starts from the same generator state.
    np.random.seed(1000)
    if num_genes < eligible_res.shape[0]:
        idx = np.random.choice(np.arange(eligible_res.shape[0]), size=num_genes, replace=False)
    else:
        idx = np.arange(eligible_res.shape[0])
    selected_genes[tissue] = eligible_res.iloc[idx]["gene_ensembl"].values
    print("{} has {} selected genes.".format(tissue, str(selected_genes[tissue].shape[0])))

  l1_res["tissue"] = l1_res['tissue'].str.replace('\s+',"_")
  l1_res["tissue"] = l1_res['tissue'].str.replace('\(',"")
  l1_res["tissue"] = l1_res['tissue'].str.replace('\)',"")


Heart has 500 selected genes.
Kidney_Left has 500 selected genes.
Large_Intestine has 500 selected genes.
Liver has 500 selected genes.
Lung_Right has 500 selected genes.
Spleen has 500 selected genes.


Keep only the TF columns of each tissue's RPKM matrix and store the result in `rpkm_tf`.

In [5]:
tf_list = pd.read_csv("data/raw/tf/tf_list.csv", index_col = 0)
tf_ensembl_ids = tf_list["Ensembl ID"]

# For every tissue with velocity data, restrict the expression matrix to the
# columns whose Ensembl ID appears in the TF list.
rpkm_tf = {}
for tissue in velo.keys():
    tissue_rpkm = rpkm[tissue]
    use_tfs = tissue_rpkm.columns.intersection(tf_ensembl_ids)
    rpkm_tf[tissue] = tissue_rpkm.loc[:, use_tfs]

### Step 3. Train and test models

Use TF expression to predict the expression of each selected gene

In [None]:
from sklearn.model_selection import train_test_split
from arboreto.core import SGBM_KWARGS, RF_KWARGS, EARLY_STOP_WINDOW_LENGTH, fit_model
from scipy.sparse import csr_matrix
from sklearn.metrics import r2_score
from collections import defaultdict
from joblib import Parallel, delayed
import os, psutil
import datetime
import scipy

import numpy as np
import tensorflow as tf
import pdb

# Number of train/test replicates generated and run for each tissue.
n_rep = 3
# Fraction of rows held out as the test set in each replicate
# (passed to train_test_split as test_size).
test_ratio = 0.1

def train_test(tissue, gene, rep, seed = None):
    """Fit a GBM on TF expression to predict one gene and record the test R^2.

    The predictors are the columns of ``rpkm_tf[tissue]`` (minus ``gene``
    itself, if present) and the target is ``rpkm[tissue][gene]``. Rows are
    split using the module-level ``train_idx[rep]`` / ``test_idx[rep]``
    positions, which must be set for the current tissue before calling.

    Parameters
    ----------
    tissue : key into ``rpkm`` and ``rpkm_tf``.
    gene : column label of the target gene in ``rpkm[tissue]``.
    rep : index of the replicate split to use.
    seed : forwarded to ``fit_model``; the default ``None`` keeps the
        original unseeded behaviour.

    Returns
    -------
    None. The result is printed and appended as a
    ``tissue,gene,rep,r2`` line to results/r2/grnboost2_expTFAPredExp_r2.csv.
    """
    # The target gene must not be among its own predictors; errors="ignore"
    # makes the drop a no-op when the gene is not a TF column.
    predictors = rpkm_tf[tissue].drop(columns = gene, errors = "ignore")
    target = rpkm[tissue][gene]

    train_rows = train_idx[rep]
    test_rows = test_idx[rep]

    # Convert inputs to sparse matrices
    train_X = csr_matrix(predictors.iloc[train_rows, :].values)
    test_X = csr_matrix(predictors.iloc[test_rows, :].values)
    train_y = target.iloc[train_rows].values
    test_y = target.iloc[test_rows].values

    # Train regressor model
    model = fit_model(regressor_type = "GBM",
                      regressor_kwargs = SGBM_KWARGS,
                      tf_matrix = train_X,
                      target_gene_expression = train_y,
                      early_stop_window_length = EARLY_STOP_WINDOW_LENGTH,
                      seed = seed)

    # Test regressor model
    pred_y = model.predict(test_X)
    r2 = r2_score(test_y, pred_y)

    # Save results: one line per call, appended to the shared results file
    print("{},{},{},{}".format(tissue, gene, str(rep), str(r2)))
    with open("results/r2/grnboost2_expTFAPredExp_r2.csv","a") as f:
        f.write("{},{},{},{}\n".format(tissue, gene, str(rep), str(r2)))

    return None

for tissue in velo.keys():

    # Row positions of this tissue's expression matrix
    all_rows = np.arange(rpkm[tissue].shape[0])

    # Draw every replicate's train/test split up front. train_test() reads
    # train_idx / test_idx as globals, indexed by replicate number.
    splits = [train_test_split(all_rows, test_size = test_ratio, shuffle = True)
              for _ in range(n_rep)]
    train_idx = [split[0] for split in splits]
    test_idx = [split[1] for split in splits]

    # One parallel batch per replicate, one job per selected gene
    for rep in range(n_rep):
        jobs = (delayed(train_test)(tissue, gene, rep)
                for gene in selected_genes[tissue])
        res = Parallel(n_jobs = 50)(jobs)