In [None]:
# Calculate transcriptome-based EXP (normalized expression) and JSD (tissue specificity) scores.
# Input: a transcriptome (RNA-seq) TPM data file (generated with the salmon package).
# Output: EXP (normalized log2 of TPM+1) and JSD scores.

In [None]:
import math
import pandas as pd
import numpy as np
import scipy as sp
from scipy.stats import entropy

In [None]:
# Calculate Jensen–Shannon divergence Score (tissue specificity)
# https://en.wikipedia.org/wiki/Jensen%E2%80%93Shannon_divergence
def JSD(p, q):
    """Compute the Jensen-Shannon divergence between two distributions.

    Parameters
    ----------
    p, q : array-like of non-negative numbers, same length
        Unnormalized weights; each is rescaled to sum to 1 before comparison.
        Integer inputs are accepted and the caller's arrays are never modified.

    Returns
    -------
    float
        JSD in nats (natural logarithm, the default of scipy.stats.entropy).
        The result is nan when either input sums to zero.
    """
    # Convert to float so integer input can be divided, and divide out-of-place:
    # np.asarray does not copy an existing ndarray, so `p /= ...` would
    # overwrite the caller's data.
    p = np.asarray(p, dtype=float)
    q = np.asarray(q, dtype=float)
    # normalize
    p = p / p.sum()
    q = q / q.sum()
    # mixture distribution that both inputs are compared against
    m = (p + q) / 2
    return (entropy(p, m) + entropy(q, m)) / 2

In [None]:
# Input file: gene-level TPM table from the salmon + tximport RNA-seq workflow
# https://f1000research.com/articles/4-1521
# The first column of the file becomes the row index.
query_df = pd.read_csv(
    "salmon_abundance_TPM_geneLevel.txt",
    sep="\t",
    index_col=0,
)
query_df.head()

In [None]:
# Load the background reference RNA-seq data (TPM).
#
# Mouse, option 1 (active): ENCODE project and 7 Organs Development RNA-seq.
#   This is the preferred reference for embryonic developmental samples.
# Mouse, option 2: FANTOM5 project cells and tissues.
#   ref_df=pd.read_csv("data/mouse_references_rnaseq/mouse_rnaseq_f5cells_f5tissues_tpm_delSimi.txt",sep="\t",index_col=0)
# Human:
#   ref_df=pd.read_csv("data/human_references_rnaseq/human_encode_rnaseq_combined_tpm.tsv",sep="\t",index_col=0)
ref_df = pd.read_csv(
    "data/mouse_references_rnaseq/mouse_rnaseq_encode_7Organs-Dev_tpm_delSimi.txt",
    sep="\t",
    index_col=0,
)
ref_df.head()

In [None]:
# Place query and reference samples side by side (query columns first),
# keeping only the index labels that occur in both tables.
query_ref_combined = pd.concat(
    [query_df, ref_df],
    axis=1,
    join="inner",
)
query_ref_combined.head()

In [None]:
# Write the combined query + reference TPM table as a tab-separated file
query_ref_combined.to_csv(
    "query_and_encode_7Organs_salmon_abundance_TPM.txt",
    sep="\t",
)

In [None]:
# Quantile normalization is done in R by the helper script log2_quantile_normalization.R.
# It is given the combined TPM table via -i and a "_log2norm" output path via -o.
# To run it from a terminal instead of the notebook:
# Rscript log2_quantile_normalization.R -i query_and_encode_7Organs_salmon_abundance_TPM.txt -o query_and_encode_7Organs_salmon_abundance_TPM_log2norm.txt
# The line below runs the same command directly from this notebook (IPython shell escape).
! Rscript log2_quantile_normalization.R -i query_and_encode_7Organs_salmon_abundance_TPM.txt -o query_and_encode_7Organs_salmon_abundance_TPM_log2norm.txt

In [None]:
# Load the log2 quantile-normalized table written by the Rscript step (its -o
# output). Reloading the raw TPM file that was passed as -i would leave the
# normalization unused.
# NOTE(review): assumes the R script keeps the gene IDs in the first column and
# preserves the column order (query samples first) - confirm against the script.
query_ref_combined=pd.read_csv("query_and_encode_7Organs_salmon_abundance_TPM_log2norm.txt",sep="\t",index_col=0)
query_ref_combined.head()

In [None]:
# remove similar samples based on Pearson correlation coefficient
def removeSimilarSamples(bkg_df, query_col, pcc_cutoff=0.85):
    """Drop background columns that correlate too strongly with the query.

    The Pearson correlation coefficient between each column of ``bkg_df`` and
    ``query_col`` is computed; every column whose coefficient is strictly
    greater than ``pcc_cutoff`` is removed.

    Returns ``bkg_df`` itself when nothing is removed, otherwise a new
    DataFrame without the removed columns.
    """
    # The last row of the correlation matrix belongs to the query; its final
    # entry is the query's correlation with itself and is discarded.
    corr_with_query = np.corrcoef(bkg_df.T, query_col.T)[-1][:-1]
    too_similar = [
        sample
        for sample, pcc in zip(bkg_df.columns.values, corr_with_query)
        if pcc > pcc_cutoff
    ]
    if not too_similar:
        return bkg_df
    return bkg_df.drop(labels=too_similar, axis=1)

In [None]:
# calculate jsd scores for all genes, loop through all genes
def jsdLoop(bkg_df, query_col, pcc_cutoff=0.85):
    """Compute a JSD-based specificity score for every gene of one query sample.

    Rows of bkg_df and query_col must be exactly the same (same genes, same
    order): rows are paired by position, not by index label.
    bkg_df has no gene symbol or name column; all columns are expression values.
    query_col should be a pandas Series (one column).

    Background samples whose Pearson correlation with the query exceeds
    pcc_cutoff are removed first. For each gene, the score is 1 - JSD between
    the expression vector [query, background...] and the vector
    [1, 0, ..., 0], i.e. expression confined to the query sample.
    A nan score is stored as 0.

    Returns
    -------
    (query_gene_list, query_jsd_list) : two lists in the row order of query_col.
    """
    # remove samples in bkg_df that have PCC higher than pcc_cutoff with query sample
    bkg_df = removeSimilarSamples(bkg_df, query_col, pcc_cutoff=pcc_cutoff)
    query_gene_list = []
    query_jsd_list = []
    # Series.items() replaces Series.iteritems(), which was removed in pandas 2.0.
    for rowIdx, (gene, val) in enumerate(query_col.items()):
        # query value first, followed by this gene's background values
        P_arr = [val] + list(bkg_df.iloc[rowIdx])
        Q_arr = [1.0] + [0.0] * (len(P_arr) - 1)
        jsd_score = 1.0 - JSD(P_arr, Q_arr)
        query_gene_list.append(gene)
        query_jsd_list.append(0 if math.isnan(jsd_score) else jsd_score)
    return query_gene_list, query_jsd_list

In [None]:
# extract all genes that are transcription factors
# annotation of genome-wide transcription factors (TFs) downloaded from the AnimalTFDB database
# http://bioinfo.life.hust.edu.cn/AnimalTFDB/#!/
mouse_TF_df=pd.read_csv("data/AnimalTFDB_Mus_musculus_TF.txt",sep="\t",index_col=0)
# Select with a boolean mask so the rows keep their original order.
# Going through a Python set gives an arbitrary row order that can change
# from run to run, which makes the exported table non-reproducible.
is_tf=query_ref_combined.index.isin(mouse_TF_df.index)
query_ref_tf_df=query_ref_combined.loc[is_tf]
query_ref_tf_df.head()

In [None]:
# Separate the query and reference parts by column position:
# the first query_sample_number columns are the query samples, the rest are reference.
query_sample_number = query_df.shape[1]
query_tf_df = query_ref_tf_df.iloc[:, :query_sample_number]
ref_tf_df = query_ref_tf_df.iloc[:, query_sample_number:]

In [None]:
# preview the query-sample columns of the TF-only table
query_tf_df.head()

In [None]:
# preview the reference-sample columns of the TF-only table
ref_tf_df.head()

In [None]:
# calculate TFs' JSD score for each sample in query_df
sample_JSDList_dict={}
for sampleName in query_tf_df.columns:
    query_col=query_tf_df.loc[:,sampleName]
    query_gene_list, query_jsd_list = jsdLoop(ref_tf_df, query_col, pcc_cutoff=0.85)
    # apply log2 transform
    # jsdLoop returns a plain Python list, and `list + 0.01` raises TypeError,
    # so convert to an array before adding the 0.01 offset (which keeps log2
    # finite for a score of 0).
    query_jsd_list=np.log2(np.asarray(query_jsd_list, dtype=float) + 0.01)
    sample_JSDList_dict[sampleName]=query_jsd_list
# add gene column
# Taken from the table's index (the same genes, in the same order, that jsdLoop
# returns) so this does not rely on a variable left over from the loop.
sample_JSDList_dict["Gene"]=list(query_tf_df.index)
# create dataframe
sample_JSDList_df=pd.DataFrame(sample_JSDList_dict)
sample_JSDList_df.head()

In [None]:
# Attach the TF annotation columns: index the scores by gene, then join
# column-wise with the annotation table, keeping only genes present in both.
sample_JSDList_df = sample_JSDList_df.set_index("Gene")
sample_JSDList_wName_df = pd.concat(
    [sample_JSDList_df, mouse_TF_df],
    axis=1,
    join="inner",
)

In [None]:
# Write the annotated JSD score table as a tab-separated file
sample_JSDList_wName_df.to_csv(
    "salmon_abundance_TPM_geneLevel_log2norm_JSDscore.txt",
    sep="\t",
)