# Find Signature Gene Profiles
```
Andrew E. Davidson
aedavids@ucsc.edu
```

We can use the profiles to deconvole bulk tissue and plasma sample

Data set: GTEx+TCGA 1vsAll results

ref:
- extraCellularRNA/terra/deseq/doc/plots
- extraCellularRNA/terra/deseq/doc/plots/jupyterNotebooks/GTExValidateExploration.ipynb
- https://support.terra.bio/hc/en-us/articles/360042259232-Managing-data-and-automating-workflows-with-the-FISS-API
- [An introduction to using the Fiss API in Python in BioData Catalyst](https://terra.biodatacatalyst.nhlbi.nih.gov/#workspaces/biodata-catalyst/BioData%20Catalyst%20Collection/notebooks/launch/Intro%20to%20FISS%20API%20in%20Python.ipynb)
- [upsetPlot](https://upsetplot.readthedocs.io/en/stable/api.html#upsetplot.plot)
- https://www.adamsmith.haus/python/answers/how-to-read-a-dictionary-from-a-file-in--python
- terra/jupyterNotebooks/exploreUpsetPlotInteresections.ipynb

## TODO
- implement find best and find down

## initialize

In [2]:
from datetime import datetime

now = datetime.now()
current_time = now.strftime("%Y-%b-%d %H:%M:%S")
print("Run on ", current_time)

Run on  2022-Aug-11 16:12:13


In [3]:
from   firecloud import fiss
import firecloud.api as fapi
import io
from   matplotlib import pyplot as plt
import numpy as np
import os
import pandas as pd
from   pathlib import Path
import upsetplot as upsp


# use display() to print an html version of a data frame
# useful if dataFrame output is not generated by last like of cell
from IPython.display import display, Image

In [4]:
tmp = "./tmp"
!mkdir -p $tmp

In [5]:
# set env var used by fiss
# Get the Google billing project name and workspace name
billing_project = os.environ['WORKSPACE_NAMESPACE']
workspace = os.environ['WORKSPACE_NAME']
bucket = os.environ['WORKSPACE_BUCKET'] + "/"

print("Billing project: " + billing_project)
print("Workspace: " + workspace)
print("Workspace storage bucket: " + bucket)

Billing project: test-aedavids-proj
Workspace: uber
Workspace storage bucket: gs://fc-e15b796f-1abe-4206-ab91-bd58374cc275/


In [6]:
# Install a pip package in the current Jupyter kernel
# https://jakevdp.github.io/blog/2017/12/05/installing-python-packages-from-jupyter/
import sys
!{sys.executable} -m pip install --quiet upsetplot

# install upset plot
# see extraCellularRNA/terra/deseq/bin/createUpsetPlotZip.sh
url = bucket + "python/upsetPlot.zip"
! gsutil cp $url .
! unzip -o upsetPlot.zip > /dev/null
ORIG_PYTHONPATH = os.environ['PYTHONPATH']

Copying gs://fc-e15b796f-1abe-4206-ab91-bd58374cc275/python/upsetPlot.zip...
/ [1 files][ 13.0 MiB/ 13.0 MiB]                                                
Operation completed over 1 objects/13.0 MiB.                                     


In [7]:
# setting the python path allows us to run python scripts from using
# the CLI. 
PYTHONPATH = ORIG_PYTHONPATH + ":./python"
os.environ["PYTHONPATH"] = PYTHONPATH
PYTHONPATH = os.environ["PYTHONPATH"]
print("PYTHONPATH: {}".format(PYTHONPATH))

# test install. We should get a help message
! python python/plots/geneSignatureUpsetPlot.py

# to be able to import our local python files we need to set the sys.path
# https://stackoverflow.com/a/50155834
sys.path.append(os.getcwd() + '/python')
print("\nsys.path:\n{}\n".format(sys.path))

from   plots.DESeqSelect import DESeqSelect

PYTHONPATH: /etc/jupyter/custom:/usr/lib/spark/python:/home/jupyter/packages:./python
CALLING initializeLogging()
first call initialize Logging
CALLING initializeLogging()
usage: geneSignatureUpsetPlot.py [-h] [-t] [-w WIDTH] [-z HEIG] -d  -o  -c  -i
geneSignatureUpsetPlot.py: error: the following arguments are required: -d/--dataSetsCSV, -o/--outputFile, -c/--numThreads, -i/--intersectionOutputFile

sys.path:
['/home/jupyter/uber/edit', '/etc/jupyter/custom', '/usr/lib/spark/python', '/home/jupyter/packages', '/opt/conda/lib/python37.zip', '/opt/conda/lib/python3.7', '/opt/conda/lib/python3.7/lib-dynload', '', '/home/jupyter/.local/lib/python3.7/site-packages', '/opt/conda/lib/python3.7/site-packages', '/opt/conda/lib/python3.7/site-packages/IPython/extensions', '/home/jupyter/.ipython', '/home/jupyter/uber/edit/python']



In [8]:
outDir = "output"
! mkdir -p $outDir

outputImgDir = outDir + "/img"
! mkdir -p $outputImgDir

# Define select genes of interest functions 

In [9]:
def findUpRegulatedSignatureGenes(deseqDF, signatureGeneConfig):
    '''
    Find genes that that are statistically signifigant and up requlated
    
    arguments:
        deseqDF:
            results of DESeq2 as a pandas dataframe 
            
        signatureGeneConfig
            contains run parmeters            
            
    
    return:
        pandas dataframe
    '''
    selectSignificantRowsPS = deseqDF.loc[:,"padj"] < signatureGeneConfig.padjThreshold
#     print("number of genes with padj < {} : {}".format(padjThreshold,
#                                                        selectSignificantRowsPS.sum()))

    deseqLFCSignatureGenesDF = deseqDF.loc[ selectSignificantRowsPS,: ]\
                                        .sort_values("log2FoldChange", ascending=False)
    
    # find the genes that are over expresed 
    selectLFCPS = deseqLFCSignatureGenesDF.loc[:,"log2FoldChange"] >= signatureGeneConfig.lfcThreshold
#     print("number of genes with log2FoldChange >= {} = {}"\
#             .format( lfcThreshold, selectLFCPS.sum() ))
    
    deseqBaseMeanSignatureGenesDF = deseqLFCSignatureGenesDF.loc[ selectLFCPS,: ]\
                            .sort_values("baseMean", ascending=False)
    
    topSignatureGenesDF = deseqBaseMeanSignatureGenesDF.head( signatureGeneConfig.n )
    return topSignatureGenesDF

In [10]:
def findBestSignatureGenes(deseqDF, signatureGeneConfig):
    '''
    Find genes that that are statistically signifigant with  lfc <= -2.0 or >= 2.0
    
    arguments:
        deseqDF:
            results of DESeq2 as a pandas dataframe 
            
        signatureGeneConfig
            contains run parmeters            
            
    
    return:
        pandas dataframe
    '''
    print("\n!!!!!!!!!!!!!!!!!!! AEDWIP !!!!!!!!!!!!!!\n")

In [39]:
def runSelectGenesOfInterestFunction( signatureGeneConfig, candidateSignatureFileList,  skipRows=7 ):
    '''
    finds ups up regulated genes
    
    arguments:
        signatureGeneConfig
            contains run parmeters
            
        candidateSignatureFileList: 
            a list of file paths to candidate signature gene files to include in upset plot
            
            
        skipRows:
            int, default = 7
            1vsAll returns the results from DESeq with a self describing header comprised of 7 rows
            the lfcShrink output has 6 rows             
            
    returns: (upRegulatedDict, outFileList)
        upRegulatedDict : dictionary
            key: csgpFile name
            value: pandas dataframe
        
    '''
    retDict = {}
    retOutFileList = []
    for csgpFile in candidateSignatureFileList:
        deseqDF = pd.read_csv(csgpFile, skiprows=skipRows)
        #topSignatureGenesDF = findUpRegulatedSignatureGenes(deseqDF, signatureGeneConfig)
        signatureGenesDF = signatureGeneConfig.selectGenesOfInterestFunction(deseqDF, signatureGeneConfig)
                                                        

        fileName = csgpFile.split("/")[-1]
        outDir = signatureGeneConfig.getLocalCachedDir() 
        outFilePath = outDir + "/" + fileName
        signatureGenesDF.to_csv(outFilePath, index=False)
        print("saved to file: {}".format(outFilePath))

        retDict[fileName] = topSignatureGenesDF
        retOutFileList.append(outFilePath)

    
    return (retDict, retOutFileList)

# genesOfInterestDict, genesOfInterestFileList =  runSelectGenesOfInterestFunction(signatureGeneConfig, 
#                                                                    candidateSignatureFileList, 
#                                                                    skipRows=7)

In [81]:
def findBestSignatureGenes(deseqDF, signatureGeneConfig):
    '''
    Find genes that that are statistically signifigant with  lfc <= -2.0 or >= 2.0
    
    arguments:
        deseqDF:
            results of DESeq2 as a pandas dataframe 
            
        signatureGeneConfig
            contains run parmeters            
            
    
    return:
        pandas dataframe
    '''    
    colsToReturn = deseqDF.columns
    
    #
    # find statistically signifigant genes
    #
    selectSignificantRowsPS = deseqDF.loc[:,"padj"] < signatureGeneConfig.padjThreshold
#     print("number of genes with padj < {} : {}".format(padjThreshold,
#                                                        selectSignificantRowsPS.sum()))

    #
    # use absolute value of log fold change to select best 
    # biologically signifigant genes
    # 
    significantDF = deseqDF.loc[selectSignificantRowsPS, :]
    absPS = significantDF['log2FoldChange'].abs()
    significantDF2 = significantDF.assign(absLog2FoldChange=absPS)    

    selectBestUpRegulatedRows = significantDF2.loc[:, 'absLog2FoldChange'] >= signatureGeneConfig.lfcThreshold
    significantDF3 = significantDF2.loc[selectBestUpRegulatedRows, :]
        
    significantDF3 = significantDF3.sort_values( by = ['absLog2FoldChange'], ascending=False)

    return significantDF3.loc[:, colsToReturn].head(n=signatureGeneConfig.n)


def testFindBestSignatureGenes(signatureGeneConfig, candidateSignature, skipRows=7):
    print(candidateSignature)
    deseqDF = pd.read_csv(candidateSignature, skiprows=skipRows)
    return findBestSignatureGenes(deseqDF, signatureGeneConfig)
     

# testResults = testFindBestSignatureGenes(signatureGeneConfig, candidateSignatureFileList[0], skipRows=7)
# print("\n****** results")
# testResults

./tmp/Adipose_Subcutaneous_vs_all.results

****** results


Unnamed: 0,name,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
550,AC013391.3,20.346586,-8.850412,0.256571,-34.495026,9.524422000000001e-261,7.216608e-259
1394,AC092720.2,35.995507,-8.516456,0.305093,-27.914282,1.790093e-171,5.357318e-170
278,SNCB,1173.38509,-8.327166,0.19331,-43.076734,0.0,0.0
436,CACNG7,573.231325,-8.2484,0.228954,-36.026371,3.23377e-284,3.089397e-282
73,CPLX2,4440.399653,-8.230058,0.199041,-41.348495,0.0,0.0
39,C1orf61,2958.098269,-8.183661,0.18449,-44.358222,0.0,0.0
123,GABRA5,311.347063,-8.149244,0.210556,-38.703515,0.0,0.0
165,KCNQ2,1734.559278,-8.085285,0.198117,-40.810572,0.0,0.0
1158,NXPH1,145.485359,-8.017738,0.272647,-29.407064,4.4605450000000003e-190,1.606758e-188
147,HRH3,232.693133,-7.985822,0.208733,-38.258554,0.0,0.0


# Configure

In [24]:
class SignatureGeneConfig(object):
    '''
    Class used to keep track of all the parameters for a given data set
    
    should be treated as constant, immutable values
    '''
    
    def __init__(self, 
                 terraDataEntity, design, padjThreshold, lfcThreshold, n, 
                 selectGenesOfInterestFunction,
                 dataOutputBucketRoot,
                 localCacheRoot, title):
        '''
        arguments
            terraDataEntity:
                a name a terra workspace data model returned by listWorkspaceEntities.
                Expected to have a 'candidateSignatureGeneProfile' column
                
            Design:
                a string with the DESeq design. displayed on plots and encoded into data file names
                
            padjThreshold:
                selects genes with padj values <= padjThreshold
                
            lfcThreshold
            
            n: 
                type integer: 
                The number of rows to be select. 
                
            selectGenesOfInterestFunction:
                a function that takes in two arguments
                    deseqDF:
                        results of DESeq2 as a pandas dataframe
                        
                    signatureGeneConfig:
                        a configuration object 
                    
                example: findUpRegulatedSignatureGenes
                
            dataOutputBucketRoot
                example: gs://fc-e15b796f-1abe-4206-ab91-bd58374cc275/data/1vsAll/{up|down|best}
                location to store genes of interest
                
            localCacheRoot
                example: output
                
            title
                plot title       
                
        '''
        self.terraDataEntity = terraDataEntity
        self.design = design
        self.padjThreshold = padjThreshold
        self.lfcThreshold = lfcThreshold
        self.n = n
        self.selectGenesOfInterestFunction = selectGenesOfInterestFunction
        self.dataOutputBucketRoot = dataOutputBucketRoot
        self.localCacheRoot = localCacheRoot
        self.title = title
        
        localCache = self.getLocalCachedDir()
        ! mkdir -p $localCache
        
    def getfileNameBase(self):
        tmp = "{}-design:{}-padj:{}-lfc:{}-n:{}".format(
                                self.terraDataEntity,
                                self.design,
                                self.padjThreshold,
                                self.lfcThreshold, 
                                self.n
                                )
        return tmp.replace(" ","_")
        
    def saveGenesOfInterestToBucketURL(self):
        return self.dataOutputBucketRoot + "/" + self.getfileNameBase() 
    
    def getLocalCachedDir(self) :
        return self.localCacheRoot + "/" + self.getfileNameBase()

In [25]:
def createGTExTCGA_Config_top25():
    terraDataEntity = 'GTEx_TCGA_1vsAll'
    design = "~  gender + category"
    padjThreshold = 0.001
    lfcThreshold = 2.0
    n = 25
    selectGenesOfInterestFunction = findUpRegulatedSignatureGenes
    dataOutputBucketRoot = bucket + "data/1vsAll/up"
    localCacheRoot = outDir
    title = "{} topN={} Signature Genes, padj < {} lf2c > {} sorted by baseMean".format(terraDataEntity, 
                                                                                        n,
                                                                                        padjThreshold, 
                                                                                        lfcThreshold )
    
    GTExTCGA_Config_top25 = SignatureGeneConfig(
        terraDataEntity,
        design, 
        padjThreshold,
        lfcThreshold,
        n,
        selectGenesOfInterestFunction,
        dataOutputBucketRoot,
        localCacheRoot,
        title
    )
    
    return GTExTCGA_Config_top25

In [26]:
def createGTExTCGA_Config_best25():
    terraDataEntity = 'GTEx_TCGA_1vsAll'
    design = "~  gender + category"
    padjThreshold = 0.001
    lfcThreshold = 2.0
    n = 25
    selectGenesOfInterestFunction = findBestSignatureGenes
    dataOutputBucketRoot = bucket + "data/1vsAll/best"
    localCacheRoot = outDir
    title = "{} bestN={} Signature Genes, padj < {} lfc2 <= -{} or {} <= lfc2 ".format(terraDataEntity, 
                                                                                        n,
                                                                                        padjThreshold, 
                                                                                        lfcThreshold,
                                                                                        lfcThreshold)
    
    GTExTCGA_Config_best25 = SignatureGeneConfig(
        terraDataEntity,
        design, 
        padjThreshold,
        lfcThreshold,
        n,
        selectGenesOfInterestFunction,
        dataOutputBucketRoot,
        localCacheRoot,
        title
    )
    
    return GTExTCGA_Config_best25

In [27]:
GTExTCGA_Config_top25 = createGTExTCGA_Config_top25()
GTExTCGA_Config_best25 = createGTExTCGA_Config_best25()

# set signatureGeneConfig = to the data set you want to run
#signatureGeneConfig = GTExTCGA_Config_top25
signatureGeneConfig = GTExTCGA_Config_best25

print("title:\n{}\n".format(signatureGeneConfig.title))
print( "save to URL:\n{}\n".format(signatureGeneConfig.saveGenesOfInterestToBucketURL()))
print( "localCacheDir:\n{}".format(signatureGeneConfig.getLocalCachedDir()))

title:
GTEx_TCGA_1vsAll bestN=25 Signature Genes, padj < 0.001 lfc2 <= -2.0 or 2.0 <= lfc2 

save to URL:
gs://fc-e15b796f-1abe-4206-ab91-bd58374cc275/data/1vsAll/best/GTEx_TCGA_1vsAll-design:~__gender_+_category-padj:0.001-lfc:2.0-n:25

localCacheDir:
output/GTEx_TCGA_1vsAll-design:~__gender_+_category-padj:0.001-lfc:2.0-n:25


# load 1vsAll results

In [28]:
def listWorkspaceEntities():
    '''
    prints out the names of all the workspace data models, and the number of rows in each model
    
    returns void
    '''
    # Take a look at all of the data models / entities in of our workspace
    ent_types = fiss.fapi.list_entity_types(billing_project, workspace).json()
    for t in ent_types.keys():
        print (t, "count:", ent_types[t]['count'])
        
listWorkspaceEntities()        

GTEx_TCGA_1vsAll_set count: 7
GTEx_1vsAll_set count: 2
TCGA_1vsAll_set count: 2
GTEx_1vsAll count: 51
GTEx_TCGA_1vsAll count: 83
TCGA_1vsAll count: 32


In [29]:
def loadTerraDataModel(billingProject, workspace, modelName) :
    '''
    makes the data models we would see on the terra uber workspace data tab.
    
    returns a pandas dataframe
    '''
    ret = pd.read_csv( io.StringIO(
                                    fiss.fapi.get_entities_tsv(
                                        billing_project, 
                                        workspace, 
                                        modelName,
                                        model='flexible')
                                    .text), 
                              sep='\t')
    return ret

# load the data model that has 1vsAll results
terraDataEntityDF = loadTerraDataModel(billing_project, workspace, signatureGeneConfig.terraDataEntity)
terraDataEntityDF

Unnamed: 0,entity:GTEx_TCGA_1vsAll_id,candidateSignatureGeneProfile,category,dataSet,estimatedSizeFactors
0,GTEx_Adipose_Subcutaneous,gs://fc-e15b796f-1abe-4206-ab91-bd58374cc275/f...,Adipose_Subcutaneous,GTEx,gs://fc-e15b796f-1abe-4206-ab91-bd58374cc275/f...
1,GTEx_Adipose_Visceral_Omentum,gs://fc-e15b796f-1abe-4206-ab91-bd58374cc275/f...,Adipose_Visceral_Omentum,GTEx,gs://fc-e15b796f-1abe-4206-ab91-bd58374cc275/f...
2,GTEx_Adrenal_Gland,gs://fc-e15b796f-1abe-4206-ab91-bd58374cc275/f...,Adrenal_Gland,GTEx,gs://fc-e15b796f-1abe-4206-ab91-bd58374cc275/f...
3,GTEx_Artery_Aorta,gs://fc-e15b796f-1abe-4206-ab91-bd58374cc275/f...,Artery_Aorta,GTEx,gs://fc-e15b796f-1abe-4206-ab91-bd58374cc275/f...
4,GTEx_Artery_Coronary,gs://fc-e15b796f-1abe-4206-ab91-bd58374cc275/f...,Artery_Coronary,GTEx,gs://fc-e15b796f-1abe-4206-ab91-bd58374cc275/f...
...,...,...,...,...,...
78,TCGA_THCA,gs://fc-e15b796f-1abe-4206-ab91-bd58374cc275/f...,THCA,TCGA,gs://fc-e15b796f-1abe-4206-ab91-bd58374cc275/f...
79,TCGA_THYM,gs://fc-e15b796f-1abe-4206-ab91-bd58374cc275/f...,THYM,TCGA,gs://fc-e15b796f-1abe-4206-ab91-bd58374cc275/f...
80,TCGA_UCEC,gs://fc-e15b796f-1abe-4206-ab91-bd58374cc275/f...,UCEC,TCGA,gs://fc-e15b796f-1abe-4206-ab91-bd58374cc275/f...
81,TCGA_UCS,gs://fc-e15b796f-1abe-4206-ab91-bd58374cc275/f...,UCS,TCGA,gs://fc-e15b796f-1abe-4206-ab91-bd58374cc275/f...


In [30]:
def select1vsAllResults(df):
    '''
    select rows where candidateSignatureGeneProfile is not null
    
    returns a pandas dataframe
    '''
    selectRowsLogical = ~df.loc[:,'candidateSignatureGeneProfile'].isnull()
    retDF = df.loc[selectRowsLogical, :]
    
    return retDF  

In [31]:
# clean up. remove rows that are missing 1vsAll results
terraDataEntityDF = select1vsAllResults(terraDataEntityDF)
print("{}.shape:{}".format(signatureGeneConfig.terraDataEntity, terraDataEntityDF.shape))
assert terraDataEntityDF.shape[0] == 83, "ERROR: expected 83 candidateSignatureGeneProfiles"
#display( terraDataEntityDF.head() )

print("\n")
for f in terraDataEntityDF.loc[:,'candidateSignatureGeneProfile'].to_list():
    print( f.split("/")[-1] )

terraDataEntityDF

GTEx_TCGA_1vsAll.shape:(83, 5)


Adipose_Subcutaneous_vs_all.results
Adipose_Visceral_Omentum_vs_all.results
Adrenal_Gland_vs_all.results
Artery_Aorta_vs_all.results
Artery_Coronary_vs_all.results
Artery_Tibial_vs_all.results
Bladder_vs_all.results
Brain_Amygdala_vs_all.results
Brain_Anterior_cingulate_cortex_BA24_vs_all.results
Brain_Caudate_basal_ganglia_vs_all.results
Brain_Cerebellar_Hemisphere_vs_all.results
Brain_Cerebellum_vs_all.results
Brain_Cortex_vs_all.results
Brain_Frontal_Cortex_BA9_vs_all.results
Brain_Hippocampus_vs_all.results
Brain_Hypothalamus_vs_all.results
Brain_Nucleus_accumbens_basal_ganglia_vs_all.results
Brain_Putamen_basal_ganglia_vs_all.results
Brain_Spinal_cord_cervical_c-1_vs_all.results
Brain_Substantia_nigra_vs_all.results
Breast_Mammary_Tissue_vs_all.results
Cells_Cultured_fibroblasts_vs_all.results
Cells_EBV-transformed_lymphocytes_vs_all.results
Cervix_Endocervix_vs_all.results
Colon_Sigmoid_vs_all.results
Colon_Transverse_vs_all.results
Esophagus_Gast

Unnamed: 0,entity:GTEx_TCGA_1vsAll_id,candidateSignatureGeneProfile,category,dataSet,estimatedSizeFactors
0,GTEx_Adipose_Subcutaneous,gs://fc-e15b796f-1abe-4206-ab91-bd58374cc275/f...,Adipose_Subcutaneous,GTEx,gs://fc-e15b796f-1abe-4206-ab91-bd58374cc275/f...
1,GTEx_Adipose_Visceral_Omentum,gs://fc-e15b796f-1abe-4206-ab91-bd58374cc275/f...,Adipose_Visceral_Omentum,GTEx,gs://fc-e15b796f-1abe-4206-ab91-bd58374cc275/f...
2,GTEx_Adrenal_Gland,gs://fc-e15b796f-1abe-4206-ab91-bd58374cc275/f...,Adrenal_Gland,GTEx,gs://fc-e15b796f-1abe-4206-ab91-bd58374cc275/f...
3,GTEx_Artery_Aorta,gs://fc-e15b796f-1abe-4206-ab91-bd58374cc275/f...,Artery_Aorta,GTEx,gs://fc-e15b796f-1abe-4206-ab91-bd58374cc275/f...
4,GTEx_Artery_Coronary,gs://fc-e15b796f-1abe-4206-ab91-bd58374cc275/f...,Artery_Coronary,GTEx,gs://fc-e15b796f-1abe-4206-ab91-bd58374cc275/f...
...,...,...,...,...,...
78,TCGA_THCA,gs://fc-e15b796f-1abe-4206-ab91-bd58374cc275/f...,THCA,TCGA,gs://fc-e15b796f-1abe-4206-ab91-bd58374cc275/f...
79,TCGA_THYM,gs://fc-e15b796f-1abe-4206-ab91-bd58374cc275/f...,THYM,TCGA,gs://fc-e15b796f-1abe-4206-ab91-bd58374cc275/f...
80,TCGA_UCEC,gs://fc-e15b796f-1abe-4206-ab91-bd58374cc275/f...,UCEC,TCGA,gs://fc-e15b796f-1abe-4206-ab91-bd58374cc275/f...
81,TCGA_UCS,gs://fc-e15b796f-1abe-4206-ab91-bd58374cc275/f...,UCS,TCGA,gs://fc-e15b796f-1abe-4206-ab91-bd58374cc275/f...


In [32]:
%%time
def downLoadCandidateSignatureGeneProfile(cache, dataModelDF):
    '''
    downloads candidateSignatureGeneProfile files to local disk if not already in local cache
    These files are the results created by running 1vsAll.wdl
    
    arguments:
        cache : 
            string path to director to store files location
        
        dataModelDF
            example, GTEx_1vsAllDF
            
    returns
        list of file path on local machine
    '''
    retList = []
    resultsList = dataModelDF.loc[:, 'candidateSignatureGeneProfile'].to_list()
    for gsURL in resultsList:
        fileName = gsURL.split("/")[-1]
        savePath = cache + "/" + fileName
        path = Path(savePath)
        if path.is_file() :
            print("skipping download of {}".format(fileName))            
        else:
            !gsutil -m cp $gsURL $savePath
            
        retList.append(savePath)
            
    return retList
            
        
candidateSignatureFileList = downLoadCandidateSignatureGeneProfile(tmp, terraDataEntityDF)

skipping download of Adipose_Subcutaneous_vs_all.results
skipping download of Adipose_Visceral_Omentum_vs_all.results
skipping download of Adrenal_Gland_vs_all.results
skipping download of Artery_Aorta_vs_all.results
skipping download of Artery_Coronary_vs_all.results
skipping download of Artery_Tibial_vs_all.results
skipping download of Bladder_vs_all.results
skipping download of Brain_Amygdala_vs_all.results
skipping download of Brain_Anterior_cingulate_cortex_BA24_vs_all.results
skipping download of Brain_Caudate_basal_ganglia_vs_all.results
skipping download of Brain_Cerebellar_Hemisphere_vs_all.results
skipping download of Brain_Cerebellum_vs_all.results
skipping download of Brain_Cortex_vs_all.results
skipping download of Brain_Frontal_Cortex_BA9_vs_all.results
skipping download of Brain_Hippocampus_vs_all.results
skipping download of Brain_Hypothalamus_vs_all.results
skipping download of Brain_Nucleus_accumbens_basal_ganglia_vs_all.results
skipping download of Brain_Putamen_basa

# Find Genes of interest

In [33]:
aedwip
genesOfInterestDict, genesOfInterestFileList =  runSelectGenesOfInterestFunction(signatureGeneConfig, 
                                                                   candidateSignatureFileList, 
                                                                   skipRows=7)

NameError: name 'aedwip' is not defined

In [None]:
%%time
# save to long term storage
URL_ROOT = signatureGeneConfig.saveGenesOfInterestToBucketURL() 
print("saving to: {}".format(URL_ROOT))
print()

for f in genesOfInterestFileList:
    print()
    baseName = f.split("/")[-1]
    url = URL_ROOT + "/" + baseName
    ! gsutil -m cp $f $url

# Explore the genes of interest

In [None]:
for key in genesOfInterestDict.keys():
    df = genesOfInterestDict[key]
    print("\n" + key)
    display(df.head(n=2))

## create dataSet.csv
This file is an argument to plots/geneSignatureUpsetPlot.py. I defines the set of candidate genes to include
in the upset plot

In [None]:
def createDataSetCSV(csgpFileList):
    retDF = None
    for csgp in csgpFileList:
        setName = csgp.split("/")[-1]
        filePath = csgp
        numHeaderLines = 8

        tmpDF = pd.DataFrame( {"setName":[setName], 'numHeaderLines':[numHeaderLines], 'filePath':[filePath]} )
        if retDF is None:
            retDF = tmpDF
        else:
            byRows = 0
            retDF = pd.concat([retDF, tmpDF], axis=byRows, ignore_index=True)
            
    return retDF
        
def saveDataSetDF(genesOfInterestFileList):
    dataSetDF = createDataSetCSV(genesOfInterestFileList)
    dataSetPaths = signatureGeneConfig.getLocalCachedDir() + "/dataSets.csv"
    dataSetDF.to_csv(dataSetPaths , index=False)
    print("saved dataSetPaths to: {}".format(dataSetPaths))
    #dataSetDF
    return dataSetPaths

dataSetPaths = saveDataSetDF(genesOfInterestFileList)

## Create Upset Plot Data

In [None]:
def createGeneSets(dataSetPaths):
    '''
    arguments: 
        dataSetPaths: a csv file with cols 'setName,numHeaderLines,filePath'
        
    returns:
        (geneSetsDict, geneSetsDESeqDict)
        
        geneSetsDict can be passed to upsetplot.from_contents()
        geneSetsDESeqDict contains DESeq result value for genes in the intersections
        
    '''
    
    dataSetsDF = pd.read_csv( dataSetPaths )
    print("dataSetsDF.shape:{}".format(dataSetsDF.shape))
    print("dataSetsDF.columns: {}".format(dataSetsDF.columns))
    
    geneSetsDict = {} 
    geneSetsDESeqDict = {} 
    
    numFiles = dataSetsDF.shape[0]

    for i in range(numFiles):
        tissueId = dataSetsDF.iloc[i,0]
        tissueId = tissueId.replace('_vs_all.results', '')
        numHeaderLines = dataSetsDF.iloc[i,1]
        file = dataSetsDF.iloc[i,2]

        #tokens = file.split(".")
        isHack = False
        if "lfcShrink" in file:
            isHack = True

        print("processing setId:{} numHeaderLines:{} file: {}".format(tissueId, numHeaderLines, file))
        # tokens = file.split("/")
        # #print(tokens)
        #
        # # last token: signatureGenesValidateThyroid.csv
        # fileName = tokens[-1].split(".")[0]
        # #print(fileName)
        # tissueId = fileName[ len( "signatureGenesValidate" ):]
        # print(tissueId)
        dataLoader = DESeqSelect( file )

        if isHack:
            geneNamesNP, baseMeanNP, xlog2FoldChangeNP, yNeglog10pValueNP = dataLoader.readVolcanoPlotData(numHeaderLines, hackPadjIndx=5)
        else:
            geneNamesNP, baseMeanNP, xlog2FoldChangeNP, yNeglog10pValueNP = dataLoader.readVolcanoPlotData(numHeaderLines)

        geneSetsDict[tissueId] = set( geneNamesNP )

        # hold on to deseq results Data so that we can analyze signature gene sets with overlapping genes
        geneSetsDESeqDict[tissueId] = {
            "inputFile":file,
            # key is geneName
            "deseqResultSet": dataLoader.loadDESeqResultsAsStrings(numHeaderLines)
        }   

    return (geneSetsDict, geneSetsDESeqDict)


geneSetsDict, geneSetsDESeqDict = createGeneSets( dataSetPaths )

In [None]:
print("small sample of DESeq results for genes of interest")
i = 0
for key,item in geneSetsDESeqDict.items():
    print("\nkey:{} item:{}".format(key,item.keys()))
    itemDict = item['deseqResultSet']
    for k,ii in itemDict.items():
        s = [ "{0:0.2f}".format(float(g)) for g in ii ]
        print("\tgene:{} ii:{}".format(k,  s))
        
    i += 1
    if i >= 2:
        break

In [None]:
# create up set plot data
geneSetsUpsetPlotData = upsp.from_contents(geneSetsDict)

## Create Upset Plot

In [None]:
def configurablePlot(signatureGeneConfig, geneSetsUpsetPlotData, **kwags):
    '''
    return 
        (fig, pltDict)
    
    allows you to pass extra upset plot variables
    examples
        configurablePlot(geneSetsUpsetPlotData, show_counts=True, min_degree=4)
        configurablePlot(geneSetsUpsetPlotData, show_counts=True, min_degree=4, max_degree=7)
    '''
    pltDict = upsp.UpSet(geneSetsUpsetPlotData,  **kwags).plot()
    fig = plt.gcf()
    title = signatureGeneConfig.title
    fig.suptitle( title, fontsize=40 ) 
    
    return (fig, pltDict)

In [None]:
def findIntersectionElements(geneSetsDict, geneSetsUpsetPlotData):
    '''
    arguments:
        geneSetsDict
            key: set name
            value is a list of elements
        
        geneSetsUpsetPlotData:
            pandas multilevel index data frame. created from a function like upsetPlot from_contents()
            
    returns:
        retDict
            key: a comma separated list of set name
            value: the name of the elements in the intersection
            
        retSingleSetDict
            key: set name
            value: list of unique elements. elements not in any intersections
    '''
    retDict = dict()
    retSingleSetDict = dict()
    
    # convert list to numpy array so we can use fancy array index
    setNames = list(geneSetsUpsetPlotData.index.names)
    setNamesNP = np.asarray(setNames)
    
    # for each intersection 
    for idx in geneSetsUpsetPlotData.index.values:
        #print("\n********")
        arrayNP = np.asarray(idx)
        
        # get the list of sets that the intersection was formed from 
        setNameList = sorted( setNamesNP[arrayNP] )
        setList = list()
        for sn in setNameList:
            setList.append( geneSetsDict[sn] )
            
        if len(setList) > 1:
            intersection = set.intersection( *setList )
            #print(intersection)
            key = ",".join(setNameList)
            retDict[ key ] = intersection
            
    # find elements that are not in an intersection
    n = len(setNamesNP)
    for i in range(0, n):
        testSetName = setNamesNP[i]
        knockOut = [True]*n 
        knockOut[i] = False # the name to knock out
        unionSetNameList = setNamesNP[ knockOut ]
        
        unionSetList = list()
        for u in unionSetNameList:
            unionSetList.append( geneSetsDict[u] )
            
        union = set.union( *unionSetList )
        testSet = geneSetsDict[testSetName]
        intersection = testSet.difference(union)
        
        if len(intersection) > 0:
            retSingleSetDict[testSetName] = intersection 
            
    return (retDict, retSingleSetDict)

In [None]:
def getBaseName(signatureGeneConfig):
    # remove chars that can not be used in file names
    baseName =  signatureGeneConfig.title.replace(" ", "-")
    baseName = baseName.replace("<", "lt")
    baseName = baseName.replace(">", "gt")
    return baseName
    
def savePlot(fig, signatureGeneConfig, extraFileNameParts):
    # remove chars that can not be used in file names
#     baseName =  signatureGeneConfig.title.replace(" ", "-")
#     baseName = baseName.replace("<", "lt")
#     baseName = baseName.replace(">", "gt")
    baseName = getBaseName( signatureGeneConfig )
    baseName = baseName + "-" + extraFileNameParts
    
    # save png
    fileName = baseName + ".png"
    filePath = signatureGeneConfig.localCacheRoot + "/" + fileName

    ! rm $filePath 
    print("saving to: {}".format(filePath))
    fig.savefig(filePath, dpi=300, bbox_inches='tight', facecolor='white')

    imgURL = signatureGeneConfig.saveGenesOfInterestToBucketURL() + fileName
    print("save to :\n{}".format(imgURL))
    ! gsutil cp $filePath $imgURL    

In [None]:
def saveInteresection(signatureGeneConfig, intersectionElementsDict):
    baseName = getBaseName( signatureGeneConfig )    
    
    # https://www.adamsmith.haus/python/answers/how-to-read-a-dictionary-from-a-file-in--python
    fileName =  baseName + ".intersection.dict"
    filePath = signatureGeneConfig.localCacheRoot + "/" + fileName
    with open(filePath,'w') as data: 
      data.write(str(intersectionElementsDict))
    
    intersectionURL = signatureGeneConfig.saveGenesOfInterestToBucketURL() + fileName
    print("save to :\n{}".format(intersectionURL))
    ! gsutil cp $filePath $intersectionURL   

In [None]:
# for each interesection save the list of sets in the intersection and the 
# intersection elements for down stream analysis
intersectionElementsDict, retSingleSetDict  = findIntersectionElements(geneSetsDict, geneSetsUpsetPlotData)
intersectionElementsDict.update(retSingleSetDict)
saveInteresection(signatureGeneConfig, intersectionElementsDict)

# organize intersections so we can print the intersections for each plot
setDegreesDict = dict()
for setNames,elements in intersectionElementsDict.items():
    numSetsInIntersection = len(setNames.split(","))
    if numSetsInIntersection in setDegreesDict:
        d = setDegreesDict[numSetsInIntersection]
    else :
        d = dict()
        setDegreesDict[numSetsInIntersection] = d
        
    d = setDegreesDict[numSetsInIntersection]
    d[setNames] = elements
      
        
def printInserectionElements(setDegreesDict, testFunction):
    sortedKeys = sorted(setDegreesDict.keys(), reverse=True)
    #print(sortedKeys)
    for key in sortedKeys:
        #if key >=4:
        if testFunction(key) :
            d = setDegreesDict[key]
            n = len(d.keys())
            print("\n############### degree: {} number of intersections:{}".format(key, n))
            for setName, elements in d.items():
                print("\nsetName:{}\n elements:\n{}".format(setName, elements))

### upset plot max_degree = 1

In [None]:
# create a plot sets of genes that are only found in one type
# use extra args to insure saved file name is unique
extraFileNameParts = 'max_degree=1'
fig, pltDict = configurablePlot(signatureGeneConfig, geneSetsUpsetPlotData, show_counts=True, max_degree=1)

# add more text to plot
designStr = "design: {}".format(signatureGeneConfig.design)
# hack to find text positions
fig.text(x=0.35, y=0.95, s=designStr, fontsize=40 )
fig.text(x=0.40, y=0.93, s="degree = 1", fontsize=40)

savePlot(fig, signatureGeneConfig, extraFileNameParts)

### upset plot max_degree = 1 intersection elements

In [None]:
def testFunction(key):
    return key == 1
                
printInserectionElements(setDegreesDict, testFunction)

### upset plot: min_degree = 2 max_degree = 3

In [None]:
# create a plot of all interesections composed of 4 or more sets
# use extra args to insure saved file name is unique
extraFileNameParts = 'min_degree=2,max_degree=3'
fig, pltDict = configurablePlot(signatureGeneConfig, geneSetsUpsetPlotData, show_counts=True, min_degree=2, max_degree=3)

# add more text to plot
designStr = "design: {}".format(signatureGeneConfig.design)
# hack to find text positions
fig.text(x=0.35, y=0.95, s=designStr, fontsize=40 )
fig.text(x=0.40, y=0.93, s="2 <= degree <=3", fontsize=40)

savePlot(fig, signatureGeneConfig, extraFileNameParts)

### upset plot: min_degree = 2 max_degree = 3 interesections elements

In [None]:
def testFunction(key):
    return key >= 2 and key <= 3
                
printInserectionElements(setDegreesDict, testFunction)

###  upset plot: min_degree = 4 

In [None]:
# create a plot of all interesections composed of 4 or more sets
# use extra args to insure saved file name is unique
extraFileNameParts = 'min_degree=4'
fig, pltDict = configurablePlot(signatureGeneConfig, geneSetsUpsetPlotData, show_counts=True, min_degree=4)

# add more text to plot
designStr = "design: {}".format(signatureGeneConfig.design)
# hack to find text positions
fig.text(x=0.35, y=0.95, s=designStr, fontsize=40 )
fig.text(x=0.40, y=0.93, s="degree > 4", fontsize=40)


savePlot(fig, signatureGeneConfig, extraFileNameParts)

###  upset plot: min_degree = 4  interesections elements

In [None]:
def testFunction(key):
    return key >= 4
                
printInserectionElements(setDegreesDict, testFunction)                