# AEDWIP
```
Andrew E. Davidson
aedavids@ucsc.edu
```

ref:
- extraCellularRNA/terra/deseq/doc/plots
- extraCellularRNA/terra/deseq/doc/plots/jupyterNotebooks/GTExValidateExploration.ipynb
- https://support.terra.bio/hc/en-us/articles/360042259232-Managing-data-and-automating-workflows-with-the-FISS-API
- [An introduction to using the Fiss API in Python in BioData Catalyst](https://terra.biodatacatalyst.nhlbi.nih.gov/#workspaces/biodata-catalyst/BioData%20Catalyst%20Collection/notebooks/launch/Intro%20to%20FISS%20API%20in%20Python.ipynb)
- [upsetPlot](https://upsetplot.readthedocs.io/en/stable/api.html#upsetplot.plot)

## initialize

In [1]:
from datetime import datetime

# record when the notebook was run, example: 2022-Jul-31 18:29:38
TIME_STAMP_FORMAT = "%Y-%b-%d %H:%M:%S"
now = datetime.now()
current_time = format(now, TIME_STAMP_FORMAT)
print("Run on ", current_time)

Run on  2022-Jul-31 18:29:38


In [2]:
from firecloud import fiss
import firecloud.api as fapi
import numpy as np
import os
import io
import pandas as pd
from pathlib import Path

# use display() to print an HTML version of a data frame
# useful if the dataFrame output is not generated by the last line of the cell
from IPython.display import display, Image

In [3]:
# local scratch directory. It is passed to downLoadCandidateSignatureGeneProfile()
# as the cache for the downloaded 1vsAll result files
tmp = "./tmp"
!mkdir -p $tmp

In [4]:
# read the workspace settings from the environment variables. The values are
# passed to the fiss API calls
# Get the Google billing project name and workspace name
billing_project = os.environ['WORKSPACE_NAMESPACE']
workspace = os.environ['WORKSPACE_NAME']
# the trailing "/" lets us append object paths directly, example: bucket + "python/upsetPlot.zip"
bucket = os.environ['WORKSPACE_BUCKET'] + "/"

print("Billing project: " + billing_project)
print("Workspace: " + workspace)
print("Workspace storage bucket: " + bucket)

Billing project: test-aedavids-proj
Workspace: uber
Workspace storage bucket: gs://fc-e15b796f-1abe-4206-ab91-bd58374cc275/


In [5]:
# Install a pip package in the current Jupyter kernel
# https://jakevdp.github.io/blog/2017/12/05/installing-python-packages-from-jupyter/
import sys
!{sys.executable} -m pip install --quiet upsetplot

# install upset plot
# copy the zip file from the workspace bucket and unpack it into the current directory
# (presumably it contains the ./python directory used by the following cell)
# see extraCellularRNA/terra/deseq/bin/createUpsetPlotZip.sh
url = bucket + "python/upsetPlot.zip"
! gsutil cp $url .
! unzip -o upsetPlot.zip > /dev/null
# captured here so the following cell can build PYTHONPATH from the unmodified value
ORIG_PYTHONPATH = os.environ['PYTHONPATH']

Copying gs://fc-e15b796f-1abe-4206-ab91-bd58374cc275/python/upsetPlot.zip...
/ [1 files][ 13.0 MiB/ 13.0 MiB]                                                
Operation completed over 1 objects/13.0 MiB.                                     


In [6]:
# setting the python path allows us to run our python scripts from
# the CLI.
PYTHONPATH = ORIG_PYTHONPATH + ":./python"
os.environ["PYTHONPATH"] = PYTHONPATH
PYTHONPATH = os.environ["PYTHONPATH"]
print("PYTHONPATH: {}".format(PYTHONPATH))

# test install. We do not pass any arguments so we should get a usage message
# listing the required arguments
! python python/plots/geneSignatureUpsetPlot.py

# to be able to import our local python files we need to set the sys.path
# https://stackoverflow.com/a/50155834
sys.path.append(os.getcwd() + '/python')
print("\nsys.path:\n{}\n".format(sys.path))

PYTHONPATH: /etc/jupyter/custom:/usr/lib/spark/python:/home/jupyter/packages:./python
CALLING initializeLogging()
first call initialize Logging
CALLING initializeLogging()
usage: geneSignatureUpsetPlot.py [-h] [-t] [-w WIDTH] [-z HEIG] -d  -o  -c  -i
geneSignatureUpsetPlot.py: error: the following arguments are required: -d/--dataSetsCSV, -o/--outputFile, -c/--numThreads, -i/--intersectionOutputFile

sys.path:
['/home/jupyter/uber/edit', '/etc/jupyter/custom', '/usr/lib/spark/python', '/home/jupyter/packages', '/opt/conda/lib/python37.zip', '/opt/conda/lib/python3.7', '/opt/conda/lib/python3.7/lib-dynload', '', '/home/jupyter/.local/lib/python3.7/site-packages', '/opt/conda/lib/python3.7/site-packages', '/opt/conda/lib/python3.7/site-packages/IPython/extensions', '/home/jupyter/.ipython', '/home/jupyter/uber/edit/python']



In [7]:
# root of the local output directory tree. It is passed to SignatureGeneConfig
# as the localCacheRoot
outDir = "output"
! mkdir -p $outDir

# sub directory of outDir, presumably for the generated plot images
outputImgDir = outDir + "/img"
! mkdir -p $outputImgDir

## configure

In [8]:
def listWorkspaceEntities():
    '''
    prints one line for each data model (entity type) in the workspace:
    the name of the data model followed by its number of rows

    uses the notebook level variables billing_project and workspace

    returns void
    '''
    entityTypesDict = fiss.fapi.list_entity_types(billing_project, workspace).json()
    for entityName, entityInfo in entityTypesDict.items():
        print(entityName, "count:", entityInfo['count'])
        
listWorkspaceEntities()        

GTEx_TCGA_1vsAll_set count: 7
GTEx_1vsAll_set count: 2
TCGA_1vsAll_set count: 2
GTEx_1vsAll count: 51
GTEx_TCGA_1vsAll count: 83
TCGA_1vsAll count: 32


In [None]:
class SignatureGeneConfig(object):
    '''
    Class used to ensure parameters for a given data model are in a consistent state

    should be treated as constant, immutable values
    '''

    def __init__(self, terraDataEntity, design, padjThreshold, lfcThreshold, n, dataOutputBucketRoot,
                localCacheRoot):
        '''
        stores the run parameters and creates the local cache directory,
        see getLocalCachedDir(), if it does not already exist

        arguments
            terraDataEntity:
                the name of a terra workspace data model returned by listWorkspaceEntities.
                Expected to have a 'candidateSignatureGeneProfile' column

            design:
                a string with the DESeq design. displayed on plots and encoded into data file names

            padjThreshold:
                float: adjusted p-value threshold. example 0.001
                findUpRegulatedSignatureGenes() selects genes with padj < padjThreshold

            lfcThreshold:
                float: log fold change threshold. example 2.0
                findUpRegulatedSignatureGenes() selects genes with log2FoldChange >= lfcThreshold

            n:
                type integer:
                The number of rows (genes) to select.

            dataOutputBucketRoot
                example: gs://fc-e15b796f-1abe-4206-ab91-bd58374cc275/data/1vsAll/{up|down|best}
                location to store genes of interest

            localCacheRoot
                example: output
                local directory. The cache directory for this configuration is
                created under it
        '''
        self.terraDataEntity = terraDataEntity
        self.design = design
        self.padjThreshold = padjThreshold
        self.lfcThreshold = lfcThreshold
        self.n = n
        self.dataOutputBucketRoot = dataOutputBucketRoot
        self.localCacheRoot = localCacheRoot

        # equivalent to "mkdir -p". os.makedirs() is used instead of an IPython
        # shell escape so that the class also works outside of a notebook and
        # the path is not subject to shell word splitting
        os.makedirs(self.getLocalCachedDir(), exist_ok=True)

    def getfileNameBase(self):
        '''
        returns a string that encodes all of the run parameters. Spaces are
        replaced with "_".

        example: GTEx_TCGA_1vsAll-design:~__gender_+_category-padj:0.001-lfc:2.0-n:25
        '''
        tmp = "{}-design:{}-padj:{}-lfc:{}-n:{}".format(
                                self.terraDataEntity,
                                self.design,
                                self.padjThreshold,
                                self.lfcThreshold,
                                self.n
                                )
        return tmp.replace(" ","_")

    def saveGenesOfInterestToBucketURL(self):
        '''
        returns the bucket URL to save the genes of interest to:
        dataOutputBucketRoot + "/" + getfileNameBase()

        does not copy any files
        '''
        return self.dataOutputBucketRoot + "/" + self.getfileNameBase()

    def getLocalCachedDir(self) :
        '''
        returns the path to the local cache directory for this configuration:
        localCacheRoot + "/" + getfileNameBase()
        '''
        return self.localCacheRoot + "/" + self.getfileNameBase()

In [9]:
# run parameters for the GTEx_TCGA_1vsAll data model: the top 25 genes
# with padj < 0.001 and log2FoldChange >= 2.0
GTExTCGA_Config_top25 = SignatureGeneConfig(
    terraDataEntity = 'GTEx_TCGA_1vsAll',
    design = "~  gender + category", 
    padjThreshold = 0.001,
    lfcThreshold = 2.0,
    n = 25,
    dataOutputBucketRoot = bucket + "data/1vsAll/up",
    localCacheRoot = outDir
    )

# set signatureGeneConfig = to the data set you want to run
# the cells that follow only use signatureGeneConfig
signatureGeneConfig = GTExTCGA_Config_top25

print( signatureGeneConfig.saveGenesOfInterestToBucketURL())
print( signatureGeneConfig.getLocalCachedDir() )

gs://fc-e15b796f-1abe-4206-ab91-bd58374cc275/data/1vsAll/up/GTEx_TCGA_1vsAll-design:~__gender_+_category-padj:0.001-lfc:2.0-n:25
output/GTEx_TCGA_1vsAll-design:~__gender_+_category-padj:0.001-lfc:2.0-n:25


In [10]:
# terraDataEntity = 'GTEx_TCGA_1vsAll'
# GTEx_TCGA_Design = "~  gender + category" 
# designStr = GTEx_TCGA_Design
# PADJ_THRESHOLD = 0.001
#lfcThreshold = 2.0
#topN = 25


#design = "~ sex + tissue_id".replace(" ", "")
# terraDataEntity = "GTEx_1vsAll"
#designStr = "~sex+tissue_id"


# outDir = "output"
# ! mkdir -p $outDir

# outputImgDir = outDir + "/img"
# ! mkdir -p $outputImgDir

# load 1vsAll results

In [11]:
def loadTerraDataModel(billingProject, workspace, modelName) :
    '''
    loads a data model (entity table) like the ones we see on the data tab of
    the terra uber workspace.

    arguments:
        billingProject:
            string, the workspace billing project (name space)

        workspace:
            string, the workspace name

        modelName:
            string, a data model name returned by listWorkspaceEntities.
            example 'GTEx_TCGA_1vsAll'

    returns a pandas dataframe
    '''
    # use the billingProject argument. The previous version ignored it and
    # silently used the notebook level variable billing_project
    response = fiss.fapi.get_entities_tsv(
                                billingProject,
                                workspace,
                                modelName,
                                model='flexible')

    # the response body is tab separated text
    ret = pd.read_csv( io.StringIO(response.text), sep='\t')
    return ret

# load the data model that has 1vsAll results
terraDataEntityDF = loadTerraDataModel(billing_project, workspace, signatureGeneConfig.terraDataEntity)
terraDataEntityDF

Unnamed: 0,entity:GTEx_TCGA_1vsAll_id,candidateSignatureGeneProfile,category,dataSet,estimatedSizeFactors
0,GTEx_Adipose_Subcutaneous,gs://fc-e15b796f-1abe-4206-ab91-bd58374cc275/f...,Adipose_Subcutaneous,GTEx,gs://fc-e15b796f-1abe-4206-ab91-bd58374cc275/f...
1,GTEx_Adipose_Visceral_Omentum,gs://fc-e15b796f-1abe-4206-ab91-bd58374cc275/f...,Adipose_Visceral_Omentum,GTEx,gs://fc-e15b796f-1abe-4206-ab91-bd58374cc275/f...
2,GTEx_Adrenal_Gland,gs://fc-e15b796f-1abe-4206-ab91-bd58374cc275/f...,Adrenal_Gland,GTEx,gs://fc-e15b796f-1abe-4206-ab91-bd58374cc275/f...
3,GTEx_Artery_Aorta,gs://fc-e15b796f-1abe-4206-ab91-bd58374cc275/f...,Artery_Aorta,GTEx,gs://fc-e15b796f-1abe-4206-ab91-bd58374cc275/f...
4,GTEx_Artery_Coronary,gs://fc-e15b796f-1abe-4206-ab91-bd58374cc275/f...,Artery_Coronary,GTEx,gs://fc-e15b796f-1abe-4206-ab91-bd58374cc275/f...
...,...,...,...,...,...
78,TCGA_THCA,gs://fc-e15b796f-1abe-4206-ab91-bd58374cc275/f...,THCA,TCGA,gs://fc-e15b796f-1abe-4206-ab91-bd58374cc275/f...
79,TCGA_THYM,gs://fc-e15b796f-1abe-4206-ab91-bd58374cc275/f...,THYM,TCGA,gs://fc-e15b796f-1abe-4206-ab91-bd58374cc275/f...
80,TCGA_UCEC,gs://fc-e15b796f-1abe-4206-ab91-bd58374cc275/f...,UCEC,TCGA,gs://fc-e15b796f-1abe-4206-ab91-bd58374cc275/f...
81,TCGA_UCS,gs://fc-e15b796f-1abe-4206-ab91-bd58374cc275/f...,UCS,TCGA,gs://fc-e15b796f-1abe-4206-ab91-bd58374cc275/f...


In [12]:
def select1vsAllResults(df):
    '''
    removes the rows that do not have a candidateSignatureGeneProfile value

    arguments:
        df:
            pandas dataframe with a 'candidateSignatureGeneProfile' column

    returns a pandas dataframe. The index and columns are unchanged
    '''
    hasProfilePS = df['candidateSignatureGeneProfile'].notnull()
    return df[hasProfilePS]

In [13]:
# clean up. remove rows that are missing 1vsAll results
terraDataEntityDF = select1vsAllResults(terraDataEntityDF)
print("{}.shape:{}".format(signatureGeneConfig.terraDataEntity, terraDataEntityDF.shape))
assert terraDataEntityDF.shape[0] == 83, "ERROR: expected 83 candidateSignatureGeneProfiles"
#display( terraDataEntityDF.head() )

print("\n")
for f in terraDataEntityDF.loc[:,'candidateSignatureGeneProfile'].to_list():
    print( f.split("/")[-1] )

terraDataEntityDF

GTEx_TCGA_1vsAll.shape:(83, 5)


Adipose_Subcutaneous_vs_all.results
Adipose_Visceral_Omentum_vs_all.results
Adrenal_Gland_vs_all.results
Artery_Aorta_vs_all.results
Artery_Coronary_vs_all.results
Artery_Tibial_vs_all.results
Bladder_vs_all.results
Brain_Amygdala_vs_all.results
Brain_Anterior_cingulate_cortex_BA24_vs_all.results
Brain_Caudate_basal_ganglia_vs_all.results
Brain_Cerebellar_Hemisphere_vs_all.results
Brain_Cerebellum_vs_all.results
Brain_Cortex_vs_all.results
Brain_Frontal_Cortex_BA9_vs_all.results
Brain_Hippocampus_vs_all.results
Brain_Hypothalamus_vs_all.results
Brain_Nucleus_accumbens_basal_ganglia_vs_all.results
Brain_Putamen_basal_ganglia_vs_all.results
Brain_Spinal_cord_cervical_c-1_vs_all.results
Brain_Substantia_nigra_vs_all.results
Breast_Mammary_Tissue_vs_all.results
Cells_Cultured_fibroblasts_vs_all.results
Cells_EBV-transformed_lymphocytes_vs_all.results
Cervix_Endocervix_vs_all.results
Colon_Sigmoid_vs_all.results
Colon_Transverse_vs_all.results
Esophagus_Gast

Unnamed: 0,entity:GTEx_TCGA_1vsAll_id,candidateSignatureGeneProfile,category,dataSet,estimatedSizeFactors
0,GTEx_Adipose_Subcutaneous,gs://fc-e15b796f-1abe-4206-ab91-bd58374cc275/f...,Adipose_Subcutaneous,GTEx,gs://fc-e15b796f-1abe-4206-ab91-bd58374cc275/f...
1,GTEx_Adipose_Visceral_Omentum,gs://fc-e15b796f-1abe-4206-ab91-bd58374cc275/f...,Adipose_Visceral_Omentum,GTEx,gs://fc-e15b796f-1abe-4206-ab91-bd58374cc275/f...
2,GTEx_Adrenal_Gland,gs://fc-e15b796f-1abe-4206-ab91-bd58374cc275/f...,Adrenal_Gland,GTEx,gs://fc-e15b796f-1abe-4206-ab91-bd58374cc275/f...
3,GTEx_Artery_Aorta,gs://fc-e15b796f-1abe-4206-ab91-bd58374cc275/f...,Artery_Aorta,GTEx,gs://fc-e15b796f-1abe-4206-ab91-bd58374cc275/f...
4,GTEx_Artery_Coronary,gs://fc-e15b796f-1abe-4206-ab91-bd58374cc275/f...,Artery_Coronary,GTEx,gs://fc-e15b796f-1abe-4206-ab91-bd58374cc275/f...
...,...,...,...,...,...
78,TCGA_THCA,gs://fc-e15b796f-1abe-4206-ab91-bd58374cc275/f...,THCA,TCGA,gs://fc-e15b796f-1abe-4206-ab91-bd58374cc275/f...
79,TCGA_THYM,gs://fc-e15b796f-1abe-4206-ab91-bd58374cc275/f...,THYM,TCGA,gs://fc-e15b796f-1abe-4206-ab91-bd58374cc275/f...
80,TCGA_UCEC,gs://fc-e15b796f-1abe-4206-ab91-bd58374cc275/f...,UCEC,TCGA,gs://fc-e15b796f-1abe-4206-ab91-bd58374cc275/f...
81,TCGA_UCS,gs://fc-e15b796f-1abe-4206-ab91-bd58374cc275/f...,UCS,TCGA,gs://fc-e15b796f-1abe-4206-ab91-bd58374cc275/f...


In [14]:
def downLoadCandidateSignatureGeneProfile(cache, dataModelDF):
    '''
    downloads candidateSignatureGeneProfile files to local disk if not already in local cache
    These files are the results created by running 1vsAll.wdl

    arguments:
        cache :
            string, path to the directory to store the files in

        dataModelDF
            pandas dataframe with a 'candidateSignatureGeneProfile' column of gs:// URLs
            example, GTEx_1vsAllDF

    returns
        list of file paths on the local machine, one for each row of dataModelDF.
        NOTE: the result of the gsutil copy is not checked. The path is
        returned even if the download failed
    '''
    retList = []
    resultsList = dataModelDF.loc[:, 'candidateSignatureGeneProfile'].to_list()
    for gsURL in resultsList:
        # the local file name is the last component of the URL
        fileName = gsURL.split("/")[-1]
        savePath = cache + "/" + fileName
        path = Path(savePath)
        if path.is_file() :
            print("skipping download of {}".format(fileName))
        else:
            # IPython shell escape. $gsURL and $savePath are replaced with the
            # values of the python variables
            !gsutil -m cp $gsURL $savePath

        retList.append(savePath)

    return retList
            
        
candidateSignatureFileList = downLoadCandidateSignatureGeneProfile(tmp, terraDataEntityDF)

skipping download of Adipose_Subcutaneous_vs_all.results
skipping download of Adipose_Visceral_Omentum_vs_all.results
skipping download of Adrenal_Gland_vs_all.results
skipping download of Artery_Aorta_vs_all.results
skipping download of Artery_Coronary_vs_all.results
skipping download of Artery_Tibial_vs_all.results
skipping download of Bladder_vs_all.results
skipping download of Brain_Amygdala_vs_all.results
skipping download of Brain_Anterior_cingulate_cortex_BA24_vs_all.results
skipping download of Brain_Caudate_basal_ganglia_vs_all.results
skipping download of Brain_Cerebellar_Hemisphere_vs_all.results
skipping download of Brain_Cerebellum_vs_all.results
skipping download of Brain_Cortex_vs_all.results
skipping download of Brain_Frontal_Cortex_BA9_vs_all.results
skipping download of Brain_Hippocampus_vs_all.results
skipping download of Brain_Hypothalamus_vs_all.results
skipping download of Brain_Nucleus_accumbens_basal_ganglia_vs_all.results
skipping download of Brain_Putamen_basa

# Find Genes of interest

In [15]:
# def findUpRegulatedSignatureGenes(deseqDF, topN=25, lfcThreshold=2.0, padjThreshold=PADJ_THRESHOLD):
#     '''
#     Find genes that that are statistically signifigant and up requlated
    
#     arguments:
#         deseqDF:
#             results of DESeq2 as a pandas dataframe 
            
#         topN:
#             The number of genes to select
#             default = 25
        
#         lfcThreshold:
#             log fold threshold. selectes genes with lfc >=
#             default = 2.0
            
#         padjThreshold:
#             p value
#             default = PADJ_THRESHOLD
    
#     return:
#         pandas dataframe
#     '''
#     selectSignificantRowsPS = deseqDF.loc[:,"padj"] < padjThreshold
# #     print("number of genes with padj < {} : {}".format(padjThreshold,
# #                                                        selectSignificantRowsPS.sum()))

#     deseqLFCSignatureGenesDF = deseqDF.loc[ selectSignificantRowsPS,: ]\
#                                         .sort_values("log2FoldChange", ascending=False)
    
#     # find the genes that are over expresed 
#     selectLFCPS = deseqLFCSignatureGenesDF.loc[:,"log2FoldChange"] >= lfcThreshold
# #     print("number of genes with log2FoldChange >= {} = {}"\
# #             .format( lfcThreshold, selectLFCPS.sum() ))
    
#     deseqBaseMeanSignatureGenesDF = deseqLFCSignatureGenesDF.loc[ selectLFCPS,: ]\
#                             .sort_values("baseMean", ascending=False)
    
#     topSignatureGenesDF = deseqBaseMeanSignatureGenesDF.head( topN )
#     return topSignatureGenesDF

In [16]:
def findUpRegulatedSignatureGenes(deseqDF, signatureGeneConfig):
    '''
    Find genes that are statistically significant and up regulated

    arguments:
        deseqDF:
            results of DESeq2 as a pandas dataframe. Must have the columns
            "padj", "log2FoldChange", and "baseMean"

        signatureGeneConfig
            contains the run parameters. Uses padjThreshold, lfcThreshold, and n

    return:
        pandas dataframe with at most signatureGeneConfig.n rows, sorted by
        baseMean in descending order
    '''
    # keep the statistically significant genes, largest fold change first
    isSignificantPS = deseqDF["padj"] < signatureGeneConfig.padjThreshold
    significantDF = deseqDF[isSignificantPS].sort_values(by="log2FoldChange", ascending=False)

    # keep the genes that are over expressed, largest baseMean first
    isUpRegulatedPS = significantDF["log2FoldChange"] >= signatureGeneConfig.lfcThreshold
    upRegulatedDF = significantDF[isUpRegulatedPS].sort_values(by="baseMean", ascending=False)

    return upRegulatedDF.head( signatureGeneConfig.n )

In [17]:
# def runFindUpRegulated( csgpFileList, outDir, topN=topN, lfcThreshold=lfcThreshold, 
#                        padjThreshold=PADJ_THRESHOLD, skipRows=7 ):
#     '''
#     finds ups up regulated genes
#     saves to disk as CSV file
    
#     arguments:
#         csgpFileList: 
#             a list of file paths to candidate signature gene files to include in upset plot
            
#         outDir:
#             String
#             path to directory to save up regulated genes
            
#         topN:
#             an integer: 
#             number of genes to select. 
        
#         lfcThreshold:
#             float: 
#             log fold change threshold. example 2.0
            
#         padjThreshold
#             float:
#             p-value threshold. example 0.001
            
#         skipRows:
#             int, default = 7
#             1vsAll returns the results from DESeq with a self describing header comprised of 7 rows
#             the lfcShrink output has 6 rows 
            
#     returns: (upRegulatedDict, outFileList)
#         upRegulatedDict : dictionary
#             key: csgpFile name
#             value: pandas dataframe
        
#     '''
#     retDict = {}
#     retOutFileList = []
#     for csgpFile in csgpFileList:
#         deseqDF = pd.read_csv(csgpFile, skiprows=skipRows)
#         topSignatureGenesDF = findUpRegulatedSignatureGenes(deseqDF,
#                                                             topN=25, 
#                                                             lfcThreshold=2.0,
#                                                             padjThreshold=PADJ_THRESHOLD)

#         fileName = csgpFile.split("/")[-1]
#         outFilePath = outDir + "/" + fileName
#         topSignatureGenesDF.to_csv(outFilePath, index=False)
#         print("saved to file: {}".format(outFilePath))

#         retDict[fileName] = topSignatureGenesDF
#         retOutFileList.append(outFilePath)

    
#     return (retDict, retOutFileList)

In [18]:
def runFindUpRegulated( signatureGeneConfig, candidateSignatureFileList,  skipRows=7 ):
    '''
    finds the up regulated genes in each candidate signature gene file and
    saves them as a CSV file in signatureGeneConfig.getLocalCachedDir()

    arguments:
        signatureGeneConfig
            contains the run parameters

        candidateSignatureFileList:
            a list of file paths to candidate signature gene files to include in upset plot

        skipRows:
            int, default = 7
            1vsAll returns the results from DESeq with a self describing header comprised of 7 rows
            the lfcShrink output has 6 rows

    returns: (upRegulatedDict, outFileList)
        upRegulatedDict : dictionary
            key: csgpFile name
            value: pandas dataframe

        outFileList : list
            the paths of the saved CSV files
    '''
    upRegulatedDict = {}
    savedFileList = []
    for resultsPath in candidateSignatureFileList:
        resultsDF = pd.read_csv(resultsPath, skiprows=skipRows)
        genesDF = findUpRegulatedSignatureGenes(resultsDF, signatureGeneConfig)

        # save using the same file name as the input file
        baseName = resultsPath.rsplit("/", 1)[-1]
        savePath = "/".join([signatureGeneConfig.getLocalCachedDir(), baseName])
        genesDF.to_csv(savePath, index=False)
        print("saved to file: {}".format(savePath))

        upRegulatedDict[baseName] = genesDF
        savedFileList.append(savePath)

    return (upRegulatedDict, savedFileList)

upRegulatedDict, outFileList =  runFindUpRegulated(signatureGeneConfig, candidateSignatureFileList, skipRows=7)

saved to file: output/GTEx_TCGA_1vsAll-design:~__gender_+_category-padj:0.001-lfc:2.0-n:25/Adipose_Subcutaneous_vs_all.results
saved to file: output/GTEx_TCGA_1vsAll-design:~__gender_+_category-padj:0.001-lfc:2.0-n:25/Adipose_Visceral_Omentum_vs_all.results
saved to file: output/GTEx_TCGA_1vsAll-design:~__gender_+_category-padj:0.001-lfc:2.0-n:25/Adrenal_Gland_vs_all.results
saved to file: output/GTEx_TCGA_1vsAll-design:~__gender_+_category-padj:0.001-lfc:2.0-n:25/Artery_Aorta_vs_all.results
saved to file: output/GTEx_TCGA_1vsAll-design:~__gender_+_category-padj:0.001-lfc:2.0-n:25/Artery_Coronary_vs_all.results
saved to file: output/GTEx_TCGA_1vsAll-design:~__gender_+_category-padj:0.001-lfc:2.0-n:25/Artery_Tibial_vs_all.results
saved to file: output/GTEx_TCGA_1vsAll-design:~__gender_+_category-padj:0.001-lfc:2.0-n:25/Bladder_vs_all.results
saved to file: output/GTEx_TCGA_1vsAll-design:~__gender_+_category-padj:0.001-lfc:2.0-n:25/Brain_Amygdala_vs_all.results
saved to file: output/GTEx

In [28]:
%%time
# save to long term storage
URL_ROOT = signatureGeneConfig.saveGenesOfInterestToBucketURL() 
print("saving to: {}".format(url))
print()

for f in outFileList:
    url = URL_ROOT + "/" + f
    ! echo gsutil -m cp $f $url

saving to: gs://fc-e15b796f-1abe-4206-ab91-bd58374cc275/data/1vsAll/up/GTEx_TCGA_1vsAll-design:~__gender_+_category-padj:0.001-lfc:2.0-n:25/output/GTEx_TCGA_1vsAll-design:~__gender_+_category-padj:0.001-lfc:2.0-n:25/Adrenal_Gland_vs_all.results

gsutil -m cp output/GTEx_TCGA_1vsAll-design:~__gender_+_category-padj:0.001-lfc:2.0-n:25/Adipose_Subcutaneous_vs_all.results gs://fc-e15b796f-1abe-4206-ab91-bd58374cc275/data/1vsAll/up/GTEx_TCGA_1vsAll-design:~__gender_+_category-padj:0.001-lfc:2.0-n:25/output/GTEx_TCGA_1vsAll-design:~__gender_+_category-padj:0.001-lfc:2.0-n:25/Adipose_Subcutaneous_vs_all.results
gsutil -m cp output/GTEx_TCGA_1vsAll-design:~__gender_+_category-padj:0.001-lfc:2.0-n:25/Adipose_Visceral_Omentum_vs_all.results gs://fc-e15b796f-1abe-4206-ab91-bd58374cc275/data/1vsAll/up/GTEx_TCGA_1vsAll-design:~__gender_+_category-padj:0.001-lfc:2.0-n:25/output/GTEx_TCGA_1vsAll-design:~__gender_+_category-padj:0.001-lfc:2.0-n:25/Adipose_Visceral_Omentum_vs_all.results
gsutil -m cp o

# Explore the genes of interest

In [20]:
# show the first two genes of interest for each 1vsAll results file
for key, df in upRegulatedDict.items():
    print("\n" + key)
    display(df.head(n=2))


Adipose_Subcutaneous_vs_all.results


Unnamed: 0,name,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
2631,SCD,19314.422968,2.339338,0.101717,22.998473,4.8282020000000005e-117,7.658534e-116
888,FASN,16799.560621,2.530867,0.080808,31.319477,2.534326e-215,1.190164e-213



Adipose_Visceral_Omentum_vs_all.results


Unnamed: 0,name,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
2583,C3,33138.986228,2.237056,0.114028,19.618497,1.074844e-85,1.643047e-84
1837,SCD,19314.422968,2.4487,0.1123,21.804963,2.081741e-105,4.4738189999999997e-104



Adrenal_Gland_vs_all.results


Unnamed: 0,name,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
2668,APOE,22484.798309,2.944127,0.173451,16.973774,1.2840709999999998e-64,1.899646e-63
8890,H19,18991.694669,2.015694,0.204351,9.863875,5.969993000000001e-23,2.651279e-22



Artery_Aorta_vs_all.results


Unnamed: 0,name,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
6545,FN1,122392.921176,2.090274,0.147289,14.191672,1.031686e-45,6.394212e-45
3574,TAGLN,59329.90116,2.302641,0.123165,18.69565,5.371221e-78,6.0955469999999996e-77



Artery_Coronary_vs_all.results


Unnamed: 0,name,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
2072,TAGLN,59329.90116,2.211401,0.165311,13.377238,8.214294e-41,1.4247620000000001e-39
1906,ACTA2,35825.191865,2.357846,0.171441,13.753063,4.881781e-43,9.204474e-42



Artery_Tibial_vs_all.results


Unnamed: 0,name,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
1949,FLNA,94255.813936,2.527528,0.082492,30.639826,3.6107000000000006e-206,7.702457000000001e-205
7692,MYH11,76286.18537,2.426667,0.135169,17.952818,4.561615e-72,2.466581e-71



Bladder_vs_all.results


Unnamed: 0,name,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
920,PLAT,2646.820238,2.229408,0.526635,4.233311,2.3e-05,0.000729
664,TNFRSF21,2461.767052,2.148223,0.468446,4.585845,5e-06,0.000198



Brain_Amygdala_vs_all.results


Unnamed: 0,name,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
1506,MT-RNR2,774541.966032,2.601088,0.16993,15.306792,6.887865e-53,1.8461029999999998e-51
1159,MT-RNR1,140332.123897,2.944454,0.180956,16.271639,1.569016e-59,5.463286e-58



Brain_Anterior_cingulate_cortex_BA24_vs_all.results


Unnamed: 0,name,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
4174,MT-RNR2,774541.966032,2.111731,0.158137,13.353771,1.126008e-40,1.1218280000000001e-39
3891,MT-RNR1,140332.123897,2.314849,0.168681,13.72326,7.36765e-43,7.874034e-42



Brain_Caudate_basal_ganglia_vs_all.results


Unnamed: 0,name,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
2664,MT-RNR2,774541.966032,2.221928,0.133572,16.634744,3.904184e-62,6.097998e-61
2362,MT-RNR1,140332.123897,2.446625,0.142396,17.181795,3.6348880000000004e-66,6.402972e-65



Brain_Cerebellar_Hemisphere_vs_all.results


Unnamed: 0,name,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
11600,MEG3,17225.356894,2.0799,0.186994,11.122801,9.717218e-29,3.536679e-28
10892,CHGB,12600.515595,3.717206,0.317075,11.723427,9.667983e-32,3.747464e-31



Brain_Cerebellum_vs_all.results


Unnamed: 0,name,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
10179,MEG3,17225.356894,2.271109,0.176804,12.845359,9.132765999999999e-38,3.906366e-37
397,PKD1,9250.982787,3.247662,0.090071,36.056875,1.0761619999999999e-284,1.17737e-282



Brain_Cortex_vs_all.results


Unnamed: 0,name,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
16177,GFAP,53598.504524,2.075486,0.319555,6.494932,8.307084e-11,2.13084e-10
629,CALM3,14031.222097,2.111779,0.081638,25.867598,1.542532e-147,1.016063e-145



Brain_Frontal_Cortex_BA9_vs_all.results


Unnamed: 0,name,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
1206,CALM1,23054.194866,2.051947,0.093827,21.869388,5.083444e-106,1.748208e-104
282,CALM3,14031.222097,2.453894,0.089202,27.509263,1.3603199999999997e-166,1.995249e-164



Brain_Hippocampus_vs_all.results


Unnamed: 0,name,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
1629,MT-RNR2,774541.966032,2.514353,0.149156,16.857198,9.288996999999999e-64,2.3723980000000003e-62
1604,MT-RNR1,140332.123897,2.692087,0.159136,16.916941,3.375136e-64,8.754323999999999e-63



Brain_Hypothalamus_vs_all.results


Unnamed: 0,name,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
3231,MT-RNR2,774541.966032,2.080632,0.148028,14.055695,7.108111e-45,8.664769e-44
2387,MT-RNR1,140332.123897,2.452351,0.157598,15.560849,1.3430600000000002e-54,2.215823e-53



Brain_Nucleus_accumbens_basal_ganglia_vs_all.results


Unnamed: 0,name,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
2889,MT-RNR2,774541.966032,2.293501,0.133893,17.12932,8.969736000000001e-66,1.2916419999999998e-64
1975,MT-RNR1,140332.123897,2.716199,0.142298,19.08813,3.1691340000000003e-81,6.674428e-80



Brain_Putamen_basal_ganglia_vs_all.results


Unnamed: 0,name,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
2127,MT-RNR2,774541.966032,2.441662,0.146223,16.698226,1.350191e-62,2.562888e-61
1879,MT-ND2,549933.369686,2.083296,0.120939,17.225951,1.696175e-66,3.6443409999999997e-65



Brain_Spinal_cord_cervical_c-1_vs_all.results


Unnamed: 0,name,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
579,MBP,102582.48943,5.751208,0.281764,20.411467,1.322537e-92,8.871489e-91
3441,GFAP,53598.504524,5.17657,0.39984,12.946594,2.456124e-38,2.776234e-37



Brain_Substantia_nigra_vs_all.results


Unnamed: 0,name,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
2386,MT-RNR2,774541.966032,2.303201,0.177454,12.979169,1.6061469999999997e-38,2.705015e-37
2201,MT-RNR1,140332.123897,2.517651,0.189282,13.301081,2.281469e-40,4.165182e-39



Breast_Mammary_Tissue_vs_all.results


Unnamed: 0,name,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
3975,KRT17,19620.982687,2.558719,0.1956,13.081352,4.208869e-39,4.063527e-38
712,FASN,16799.560621,2.162057,0.098509,21.947918,9.065866e-107,4.880945e-105



Cells_Cultured_fibroblasts_vs_all.results


Unnamed: 0,name,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
934,FN1,122392.921176,5.481103,0.120644,45.432152,0.0,0.0
3111,COL1A1,64196.089643,4.121512,0.118529,34.772059,6.433917e-265,8.895841e-264



Cells_EBV-transformed_lymphocytes_vs_all.results


Unnamed: 0,name,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
14844,IGHG1,57607.938748,2.809384,0.28942,9.706958,2.816172e-22,7.726309e-22
9114,CD74,46946.692376,2.203075,0.146816,15.005695,6.738131e-51,3.0107579999999995e-50



Cervix_Endocervix_vs_all.results


Unnamed: 0,name,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
286,COL27A1,2425.663537,2.708469,0.603903,4.484942,7.293377e-06,0.000654803
21,MMP11,2190.739043,5.042047,0.724873,6.95577,3.506418e-12,4.106813e-09



Colon_Sigmoid_vs_all.results


Unnamed: 0,name,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
1245,FLNA,94255.813936,2.123448,0.111491,19.045834,7.114389e-81,2.2434309999999998e-79
7315,DES,89881.666167,2.129629,0.207991,10.239065,1.325106e-24,7.116559e-24



Colon_Transverse_vs_all.results


Unnamed: 0,name,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
1584,IGKC,45823.345403,3.217246,0.185982,17.298707,4.8106810000000003e-67,1.168221e-65
1163,IGHA1,25987.995442,3.574244,0.189841,18.827537,4.491674e-79,1.485262e-77



Esophagus_Gastroesophageal_Junction_vs_all.results


Unnamed: 0,name,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
1764,FLNA,94255.813936,2.101194,0.111282,18.881779,1.6106929999999998e-79,3.4739869999999996e-78
2925,MYH11,76286.18537,2.820204,0.178221,15.824241,2.1173699999999998e-56,2.754752e-55



Esophagus_Mucosa_vs_all.results


Unnamed: 0,name,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
136,KRT13,59347.437558,8.692863,0.21378,40.662723,0.0,0.0
9947,S100A9,51087.401729,2.554602,0.17817,14.33798,1.2668860000000001e-46,5.3302700000000004e-46



Esophagus_Muscularis_vs_all.results


Unnamed: 0,name,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
1703,FLNA,94255.813936,2.196068,0.09469,23.192091,5.471906000000001e-119,1.261043e-117
6910,DES,89881.666167,2.2846,0.177467,12.873385,6.355425999999999e-38,3.6113089999999997e-37



Heart_Atrial_Appendage_vs_all.results


Unnamed: 0,name,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
774,GSN,38976.202224,2.056743,0.079494,25.873046,1.339461e-147,7.066305e-146
1616,TPM1,34273.982714,2.366993,0.108421,21.831398,1.1679420000000001e-105,2.953081e-104



Heart_Left_Ventricle_vs_all.results


Unnamed: 0,name,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
384,MT-CO1,1346880.0,2.914182,0.089488,32.564884,1.2890869999999999e-232,1.409559e-230
191,MT-ND4,947920.9,2.86131,0.080816,35.40532,1.414157e-274,3.1006860000000003e-272



Kidney_Cortex_vs_all.results


Unnamed: 0,name,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
1281,GPX3,40969.907575,3.021156,0.280362,10.775905,4.473596e-27,1.242731e-25
7170,CD24,17904.273532,2.016594,0.351054,5.744395,9.225014e-09,4.581439e-08



Liver_vs_all.results


Unnamed: 0,name,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
490,ALB,65671.457318,10.332808,0.326319,31.664725,4.75558e-220,4.2455410000000003e-218
1267,SERPINA1,41268.947751,6.748775,0.25734,26.225179,1.372162e-151,4.743483e-150



Lung_vs_all.results


Unnamed: 0,name,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
133,A2M,34166.126479,3.092962,0.083189,37.179964,1.438447e-302,4.276588e-300
169,EPAS1,14686.625943,2.501564,0.068715,36.404955,3.5538660000000003e-290,8.328381999999999e-288



Minor_Salivary_Gland_vs_all.results


Unnamed: 0,name,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
2344,IGKC,45823.345403,3.240115,0.294353,11.007597,3.512478e-28,5.459544e-27
11310,KRT4,27073.675686,2.436097,0.453319,5.373908,7.704819e-08,2.48283e-07



Muscle_Skeletal_vs_all.results


Unnamed: 0,name,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
1626,GAPDH,107019.507739,2.807605,0.044461,63.14704,0.0,0.0
4017,TPT1,97157.171974,2.320934,0.033631,69.012239,0.0,0.0



Nerve_Tibial_vs_all.results


Unnamed: 0,name,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
434,APOD,17203.670074,3.866154,0.107956,35.812198,7.133304e-281,6.668082e-279
1604,XIST,11814.52676,2.804947,0.106782,26.267922,4.461501e-152,1.13033e-150



Ovary_vs_all.results


Unnamed: 0,name,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
2572,IGFBP5,30404.286687,2.46909,0.168917,14.617211,2.1815649999999998e-48,3.245469e-47
3865,MEG3,17225.356894,2.619489,0.205919,12.72095,4.523268e-37,4.478574e-36



Pancreas_vs_all.results


Unnamed: 0,name,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
83,PRSS2,57033.109761,13.095619,0.274639,47.683054,0.0,0.0
82,PRSS1,50785.445354,13.357119,0.279956,47.711487,0.0,0.0



Pituitary_vs_all.results


Unnamed: 0,name,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
189,GNAS,47530.818642,2.062121,0.070634,29.194343,2.2879509999999998e-187,4.8099959999999997e-185
1286,MEG3,17225.356894,3.328224,0.161011,20.670787,6.347275e-95,1.969973e-93



Prostate_vs_all.results


Unnamed: 0,name,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
6,KLK3,16009.618654,11.95335,0.337302,35.438082,4.426827e-275,2.426723e-271
4519,AZGP1,6663.49556,2.453399,0.246099,9.96917,2.079588e-23,1.765487e-22



Skin_Not_Sun_Exposed_Suprapubic_vs_all.results


Unnamed: 0,name,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
6863,KRT5,48084.270387,3.376393,0.192296,17.558349,5.134707000000001e-69,3.251682e-68
1365,KRT10,37525.178564,4.260657,0.146132,29.156184,6.974404e-187,2.219351e-185



Skin_Sun_Exposed_Lower_leg_vs_all.results


Unnamed: 0,name,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
7419,KRT5,48084.270387,3.178158,0.179384,17.717095,3.094849e-70,1.7586000000000002e-69
404,KRT10,37525.178564,4.938355,0.132141,37.371976,1.114858e-305,1.160636e-303



Small_Intestine_Terminal_Ileum_vs_all.results


Unnamed: 0,name,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
5450,IGKC,45823.345403,2.300687,0.274952,8.367585,5.881132000000001e-17,4.189835e-16
4964,IGHA1,25987.995442,2.453427,0.281589,8.712786,2.964967e-18,2.3190640000000003e-17



Spleen_vs_all.results


Unnamed: 0,name,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
1080,IGHG1,57607.938748,5.450848,0.237393,22.96128,1.136774e-116,4.215948e-115
3280,CD74,46946.692376,2.079366,0.124801,16.661472,2.4979990000000003e-62,3.05234e-61



Stomach_vs_all.results


Unnamed: 0,name,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
15,PGA3,81308.952124,12.501654,0.255367,48.955704,0.0,0.0
18,PGC,33550.512407,11.242033,0.260141,43.215183,0.0,0.0



Testis_vs_all.results


Unnamed: 0,name,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
16768,PTGDS,12880.588442,2.102604,0.155689,13.505199,1.457188e-41,5.150516e-41
15773,XIST,11814.52676,2.039538,0.142287,14.333985,1.341938e-46,5.042349e-46



Thyroid_vs_all.results


Unnamed: 0,name,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
274,TG,43695.53656,10.887768,0.150421,72.382063,0.0,0.0
4502,CD24,17904.273532,2.478858,0.128623,19.272293,9.177894e-83,8.283748e-82



Uterus_vs_all.results


Unnamed: 0,name,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
2689,IGFBP5,30404.286687,2.013928,0.19062,10.565136,4.323424e-26,5.958611e-25
307,PBX1,3723.74297,2.322476,0.128112,18.12852,1.8980030000000002e-73,2.284629e-71



Vagina_vs_all.results


Unnamed: 0,name,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
4553,KRT13,59347.437558,3.170509,0.450681,7.034925,1.993679e-12,1.582422e-11
7030,KRT5,48084.270387,2.179236,0.380668,5.724774,1.035714e-08,5.324549e-08



Whole_Blood_vs_all.results


Unnamed: 0,name,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
3030,HBB,985081.56977,11.377987,0.110899,102.597364,0.0,0.0
3029,HBA2,939295.036781,11.396289,0.106056,107.455747,0.0,0.0



ACC_vs_all.results


Unnamed: 0,name,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
1526,MT-CO1,1346880.0,2.399273,0.215615,11.127558,9.212518e-29,2.2279570000000002e-27
1165,MT-ND4,947920.9,2.352362,0.196427,11.975747,4.761281e-33,1.5079700000000002e-31



BLCA_vs_all.results


Unnamed: 0,name,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
1986,H19,18991.694669,2.431156,0.165621,14.679029,8.783892e-49,1.70709e-47
782,FBLN1,14411.907169,2.203683,0.118181,18.646675,1.343793e-77,6.627322e-76



BRCA_vs_all.results


Unnamed: 0,name,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
137,MGP,22954.639209,3.575263,0.093918,38.067969,0.0,0.0
223,XBP1,9075.728796,2.923984,0.062057,47.117811,0.0,0.0



CESC_vs_all.results


Unnamed: 0,name,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
3850,S100A9,51087.401729,2.802561,0.246247,11.381099,5.194087e-30,5.059751e-29
5100,KRT5,48084.270387,2.838869,0.278809,10.182141,2.382604e-24,1.7522250000000003e-23



CHOL_vs_all.results


Unnamed: 0,name,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
671,AQP1,14328.468422,2.53529,0.36755,6.897814,5.280881e-12,2.667002e-10
3727,SPP1,11663.014318,2.271747,0.538284,4.220352,2.439217e-05,0.0002220551



COAD_vs_all.results


Unnamed: 0,name,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
7413,PIGR,8819.197109,2.827562,0.291555,9.698213,3.0682550000000003e-22,1.622856e-21
2492,FCGBP,5921.834991,2.994425,0.195771,15.295509,8.191778000000001e-53,1.288537e-51



DLBC_vs_all.results


Unnamed: 0,name,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
4191,CD74,46946.692376,2.435293,0.27905,8.727098,2.612903e-18,2.397613e-17
939,RPS19,24775.496871,2.127333,0.160933,13.218741,6.839904e-40,2.7989759999999997e-38



ESCA_vs_all.results


Unnamed: 0,name,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
236,MALAT1,33662.063256,3.41139,0.111355,30.635216,4.159132e-206,7.230045e-204
10588,SPRR3,14643.990289,3.261258,0.455026,7.167196,7.654947e-13,2.978337e-12



GBM_vs_all.results


Unnamed: 0,name,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
3564,CLU,56070.78888,2.094595,0.195185,10.731307,7.256270000000001e-27,8.045402999999999e-26
6198,GFAP,53598.504524,3.495422,0.41433,8.436317,3.274942e-17,2.087942e-16



HNSC_vs_all.results


Unnamed: 0,name,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
8172,KRT13,59347.437558,3.472354,0.251711,13.79503,2.7304070000000003e-43,1.352909e-42
3177,S100A9,51087.401729,3.833441,0.186068,20.602356,2.614124e-94,3.331157e-93



KICH_vs_all.results


Unnamed: 0,name,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
4100,MT-CO1,1346880.0,2.132551,0.239512,8.903749,5.399131e-19,5.029051e-18
2598,MT-ND4,947920.9,2.270156,0.218074,10.410019,2.23177e-25,3.280162e-24



KIRC_vs_all.results


Unnamed: 0,name,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
1361,GPX3,40969.907575,2.720334,0.114798,23.696769,3.893103e-124,1.182337e-122
926,TNS1,26144.983512,2.206154,0.085005,25.953082,1.678315e-148,7.48887e-147



KIRP_vs_all.results


Unnamed: 0,name,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
959,FHL1,28068.059244,2.892166,0.152145,19.009334,1.427541e-80,5.800278e-79
524,CRYAB,21594.014674,3.560525,0.167178,21.297821,1.1892689999999999e-100,8.835927e-99



LGG_vs_all.results


Unnamed: 0,name,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
1400,MBP,102582.48943,5.266625,0.15783,33.369064,3.8544770000000006e-244,1.219786e-242
6294,CLU,56070.78888,2.172112,0.107044,20.291691,1.5226839999999998e-91,1.0724339999999998e-90



LIHC_vs_all.results


Unnamed: 0,name,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
542,FTL,101725.794849,2.816165,0.086002,32.745221,3.551182e-235,2.815047e-233
23,ALB,65671.457318,10.348104,0.271729,38.08239,0.0,0.0



LUAD_vs_all.results


Unnamed: 0,name,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
2345,IGHA1,25987.995442,2.265104,0.173259,13.073548,4.6638170000000005e-39,7.564885999999999e-38
174,SFTPB,14421.44464,6.019867,0.241647,24.91181,5.541549e-137,1.2049859999999999e-134



LUSC_vs_all.results


Unnamed: 0,name,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
4421,IGHG3,9899.278605,2.064312,0.173642,11.888314,1.361275e-32,1.230473e-31
726,GPNMB,8647.086557,2.068357,0.102532,20.172835,1.696307e-90,9.326421e-89



MESO_vs_all.results


Unnamed: 0,name,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
2363,C3,33138.986228,2.144622,0.283119,7.574992,3.591496e-14,5.45956e-13
1007,COL6A1,29599.580298,2.098197,0.212662,9.866348,5.824664e-23,2.076539e-21



OV_vs_all.results


Unnamed: 0,name,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
430,LSU-rRNA,432623.999254,3.099496,0.130274,23.792062,4.0354780000000005e-125,3.881642e-123
1325,SSU-rRNA,245645.018737,2.552032,0.141378,18.051069,7.737910000000001e-73,2.419235e-71



PAAD_vs_all.results


Unnamed: 0,name,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
1102,REG1A,26610.922181,4.781937,0.496257,9.636004,5.6338720000000005e-22,1.888956e-20
2235,LYZ,17275.659268,2.27153,0.27951,8.126834,4.406469e-16,7.288015e-15



PCPG_vs_all.results


Unnamed: 0,name,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
49,GNAS,47530.818642,2.934325,0.086357,33.979123,4.532256e-253,3.484217e-250
768,CALM1,23054.194866,2.202899,0.101754,21.649303,6.170653e-104,3.084363e-102



PRAD_vs_all.results


Unnamed: 0,name,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
7701,DES,89881.666167,2.494366,0.188778,13.213254,7.357424e-40,4.089952e-39
2348,FASN,16799.560621,2.154881,0.099439,21.670459,3.898543e-104,7.105838e-103



READ_vs_all.results


Unnamed: 0,name,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
10643,PIGR,8819.197109,2.263653,0.486304,4.654815,3.242721e-06,1.127823e-05
8400,IGHA2,7640.204928,2.331596,0.429211,5.432286,5.563669e-08,2.451696e-07



SARC_vs_all.results


Unnamed: 0,name,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
3240,FLNA,94255.813936,2.0493,0.13527,15.149655,7.61655e-52,9.648882999999999e-51
5213,DES,89881.666167,2.966324,0.250519,11.840692,2.4046140000000001e-32,1.89353e-31



SKCM_vs_all.results


Unnamed: 0,name,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
2013,VIM,71448.838704,2.08538,0.19039,10.9532,6.414112e-28,1.2176939999999999e-26
2065,KRT10,37525.178564,3.931369,0.362007,10.859919,1.789082e-27,3.311015e-26



STAD_vs_all.results


Unnamed: 0,name,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
2109,LSU-rRNA,432623.999254,2.313742,0.114982,20.122683,4.671034e-90,9.507215e-89
908,SSU-rRNA,245645.018737,3.208417,0.121439,26.420084,8.05542e-154,3.8058090000000002e-152



TGCT_vs_all.results


Unnamed: 0,name,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
96,SVA,18758.575047,2.599498,0.094009,27.651715,2.661397e-168,1.131505e-165
2341,MEG3,17225.356894,3.220742,0.238593,13.498918,1.5869019999999999e-41,2.794357e-40



THCA_vs_all.results


Unnamed: 0,name,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
3699,FN1,122392.921176,2.557947,0.139062,18.394281,1.459847e-75,1.637515e-74
3341,CLU,56070.78888,2.094799,0.109254,19.173627,6.146998e-82,7.633718e-81



THYM_vs_all.results


Unnamed: 0,name,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
343,GAS6,9361.489576,2.816359,0.152059,18.521508,1.385016e-76,1.598123e-74
4738,ARHGDIB,7775.381549,2.060777,0.221962,9.284384,1.626456e-20,1.36229e-19



UCEC_vs_all.results


Unnamed: 0,name,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
765,FBLN1,14411.907169,2.397763,0.178757,13.413559,5.036298e-41,2.404865e-39
3240,SLPI,5385.050153,2.402001,0.258488,9.292496,1.507125e-20,1.7008979999999998e-19



UCS_vs_all.results


Unnamed: 0,name,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
5811,DES,89881.666167,2.486514,0.532799,4.666894,3.057872e-06,1.965572e-05
374,ACTC1,14193.843595,5.841343,0.580326,10.065615,7.839607e-24,7.81013e-22



UVM_vs_all.results


Unnamed: 0,name,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
152,EEF2,80410.686591,2.12597,0.095457,22.271588,6.968822e-110,1.678347e-107
1736,VIM,71448.838704,3.094284,0.21469,14.412812,4.298477e-47,9.118612e-46


In [21]:
aediwp move to bucket

SyntaxError: invalid syntax (3367799331.py, line 1)

## create dataSet.csv
This file is an argument to plots/geneSignatureUpsetPlot.py. It defines the set of candidate genes to include
in the upset plot.

In [None]:
def createDataSetCSV(csgpFileList, numHeaderLines=8):
    """
    Build the data-set table that is saved as dataSet.csv.

    One row is produced per entry of csgpFileList, in the same order, with
    the columns:
        setName        - the text after the last "/" of the path
        numHeaderLines - the value of the numHeaderLines argument
        filePath       - the path exactly as given

    arguments:
        csgpFileList:   iterable of file path strings
        numHeaderLines: value written to the numHeaderLines column of every
                        row. Defaults to 8, the value that used to be
                        hard coded here.
                        NOTE(review): presumably the number of header lines
                        the plotting code skips in each file - confirm.

    returns:
        a pandas DataFrame with a 0..n-1 index, or None if csgpFileList is
        empty (unchanged from the original behavior)
    """
    # collect all rows first and build the frame once; calling pd.concat()
    # inside the loop re-copies every earlier row on each iteration
    rows = [
        {
            "setName": filePath.split("/")[-1],
            "numHeaderLines": numHeaderLines,
            "filePath": filePath,
        }
        for filePath in csgpFileList
    ]

    if not rows:
        return None

    return pd.DataFrame(rows, columns=["setName", "numHeaderLines", "filePath"])
        
        
# Build the data-set table from outFileList and save it as <outDir>/dataSet.csv.
dataSetDF = createDataSetCSV(outFileList)
dataSetPath = outDir + "/dataSet.csv"
# index=False: write only the table columns, not the row index
dataSetDF.to_csv(dataSetPath , index=False)
# last expression of the cell, so the notebook renders the table
dataSetDF

## Create Upset Plot

In [None]:
%%time
# Plot title; printed so it is recorded in the cell output.
title = "{} topN={} Signature Genes, padj < {} lf2c > {} sorted by baseMean".format(
    terraDataEntity, topN, PADJ_THRESHOLD, lfcThreshold
)
print("title: {}".format(title))

# Base name shared by both output files; spaces are replaced with "_".
name = terraDataEntity + "-" + designStr
baseName = name.replace(" ", "_")

outputImg = outputImgDir + "/" + baseName + ".png"
print("outputImg: {}".format(outputImg))

IntersectionOutFile = "{}/{}Insersections.csv".format(outDir, baseName)
print("IntersectionOutFile: {}".format(IntersectionOutFile))
print()

# Leave a cpu for the OS. os.cpu_count() returns None when the count cannot
# be determined, and a one-cpu machine would otherwise give 0 threads, so
# never go below 1.
numCPU = max(1, (os.cpu_count() or 1) - 1)

# Notes on figure size (width, height):
# width default 8, height default 3
# width default 10, height default 4
# 10,4 bad image
# 15,10 better, intersection is black
# 20, 15 can almost read the set names, intersections are black
# 25, 20
# 40 40
# 50 50
#
# Equivalent command line invocation, kept for reference:
# ! python python/plots/geneSignatureUpsetPlot.py \
#     -t "$title" \
#     -d "$dataSetPath" \
#     -o "$outputImg" \
#     --width 75 \
#     --height 75 \
#     --numThreads "$numCPU" \
#     --intersectionOutputFile="$IntersectionOutFile"

In [None]:
%%time

from plots import geneSignatureUpsetPlot as gs

# Read the data-set table back from dataSetPath and hand it to
# GeneSignatureUpsetPlot. gsup is reused by the plotting cells that follow.
dataSetsDF = pd.read_csv( dataSetPath )
gsup = gs.GeneSignatureUpsetPlot(dataSetsDF, numThreads=numCPU)

In [None]:
%%time 
def save():
    """
    Draw the upset plot with gsup.plot() and write it to outputImg.

    Reads the notebook globals gsup, title and outputImg and prints
    progress messages. Returns None.
    """
    print("\n BEGIN PLOT")

    # https://upsetplot.readthedocs.io/en/stable/api.html#upsetplot.plot
    widthInInches, heightInInches = 75, 75

    # gsup.plot() returns two values; only the figure is used here
    figure, _subPlots = gsup.plot(widthInInches, heightInInches)

    # arial is not installed on courtyard, default font is huge
    figure.suptitle(title, fontsize=8)
    figure.savefig(outputImg, dpi=300, bbox_inches='tight')

    print("saved plot: {}".format(outputImg))
    print("END PLOT\n")


save()

In [None]:
%%time
import upsetplot as upsp

plotData = gsup.getUpSetPlotData()
# Author's notes: number of intersections by degree (number of sets in the
# intersection):
# 2       398
# 3       621
# 4      1058
# 5      1615
# 6      1894
# 7      1632
# 8      1017
# 9       448
# 10      133
# 11       24
# 12        2
# number of intersections created by 8 or more sets: 1017 + 448 + 133 + 24 + 2 = 1624
# min_degree=8: only plot intersections of at least 8 sets
upsp.plot( plotData.plotData, show_counts=True, min_degree=8)

# Save the intersection table as csv; index=False leaves out the row index.
intersectionDF = gsup.getIntersectionDF()
intersectionDF.to_csv(IntersectionOutFile, index=False)
print("\n*************** wrote file: {}".format(IntersectionOutFile))

In [None]:
%%time
# Convert the gene sets into the data format upsetplot plots.
# presumably geneSetsDict maps set name -> collection of genes; confirm
# against GeneSignatureUpsetPlot.getGeneSets()
geneSetsDict = gsup.getGeneSets()
from upsetplot import from_contents
geneSetsUpsetPlotData = from_contents(geneSetsDict)

from upsetplot import UpSet
#plt = UpSet(geneSetsUpsetPlotData, subset_size='count', min_degree=8).plot()
# NOTE: plt holds the return value of UpSet.plot() here; a later cell rebinds
# plt to matplotlib.pyplot.
plt = UpSet(geneSetsUpsetPlotData, show_counts=True, min_degree=8).plot()

In [None]:
%%time
# Same data, limited to intersections of at most 7 sets.
#plt = UpSet(geneSetsUpsetPlotData, subset_size='count', max_degree=7).plot()
plt = UpSet(geneSetsUpsetPlotData, show_counts=True, max_degree=7).plot()

In [None]:
# Intersections of at most 2 sets.
plt = UpSet(geneSetsUpsetPlotData, show_counts=True, max_degree=2).plot()

In [None]:
# Intersections of 3 to 5 sets.
plt = UpSet(geneSetsUpsetPlotData, show_counts=True, min_degree=3, max_degree=5).plot()

In [None]:
# Intersections of 6 or 7 sets.
plt = UpSet(geneSetsUpsetPlotData, show_counts=True, min_degree=6, max_degree=7).plot()

In [None]:
%%time
from matplotlib import pyplot as plt

# Draw the upset plot into a figure whose size we choose ourselves.
fig = plt.figure(figsize=(10,3))

# UpSet.plot() is an instance method that only takes the figure; the display
# options (element_size, subset_size, max_degree) are arguments of the UpSet
# constructor. Calling UpSet.plot(data, fig, element_size=..., ...) on the
# class raises a TypeError.
# element_size=None lets upsetplot size the elements to fit fig.
UpSet(geneSetsUpsetPlotData, element_size=None, subset_size='count', max_degree=7).plot(fig=fig)

## explore the intersection

In [None]:
# Fetch the plot data again and display its plotData attribute.
plotData = gsup.getUpSetPlotData()
plotData.plotData

In [None]:
# Display the intersectionDict attribute of the plot data.
plotData.intersectionDict

In [None]:
# Number of entries in intersectionDict.
print(len(plotData.intersectionDict))

In [None]:
aedwip

In [None]:
# Check that both output files exist. IPython replaces $name with the value
# of the Python variable before running the shell command.
! ls -l $IntersectionOutFile
! ls -l $outputImg

In [None]:
# Show the saved plot image inline in the notebook.
display( Image(filename=outputImg) )

## Save

In [None]:
# Copy the plot image and the intersection csv to the upsetPlots/ folder of
# the workspace bucket. bucket already ends with "/".
target = bucket + "upsetPlots/"
print(target)
! gsutil -m cp $outputImg $IntersectionOutFile $target

## Explore the intersections

In [None]:
adwip wrong file

# NOTE(review): this reads a hard-coded file name from the current directory,
# not IntersectionOutFile, which is the file this notebook writes. The
# "wrong file" marker at the top of this cell presumably refers to this;
# confirm which file (and which columns) should be used before enabling the cell.
! ls -l geneSignatureUpsetPlot.Intersection.csv
intersectionDF = pd.read_csv("geneSignatureUpsetPlot.Intersection.csv")
# show only the columns of interest
intersectionDF.loc[:, ['tissueId', 'gene', 'log2FoldChange', 'baseMean', 'padj']]