# Find Candidate Enrichment Biomarkers
Andrew E. Davidson  
aedavids@ucsc.edu   
5/16/24  

Copyright (c) 2020-2023, Regents of the University of California All rights reserved. https://polyformproject.org/licenses/noncommercial/1.0.0

ref: deconvolutionAnalysis/doc/addDegree2Genes.md

<span style="color:red;background-color:yellow">('ESCA', 'STAD') : len(v) = 167</span>  
STAD is 'stomach adenocarcinoma'

**top 2 candidate genes based on differential expression**  
ANKRD36C, FGF19, AL031708.1

<span style="color:red;background-color:yellow">TODO look at misclassification error metrics</span>

In [1]:
import ipynbname

# use display() to print an html version of a data frame
# useful if dataFrame output is not generated by the last line of the cell
from IPython.display import display

import numpy as np
import os
import pandas as pd

import pprint as pp
import matplotlib.pyplot as plt

import sys

# resolve this notebook's name and location; the output directory and the
# logger below are named after the notebook
notebookName = ipynbname.name()
notebookPath = ipynbname.path()
notebookDir = os.path.dirname(notebookPath)

# everything this notebook writes goes under outDir
# NOTE(review): absolute, site specific path. The notebook relative
# alternative is kept commented out below
#outDir = f'{notebookDir}/{notebookName}.out'
outDir = f'/private/groups/kimlab/aedavids/elife/{notebookName}.out'
os.makedirs(outDir, exist_ok=True)
print(f'outDir:\n{outDir}')

# results of hyperparameter search
#hyperparameterOut = "/private/groups/kimlab/aedavids/elife/hyperparmeterTunning"

# sub directory of outDir for images
imgOut = f'{outDir}/img'
os.makedirs(imgOut, exist_ok=True)
print(f'\nimgOut :\n{imgOut}')

# configure the root logger and create a logger named after the notebook
import logging
loglevel = "INFO"
#loglevel = "WARN"
# logFMT = "%(asctime)s %(levelname)s [thr:%(threadName)s %(name)s %(funcName)s() line:%(lineno)s] [%(message)s]"
# NOTE(review): the ']' after line:%(lineno)s has no matching '['. It looks
# like a left over from the thread name variant above
logFMT = "%(asctime)s %(levelname)s %(name)s %(funcName)s() line:%(lineno)s] [%(message)s]"
logging.basicConfig(format=logFMT, level=loglevel)    
logger = logging.getLogger(notebookName)

# NOTE(review): not referenced in the visible part of this notebook
meaningOfLife = 42

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


outDir:
/private/groups/kimlab/aedavids/elife/findCandidateEnrichmentBiomarkers.out

imgOut :
/private/groups/kimlab/aedavids/elife/findCandidateEnrichmentBiomarkers.out/img


In [2]:
# setting the python path allows us to run python scripts from using
# the CLI. 
# use get(): os.environ['PYTHONPATH'] raises KeyError if the kernel was
# started without PYTHONPATH
ORIG_PYTHONPATH = os.environ.get('PYTHONPATH', '')

####### config deconvolutionModules
deconvolutionModules = notebookPath.parent.joinpath("../../../deconvolutionAnalysis/python/")
print("deconvolutionModules: {}\n".format(deconvolutionModules))

# only append a directory that is not already listed. This cell writes
# PYTHONPATH back to os.environ, so without the check every re-run of the
# cell would append the same directories again
PYTHONPATH = ORIG_PYTHONPATH
if str(deconvolutionModules) not in PYTHONPATH.split(':'):
    # avoid a leading ':' (an empty search path entry) when PYTHONPATH was empty
    PYTHONPATH = f'{PYTHONPATH}:{deconvolutionModules}' if PYTHONPATH else str(deconvolutionModules)
print("PYTHONPATH: {}\n".format(PYTHONPATH))

##### config intraExtraRNA_POCModules
intraExtraRNA_POCModules=notebookPath.parent.joinpath("../../python/src")
print("intraExtraRNA_POCModules: {}\n".format(intraExtraRNA_POCModules))

if str(intraExtraRNA_POCModules) not in PYTHONPATH.split(':'):
    PYTHONPATH = f'{PYTHONPATH}:{intraExtraRNA_POCModules}' if PYTHONPATH else str(intraExtraRNA_POCModules)
print("PYTHONPATH: {}\n".format(PYTHONPATH))

###### set new PYTHONPATH
os.environ["PYTHONPATH"] = PYTHONPATH
PYTHONPATH = os.environ["PYTHONPATH"]
print("PYTHONPATH: {}\n".format(PYTHONPATH))

###### set sys.path
# to be able to import our local python files we need to set the sys.path
# https://stackoverflow.com/a/50155834
# guard against duplicates so re-running the cell does not grow sys.path
for modulesDir in ( str(deconvolutionModules), str(intraExtraRNA_POCModules) ):
    if modulesDir not in sys.path:
        sys.path.append( modulesDir )
print("\nsys.path:\n{}\n".format(sys.path))

deconvolutionModules: /private/home/aedavids/extraCellularRNA/deconvolutionAnalysis/jupyterNotebooks/hyperParameterTunning/../../../deconvolutionAnalysis/python

PYTHONPATH: :/private/home/aedavids/extraCellularRNA/src:/private/home/aedavids/extraCellularRNA/deconvolutionAnalysis/jupyterNotebooks/hyperParameterTunning/../../../deconvolutionAnalysis/python

intraExtraRNA_POCModules: /private/home/aedavids/extraCellularRNA/deconvolutionAnalysis/jupyterNotebooks/hyperParameterTunning/../../python/src

PYTHONPATH: :/private/home/aedavids/extraCellularRNA/src:/private/home/aedavids/extraCellularRNA/deconvolutionAnalysis/jupyterNotebooks/hyperParameterTunning/../../../deconvolutionAnalysis/python:/private/home/aedavids/extraCellularRNA/deconvolutionAnalysis/jupyterNotebooks/hyperParameterTunning/../../python/src

PYTHONPATH: :/private/home/aedavids/extraCellularRNA/src:/private/home/aedavids/extraCellularRNA/deconvolutionAnalysis/jupyterNotebooks/hyperParameterTunning/../../../deconvolutionA

In [3]:
# import local 
from analysis.utilities import findIntersectionsWithDegree
from analysis.utilities import loadDictionary

# Find Candidate ESCA Degree 2 Genes
Avoid Genes shared between ESCA and Esophagus

In [4]:
# locate the output of the upstream run whose intersection dictionary we analyze
upstreamRunName = "best500LFC_FindAllDegree1_wl500"
upstreamRoot = f'/private/groups/kimlab/aedavids/deconvolution/1vsAll-~gender_category/{upstreamRunName}' 
print(f'upstreamRoot:\n{upstreamRoot}')

# upstreamOut is reused later to locate the differential expression results
upstreamOut = f'{upstreamRoot}/training/{upstreamRunName}.sh.out'
print(f'\nupstreamOut:\n{upstreamOut}')

upsetPlotOut=f'{upstreamOut}/upsetPlot.out'

# note: the file name spells 'findAll' with a lower case 'f', 
# upstreamRunName spells it 'FindAll'
best500LFC_findAllDegree1_wl500Path = f'{upsetPlotOut}/best500LFC_findAllDegree1_wl500.intersection.dict'
print(f'\nbest500LFC_findAllDegree1_wl500Path :\n{best500LFC_findAllDegree1_wl500Path}')
# loadDictionary() is a project helper (analysis.utilities)
# presumably returns a dict mapping a tuple of class names to a list of gene names; verify
best500LFC_findAllDegree1_wl500_intersectionDict = loadDictionary( best500LFC_findAllDegree1_wl500Path)

upstreamRoot:
/private/groups/kimlab/aedavids/deconvolution/1vsAll-~gender_category/best500LFC_FindAllDegree1_wl500

upstreamOut:
/private/groups/kimlab/aedavids/deconvolution/1vsAll-~gender_category/best500LFC_FindAllDegree1_wl500/training/best500LFC_FindAllDegree1_wl500.sh.out

best500LFC_findAllDegree1_wl500Path :
/private/groups/kimlab/aedavids/deconvolution/1vsAll-~gender_category/best500LFC_FindAllDegree1_wl500/training/best500LFC_FindAllDegree1_wl500.sh.out/upsetPlot.out/best500LFC_findAllDegree1_wl500.intersection.dict


In [5]:
# findIntersectionsWithDegree() is a project helper (analysis.utilities)
# presumably it keeps the intersections shared by exactly `degree` classes; verify
degree2Dict = findIntersectionsWithDegree(
                    best500LFC_findAllDegree1_wl500_intersectionDict, 
                    degree=2)

In [6]:
def findBiomarkers(
    intersectionDict : dict[ tuple[str, ...], list[str] ], 
    setName: str) -> dict[ tuple[str, ...], list[str] ]:
    '''
    Select the intersections that involve a given class.

    arguments
        intersectionDict
            key: multi index (tuple) of GTEx, or TCGA classes
                example:  ('Liver', 'PRAD', 'UVM')
                
            values: list of biomarkers
                example : ['ABCC11'],
        setName :
            a GTEx, or TCGA class
                example : 'Liver'

    returns
        a new dictionary holding only the entries of intersectionDict whose
        key contains setName. Entries keep the order of intersectionDict.
        The value lists are the same objects as in intersectionDict, they
        are not copied. intersectionDict itself is not modified.
    '''
    # keys are tuples, so 'in' is an exact match on a class name,
    # not a sub string match
    return { key : values
                for key, values in intersectionDict.items()
                if setName in key }

In [7]:
# keep only the degree 2 intersections that ESCA takes part in
ESCAIntersectionDict = findBiomarkers(degree2Dict, "ESCA")

def viewDict( intersectionDict ) :
    '''
    Print one line per entry of intersectionDict.

    Entries with fewer than 5 values are printed in full. For longer
    entries only the number of values is printed.
    '''
    for setNames, genes in intersectionDict.items():
        numGenes = len(genes)
        if numGenes >= 5:
            # too long to read, print the size only
            print(f'{setNames} : len(v) = {numGenes}')
            continue

        print(f'{setNames!s} : {genes}')

# list the classes that share genes with ESCA
viewDict( ESCAIntersectionDict )

('ESCA', 'Esophagus_Mucosa') : ['TMPRSS11BNL', 'KRT24', '(TGGCCC)n']
('ESCA', 'Lung') : ['AC108058.1']
('ESCA', 'Minor_Salivary_Gland') : ['ANKRD36C']
('ESCA', 'OV') : ['GAL3ST2', 'AC142086.6', 'AL031708.1']
('ESCA', 'PAAD') : ['FGF19']
('ESCA', 'Prostate') : ['(GTGTCGT)n']
('ESCA', 'SARC') : ['AC002550.1', 'Arthur1A']
('ESCA', 'STAD') : len(v) = 167
('ESCA', 'Testis') : ['AC007598.2']


## Compare Differential Expression Values

In [8]:
def findCandidateBiomarkers(
    intersectionDict : dict[ tuple[str, ...], list[str] ],
    ignore : list[str],
    ) -> list[str]:
    '''
    Collect the biomarkers of every intersection that does not involve
    an ignored class.

    arguments
        intersectionDict
            key: tuple of GTEx, or TCGA classes
                example: ('ESCA', 'Lung')

            values: the biomarkers shared by the classes in the key
                example: ['AC108058.1']

        ignore
            class names. An intersection is skipped if its key contains
            any of these names
                example: ['Esophagus_Mucosa', 'STAD']

    returns
        a new list with the biomarkers of the intersections that were kept,
        in the order of intersectionDict. Duplicates are not removed.

    side effects
        prints one line for each intersection that is kept
    '''
    retList = []
    ignoreSet = set(ignore)
    #t is tuple of set names
    for t,v in intersectionDict.items():
        setName = set(t)
        if setName.isdisjoint(ignoreSet):
            print(f'adding biomarkers from {setName}')
            # extend() grows the list in place, 'retList + v' rebuilt the
            # whole list on every pass and failed if v was not a list
            retList.extend(v)

    return retList

# genes ESCA shares with exactly one other class, skipping the
# intersections with Esophagus_Mucosa and STAD
allESCA_sharedGenes = findCandidateBiomarkers( ESCAIntersectionDict, ignore=['Esophagus_Mucosa', 'STAD'])
allESCA_sharedGenes

adding biomarkers from {'Lung', 'ESCA'}
adding biomarkers from {'Minor_Salivary_Gland', 'ESCA'}
adding biomarkers from {'OV', 'ESCA'}
adding biomarkers from {'PAAD', 'ESCA'}
adding biomarkers from {'Prostate', 'ESCA'}
adding biomarkers from {'SARC', 'ESCA'}
adding biomarkers from {'Testis', 'ESCA'}


['AC108058.1',
 'ANKRD36C',
 'GAL3ST2',
 'AC142086.6',
 'AL031708.1',
 'FGF19',
 '(GTGTCGT)n',
 'AC002550.1',
 'Arthur1A',
 'AC007598.2']

In [34]:
# it is faster to load these results, the results files are only 500 lines long
# directory holding one '<class>_vs_all.results' file per class
deseqResultsDir= f'{upstreamOut}/GTEx_TCGA-design-tilda_gender_category-padj-0001-lfc-20-n-500'
print(f'{deseqResultsDir}')

def loadResultsAndSelect(path : str, names : list[str], index_col : str="name") -> pd.DataFrame:
    '''
    Load a results file and return only the requested rows.

    arguments
        path
            csv file to read

        names
            labels of the rows to select. They are looked up in the
            index_col column
                example: ['ANKRD36C', 'FGF19']

        index_col
            name of the column that holds the labels. default "name"

    returns
        a data frame with one row per entry in names, in the order of
        names. index_col is returned as an ordinary column, the index
        is 0, 1, 2, ...

    raises
        KeyError if a label in names is not found in index_col
    '''
    # bug fix: the index_col argument used to be ignored, "name" was hard coded
    df = pd.read_csv(path, index_col=index_col)
    #print(f'geneNames : {names}')

    # reset_index() moves the labels back from the index into a column
    resultsDF = df.loc[names, :].reset_index()

    return resultsDF
    
def xxx(
    intersectionDict : dict[ tuple[str, ...], list[str] ], 
    rootSetName : str, 
    sharedGenes : list[str],
    ignore : list[str] = []
    ) -> pd.DataFrame:
    '''
    Build a table that lets us compare how each shared gene is expressed
    in rootSetName and in the class it is shared with.

    Results are read from f'{deseqResultsDir}/<class>_vs_all.results'.
    deseqResultsDir is a notebook level variable.

    arguments
        intersectionDict
            key: tuple of GTEx, or TCGA classes
                example: ('ESCA', 'Lung')

            values: the genes shared by the classes in the key
                example: ['AC108058.1']

        rootSetName
            the class all the intersections have in common
                example: 'ESCA'

        sharedGenes
            the genes to select from the rootSetName results file

        ignore
            class names whose results files should not be loaded.
            The list is only read, never modified

    returns
        a data frame. The first rows are the sharedGenes rows from the
        rootSetName results. They are followed, for every class in every
        key that is neither rootSetName nor in ignore, by the rows for
        that key's genes from the class's results file.
        The column 'source' is the class the row was loaded from.
        The column 'index' is the row's position within its source.

    side effects
        prints one line for each class results file that is loaded
    '''
    # bug fix: the old 'foundSet' was filled with gene names, not class
    # names. The only class it ever excluded on purpose was rootSetName,
    # and a class whose name matched a gene name was skipped by accident
    skipSetNames = {rootSetName}.union(ignore)

    # select all the shared genes from the root class
    resultsPath = f'{deseqResultsDir}/{rootSetName}_vs_all.results'
    rootDF = loadResultsAndSelect( resultsPath,  sharedGenes)
    rootDF['source'] = rootSetName

    # collect the frames and concatenate once instead of re-copying the
    # growing data frame on every pass
    frames = [rootDF]
    for keys,geneNames in intersectionDict.items():
        for k in keys:
            if k in skipSetNames:
                continue

            resultsPath = f'{deseqResultsDir}/{k}_vs_all.results'
            resultsDF = loadResultsAndSelect( resultsPath, geneNames )
            resultsDF['source'] = k
            print(f'k : {k} geneNames : {geneNames} ')
            frames.append( resultsDF )

    # every frame is indexed 0, 1, 2, ... reset_index() keeps that
    # per source position as the 'index' column
    return pd.concat(frames).reset_index()

# load the results for the genes ESCA shares with one other class.
# each gene is loaded twice, once from ESCA and once from the other class
resultsDF = xxx( ESCAIntersectionDict, 'ESCA', allESCA_sharedGenes, ignore=['Esophagus_Mucosa', 'STAD'])
# sorting by name puts the two rows of a gene next to each other
resultsDF.sort_values(by=["name", ])

# the intersections that were ignored:
# ('ESCA', 'Esophagus_Mucosa') : ['TMPRSS11BNL', 'KRT24', '(TGGCCC)n']
# ('ESCA', 'STAD') : len(v) = 167

/private/groups/kimlab/aedavids/deconvolution/1vsAll-~gender_category/best500LFC_FindAllDegree1_wl500/training/best500LFC_FindAllDegree1_wl500.sh.out/GTEx_TCGA-design-tilda_gender_category-padj-0001-lfc-20-n-500
k : Lung geneNames : ['AC108058.1'] 
k : Minor_Salivary_Gland geneNames : ['ANKRD36C'] 
k : OV geneNames : ['GAL3ST2', 'AC142086.6', 'AL031708.1'] 
k : PAAD geneNames : ['FGF19'] 
k : Prostate geneNames : ['(GTGTCGT)n'] 
k : SARC geneNames : ['AC002550.1', 'Arthur1A'] 
k : Testis geneNames : ['AC007598.2'] 


Unnamed: 0,index,name,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj,source
16,0,(GTGTCGT)n,16.358871,2.187128,0.184329,11.865321,1.792136e-32,2.540437e-31,Prostate
6,6,(GTGTCGT)n,16.358871,2.411707,0.213385,11.302161,1.280276e-29,1.0666550000000001e-28,ESCA
17,0,AC002550.1,4.448281,3.673153,0.477803,7.687587,1.499355e-14,6.005317e-14,SARC
7,7,AC002550.1,4.448281,2.195868,0.571838,3.840017,0.0001230258,0.0002642618,ESCA
9,9,AC007598.2,6.156776,2.15603,0.533844,4.038687,5.375122e-05,0.0001194958,ESCA
19,0,AC007598.2,6.156776,9.989467,0.178072,56.098004,0.0,0.0,Testis
10,0,AC108058.1,25.07793,-2.159156,0.103184,-20.925289,3.151465e-97,5.683621e-96,Lung
0,0,AC108058.1,25.07793,2.305431,0.174739,13.193567,9.555485e-40,1.125111e-38,ESCA
3,3,AC142086.6,16.552706,2.424101,0.176463,13.73714,6.083157e-43,7.92349e-42,ESCA
13,1,AC142086.6,16.552706,2.686089,0.141511,18.98148,2.426653e-80,9.314977e-79,OV


In [35]:
# summary statistics of the numeric columns
resultsDF.describe()

Unnamed: 0,index,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
count,20.0,20.0,20.0,20.0,20.0,20.0,20.0
mean,2.45,44.626028,2.682294,0.240287,15.265873,8.841716e-06,1.920367e-05
std,2.999561,81.022432,2.058142,0.171177,15.026279,2.943417e-05,6.35519e-05
min,0.0,4.448281,-2.159156,0.07524,-20.925289,0.0,0.0
25%,0.0,11.117586,2.182306,0.114449,7.325459,3.63586e-111,2.124735e-109
50%,1.0,17.881229,2.417904,0.177267,13.465354,3.041578e-43,3.961745e-42
75%,4.25,25.295868,2.810186,0.300877,22.804978,3.748515e-15,1.501399e-14
max,9.0,279.47476,9.989467,0.571838,56.098004,0.0001230258,0.0002642618


In [40]:
# rank the candidate genes, largest baseMean first
resultsDF.sort_values(by="baseMean", ascending=False)

Unnamed: 0,index,name,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj,source
1,1,ANKRD36C,279.47476,2.05924,0.115702,17.797833,7.345638e-71,2.0503590000000002e-69,ESCA
11,0,ANKRD36C,279.47476,3.042776,0.119519,25.458607,5.6686719999999995e-143,8.264697000000001e-140,Minor_Salivary_Gland
15,0,FGF19,42.567752,2.757725,0.508007,5.428522,5.682257e-08,3.143004e-07,PAAD
5,5,FGF19,42.567752,3.124223,0.500751,6.239072,4.401744e-10,1.446267e-09,ESCA
14,2,AL031708.1,25.295868,2.175672,0.090271,24.101505,2.4105360000000004e-128,2.449353e-126,OV
4,4,AL031708.1,25.295868,2.661881,0.110691,24.047776,8.807299000000001e-128,6.234569000000001e-126,ESCA
0,0,AC108058.1,25.07793,2.305431,0.174739,13.193567,9.555485e-40,1.125111e-38,ESCA
10,0,AC108058.1,25.07793,-2.159156,0.103184,-20.925289,3.151465e-97,5.683621e-96,Lung
12,0,GAL3ST2,19.209752,2.967567,0.19398,15.298298,7.8482720000000005e-53,1.357388e-51,OV
2,2,GAL3ST2,19.209752,2.184517,0.241902,9.030578,1.7076709999999997e-19,9.327100999999998e-19,ESCA


In [None]:
# the three genes with the largest baseMean in the table above.
# stored as strings: the bare names were not valid python
# ('AL031708.1' is a syntax error), so Run All failed on this cell
candidateBiomarkers = ['ANKRD36C', 'FGF19', 'AL031708.1']
candidateBiomarkers