# Enrich ESCA
Andrew E. Davidson  
aedavids@ucsc.edu  

Copyright (c) 2020-2023, Regents of the University of California All rights reserved. https://polyformproject.org/licenses/noncommercial/1.0.0

search for ESCA biomarkers that are only shared with one other class

ref: extraCellularRNA/intraExtraRNA_POC/adenocarcinoma.vs.control/README.md

In [1]:
import ipynbname
from IPython.display import display
from IPython.display import Image
import numpy as np
import pathlib as pl
import pandas as pd
import os
import sys

notebookName = ipynbname.name()
notebookPath = ipynbname.path()
notebookDir = os.path.dirname(notebookPath)

outDir = f'{notebookDir}/{notebookName}.out'
os.makedirs(outDir, exist_ok=True)
print(f'outDir:\n{outDir}')

imgOut = f'{outDir}/img'
os.makedirs(imgOut, exist_ok=True)
print(f'\nimgOut :\n{imgOut}')

outDir:
/private/home/aedavids/extraCellularRNA/intraExtraRNA_POC/adenocarcinoma.vs.control/enrichESCA.out

imgOut :
/private/home/aedavids/extraCellularRNA/intraExtraRNA_POC/adenocarcinoma.vs.control/enrichESCA.out/img


Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
# setting the python path allows us to run python scripts from using
# the CLI. 
PYTHONPATH = os.environ['PYTHONPATH']
print("ORIG_PYTHONPATH: {}\n".format(PYTHONPATH))

gitRepoRoot = !git rev-parse --show-toplevel
gitRepoRoot = gitRepoRoot[0]


#
# add deseq modules
#
deseqModules = f'{gitRepoRoot}/terra/deseq/python'
# deseqModules = f'{gitRepoRoot}/terra'
print("deseqModules: {}\n".format(deseqModules))

PYTHONPATH = PYTHONPATH + f':{deseqModules}'
print("PYTHONPATH: {}\n".format(PYTHONPATH))

#
# add deconvolutionAnalysis modules
#
deconvolutionModules = f'{gitRepoRoot}/deconvolutionAnalysis/python'
print("deconvolutionModules: {}\n".format(deconvolutionModules))

PYTHONPATH = PYTHONPATH + f':{deconvolutionModules}'
print("PYTHONPATH: {}\n".format(PYTHONPATH))


#
# to be able to import our local python files we need to set the sys.path
# https://stackoverflow.com/a/50155834
#
sys.path.append( str(deseqModules) )
sys.path.append( str(deconvolutionModules) )
print("\nsys.path:\n{}\n".format(sys.path))

ORIG_PYTHONPATH: :/private/home/aedavids/extraCellularRNA/src

deseqModules: /private/home/aedavids/extraCellularRNA/terra/deseq/python

PYTHONPATH: :/private/home/aedavids/extraCellularRNA/src:/private/home/aedavids/extraCellularRNA/terra/deseq/python

deconvolutionModules: /private/home/aedavids/extraCellularRNA/deconvolutionAnalysis/python

PYTHONPATH: :/private/home/aedavids/extraCellularRNA/src:/private/home/aedavids/extraCellularRNA/terra/deseq/python:/private/home/aedavids/extraCellularRNA/deconvolutionAnalysis/python


sys.path:
['/private/home/aedavids/extraCellularRNA/intraExtraRNA_POC/adenocarcinoma.vs.control', '/private/home/aedavids/extraCellularRNA/intraExtraRNA_POC/adenocarcinoma.vs.control', '/private/home/aedavids/extraCellularRNA/src', '/private/home/aedavids/miniconda3/envs/extraCellularRNA/lib/python311.zip', '/private/home/aedavids/miniconda3/envs/extraCellularRNA/lib/python3.11', '/private/home/aedavids/miniconda3/envs/extraCellularRNA/lib/python3.11/lib-dynload'

In [28]:
# local imports
from analysis.bestSignatureGeneConfig import BestSignatureGeneConfig
from analysis.utilities import findIntersectionsWithDegree, loadDictionary
from pipeline.dataFactory.driver import _countExtraHeaderLines

In [6]:
deconRoot="/private/groups/kimlab/aedavids/deconvolution/1vsAll-~gender_category/best500FindAllDegree1_wl500/training"
intersectionDictionaryPath = f"{deconRoot}/best500FindAllDegree1_wl500.sh.out/upsetPlot.out/best500_findAllDegree1_wl500.intersection.dict"
print(f'loading:\n{intersectionDictionaryPath}' )
intersectionDict = loadDictionary(intersectionDictionaryPath)
degreeDict = findIntersectionsWithDegree( intersectionDict, 2)

loading:
/private/groups/kimlab/aedavids/deconvolution/1vsAll-~gender_category/best500FindAllDegree1_wl500/training/best500FindAllDegree1_wl500.sh.out/upsetPlot.out/best500_findAllDegree1_wl500.intersection.dict


In [14]:
ESCA_IntersectionDict = dict()
candidateGeneSet = set()
for key in degreeDict.keys():
    if 'ESCA' in key:
        ESCA_IntersectionDict[key] = degreeDict[key]
        candidateGeneSet.update( degreeDict[key] )

In [15]:
ESCA_IntersectionDict.keys()

dict_keys([('BLCA', 'ESCA'), ('CESC', 'ESCA'), ('CHOL', 'ESCA'), ('COAD', 'ESCA'), ('ESCA', 'GBM'), ('ESCA', 'KIRP'), ('ESCA', 'LUSC'), ('ESCA', 'OV'), ('ESCA', 'PRAD'), ('ESCA', 'STAD'), ('ESCA', 'TGCT'), ('ESCA', 'UCEC'), ('ESCA', 'UCS')])

In [16]:
candidateGeneSet

{'(CCAT)n',
 'AL160408.2',
 'AS3MT',
 'CELF6',
 'ENPP5',
 'FAT3',
 'GPIHBP1',
 'GPR173',
 'HSPB2',
 'KCNQ1OT1',
 'L1M1',
 'L1M2',
 'L1M4a1',
 'L1M4a2',
 'L1M4c',
 'L1M6',
 'L1M7',
 'L1MA2',
 'L1MA3',
 'L1MA4A',
 'L1MA5',
 'L1MCa',
 'L1MEi',
 'L1PA8A',
 'L1PB3',
 'L1PB4',
 'L1PBa',
 'LINC01002',
 'MAGEH1',
 'MEOX2',
 'MUC17',
 'NALCN',
 'PLPP1',
 'PLPP7',
 'PNMA8B',
 'PSMC1P1',
 'RPL22P1',
 'RTL5',
 'SAMD9',
 'SLC13A3',
 'SLC16A4',
 'SLC9A3',
 'SPOCK3',
 'YPEL4',
 'ZNF667'}

# evaluate DESeq results for candidate genes

In [23]:
deseqResultsPath = "/private/groups/kimlab/GTEx_TCGA/1vsAll/ESCA_vs_all.results"
numRowsToSkip = _countExtraHeaderLines(deseqResultsPath)
print(f"runSelectGenesOfInterest() numRowsToSkip: {numRowsToSkip}")
deseqDF = pd.read_csv(deseqResultsPath, skiprows=numRowsToSkip, index_col='name')
print(f'deseqDF.shape : {deseqDF.shape}')
deseqDF.head()

runSelectGenesOfInterest() numRowsToSkip: 7
deseqDF.shape : (74777, 6)


Unnamed: 0_level_0,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
(ATGTGC)n,0.596123,2.900568,0.056265,51.552292,0.0,0.0
(TACATA)n,0.285152,2.894953,0.062715,46.16019,0.0,0.0
(TATG)n,0.546036,3.397431,0.044634,76.117174,0.0,0.0
AC005096.1,0.456616,3.018803,0.05878,51.357275,0.0,0.0
AC005776.1,0.764986,2.401486,0.054634,43.956061,0.0,0.0


In [31]:
candidateDF = deseqDF.loc[ list(candidateGeneSet) , :]
print(f'candidateDF.shape : {candidateDF.shape}')
candidateDF

candidateDF.shape : (45, 6)


Unnamed: 0_level_0,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
HSPB2,703.213665,-3.130593,0.167985,-18.636189,1.634798e-77,5.2866580000000005e-76
PSMC1P1,523.196804,-6.297372,0.147768,-42.616615,0.0,0.0
(CCAT)n,755.411436,-2.155953,0.135462,-15.915561,4.9421649999999996e-57,9.613421e-56
FAT3,382.707236,-2.273265,0.2322,-9.790113,1.241576e-22,7.809420000000001e-22
GPIHBP1,575.63202,-3.24488,0.187535,-17.302783,4.482072e-67,1.1363499999999999e-65
L1PBa,888.002853,2.218696,0.083756,26.49014,1.259051e-154,1.232105e-152
L1MA3,3599.073077,2.136612,0.070743,30.202315,2.208483e-200,3.596336e-198
GPR173,468.726977,-2.001261,0.163526,-12.238177,1.943499e-34,1.907796e-33
SLC16A4,435.466506,-2.312013,0.149096,-15.506919,3.114679e-54,5.660418e-53
L1M4c,1356.364089,3.138658,0.088652,35.40417,1.4729679999999999e-274,3.590817e-272


In [30]:
hack = BestSignatureGeneConfig(
        dataSetName=None, 
        design=None, 
        padjThreshold=0.001, 
        lfcThreshold=2.0, 
        n=9999, 
        localCacheRootPath=None, 
        title=None,
        )

significantDF = hack._select(candidateDF, fileName=None)
print(f'significantDF.shape : {significantDF.shape}')
significantDF

significantDF.shape : (45, 7)


Unnamed: 0_level_0,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj,absLog2FoldChange
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
L1MA3,3599.073077,2.136612,0.070743,30.202315,2.208483e-200,3.596336e-198,2.136612
PLPP1,3225.374676,-2.044375,0.12262,-16.672393,2.080946e-62,4.669548e-61,2.044375
L1M1,2886.792373,2.196612,0.073438,29.911042,1.413868e-196,2.1735060000000002e-194,2.196612
LINC01002,2323.659751,2.246173,0.18301,12.273488,1.2571779999999999e-34,1.242074e-33,2.246173
L1MA4A,2312.761788,2.000856,0.070513,28.375838,4.0184990000000005e-177,5.109819000000001e-175,2.000856
L1MA2,2148.67886,2.291392,0.075538,30.334452,4.029077e-202,6.693304e-200,2.291392
L1M2,1999.265599,2.028193,0.073894,27.447383,7.465685e-166,8.268245999999999e-164,2.028193
L1MCa,1901.333189,2.128278,0.074944,28.3982,2.1282830000000003e-177,2.714648e-175,2.128278
L1PB4,1444.575147,2.040922,0.074833,27.272871,8.901228999999999e-164,9.727367e-162,2.040922
L1M4c,1356.364089,3.138658,0.088652,35.40417,1.4729679999999999e-274,3.590817e-272,3.138658
