# Enrich ESCA
Andrew E. Davidson  
aedavids@ucsc.edu  

Copyright (c) 2020-2023, Regents of the University of California All rights reserved. https://polyformproject.org/licenses/noncommercial/1.0.0

search for ESCA biomarkers that are only shared with one other class

ref: extraCellularRNA/intraExtraRNA_POC/adenocarcinoma.vs.control/README.md

In [1]:
import ipynbname
from IPython.display import display
from IPython.display import Image
import numpy as np
import pathlib as pl
import pandas as pd
import os
import sys

notebookName = ipynbname.name()
notebookPath = ipynbname.path()
notebookDir = os.path.dirname(notebookPath)

outDir = f'{notebookDir}/{notebookName}.out'
os.makedirs(outDir, exist_ok=True)
print(f'outDir:\n{outDir}')

imgOut = f'{outDir}/img'
os.makedirs(imgOut, exist_ok=True)
print(f'\nimgOut :\n{imgOut}')

outDir:
/private/home/aedavids/extraCellularRNA/intraExtraRNA_POC/adenocarcinoma.vs.control/enrichESCA.out

imgOut :
/private/home/aedavids/extraCellularRNA/intraExtraRNA_POC/adenocarcinoma.vs.control/enrichESCA.out/img


Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
# setting the python path allows us to run python scripts from using
# the CLI. 
PYTHONPATH = os.environ['PYTHONPATH']
print("ORIG_PYTHONPATH: {}\n".format(PYTHONPATH))

gitRepoRoot = !git rev-parse --show-toplevel
gitRepoRoot = gitRepoRoot[0]


#
# add deseq modules
#
deseqModules = f'{gitRepoRoot}/terra/deseq/python'
# deseqModules = f'{gitRepoRoot}/terra'
print("deseqModules: {}\n".format(deseqModules))

PYTHONPATH = PYTHONPATH + f':{deseqModules}'
print("PYTHONPATH: {}\n".format(PYTHONPATH))

#
# add deconvolutionAnalysis modules
#
deconvolutionModules = f'{gitRepoRoot}/deconvolutionAnalysis/python'
print("deconvolutionModules: {}\n".format(deconvolutionModules))

PYTHONPATH = PYTHONPATH + f':{deconvolutionModules}'
print("PYTHONPATH: {}\n".format(PYTHONPATH))


#
# to be able to import our local python files we need to set the sys.path
# https://stackoverflow.com/a/50155834
#
sys.path.append( str(deseqModules) )
sys.path.append( str(deconvolutionModules) )
print("\nsys.path:\n{}\n".format(sys.path))

ORIG_PYTHONPATH: :/private/home/aedavids/extraCellularRNA/src

deseqModules: /private/home/aedavids/extraCellularRNA/terra/deseq/python

PYTHONPATH: :/private/home/aedavids/extraCellularRNA/src:/private/home/aedavids/extraCellularRNA/terra/deseq/python

deconvolutionModules: /private/home/aedavids/extraCellularRNA/deconvolutionAnalysis/python

PYTHONPATH: :/private/home/aedavids/extraCellularRNA/src:/private/home/aedavids/extraCellularRNA/terra/deseq/python:/private/home/aedavids/extraCellularRNA/deconvolutionAnalysis/python


sys.path:
['/private/home/aedavids/extraCellularRNA/intraExtraRNA_POC/adenocarcinoma.vs.control', '/private/home/aedavids/extraCellularRNA/intraExtraRNA_POC/adenocarcinoma.vs.control', '/private/home/aedavids/extraCellularRNA/src', '/private/home/aedavids/miniconda3/envs/extraCellularRNA/lib/python311.zip', '/private/home/aedavids/miniconda3/envs/extraCellularRNA/lib/python3.11', '/private/home/aedavids/miniconda3/envs/extraCellularRNA/lib/python3.11/lib-dynload'

In [3]:
# local imports
from analysis.bestSignatureGeneConfig import BestSignatureGeneConfig
from analysis.utilities import findIntersectionsWithDegree, loadDictionary
from pipeline.dataFactory.driver import _countExtraHeaderLines

In [4]:
deconRoot="/private/groups/kimlab/aedavids/deconvolution/1vsAll-~gender_category/best500FindAllDegree1_wl500/training"
intersectionDictionaryPath = f"{deconRoot}/best500FindAllDegree1_wl500.sh.out/upsetPlot.out/best500_findAllDegree1_wl500.intersection.dict"
print(f'loading:\n{intersectionDictionaryPath}' )
intersectionDict = loadDictionary(intersectionDictionaryPath)
degreeDict = findIntersectionsWithDegree( intersectionDict, 2)

loading:
/private/groups/kimlab/aedavids/deconvolution/1vsAll-~gender_category/best500FindAllDegree1_wl500/training/best500FindAllDegree1_wl500.sh.out/upsetPlot.out/best500_findAllDegree1_wl500.intersection.dict


In [5]:
ESCA_IntersectionDict = dict()
candidateGeneSet = set()
for key in degreeDict.keys():
    if 'ESCA' in key:
        ESCA_IntersectionDict[key] = degreeDict[key]
        candidateGeneSet.update( degreeDict[key] )

In [6]:
ESCA_IntersectionDict.keys()

dict_keys([('BLCA', 'ESCA'), ('CESC', 'ESCA'), ('CHOL', 'ESCA'), ('COAD', 'ESCA'), ('ESCA', 'GBM'), ('ESCA', 'KIRP'), ('ESCA', 'LUSC'), ('ESCA', 'OV'), ('ESCA', 'PRAD'), ('ESCA', 'STAD'), ('ESCA', 'TGCT'), ('ESCA', 'UCEC'), ('ESCA', 'UCS')])

In [7]:
candidateGeneSet

{'(CCAT)n',
 'AL160408.2',
 'AS3MT',
 'CELF6',
 'ENPP5',
 'FAT3',
 'GPIHBP1',
 'GPR173',
 'HSPB2',
 'KCNQ1OT1',
 'L1M1',
 'L1M2',
 'L1M4a1',
 'L1M4a2',
 'L1M4c',
 'L1M6',
 'L1M7',
 'L1MA2',
 'L1MA3',
 'L1MA4A',
 'L1MA5',
 'L1MCa',
 'L1MEi',
 'L1PA8A',
 'L1PB3',
 'L1PB4',
 'L1PBa',
 'LINC01002',
 'MAGEH1',
 'MEOX2',
 'MUC17',
 'NALCN',
 'PLPP1',
 'PLPP7',
 'PNMA8B',
 'PSMC1P1',
 'RPL22P1',
 'RTL5',
 'SAMD9',
 'SLC13A3',
 'SLC16A4',
 'SLC9A3',
 'SPOCK3',
 'YPEL4',
 'ZNF667'}

# evaluate DESeq results for candidate genes

In [8]:
deseqResultsPath = "/private/groups/kimlab/GTEx_TCGA/1vsAll/ESCA_vs_all.results"
numRowsToSkip = _countExtraHeaderLines(deseqResultsPath)
print(f"runSelectGenesOfInterest() numRowsToSkip: {numRowsToSkip}")
deseqDF = pd.read_csv(deseqResultsPath, skiprows=numRowsToSkip, index_col='name')
print(f'deseqDF.shape : {deseqDF.shape}')
deseqDF.head()

runSelectGenesOfInterest() numRowsToSkip: 7
deseqDF.shape : (74777, 6)


Unnamed: 0_level_0,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
(ATGTGC)n,0.596123,2.900568,0.056265,51.552292,0.0,0.0
(TACATA)n,0.285152,2.894953,0.062715,46.16019,0.0,0.0
(TATG)n,0.546036,3.397431,0.044634,76.117174,0.0,0.0
AC005096.1,0.456616,3.018803,0.05878,51.357275,0.0,0.0
AC005776.1,0.764986,2.401486,0.054634,43.956061,0.0,0.0


In [9]:
candidateDF = deseqDF.loc[ list(candidateGeneSet) , :]
print(f'candidateDF.shape : {candidateDF.shape}')
candidateDF

candidateDF.shape : (45, 6)


Unnamed: 0_level_0,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
L1M4a2,408.048756,2.199935,0.071609,30.721439,2.944364e-207,5.183968000000001e-205
L1MA2,2148.67886,2.291392,0.075538,30.334452,4.029077e-202,6.693304e-200
PLPP1,3225.374676,-2.044375,0.12262,-16.672393,2.080946e-62,4.669548e-61
L1MA3,3599.073077,2.136612,0.070743,30.202315,2.208483e-200,3.596336e-198
MAGEH1,897.382865,-2.123894,0.101233,-20.980312,9.924011e-98,4.704941e-96
L1M2,1999.265599,2.028193,0.073894,27.447383,7.465685e-166,8.268245999999999e-164
SPOCK3,611.284233,-2.840818,0.342008,-8.306279,9.875019e-17,4.708807e-16
L1PB4,1444.575147,2.040922,0.074833,27.272871,8.901228999999999e-164,9.727367e-162
AS3MT,709.573364,-2.059524,0.173648,-11.860337,1.902071e-32,1.739091e-31
MEOX2,487.218965,-2.012774,0.211763,-9.504832,2.003716e-21,1.192762e-20


In [10]:
hack = BestSignatureGeneConfig(
        dataSetName=None, 
        design=None, 
        padjThreshold=0.001, 
        lfcThreshold=2.0, 
        n=9999, 
        localCacheRootPath=None, 
        title=None,
        )

significantDF = hack._select(candidateDF, fileName=None)
print(f'significantDF.shape : {significantDF.shape}')
significantDF

significantDF.shape : (45, 7)


Unnamed: 0_level_0,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj,absLog2FoldChange
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
L1MA3,3599.073077,2.136612,0.070743,30.202315,2.208483e-200,3.596336e-198,2.136612
PLPP1,3225.374676,-2.044375,0.12262,-16.672393,2.080946e-62,4.669548e-61,2.044375
L1M1,2886.792373,2.196612,0.073438,29.911042,1.413868e-196,2.1735060000000002e-194,2.196612
LINC01002,2323.659751,2.246173,0.18301,12.273488,1.2571779999999999e-34,1.242074e-33,2.246173
L1MA4A,2312.761788,2.000856,0.070513,28.375838,4.0184990000000005e-177,5.109819000000001e-175,2.000856
L1MA2,2148.67886,2.291392,0.075538,30.334452,4.029077e-202,6.693304e-200,2.291392
L1M2,1999.265599,2.028193,0.073894,27.447383,7.465685e-166,8.268245999999999e-164,2.028193
L1MCa,1901.333189,2.128278,0.074944,28.3982,2.1282830000000003e-177,2.714648e-175,2.128278
L1PB4,1444.575147,2.040922,0.074833,27.272871,8.901228999999999e-164,9.727367e-162,2.040922
L1M4c,1356.364089,3.138658,0.088652,35.40417,1.4729679999999999e-274,3.590817e-272,3.138658


## remove any genes we are already using
random forest hyperparemeter search for ESCA used genes in /private/groups/kimlab/aedavids/deconvolution/1vsAll-~gender_category/best10CuratedDegree1_ce467ff/training/best10CuratedDegree1.sh.out/GTEx_TCGA-design-tilda_gender_category-padj-0001-lfc-20-n-10/ESCA_vs_all.results

In [11]:

ESCABest10CuratedDegree1_ce467ffDF = pd.read_csv('/private/groups/kimlab/aedavids/deconvolution/1vsAll-~gender_category/best10CuratedDegree1_ce467ff/training/best10CuratedDegree1.sh.out/GTEx_TCGA-design-tilda_gender_category-padj-0001-lfc-20-n-10/ESCA_vs_all.results')
print(f'ESCABest10CuratedDegree1_ce467ffDF.shape : {ESCABest10CuratedDegree1_ce467ffDF.shape}' )
ESCABest10CuratedDegree1_ce467ffDF

ESCABest10CuratedDegree1_ce467ffDF.shape : (10, 7)


Unnamed: 0,name,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
0,AC012615.3,36.58301,-6.299969,0.315824,-19.947741,1.568149e-88,6.2724449999999995e-87
1,(TA)n,37.185061,3.689817,0.106398,34.679272,1.6179819999999998e-263,3.8091e-261
2,UBE2SP2,37.23481,-4.912241,0.277765,-17.684851,5.486623e-70,1.4871270000000001e-68
3,HERVFH19-int,37.909259,2.46598,0.119989,20.55168,7.434462e-94,3.311269e-92
4,PRELID1P1,38.467183,-3.566584,0.11696,-30.494135,3.1168560000000004e-204,5.3282720000000005e-202
5,LTR106,38.813262,2.019337,0.09309,21.692377,2.421443e-104,1.3023629999999999e-102
6,AC010336.3,39.422501,-2.760584,0.463634,-5.954236,2.612899e-09,8.160777e-09
7,GOLGA8S,39.687887,-2.155476,0.170743,-12.624118,1.554742e-36,1.648747e-35
8,MER5C,40.155981,2.43635,0.086018,28.323579,1.7712389999999998e-176,2.211311e-174
9,CCDC160,40.245224,-3.866188,0.219109,-17.64502,1.1113580000000001e-69,2.994562e-68


In [17]:
# select genes we are not already using 
selectRows = ~ significantDF.index.isin( ESCABest10CuratedDegree1_ce467ffDF.loc[:, "name"] )
candidateSignificantDF = significantDF.loc[selectRows, :]

print('we expect candidateSignificantDF shape to be the same as significantDF. ESCABest10CuratedDegree1_ce467ffDF genes are degree1, our candidates are degree2')
print(f'candidateSignificantDF.shape : {candidateSignificantDF.shape}' )
candidateSignificantDF

we expect candidateSignificantDF shape to be the same as significantDF. ESCABest10CuratedDegree1_ce467ffDF genes are degree1, our candidates are degree2
candidateSignificantDF.shape : (45, 7)


Unnamed: 0_level_0,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj,absLog2FoldChange
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
L1MA3,3599.073077,2.136612,0.070743,30.202315,2.208483e-200,3.596336e-198,2.136612
PLPP1,3225.374676,-2.044375,0.12262,-16.672393,2.080946e-62,4.669548e-61,2.044375
L1M1,2886.792373,2.196612,0.073438,29.911042,1.413868e-196,2.1735060000000002e-194,2.196612
LINC01002,2323.659751,2.246173,0.18301,12.273488,1.2571779999999999e-34,1.242074e-33,2.246173
L1MA4A,2312.761788,2.000856,0.070513,28.375838,4.0184990000000005e-177,5.109819000000001e-175,2.000856
L1MA2,2148.67886,2.291392,0.075538,30.334452,4.029077e-202,6.693304e-200,2.291392
L1M2,1999.265599,2.028193,0.073894,27.447383,7.465685e-166,8.268245999999999e-164,2.028193
L1MCa,1901.333189,2.128278,0.074944,28.3982,2.1282830000000003e-177,2.714648e-175,2.128278
L1PB4,1444.575147,2.040922,0.074833,27.272871,8.901228999999999e-164,9.727367e-162,2.040922
L1M4c,1356.364089,3.138658,0.088652,35.40417,1.4729679999999999e-274,3.590817e-272,3.138658


# Genes to add

In [18]:
candidateSignificantDF.sort_values(by="absLog2FoldChange", ascending=False).head(n=5)

Unnamed: 0_level_0,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj,absLog2FoldChange
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
PSMC1P1,523.196804,-6.297372,0.147768,-42.616615,0.0,0.0,6.297372
MUC17,549.323046,3.740385,0.614954,6.082382,1.184099e-09,3.781096e-09,3.740385
GPIHBP1,575.63202,-3.24488,0.187535,-17.302783,4.482072e-67,1.1363499999999999e-65,3.24488
L1M4c,1356.364089,3.138658,0.088652,35.40417,1.4729679999999999e-274,3.590817e-272,3.138658
HSPB2,703.213665,-3.130593,0.167985,-18.636189,1.634798e-77,5.2866580000000005e-76,3.130593


# weak check for genes that are differentical epressed
most of the genes names are in ENSGO format not HUGO
```
/private/groups/kimlab/aedavids/londonCalling2024/data/run.adenocarcinoma.vs.control.sh.out
 grep 'PSMC1P1\|MUC17\|GPIHBP1\|L1M4c\|HSPB2' adenocarcinoma_vs_all.results
"L1M4c",251.280300870265,-0.129235119719905,0.157360739557202,-0.821266601082074,0.411494427482241,0.925425472687107
```

```
v39=/private/groups/kimlab/genomes.annotations/genomes.annotations/gencode.39/gencode.v39.annotation.expanded.tx.to.gene.tsv

v35=/private/groups/kimlab/genomes.annotations/genomes.annotations/gencode.35/gencode.v35.ucsc.rmsk.tx.to.gene.csv

grep 'PSMC1P1\|MUC17\|GPIHBP1\|L1M4c\|HSPB2' $v35 | cut -d \| -f 2 | cut -d \. -f 1 | sort | uniq
ENSG00000169876
ENSG00000170276
ENSG00000217385
ENSG00000226126
ENSG00000232314
ENSG00000236348
ENSG00000241506
ENSG00000254445
ENSG00000277494


$ grep 'ENSG00000169876\|ENSG00000170276\|ENSG00000217385\|ENSG00000226126\|ENSG00000232314\|ENSG00000236348\|ENSG00000241506\|ENSG00000254445\|ENSG00000277494' adenocarcinoma_vs_all.results 
"ENSG00000217385.1",1.79922894262971,-2.55776199056194,1.01723882830602,-2.51441639798721,0.0119229567221693,0.495166691521839
"ENSG00000169876.14",0.37074765065209,-0.736660388239078,2.50210380280408,-0.29441639767843,0.768439738354521,NA
"ENSG00000170276.6",0,NA,NA,NA,NA,NA
"ENSG00000226126.2",0,NA,NA,NA,NA,NA
"ENSG00000236348.1",0,NA,NA,NA,NA,NA
"ENSG00000241506.1",0.124111789829559,-0.420066328349982,3.19870189200308,-0.131324000339066,0.895519006677846,NA
"ENSG00000254445.1",0,NA,NA,NA,NA,NA
"ENSG00000277494.2",0.0305832576181515,-0.0989810149538228,1.85268408929795,-0.0534257381091507,0.957392698447606,NA


# -f name, baseMean, LFC, p-adj
grep 'ENSG00000169876\|ENSG00000170276\|ENSG00000217385\|ENSG00000226126\|ENSG00000232314\|ENSG00000236348\|ENSG00000241506\|ENSG00000254445\|ENSG00000277494' adenocarcinoma_vs_all.results  | cut -d , -f 1,2,3,7
"ENSG00000217385.1",1.79922894262971,-2.55776199056194,0.495166691521839
"ENSG00000169876.14",0.37074765065209,-0.736660388239078,NA
"ENSG00000170276.6",0,NA,NA
"ENSG00000226126.2",0,NA,NA
"ENSG00000236348.1",0,NA,NA
"ENSG00000241506.1",0.124111789829559,-0.420066328349982,NA
"ENSG00000254445.1",0,NA,NA
"ENSG00000277494.2",0.0305832576181515,-0.0989810149538228,NA
```

In [None]:
none of these are good write a script search all. we are hacking we want genes that work well in general and 
likely to work with the bad controls

load adenocarcinoma_vs_all.results
map out hugos to ensgo
check ensgo lfc