# Curated Tuning Targets
Andrew E. Davidson  
aedavids@ucsc.edu
1/11/24


In [1]:
import ipynbname

# use display() to print an html version of a data frame
# useful if the data frame is not produced by the last line of the cell
from IPython.display import display

import numpy as np
import pandas as pd
# display all columns and rows
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import pathlib as pl
import pprint as pp
import os
import sys

In [2]:
# Setting PYTHONPATH lets python scripts launched from this notebook through
# the CLI find our local modules.
# Use .get() because a fresh kernel may not have PYTHONPATH defined at all;
# os.environ['PYTHONPATH'] would raise KeyError in that case.
ORIG_PYTHONPATH = os.environ.get('PYTHONPATH', '')

notebookPath = ipynbname.path()
deconvolutionModules = notebookPath.parent.joinpath("../../python")
print("deconvolutionModules: {}\n".format(deconvolutionModules))

# When PYTHONPATH was already set we append to it exactly as before; when it
# was unset we avoid creating a path that starts with a stray ':'.
if ORIG_PYTHONPATH:
    PYTHONPATH = ORIG_PYTHONPATH + f':{deconvolutionModules}'
else:
    PYTHONPATH = str(deconvolutionModules)
print("PYTHONPATH: {}\n".format(PYTHONPATH))

os.environ["PYTHONPATH"] = PYTHONPATH
PYTHONPATH = os.environ["PYTHONPATH"]
print("PYTHONPATH: {}\n".format(PYTHONPATH))

# to be able to import our local python files we need to set the sys.path
# https://stackoverflow.com/a/50155834
# Guard the append so re-running this cell does not add the same directory
# to sys.path over and over.
if str(deconvolutionModules) not in sys.path:
    sys.path.append( str(deconvolutionModules) )
print("\nsys.path:\n{}\n".format(sys.path))

deconvolutionModules: /private/home/aedavids/extraCellularRNA/deconvolutionAnalysis/jupyterNotebooks/hyperParameterTunning/../../python

PYTHONPATH: :/private/home/aedavids/extraCellularRNA/src:/private/home/aedavids/extraCellularRNA/src:/private/home/aedavids/extraCellularRNA/deconvolutionAnalysis/jupyterNotebooks/hyperParameterTunning/../../python

PYTHONPATH: :/private/home/aedavids/extraCellularRNA/src:/private/home/aedavids/extraCellularRNA/src:/private/home/aedavids/extraCellularRNA/deconvolutionAnalysis/jupyterNotebooks/hyperParameterTunning/../../python


sys.path:
['/private/home/aedavids/extraCellularRNA/deconvolutionAnalysis/jupyterNotebooks/hyperParameterTunning', '/private/home/aedavids/extraCellularRNA/deconvolutionAnalysis/jupyterNotebooks/hyperParameterTunning', '/private/home/aedavids/extraCellularRNA/src', '/private/home/aedavids/miniconda3/envs/extraCellularRNA/lib/python311.zip', '/private/home/aedavids/miniconda3/envs/extraCellularRNA/lib/python3.11', '/private/hom

In [3]:
from analysis.findMisclassificationErrors import findMisclassificationErrors
from analysis.hyperParameterTunningMetrics import metricsRunner, elifeCols, lungCols
from analysis.hyperParameterTunningMetrics import findFile, findSummaryMetricsCols
from analysis.utilities import findAllCategories, findAllGenes
from analysis.utilities import findIntersectionsWithDegree
from analysis.utilities import loadDictionary, saveDictionary

In [4]:
# Everything this notebook writes goes into a directory that is named after
# the notebook itself and lives under {root}/hyperParameter.
root = "/private/groups/kimlab/aedavids/deconvolution/1vsAll-~gender_category"
notebookName = ipynbname.name()

outDir = "{}/hyperParameter/{}.out".format(root, notebookName)
print( "output dir: \n{}".format(outDir) )

# exist_ok=True makes it safe to re-run this cell
os.makedirs(outDir, exist_ok=True)

output dir: 
/private/groups/kimlab/aedavids/deconvolution/1vsAll-~gender_category/hyperParameter/curatedTunningTargets.out


In [5]:
def saveBestDegree1IntersectionDict() -> dict:
    '''
    Save the degree 1 entries of the curated best10 intersection dictionary.

    Loads
        best10_from_best500FindAllDegree1_wl500.intersection.dict
    from the upset plot output of stage best10CuratedDegree1_ce467ff, keeps
    what findIntersectionsWithDegree(..., degree=1) selects, and writes the
    result to {outDir}/{stageName}.degree1.dict

    relies on the notebook globals root and outDir

    returns the curated degree1 dict
    '''
    stageName="best10CuratedDegree1_ce467ff"
    upsetOut=f'{root}/{stageName}/training/best10CuratedDegree1.sh.out/upsetPlot.out'
    intersectionDictPath = f'{upsetOut}/best10_from_best500FindAllDegree1_wl500.intersection.dict'
    intersectionDict = loadDictionary(intersectionDictPath)
    degree1Dict = findIntersectionsWithDegree(intersectionDict, degree=1)

    dictPath = f'{outDir}/{stageName}.degree1.dict'
    print( f'saving stageName : {stageName} saving degree1Dict to \n{dictPath}')
    saveDictionary(dictPath, degree1Dict)

    # The function used to fall off the end here, so callers always got None
    # even though the signature and docstring promise the dictionary.
    return degree1Dict

best10CuratedDict = saveBestDegree1IntersectionDict()

saving stageName : best10CuratedDegree1_ce467ff saving degree1Dict to 
/private/groups/kimlab/aedavids/deconvolution/1vsAll-~gender_category/hyperParameter/curatedTunningTargets.out/best10CuratedDegree1_ce467ff.degree1.dict


In [6]:
def getRefIntersectionDict() -> dict:
    '''
    Load the reference intersection dictionary of stage
    best500FindAllDegree1_wl500 and reduce it to its degree 1 entries.

    The dictionary file is located with findFile() under
    {root}/best500FindAllDegree1_wl500/training; the first path findFile()
    returns is the one that gets loaded.

    relies on the notebook global root

    returns degree1IntersectionDict for best500FindAllDegree1_wl500
    '''
    refStage = "best500FindAllDegree1_wl500"
    trainingDir = f'{root}/{refStage}/training'
    dictFileName = 'best500_findAllDegree1_wl500.intersection.dict'

    matches = findFile(trainingDir, dictFileName)
    refIntersectionDict = loadDictionary( matches[0] )

    return findIntersectionsWithDegree(refIntersectionDict, degree=1)

degree1_wl500_IntersectionDict = getRefIntersectionDict()

In [7]:
# one line per entry of the reference degree 1 dictionary: key and gene count
print('how many unique genes does each category have?')
for key, genes in degree1_wl500_IntersectionDict.items():
    geneCount = len(genes)
    print(' {} : len(genes) : {}'.format(key, geneCount))

how many unique genes does each category have?
 ('ACC',) : len(genes) : 38
 ('Adipose_Subcutaneous',) : len(genes) : 1
 ('Adipose_Visceral_Omentum',) : len(genes) : 4
 ('Adrenal_Gland',) : len(genes) : 17
 ('Artery_Aorta',) : len(genes) : 9
 ('Artery_Coronary',) : len(genes) : 8
 ('Artery_Tibial',) : len(genes) : 14
 ('BLCA',) : len(genes) : 102
 ('BRCA',) : len(genes) : 96
 ('Bladder',) : len(genes) : 231
 ('Brain_Amygdala',) : len(genes) : 1
 ('Brain_Caudate_basal_ganglia',) : len(genes) : 1
 ('Brain_Cerebellar_Hemisphere',) : len(genes) : 13
 ('Brain_Cerebellum',) : len(genes) : 14
 ('Brain_Cortex',) : len(genes) : 9
 ('Brain_Frontal_Cortex_BA9',) : len(genes) : 7
 ('Brain_Hippocampus',) : len(genes) : 2
 ('Brain_Hypothalamus',) : len(genes) : 6
 ('Brain_Nucleus_accumbens_basal_ganglia',) : len(genes) : 5
 ('Brain_Putamen_basal_ganglia',) : len(genes) : 1
 ('Brain_Spinal_cord_cervical_c-1',) : len(genes) : 28
 ('Brain_Substantia_nigra',) : len(genes) : 4
 ('Breast_Mammary_Tissue',) 

## Explore Confusion Matrix

In [8]:
def loadConfusionMatrix(
        rootDir : str,
        stageName : str,
    ) -> pd.DataFrame :
    '''
    Read the confusion matrix csv file of a pipeline stage.

    arguments:
        rootDir   : directory that contains the stage directories
        stageName : name of the stage directory under rootDir

    Searches {rootDir}/{stageName} with findFile() for 'confusionMatrix.csv'
    and reads the first path findFile() returns.

    returns a pandas data frame created by pd.read_csv()
    '''
    # Build the path from the rootDir argument. The previous version silently
    # ignored rootDir and always read the notebook global root.
    stageDir = f'{rootDir}/{stageName}'
    cmPath = findFile( stageDir, 'confusionMatrix.csv')[0]
    retDF = pd.read_csv(cmPath)

    return retDF

stageName="best10CuratedDegree1_ce467ff"
# pass the variable so the stage name is only spelled out once
cmDF = loadConfusionMatrix(root, stageName=stageName)

In [9]:
# misclassification errors found in the confusion matrix, largest errorCount first
# NOTE(review): the meaning of the second argument (10) is defined by
# findMisclassificationErrors() -- confirm against that function
allErrorsDF = findMisclassificationErrors(cmDF, 10)
b10cd1ErrorDF = allErrorsDF.sort_values(by="errorCount", ascending=False)

# keep the rows where LUSC is either the true or the predicted category
isTrueLUSC = b10cd1ErrorDF['trueCat'] == 'LUSC'
isPredLUSC = b10cd1ErrorDF['predCat'] == 'LUSC'
LUSCSelectRows = isTrueLUSC | isPredLUSC
lUSCErrorDF = b10cd1ErrorDF.loc[LUSCSelectRows, :]

print('lUSCErrorDF')
lUSCErrorDF

lUSCErrorDF


Unnamed: 0,trueCat,predCat,errorCount
43,HNSC,LUSC,44
51,LUSC,HNSC,21
53,LUSC,PAAD,19
10,BRCA,LUSC,14
52,LUSC,LUAD,11


## Find genes to add
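Looks like improving LUSC might improve the true positive count by 44 + 21 + 19 + 11 = 95 (or 109 if the 14 BRCA samples predicted as LUSC in the table above are also counted).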

In [10]:
# Load the LUSC_vs_all deseq results found under the best10CuratedDegree1
# stage directory.
# NOTE(review): the "_ce467ff" suffix used for this stage in the earlier cells
# is deliberately left off here -- confirm this is the intended directory.
stageName = "best10CuratedDegree1"
path = f'{root}/{stageName}'

# findFile() may return several matches; we use the first one
LUSCMatches = findFile( path, 'LUSC_vs_all.results')
LUSCDeseqResultsPath = LUSCMatches[0]

LUSCDeseqResultsDF = pd.read_csv(LUSCDeseqResultsPath)
LUSCDeseqResultsDF

Unnamed: 0,name,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
0,GCLC,1919.390917,2.091534,0.056606,36.948753,7.627089000000001e-299,2.54052e-295
1,IGLV3-21,936.529628,2.232165,0.199444,11.191927,4.466228e-29,3.4969560000000003e-28
2,NTS,753.565215,2.980513,0.215927,13.803358,2.4325320000000003e-43,3.275968e-42
3,PRR15L,720.562866,-2.004048,0.171613,-11.677741,1.656384e-31,1.435234e-30
4,PBLD,633.672061,-2.498053,0.100048,-24.968642,1.339909e-137,2.177134e-135
5,NOVA2,597.802337,-2.090792,0.101873,-20.523594,1.325413e-93,7.930853e-92
6,IGLV2-11,578.523446,2.009677,0.197799,10.160207,2.984417e-24,1.893494e-23
7,TRIM16L,562.182156,2.066548,0.076463,27.026894,7.139915e-161,1.853179e-158
8,IGHV1-18,498.825676,2.102721,0.207372,10.139873,3.6757709999999996e-24,2.3262230000000002e-23
9,CACNA1D,498.146861,-2.566797,0.115933,-22.140311,1.293422e-108,1.109428e-106


In [11]:
#
# get all the LUSC degree 1 genes from our reference best500_findAllDegree1_wl500
#
# the keys of degree1_wl500_IntersectionDict are 1-tuples of category names,
# hence the trailing comma in ('LUSC',)
#
key = ('LUSC',)
allLUSCD1_wl500Genes = degree1_wl500_IntersectionDict[key]
# last expression of the cell: displays the number of LUSC degree 1 genes
len(allLUSCD1_wl500Genes)

218

In [12]:
# Load the LUSC_vs_all results of stage best500FindAllDegree1_wl500.
# This data frame is where we look for genes to add.
stageName = "best500FindAllDegree1_wl500"
path = f'{root}/{stageName}'
LUSCDeseqResultsPath = findFile( path, 'LUSC_vs_all.results')[0]
print('LUSCDeseqResultsPath:\n{}'.format(LUSCDeseqResultsPath))
bestLUSCDeseqResultsDF = pd.read_csv(LUSCDeseqResultsPath)

# Keep only the rows whose gene name IS in allLUSCD1_wl500Genes, i.e. the
# LUSC degree 1 genes from findAll_wl500.
geneNames = bestLUSCDeseqResultsDF['name']
selectRows = geneNames.isin(allLUSCD1_wl500Genes)
candidateLUSCDeseqResultsDF = bestLUSCDeseqResultsDF.loc[selectRows, :]

print('candidateLUSCDeseqResultsDF.shape : {}'.format(candidateLUSCDeseqResultsDF.shape))
print('head(n=12)')

# show the candidates with the largest baseMean first
candidatesByBaseMeanDF = candidateLUSCDeseqResultsDF.sort_values(by="baseMean", ascending=False)
candidatesByBaseMeanDF.head(n=12)

LUSCDeseqResultsPath:
/private/groups/kimlab/aedavids/deconvolution/1vsAll-~gender_category/best500FindAllDegree1_wl500/training/best500FindAllDegree1_wl500.sh.out/GTEx_TCGA-design-tilda_gender_category-padj-0001-lfc-20-n-500/LUSC_vs_all.results
candidateLUSCDeseqResultsDF.shape : (218, 7)
head(n=12)


Unnamed: 0,name,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
101,GCLC,1919.390917,2.091534,0.056606,36.948753,7.627089000000001e-299,2.54052e-295
211,IGLV3-21,936.529628,2.232165,0.199444,11.191927,4.466228e-29,3.4969560000000003e-28
264,NTS,753.565215,2.980513,0.215927,13.803358,2.4325320000000003e-43,3.275968e-42
271,PRR15L,720.562866,-2.004048,0.171613,-11.677741,1.656384e-31,1.435234e-30
294,PBLD,633.672061,-2.498053,0.100048,-24.968642,1.339909e-137,2.177134e-135
309,NOVA2,597.802337,-2.090792,0.101873,-20.523594,1.325413e-93,7.930853e-92
315,IGLV2-11,578.523446,2.009677,0.197799,10.160207,2.984417e-24,1.893494e-23
325,TRIM16L,562.182156,2.066548,0.076463,27.026894,7.139915e-161,1.853179e-158
341,IGHV1-18,498.825676,2.102721,0.207372,10.139873,3.6757709999999996e-24,2.3262230000000002e-23
342,CACNA1D,498.146861,-2.566797,0.115933,-22.140311,1.293422e-108,1.109428e-106


In [13]:
# From the best500FindAllDegree1_wl500 LUSC candidates, drop every gene whose
# name already appears in LUSCDeseqResultsDF (the results loaded for the
# curated best10 stage), then order what is left by baseMean, largest first.
alreadyUsedGenes = LUSCDeseqResultsDF['name']
candidateNames = candidateLUSCDeseqResultsDF['name']

selectRows = ~candidateNames.isin( alreadyUsedGenes )
topCandidateLUSCDF = candidateLUSCDeseqResultsDF.loc[selectRows, :]

bestLUSC_CAndidateDF = topCandidateLUSCDF.sort_values(by='baseMean', ascending=False)

print('head(10)')
bestLUSC_CAndidateDF.head(10)

head(10)


Unnamed: 0,name,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
352,IGKV1-9,486.040351,2.12137,0.211572,10.026715,1.163222e-23,7.200736e-23
356,IGHV5-51,472.599411,2.175191,0.20451,10.6361,2.0242559999999998e-26,1.4167670000000001e-25
366,HNF1B,456.940747,-4.126474,0.224553,-18.376366,2.03128e-75,7.469394e-74
380,TM4SF18,427.242712,-2.267819,0.09653,-23.493451,4.7586049999999995e-122,5.4656950000000005e-120
381,ADH7,426.779623,3.972077,0.303427,13.090696,3.721839e-39,4.3033159999999996e-38
396,FBXO27,394.835675,2.207173,0.093,23.733,1.6462169999999999e-124,1.9583619999999998e-122
402,ARSL,383.50274,-3.12524,0.148423,-21.056327,2.001003e-98,1.3352600000000001e-96
408,TMEM220,375.6547,-2.062147,0.071197,-28.963842,1.878669e-184,7.988542e-182
419,GAL3ST1,344.361289,-4.674176,0.143126,-32.657824,6.205059e-234,5.511609e-231
420,PLA1A,344.32277,-2.620594,0.117929,-22.221768,2.1157669999999999e-109,1.850532e-107


In [17]:
# Save the ordered LUSC candidate rows as a csv file in the notebook's output dir.
# stageName is not set in this cell; it keeps whatever value the most recently
# executed cell assigned (best500FindAllDegree1_wl500 in the recorded run).
# NOTE(review): the recorded run of this cell ended with PermissionError on
# this path -- presumably the file already exists and is not writable; confirm.
bestLUSC_CAndidateDFPath = f'{outDir}/{stageName}.LUSC.degree1.results.csv'
print( f'saving bestLUSC_CAndidateDF to \n{bestLUSC_CAndidateDFPath}')
bestLUSC_CAndidateDF.to_csv(bestLUSC_CAndidateDFPath)

saving bestLUSC_CAndidateDF to 
/private/groups/kimlab/aedavids/deconvolution/1vsAll-~gender_category/hyperParameter/curatedTunningTargets.out/best500FindAllDegree1_wl500.LUSC.degree1.results.csv


PermissionError: [Errno 13] Permission denied: '/private/groups/kimlab/aedavids/deconvolution/1vsAll-~gender_category/hyperParameter/curatedTunningTargets.out/best500FindAllDegree1_wl500.LUSC.degree1.results.csv'

In [28]:
# Write the candidate gene names, in bestLUSC_CAndidateDF row order, to a text
# file in the notebook's output dir. The file contains the pprint formatted
# python list.
# stageName is not set in this cell; it keeps the value assigned by the most
# recently executed cell that set it.
genes = bestLUSC_CAndidateDF.loc[:, "name"].values.tolist()

# A bare `genes[0:5]` used to sit here. It was not the last expression of the
# cell, so it displayed nothing and had no effect; it has been removed.
bestLUSCGenesPath = f'{outDir}/{stageName}.LUSC.orderedGenes.txt'
with open(bestLUSCGenesPath, 'w') as f:
    f.write( pp.pformat(genes) )

print( f'saving bestLUSC_CAndidateDF genes list to  to \n{bestLUSCGenesPath}')

saving bestLUSC_CAndidateDF genes list to  to 
/private/groups/kimlab/aedavids/deconvolution/1vsAll-~gender_category/hyperParameter/curatedTunningTargets.out/best500FindAllDegree1_wl500.LUSC.orderedGenes.txt
