# Ascending vs. Descending Base Mean Gene Signature Selection
Andrew E. Davidson  
aedavids@ucsc.edu  
7/2/24

Copyright (c) 2020-2023, Regents of the University of California All rights reserved. https://polyformproject.org/licenses/noncommercial/1.0.0

ref: extraCellularRNA/deconvolutionAnalysis/jupyterNotebooks/hyperParameterTunning/hyperparameterTunningResults6.ipynb

**Abstract**  
- best10CuratedDegree1_ce467ff has best results. 
- potential bug: sorted base mean in ascending order. This picks potentially weak signals
- best10CuratedDegree1 is sorted in descending order.
- create box plots and histograms of base means to decide if best10CuratedDegree1_ce467ff is valid

**Results**  
<span style="color:red;background-color:yellow">TODO</span>  

In [1]:
import ipynbname

# use display() to print an html version of a data frame
# useful if dataFrame output is not generated by the last line of the cell
from IPython.display import display

import matplotlib.pyplot as plt 
import numpy as np
import os
import pandas as pd
import sys

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
# setting the python path allows us to run python scripts from using
# the CLI. 
PYTHONPATH = os.environ['PYTHONPATH']
print("ORIG_PYTHONPATH: {}\n".format(PYTHONPATH))

gitRepoRoot = !git rev-parse --show-toplevel
gitRepoRoot = gitRepoRoot[0]

#
# add deconvolutionAnalysis modules
#
deconvolutionModules = f'{gitRepoRoot}/deconvolutionAnalysis/python'
print("deconvolutionModules: {}\n".format(deconvolutionModules))

PYTHONPATH = PYTHONPATH + f':{deconvolutionModules}'
#print("PYTHONPATH: {}\n".format(PYTHONPATH))

sys.path.append( str(deconvolutionModules) )
#print("\nsys.path:\n{}\n".format(sys.path))

#
# add intraExtraRNA_POC modules
#
intraExtraRNA_POCModules = f'{gitRepoRoot}/intraExtraRNA_POC/python/src'
print("intraExtraRNA_POCModules: {}\n".format(intraExtraRNA_POCModules))

PYTHONPATH = PYTHONPATH + f':{intraExtraRNA_POCModules}'
#print("PYTHONPATH: {}\n".format(PYTHONPATH))

sys.path.append( str(intraExtraRNA_POCModules) )
#print("\nsys.path:\n{}\n".format(sys.path))

ORIG_PYTHONPATH: :/private/home/aedavids/extraCellularRNA/src

deconvolutionModules: /private/home/aedavids/extraCellularRNA/deconvolutionAnalysis/python

intraExtraRNA_POCModules: /private/home/aedavids/extraCellularRNA/intraExtraRNA_POC/python/src



In [3]:
from analysis.hyperParameterTunningMetrics import metricsRunner, elifeCols, lungCols
from analysis.hyperParameterTunningMetrics import findSummaryMetricsCols
# from analysis.hyperParameterTunningMetrics import symetricRowSort
# from analysis.utilities import findAllCategories, findAllGenes
# from analysis.utilities import findIntersectionsWithDegree
# from analysis.utilities import loadDictionary

# from pipeline.dataFactory.utilities import urlify

In [4]:
# Create the output directories for this notebook.
# root, outDir and imgOut are module-level names read by later cells.
root = "/private/groups/kimlab/aedavids/deconvolution/1vsAll-~gender_category"
# the output directory is named after the notebook itself
notebookName = ipynbname.name()
outDir = f'{root}/hyperParameter/{notebookName}.out'
print( f'output dir: \n{outDir}' )
# exist_ok=True: re-running the cell does not fail if the directory already exists
os.makedirs(outDir, exist_ok=True)

# sub directory of outDir, presumably for saved figures -- nothing in this section writes to it yet
imgOut = f'{outDir}/img'
os.makedirs(imgOut, exist_ok=True)
print(f'\nimgOut :\n{imgOut}')

output dir: 
/private/groups/kimlab/aedavids/deconvolution/1vsAll-~gender_category/hyperParameter/ascending-vs.-DescendingBaseMeanGeneSignatureSelection.out

imgOut :
/private/groups/kimlab/aedavids/deconvolution/1vsAll-~gender_category/hyperParameter/ascending-vs.-DescendingBaseMeanGeneSignatureSelection.out/img


# Hyperparameter Tuning Metrics

In [5]:
def evaluateDeconvolution(
    root: str,
    outDir: str,
    resultsDirs : list[str],
    outFilePrefix : str,
    metric : str,
    threshold : float,
    verbose : bool = False,
    ) -> tuple[pd.DataFrame, pd.DataFrame] :
    '''
    Runs metricsRunner() on a list of pipeline runs and displays a summary table.

    TODO: this was cut-n-paste from hyperparameterTunningResults6.ipynb

    The table displayed is restricted to the summary statistic columns for
    "metric" (see findSummaryMetricsCols()) followed by elifeCols.

    arguments:
        root, outDir, outFilePrefix, resultsDirs :
            passed positionally, unchanged, to metricsRunner()
            There is no stageName argument, the stage name is part of the output file prefix

        metric, threshold, verbose :
            passed as keyword arguments, unchanged, to metricsRunner()

    returns:
        (df, bellowThresholdDF) exactly as returned by metricsRunner()

        df :
            a row for each run in resultsDirs
            The row contains the "metric" for each class + summary statistic

        bellowThresholdDF:
            example:
                    stage	        category	               value
                id
                0	best10GTEx_TCGA	ACC	                        True
                2	best10GTEx_TCGA	Adipose_Visceral_Omentum	True
                5	best10GTEx_TCGA	Artery_Coronary	            True
                7	best10GTEx_TCGA	BLCA	                    True
                8	best10GTEx_TCGA	BRCA	                    True
    '''
    metricsDF, bellowDF = metricsRunner(
        root,
        outDir,
        outFilePrefix,
        resultsDirs,
        metric=metric,
        threshold=threshold,
        verbose=verbose,
    )

    # only show the summary statistics and the elife classes, the full frame is returned
    colsToShow = findSummaryMetricsCols(metric) + elifeCols
    display(metricsDF.loc[:, colsToShow])

    return (metricsDF, bellowDF)

In [6]:
def evaluateBest10CuratedDegree1(
        threshold : float = 0.7,
        metric : str = 'sensitivity',
        verbose : bool = False):
    '''
    Evaluates the best10CuratedDegree1_ce467ff and best10CuratedDegree1 runs
    using evaluateDeconvolution().

    TODO: this was cut-n-paste from hyperparameterTunningResults6.ipynb

    uses the notebook level variables root and outDir

    arguments:
        threshold, metric, verbose :
            passed unchanged to evaluateDeconvolution()

    returns:
        the (df, bellowThresholdDF) tuple returned by evaluateDeconvolution()
    '''
    print(f'metric : {metric} threshold: {threshold}')

    # runs to compare
    runDirs = [
        "best10CuratedDegree1_ce467ff",
        # "best10CuratedDegree1Ascending", # Ascending performance should match ce467ff
        "best10CuratedDegree1",
    ]

    return evaluateDeconvolution(
        root=root,
        outDir=outDir,
        resultsDirs=runDirs,
        outFilePrefix="best10CuratedDegree1Runs",
        metric=metric,
        threshold=threshold,
        verbose=verbose,
    )

In [7]:
# Evaluate the best10CuratedDegree1 runs using specificity.
# The threshold and metric are kept in named variables so later cells can reuse them.
evaluateBest10CuratedDegree1SpecificityThreshold = 0.96
evaluateBest10CuratedDegree1SpecificityMetric = "specificity"

t = evaluateBest10CuratedDegree1(threshold=evaluateBest10CuratedDegree1SpecificityThreshold, 
                                 metric=evaluateBest10CuratedDegree1SpecificityMetric, verbose=False)

# t is the 2-tuple of data frames returned by evaluateBest10CuratedDegree1()
evalBest10CuratedDegree1SpecificityDF, evalBest10CuratedDegree1SpecificityBellowThresholdDF = t

metric : specificity threshold: 0.96

saving : /private/groups/kimlab/aedavids/deconvolution/1vsAll-~gender_category/hyperParameter/ascending-vs.-DescendingBaseMeanGeneSignatureSelection.out/best10CuratedDegree1Runs.specificity.0.96.csv

saving : /private/groups/kimlab/aedavids/deconvolution/1vsAll-~gender_category/hyperParameter/ascending-vs.-DescendingBaseMeanGeneSignatureSelection.out/best10CuratedDegree1Runs.specificity.bellow.0.96.csv


id,mean_specificity,std_specificity,median_specificity,numGenes,numTypes,numDegree1,numAboveThreshold,percentAboveThreshold,LUAD,LUSC,COAD,READ,ESCA,LIHC,STAD,Whole_Blood
best10CuratedDegree1_ce467ff,0.997855,0.002922,0.999,716,83,83,83,1.0,0.998,0.995,0.995,0.996,0.999,1.0,0.999,0.999
best10CuratedDegree1,0.997542,0.00303,0.998,716,83,83,83,1.0,1.0,0.994,0.993,0.991,0.998,0.999,1.0,1.0


In [8]:
# Evaluate the best10CuratedDegree1 runs using sensitivity.
# verbose is not passed, so the function's default (False) is used.
evaluateBest10CuratedDegree1SensitivityMetric = 'sensitivity'
evaluateBest10CuratedDegree1SensitivityThreshold = 0.90

t =  evaluateBest10CuratedDegree1(
        threshold=evaluateBest10CuratedDegree1SensitivityThreshold,
        metric=evaluateBest10CuratedDegree1SensitivityMetric)

# t is the 2-tuple of data frames returned by evaluateBest10CuratedDegree1()
evalBest10CuratedDegree1SensitivityDF, evalBest10CuratedDegree1SensitivityBellowThresholdDF  = t

metric : sensitivity threshold: 0.9

saving : /private/groups/kimlab/aedavids/deconvolution/1vsAll-~gender_category/hyperParameter/ascending-vs.-DescendingBaseMeanGeneSignatureSelection.out/best10CuratedDegree1Runs.sensitivity.0.9.csv

saving : /private/groups/kimlab/aedavids/deconvolution/1vsAll-~gender_category/hyperParameter/ascending-vs.-DescendingBaseMeanGeneSignatureSelection.out/best10CuratedDegree1Runs.sensitivity.bellow.0.9.csv


id,mean_sensitivity,std_sensitivity,median_sensitivity,numGenes,numTypes,numDegree1,numAboveThreshold,percentAboveThreshold,LUAD,LUSC,COAD,READ,ESCA,LIHC,STAD,Whole_Blood
best10CuratedDegree1_ce467ff,0.808831,0.202235,0.871,716,83,83,36,0.433735,0.841,0.691,0.652,0.607,0.369,0.874,0.409,1.0
best10CuratedDegree1,0.786482,0.206089,0.833,716,83,83,30,0.361446,0.485,0.595,0.627,0.679,0.396,0.906,0.271,0.987


# Ascending vs. Descending Box Plots

In [9]:
def loadBaseMeans(deseqResultsPath : str):
    '''
    TODO: not implemented yet

    place holder. deseqResultsPath is not used, the function always returns None
    '''
    return None
    
    

In [10]:
# asendingBaseMeansSeries = loadBaseMeans( aedwip )

In [11]:
# from analysis.utilities import findDir
# from analysis.utilities import findFile

# runName = "best10CuratedDegree1_ce467ff"
# rootDir = f'{root}/{runName}/training'
# print(f'rootDir : {rootDir}')
# pattern = "GTEx_TCGA-design-tilda_gender_category-padj-0001-lfc-20-n-10"

# signatureDESeqDir = findDir(rootDir, pattern)[0]
# print(f'signatureDESeqDir : {signatureDESeqDir}')

# findFile( signatureDESeqDir, pattern="*.results")

In [12]:
# evalBest10CuratedDegree1SensitivityDF.columns

In [13]:
# findSummaryMetricsCols(evaluateBest10CuratedDegree1SensitivityMetric)

In [14]:
# cList = list(evalBest10CuratedDegree1SensitivityDF)
# for c in findSummaryMetricsCols(evaluateBest10CuratedDegree1SensitivityMetric):
#     print(f'c: {c}')
#     cList.remove( c )

In [15]:
# len(cList)

In [16]:
def getCategories( 
        df : pd.DataFrame,
        metric : str = 'sensitivity'
    ) -> list[str] :
    '''
    removes metric related columns

    arguments:
        df :
            data frame whose column labels are filtered. df is not modified
            example df: evalBest10CuratedDegree1SensitivityDF

        metric :
            passed to findSummaryMetricsCols() to get the list of
            summary metric column names to remove

    returns:
        tissue type columns, i.e. the column labels of df that are left
        after the summary metric columns are removed

    raises:
        ValueError if a column returned by findSummaryMetricsCols(metric) is
        not a column of df, for example when metric does not match df
    '''

    # bug fix: use the df argument. The original always read the notebook
    # variable evalBest10CuratedDegree1SensitivityDF and ignored df.
    # list(df) is the list of df's column labels
    retList = list(df)

    for c in findSummaryMetricsCols( metric):
        retList.remove( c )

    return retList

In [17]:
# columns of the sensitivity data frame that are left after the summary metric columns are removed
categoryList = getCategories(evalBest10CuratedDegree1SensitivityDF, "sensitivity")

In [18]:
from analysis.utilities import findSignatureGenesForPipelineStage

# look up the "baseMean" column for the LUAD category of the best10CuratedDegree1_ce467ff run
# NOTE(review): presumably these are the base means of the LUAD signature genes selected
# by that pipeline stage -- confirm against findSignatureGenesForPipelineStage()
runName = "best10CuratedDegree1_ce467ff"

baseMeansList = findSignatureGenesForPipelineStage(category="LUAD", pipelineStageName=runName, colName="baseMean")


In [20]:
# from pipeline.dataFactory.driver import _countExtraHeaderLines

# numRowsToSkip = _countExtraHeaderLines(resultFile)

# last expression of the cell: the notebook displays the list of base means
baseMeansList

[7.53000571683325,
 7.69014666423403,
 7.69383921183709,
 7.69518919254538,
 7.75964315454308,
 7.86206998976733,
 7.93991164366358,
 8.00738487193841,
 8.03794226103207,
 8.27972423818304]