# hyperparameterTunningResults6
Andrew E. Davidson  
aedavids@ucsc.edu  
5/20/2024

Copyright (c) 2020-2023, Regents of the University of California All rights reserved. https://polyformproject.org/licenses/noncommercial/1.0.0

ref: 
- deconvolutionAnalysis/doc/addDegree2Genes.md
- deconvolutionAnalysis/doc/bestCuratedNotes.md
- intraExtraRNA_POC/adenocarcinoma.vs.control/enrichESCA.ipynb
- deconvolutionAnalysis/jupyterNotebooks/hyperParameterTunning/findCandidateEnrichmentBiomarkers.ipynb

**overview**  
See if we can improve both the deconvolution hyperparameter results from Dec 2023 - Jan 2024 and the elife random forest hyperparameter results.

**<span style="color:red;background-color:yellow">best10CuratedDegree1_ce467ff bug</span>**  
sorted results in ascending order!


In [1]:
import ipynbname

# use display() to print an html version of a data frame
# useful if the dataFrame output is not generated by the last line of the cell
from IPython.display import display

import numpy as np
import pandas as pd
# display all columns
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import pathlib as pl
import pprint as pp
import os
import sys

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
# setting the python path allows us to run python scripts from using
# the CLI. 
PYTHONPATH = os.environ['PYTHONPATH']
print("ORIG_PYTHONPATH: {}\n".format(PYTHONPATH))

gitRepoRoot = !git rev-parse --show-toplevel
gitRepoRoot = gitRepoRoot[0]

#
# add deconvolutionAnalysis modules
#
deconvolutionModules = f'{gitRepoRoot}/deconvolutionAnalysis/python'
print("deconvolutionModules: {}\n".format(deconvolutionModules))

PYTHONPATH = PYTHONPATH + f':{deconvolutionModules}'
#print("PYTHONPATH: {}\n".format(PYTHONPATH))

sys.path.append( str(deconvolutionModules) )
#print("\nsys.path:\n{}\n".format(sys.path))

#
# add intraExtraRNA_POC modules
#
intraExtraRNA_POCModules = f'{gitRepoRoot}/intraExtraRNA_POC/python/src'
print("intraExtraRNA_POCModules: {}\n".format(intraExtraRNA_POCModules))

PYTHONPATH = PYTHONPATH + f':{intraExtraRNA_POCModules}'
#print("PYTHONPATH: {}\n".format(PYTHONPATH))

sys.path.append( str(intraExtraRNA_POCModules) )
#print("\nsys.path:\n{}\n".format(sys.path))

ORIG_PYTHONPATH: :/private/home/aedavids/extraCellularRNA/src

deconvolutionModules: /private/home/aedavids/extraCellularRNA/deconvolutionAnalysis/python

intraExtraRNA_POCModules: /private/home/aedavids/extraCellularRNA/intraExtraRNA_POC/python/src



In [3]:
from analysis.hyperParameterTunningMetrics import metricsRunner, elifeCols, lungCols
from analysis.hyperParameterTunningMetrics import findSummaryMetricsCols
from analysis.utilities import findAllCategories, findAllGenes
from analysis.utilities import findIntersectionsWithDegree
from analysis.utilities import loadDictionary

In [4]:
# Root directory holding one sub directory per deconvolution run.
root = "/private/groups/kimlab/aedavids/deconvolution/1vsAll-~gender_category"
# Name the output directory after this notebook so each notebook writes to its own dir.
notebookName = ipynbname.name()
outDir = f'{root}/hyperParameter/{notebookName}.out'
print( f'output dir: \n{outDir}' )
# exist_ok=True: re-running the notebook must not fail if the directory already exists
os.makedirs(outDir, exist_ok=True)

output dir: 
/private/groups/kimlab/aedavids/deconvolution/1vsAll-~gender_category/hyperParameter/hyperparameterTunningResults6.out


In [5]:
def evaluateDecovolution(
    root: str,
    outDir: str,
    resultsDirs : list[str],
    outFilePrefix : str,
    metric : str,
    threshold : float,
    verbose : bool = False,
    ) -> tuple[pd.DataFrame, pd.DataFrame] :
    '''
    Run metricsRunner() over a list of runs and display a summary table.

    The displayed table is limited to the summary columns returned by
    findSummaryMetricsCols(metric) plus the elifeCols categories.

    arguments:
        root, outDir, outFilePrefix, resultsDirs, metric, threshold, verbose:
            passed through unchanged to metricsRunner()

    returns:
        (metricsDF, belowThresholdDF)

        metricsDF :
            one row for each run in resultsDirs. The row contains the
            "metric" for each class + summary statistics

        belowThresholdDF :
            example:
                    stage           category                    value
                id
                0   best10GTEx_TCGA ACC                         True
                2   best10GTEx_TCGA Adipose_Visceral_Omentum    True
                5   best10GTEx_TCGA Artery_Coronary             True
                7   best10GTEx_TCGA BLCA                        True
                8   best10GTEx_TCGA BRCA                        True
    '''
    metricsDF, belowThresholdDF = metricsRunner(
        root,
        outDir,
        outFilePrefix,
        resultsDirs,
        metric=metric,
        threshold=threshold,
        verbose=verbose,
    )

    # only show the summary statistics and the classes of interest
    summaryCols = findSummaryMetricsCols(metric) + elifeCols
    display(metricsDF.loc[:, summaryCols])

    return metricsDF, belowThresholdDF

In [6]:
def evaluateBest10CuratedDegree1(
        threshold : float = 0.7,
        metric : str = 'sensitivity',
        verbose : bool = False):
    '''
    Evaluate the fixed list of "CuratedDegree1" runs for a single metric.

    Uses the notebook level variables root and outDir.

    arguments:
        threshold : passed to evaluateDecovolution()
        metric    : passed to evaluateDecovolution(), for example 'sensitivity' or 'specificity'
        verbose   : passed to evaluateDecovolution()

    returns:
        the (metrics data frame, below threshold data frame) pair
        produced by evaluateDecovolution()
    '''
    print(f'metric : {metric} threshold: {threshold}')

    curatedDegree1Runs = [
        'best1CuratedDegree1',
        'best2CuratedDegree1',
        'best3CuratedDegree1',
        'best5CuratedDegree1',
        "best10CuratedDegree1_ce467ff",
        "best10CuratedDegree1",
        "best10CuratedDegree1.degree1ESCA_01",
    ]

    metricsDF, belowThresholdDF = evaluateDecovolution(
        root=root,
        outDir=outDir,
        resultsDirs=curatedDegree1Runs,
        outFilePrefix="best10CuratedDegree1Runs",
        metric=metric,
        threshold=threshold,
        verbose=verbose,
    )

    return (metricsDF, belowThresholdDF)

## Best 10 CuratedDegree1 Specificity

In [7]:
# bind both returned data frames so they can be inspected in later cells
(evalBest10CuratedDegree1SpecificityDF,
 evalBest10CuratedDegree1SpecificityBellowThresholdDF) = evaluateBest10CuratedDegree1(
    metric='specificity',
    threshold=0.96,
    verbose=True,
)

metric : specificity threshold: 0.96
path : /private/groups/kimlab/aedavids/deconvolution/1vsAll-~gender_category/best1CuratedDegree1

load best1CuratedDegree1 :
/private/groups/kimlab/aedavids/deconvolution/1vsAll-~gender_category/best1CuratedDegree1/training/best1CuratedDegree1.sh.out/metrics/metricsRounded.csv

load
/private/groups/kimlab/aedavids/deconvolution/1vsAll-~gender_category/best1CuratedDegree1/training/best1CuratedDegree1.sh.out/upsetPlot.out/best1_from_best500FindAllDegree1_wl500.intersection.dict

best1CuratedDegree1 types without degree 1 intersections: 
 set()
path : /private/groups/kimlab/aedavids/deconvolution/1vsAll-~gender_category/best2CuratedDegree1

load best2CuratedDegree1 :
/private/groups/kimlab/aedavids/deconvolution/1vsAll-~gender_category/best2CuratedDegree1/training/best2CuratedDegree1.sh.out/metrics/metricsRounded.csv

load
/private/groups/kimlab/aedavids/deconvolution/1vsAll-~gender_category/best2CuratedDegree1/training/best2CuratedDegree1.sh.out/upset

id,mean_specificity,std_specificity,median_specificity,numGenes,numTypes,numDegree1,numAboveThreshold,LUAD,LUSC,COAD,READ,ESCA,LIHC,STAD,Whole_Blood
best1CuratedDegree1,0.994181,0.009109,0.999,83,83,83,82,0.999,0.997,0.999,0.994,0.987,0.989,0.998,0.999
best2CuratedDegree1,0.99588,0.006005,0.998,161,83,83,83,0.991,0.996,0.998,0.995,0.994,0.991,0.999,1.0
best3CuratedDegree1,0.996398,0.004783,0.998,236,83,83,83,0.997,0.995,0.997,0.993,0.997,0.992,0.999,1.0
best5CuratedDegree1,0.997,0.004006,0.998,380,83,83,83,0.999,0.996,0.996,0.995,0.998,0.994,1.0,1.0
best10CuratedDegree1_ce467ff,0.997855,0.002922,0.999,716,83,83,83,0.998,0.995,0.995,0.996,0.999,1.0,0.999,0.999
best10CuratedDegree1,0.997542,0.00303,0.998,716,83,83,83,1.0,0.994,0.993,0.991,0.998,0.999,1.0,1.0
best10CuratedDegree1.degree1ESCA_01,0.996759,0.005057,0.998,713,83,83,83,1.0,0.995,0.997,0.997,0.999,0.999,1.0,1.0


In [8]:
# print(evalBest10CuratedDegree1SpecificityDF.shape)
# evalBest10CuratedDegree1SpecificityDF.head()

In [9]:
# one row per (stage, category) that metricsRunner flagged for the specificity threshold
print(evalBest10CuratedDegree1SpecificityBellowThresholdDF.shape)
# last expression of the cell: jupyter renders the data frame
evalBest10CuratedDegree1SpecificityBellowThresholdDF

(1, 3)


Unnamed: 0_level_0,stage,category,value
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
45,best1CuratedDegree1,LGG,True


## Best 10 CuratedDegree1 Sensitivity

In [10]:
# Sensitivity settings. A threshold of 0.70 was used by an earlier (now disabled)
# version of this call; 0.90 is the value currently in effect.
evaluateBest10CuratedDegree1Metric = 'sensitivity'
evaluateBest10CuratedDegree1Threshold = 0.90

t = evaluateBest10CuratedDegree1(
    metric=evaluateBest10CuratedDegree1Metric,
    threshold=evaluateBest10CuratedDegree1Threshold,
)

(evalBest10CuratedDegree1SensitivityDF,
 evalBest10CuratedDegree1SensitivityBellowThresholdDF) = t

metric : sensitivity threshold: 0.9

saving : /private/groups/kimlab/aedavids/deconvolution/1vsAll-~gender_category/hyperParameter/hyperparameterTunningResults6.out/best10CuratedDegree1Runs.sensitivity.0.9.csv

saving : /private/groups/kimlab/aedavids/deconvolution/1vsAll-~gender_category/hyperParameter/hyperparameterTunningResults6.out/best10CuratedDegree1Runs.sensitivity.bellow.0.9.csv


id,mean_sensitivity,std_sensitivity,median_sensitivity,numGenes,numTypes,numDegree1,numAboveThreshold,LUAD,LUSC,COAD,READ,ESCA,LIHC,STAD,Whole_Blood
best1CuratedDegree1,0.486398,0.359127,0.484,83,83,83,15,0.246,0.203,0.304,0.607,0.486,0.789,0.431,0.989
best2CuratedDegree1,0.621855,0.310492,0.716,161,83,83,17,0.146,0.296,0.411,0.589,0.486,0.807,0.28,0.989
best3CuratedDegree1,0.668783,0.304236,0.792,236,83,83,23,0.172,0.359,0.424,0.607,0.423,0.839,0.293,0.985
best5CuratedDegree1,0.720494,0.258503,0.804,380,83,83,25,0.22,0.488,0.614,0.625,0.441,0.888,0.267,0.982
best10CuratedDegree1_ce467ff,0.808831,0.202235,0.871,716,83,83,36,0.841,0.691,0.652,0.607,0.369,0.874,0.409,1.0
best10CuratedDegree1,0.786482,0.206089,0.833,716,83,83,30,0.485,0.595,0.627,0.679,0.396,0.906,0.271,0.987
best10CuratedDegree1.degree1ESCA_01,0.70541,0.214658,0.747,713,83,83,19,0.382,0.575,0.525,0.625,0.27,0.852,0.156,0.993


In [11]:
# print(evalBest10CuratedDegree1SensitivityDF.shape)
# print(evalBest10CuratedDegree1SensitivityDF.index)
# evalBest10CuratedDegree1SensitivityDF.head()

In [15]:
def findUnderPerformaingClasses(df: pd.DataFrame, bellowThresholdDF: pd.DataFrame) -> pd.DataFrame:
    '''
    Collect, for every stage, the metric values of the categories listed for
    that stage in bellowThresholdDF.

    arguments:
        df :
            index holds the stage (run) names, columns include the categories

        bellowThresholdDF :
            must have the columns 'stage' and 'category'

    returns:
        data frame with one column per stage (in order of first appearance in
        bellowThresholdDF) and one row per category, sorted by category.
        A cell is NaN when the category is not listed for that stage.
        An empty data frame is returned if bellowThresholdDF has no rows.
    '''
    stageNames = bellowThresholdDF.loc[:, 'stage'].unique()
    print(f'stageNames : {stageNames}')

    # stageNames is a numpy array, so test its length explicitly.
    # Without this guard pd.concat() raises ValueError on an empty list.
    if len(stageNames) == 0:
        return pd.DataFrame()

    seriesList = []
    for stageName in stageNames:
        selectRows = bellowThresholdDF.loc[:, 'stage'] == stageName
        bellowCategories = bellowThresholdDF.loc[selectRows, 'category']
        # series indexed by category, named after the stage
        seriesList.append(df.loc[stageName, bellowCategories])

    # axis=1 : each stage becomes a column; categories are aligned on the index
    retDF = pd.concat(seriesList, axis=1)
    return retDF.sort_index()
    

# rows: categories listed as below threshold for at least one run; columns: runs
aedwipDF = findUnderPerformaingClasses(
    evalBest10CuratedDegree1SensitivityDF,
    evalBest10CuratedDegree1SensitivityBellowThresholdDF,
)
print('\n#############')
display(aedwipDF)

# A NaN in a run's column means the category was not listed as below threshold
# for that run, but was listed for at least one other run. Categories that are
# never listed are not in aedwipDF at all, so these counts are NOT the total
# number of classes above threshold (see the numAboveThreshold column for that).
# NOTE(review): a NaN metric value in the input frame would also be counted -- confirm there are none.
print('\n####### number of classes not below threshold in this run, but below threshold in at least one other run')
aedwipIsNaNDF = aedwipDF.isna()
aedwipNumNaN = aedwipIsNaNDF.sum()
print(aedwipNumNaN)

print(f'\n####### {evaluateBest10CuratedDegree1Metric} classes not below threshold : {evaluateBest10CuratedDegree1Threshold} evaluateBest10CuratedDegree1 (only classes below threshold in at least one other run are listed)')
for c in aedwipIsNaNDF.columns:
    booleanSeries = aedwipIsNaNDF.loc[:, c]
    # keep only the rows that are NaN for this run; we only need their category names
    nanNames = aedwipDF.loc[booleanSeries, c]
    print(f'\n{c} nanNames : {nanNames.index.tolist()}')

stageNames : ['best1CuratedDegree1' 'best2CuratedDegree1' 'best3CuratedDegree1'
 'best5CuratedDegree1' 'best10CuratedDegree1_ce467ff'
 'best10CuratedDegree1' 'best10CuratedDegree1.degree1ESCA_01']

#############


Unnamed: 0_level_0,best1CuratedDegree1,best2CuratedDegree1,best3CuratedDegree1,best5CuratedDegree1,best10CuratedDegree1_ce467ff,best10CuratedDegree1,best10CuratedDegree1.degree1ESCA_01
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
ACC,0.625,0.812,0.833,0.854,,0.896,0.458
Adipose_Subcutaneous,0.776,0.781,0.842,,,,0.852
Adipose_Visceral_Omentum,0.043,0.462,0.557,0.603,0.763,0.818,0.643
Adrenal_Gland,0.297,0.735,,,,,0.781
Artery_Aorta,0.88,,,,,,
Artery_Coronary,0.069,0.125,0.208,0.535,0.681,0.792,0.646
Artery_Tibial,0.874,,,,,,
BLCA,0.168,0.545,0.713,0.758,0.75,0.807,0.672
BRCA,0.611,0.693,0.687,0.696,0.618,0.534,0.549
Bladder,0.556,0.556,0.556,0.556,0.778,0.778,0.667



####### number of classes above threshold
best1CuratedDegree1                     7
best2CuratedDegree1                     9
best3CuratedDegree1                    15
best5CuratedDegree1                    17
best10CuratedDegree1_ce467ff           28
best10CuratedDegree1                   22
best10CuratedDegree1.degree1ESCA_01    11
dtype: int64

####### sensitivity classes above threshold : 0.9 evaluateBest10CuratedDegree1 

best1CuratedDegree1 nanNames : ['Brain_Cerebellar_Hemisphere', 'Heart_Atrial_Appendage', 'KICH', 'LGG', 'Muscle_Skeletal', 'PCPG', 'Pituitary']

best2CuratedDegree1 nanNames : ['Artery_Aorta', 'Artery_Tibial', 'Brain_Cerebellar_Hemisphere', 'Heart_Atrial_Appendage', 'KICH', 'LGG', 'PCPG', 'PRAD', 'Thyroid']

best3CuratedDegree1 nanNames : ['Adrenal_Gland', 'Artery_Aorta', 'Artery_Tibial', 'Brain_Cerebellar_Hemisphere', 'Brain_Spinal_cord_cervical_c-1', 'DLBC', 'Esophagus_Mucosa', 'Heart_Atrial_Appendage', 'Heart_Left_Ventricle', 'KICH', 'LGG', 'Lung', 'Muscle_Sk

In [None]:
# print(evalBest10CuratedDegree1SensitivityBellowThresholdDF.shape)
# print(evalBest10CuratedDegree1SensitivityBellowThresholdDF.index)

# evalBest10CuratedDegree1SensitivityBellowThresholdDF

In [None]:
# NOTE(review): `aedwip` is not assigned in any visible cell, so evaluating it raises NameError.
# Presumably a work-in-progress marker that stops "Run All" at this point -- confirm before removing.
aedwip

In [None]:
# 3/29 with Daniel
# NOTE(review): evalBest10LFCSpecificityDF is not assigned in any visible cell of this
# notebook; this cell raises NameError as written. Both run names below are rows of
# evalBest10CuratedDegree1SpecificityDF -- confirm whether that frame was intended.
runOfInsterest = ["best10CuratedDegree1_ce467ff", "best10CuratedDegree1", ]
evalBest10LFCSpecificityDF.loc[ runOfInsterest ]

## which classes in best10LFC_GTEx_TCGA underperformed?
<span style="color:red;background-color:yellow">aedwip: look at the output of foo(); it looks like we could manually add genes from "best10CuratedDegree1_ce467ff"</span>


In [None]:
def foo(
    df : pd.DataFrame,
    stage : str = "best10LFC_GTEx_TCGA",
    ) -> pd.Series:
    '''
    Select the categories recorded for a single stage.

    arguments:
        df    : data frame with the columns 'stage' and 'category'
        stage : value to match in the 'stage' column

    returns:
        the 'category' values of the rows whose 'stage' equals stage.
        The original row index is preserved.
    '''
    isStage = df["stage"].eq(stage)
    return df["category"][isStage]



# NOTE(review): evalBestLFCSensitivityBellowThresholdDF and evalBest10LFCSensitivityDF are
# not assigned in any visible cell, and "best10LFC_GTEx_TCGA" is not one of the runs
# evaluated by evaluateBest10CuratedDegree1(). Presumably carried over from an earlier
# notebook -- confirm before running.
stage = "best10LFC_GTEx_TCGA"
# categories recorded for `stage` in the below-threshold frame
belowSeries = foo(evalBestLFCSensitivityBellowThresholdDF, stage)
print(f'belowSeries.shape : {belowSeries.shape}')

print('\n\n^^^^^^^^^^^^^^^')
# all runs, restricted to the columns named in belowSeries
display( evalBest10LFCSensitivityDF.loc[:, belowSeries.values] )

In [None]:
# NOTE(review): depends on stage and belowSeries from the previous cell and on
# evalBest10LFCSensitivityDF, which is not assigned in any visible cell.
# boolean mask selecting the single row whose index equals `stage`
selectBellowSenitivityRows = evalBest10LFCSensitivityDF.index == stage
display( evalBest10LFCSensitivityDF.loc[selectBellowSenitivityRows, belowSeries.values] )
# NOTE: this rebinds aedwipDF, replacing any value assigned to it by an earlier cell
aedwipDF = evalBest10LFCSensitivityDF.loc[selectBellowSenitivityRows, belowSeries.values]
# one row per category, ordered by the value in the `stage` column
aedwipDF.transpose().sort_values(by=stage)

## find difference between best10CuratedDegree1_ce467ff and best10CuratedDegree1
These two runs have the highest mean sensitivity

<span style="color:red;background-color:yellow">manual curation </span>  
Look for negative differences. These are the categories where best10CuratedDegree1 did better. We should be able to ...
 


In [None]:
# threshold = 0.7
# summary statistic columns; every other column holds a per category sensitivity
metricCols = ['mean_sensitivity', 'std_sensitivity',
       'median_sensitivity', 'numGenes', 'numTypes', 'numDegree1',
       'numAboveThreshold']

# The sensitivity results of this notebook live in evalBest10CuratedDegree1SensitivityDF.
# The previously referenced evalBest10LFCSensitivityDF is not assigned in any visible
# cell, so the cell raised NameError.
# boolean mask over the columns: True for category columns
categoryCols = ~ evalBest10CuratedDegree1SensitivityDF.columns.isin(metricCols)

# the two runs being compared; both are rows of the sensitivity frame
selectBestRuns = ["best10CuratedDegree1_ce467ff", "best10CuratedDegree1", ]

# .copy() : the 'diff' row is added to an independent frame, never to a
# slice of the original results
bestRunsDF = evalBest10CuratedDegree1SensitivityDF.loc[selectBestRuns, categoryCols].copy()

# positive: best10CuratedDegree1_ce467ff did better; negative: best10CuratedDegree1 did better
bestRunsDF.loc['diff'] = bestRunsDF.loc['best10CuratedDegree1_ce467ff', :] - bestRunsDF.loc['best10CuratedDegree1', :]
bestRunsDF

In [None]:
# NOTE(review): evalBest10LFCSensitivityDF is not assigned in any visible cell, and
# "best10LFC_CuratedDegree1" is not one of the runs evaluated by
# evaluateBest10CuratedDegree1(). Presumably carried over from an earlier notebook -- confirm.
# categoryCols comes from the previous cell.
xxxRunsDF = evalBest10LFCSensitivityDF.loc[["best10CuratedDegree1_ce467ff", "best10LFC_CuratedDegree1", ] , categoryCols]
# per category difference between the two selected runs
xxxRunsDF.loc['diff'] = xxxRunsDF.loc['best10CuratedDegree1_ce467ff', :] - xxxRunsDF.loc['best10LFC_CuratedDegree1', :] #.transpose()
xxxRunsDF

# <span style="color:red;background-color:yellow">Esophagus </span>

In [None]:
# categories inspected together in the Esophagus section
esophCategories = ['Esophagus_Gastroesophageal_Junction','Esophagus_Mucosa', 
                   'Esophagus_Muscularis', 'ESCA', 'HNSC' ]

# NOTE(review): evalBest10LFCSpecificityDF and evalBest10LFCSensitivityDF are not
# assigned in any visible cell -- confirm which results frames were intended.
print(f'specificity')
display( evalBest10LFCSpecificityDF.loc[:, esophCategories] )

print(f'\nsensitivity')
evalBest10LFCSensitivityDF.loc[:, esophCategories]

# <span style="color:red;background-color:yellow"> Uterus</span>

In [None]:
# categories inspected together in the Uterus section
uterusCategories = ['Uterus', 'UCEC', 'UCS' ]

# NOTE(review): evalBest10LFCSpecificityDF and evalBest10LFCSensitivityDF are not
# assigned in any visible cell -- confirm which results frames were intended.
print(f'specificity')
display( evalBest10LFCSpecificityDF.loc[:, uterusCategories] )

print(f'\nsensitivity')
evalBest10LFCSensitivityDF.loc[:, uterusCategories]

# <span style="color:red;background-color:yellow">AEDWIP </span>


In [None]:
# lung related categories; only referenced by commented out code in the visible cells
lungCategories = ['Lung', 'LUAD', 'LUSC']
# best10CuratedDegree1_ce467ff
# display( evalBest10LFCSpecificityDF.loc[:,  metricCols + lungCategories] )

# somewhere above we messed up the df

# # if we do not bind the results, jupyter lab will print them.
# evalBest10LFCSensitivityDF, evalBestLFCSensitivityBellowThresholdDF = \
#     evaluateBest10LCFResults(threshold=0.70, metric='sensitivity', verbose=False)

# display( evalBest10LFCSensitivityDF )


# evalBest10LFCSpecificityDF, evalBestLFCSpecificityBellowThresholdDF = \
#     evaluateBest10LCFResults(threshold=0.96, metric='specificity')

# display(evalBest10LFCSpecificityDF)



In [None]:
# summary columns shown next to the per category sensitivity values
sensitivityMetrics = ['mean_sensitivity',  
'numGenes', 'numTypes', 'numDegree1', ]

# summary columns shown next to the per category specificity values
specificityMetrics = ['mean_specificity', 'numGenes', 'numTypes', 'numDegree1', ]

# categories displayed for the selected run (ESCA is appended below)
elifeBinaryFeatures = ['Stomach', 'Liver', 'Lung', 'COAD', 'Esophagus_Mucosa']

# NOTE(review): evalBest10LFCSpecificityDF and evalBest10LFCSensitivityDF are not
# assigned in any visible cell -- confirm which results frames were intended.
# The row label is wrapped in a list so .loc returns a one row data frame.
display(evalBest10LFCSpecificityDF.loc[["best10CuratedDegree1_ce467ff"],
                                        specificityMetrics + elifeBinaryFeatures + ['ESCA']])

display(evalBest10LFCSensitivityDF.loc[["best10CuratedDegree1_ce467ff"],
                                        sensitivityMetrics + elifeBinaryFeatures + ['ESCA']])

# NOTE(review): aedwipDF1 / aedwipDF2 are only assigned in the commented out lines
# below, yet later cells read them.
# aedwipDF1 = evalBest10LFCSpecificityDF.loc[ ['best10CuratedDegree1_ce467ff'], sensitivityMetrics + lungCategories]
# aedwipDF2 = evalBest10LFCSensitivityDF.loc[ ['best10CuratedDegree1_ce467ff'], specificityMetrics + lungCategories]
# # display(evalBest10LFCSpecificityDF.loc[ ['best10CuratedDegree1_ce467ff'], specificityMetrics + lungCategories])
# display(evalBest10LFCSensitivityDF.loc[ ['best10CuratedDegree1_ce467ff'], sensitivityMetrics + lungCategories])
# print(metricCols)

In [None]:
from IPython.display import HTML
# NOTE(review): aedwipDF1 is only assigned in commented out code in the visible cells -- confirm it exists before running.
# render the frame as an html table without the index column
HTML(aedwipDF1.to_html(index=False))

In [None]:
# NOTE(review): aedwipDF2 is only assigned in commented out code in the visible cells -- confirm it exists before running.
HTML(aedwipDF2.to_html(index=False))

In [None]:
# NOTE(review): aedwipDF1 is only assigned in commented out code in the visible cells -- confirm it exists before running.
aedwipDF1.to_string(index=False)

In [None]:
# NOTE(review): aedwipDF2 is only assigned in commented out code in the visible cells -- confirm it exists before running.
aedwipDF2.to_string(index=False)

In [None]:
# NOTE(review): evalBest10LFCSpecificityDF is not assigned in any visible cell -- confirm which results frame was intended.
evalBest10LFCSpecificityDF.columns