# Random Forest GTEx_TCGA Gene Signature 

Andrew E. Davidson  
aedavids@ucsc.edu  
7/15/24

Copyright (c) 2020-2023, Regents of the University of California All rights reserved. https://polyformproject.org/licenses/noncommercial/1.0.0

ref:  
* extraCellularRNA/deconvolutionAnalysis/jupyterNotebooks/randomForestGeneSignatureDeconvolution.ipynb
* extraCellularRNA/deconvolutionAnalysis/jupyterNotebooks/randomForestGeneSignatureDeconvolutionPOC.ipynb
* extraCellularRNA/intraExtraRNA_POC/jupyterNotebooks/elife/elifeBinaryRandomForestResults.ipynb

## <span style="color:red;background-color:yellow">TODO</span>
- deconvolutionAnalysis/jupyterNotebooks/randomForestGeneSignatureDeconvolution.ipynb

- deconvolutionAnalysis/jupyterNotebooks/randomForestGeneSignatureDeconvolutionPOC.ipynb

  
- deconvolutionAnalysis/jupyterNotebooks/randomForestGeneSignatureDeconvolutionPOC.ipynb
  * can we find these missing genes
  * KeyError: "['ENSG00000263264.2', 'ENSG00000288380.1', 'ENSG00000274031.1'] not in index"


## **<span style="color:red;background-color:yellow">the bulk tissue model fails to predict elife</span>**  
92% of the elife samples are predicted to be 'Whole_Blood'.

- todo
  * treat like a multi-model, i.e. what is the next most probable class?
  * train a new model on bulk without the whole_blood bio markers. Label whole_blood samples as unknown, i.e. not one of the 82 other classes
    + elife healthy control should map to unknown?
  * look for likelihood vectors with no strong signal 

In [1]:
import ipynbname

# use display() to print an html version of a data frame
# useful if the data frame output is not generated by the last line of the cell
from IPython.display import display
import joblib
import logging
import matplotlib.pyplot as plt 
import numpy as np
import os
import pandas as pd
# display all columns
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# from sklearn.metrics import confusion_matrix
from sklearn.ensemble      import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder

# from sklearn.metrics         import recall_score
# from sklearn.metrics         import roc_auc_score
# from sklearn.metrics         import make_scorer

import sys

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
# Derive the notebook's name/location and create the output directories
# (top level, model, images) that the notebook writes to.
notebookName = ipynbname.name()
notebookPath = ipynbname.path()
notebookDir = os.path.dirname(notebookPath)

# output is written to shared group storage, not next to the notebook
#outDir = f'{notebookDir}/{notebookName}.out'
outDir = f'/private/groups/kimlab/aedavids/deconvolution/{notebookName}.out'
os.makedirs(outDir, exist_ok=True)
print(f'outDir:\n{outDir}')

modelOutDir = os.path.join(outDir, "model")
os.makedirs(modelOutDir, exist_ok=True)
print(f'\nmodelOutDir ;\n{modelOutDir}')

imgOut = f'{outDir}/img'
os.makedirs(imgOut, exist_ok=True)
print(f'\nimgOut :\n{imgOut}')

#loglevel = "INFO"
loglevel = "WARN"
# logFMT = "%(asctime)s %(levelname)s [thr:%(threadName)s %(name)s %(funcName)s() line:%(lineno)s] [%(message)s]"
# the opening '[' was lost when the thread name was removed; keep the brackets balanced
logFMT = "%(asctime)s %(levelname)s [%(name)s %(funcName)s() line:%(lineno)s] [%(message)s]"
logging.basicConfig(format=logFMT, level=loglevel)    
logger = logging.getLogger(notebookName)

outDir:
/private/groups/kimlab/aedavids/deconvolution/randomForestGTEx_TCGAGeneSignature.out

modelOutDir ;
/private/groups/kimlab/aedavids/deconvolution/randomForestGTEx_TCGAGeneSignature.out/model

imgOut :
/private/groups/kimlab/aedavids/deconvolution/randomForestGTEx_TCGAGeneSignature.out/img


In [3]:
# setting the python path allows us to run python scripts from using
# the CLI. 
PYTHONPATH = os.environ['PYTHONPATH']
print("ORIG_PYTHONPATH: {}\n".format(PYTHONPATH))

gitRepoRoot = !git rev-parse --show-toplevel
gitRepoRoot = gitRepoRoot[0]

#
# add deconvolutionAnalysis modules
#
deconvolutionModules = f'{gitRepoRoot}/deconvolutionAnalysis/python'
print("deconvolutionModules: {}\n".format(deconvolutionModules))

PYTHONPATH = PYTHONPATH + f':{deconvolutionModules}'
#print("PYTHONPATH: {}\n".format(PYTHONPATH))

sys.path.append( str(deconvolutionModules) )
#print("\nsys.path:\n{}\n".format(sys.path))

#
# add intraExtraRNA_POC modules
#
intraExtraRNA_POCModules = f'{gitRepoRoot}/intraExtraRNA_POC/python/src'
print("intraExtraRNA_POCModules: {}\n".format(intraExtraRNA_POCModules))

PYTHONPATH = PYTHONPATH + f':{intraExtraRNA_POCModules}'
#print("PYTHONPATH: {}\n".format(PYTHONPATH))

sys.path.append( str(intraExtraRNA_POCModules) )
#print("\nsys.path:\n{}\n".format(sys.path))

ORIG_PYTHONPATH: :/private/home/aedavids/extraCellularRNA/src

deconvolutionModules: /private/home/aedavids/extraCellularRNA/deconvolutionAnalysis/python

intraExtraRNA_POCModules: /private/home/aedavids/extraCellularRNA/intraExtraRNA_POC/python/src



In [4]:
# local imports
# from analysis.utilities import saveList
# from analysis.utilities import loadList
from intraExtraRNA.elifeUtilities import loadElifeTrainingData
from intraExtraRNA.elifeUtilities import validElifeCategories
from models.mlUtilities import encoder2Dict
from models.mlUtilities import loadEncoder

## Load Gene Expression Data

In [5]:
# Read the GTEx_TCGA training colData and collect the distinct values of its
# "category" column (the GTEx and TCGA types and classes).
colDataPath = "/private/groups/kimlab/GTEx_TCGA/groupbyGeneTrainingSets/GTEx_TCGA_TrainColData.csv"
colDataDF = pd.read_csv(colDataPath)
print(f'colDataDF.shape : {colDataDF.shape}')

categories = colDataDF["category"].unique()
print(f'len(categories) : {len(categories)}')

# peek at the first few categories
categories[:5]

colDataDF.shape : (15801, 6)
len(categories) : 83


array(['Adipose_Subcutaneous', 'Artery_Tibial', 'Heart_Atrial_Appendage',
       'Breast_Mammary_Tissue', 'Brain_Cortex'], dtype=object)

In [6]:
%%time
# Load the elife data for the given pipeline stage.
# loadElifeTrainingData() is imported from intraExtraRNA.elifeUtilities; its
# exact contract is not visible in this notebook.
pipelineStageName = "best10CuratedDegree1_ce467ff"
t = loadElifeTrainingData(pipelineStageName,
                             categories,
                             validElifeCategories,
                             )

# The loader returns nine values. The ones used later in this notebook:
#   missingGenes      : genes that could not be found (printed below)
#   XDF               : feature data frame passed (after re-ordering) to the model
#   yNP               : labels, decoded with labelEncoderElife.inverse_transform()
#   labelEncoderElife : encoder for the elife labels
#   mapDF             : mapping table with columns HUGO_v35, ENSG_v35, ENSG_v39
HUGO_Genes, elifeGenes, missingGenes, countDF, metaDF, XDF, yNP, labelEncoderElife, mapDF = t

print(f'missingGenes:\n{missingGenes}')

 : ['ENSG00000288380', 'ENSG00000274031', 'ENSG00000263264', 'ENSG00000244693']]


missingGenes:
['ENSG00000288380', 'ENSG00000274031', 'ENSG00000263264', 'ENSG00000244693']
CPU times: user 1min 19s, sys: 12.5 s, total: 1min 31s
Wall time: 1min 33s


## Load model

In [7]:
# Load the previously trained random forest from disk.
#modelName = 'best10CuratedDegree1_ce467ff' # full model with 716 bio marker genes
modelName = 'best10CuratedDegree1_ce467ff_elife' # does not include 4 missing genes 
modelRootDir = '/private/groups/kimlab/aedavids/deconvolution/randomForestGeneSignatureDeconvolution.out/model'
modelPath = f"{modelRootDir}/{modelName}.joblib"

print(f'loading model: {modelPath}')
# NOTE: joblib.load() unpickles the file; only load model files from a trusted source
model = joblib.load(modelPath)

print(f'modelName : {modelName}')
# last expression of the cell: displays the model
model

loading model: /private/groups/kimlab/aedavids/deconvolution/randomForestGeneSignatureDeconvolution.out/model/best10CuratedDegree1_ce467ff_elife.joblib
modelName : best10CuratedDegree1_ce467ff_elife


### find missing features
The full best10CuratedDegree1_ce467ff run used v35 genes named using HUGO ids. The full random forest has 716 genes. 
Elife uses v39 ENSG ids. 4 of the v35 genes are missing. Find the missing ids in HUGO format and save them to the model out directory.
This will make it possible to train an elife-specific model.

In [8]:
# Only runs for the full model: find the model features that have no entry in
# the elife mapping table and save them (in HUGO format) next to the model.
if modelName == 'best10CuratedDegree1_ce467ff':
    # find mapping for features in elife that match our model
    hugoModelFeatures = model.feature_names_in_
    selectHugoModelFeature = mapDF.loc[:, "HUGO_v35"].isin( hugoModelFeatures )
    
    matchingMapDF = mapDF.loc[ selectHugoModelFeature, : ]
    print( f'number of features : {model.n_features_in_}' )
    print(f'matchingMapDF.shape : {matchingMapDF.shape}' )
    display(matchingMapDF.head())
    
    # find model features that are not in elife in Hugo format.
    # sorted() keeps the saved file reproducible: the iteration order of a set
    # of strings can differ from one python session to the next
    missingElifeGenesInHUGOFormat = sorted( set( hugoModelFeatures ) - set( matchingMapDF.loc[:, 'HUGO_v35'] ) )
    print(f'missing elife genes in hugo format : {missingElifeGenesInHUGOFormat}' )

    missingElifeFeaturesPath =  f"{modelRootDir}/missingElifeFeatures.csv"
    missingElifeGenesSeries = pd.Series( missingElifeGenesInHUGOFormat, name="missing" )
    missingElifeGenesSeries.to_csv( missingElifeFeaturesPath, index=False )
    print(f'saving missing features list to :\n{missingElifeFeaturesPath}' )

In [9]:
# Load the label encoder that was saved alongside the model. It maps the
# model's integer predictions back to class names.
encoderPath = f'{modelRootDir}/{modelName}.labelEncoder.txt'
labelEncoder = loadEncoder(encoderPath)
print(f'labelEncoder: \n{encoder2Dict(labelEncoder)}')

classes = labelEncoder.classes_
print(f'labelEncoder len(classes) :\n{len(classes)}')

labelEncoder: 
{'ACC': 0, 'Adipose_Subcutaneous': 1, 'Adipose_Visceral_Omentum': 2, 'Adrenal_Gland': 3, 'Artery_Aorta': 4, 'Artery_Coronary': 5, 'Artery_Tibial': 6, 'BLCA': 7, 'BRCA': 8, 'Bladder': 9, 'Brain_Amygdala': 10, 'Brain_Anterior_cingulate_cortex_BA24': 11, 'Brain_Caudate_basal_ganglia': 12, 'Brain_Cerebellar_Hemisphere': 13, 'Brain_Cerebellum': 14, 'Brain_Cortex': 15, 'Brain_Frontal_Cortex_BA9': 16, 'Brain_Hippocampus': 17, 'Brain_Hypothalamus': 18, 'Brain_Nucleus_accumbens_basal_ganglia': 19, 'Brain_Putamen_basal_ganglia': 20, 'Brain_Spinal_cord_cervical_c-1': 21, 'Brain_Substantia_nigra': 22, 'Breast_Mammary_Tissue': 23, 'CESC': 24, 'CHOL': 25, 'COAD': 26, 'Cells_Cultured_fibroblasts': 27, 'Cells_EBV-transformed_lymphocytes': 28, 'Cervix_Endocervix': 29, 'Colon_Sigmoid': 30, 'Colon_Transverse': 31, 'DLBC': 32, 'ESCA': 33, 'Esophagus_Gastroesophageal_Junction': 34, 'Esophagus_Mucosa': 35, 'Esophagus_Muscularis': 36, 'GBM': 37, 'HNSC': 38, 'Heart_Atrial_Appendage': 39, 'Heart

## Make Predictions

In [10]:
# we need to make sure the features (cols) are in correct order
print(model.n_features_in_)
# The attribute feature_names_in_ will only be valid if model was trained using a data frame
# numpy arrays do not have feature name information
orderedFeatureHugoList = model.feature_names_in_
print( f'len(orderedFeatureHugoList) : {len(orderedFeatureHugoList)}' )
print(f'len(XDF.columns) : {len(XDF.columns)}' )

# only report a problem when the column counts really disagree
if len(XDF.columns) != len(orderedFeatureHugoList):
    logger.error('AEDWIP elife has %d features, the model expects %d',
                 len(XDF.columns), len(orderedFeatureHugoList))
# print( f'\norderedFeatureHugoList[0:5] : { orderedFeatureHugoList[0:5]} ') 
# print( f'\norderedFeatureHugoList[-5:] : { orderedFeatureHugoList[-5:]} ') 

# print( 'PLCXD1' in orderedFeatureHugoList ) 
# # display( mapDF.loc['PLCXD1', :] )
# display( mapDF[ mapDF['HUGO_v35'] =='PLCXD1' ] )

2024-07-24 13:53:05,227 ERROR randomForestGTEx_TCGAGeneSignature <module>() line:8] [AEDWIP elife has an extra feature]


712
len(orderedFeatureHugoList) : 712
len(XDF.columns) : 713


In [11]:
def dropDuplicateColumns(orderedFeatureHugoList, XDF, mapDF):
    '''
    Return a copy of XDF in which every repeated column name appears only once.

    The first occurrence of each column name is kept, later occurrences are
    dropped. A repeated column is only dropped after checking that its values
    are identical to the first occurrence. Works for any number of repeated
    names, and for data frames without repeats (a plain copy is returned).

    Why XDF has a duplicate is not known for sure. It might be because a given
    gene can have several transcripts, each with a different bio type, which
    results in multiple entries in the mapping table. It could also be caused
    by the hack added to map the elife genes.

    arguments
        orderedFeatureHugoList
            the model's feature names. Only the length is used (debug print)

        XDF
            data frame whose columns may contain repeated names

        mapDF
            not used. kept so existing callers do not have to change

    returns
        a new data frame. The relative order of the remaining columns is
        unchanged

    raises
        AssertionError if two columns share a name but hold different values
    '''
    print('begin')
    print( f'len(orderedFeatureHugoList) : {len(orderedFeatureHugoList)}' )
    print(f'len(XDF.columns) : {len(XDF.columns)}' )

    columnsLst = list(XDF.columns)

    # single pass: remember every position at which each column name occurs
    positionsByName = {}
    for idx, c in enumerate(columnsLst):
        positionsByName.setdefault(c, []).append(idx)

    dupIdxs = []
    dropIdxs = set()
    for c, positions in positionsByName.items():
        if len(positions) < 2:
            continue

        for idx in positions:
            print(f"{c} is a duplicate idx: {idx}")
        dupIdxs.extend(positions)

        # explicit raise instead of an assert statement so the check is not
        # stripped when python runs with -O
        firstValues = XDF.iloc[:, positions[0]]
        for idx in positions[1:]:
            if not firstValues.equals( XDF.iloc[:, idx] ):
                raise AssertionError(
                    'ERROR :The column names are the sames how ever values are differnt'
                    f' column: {c} idx: {positions[0]} and {idx}')
            dropIdxs.add(idx)

    print(dupIdxs)

    # select by position: selecting by name would return all the repeats
    print( len(columnsLst) )
    intIdx = [i for i in range(len(columnsLst)) if i not in dropIdxs]
    print( len(intIdx) )

    return XDF.iloc[:, intIdx].copy()


# tmpXDF is XDF with the repeated column removed; it is what gets re-ordered
# and passed to the model in the cells that follow
tmpXDF = dropDuplicateColumns(orderedFeatureHugoList, XDF, mapDF)
print(f'tmpXDF.shape : {tmpXDF.shape}')

begin
len(orderedFeatureHugoList) : 712
len(XDF.columns) : 713
ENSG00000182378.15 is a duplicate idx: 670
ENSG00000182378.15 is a duplicate idx: 712
[670, 712]
713
712
tmpXDF.shape : (224, 712)


In [12]:
# the values in the mapDF columns are not unique (a gene can have several transcripts, with different biotypes)
# do not set index
# tmpDF1 = mapDF.copy()
# # use tail to debug. the ENSG biomarkers are on the bottom. The top are repeats
# display( tmpDF1.tail() )

# print()
# display( tmpDF1.tail() )

# print()
# # display(tmpDF1.loc['PLCXD1', :] )

In [13]:
# reorder the rows
# V39FeatureOrderSeries = tmpDF1.loc[ orderedFeatureHugoList, 'ENSG_v39' ]
# display( V39FeatureOrderSeries.head() )
# print()
# display( V39FeatureOrderSeries.tail() )
# print()
# print( V39FeatureOrderSeries['ENSG00000182378.15'] )
#print( V39FeatureOrderSeries['ENSG00000182378.15_PAR_Y'] )
# print( V39FeatureOrderSeries['ENSG00000182378.14'] )
# print( V39FeatureOrderSeries[''] )
# print( V39FeatureOrderSeries[''] )

In [14]:

def bar():
    '''
    debugging aid: print the first 5 entries of HUGO_Genes and elifeGenes and
    display the mapDF rows for those HUGO names

    reads the notebook globals HUGO_Genes, elifeGenes and mapDF. returns nothing
    '''
    print('we can not use the hugo and elife genes list they are not order correctly')

    firstHugoGenes = HUGO_Genes[0:5]
    print( firstHugoGenes )

    firstElifeGenes = elifeGenes[0:5]
    print(firstElifeGenes )

    # mapping rows whose HUGO name is one of the five printed above
    isFirstHugoGene = mapDF.loc[:,'HUGO_v35'].isin( firstHugoGenes )
    display( mapDF.loc[isFirstHugoGene, :] )
   

# run the ordering check; it only prints/displays, nothing is stored
bar()

we can not use the hugo and elife genes list they are not order correctly
['TPRN', 'DENND6B', 'CRTAC1', 'AC069281.2', 'SLC9A3-AS1']
['ENSG00000198744.5', 'ENSG00000187642.9', 'ENSG00000215915.10', 'ENSG00000235169.11', 'ENSG00000171819.5']


Unnamed: 0,HUGO_v35,ENSG_v35,ENSG_v39
174,SLC9A3-AS1,ENSG00000225138.8,ENSG00000225138.8
257,AC069281.2,ENSG00000274272.1,ENSG00000274272.1
320,TPRN,ENSG00000176058.13,ENSG00000176058.13
337,CRTAC1,ENSG00000095713.14,ENSG00000095713.14
669,DENND6B,ENSG00000205593.12,ENSG00000205593.12


In [15]:
def getV39FeatureOrder():
    '''
    translate the model's ordered HUGO feature names into ENSG_v39 ids

    reads the notebook globals mapDF and orderedFeatureHugoList.

    steps
        1. keep the mapDF rows whose HUGO_v35 value is a model feature
        2. display the rows whose HUGO_v35 value occurs more than once
        3. keep only the first row for each HUGO_v35 value
        4. index by HUGO_v35 and re-order the rows to match
           orderedFeatureHugoList

    returns
        a pandas series indexed by HUGO_v35 name. The values are the ENSG_v39
        ids, in the same order as orderedFeatureHugoList
    '''
    # 1. model features only
    isModelFeature = mapDF.loc[:, 'HUGO_v35'].isin( orderedFeatureHugoList )
    modelFeatureMapDF = mapDF.loc[isModelFeature, :]

    # the HUGO values are not unique, the two sizes can differ
    print(f'len(orderedFeatureHugoList) : {len(orderedFeatureHugoList)}')
    print(f'featuredHugoMappingDF.shape : {modelFeatureMapDF.shape}')

    # 2. keep=False marks every member of a group of repeats as True
    # https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.duplicated.html
    isRepeatedHugo = modelFeatureMapDF.duplicated( subset=['HUGO_v35'], keep=False )
    print(f'\nduplicate HUGO_v35 rows')
    display( modelFeatureMapDF.loc[isRepeatedHugo, :] )

    # 3. one row per HUGO name
    oneRowPerHugoDF = modelFeatureMapDF.drop_duplicates(subset=['HUGO_v35'])
    print(f'uniquFeaturedHugoMappingDF.shape : {oneRowPerHugoDF.shape}')

    # 4. the HUGO name becomes the index so .loc can put the rows in model order
    hugoIndexedDF = oneRowPerHugoDF.set_index('HUGO_v35')
    orderedMappingDF = hugoIndexedDF.loc[orderedFeatureHugoList, :]
    print(f'orderedMappingDF.shape : {orderedMappingDF.shape}')

    print("head")
    display( orderedMappingDF.head() )
    print("tail")
    display( orderedMappingDF.tail() )

    return orderedMappingDF.loc[:, "ENSG_v39"]

# ENSG_v39 ids in the model's feature order; show both ends as a sanity check
V39FeatureOrderSeries = getV39FeatureOrder()
print(f'V39FeatureOrderSeries.shape : {V39FeatureOrderSeries.shape}')
display( V39FeatureOrderSeries[0:5] )
display( V39FeatureOrderSeries[-5:] )

len(orderedFeatureHugoList) : 712
featuredHugoMappingDF.shape : (715, 3)

duplicate HUGO_v35 rows


Unnamed: 0,HUGO_v35,ENSG_v35,ENSG_v39
670,PLCXD1,ENSG00000182378.14,ENSG00000182378.15
671,PLCXD1,ENSG00000182378.14,ENSG00000182378.15_PAR_Y
693,PLCXD1,ENSG00000182378.14_PAR_Y,ENSG00000182378.15
694,PLCXD1,ENSG00000182378.14_PAR_Y,ENSG00000182378.15_PAR_Y


uniquFeaturedHugoMappingDF.shape : (712, 3)
orderedMappingDF.shape : (712, 2)
head


Unnamed: 0_level_0,ENSG_v35,ENSG_v39
HUGO_v35,Unnamed: 1_level_1,Unnamed: 2_level_1
(GGTG)n,(GGTG)n,(GGTG)n
(GT)n,(GT)n,(GT)n
(TA)n,(TA)n,(TA)n
(TCCAC)n,(TCCAC)n,(TCCAC)n
(TCTATG)n,(TCTATG)n,(TCTATG)n


tail


Unnamed: 0_level_0,ENSG_v35,ENSG_v39
HUGO_v35,Unnamed: 1_level_1,Unnamed: 2_level_1
ZNF781,ENSG00000196381.11,ENSG00000196381.11
ZNRF1,ENSG00000186187.12,ENSG00000186187.12
ZRANB1,ENSG00000019995.6,ENSG00000019995.6
ZSWIM4,ENSG00000132003.9,ENSG00000132003.10
ZYG11B,ENSG00000162378.13,ENSG00000162378.13


V39FeatureOrderSeries.shape : (712,)


HUGO_v35
(GGTG)n        (GGTG)n
(GT)n            (GT)n
(TA)n            (TA)n
(TCCAC)n      (TCCAC)n
(TCTATG)n    (TCTATG)n
Name: ENSG_v39, dtype: object

HUGO_v35
ZNF781    ENSG00000196381.11
ZNRF1     ENSG00000186187.12
ZRANB1     ENSG00000019995.6
ZSWIM4    ENSG00000132003.10
ZYG11B    ENSG00000162378.13
Name: ENSG_v39, dtype: object

In [16]:
# put the elife columns (named with ENSG_v39 ids) in the model's feature order
XXXDF = tmpXDF.loc[:, V39FeatureOrderSeries.values]
# model expects the names to be in Hugo Format
# V39FeatureOrderSeries was built in orderedFeatureHugoList order, so the
# positional rename lines up
XXXDF.columns = orderedFeatureHugoList
print(f'XXXDF.shape : {XXXDF.shape}')

XXXDF.shape : (224, 712)


In [17]:
# V39FeatureOrderSeries = 

In [18]:
# display( XDF.iloc[:, 0:5].head() )
# t = V39FeatureOrderSeries[0:5]
# print()
# display( XDF.loc[:, t].head() )

# XXXDF = XDF.loc[:, V39FeatureOrderSeries]

In [19]:
# #V39FeatureOrderSeries['ENSG00000182378.15_PAR_Y']
# #         "ENSG00000182378.15_PAR_Y" : "ENSG00000182378.15",

# def foo():
#     # 	HUGO_v35 ENSG_v35	ENSG_v39
#     #select = mapDF.loc[:, 'HUGO_v35'] == "ENSG00000182378.15"
#     #select = mapDF.loc[:, 'ENSG_v35'] == "ENSG00000182378.15"
#     select = mapDF.loc[:, 'ENSG_v39'] == "ENSG00000182378.15"
#     display( mapDF.loc[select, :] )

# foo()

In [20]:
%%time
# predicted class codes; decoded later with labelEncoder.inverse_transform()
predictions  = model.predict(XXXDF)
print(f'\npredictions:\n{predictions[0:5]}')

# per class probabilities, one row per sample
yProbability = model.predict_proba(XXXDF)
print(f'\nyProbability:\n{yProbability[0:5]}')


predictions:
[82 82 82 82 82]

yProbability:
[[0.05 0.   0.   0.   0.01 0.01 0.02 0.01 0.   0.   0.   0.   0.01 0.01
  0.   0.   0.   0.   0.   0.04 0.01 0.01 0.   0.   0.   0.   0.   0.
  0.02 0.   0.   0.   0.01 0.   0.   0.   0.   0.   0.01 0.   0.1  0.
  0.   0.01 0.03 0.   0.01 0.01 0.   0.01 0.06 0.   0.   0.05 0.   0.
  0.   0.   0.   0.   0.01 0.   0.   0.   0.   0.   0.   0.   0.   0.
  0.   0.   0.   0.01 0.   0.   0.02 0.   0.   0.   0.   0.   0.46]
 [0.02 0.   0.   0.   0.02 0.   0.02 0.   0.   0.   0.01 0.   0.05 0.01
  0.   0.   0.   0.01 0.01 0.01 0.03 0.   0.   0.01 0.   0.   0.01 0.02
  0.02 0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.01 0.1  0.
  0.   0.   0.   0.   0.02 0.01 0.   0.05 0.   0.   0.   0.05 0.   0.
  0.   0.   0.01 0.   0.04 0.   0.   0.01 0.   0.   0.   0.02 0.   0.
  0.   0.   0.   0.   0.   0.   0.01 0.   0.   0.02 0.   0.   0.4 ]
 [0.05 0.01 0.   0.   0.   0.   0.01 0.   0.   0.   0.   0.   0.04 0.
  0.   0.   0.   0.   0.   0.   0.01 0.   

In [44]:
# evaluate predictions
# Decode both label vectors to class names, report how often 'Whole_Blood' was
# predicted, then count the samples for every (elife, prediction) pair.
elifeLabels = labelEncoderElife.inverse_transform(yNP)
predictionLabels = labelEncoder.inverse_transform( predictions )

isWholeBloodPrediction = predictionLabels == "Whole_Blood"
numWholeBlood = sum(isWholeBloodPrediction)
pctWholeBlood = round( 100 * numWholeBlood / len(predictionLabels))
print(f' number of Whole_Blood predictions : {numWholeBlood}  {pctWholeBlood}%' )


# 'i' is a column of ones; it gives groupby().count() something to count
predictionDF = pd.DataFrame( {'elife' : elifeLabels, 
                               'prediction' : predictionLabels,
                                'i' : [1] * len(yNP) } )


# grouped by the true elife label first
display( predictionDF.groupby( ["elife", 'prediction'] ).count() )

print()
# same counts, grouped by the predicted label first
display( predictionDF.groupby( ['prediction', "elife"] ).count() )

rename labelEncoder_Junk to labelEncoderElife
 number of Whole_Blood predictions : 205  92%


Unnamed: 0_level_0,Unnamed: 1_level_0,i
elife,prediction,Unnamed: 2_level_1
Colorectal Cancer,Heart_Left_Ventricle,4
Colorectal Cancer,Whole_Blood,49
Esophagus Cancer,Heart_Left_Ventricle,2
Esophagus Cancer,Whole_Blood,29
Healthy donor,Heart_Left_Ventricle,3
Healthy donor,Whole_Blood,40
Liver Cancer,Heart_Left_Ventricle,2
Liver Cancer,Testis,5
Liver Cancer,Whole_Blood,19
Lung Cancer,Testis,2





Unnamed: 0_level_0,Unnamed: 1_level_0,i
prediction,elife,Unnamed: 2_level_1
Heart_Left_Ventricle,Colorectal Cancer,4
Heart_Left_Ventricle,Esophagus Cancer,2
Heart_Left_Ventricle,Healthy donor,3
Heart_Left_Ventricle,Liver Cancer,2
Heart_Left_Ventricle,Stomach Cancer,1
Testis,Liver Cancer,5
Testis,Lung Cancer,2
Whole_Blood,Colorectal Cancer,49
Whole_Blood,Esophagus Cancer,29
Whole_Blood,Healthy donor,40


In [21]:
aedwip do not plot confusion matrix. it just a big mess

SyntaxError: invalid syntax (2149445957.py, line 1)

## extraCellularRNA/terra/jupyterNotebooks/cibersort/fractionsAsMulticlassClassification.ipynb
Copied the plotting code from advancement. make_confusion_matrix() did not work well when we have a large number of classes.

**<span style="color:red;background-color:yellow">refactor notebooks</span>**  

- deconvolutionAnalysis/jupyterNotebooks/randomForestGeneSignatureDeconvolution.ipynb
- intraExtraRNA_POC/jupyterNotebooks/elife/randomForestGTEx_TCGAGeneSignature.ipynb

In [None]:
# from confusion_matrix.cf_matrix import make_confusion_matrix

from sklearn.metrics import confusion_matrix

from pipeline.dataFactory.utilities import urlify

In [None]:
import bme263DataVis.utilities as utl
import seaborn as sns

def plotConfusionMatrix(confusionData, labels, annotations, imgOutDir, title, pageWidthInInches, 
                        pageHeightInInches, displayCounts=False):
    '''
    draws confusionData as a seaborn heat map with a separate color bar panel
    and saves the figure as a png file in imgOutDir

    arguments
        confusionData
            the matrix to plot

        labels
            tick labels, used for both the x and the y axis

        annotations
            string values to display in heat map cells, must have same shape as confusionData.
            ignored if displayCounts is true

        imgOutDir
            directory the image is written to. The file name is urlify(title) + ".png"

        title
            plot title

        pageWidthInInches, pageHeightInInches
            figure size

        displayCounts:
            boolean: default = False
            if true fmt is set to '.0f', and annotations are not displayed
    '''
    layoutUtils = utl.MatPlotLibUtilities()
    figure = plt.figure( figsize=(pageWidthInInches, pageHeightInInches) )

    # the heat map fills the page except for a narrow strip kept for the color bar
    colorBarWidthInInches = 0.25
    heatMapPanel = layoutUtils.createPanel(figure,
                                           pageWidthInInches - colorBarWidthInInches,
                                           pageHeightInInches,
                                           0,   # left, relative size
                                           0)   # bottom, relative size

    # the color bar is 2 inches shorter than the heat map
    colorBarPanel = layoutUtils.createPanel(figure,
                                            colorBarWidthInInches,
                                            pageHeightInInches - 2,
                                            0.99,   # left, % of fig width
                                            0.08)   # bottom, % of fig height

    # the two display modes only differ in what is written into the cells
    if displayCounts:
        cellAnnotations = True  # display the numeric values in confusionData
        cellFormat = ".0f"      # display counts without decimals
    else:
        cellAnnotations = annotations
        cellFormat = ''         # required if our annotations are strings

    heatMapPanel = sns.heatmap( confusionData,
                                xticklabels=labels,
                                yticklabels=labels,
                                ax=heatMapPanel,
                                cbar_ax=colorBarPanel,
                                cmap="Blues",
                                annot=cellAnnotations,
                                fmt=cellFormat,
                                square=True,
                                linecolor="black",  # default is "white"
                                linewidths=1.0 )

    heatMapPanel.set_title(title)
    heatMapPanel.set_ylabel('Actual')
    heatMapPanel.set_xlabel('Predicted')

    # https://drawingfromdata.com/seaborn/matplotlib/visualization/rotate-axis-labels-matplotlib-seaborn.html
    heatMapPanel.set_xticklabels(heatMapPanel.get_xticklabels(), rotation=45, horizontalalignment='right')

    imgFileName = urlify(title) + ".png"
    imgFile = f'{imgOutDir}/{imgFileName}'
    print(f"save: {imgFile}")
    plt.savefig( imgFile , bbox_inches='tight', dpi=300,  facecolor="white", transparent=True)

In [None]:
def plotConfusionMatrixHelper(imgOutDir, title, cfMatrix, labels, start, end, displayCounts=False):
    '''
    plots the square window [start:end, start:end] of a confusion matrix on
    an 8 x 8 inch page

    arguments
        imgOutDir, title
            passed through to plotConfusionMatrix()

        cfMatrix
            confusion matrix of counts

        labels
            class labels; labels[start:end] are used as tick labels

        start, end
            first (inclusive) and last (exclusive) row/column to plot

        displayCounts
            boolean: default = False
            if true the raw counts are plotted, else row percentages with
            string annotations
    '''
    window = slice(start, end)

    if displayCounts:
        plotData = cfMatrix
        windowAnnotations = None
    else:
        # confusion_matrix is TP, FP counts we want percentages
        plotData = calculateRowPercentages( cfMatrix )
        windowAnnotations = createHeatMapAnnotations( plotData )[window, window]

    plotConfusionMatrix(plotData[window, window], labels[window], windowAnnotations,
                        imgOutDir, title, 8, 8, displayCounts)

In [None]:
def calculateRowPercentages(cm):
    '''
    divides each value in a row by the row total

    The row totals are kept in float64. A float16 total can not represent
    every integer above 2048 and overflows to inf above 65504, which silently
    produces wrong fractions for rows with many samples.

    arguments:
        cm: a confusion matrix of counts

    returns
        a numpy array with same shape as cm. Each row sums to 1, except rows
        whose total is 0, those rows are returned as all zeros
    '''
    cm = np.asarray(cm)

    # keepdims gives shape (numRows, 1) so the division broadcasts across each row
    rowTotals = np.sum(cm, axis=1, keepdims=True).astype(np.float64)

    # a row without any samples: divide by 1 instead of 0, the row stays all zeros
    rowTotals[rowTotals == 0] = 1.0

    return cm / rowTotals
    
    
# def testCalculateRowPercentages(cm):
#     ret = calculateRowPercentages(cm)
#     print(ret)
    
# testCalculateRowPercentages( cfMatrix )

In [None]:
def createHeatMapAnnotations( rowPercentages ):
    '''
    builds the cell text for the confusion matrix heat map

    seaborn heatmap argument annot = True displays the numeric cell values.
    Our confusion matrix is large and has a lot of zeros, which makes it hard
    to interpret visually. Here a cell greater than zero is rendered as a
    whole number percentage, e.g. 0.25 -> '25%'. All other cells are left empty

    arguments
        rowPercentages
            2 dimensional numpy array of fractions

    returns
        a numpy array of strings with the same dimensions as rowPercentages
        use this as the value for the argument annot, and pass fmt=''
    '''
    numRows = rowPercentages.shape[0]
    numCols = rowPercentages.shape[1]

    annotationRows = []
    for rowIdx in range(numRows):
        cellText = [
            f'{round(rowPercentages[rowIdx, colIdx] * 100)}%'
            if rowPercentages[rowIdx, colIdx] > 0.0 else ""
            for colIdx in range(numCols)
        ]
        annotationRows.append( np.array(cellText, dtype=str) )

    return np.array(annotationRows, dtype=object)
    
# def testCreateHeatMapAnnotations():
#     rowPercentages = calculateRowPercentages(cfMatrix)
#     print(f'rowPercentages.shape: {rowPercentages.shape}')
#     print(rowPercentages)
#     ret = createHeatMapAnnotations( rowPercentages )
#     print(f'ret.shape: {ret.shape}')
#     print(ret)
    
# testCreateHeatMapAnnotations()

In [None]:
# Cross tabulate the true elife labels against the labels predicted by the model.
#
# yNP is encoded with labelEncoderElife while predictions are encoded with the
# GTEx_TCGA labelEncoder, so the raw integers can not be compared. Decode both
# to class names and pass an explicit label list to confusion_matrix() so the
# rows/columns of the matrix are guaranteed to line up with the tick labels.
print(f'AEDWIP yNP.shape : {yNP.shape}')
print(f'AEDWIP predictions.shape : {predictions.shape}')
trueLabelsNP = labelEncoderElife.inverse_transform(yNP)
predictedLabelsNP = labelEncoder.inverse_transform( predictions )

# elife classes first, followed by the classes the model can predict
labelList = list( np.unique(trueLabelsNP) )
labelList += [c for c in labelEncoder.classes_ if c not in labelList]
labels =  labelList

cm = confusion_matrix(trueLabelsNP, predictedLabelsNP, labels=labels)
confusionData = calculateRowPercentages( cm )
annotations = createHeatMapAnnotations( confusionData )

title = f"Random Forest {modelName} Gene Signature GTEx _TCGA Training Set"

pageWidthInInches = 8
pageHeightInInches  = 8
plotConfusionMatrix( confusionData, labels, annotations, imgOut, title, pageWidthInInches, 
                        pageHeightInInches, displayCounts=False )

# tmpDir = "./tmp"
# plotConfusionMatrixHelper(tmpDir, title, cm, labels=['c', 'd',  'e', 'f', 'g', 'a', 'b', 'z'], start=0, end=8, displayCounts=False)

In [None]:
# debug dump of the confusion matrix inputs
print(cm)

print()
print(yNP.shape)
print(yNP)

print()
# the elife encoder is unpacked as labelEncoderElife when the elife data is
# loaded; labelEncoder_Junk is not defined anywhere in this notebook
print( labelEncoderElife.classes_)

print()
print(predictions)

In [None]:
# how many samples were predicted to be whole blood
# look the class code up instead of hard coding 82, so the count stays correct
# if the label encoder's class list ever changes
wholeBloodIdx = list( labelEncoder.classes_ ).index( 'Whole_Blood' )
sum( predictions == wholeBloodIdx )

In [None]:
# predictions that are not whole blood. The class code is looked up instead of
# hard coding 82
wholeBloodIdx = list( labelEncoder.classes_ ).index( 'Whole_Blood' )
notWholeBloodNP = predictions[ predictions != wholeBloodIdx ] 

# print the name of every class that was actually predicted, instead of the
# hand picked codes 40, 75 and 82
for classIdx in np.unique( predictions ):
    print(f' {labelEncoder.classes_[classIdx]} ')
notWholeBloodNP


In [None]:
# wrap the class probabilities in a data frame so the notebook renders them as a table
# NOTE(review): the columns are unnamed (0, 1, ...); presumably column i is
# labelEncoder.classes_[i] -- confirm against model.classes_ before labeling
print()
yPDF = pd.DataFrame( yProbability )
yPDF