# Random Forest GTEx_TCGA Gene Signature 

Andrew E. Davidson  
aedavids@ucsc.edu  
7/15/24

Copyright (c) 2020-2023, Regents of the University of California All rights reserved. https://polyformproject.org/licenses/noncommercial/1.0.0

ref:  
* extraCellularRNA/deconvolutionAnalysis/jupyterNotebooks/randomForestGeneSignatureDeconvolution.ipynb
* extraCellularRNA/deconvolutionAnalysis/jupyterNotebooks/randomForestGeneSignatureDeconvolutionPOC.ipynb
* extraCellularRNA/intraExtraRNA_POC/jupyterNotebooks/elife/elifeBinaryRandomForestResults.ipynb

In [1]:
import ipynbname

# use display() to print an html version of a data frame
# useful if dataFrame output is not generated by last line of cell
from IPython.display import display
import joblib
import logging
import matplotlib.pyplot as plt 
import numpy as np
import os
import pandas as pd
# display all columns
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# from sklearn.metrics import confusion_matrix
from sklearn.ensemble      import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder

# from sklearn.metrics         import recall_score
# from sklearn.metrics         import roc_auc_score
# from sklearn.metrics         import make_scorer

import sys

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
# Resolve this notebook's name and location; the name keys both the
# output directory and the logger created at the bottom of the cell.
notebookName = ipynbname.name()
notebookPath = ipynbname.path()
notebookDir = os.path.dirname(notebookPath)

# Results are written under the shared group directory rather than
# next to the notebook (the notebook-relative alternative is kept below).
#outDir = f'{notebookDir}/{notebookName}.out'
outDir = f'/private/groups/kimlab/aedavids/deconvolution/{notebookName}.out'
os.makedirs(outDir, exist_ok=True)
print(f'outDir:\n{outDir}')

# sub directory for saved models
modelOutDir = os.path.join(outDir, "model")
os.makedirs(modelOutDir, exist_ok=True)
print(f'\nmodelOutDir ;\n{modelOutDir}')

# sub directory for saved figures
imgOut = f'{outDir}/img'
os.makedirs(imgOut, exist_ok=True)
print(f'\nimgOut :\n{imgOut}')

# "WARNING" is the documented level name ("WARN" is only a legacy alias).
#loglevel = "INFO"
loglevel = "WARNING"
# logFMT = "%(asctime)s %(levelname)s [thr:%(threadName)s %(name)s %(funcName)s() line:%(lineno)s] [%(message)s]"
# The opening '[' before %(name)s balances the ']' after the line number.
logFMT = "%(asctime)s %(levelname)s [%(name)s %(funcName)s() line:%(lineno)s] [%(message)s]"
# force=True replaces any handlers already attached to the root logger.
# Without it basicConfig() does nothing once the root logger is configured,
# so changing loglevel and re-running this cell would have no effect.
logging.basicConfig(format=logFMT, level=loglevel, force=True)
logger = logging.getLogger(notebookName)

outDir:
/private/groups/kimlab/aedavids/deconvolution/randomForestGTEx_TCGAGeneSignature.out

modelOutDir ;
/private/groups/kimlab/aedavids/deconvolution/randomForestGTEx_TCGAGeneSignature.out/model

imgOut :
/private/groups/kimlab/aedavids/deconvolution/randomForestGTEx_TCGAGeneSignature.out/img


In [3]:
# setting the python path allows us to run python scripts from using
# the CLI. 
PYTHONPATH = os.environ['PYTHONPATH']
print("ORIG_PYTHONPATH: {}\n".format(PYTHONPATH))

gitRepoRoot = !git rev-parse --show-toplevel
gitRepoRoot = gitRepoRoot[0]

#
# add deconvolutionAnalysis modules
#
deconvolutionModules = f'{gitRepoRoot}/deconvolutionAnalysis/python'
print("deconvolutionModules: {}\n".format(deconvolutionModules))

PYTHONPATH = PYTHONPATH + f':{deconvolutionModules}'
#print("PYTHONPATH: {}\n".format(PYTHONPATH))

sys.path.append( str(deconvolutionModules) )
#print("\nsys.path:\n{}\n".format(sys.path))

#
# add intraExtraRNA_POC modules
#
intraExtraRNA_POCModules = f'{gitRepoRoot}/intraExtraRNA_POC/python/src'
print("intraExtraRNA_POCModules: {}\n".format(intraExtraRNA_POCModules))

PYTHONPATH = PYTHONPATH + f':{intraExtraRNA_POCModules}'
#print("PYTHONPATH: {}\n".format(PYTHONPATH))

sys.path.append( str(intraExtraRNA_POCModules) )
#print("\nsys.path:\n{}\n".format(sys.path))

ORIG_PYTHONPATH: :/private/home/aedavids/extraCellularRNA/src

deconvolutionModules: /private/home/aedavids/extraCellularRNA/deconvolutionAnalysis/python

intraExtraRNA_POCModules: /private/home/aedavids/extraCellularRNA/intraExtraRNA_POC/python/src



In [4]:
# local imports
from intraExtraRNA.elifeUtilities import loadElifeTrainingData
from intraExtraRNA.elifeUtilities import validElifeCategories
from models.mlUtilities import encoder2Dict
from models.mlUtilities import loadEncoder

## Load Gene Expression Data

In [5]:
# Read the training-set sample metadata and collect the distinct values
# of its "category" column (the GTEx and TCGA types and classes).
colDataPath = "/private/groups/kimlab/GTEx_TCGA/groupbyGeneTrainingSets/GTEx_TCGA_TrainColData.csv"
colDataDF = pd.read_csv(colDataPath)
print(f'colDataDF.shape : {colDataDF.shape}')

categories = colDataDF["category"].unique()
print(f'len(categories) : {len(categories)}')

# show only the first few categories
categories[:5]

colDataDF.shape : (15801, 6)
len(categories) : 83


array(['Adipose_Subcutaneous', 'Artery_Tibial', 'Heart_Atrial_Appendage',
       'Breast_Mammary_Tissue', 'Brain_Cortex'], dtype=object)

In [6]:
%%time
pipelineStageName = "best10CuratedDegree1_ce467ff"
t = loadElifeTrainingData(pipelineStageName,
                             categories,
                             validElifeCategories,
                             )

HUGO_Genes, elifeGenes, missingGenes, countDF, metaDF, XNP, yNP, labelEncoder_Junk, mapDF = t

print(f'missingGenes:\n{missingGenes}')



KeyError: "['ENSG00000263264.2', 'ENSG00000288380.1', 'ENSG00000274031.1'] not in index"

## Load model

In [7]:
# Location of the previously saved model: <modelRootDir>/<modelName>.joblib
modelRootDir = "/private/groups/kimlab/aedavids/deconvolution/randomForestGeneSignatureDeconvolution.out/model"
modelName = "best10CuratedDegree1_ce467ff"
modelPath = f"{modelRootDir}/{modelName}.joblib"

print(f'loading model: {modelPath}')
model = joblib.load(modelPath)
print(f'modelName : {modelName}')

# last expression: render the loaded estimator
model

loading model: /private/groups/kimlab/aedavids/deconvolution/randomForestGeneSignatureDeconvolution.out/model/best10CuratedDegree1_ce467ff.joblib
modelName : best10CuratedDegree1_ce467ff


In [8]:
# The label encoder file sits next to the model file and shares its base name.
encoderPath = f'{modelRootDir}/{modelName}.labelEncoder.txt'
labelEncoder = loadEncoder(encoderPath)

# print the encoder as a dictionary
encoderMapping = encoder2Dict(labelEncoder)
print(f'labelEncoder: \n{encoderMapping}')

# number of classes known to the encoder
classes = labelEncoder.classes_
numClasses = len(classes)
print(f'labelEncoder len(classes) :\n{numClasses}')

labelEncoder: 
{'ACC': 0, 'Adipose_Subcutaneous': 1, 'Adipose_Visceral_Omentum': 2, 'Adrenal_Gland': 3, 'Artery_Aorta': 4, 'Artery_Coronary': 5, 'Artery_Tibial': 6, 'BLCA': 7, 'BRCA': 8, 'Bladder': 9, 'Brain_Amygdala': 10, 'Brain_Anterior_cingulate_cortex_BA24': 11, 'Brain_Caudate_basal_ganglia': 12, 'Brain_Cerebellar_Hemisphere': 13, 'Brain_Cerebellum': 14, 'Brain_Cortex': 15, 'Brain_Frontal_Cortex_BA9': 16, 'Brain_Hippocampus': 17, 'Brain_Hypothalamus': 18, 'Brain_Nucleus_accumbens_basal_ganglia': 19, 'Brain_Putamen_basal_ganglia': 20, 'Brain_Spinal_cord_cervical_c-1': 21, 'Brain_Substantia_nigra': 22, 'Breast_Mammary_Tissue': 23, 'CESC': 24, 'CHOL': 25, 'COAD': 26, 'Cells_Cultured_fibroblasts': 27, 'Cells_EBV-transformed_lymphocytes': 28, 'Cervix_Endocervix': 29, 'Colon_Sigmoid': 30, 'Colon_Transverse': 31, 'DLBC': 32, 'ESCA': 33, 'Esophagus_Gastroesophageal_Junction': 34, 'Esophagus_Mucosa': 35, 'Esophagus_Muscularis': 36, 'GBM': 37, 'HNSC': 38, 'Heart_Atrial_Appendage': 39, 'Heart

## Make Predictions

In [9]:
%%time
# Run the loaded model on the feature matrix XNP and print the results.
# NOTE(review): XNP is only assigned when the training-data cell unpacks
# the result of loadElifeTrainingData(); the recorded run of this cell
# shows "NameError: name 'XNP' is not defined".

# predicted label for each row of XNP
predictions  = model.predict(XNP)
print(f'\npredictions:\n{predictions}')

# predicted class probabilities for each row of XNP
yProbability = model.predict_proba(XNP)
print(f'\nyProbability:\n{yProbability}')

NameError: name 'XNP' is not defined

In [10]:
# Display mapDF, the last element of the tuple returned by loadElifeTrainingData().
# NOTE(review): the recorded run shows "NameError: name 'mapDF' is not defined";
# mapDF is only assigned by the training-data unpack.
mapDF

NameError: name 'mapDF' is not defined