# Random Forest Gene Signature Deconvolution POC
Andrew E. Davidson  
aedavids@ucsc.edu  
6/15/24

Copyright (c) 2020-2023, Regents of the University of California All rights reserved. https://polyformproject.org/licenses/noncommercial/1.0.0



In [1]:
import ipynbname

# use display() to print an html version of a data frame
# useful if the DataFrame output is not generated by the last line of the cell
from IPython.display import display

import matplotlib.pyplot as plt 
import numpy as np
import os
import pandas as pd
# Lift pandas' display truncation: show every column and every row
pd.options.display.max_columns = None
pd.options.display.max_rows = None

from sklearn.preprocessing import LabelEncoder

import sys

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
# setting the python path allows us to run python scripts from using
# the CLI. 
PYTHONPATH = os.environ['PYTHONPATH']
print("ORIG_PYTHONPATH: {}\n".format(PYTHONPATH))

gitRepoRoot = !git rev-parse --show-toplevel
gitRepoRoot = gitRepoRoot[0]

#
# add deconvolutionAnalysis modules
#
deconvolutionModules = f'{gitRepoRoot}/deconvolutionAnalysis/python'
print("deconvolutionModules: {}\n".format(deconvolutionModules))

PYTHONPATH = PYTHONPATH + f':{deconvolutionModules}'
#print("PYTHONPATH: {}\n".format(PYTHONPATH))

sys.path.append( str(deconvolutionModules) )
#print("\nsys.path:\n{}\n".format(sys.path))

#
# add intraExtraRNA_POC modules
#
intraExtraRNA_POCModules = f'{gitRepoRoot}/intraExtraRNA_POC/python/src'
print("intraExtraRNA_POCModules: {}\n".format(intraExtraRNA_POCModules))

PYTHONPATH = PYTHONPATH + f':{intraExtraRNA_POCModules}'
#print("PYTHONPATH: {}\n".format(PYTHONPATH))

sys.path.append( str(intraExtraRNA_POCModules) )
#print("\nsys.path:\n{}\n".format(sys.path))

ORIG_PYTHONPATH: :/private/home/aedavids/extraCellularRNA/src

deconvolutionModules: /private/home/aedavids/extraCellularRNA/deconvolutionAnalysis/python

intraExtraRNA_POCModules: /private/home/aedavids/extraCellularRNA/intraExtraRNA_POC/python/src



In [3]:
# Locations of the training run whose outputs this notebook reads.
rootDir = "/private/groups/kimlab/aedavids/deconvolution/1vsAll-~gender_category"
runName = "best10CuratedDegree1_ce467ff"
# NOTE(review): `weird` looks like runName without its trailing "_ce467ff"
# suffix -- confirm before deriving one from the other.
weird = "best10CuratedDegree1"

# <rootDir>/<runName>/training/<weird>.sh.out
runOutDir = "/".join((rootDir, runName, "training", weird + ".sh.out"))

# sub directory of runOutDir, relative path
ciberSortInput = "GTEx_TCGA-design-tilda_gender_category-padj-0001-lfc-20-n-10/ciberSortInput"

## Create expected values
i.e. one hot encoding

In [4]:
expectedFractionsPath = f'{runOutDir}/{ciberSortInput}/expectedFractions.txt'
! ls $expectedFractionsPath

/private/groups/kimlab/aedavids/deconvolution/1vsAll-~gender_category/best10CuratedDegree1_ce467ff/training/best10CuratedDegree1.sh.out/GTEx_TCGA-design-tilda_gender_category-padj-0001-lfc-20-n-10/ciberSortInput/expectedFractions.txt


In [5]:
%%time
expectedFractionsDF = pd.read_csv(expectedFractionsPath, sep="\t",  index_col="sample_id")
print(f'expectedFractionsDF.shape : {expectedFractionsDF.shape}')
expectedFractionsDF.iloc[0:5, 0:9]

expectedFractionsDF.shape : (15801, 88)
CPU times: user 131 ms, sys: 60 ms, total: 191 ms
Wall time: 202 ms


Unnamed: 0_level_0,participant_id,category,gender,age,dataSet,ACC,Adipose_Subcutaneous,Adipose_Visceral_Omentum,Adrenal_Gland
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
GTEX-1117F-0226-SM-5GZZ7,GTEX-1117F,Adipose_Subcutaneous,Female,66.0,GTEx,0.0,1.0,0.0,0.0
GTEX-1117F-0526-SM-5EGHJ,GTEX-1117F,Artery_Tibial,Female,66.0,GTEx,0.0,0.0,0.0,0.0
GTEX-1117F-0726-SM-5GIEN,GTEX-1117F,Heart_Atrial_Appendage,Female,66.0,GTEx,0.0,0.0,0.0,0.0
GTEX-1117F-2826-SM-5GZXL,GTEX-1117F,Breast_Mammary_Tissue,Female,66.0,GTEx,0.0,0.0,0.0,0.0
GTEX-1117F-3226-SM-5N9CT,GTEX-1117F,Brain_Cortex,Female,66.0,GTEx,0.0,0.0,0.0,0.0


In [6]:
# The 'category' column holds the label of each sample.
YSeries = expectedFractionsDF['category']

# preview the first 5 labels
YSeries.iloc[:5]

sample_id
GTEX-1117F-0226-SM-5GZZ7      Adipose_Subcutaneous
GTEX-1117F-0526-SM-5EGHJ             Artery_Tibial
GTEX-1117F-0726-SM-5GIEN    Heart_Atrial_Appendage
GTEX-1117F-2826-SM-5GZXL     Breast_Mammary_Tissue
GTEX-1117F-3226-SM-5N9CT              Brain_Cortex
Name: category, dtype: object

In [7]:
# Fit an encoder on the category labels; classes_ holds the distinct labels.
labelEncoder = LabelEncoder().fit(YSeries)
labelList = [*labelEncoder.classes_]

# number of distinct labels, then a preview of the first 5
print(len(labelList))
labelList[:5]

83


['ACC',
 'Adipose_Subcutaneous',
 'Adipose_Visceral_Omentum',
 'Adrenal_Gland',
 'Artery_Aorta']

## Load normalized counts
convert to Random forest counts

In [8]:
mixturePath = f'{runOutDir}/{ciberSortInput}/mixture.txt'
! ls $mixturePath

/private/groups/kimlab/aedavids/deconvolution/1vsAll-~gender_category/best10CuratedDegree1_ce467ff/training/best10CuratedDegree1.sh.out/GTEx_TCGA-design-tilda_gender_category-padj-0001-lfc-20-n-10/ciberSortInput/mixture.txt


In [9]:
%%time
mixtureDF = pd.read_csv(mixturePath, sep="\t", index_col="sampleTitle")
print(f'mixtureDF.shape : {mixtureDF.shape}')
mixtureDF.iloc[0:5, 0:5]

mixtureDF.shape : (716, 15801)
CPU times: user 3.79 s, sys: 388 ms, total: 4.18 s
Wall time: 4.2 s


Unnamed: 0_level_0,GTEX-1117F-0226-SM-5GZZ7,GTEX-1117F-0526-SM-5EGHJ,GTEX-1117F-0726-SM-5GIEN,GTEX-1117F-2826-SM-5GZXL,GTEX-1117F-3226-SM-5N9CT
sampleTitle,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
(GGTG)n,19.001808,26.644422,12.776613,72.892107,26.525577
(GT)n,833.601046,664.744176,401.398583,918.272979,427.061784
(TA)n,29.74196,14.346997,13.84133,25.135209,5.968255
(TCCAC)n,0.0,1.366381,0.0,0.83784,0.0
(TCTATG)n,0.0,0.0,0.0,0.0,0.663139


In [10]:
# Transpose the mixture table: its columns become the rows of XDF.
XDF = mixtureDF.T
print(f'XDF.shape : {XDF.shape}')

# preview: first 5 rows, first 5 columns
XDF.iloc[:5, :5]

XDF.shape : (15801, 716)


sampleTitle,(GGTG)n,(GT)n,(TA)n,(TCCAC)n,(TCTATG)n
GTEX-1117F-0226-SM-5GZZ7,19.001808,833.601046,29.74196,0.0,0.0
GTEX-1117F-0526-SM-5EGHJ,26.644422,664.744176,14.346997,1.366381,0.0
GTEX-1117F-0726-SM-5GIEN,12.776613,401.398583,13.84133,0.0,0.0
GTEX-1117F-2826-SM-5GZXL,72.892107,918.272979,25.135209,0.83784,0.0
GTEX-1117F-3226-SM-5N9CT,26.525577,427.061784,5.968255,0.0,0.663139


In [11]:
# Add the category column to counts by joining the two frames on their indexes.
labeledXDF = pd.merge(
    XDF,
    expectedFractionsDF.loc[:, ['category']],
    how='inner',
    left_index=True,
    right_index=True,
)

# An inner join silently drops rows whose index value is missing from either
# frame. Fail loudly if the merged rows no longer line up one-to-one, in the
# same order, with the rows of XDF.
assert labeledXDF.index.equals(XDF.index), \
    "merge dropped, duplicated or reordered samples: labeledXDF is not row-aligned with XDF"

print(labeledXDF.shape)

# preview: first 5 rows, last 5 columns (includes the new category column)
display(labeledXDF.iloc[0:5, -5:])

(15801, 717)


Unnamed: 0,ZNRF1,ZRANB1,ZSWIM4,ZYG11B,category
GTEX-1117F-0226-SM-5GZZ7,712.154709,1738.252329,247.849667,785.683443,Adipose_Subcutaneous
GTEX-1117F-0526-SM-5EGHJ,651.76356,1028.201423,151.66825,678.407982,Artery_Tibial
GTEX-1117F-0726-SM-5GIEN,234.2379,421.62822,98.48639,466.346365,Heart_Atrial_Appendage
GTEX-1117F-2826-SM-5GZXL,918.272979,967.705557,318.379317,811.029419,Breast_Mammary_Tissue
GTEX-1117F-3226-SM-5N9CT,590.85722,759.294632,94.828937,578.257571,Brain_Cortex


## Convert to random forest format

In [14]:
# Extract the values of XDF as a plain numpy array.
XNP = XDF.to_numpy()
print(f'XNP.shape : {XNP.shape}')
print(type(XNP))

# preview: first 5 rows, first 5 columns
print(XNP[:5, :5])

XNP.shape : (15801, 716)
<class 'numpy.ndarray'>
[[1.90018078e+01 8.33601046e+02 2.97419600e+01 0.00000000e+00
  0.00000000e+00]
 [2.66444223e+01 6.64744176e+02 1.43469966e+01 1.36638063e+00
  0.00000000e+00]
 [1.27766127e+01 4.01398583e+02 1.38413305e+01 0.00000000e+00
  0.00000000e+00]
 [7.28921069e+01 9.18272979e+02 2.51352093e+01 8.37840309e-01
  0.00000000e+00]
 [2.65255767e+01 4.27061784e+02 5.96825475e+00 0.00000000e+00
  6.63139417e-01]]


In [29]:
# Encode the category label of each row of labeledXDF with the fitted labelEncoder.
yNP = labelEncoder.transform( labeledXDF.loc[:, 'category'] )
print(f'yNP.shape : {yNP.shape}')

# yNP is 1-D (see the shape printed above), so the preview label must not
# advertise a second axis.
print(f'yNP[0:5] \n{yNP[0:5]}')

# round trip: decode the integer codes back into their category labels
print(f'\nlabelEncoder.inverse_transform( yNP ) :\n {labelEncoder.inverse_transform( yNP )}')

yNP.shape : (15801,)
yNP[0:5, :] 
[ 1  6 39 23 15]

labelEncoder.inverse_transform( yNP ) :
 ['Adipose_Subcutaneous' 'Artery_Tibial' 'Heart_Atrial_Appendage' ... 'UVM'
 'UVM' 'UVM']


In [21]:
# Show the raw category labels of labeledXDF as a numpy array.
labeledXDF['category'].to_numpy()

array(['Adipose_Subcutaneous', 'Artery_Tibial', 'Heart_Atrial_Appendage',
       ..., 'UVM', 'UVM', 'UVM'], dtype=object)