# Plasma GAN
Andrew E. Davidson  
aedavids@ucsc.edu  
8/29/24  

Copyright (c) 2020-2023, Regents of the University of California All rights reserved. https://polyformproject.org/licenses/noncommercial/1.0.0  

ref:
- boiler plate and gene expression data load
    * intraExtraRNA_POC/jupyterNotebooks/elife/randomForestGTEx_TCGAGeneSignature.ipynb
      + aedwip what did this do
    * deconvolutionAnalysis/jupyterNotebooks/randomForestGeneSignatureDeconvolution.ipynb
      + trained a random forest model on the GTEx_TCGA training data set. It has perfect TP

## TODO:


The random forest trained on GTEx_TCGA used 712 genes.

So we have a list of 712 HUGO genes; we need to map these to the elife data set.

```
elifeutilities.py

206 def loadElifeTrainingData
    123 def loadCounts
    236 loadMetaData

    assume we have a list of hugo genes
    282 selectFeatures
    284 fixBest10CuratedDegree1_ce467ff
    
```

In [1]:
import ipynbname

# use display() to print an html version of a data frame
# useful if the dataFrame output is not generated by the last line of a cell
from IPython.display import display
import joblib
import logging
import matplotlib.pyplot as plt 
import numpy as np
import os
import pandas as pd
# display all columns
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# from sklearn.metrics import confusion_matrix
from sklearn.ensemble      import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder

# from sklearn.metrics         import recall_score
# from sklearn.metrics         import roc_auc_score
# from sklearn.metrics         import make_scorer

import sys

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
# Set up this notebook's output directories and logging.
# All names assigned here are notebook globals and may be used by later cells.

# ipynbname supplies the running notebook's name and path
# (per the recorded output, notebookName is 'plasmaGAN')
notebookName = ipynbname.name()
notebookPath = ipynbname.path()
notebookDir = os.path.dirname(notebookPath)

# output root lives next to the notebook: <notebookDir>/<notebookName>.out
outDir = f'{notebookDir}/{notebookName}.out'
#outDir = f'/private/groups/kimlab/aedavids/deconvolution/{notebookName}.out'
os.makedirs(outDir, exist_ok=True)
print(f'outDir:\n{outDir}')

# sub directory for saved models
modelOutDir = os.path.join(outDir, "model")
os.makedirs(modelOutDir, exist_ok=True)
print(f'\nmodelOutDir ;\n{modelOutDir}')

# sub directory for images
imgOut = f'{outDir}/img'
os.makedirs(imgOut, exist_ok=True)
print(f'\nimgOut :\n{imgOut}')

# NOTE: unlike the directories above, localCacheDir is only printed here, it is
# not created in this cell. It is passed to loadCountData() in a later cell.
localCacheDir = "/scratch/aedavids/" + notebookName + "/cache"
print(f'\nlocalCacheDir :\n{localCacheDir}')


# logging configuration; switch loglevel to "WARN" for quieter output
loglevel = "INFO"
#loglevel = "WARN"
# logFMT = "%(asctime)s %(levelname)s [thr:%(threadName)s %(name)s %(funcName)s() line:%(lineno)s] [%(message)s]"
# NOTE(review): the ']' after the line number has no matching '[' in the active
# format; the '[' was part of the thread-name variant commented out above.
logFMT = "%(asctime)s %(levelname)s %(name)s %(funcName)s() line:%(lineno)s] [%(message)s]"
logging.basicConfig(format=logFMT, level=loglevel)    
logger = logging.getLogger(notebookName)

# not used in this cell; presumably identifies the feature-selection pipeline
# stage for later cells -- TODO confirm
pipelineStageName = "best10CuratedDegree1_ce467ff"

outDir:
/private/home/aedavids/extraCellularRNA/intraExtraRNA_POC/jupyterNotebooks/elife/gan/plasmaGAN.out

modelOutDir ;
/private/home/aedavids/extraCellularRNA/intraExtraRNA_POC/jupyterNotebooks/elife/gan/plasmaGAN.out/model

imgOut :
/private/home/aedavids/extraCellularRNA/intraExtraRNA_POC/jupyterNotebooks/elife/gan/plasmaGAN.out/img

localCacheDir :
/scratch/aedavids/plasmaGAN/cache


In [3]:
# Make the project's python packages importable from this notebook by
# appending their source directories to sys.path.
#
# NOTE(review): PYTHONPATH below is a local string. It is extended in this cell
# but never written back to os.environ here, so on its own it does not change
# what scripts launched from the CLI see -- confirm whether a later cell exports it.
# NOTE(review): os.environ['PYTHONPATH'] raises KeyError if the variable is unset.
PYTHONPATH = os.environ['PYTHONPATH']
print("ORIG_PYTHONPATH: {}\n".format(PYTHONPATH))

# IPython shell capture: run git and keep its output lines;
# the first line is the repository root
gitRepoRoot = !git rev-parse --show-toplevel
gitRepoRoot = gitRepoRoot[0]

#
# add deconvolutionAnalysis modules
#
deconvolutionModules = f'{gitRepoRoot}/deconvolutionAnalysis/python'
print("deconvolutionModules: {}\n".format(deconvolutionModules))

PYTHONPATH = PYTHONPATH + f':{deconvolutionModules}'
#print("PYTHONPATH: {}\n".format(PYTHONPATH))

# this is what affects imports inside the running kernel
sys.path.append( str(deconvolutionModules) )
#print("\nsys.path:\n{}\n".format(sys.path))

#
# add intraExtraRNA_POC modules
#
intraExtraRNA_POCModules = f'{gitRepoRoot}/intraExtraRNA_POC/python/src'
print("intraExtraRNA_POCModules: {}\n".format(intraExtraRNA_POCModules))

PYTHONPATH = PYTHONPATH + f':{intraExtraRNA_POCModules}'
#print("PYTHONPATH: {}\n".format(PYTHONPATH))

# this is what affects imports inside the running kernel
sys.path.append( str(intraExtraRNA_POCModules) )
#print("\nsys.path:\n{}\n".format(sys.path))

ORIG_PYTHONPATH: :/private/home/aedavids/extraCellularRNA/src

deconvolutionModules: /private/home/aedavids/extraCellularRNA/deconvolutionAnalysis/python

intraExtraRNA_POCModules: /private/home/aedavids/extraCellularRNA/intraExtraRNA_POC/python/src



In [4]:
# local imports
from analysis.utilities import loadList
from intraExtraRNA.plasmaGAN.plasmaGAN import loadCountData

## Find features used to train randomForestGeneSignatureDeconvolution model
This model was trained on the GTEx_TCGA training data set and has perfect TP!

ref: deconvolutionAnalysis/jupyterNotebooks/randomForestGeneSignatureDeconvolution

model saved to /private/groups/kimlab/aedavids/deconvolution/randomForestGeneSignatureDeconvolution.out

best10CuratedDegree1_ce467ff_elife.orderedFeatures.txt has the list of features. The "elife" token in the file name is a little confusing. The CIBERSORTx signature matrix for best10CuratedDegree1_ce467ff has a couple of genes that we could not map to the elife data set. This list of features is the elife subset of features.

In [5]:
def getListOfFeatures() :
    '''
    Load the ordered feature names saved alongside the
    randomForestGeneSignatureDeconvolution model.

    The path pieces are kept local to this function so they do not
    pollute the notebook's global namespace.

    returns:
        whatever loadList() returns for the features file
    '''
    modelDir = "/private/groups/kimlab/aedavids/deconvolution/randomForestGeneSignatureDeconvolution.out/model"
    featuresFile = "best10CuratedDegree1_ce467ff_elife.orderedFeatures.txt"
    featuresPath = f'{modelDir}/{featuresFile}'
    print(f'loading features:\n{featuresPath}' )

    return loadList( featuresPath )

# Load the feature names once; HUGO_featuresNames is a notebook global that is
# passed to loadCountData() in a later cell.
HUGO_featuresNames = getListOfFeatures()

# sanity check: report how many names were loaded and peek at both ends of the list
# NOTE(review): the recorded output shows entries such as '(GGTG)n', so not every
# name looks like a HUGO gene symbol -- confirm the naming is intended.
print(f'\nnumber of HUGO_featuresNames : {len(HUGO_featuresNames)}')
print( f'first 3: {HUGO_featuresNames[:3]} last 3: {HUGO_featuresNames[-3:]}' )

loading features:
/private/groups/kimlab/aedavids/deconvolution/randomForestGeneSignatureDeconvolution.out/model/best10CuratedDegree1_ce467ff_elife.orderedFeatures.txt

number of HUGO_featuresNames : 712
first 3: ['(GGTG)n', '(GT)n', '(TA)n'] last 3: ['ZRANB1', 'ZSWIM4', 'ZYG11B']


# Load Gene Expression Data

In [6]:
%%time
# from intraExtraRNA.plasmaGAN.plasmaGAN import loadCountData

# localCacheDir = "/scratch/aedavids/" + notebookName + "/cache"
XDF, metaDF, elifeLungGenes, missingElifeGenes, mapDF = loadCountData( localCacheDir, HUGO_featuresNames )

print( f'XDF.shape : {XDF.shape}' )
XDF.iloc[0:5, 0:5]

assert len(missingElifeGenes) == 0,"ERROR"

2024-10-10 10:34:55,233 INFO intraExtraRNA.plasmaGAN.plasmaGAN loadCountData() line:55] [BEGIN]
2024-10-10 10:34:55,234 INFO intraExtraRNA.plasmaGAN.plasmaGAN loadCountData() line:69] [loading from /scratch/aedavids/plasmaGAN/cache]
2024-10-10 10:34:55,296 INFO intraExtraRNA.plasmaGAN.plasmaGAN loadCountData() line:98] [END]


XDF.shape : (224, 713)
CPU times: user 46.7 ms, sys: 20.1 ms, total: 66.9 ms
Wall time: 65.6 ms
