# Create Auto Encoder data Set
Figure out how to create a data set we can use to train an auto encoder.
Input: RNA seq data. Output: gene essentiality data.

## Explore RNAseq_lRPKM_data.csv

In [1]:
# from DEMETER2.dataFactory import DataFactory

from   DEMETER2.lowRankMatrixFactorizationEasyOfUse \
            import LowRankMatrixFactorizationEasyOfUse as LrmfEoU

import logging
from   setupLogging import setupLogging
# Configure logging from the test configuration file, then log the value
# that setupLogging returned.
configFilePath = setupLogging(default_path='src/test/logging.test.ini.json')
logger = logging.getLogger("notebook")
startupMessage = "using logging configuration file:{}".format(configFilePath)
logger.info(startupMessage)

import os

import numpy as np
import pandas as pd

# Input files and the output directory, all located under the data root.
dataRootDir = "data/"
dataFileName = "D2_Achilles_gene_dep_scores.tsv"
rnaSeqDataFile = dataRootDir + "RNAseq_lRPKM_data.csv"
geneDepDataFile = dataRootDir + dataFileName
autoEncoderDataDir = dataRootDir + "autoEncoder/"

# Load the best low rank matrix model identified in evaluateRandomHoldOut.ipynb.
# These are the genes and cell lines we want to predict.
numFeatures = 19
geneFilterPercent = 0.25
holdOutPercent = 0.40
easyOfUse = LrmfEoU(
    dataRootDir,
    dataFileName,
    numFeatures,
    geneFilterPercent,
    holdOutPercent,
)
resultsDict = easyOfUse.loadAll()

[INFO <ipython-input-1-4e35873a7bd7>:10 - <module>()] using logging configuration file:src/test/logging.test.ini.json


In [2]:
# clean tidy version of demeter data
# Clean, tidy version of the DEMETER2 data, returned together with its
# cell line and gene labels.
demeterTuple = resultsDict["DEMETER2"]
Y, R, geneDepCellLines, geneDepNames = demeterTuple
geneDependencies = Y
print("geneDependencies.shape:{}".format(geneDependencies.shape))

# Trained model (optimizeResult is a scipy.optimize.OptimizeResult).
modelTuple = resultsDict["LowRankMatrixFactorizationModel"]
X, Theta, optimizeResult = modelTuple
genes, cellLines = X, Theta
print("genes.shape:{}".format(genes.shape))
print("cellLines.shape:{}".format(cellLines.shape))

# Knockout logical filters. Use to select Y Train, Validation, and Test values.
RTrain, RValidation, RTest = resultsDict["filters"]

geneDependencies.shape:(11193, 501)
genes.shape:(11193, 19)
cellLines.shape:(501, 19)


In [3]:
# Read every field of the RNA seq CSV as text: the first row and the first
# column hold labels, so the whole table is loaded as strings.
rawRnaSeqData = np.loadtxt(
    rnaSeqDataFile,
    dtype=str,
    delimiter=",",
)

In [4]:
# Peek at the raw table: overall shape, start of the header row, start of
# the first column, and the top-left corner.
headerPreview = rawRnaSeqData[0, 0:5]
firstColumnPreview = rawRnaSeqData[0:5, 0]
cornerPreview = rawRnaSeqData[0:3, 0:3]

print("rawRnaSeqData.shape:{}".format(rawRnaSeqData.shape))
print("rawRnaSeqData[0,0:5]:\n{}".format(headerPreview))
print("\nrawRnaSeqData[0:5,0]:\n{}".format(firstColumnPreview))

print("\nrawRnaSeqData[0:3,0:3]:\n{}".format(cornerPreview))

rawRnaSeqData.shape:(19185, 646)
rawRnaSeqData[0,0:5]:
['""' '"143B_BONE"' '"22RV1_PROSTATE"' '"2313287_STOMACH"'
 '"697_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE"']

rawRnaSeqData[0:5,0]:
['""' '"MAP4K5 (11183)"' '"PAPD7 (11044)"' '"ZC3H11A (9877)"'
 '"MYH14 (79784)"']

rawRnaSeqData[0:3,0:3]:
[['""' '"143B_BONE"' '"22RV1_PROSTATE"']
 ['"MAP4K5 (11183)"' '1.4432662739991' '1.2328470024234']
 ['"PAPD7 (11044)"' '1.26557693849693' '1.22391368028673']]


In [5]:
# Split the raw table into column labels (first row), row labels (first
# column) and the remaining block of values. The top-left corner cell is
# skipped in all three slices.
headerRow = rawRnaSeqData[0]
labelColumn = rawRnaSeqData[:, 0]

rnaSeqCellLines = headerRow[1:]
print("rnaSeqCellLines.shape:{}".format(rnaSeqCellLines.shape))
print("rnaSeqCellLines[0:5]\n{}".format(rnaSeqCellLines[0:5]))

rnaSeqGeneNames = labelColumn[1:]
print("\n rnaSeqGeneNames.shape:{}".format(rnaSeqGeneNames.shape))
print("rnaSeqGeneNames[0:5]\n{}".format(rnaSeqGeneNames[0:5]))

# The values are still strings at this point (the table was loaded as text).
rnaSeqData = rawRnaSeqData[1:, 1:]
print("\n rnaSeqData.shape:{}".format(rnaSeqData.shape))
print("rnaSeqData[0:5]\n{}".format(rnaSeqData[0:5]))

rnaSeqCellLines.shape:(645,)
rnaSeqCellLines[0:5]
['"143B_BONE"' '"22RV1_PROSTATE"' '"2313287_STOMACH"'
 '"697_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE"' '"769P_KIDNEY"']

 rnaSeqGeneNames.shape:(19184,)
rnaSeqGeneNames[0:5]
['"MAP4K5 (11183)"' '"PAPD7 (11044)"' '"ZC3H11A (9877)"' '"MYH14 (79784)"'
 '"NUP88 (4927)"']

 rnaSeqData.shape:(19184, 645)
rnaSeqData[0:5]
[['1.4432662739991' '1.2328470024234' '1.20384248842145' ...
  '1.20172668065854' '1.18751226047194' '0.938953802655811']
 ['1.26557693849693' '1.22391368028673' '1.2689538389447' ...
  '1.14762439126215' '1.09657878061321' '1.28856689126963']
 ['1.45496476968394' '1.70206442013166' '1.64423971388072' ...
  '1.55541708571845' '1.91717301837013' '1.83327444802754']
 ['-1.32175484807296' '1.25319693365058' '1.36527284352511' ...
  '-1.19592875111434' '1.03735477366346' '1.41274539149957']
 ['1.69737425782221' '1.51889305820117' '1.38944801835085' ...
  '1.42935380562316' '1.11148599007619' '0.850009334029727']]


## Explore D2_Achilles_gene_dep_scores.tsv

In [6]:
# numFeatures = 19
# holdOutPercent = 0.40 
# filterPercent = 0.25 
# suffix = "n_{}_geneFilterPercent_{}_holdOutPercent_{}".format(numFeatures, filterPercent, holdOutPercent)

In [7]:
# def createDataPath(dataRootDir, numFeatures, filterPercent, holdOutPercent):
#     #fmt = "numFeatures_{}_n_{}_geneFilterPercent_{}_holdOutPercent_{}"
#     fmt = "n_{}_geneFilterPercent_{}_holdOutPercent_{}"
#     ret = dataRootDir + fmt.format(numFeatures, filterPercent, holdOutPercent)
#     return ret
    
# dataPath = createDataPath(dataRootDir, numFeatures, filterPercent, holdOutPercent)
# print(dataPath)

In [8]:
# # quick hack dataFactory.loadAll() assume trained model files are in the same
# # directory as the original tsv file
# ! cp $dataPath/*.csv $dataRootDir

In [9]:
# dataFactory = DataFactory(numFeatures)
# RTrain, RValidation, RTest, X, Theta, Y, geneDepCellLines, geneDepNames = \
#     dataFactory.loadAll(geneDepDataFile, suffix)

In [10]:
# Preview the DEMETER2 cell line and gene labels unpacked from
# resultsDict["DEMETER2"].
# The RNA seq label arrays (rnaSeqCellLines, rnaSeqGeneNames) are not touched
# here: they are already assigned where the raw RNA seq table is split, and
# this cell only reports on the gene dependency labels.
print("geneDepCellLines.shape:{}".format(geneDepCellLines.shape))
print("geneDepCellLines[0:5]\n{}".format(geneDepCellLines[0:5]))

print("\ngeneDepNames.shape:{}".format(geneDepNames.shape))
print("geneDepNames[0:5]\n{}".format(geneDepNames[0:5]))

geneDepCellLines.shape:(501,)
geneDepCellLines[0:5]
['"143B_BONE"' '"22RV1_PROSTATE"' '"2313287_STOMACH"'
 '"697_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE"' '"769P_KIDNEY"']

geneDepNames.shape:(11193,)
geneDepNames[0:5]
['"A1BG (1)"' '"NAT2 (10)"' '"ADA (100)"' '"CDH2 (1000)"' '"AKT3 (10000)"']


## Select common cell lines and genes

In [11]:
# Cell lines present in both data sets. np.intersect1d returns the sorted,
# unique values found in both inputs.
sortedCommonCellLines = np.intersect1d(rnaSeqCellLines, geneDepCellLines)

firstCommonCellLines = sortedCommonCellLines[0:5]
lastCommonCellLines = sortedCommonCellLines[-5:]
print("sortedCommonCellLines.shape:{}".format(sortedCommonCellLines.shape))
print("sortedCommonCellLines[0:5]:\n{}".format(firstCommonCellLines))
print("\nsortedCommonCellLines[-5:]:\n{}".format(lastCommonCellLines))

sortedCommonCellLines.shape:(486,)
sortedCommonCellLines[0:5]:
['"143B_BONE"' '"22RV1_PROSTATE"' '"2313287_STOMACH"'
 '"697_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE"' '"769P_KIDNEY"']

sortedCommonCellLines[-5:]:
['"YD38_UPPER_AERODIGESTIVE_TRACT"' '"YD8_UPPER_AERODIGESTIVE_TRACT"'
 '"YKG1_CENTRAL_NERVOUS_SYSTEM"' '"ZR751_BREAST"' '"ZR7530_BREAST"']


In [12]:
# Genes present in both data sets. np.intersect1d returns the sorted,
# unique values found in both inputs.
sortedCommonGeneDepNames = np.intersect1d(rnaSeqGeneNames, geneDepNames)

firstCommonGenes = sortedCommonGeneDepNames[0:5]
lastCommonGenes = sortedCommonGeneDepNames[-5:]
print("sortedCommonGeneDepNames.shape:{}".format(sortedCommonGeneDepNames.shape))
print("sortedCommonGeneDepNames[0:5]:\n{}".format(firstCommonGenes))
print("\n sortedCommonGeneDepNames[-5:]:\n{}".format(lastCommonGenes))

sortedCommonGeneDepNames.shape:(10499,)
sortedCommonGeneDepNames[0:5]:
['"A1BG (1)"' '"A1CF (29974)"' '"A2M (2)"' '"A2ML1 (144568)"'
 '"A4GALT (53947)"']

 sortedCommonGeneDepNames[-5:]:
['"ZSCAN9 (7746)"' '"ZSWIM2 (151112)"' '"ZWILCH (55055)"' '"ZYX (7791)"'
 '"ZZEF1 (23140)"']


## Test Pandas

There may be a way to do this directly in numpy, but I was wasting too much time hacking.

In [13]:
# Small labelled table for experimenting with label-based selection.
# Mixing strings and ints in one array makes numpy store every cell as a string.
toyTable = [
    ['',  'a', 'b', 'c', 'd'],
    ['w',  1,   2,   3,   4],
    ['x',  5,   6,   7,   8],
    ['y',  9,  10,  11,  12],
    ['z', 13,  14,  15,  16],
]
d = np.array(toyTable)  # .astype('str')

headerRow = d[0]
bodyRows = d[1:]

# NOTE(review): rowNames holds the header row (i.e. the column labels) and
# colNames holds the first column (i.e. the row labels); the names are
# swapped relative to what they contain.
rowNames = headerRow[1:]
print("rowNames.shape:{} rowNames:{}".format(rowNames.shape, rowNames))
colNames = bodyRows[:, 0]
print("colNames.shape:{} colNames:{}".format(colNames.shape, colNames))

# Abandoned attempts at selecting by label directly in numpy:
# selectRows = rowNames == ['a', 'c']
# selectRows = np.argwhere(rowNames in ['a', 'c']
# print("selectRows.shape:{} selectRows:{}".format(selectRows.shape, selectRows))

rowNames.shape:(4,) rowNames:['a' 'b' 'c' 'd']
colNames.shape:(4,) colNames:['w' 'x' 'y' 'z']


In [14]:
# Wrap the toy table in a DataFrame so rows and columns can be selected by
# label: header row -> column labels, first column -> row labels.
cNames, rNames, data = d[0, 1:], d[1:, 0], d[1:, 1:]
print(data)
print()

df = pd.DataFrame(data, index=rNames, columns=cNames)
print(df)
print()

# Label-based selection of a subset of rows and columns.
wantedRows = ['w', 'z']
wantedCols = ['a', 'c']
print( df.loc[wantedRows, wantedCols])

[['1' '2' '3' '4']
 ['5' '6' '7' '8']
 ['9' '10' '11' '12']
 ['13' '14' '15' '16']]

    a   b   c   d
w   1   2   3   4
x   5   6   7   8
y   9  10  11  12
z  13  14  15  16

    a   c
w   1   3
z  13  15


## Select the common cell Lines and genes from the RNA Seq data

In [15]:
# Label the RNA seq values: genes as the row index, cell lines as columns.
rnaSeqDF = pd.DataFrame(rnaSeqData, index=rnaSeqGeneNames, columns=rnaSeqCellLines)

# Show the top-left corner as a sanity check.
rnaSeqDF.iloc[0:4, 0:3]

Unnamed: 0,"""143B_BONE""","""22RV1_PROSTATE""","""2313287_STOMACH"""
"""MAP4K5 (11183)""",1.4432662739991,1.2328470024234,1.20384248842145
"""PAPD7 (11044)""",1.26557693849693,1.22391368028673,1.2689538389447
"""ZC3H11A (9877)""",1.45496476968394,1.70206442013166,1.64423971388072
"""MYH14 (79784)""",-1.32175484807296,1.25319693365058,1.36527284352511


In [16]:
# Keep only the genes (rows) and cell lines (columns) shared with the gene
# dependency data, in the sorted order of the common label arrays.
commonRowLabels = sortedCommonGeneDepNames
commonColLabels = sortedCommonCellLines
commonRNASeqDF = rnaSeqDF.loc[commonRowLabels, commonColLabels]
print("commonRNASeqDF.shape:{}".format(commonRNASeqDF.shape))

commonRNASeqDF.iloc[0:3, 0:3]

commonRNASeqDF.shape:(10499, 486)


Unnamed: 0,"""143B_BONE""","""22RV1_PROSTATE""","""2313287_STOMACH"""
"""A1BG (1)""",-0.3238095,-0.4397346,-2.149967
"""A1CF (29974)""",-2.14752,0.7524987,0.4075303
"""A2M (2)""",0.5297267,0.2989709,-2.054531


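## Select common cell lines and genes from D2_Achilles_gene_dep_scores
First impute any missing values. Note that the code below uses the low rank model's reconstruction (X times Theta transposed) for every entry, not only for the missing ones.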

In [17]:
# Rebuild the full gene x cell line matrix from the low rank factors.
# Every entry comes from the product X * Theta^T; Y is not consulted here.
thetaTransposed = Theta.T
imputedGeneDepData = np.matmul(X, thetaTransposed)
print("imputedGeneDepData.shape:{}".format(imputedGeneDepData.shape))

imputedGeneDepData.shape:(11193, 501)


In [18]:
# Label the reconstructed gene dependency values: genes as the row index,
# cell lines as columns.
genDepDF = pd.DataFrame(imputedGeneDepData, index=geneDepNames, columns=geneDepCellLines)

# Show the top-left corner as a sanity check.
genDepDF.iloc[0:4, 0:3]

Unnamed: 0,"""143B_BONE""","""22RV1_PROSTATE""","""2313287_STOMACH"""
"""A1BG (1)""",-0.033601,-0.109786,-0.114928
"""NAT2 (10)""",-0.039952,-0.087199,-0.0774
"""ADA (100)""",0.089201,0.043217,0.056489
"""CDH2 (1000)""",-0.034279,0.061393,0.044192


In [19]:
# Keep only the genes (rows) and cell lines (columns) shared with the RNA seq
# data, using the same sorted label arrays as for the RNA seq frame so both
# frames end up with identical row and column order.
commonGeneDepDF = genDepDF.loc[sortedCommonGeneDepNames, sortedCommonCellLines]
# The printed label now matches the variable name (it previously read
# "commonGeneDupDF").
print("commonGeneDepDF.shape:{}".format(commonGeneDepDF.shape))
commonGeneDepDF.iloc[0:3, 0:3]

commonGeneDupDF.shape:(10499, 486)


Unnamed: 0,"""143B_BONE""","""22RV1_PROSTATE""","""2313287_STOMACH"""
"""A1BG (1)""",-0.033601,-0.109786,-0.114928
"""A1CF (29974)""",0.130508,-0.045974,-0.042363
"""A2M (2)""",-0.050844,-0.08242,-0.045868


## Save to disk

In [20]:
print( "autoEncoderDataDir:{}".format(autoEncoderDataDir))
# Create the output directory from Python instead of shelling out to `mkdir`:
# exist_ok=True makes the cell safe to re-run when the directory already
# exists, and it does not depend on the shell.
os.makedirs(autoEncoderDataDir, exist_ok=True)

# Write the RNA seq values restricted to the common genes and cell lines.
rnaSeqDFsavePath = autoEncoderDataDir + "common_RNAseq_lRPKM_data.csv"
commonRNASeqDF.to_csv(rnaSeqDFsavePath)

autoEncoderDataDir:data/autoEncoder/


In [21]:
# Write the gene dependency values restricted to the common genes and cell
# lines, into the same directory as the RNA seq file.
geneDepOutputName = "common_D2_Achilles_gene_dep_scores.csv"
geneDupDFsavePath = autoEncoderDataDir + geneDepOutputName
commonGeneDepDF.to_csv(geneDupDFsavePath)