# Evaluate Random Hold Out
author: Andrew E. Davidson, aedavids@ucsc.edu

Best result: 19 learned features (lowest total squared error on the validation hold-out; see the tuning results table below).

In [1]:
# NOTE(review): datetime is not referenced in the cells visible here — confirm whether it is still needed.
from datetime import datetime
import logging
from   setupLogging import setupLogging
# Configure logging before the remaining imports; the value returned by
# setupLogging() is logged below as the configuration file in use.
configFilePath = setupLogging( default_path='src/test/logging.test.ini.json')
logger = logging.getLogger("notebook")
logger.info("using logging configuration file:{}".format(configFilePath))

import numpy as np
# Project class (aliased LrmfEoU) used below to load the data, the trained model
# and the hold-out filters.
from DEMETER2.lowRankMatrixFactorizationEasyOfUse \
    import LowRankMatrixFactorizationEasyOfUse as LrmfEoU

[INFO <ipython-input-1-dae33e64cf83>:6 - <module>()] using logging configuration file:src/test/logging.test.ini.json


In [2]:
# Experiment configuration handed to the LrmfEoU constructor.
dataDir = "data/"
dataFileName = "D2_Achilles_gene_dep_scores.tsv"
# 19 features gave the lowest validation error in the tuning results recorded in this notebook.
numFeatures = 19
# NOTE(review): presumably a gene filtering threshold — confirm its exact meaning in LrmfEoU.
geneFilterPercent = 0.25 
# NOTE(review): presumably the fraction of observed values held out of training — confirm in LrmfEoU.
holdOutPercent = 0.40 
# Alternate construction with tag="_randomized"; presumably selects the randomized
# baseline run reported in the results section — confirm against LrmfEoU.
# easyOfUse = LrmfEoU(dataDir, dataFileName, numFeatures, geneFilterPercent, holdOutPercent, tag="_randomized")
easyOfUse = LrmfEoU(dataDir, dataFileName, numFeatures, geneFilterPercent, holdOutPercent)

In [3]:
# Load everything for the configured run; loadAll() returns a dict that is
# unpacked by key below.
resultsDict = easyOfUse.loadAll()

# Clean, tidy version of the DEMETER2 data.
# Y holds the true observed values that predictions are scored against;
# NOTE(review): R is presumably the mask of observed entries in Y — confirm.
Y, R, cellLines, geneNames, = resultsDict["DEMETER2"]

# Trained model: factor matrices X and Theta (predictions are X @ Theta.T) and the
# optimizer output, a scipy.optimize.OptimizeResult.
X, Theta, optimizeResult = resultsDict["LowRankMatrixFactorizationModel"]

# Knock-out logical filters (boolean masks). Use them to select the Y values that
# belong to the train, validation and test sets.
RTrain, RValidation, RTest = resultsDict["filters"]

# Hyperparameter tuning: evaluate prediction error

In [4]:
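# Earlier inline version of the error computation, kept for reference only;
# superseded by calculateTotalSquaredError() below.
# predictions               = np.matmul( X, Theta.transpose() )
# predictedValidationValues = np.multiply(predictions, RValidation)
# trueTestVaildationValues  = np.multiply(Y, RValidation)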

In [5]:
def calculateTotalSquaredError(X, Theta, Y, R):
    """
    Sum of squared prediction errors over the entries selected by R.

    Predictions are computed as X @ Theta.T and compared element-wise with Y.
    Only entries where R is non-zero contribute to the total; each selected
    squared error is multiplied by the corresponding value of R (a plain
    selection when R is boolean or 0/1).

    Entries excluded by R are ignored completely, so NaN or inf values in Y at
    excluded positions (e.g. unobserved measurements) do not poison the total.

    arguments:
        X:     factor matrix; X @ Theta.T must be shape-compatible with Y
        Theta: factor matrix; transposed before the matrix multiplication
        Y:     the true observed values
        R:     knock out filter (boolean or 0/1 mask) choosing which entries
               of Y are scored

    returns:
        the total squared error over the selected entries
    """
    predictions = np.matmul( X, Theta.transpose() ) # matrix multiplication
    error = predictions - Y
    sqError = np.multiply( error, error ) # element wise multiply

    # Select with np.where instead of relying on multiplication by zero:
    # NaN * 0 is NaN, so a NaN in an excluded entry would otherwise make the
    # whole sum NaN. For finite inputs the result is unchanged.
    mask = np.asarray( R ) != 0
    selectObservedError = np.where( mask, np.multiply( sqError, R ), 0.0 )
    totalError = np.sum( selectObservedError )

    return totalError

# Stale call kept for reference: it passes two arguments, which does not match the
# current four-argument signature of calculateTotalSquaredError.
# totalError = calculateTotalSquaredError(predictedValidationValues, trueTestVaildationValues)
# Total squared error of the trained model on the entries selected by RValidation;
# this is the number compared across numFeatures settings during tuning.
totalSqErrorValidation = calculateTotalSquaredError(X, Theta, Y, RValidation)
print( "totalSqErrorValidation:{}".format(totalSqErrorValidation) )

totalSqErrorValidation:18958.40614109373


# Hyperparameter tuning results

D2_Achilles_gene_dep_scores.tsv, RValidation

```
  n_3_geneFilterPercent_0.25_holdOutPercent_0.4 totalError:20,724.50
 n_14_geneFilterPercent_0.25_holdOutPercent_0.4 totalError:19,090.72  
 n_19_geneFilterPercent_0.25_holdOutPercent_0.4 totalError:18,958.41 
 n_25_geneFilterPercent_0.25_holdOutPercent_0.4 totalError:18,987.49
 n_50_geneFilterPercent_0.25_holdOutPercent_0.4 totalError:19,909.83 
n_100_geneFilterPercent_0.25_holdOutPercent_0.4 totalError:23,908.81
```

Randomized baseline
```
n_19_geneFilterPercent_0.25_holdOutPercent_0.4_randomized totalError:23,685.279323805073
```

# Test Results
D2_Achilles_gene_dep_scores.tsv, RTest

```
n_19_geneFilterPercent_0.25_holdOutPercent_0.4 totalError:18979.49
```

In [6]:
# Total squared error of the same trained model on the entries selected by RTest.
totalSqErrorTest = calculateTotalSquaredError(X, Theta, Y, RTest)
print( "totalSqErrorTest:{}".format(totalSqErrorTest) )

totalSqErrorTest:18979.491283967178


In [7]:
# Sanity check on the hold-out masks.
# Number of entries selected by each mask (True counts as 1).
print( np.sum(RValidation))
print( np.sum(RTest))
# Number of positions where the two masks agree / disagree. If the validation and
# test sets do not overlap, the disagreement count equals the sum of the two
# counts printed above.
print( np.sum(RValidation == RTest) )
print( np.sum(RValidation != RTest) )

1121380
1121355
3364958
2242735


In [8]:
# Peek at the top-left 2x2 corner of each mask to eyeball how entries are split
# between the train, validation and test filters.
print(RTrain[0:2,0:2])
print(RValidation[0:2,0:2])
print(RTest[0:2,0:2])

[[ True  True]
 [ True False]]
[[False False]
 [False False]]
[[False False]
 [False  True]]
