Validation of ML embeddings
---------------------------

Dependencies
-------------

In [1]:
%AddDeps org.bdgenomics.adam adam-core-spark2_2.11 0.24.0
%AddDeps comp.bio.aging adam-playground_2.11 0.0.13 --repository https://dl.bintray.com/comp-bio-aging/main/

Marking org.bdgenomics.adam:adam-core-spark2_2.11:0.24.0 for download
Preparing to fetch from:
-> file:/tmp/toree-tmp-dir4005346743236264489/toree_add_deps/
-> https://repo1.maven.org/maven2
-> New file at /tmp/toree-tmp-dir4005346743236264489/toree_add_deps/https/repo1.maven.org/maven2/org/bdgenomics/adam/adam-core-spark2_2.11/0.24.0/adam-core-spark2_2.11-0.24.0.jar
-> New file at /tmp/toree-tmp-dir4005346743236264489/toree_add_deps/https/repo1.maven.org/maven2/org/bdgenomics/adam/adam-core-spark2_2.11/0.24.0/adam-core-spark2_2.11-0.24.0.pom


Waiting for a Spark session to start...

Marking comp.bio.aging:adam-playground_2.11:0.0.13 for download
Preparing to fetch from:
-> file:/tmp/toree-tmp-dir4005346743236264489/toree_add_deps/
-> https://dl.bintray.com/comp-bio-aging/main/
-> https://repo1.maven.org/maven2


In [2]:
%AddDeps org.apache.hadoop hadoop-azure 2.7.6
%AddDeps com.microsoft.azure azure-storage 2.0.0

Marking org.apache.hadoop:hadoop-azure:2.7.6 for download
Preparing to fetch from:
-> file:/tmp/toree-tmp-dir4005346743236264489/toree_add_deps/
-> https://repo1.maven.org/maven2
-> New file at /tmp/toree-tmp-dir4005346743236264489/toree_add_deps/https/repo1.maven.org/maven2/org/apache/hadoop/hadoop-azure/2.7.6/hadoop-azure-2.7.6.jar
-> New file at /tmp/toree-tmp-dir4005346743236264489/toree_add_deps/https/repo1.maven.org/maven2/org/apache/hadoop/hadoop-azure/2.7.6/hadoop-azure-2.7.6.pom


Waiting for a Spark session to start...

Marking com.microsoft.azure:azure-storage:2.0.0 for download
Preparing to fetch from:
-> file:/tmp/toree-tmp-dir4005346743236264489/toree_add_deps/
-> https://repo1.maven.org/maven2
-> New file at /tmp/toree-tmp-dir4005346743236264489/toree_add_deps/https/repo1.maven.org/maven2/com/microsoft/azure/azure-storage/2.0.0/azure-storage-2.0.0.jar
-> New file at /tmp/toree-tmp-dir4005346743236264489/toree_add_deps/https/repo1.maven.org/maven2/com/microsoft/azure/azure-storage/2.0.0/azure-storage-2.0.0.pom


General functions
-----------------

In [3]:
import  org.apache.spark._
import org.apache.spark.sql.{DataFrame, Encoders, SparkSession}
import org.apache.spark.sql.types.StructType
import scala.reflect.runtime.universe._
import org.apache.spark.storage.StorageLevel
import org.apache.spark.rdd._

In [4]:
def sparkHadoopConf(sc: SparkContext, acountName: String, accountKey: String) = {
  sc.hadoopConfiguration.set("fs.azure", "org.apache.hadoop.fs.azure.NativeAzureFileSystem")
  sc.hadoopConfiguration.set("fs.azure.account.key." + acountName + ".blob.core.windows.net", accountKey)
  sc
}

sparkHadoopConf: (sc: org.apache.spark.SparkContext, acountName: String, accountKey: String)org.apache.spark.SparkContext


In [5]:
def azurize(container: String, accountName: String, blobFile: String): String = "wasbs://"+container+"@"+accountName+".blob.core.windows.net/"+blobFile 

def writeText2Azure[T]( rdd: RDD[T], container: String, accountName: String, blobFile: String ): String =
{
  val url = azurize(container, accountName, blobFile)
  rdd.saveAsTextFile(url)
  url
}

def writeTsv2Azure( df: DataFrame, container: String, accountName: String, blobFile: String ): String =
{
  val url = azurize(container, accountName, blobFile)
  df.write.option("sep","\t").option("header","true").csv(url)
  url
}

azurize: (container: String, accountName: String, blobFile: String)String
writeText2Azure: [T](rdd: org.apache.spark.rdd.RDD[T], container: String, accountName: String, blobFile: String)String
writeTsv2Azure: (df: org.apache.spark.sql.DataFrame, container: String, accountName: String, blobFile: String)String


In [6]:
val account = "pipelines1"
val key = ""
val connString = s"DefaultEndpointsProtocol=https;AccountName=pipelines1;AccountKey=${key};EndpointSuffix=core.windows.net"
def az(path: String): String = azurize("storage", account, path)

account = pipelines1
key = 
connString = DefaultEndpointsProtocol=https;AccountName=pipelines1;AccountKey=;EndpointSuffix=core.windows.net


az: (path: String)String


DefaultEndpointsProtocol=https;AccountName=pipelines1;AccountKey=;EndpointSuffix=core.windows.net

In [13]:
val sparkContext = sc
sparkHadoopConf(sparkContext, account, key)
  
val spark = SparkSession
  .builder()
  .appName("mapping_models")
  .getOrCreate()

sparkContext = org.apache.spark.SparkContext@66f35057
spark = org.apache.spark.sql.SparkSession@1ed54776


In [14]:
import org.apache.spark.sql.functions._
import spark.implicits._

val toDouble = udf[Double, String]( _.toDouble)

toDouble = UserDefinedFunction(<function1>,DoubleType,Some(List(StringType)))


UserDefinedFunction(<function1>,DoubleType,Some(List(StringType)))

In [15]:
def write(df: DataFrame, url: String, header: Boolean = true) = {
  df.coalesce(1).write.option("sep","\t").option("header", header).csv(url)
  url
}

write: (df: org.apache.spark.sql.DataFrame, url: String, header: Boolean)String


In [16]:
def readTSV(path: String, header: Boolean = false, sep: String = "\t"): DataFrame = spark.read
    .option("sep", sep)
    .option("comment", "#")
    .option("inferSchema", true)
    .option("header", header)
    .option("ignoreLeadingWhiteSpace", true)
    .option("ignoreTrailingWhiteSpace", true)
    .option("ignoreTrailingWhiteSpace", true)
    .option("maxColumns", 150000)
    .csv(path)

readTSV: (path: String, header: Boolean, sep: String)org.apache.spark.sql.DataFrame


In [17]:
 def toVectors(dataFrame: DataFrame, columns: Seq[String], output: String) = {  
      import org.apache.spark.ml.feature.VectorAssembler
      import org.apache.spark.ml.linalg.Vectors
      val assembler = new VectorAssembler()
        .setInputCols(columns.toArray)
        .setOutputCol(output)
      assembler.transform(dataFrame.na.fill(0.0, columns).na.fill(0.0)).select(output)
    }

toVectors: (dataFrame: org.apache.spark.sql.DataFrame, columns: Seq[String], output: String)org.apache.spark.sql.DataFrame


In [18]:
import org.apache.spark.ml.feature._
def fitPCA(dataFrame: DataFrame, columns: Seq[String], k: Int)(implicit sparkSession: SparkSession): PCAModel = {
  val df = toVectors(dataFrame, columns, "features")
  new PCA()
    .setInputCol("features")
    .setOutputCol("PCA")
    .setK(k)
    .fit(df)
}

def doPCA(dataFrame: DataFrame, columns: Seq[String], k: Int)(implicit sparkSession: SparkSession): DataFrame = {
  val pca = fitPCA(dataFrame,columns, k)
  pca.transform(dataFrame)
}

fitPCA: (dataFrame: org.apache.spark.sql.DataFrame, columns: Seq[String], k: Int)(implicit sparkSession: org.apache.spark.sql.SparkSession)org.apache.spark.ml.feature.PCAModel
doPCA: (dataFrame: org.apache.spark.sql.DataFrame, columns: Seq[String], k: Int)(implicit sparkSession: org.apache.spark.sql.SparkSession)org.apache.spark.sql.DataFrame


GTEX-related code
-----------------

In [19]:
//for testing
val expressionsPath = "expressions"
val byGoPath = expressionsPath + "/go"
val comparison = byGoPath + "/gray_whale_with_bowhead_with_minke_with_NMR_with_human_with_mouse_with_cow_full_outer_counts_extended.tsv"
val grouped = byGoPath + "/grouped"
val ranked = byGoPath + "/grouped/ranked"
val transcriptsPath = expressionsPath + "/transcripts"

val validationPath = expressionsPath + "/" + "validation"
val mouseValidationPath = validationPath + "/" + "mouse"
val jenageValidationPath = mouseValidationPath + "/" + "GSE75192"
val mouseColsValidationPath = jenageValidationPath + "/" + "expressions_columns_GSE75192.tsv"
val mouseRowsValidationPath = jenageValidationPath + "/" + "expressions_rows_GSE75192.tsv"

expressionsPath = expressions
byGoPath = expressions/go
comparison = expressions/go/gray_whale_with_bowhead_with_minke_with_NMR_with_human_with_mouse_with_cow_full_outer_counts_extended.tsv
grouped = expressions/go/grouped
ranked = expressions/go/grouped/ranked
transcriptsPath = expressions/transcripts
validationPath = expressions/validation
mouseValidationPath = expressions/validation/mouse
jenageValidationPath = expressions/validation/mouse/GSE75192
mouseColsValidationPath = expressions/validation/mouse/GSE75192/expressions_columns_GSE75192.tsv
mouseRowsValidationPath = expressions/validation/mouse/GSE75192/expressions_rows_GSE75192.tsv


expressions/validation/mouse/GSE75192/expressions_rows_GSE75192.tsv

In [20]:
val goTable = readTSV(az(comparison), true)

goTable = [go: string, namespace: string ... 27 more fields]


error: error while loading KVIterator, class file '/usr/local/spark-2.3.0-bin-hadoop2.7/jars/spark-unsafe_2.11-2.3.0.jar(org/apache/spark/unsafe/KVIterator.class)' has location not matching its contents: contains class KVIterator
error: error while loading UnsafeAlignedOffset, class file '/usr/local/spark-2.3.0-bin-hadoop2.7/jars/spark-unsafe_2.11-2.3.0.jar(org/apache/spark/unsafe/UnsafeAlignedOffset.class)' has location not matching its contents: contains class UnsafeAlignedOffset
error: error while loading Platform, class file '/usr/local/spark-2.3.0-bin-hadoop2.7/jars/spark-unsafe_2.11-2.3.0.jar(org/apache/spark/unsafe/Platform.class)' has location not matching its contents: contains class Platform
error: error while loading UTF8String, class file '/usr/local/spark-2.3.0-bin-hadoop2.7/jars/spark-unsafe_2.11-2.3.0.jar(org/apache/spark/unsafe/types/UTF8String.class)' has location not matching its contents: contains class UTF8String
error: error while loading CalendarInterval, class fi

[go: string, namespace: string ... 27 more fields]

In [18]:
val gtexPath = "/GTEx"
//val genesPath = az(gtexPath + "/gtex_genes.tsv")
val transcriptsPath = az(gtexPath + "/GTEx_Analysis_2016-01-15_v7_RSEMv1.2.22_transcript_tpm.txt")
val gctPath = az(gtexPath + "/GTEx_Analysis_2016-01-15_v7_RNASeQCv1.1.8_gene_tpm.gct")
val transcripts2EntrezPath = az(gtexPath + "/gencode.v28lift37.metadata.EntrezGene")


gtexPath = /GTEx
transcriptsPath = wasbs://storage@pipelines1.blob.core.windows.net//GTEx/GTEx_Analysis_2016-01-15_v7_RSEMv1.2.22_transcript_tpm.txt
gctPath = wasbs://storage@pipelines1.blob.core.windows.net//GTEx/GTEx_Analysis_2016-01-15_v7_RNASeQCv1.1.8_gene_tpm.gct
transcripts2EntrezPath = wasbs://storage@pipelines1.blob.core.windows.net//GTEx/gencode.v28lift37.metadata.EntrezGene


wasbs://storage@pipelines1.blob.core.windows.net//GTEx/gencode.v28lift37.metadata.EntrezGene

In [19]:
def simplifyFunction = udf[String, String]{case tran =>
  val i = tran.indexOf('|')
  val tr = if(i> 0) tran.substring(0, tran.indexOf('|')) else tran
  tr.substring(0, Math.min(tr.indexOf('.'),tr.length))
}

simplifyFunction: org.apache.spark.sql.expressions.UserDefinedFunction


In [20]:
val conversions = readTSV(transcripts2EntrezPath, false).toDF("transcript_id", "entrez")//.persist(StorageLevel.MEMORY_AND_DISK) 

Name: java.io.IOException
Message: No FileSystem for scheme: wasbs
StackTrace:   at org.apache.hadoop.fs.FileSystem.getFileSystemClass(FileSystem.java:2660)
  at org.apache.hadoop.fs.FileSystem.createFileSystem(FileSystem.java:2667)
  at org.apache.hadoop.fs.FileSystem.access$200(FileSystem.java:94)
  at org.apache.hadoop.fs.FileSystem$Cache.getInternal(FileSystem.java:2703)
  at org.apache.hadoop.fs.FileSystem$Cache.get(FileSystem.java:2685)
  at org.apache.hadoop.fs.FileSystem.get(FileSystem.java:373)
  at org.apache.hadoop.fs.Path.getFileSystem(Path.java:295)
  at org.apache.spark.sql.execution.datasources.DataSource$.org$apache$spark$sql$execution$datasources$DataSource$$checkAndGlobPathIfNecessary(DataSource.scala:705)
  at org.apache.spark.sql.execution.datasources.DataSource$$anonfun$15.apply(DataSource.scala:389)
  at org.apache.spark.sql.execution.datasources.DataSource$$anonfun$15.apply(DataSource.scala:389)
  at scala.collection.TraversableLike$$anonfun$flatMap$1.apply(Trave

In [27]:
conversions.count

147543

In [3]:
conversions.select("entrez").distinct.count

19165

In [19]:
val transcripts = readTSV(transcriptsPath, true).join(conversions, Seq("transcript_id"))

transcripts = [transcript_id: string, gene_id: string ... 11689 more fields]


[transcript_id: string, gene_id: string ... 11689 more fields]

In [None]:
val cols = transcripts.columns.tail.tail.filter(_!="entrez")
val genes = transcripts.groupBy("entrez").sum(cols:_*).persist(StorageLevel.MEMORY_AND_DISK) 
val renamings = genes.columns.map{ case value => if(value.startsWith("sum(")) value.replace("sum(", "").replace(")","") else value}
val genesEntrez = genes.toDF(renamings:_*)

In [None]:
write(genesEntrez, az(gtexPath + "/genes_entrez.tsv"), true)

In [20]:
val genesEntrez = readTSV(az(gtexPath + "/genes_entrez.tab"), true).persist(StorageLevel.MEMORY_AND_DISK) 

genesEntrez = [entrez: int, GTEX-1117F-0226-SM-5GZZ7: double ... 11687 more fields]


[entrez: int, GTEX-1117F-0226-SM-5GZZ7: double ... 11687 more fields]

In [21]:
genesEntrez.count

17459

In [22]:
genesEntrez.columns.size

11689

In [None]:
val stats = genesEntrez.summary()

In [None]:
write(stats, az(gtexPath + "/genes_entrez_stats.tsv"), true)

In [None]:
println("Hello!")

In [25]:
val pca100 = doPCA(genesEntrez, genesEntrez.columns.tail, 100)(spark)

Name: java.lang.OutOfMemoryError
Message: Java heap space
StackTrace:   at scala.reflect.ManifestFactory$$anon$12.newArray(Manifest.scala:141)
  at scala.reflect.ManifestFactory$$anon$12.newArray(Manifest.scala:139)
  at breeze.linalg.DenseVector$mcD$sp.<init>(DenseVector.scala:60)
  at org.apache.spark.mllib.linalg.distributed.RowMatrix.computeGramianMatrix(RowMatrix.scala:122)
  at org.apache.spark.mllib.linalg.distributed.RowMatrix.computeCovariance(RowMatrix.scala:344)
  at org.apache.spark.mllib.linalg.distributed.RowMatrix.computePrincipalComponentsAndExplainedVariance(RowMatrix.scala:387)
  at org.apache.spark.mllib.feature.PCA.fit(PCA.scala:53)
  at org.apache.spark.ml.feature.PCA.fit(PCA.scala:99)
  at fitPCA(<console>:144)
  at doPCA(<console>:148)

In [23]:
println("Hello!")

Hello!


lastException: Throwable = null


In [6]:
import org.apache.spark.ml.linalg.{Matrix, Vectors}
import org.apache.spark.ml.stat.Correlation
import org.apache.spark.sql.Row

val data = Seq(
  Vectors.dense(4.0, 5.0, 3.0),
  Vectors.dense(6.0, 7.0,  8.0),
  Vectors.dense(6.0, 7.0,  8.0)  
)

val df = data.map(Tuple1.apply).toDF("features")
val Row(coeff1: Matrix) = Correlation.corr(df, "features").head
println(s"Pearson correlation matrix:\n $coeff1")

val Row(coeff2: Matrix) = Correlation.corr(df, "features", "spearman").head
println(s"Spearman correlation matrix:\n $coeff2")


Pearson correlation matrix:
 1.0                 1.0                 0.9999999999999998  
1.0                 1.0                 0.9999999999999998  
0.9999999999999998  0.9999999999999998  1.0                 
Spearman correlation matrix:
 1.0                 1.0000000000000002  1.0000000000000002  
1.0000000000000002  1.0                 1.0000000000000002  
1.0000000000000002  1.0000000000000002  1.0                 


data = List([4.0,5.0,3.0], [6.0,7.0,8.0], [6.0,7.0,8.0])
df = [features: vector]
coeff1 = 
coeff2 = 


1.0                 1.0000000000000002  1.0000000000000002  
1.0000000000000002  1.0                 1.0000000000000002  
1.0000000000000002  1.0000000000000002  1.0                 