# figures


Notebook that generates the main figures for the Gray Whale paper.




In [1]:
import org.apache.spark._
import org.apache.spark.sql.types._
import scala.reflect.runtime.universe._
import org.apache.spark.storage.StorageLevel
import org.apache.spark.rdd._
import org.apache.spark.sql._
import org.apache.spark.sql.functions._
import org.apache.spark.sql.expressions._
import group.research.aging.spark.extensions._
import group.research.aging.spark.extensions.functions._
import kernel.display.html
import org.apache.spark.ml.feature.PCAModel
import org.apache.spark.ml.feature.PCA
import org.apache.spark.ml.linalg.Matrix
import org.apache.spark.ml.stat.Correlation
import org.apache.spark.sql.expressions._

## Extra spark functions

  "val all_pro = processes.select(\"go\", featuresOrdered:_*).na.fill(0.0).sort($\"bowhead_whale_liver\".desc)\n",
        "val liver_pro = processes.select(\"go\", liverFeatures:_*).na.fill(0.0).sort($\"bowhead_whale_liver\".desc)\n",
        "val kidney_pro = processes.select(\"go\", kidneyFeatures:_*).na.fill(0.0).sort($\"bowhead_whale_kidney\".desc)\n",
        "//and ranked\n",
        "val all_pro_ranked = processes.select(\"go\", featuresOrdered:_*).na.fill(0.0).sort($\"bowhead_whale_liver\".desc)\n",
        "val liver_pro_ranked = processes.select(\"go\", liverFeaturesRanked:_*).na.fill(0.0).sort($\"bowhead_whale_liver\".desc)\n",
        "val kidney_pro_ranked = processes.select(\"go\", kidneyFeaturesRanked:_*).na.fill(0.0).sort($\"bowhead_whale_kidney\".desc)\n",
        "all_pro.show(10, 10000)"

In [3]:
/** Appends a non-nullable `_index` column (Long) holding the position that
  * `RDD.zipWithIndex` assigns to each row, which makes it easier to join DataFrames.
  */
def addIndex(df: DataFrame) = {
  val indexedRows = df.rdd.zipWithIndex.map { case (row, position) =>
    Row.fromSeq(row.toSeq :+ position)
  }
  val indexedSchema = StructType(df.schema.fields :+ StructField("_index", LongType, nullable = false))
  spark.createDataFrame(indexedRows, indexedSchema)
}

In [4]:
/**
  * Transposes a DataFrame on the driver.
  *
  * All rows are collected locally, so this is only suitable for small tables.
  * The values of the first input column become the names of the output columns,
  * every other input column becomes one output row, and the extra leading
  * `vals` column holds the name of the input column that the row came from.
  */
def transpose(df: DataFrame) = {
    // Collect everything and flip rows/columns: `h` is the first input column,
    // `t` holds the remaining input columns.
    // NOTE(review): the pattern needs at least one column in the transposed array;
    // an empty DataFrame presumably ends in a MatchError — confirm.
    val (header, data) = df.collect.map(_.toSeq.toArray).transpose match {
  case Array(h, t @ _*) => {
    // NOTE(review): only cells that are Int survive this `collect`; cells of any other
    // type are silently dropped, which would shorten that row — confirm inputs are all Int.
    (h.map(_.toString), t.map(_.collect { case x: Int => x }))
  }
}
    // Pair every remaining input column name with its (Int) values to form the output rows.
    val rows = df.columns.tail.zip(data).map { case (x, ys) => Row.fromSeq(x +: ys) }
    // Output schema: a string `vals` column followed by one integer column per header value.
    val schema = StructType(
      StructField("vals", StringType) +: header.map(StructField(_, IntegerType))
    )
spark.createDataFrame(spark.sparkContext.parallelize(rows), schema)
}

In [5]:
/** Rounds `v` to two decimal places. */
def round2(v: Double): Double = math.round(v * 100.0) / 100.0

/** Base-2 logarithm of `x`, computed from `log10`. */
def log2(x: Double): Double = math.log10(x) / math.log10(2.0)

/** Same computation as `log2`.
  * NOTE(review): despite the name this is a base-2 logarithm, not base 20 — confirm the intent.
  */
def log20(x: Double): Double = log2(x)


/** Base-10 logarithm rounded to two decimals; yields 0.0 for NaN and for values <= 1. */
def safeLog(v: Double): Double =
  if (!v.isNaN && v > 1) round2(math.log10(v)) else 0.0

/** `log20` rounded to two decimals; yields 0.0 for NaN and for values <= 1. */
def safeLog20(v: Double): Double =
  if (!v.isNaN && v > 1) round2(log20(v)) else 0.0


/** Replaces every column listed in `cols` with its `safeLog` value, one column at a time. */
def logs(df: DataFrame, cols: List[String]) = {
  val safeLogUdf = udf[Double, Double](safeLog _)
  cols.foldLeft(df) { (current, name) =>
    current.withColumn(name, safeLogUdf(new ColumnName(name)))
  }
}

In [6]:
/** Builds a UDF that extracts element `i` of an ML vector column as a Double. */
def coord(i: Int): UserDefinedFunction =
  udf[Double, org.apache.spark.ml.linalg.Vector](vector => vector.apply(i))

// Ready-made extractors for the first three vector elements.
val x_udf = coord(0)
val y_udf = coord(1)
val z_udf = coord(2)

In [7]:
import polynote.runtime.KernelRuntime
/** Displays a `<script>` tag that loads the JavaScript file at `src`. */
def script_import(src: String)(implicit kernel: KernelRuntime) = {
  val tag = s"""<script  src="${src}" charset="utf-8"></script>"""
  html(tag)
}

/** Loads `plotly-latest.js` from the Plotly CDN via `script_import`. */
def plotly()(implicit kernel: KernelRuntime) =
  script_import("https://cdn.plot.ly/plotly-latest.js")(kernel)


In [8]:
import polynote.runtime.KernelRuntime
/** Displays a `<div>` whose id attribute and visible text are both `id`. */
def div(id: String)(implicit kernel: KernelRuntime) = {
  val markup = s"<div id='${id}'>${id}</div>"
  kernel.display.html(markup)
}

In [9]:
/** Renders `seq` as a JavaScript array literal of double-quoted strings, e.g. `["a","b"]`.
  *
  * Backslashes and double quotes inside the values are escaped so that a label
  * containing them cannot terminate the string early and break the generated script.
  */
def seq2StringArray(seq: Seq[String]) =
  seq
    .map(s => "\"" + s.replace("\\", "\\\\").replace("\"", "\\\"") + "\"")
    .mkString("[", ",", "]")
/** Renders `seq` as a JavaScript array literal of numbers, e.g. `[1.0,2.5]`. */
def seq2NumArray(seq: scala.collection.Seq[Double]) = seq.mkString("[", ",", "]")
/** Collects a DataFrame whose rows hold a single ML vector to the driver, one `Array[Double]` per row. */
def collectVec(vec: DataFrame): Array[Array[Double]] = {
  val arrays = vec.map { case org.apache.spark.sql.Row(v: org.apache.spark.ml.linalg.Vector) => v.toArray }
  arrays.as[Array[Double]].collect
}

### Javascript generating code for the heatmap



In [11]:
/**
  * Builds the `<script>` markup that draws a Plotly heatmap into the element with id `id`.
  *
  * The vectors in `vec` are collected to the driver and rounded to two decimals;
  * each vector becomes one heatmap row.
  *
  * NOTE: `title`, `id`, `colorScale` and `tickAngle` are interpolated into the generated
  * JavaScript as-is, so they must not contain a single quote.
  *
  * @param vec            DataFrame of ML vectors, read through `collectVec`
  * @param id             id of the target DOM element; also echoed in the trailing `<h3>` line
  * @param x              column labels
  * @param y              row labels
  * @param title          plot title
  * @param left           left margin of the layout
  * @param top            top margin of the layout
  * @param width          layout width
  * @param height         layout height
  * @param labels         when true, every cell gets a text annotation with its value
  * @param logData        when true, the plotted values are passed through `safeLog`
  * @param logAnnotations when true, the annotation values are passed through `safeLog`
  * @param colorScale     Plotly colour scale name
  * @param inverse        when true, `reversescale: true` is added to the trace
  * @param showScale      when false, `showscale: false` is added to the trace
  * @param zmin           lower colour bound; emitted only when `zmin < zmax`
  * @param zmax           upper colour bound; emitted only when `zmin < zmax`
  * @param dataFontSize   font size of the cell annotations
  * @param tickFontSize   font size of the axis tick labels
  * @param tickAngle      tick angle of the x axis
  * @return the script markup followed by an `<h3>` confirmation line
  */
def for_heatmap(vec: DataFrame, id: String, x: Seq[String], y: Seq[String], title: String,  
left: Int = 300, top: Int = 250, width: Int = 1200, height: Int = 1000, 
labels: Boolean = true, logData: Boolean = false, logAnnotations: Boolean = false, 
colorScale: String = "RdBu", inverse: Boolean = false, showScale: Boolean = true,
 zmin: Double = 0.0, zmax: Double = 0.0, dataFontSize: Int = 16, tickFontSize: Int = 24, tickAngle: String = "auto"): String = {
    
val cv: Array[Array[Double]] = collectVec(vec)
val numbers = cv.map(arr=>arr.map(n => round2(n)))
// One JavaScript array per row: `data` drives the colours, `annotations` the cell texts.
val data =        numbers.map(arr=>seq2NumArray(arr.map{number=> if(logData) safeLog(number) else number })).mkString(",")
val annotations = numbers.map(arr=>seq2NumArray(arr.map{number=> if(logAnnotations) safeLog(number) else number })).mkString(",")
// Layout plus the raw value arrays.
val start = s"""
var layout = {
  title: '${title}',
  annotations: [],
  margin: {
    l: ${left},
    r: 125,
    t: ${top},
    b: 50
  },
  width: ${width},
  height: ${height},
  xaxis: {
    side: 'top',
    tickfont: {
      size: ${tickFontSize}
    },
    tickangle: '${tickAngle}'
  },
  yaxis: {
    side: 'left',
    autorange: 'reversed',
    tickfont: {
      size: ${tickFontSize}
    }
  },
};

var xValues = ${seq2StringArray(x)}
var yValues =  ${seq2StringArray(y)}
var zValues =  [${data}]
var aValues =  [${annotations}]

"""
// Trace definition and the actual Plotly call.
val end = s"""
var data = [{
   x: xValues,
   y: yValues,
   z: zValues,
   ${if(zmin < zmax) "zmin: " + zmin + ", zmax: "+ zmax + ","  else ""}
   colorscale: '${colorScale}', ${if(showScale) "" else " showscale: false ,"}
   type: 'heatmap' ${if(inverse) ", reversescale: true" else ""}
 }];
 Plotly.newPlot('${id}', data, layout);
"""

// Per-cell annotations: white text on non-zero cells, black text on zero cells.
// The annotation object used to declare `font` twice; only the last declaration was
// effective in JavaScript, so the dead first one has been dropped.
val middle = s"""
for ( var i = 0; i < yValues.length; i++ ) {
  for ( var j = 0; j < xValues.length; j++ ) {
    var currentValue = zValues[i][j];
    if (currentValue != 0.0) {
      var textColor = 'white';
    }else{
      var textColor = 'black';
    }
    var result = {
      xref: 'x1',
      yref: 'y1',
      x: xValues[j],
      y: yValues[i],
      text: aValues[i][j],
      showarrow: false,
      font: {
        color: textColor,
        size: '${dataFontSize}'
      }
    };
    layout.annotations.push(result);
  }
}
"""
val res: String = if(labels) start + middle + end else start + end
 s"""<script type="text/javascript" charset="utf-8">${res}</script>
    <h3>${id} heatmap has been displayed</h3>"""
}

/**
  * Prepares the inputs of `for_heatmap` from a DataFrame and returns the generated markup.
  *
  * Row labels are read from the `label` column, column labels are `names` when given and
  * `features` otherwise, and the plotted values come from the `features` columns assembled
  * into vectors. All remaining parameters are forwarded to `for_heatmap` unchanged.
  */
def for_makeHeatmap(df: DataFrame, label: String, features: Seq[String], 
div: String, title: String, left: Int = 300, top: Int = 300,
 width: Int = 1000, height: Int = 1000, labels: Boolean = true, 
 logData: Boolean = false, logAnnotations: Boolean = false, 
 colorScale: String = "RdBu", inverse: Boolean = false, showScale: Boolean = true, 
 zmin: Double = 0.0, zmax: Double = 0.0, names: List[String] = List.empty[String], 
//dataFontSize: Int = 16, tickFontSize: Int = 24, tickAngle: String = "auto"
dataFontSize: Int = 18, tickFontSize: Int = 20, tickAngle: String = "-30"
): String = {
    val rowLabels = df.select(label).as[String].collect.toList
    val columnLabels = if (names.nonEmpty) names else features.toList
    val vectors = df.toVectors(features, "features")
    for_heatmap(
      vec = vectors, id = div, x = columnLabels, y = rowLabels, title = title,
      left = left, top = top, width = width, height = height,
      labels = labels, logData = logData, logAnnotations = logAnnotations,
      colorScale = colorScale, inverse = inverse, showScale = showScale,
      zmin = zmin, zmax = zmax,
      dataFontSize = dataFontSize, tickFontSize = tickFontSize, tickAngle = tickAngle)
}



In [12]:
/**
  * Generates the heatmap markup with `for_makeHeatmap` and displays it through the kernel.
  *
  * Takes the same parameters as `for_makeHeatmap`, plus the implicit `kernel`
  * used for the display call.
  */
def makeHeatmap(df: DataFrame, label: String, features: Seq[String], 
div: String, title: String, left: Int = 300, top: Int = 300,
 width: Int = 1000, height: Int = 1000, labels: Boolean = true, 
 logData: Boolean = false, logAnnotations: Boolean = false, 
 colorScale: String = "RdBu", inverse: Boolean = false, showScale: Boolean = true, 
 zmin: Double = 0.0, zmax: Double = 0.0, names: List[String] = List.empty[String], 
//dataFontSize: Int = 16, tickFontSize: Int = 24, tickAngle: String = "auto"
dataFontSize: Int = 18, tickFontSize: Int = 20, tickAngle: String = "-30"
)(implicit kernel: KernelRuntime) = {
    val markup = for_makeHeatmap(
      df = df, label = label, features = features, div = div, title = title,
      left = left, top = top, width = width, height = height,
      labels = labels, logData = logData, logAnnotations = logAnnotations,
      colorScale = colorScale, inverse = inverse, showScale = showScale,
      zmin = zmin, zmax = zmax, names = names,
      dataFontSize = dataFontSize, tickFontSize = tickFontSize, tickAngle = tickAngle)
    kernel.display.html(markup)
}

In [13]:
// Load plotly.js from the CDN so that the generated heatmap scripts can call Plotly.newPlot.
html("""<script src="https://cdn.plot.ly/plotly-latest.js" charset="utf-8"></script><h1>Activate plotly!</h1>""")
//plotly()

## Paths



In [15]:
// General paths
val dataPath = "/data/"
val databasesPath = s"${dataPath}databases/"
val resultsPath = s"${dataPath}results/"
val whalePath = s"${resultsPath}gray-whale/"
// Indexes
val indexesPath = s"${dataPath}indexes/"
val reactomePath = s"${indexesPath}reactome/"
// Expression paths
val expressionsPath = s"${whalePath}Expressions/"
val unirefPath = s"${expressionsPath}uniref90/"
// Comparison folder
val comparisonsPath = s"${expressionsPath}Comparisons/"
val comparisonsUniref = s"${comparisonsPath}uniref90_comparisons/"

val annotationsPath = s"${comparisonsPath}annotations/"
// GO paths
val byGoPath = s"${expressionsPath}GO/"
val grouped = s"${byGoPath}grouped/updated/"
val ranked = s"${grouped}ranked/"

val figuresPath = s"${expressionsPath}Figures/"
val comparison = s"${byGoPath}gray_whale_with_bowhead_with_minke_with_NMR_with_human_with_mouse_with_cow_full_outer_counts_extended.tsv"

//val goTable    = spark.readTSV(comparison, true)
// Ranked GO tables, one per GO namespace, read from the `grouped` folder.
val components = spark.readTSV(s"${grouped}by_cellular_component_ranked.tab", true)
val processes  = spark.readTSV(s"${grouped}by_biological_process_ranked.tab", true)
val functions  = spark.readTSV(s"${grouped}by_molecular_function_ranked.tab", true)

In [17]:
import scala.collection.immutable._
 // Longevity value per species, in insertion order.
 // NOTE(review): presumably maximum lifespan in years — confirm the source of these numbers.
 // The explicit type and the `50.0` literal keep the values typed as Double; a bare Int
 // literal among the Doubles makes the value type infer as AnyVal.
 val longevity: ListMap[String, Double] = ListMap(
    "human" -> 122.5,
    "bowhead whale" -> 211.0,
    "gray whale" -> 77.0,
    "minke whale" -> 50.0,
    "brandt's bat"   -> 41.0,
    "naked mole rat" -> 31.0,
    "cow" -> 20.0,
    "house mouse" -> 4.0    )
// Column name -> display name, liver samples first, then kidney samples.
// Commented-out entries are replicate samples that are currently excluded.
val featuresMap: ListMap[String, String] = ListMap(
  // liver
  "bowhead_whale_liver"   -> "Bowhead whale liver",
  "gray_whale_liver"      -> "Gray whale liver",
  "minke_liver"           -> "Minke whale liver",
  "human_liver"           -> "Human liver",
  "bat_liver_1"           -> "Bat liver", // "Bat 1 liver"
  // "bat_liver_2" -> "Bat 2 liver",
  "NMR_liver"             -> "Naked mole rat liver",
  "cow_totalRNA_liver"    -> "Cow liver", // "Cow 1 liver"
  // "cow_mRNA_liver" -> "Cow 2 liver",
  "mouse_totalRNA_liver"  -> "Mouse liver", // "Mouse 1 liver"
  // "mouse_mRNA_liver" -> "Mouse 2 liver",
  // kidney
  "bowhead_whale_kidney"  -> "Bowhead whale kidney",
  "gray_whale_kidney"     -> "Gray whale kidney",
  "minke_kidney"          -> "Minke whale kidney",
  "human_kidney"          -> "Human kidney",
  "bat_kidney_1"          -> "Bat kidney", // "Bat 1 kidney"
  // "bat_kidney_2" -> "Bat 2 kidney",
  "NMR_kidney"            -> "Naked mole rat kidney",
  "cow_totalRNA_kidney"   -> "Cow kidney", // "Cow 1 kidney"
  // "cow_mRNA_kidney" -> "Cow 2 kidney",
  "mouse_totalRNA_kidney" -> "Mouse kidney" // "Mouse 1 kidney"
  // "mouse_mRNA_kidney" -> "Mouse 2 kidney"
)

// Display names, in map order, split by tissue.
val prettyFeatures = featuresMap.values.toList
val prettyLiverFeatures = prettyFeatures.filter(name => name.contains("liver"))
val prettyKidneyFeatures = prettyFeatures.filter(name => name.contains("kidney"))

// Raw column names, in map order, split by tissue.
val featuresOrdered = featuresMap.keys.toList
val liverFeatures = featuresOrdered.filter(name => name.contains("liver"))
val kidneyFeatures = featuresOrdered.filter(name => name.contains("kidney"))

/** Renames the feature columns to their display names and selects `cols` followed by all display-name columns. */
def prettyName(df: DataFrame, cols: Seq[String]) = {
  val renamed = df.rename(featuresMap)
  val first = cols.head
  val remaining = cols.tail ++ prettyFeatures
  renamed.select(first, remaining: _*)
}

// Rank column name -> display name, liver samples first, then kidney samples.
// Commented-out entries are replicate samples that are currently excluded.
val featuresMapRanked: ListMap[String, String] = ListMap(
  // liver
  "bowhead_whale_liver_rank"   -> "Bowhead whale liver",
  "gray_whale_liver_rank"      -> "Gray whale liver",
  "minke_liver_rank"           -> "Minke whale liver",
  "human_liver_rank"           -> "Human liver",
  "bat_liver_1_rank"           -> "Bat liver", // "Bat 1 liver"
  // "bat_liver_2_rank" -> "Bat 2 liver",
  "NMR_liver_rank"             -> "Naked mole rat liver",
  "cow_totalRNA_liver_rank"    -> "Cow liver", // "Cow 1 liver"
  // "cow_mRNA_liver_rank" -> "Cow 2 liver",
  "mouse_totalRNA_liver_rank"  -> "Mouse liver", // "Mouse 1 liver"
  // "mouse_mRNA_liver_rank" -> "Mouse 2 liver",
  // kidney
  "bowhead_whale_kidney_rank"  -> "Bowhead whale kidney",
  "gray_whale_kidney_rank"     -> "Gray whale kidney",
  "minke_kidney_rank"          -> "Minke whale kidney",
  "human_kidney_rank"          -> "Human kidney",
  "bat_kidney_1_rank"          -> "Bat kidney", // "Bat 1 kidney"
  // "bat_kidney_2_rank" -> "Bat 2 kidney",
  "NMR_kidney_rank"            -> "Naked mole rat kidney",
  "cow_totalRNA_kidney_rank"   -> "Cow kidney", // "Cow 1 kidney"
  // "cow_mRNA_kidney_rank" -> "Cow 2 kidney",
  "mouse_totalRNA_kidney_rank" -> "Mouse kidney" // "Mouse 1 kidney"
  // "mouse_mRNA_kidney_rank" -> "Mouse 2 kidney"
)

// Display names of the rank columns, in map order, split by tissue.
val prettyFeaturesRanked = featuresMapRanked.values.toList
val prettyLiverFeaturesRanked = prettyFeaturesRanked.filter(name => name.contains("liver"))
val prettyKidneyFeaturesRanked = prettyFeaturesRanked.filter(name => name.contains("kidney"))

// Raw rank column names, in map order, split by tissue.
val featuresOrderedRanked = featuresMapRanked.keys.toList
val liverFeaturesRanked = featuresOrderedRanked.filter(name => name.contains("liver"))
val kidneyFeaturesRanked = featuresOrderedRanked.filter(name => name.contains("kidney"))

/** Looks up the display name of a raw feature column in `featuresMap`. */
def rename(str: String) = featuresMap.apply(str)
val renameUDF = udf((name: String) => rename(name))

In [79]:
  //val goTable    = spark.readTSV(comparison, true)
  // Ranked GO tables, one per GO namespace, read from the `grouped` folder.
  val components = spark.readTSV(s"${grouped}by_cellular_component_ranked.tab", true)
  val processes  = spark.readTSV(s"${grouped}by_biological_process_ranked.tab", true)
  val functions  = spark.readTSV(s"${grouped}by_molecular_function_ranked.tab", true)

In [18]:
// Biological-process rows restricted to the selected feature columns, nulls replaced
// by 0.0, sorted by the bowhead whale column in descending order.
val all_pro = processes.select("go", featuresOrdered:_*).na.fill(0.0).sort($"bowhead_whale_liver".desc)
val liver_pro = processes.select("go", liverFeatures:_*).na.fill(0.0).sort($"bowhead_whale_liver".desc)
val kidney_pro = processes.select("go", kidneyFeatures:_*).na.fill(0.0).sort($"bowhead_whale_kidney".desc)
// Ranked counterparts: these select the `*_rank` columns. `all_pro_ranked` used to select
// `featuresOrdered`, which made it an exact copy of `all_pro`; it now uses
// `featuresOrderedRanked`, in line with the liver and kidney variants.
val all_pro_ranked = processes.select("go", featuresOrderedRanked:_*).na.fill(0.0).sort($"bowhead_whale_liver".desc)
val liver_pro_ranked = processes.select("go", liverFeaturesRanked:_*).na.fill(0.0).sort($"bowhead_whale_liver".desc)
val kidney_pro_ranked = processes.select("go", kidneyFeaturesRanked:_*).na.fill(0.0).sort($"bowhead_whale_kidney".desc)
all_pro.show(10, 10000)

+----------+-------------------+------------------+------------------+------------------+------------------+------------------+------------------+--------------------+--------------------+------------------+------------------+------------------+------------------+------------------+-------------------+---------------------+
|        go|bowhead_whale_liver|  gray_whale_liver|       minke_liver|       human_liver|       bat_liver_1|         NMR_liver|cow_totalRNA_liver|mouse_totalRNA_liver|bowhead_whale_kidney| gray_whale_kidney|      minke_kidney|      human_kidney|      bat_kidney_1|        NMR_kidney|cow_totalRNA_kidney|mouse_totalRNA_kidney|
+----------+-------------------+------------------+------------------+------------------+------------------+------------------+------------------+--------------------+--------------------+------------------+------------------+------------------+------------------+------------------+-------------------+---------------------+
|GO:0006412| 20034.802

In [19]:
Distributions of GO figures

In [20]:
/**
  * Joins the headerless category table at `path` (columns category, label, count, go) with the
  * biological processes that have a non-zero gray whale liver or kidney value, and aggregates
  * per category: number of matching GO terms plus the liver and kidney sums.
  */
def whaleBy(path: String): DataFrame = {
    val categories = spark.readTSV(path, false).toDF("category", "label", "count", "go")
    val presentInWhale = $"gray_whale_liver" =!= 0.0 || $"gray_whale_kidney" =!= 0.0
    val whaleProcesses = processes
      .select("go", "gray_whale_liver", "gray_whale_kidney")
      .where(presentInWhale)
    categories
      .join(whaleProcesses, Seq("go"))
      .groupBy("category", "label", "count")
      .agg(
        count($"go").as("count_in_whale"),
        sum($"gray_whale_liver").as("liver_sum"),
        sum($"gray_whale_kidney").as("kidney_sum"))
}

In [21]:
/**
  * Joins the headerless category table at `path` (columns category, label, count, go) with the
  * cellular components that have a non-zero gray whale liver or kidney value, and aggregates
  * per category: number of matching GO terms plus the liver and kidney sums.
  */
def whaleCompBy(path: String): DataFrame = {
    val categories = spark.readTSV(path, false).toDF("category", "label", "count", "go")
    val presentInWhale = $"gray_whale_liver" =!= 0.0 || $"gray_whale_kidney" =!= 0.0
    val whaleComponents = components
      .select("go", "gray_whale_liver", "gray_whale_kidney")
      .where(presentInWhale)
    categories
      .join(whaleComponents, Seq("go"))
      .groupBy("category", "label", "count")
      .agg(
        count($"go").as("count_in_whale"),
        sum($"gray_whale_liver").as("liver_sum"),
        sum($"gray_whale_kidney").as("kidney_sum"))
}

In [22]:
// Preview the first ten rows of the cellular component table with wide columns.
components.limit(10).show(numRows = 10, truncate = 10000)

+----------+------------------+--------------------------------------------------------------+------------------+--------------+------------+-----------------+-----------------------+-----------------------+---------------------+-----------------------------------------------+------------------+-------------------+------------------+------------------+------------------+--------------------+------------------+------------------+------------------+------------------+------------------+------------------+--------------------+------------------+------------------+------------------+---------------------+------------------+-------------------+------------------+------------------+------------------+---------------------+------------------------+----------------+--------------+----------------+-------------------------+---------------------+-----------------------+-------------------+----------------+----------------+----------------------+-------------------------+-----------------+-------

In [23]:
/**
  * Joins the headerless category table at `path` (columns category, label, count, go) with the
  * molecular functions that have a non-zero gray whale liver or kidney value, and aggregates
  * per category: number of matching GO terms plus the liver and kidney sums.
  */
def whaleFunBy(path: String): DataFrame = {
    val categories = spark.readTSV(path, false).toDF("category", "label", "count", "go")
    val presentInWhale = $"gray_whale_liver" =!= 0.0 || $"gray_whale_kidney" =!= 0.0
    val whaleFunctions = functions
      .select("go", "gray_whale_liver", "gray_whale_kidney")
      .where(presentInWhale)
    categories
      .join(whaleFunctions, Seq("go"))
      .groupBy("category", "label", "count")
      .agg(
        count($"go").as("count_in_whale"),
        sum($"gray_whale_liver").as("liver_sum"),
        sum($"gray_whale_kidney").as("kidney_sum"))
}

Correlations
------------
Correlations between species

In [25]:
/** One non-nullable Double field per column name, in the given order. */
def doublesByColumns(columns: Seq[String]): List[StructField] =
  columns.toList.map(name => StructField(name, DoubleType, nullable = false))

/**
  * Turns a matrix into rows of the form (column name, value, value, ...).
  *
  * Row `r` is labelled with `columns(r)`; the number of names must equal the
  * number of matrix columns.
  */
def convertCorrellationMatrix(matrix: Matrix, columns: Seq[String]) = {
  require(columns.size == matrix.numCols)
  (0 until matrix.numRows).map { r =>
    val values = (0 until matrix.numCols).map(c => matrix(r, c))
    Row.fromSeq(columns(r) :: values.toList)
  }
}

/**
  * Expands a DataFrame whose rows each hold a single matrix into a table with a string
  * `column` field followed by one Double field per entry of `columns`.
  */
def transformCorrellationMatrix(dataFrame: DataFrame, columns: Seq[String])(implicit sparkSession: SparkSession): DataFrame = {
  val matrixRows = dataFrame.rdd.flatMap { case Row(matrix: Matrix) =>
    convertCorrellationMatrix(matrix, columns)
  }
  val schema = StructType(StructField("column", StringType, nullable = false) :: doublesByColumns(columns))
  sparkSession.createDataFrame(matrixRows, schema)
}
    
/**
  * Computes the Spearman correlation matrix of `columns` and returns it as a table
  * (see `transformCorrellationMatrix`).
  *
  * The assembled vectors are persisted while the correlation is computed and released
  * afterwards; previously the persisted data was never unpersisted, so every call left
  * another cached dataset behind.
  */
def spearmanCorrellation(dataFrame: DataFrame, columns: Seq[String])(implicit sparkSession: SparkSession): DataFrame = {
  val cor = dataFrame.toVectors(columns, "features").persist(StorageLevel.MEMORY_AND_DISK)
  try {
    // Correlation.corr computes the matrix eagerly, so the cached vectors are no longer
    // needed once it returns.
    val df = Correlation.corr(cor, "features", method = "spearman")
    transformCorrellationMatrix(df, columns)
  } finally {
    cor.unpersist()
  }
}

In [26]:
// Spearman correlations between all feature columns; the label column is renamed to `go`.
val all_cor = spearmanCorrellation(all_pro, featuresOrdered)(spark)
  .withColumnRenamed("column", "go")
all_cor.show(numRows = 10, truncate = 1000)

+--------------------+-------------------+------------------+------------------+------------------+------------------+------------------+------------------+--------------------+--------------------+------------------+------------------+------------------+------------------+------------------+-------------------+---------------------+
|                  go|bowhead_whale_liver|  gray_whale_liver|       minke_liver|       human_liver|       bat_liver_1|         NMR_liver|cow_totalRNA_liver|mouse_totalRNA_liver|bowhead_whale_kidney| gray_whale_kidney|      minke_kidney|      human_kidney|      bat_kidney_1|        NMR_kidney|cow_totalRNA_kidney|mouse_totalRNA_kidney|
+--------------------+-------------------+------------------+------------------+------------------+------------------+------------------+------------------+--------------------+--------------------+------------------+------------------+------------------+------------------+------------------+-------------------+---------------

In [27]:
// Placeholder element that the heatmap below is drawn into.
html("""<div id="all_cor"></div>""")
//div("all_cor")

In [28]:
// Heatmap of the correlations between all features, labelled with display names on both axes.
makeHeatmap(
  df = all_cor.withColumn("go", renameUDF($"go")),
  label = "go",
  features = all_cor.columns.tail.toList,
  div = "all_cor",
  title = "Correlations for liver and kidney",
  left = 400, top = 400, width = 1700, height = 1500,
  inverse = false,
  names = all_cor.columns.tail.toList.map(featuresMap(_)),
  zmin = 0.0, zmax = 1.0,
  dataFontSize = 18, tickFontSize = 20, tickAngle = "-30"
)
//html(str)

### Specific correlations

In [30]:
// Hand-picked GO term ids per topic; each list is used to filter rows by their `go` column.
val longevity_goes =  List("GO:0015986", "GO:0006364", "GO:0006413", "GO:0006614", "GO:0000184", "GO:0019083", "GO:0019068", "GO:0006955", "GO:0060271", "GO:2000785", "GO:0042073", "GO:0055007", "GO:1902017", "GO:0060828", "GO:0090102", "GO:0034067", "GO:0060122", "GO:0061512", "GO:0061351", "GO:0001736", "GO:0035735", "GO:0051642", "GO:0035845", "GO:0036372", "GO:0007165", "GO:0002474", "GO:0000050", "GO:0019731", "GO:0002227", "GO:0046034", "GO:0032259", "GO:0071268", "GO:0033353", "GO:0000096", "GO:0042745", "GO:0002439", "GO:0006950", "GO:0097284", "GO:0097176", "GO:1990000", "GO:0051014", "GO:2001269", "GO:0090527", "GO:0042989", "GO:0045010", "GO:1902174", "GO:0019430", "GO:0001938", "GO:0045060", "GO:1902166", "GO:0001961", "GO:0042104", "GO:0045738", "GO:1902546", "GO:0061481",  "GO:1902231", "GO:0032079", "GO:2001272", "GO:1905053","GO:0009435", "GO:0010941")
val dna_repair_goes = List("GO:0006281", "GO:0006283","GO:0000715","GO:0070911","GO:0033683","GO:0006296","GO:0006297","GO:0006294","GO:0000717","GO:0036297","GO:0045739","GO:1905053","GO:0006289","GO:0030091","GO:0045738","GO:0006282","GO:0006307","GO:0051103","GO:0043504","GO:0006284","GO:2000819","GO:0000724")
val hypoxia_goes = List("GO:0061418", "GO:1903298", "GO:0097411", "GO:0061428", "GO:0001666", "GO:0071456")
val autophagy_goes = List("GO:0006914","GO:0016236","GO:1904714","GO:2000785","GO:0000045","GO:0097352","GO:0016241","GO:0010507","GO:0010506")
val stress_response_goes = List("GO:0006979","GO:0034599","GO:1902236","GO:0051403","GO:0006950","GO:0097201","GO:1990441","GO:0070059","GO:0034976","GO:0051496","GO:1902176","GO:1990440","GO:0014898","GO:0031098","GO:0097501","GO:0043149","GO:0008631","GO:0071472","GO:0001306","GO:0032872")
// NOTE(review): this list repeats some ids (e.g. GO:0097284, GO:0043065, GO:0008630).
val apoptosis_goes = List("GO:0097284","GO:0043066","GO:2000352","GO:0006915","GO:0043065","GO:0051402","GO:0042981","GO:1902236","GO:0043154","GO:2001240","GO:1901029","GO:0097284","GO:0043524","GO:1903298","GO:0043065","GO:0070059","GO:0008630","GO:2001235","GO:2001269","GO:1902174","GO:1902166","GO:0002906","GO:0097191","GO:1902231","GO:2001272","GO:1902176","GO:0006919","GO:0072332","GO:0008637","GO:0043525","GO:1902230","GO:0010666","GO:0043280","GO:0097190","GO:0070234","GO:2001244","GO:0008631","GO:0043281","GO:2000426","GO:2001272","GO:0097194","GO:1900119","GO:0008630","GO:1902166","GO:1902231")
val ubiquitin_goes = List("GO:0043161","GO:0016579","GO:0000209","GO:0016567","GO:0031146","GO:0051437","GO:0051436","GO:0031397","GO:0031396","GO:0032436","GO:0006511","GO:0030433","GO:0031398","GO:0032435","GO:0070936","GO:0043328","GO:0070534","GO:0051865","GO:0006513","GO:0070979","GO:0043162")
val mitochondria_goes = List("GO:0006123","GO:0046034","GO:0032981","GO:1901029","GO:0007005","GO:0034551","GO:0051881","GO:0032543","GO:0006839","GO:0006122","GO:0047497","GO:1903955","GO:0008637","GO:0042775","GO:0046902","GO:0000422","GO:0090201","GO:0061732","GO:0010917","GO:0006850","GO:0010637","GO:0070125","GO:0051560")
val atp_goes = List("GO:0015986","GO:0015991","GO:0046034","GO:0032781","GO:0099132","GO:0070072","GO:2001171","GO:0006754","GO:0042776","GO:0042775","GO:0071318","GO:0042773","GO:0033198","GO:0043044","GO:2000984","GO:0015867","GO:2001170")
val nad_goes = List("GO:0019674", "GO:0006116","GO:0009435","GO:0006734")
val wounds_goes = List("GO:0044319","GO:0090303","GO:0042060","GO:0060055","GO:0061045","GO:0060054","GO:0061041","GO:0035313","GO:1903690")
val cellular = List("GO:0034362", "GO:0106003", "GO:0005814", "GO:0005902", "GO:0031514", "GO:0005929", "GO:0032420", "GO:0001750", "GO:0005801", "GO:0030992", "GO:0032391", "GO:0097546", "GO:0097542", "GO:0044292", "GO:1902636", "GO:0120105", "GO:0120104", "GO:0005628", "GO:0043332", "GO:0120106", "GO:0030479", "GO:0000142", "GO:0035692", "GO:0035693", "GO:0035578", "GO:0101003", "GO:0098993", "GO:0022626", "GO:0005940", "GO:0002197", "GO:0042565", "GO:0042272", "GO:0000835", "GO:0065010") 

In [31]:
/**
  * Spearman correlations between all feature columns, computed only over the biological
  * processes whose `go` id is in `list`. The result is cached and its label column is named `go`.
  */
def makeProCor(list: List[String]) = {
    val selected = processes
      .select("go", featuresOrdered:_*)
      .where($"go".isin(list:_*))
      .na.fill(0.0)
      .sort($"bowhead_whale_liver".desc)
    val correlations = spearmanCorrellation(selected, featuresOrdered)(spark)
    correlations.withColumnRenamed("column", "go").cache
}

In [32]:
/**
  * Draws a correlation table produced by `makeProCor` as a heatmap into the element with
  * id `name`, using display names for both the row labels and the column labels.
  */
def makeHeatCor(df: DataFrame, name: String,
 title: String, colorScale: String = "RdBu")(implicit kernel: KernelRuntime) = {
  val labelled = df.withColumn("go", renameUDF($"go"))
  val featureColumns = df.columns.tail.toList
  makeHeatmap(
    df = labelled, label = "go", features = featureColumns, div = name, title = title,
    left = 300, top = 400, width = 1800, height = 1400,
    inverse = false,
    names = featureColumns.map(featuresMap(_)),
    colorScale = colorScale,
    dataFontSize = 18, tickFontSize = 20, tickAngle = "-30"
  )(kernel)
}


In [33]:
// Preview the feature values of the longevity-related GO terms, highest bowhead whale liver first.
processes
  .select("go", featuresOrdered:_*)
  .where($"go".isin(longevity_goes:_*))
  .na.fill(0.0)
  .sort($"bowhead_whale_liver".desc)
  .show(numRows = 10, truncate = 1000)

+----------+-------------------+------------------+------------------+------------------+------------------+------------------+------------------+--------------------+--------------------+------------------+------------------+------------------+------------------+------------------+-------------------+---------------------+
|        go|bowhead_whale_liver|  gray_whale_liver|       minke_liver|       human_liver|       bat_liver_1|         NMR_liver|cow_totalRNA_liver|mouse_totalRNA_liver|bowhead_whale_kidney| gray_whale_kidney|      minke_kidney|      human_kidney|      bat_kidney_1|        NMR_kidney|cow_totalRNA_kidney|mouse_totalRNA_kidney|
+----------+-------------------+------------------+------------------+------------------+------------------+------------------+------------------+--------------------+--------------------+------------------+------------------+------------------+------------------+------------------+-------------------+---------------------+
|GO:0007165| 11456.244

In [34]:
// Correlations restricted to the longevity-related GO terms.
val longevity_cor = makeProCor(list = longevity_goes)
longevity_cor.show(numRows = 10, truncate = 1000)

+--------------------+-------------------+---------------------+------------------+-------------------+-------------------+-------------------+---------------------+--------------------+--------------------+--------------------+-------------------+-------------------+-------------------+-------------------+---------------------+---------------------+
|                  go|bowhead_whale_liver|     gray_whale_liver|       minke_liver|        human_liver|        bat_liver_1|          NMR_liver|   cow_totalRNA_liver|mouse_totalRNA_liver|bowhead_whale_kidney|   gray_whale_kidney|       minke_kidney|       human_kidney|       bat_kidney_1|         NMR_kidney|  cow_totalRNA_kidney|mouse_totalRNA_kidney|
+--------------------+-------------------+---------------------+------------------+-------------------+-------------------+-------------------+---------------------+--------------------+--------------------+--------------------+-------------------+-------------------+-------------------+------

In [35]:
// Placeholder element for the longevity correlation heatmap.
html("""<div id="longevity_cor"></div>""")

In [36]:
makeHeatCor(
  df = longevity_cor,
  name = "longevity_cor",
  title = "Longevity correlations for liver and kidney")

### DNA repair




In [38]:
// Correlations restricted to the DNA repair GO terms.
val dna_repair_cor = makeProCor(list = dna_repair_goes)
dna_repair_cor.show(numRows = 10, truncate = 1000)

+--------------------+-------------------+--------------------+--------------------+-------------------+-------------------+-------------------+--------------------+---------------------+--------------------+---------------------+--------------------+--------------------+-------------------+-------------------+--------------------+---------------------+
|                  go|bowhead_whale_liver|    gray_whale_liver|         minke_liver|        human_liver|        bat_liver_1|          NMR_liver|  cow_totalRNA_liver| mouse_totalRNA_liver|bowhead_whale_kidney|    gray_whale_kidney|        minke_kidney|        human_kidney|       bat_kidney_1|         NMR_kidney| cow_totalRNA_kidney|mouse_totalRNA_kidney|
+--------------------+-------------------+--------------------+--------------------+-------------------+-------------------+-------------------+--------------------+---------------------+--------------------+---------------------+--------------------+--------------------+----------------

In [39]:
// Placeholder element for the DNA repair correlation heatmap.
html("""<div id="dna_repair_cor"></div>""")

In [40]:
makeHeatCor(
  df = dna_repair_cor,
  name = "dna_repair_cor",
  title = "DNA repair correlations for liver and kidney")

### Hypoxia




In [42]:
// Correlations restricted to the hypoxia GO terms, plus the placeholder element for their heatmap.
val hypoxia_cor = makeProCor(list = hypoxia_goes)
html("""<div id="hypoxia_cor"></div>""")

In [43]:
makeHeatCor(
  df = hypoxia_cor,
  name = "hypoxia_cor",
  title = "Hypoxia correlations for liver and kidney")

### Autophagy




In [45]:
// Correlations restricted to the autophagy GO terms, plus the placeholder element for their heatmap.
val autophagy_cor = makeProCor(list = autophagy_goes)
html("""<div id="autophagy_cor"></div>""")

In [46]:
makeHeatCor(
  df = autophagy_cor,
  name = "autophagy_cor",
  title = "Autophagy correlations for liver and kidney")

### Stress response




In [48]:
// Correlations restricted to the stress response GO terms, their placeholder element and heatmap.
val stress_response_cor = makeProCor(list = stress_response_goes)
html("""<div id="stress_response_cor"></div>""")

makeHeatCor(
  df = stress_response_cor,
  name = "stress_response_cor",
  title = "Stress response correlations for liver and kidney")

### Apoptosis



In [51]:
// Correlations restricted to the apoptosis GO terms, plus the placeholder element for their heatmap.
val apoptosis_cor = makeProCor(list = apoptosis_goes)
html("""<div id="apoptosis_cor"></div>""")

In [52]:
makeHeatCor(
  df = apoptosis_cor,
  name = "apoptosis_cor",
  title = "Apoptosis correlations for liver and kidney")

In [53]:
Ubiquitin

In [54]:
// Correlations restricted to the ubiquitin GO terms, plus the placeholder element for their heatmap.
val ubiquitin_cor = makeProCor(list = ubiquitin_goes)
html("""<div id="ubiquitin_cor"></div>""")

In [55]:
makeHeatCor(
  df = ubiquitin_cor,
  name = "ubiquitin_cor",
  title = "Ubiquitin correlations for liver and kidney")

### Mitochondria



// Correlations restricted to the mitochondria GO terms, plus the placeholder element for their heatmap.
val mitochondria_cor = makeProCor(list = mitochondria_goes)
html("""<div id="mitochondria_cor"></div>""")

In [58]:
makeHeatCor(
  df = mitochondria_cor,
  name = "mitochondria_cor",
  title = "Mitochondria correlations for liver and kidney")

### ATP correlations



In [60]:
// Correlations restricted to the ATP GO terms, plus the placeholder element for their heatmap.
val atp_cor = makeProCor(list = atp_goes)
html("""<div id="atp_cor"></div>""")

In [61]:
makeHeatCor(
  df = atp_cor,
  name = "atp_cor",
  title = "ATP correlations for liver and kidney")

### NAD goes



In [63]:
// Correlations restricted to the NAD GO terms, plus the placeholder element for their heatmap.
val nad_cor = makeProCor(list = nad_goes)
html("""<div id="nad_cor"></div>""")

In [64]:
makeHeatCor(
  df = nad_cor,
  name = "nad_cor",
  title = "NAD correlations for liver and kidney")

### Wound healing correlations



In [66]:
// Correlations restricted to the wound healing GO terms, plus the placeholder element for their heatmap.
val wounds_cor = makeProCor(list = wounds_goes)
html("""<div id="wounds_cor"></div>""")

In [67]:
// makeHeatCor already displays the heatmap through the kernel, so its result must not be
// passed to html(...) again; this now matches every other correlation heatmap cell.
makeHeatCor(wounds_cor, "wounds_cor", "Wound healing correlations for liver and kidney" )

### GO figures



In [69]:
// Load the cross-species comparison table from the `comparison` path.
// NOTE(review): the second argument presumably means "file has a header row" — confirm against readTSV.
val goTable = spark.readTSV(comparison, true)

In [70]:
// GO id, label and every rank column of the biological process table, cached for reuse.
val all_pro_ranks = processes.select("go", "label" +: featuresOrderedRanked: _*).cache()

In [71]:
/** Rank columns (plus `go` and `label`) of the biological processes whose id is in `goes`,
  * sorted by ascending bowhead whale liver rank.
  */
def go_ranked(goes: Seq[String]) = {
  val rankColumns = "label" :: featuresOrderedRanked
  processes
    .where($"go".isin(goes:_*))
    .select("go", rankColumns:_*)
    .sort($"bowhead_whale_liver_rank".asc)
}

In [72]:
// Ranks of the longevity-related GO terms, plus the placeholder element for their heatmap.
val longevity_pro_ranks = go_ranked(goes = longevity_goes)
html("""<div id="longevity_pro_ranks"></div>""")

In [73]:
// Rank heatmap for the longevity-related GO terms (log-scaled colours, reversed colour scale).
makeHeatmap(
  df = longevity_pro_ranks,
  label = "label",
  features = featuresOrderedRanked,
  div = "longevity_pro_ranks",
  title = "Specific processes (by ranks)",
  left = 600, top = 500, width = 2000, height = 3000,
  inverse = true, logData = true, showScale = false,
  names = prettyFeaturesRanked,
  tickAngle = "-30")

## DNA repair ranks



In [75]:
// Ranks of the DNA repair GO terms, plus the placeholder element for their heatmap.
val dna_repair_goes_ranks = go_ranked(goes = dna_repair_goes)
html("""<div id='dna_repair_goes_ranks'></div>""")

In [76]:
// Rank heatmap for the DNA repair GO terms (log-scaled colours, reversed colour scale).
makeHeatmap(
  df = dna_repair_goes_ranks,
  label = "label",
  features = featuresOrderedRanked,
  div = "dna_repair_goes_ranks",
  title = "DNA repair (by ranks)",
  left = 600, top = 300, width = 1700, height = 1400,
  inverse = true, logData = true, showScale = true,
  names = prettyFeaturesRanked,
  dataFontSize = 18, tickFontSize = 18, tickAngle = "-30")

In [77]:
// Ranks of the autophagy GO terms, plus the placeholder element for their heatmap.
val autophagy_goes_ranks = go_ranked(goes = autophagy_goes)
div(id = "autophagy_goes_ranks")

In [78]:
// Rank heatmap for the autophagy GO terms (log-scaled colours, reversed colour scale).
makeHeatmap(
  df = autophagy_goes_ranks,
  label = "label",
  features = featuresOrderedRanked,
  div = "autophagy_goes_ranks",
  title = "Autophagy (by ranks)",
  left = 600, top = 300, width = 1700, height = 800,
  inverse = true, logData = true, showScale = false,
  names = prettyFeaturesRanked,
  dataFontSize = 18, tickFontSize = 20, tickAngle = "-30")