# histograms

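This notebook plots gray whale liver and kidney expression levels (TPM) for the top 500 transcripts and genes as Plotly bar charts, then loads the assembled transcriptome contigs and the predicted proteome.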

In [1]:
import org.apache.spark._
import org.apache.spark.sql.types._
import scala.reflect.runtime.universe._
import org.apache.spark.storage.StorageLevel
import org.apache.spark.rdd._
import org.apache.spark.sql._
import org.apache.spark.sql.functions._
import org.apache.spark.sql.expressions._
import group.research.aging.spark.extensions._
import group.research.aging.spark.extensions.functions._
import kernel.display.html

# Expression charts



In [3]:
// Transcript-level expression table: the liver and kidney TPM columns plus their
// arithmetic mean, sorted by that mean with the highest value first.
// NOTE(review): the second readTSV argument is presumably the "has header" flag — confirm
// against the extension library.
val gray_whale_expressions = {
  val liverTpm = col("gray_whale_liver_TPM")
  val kidneyTpm = col("gray_whale_kidney_TPM")
  val averageTpm = (liverTpm + kidneyTpm) / 2.0

  spark
    .readTSV("/data/results/gray-whale/Expressions/Transcripts/raw/gray_whale/gray_whale_unannotated.tsv", true)
    .withColumn("gray_whale_TPM_average", averageTpm)
    .select("gray_whale_liver_TPM", "gray_whale_kidney_TPM", "gray_whale_TPM_average")
    .orderBy(col("gray_whale_TPM_average").desc)
}
// Print the first 10 rows, truncating cell text at 1000 characters.
gray_whale_expressions.show(numRows = 10, truncate = 1000)

+--------------------+---------------------+----------------------+
|gray_whale_liver_TPM|gray_whale_kidney_TPM|gray_whale_TPM_average|
+--------------------+---------------------+----------------------+
|         5073.310514|          7749.827459|          6411.5689865|
|         1919.685425|           913.876522|          1416.7809735|
|         2714.853292|             8.304572|           1361.578932|
|          983.748362|          1110.571776|           1047.160069|
|         1240.863501|           811.640763|           1026.252132|
|         1499.576298|           173.077664|     836.3269809999999|
|         1539.723902|           120.987325|           830.3556135|
|          914.982057|           498.561784|     706.7719205000001|
|         1104.751215|           286.767514|           695.7593645|
|          206.599366|          1119.584019|     663.0916924999999|
+--------------------+---------------------+----------------------+
only showing top 10 rows



In [4]:
// Number of rows in the transcript expression table built above.
gray_whale_expressions.count()

10389

In [5]:
// Inject the Plotly.js bundle from the CDN so that later cells can call Plotly.newPlot.
// NOTE(review): the "plotly-latest" CDN alias is reportedly frozen at the last 1.x release —
// confirm whether a pinned version should be used instead.
html("""<script src="https://cdn.plot.ly/plotly-latest.js" charset="utf-8"></script><h1>Activate plotly!</h1>""")

In [6]:
// Natural log of the 500 highest transcript TPM values per tissue, in descending order.
// `take(500)` after `orderBy` lets Spark return only the top rows; the previous
// `.collect.take(500)` shipped the whole column to the driver before discarding most of it.
// Note: a TPM of 0 maps to -Infinity under log.
val liver_expressions = gray_whale_expressions
    .select("gray_whale_liver_TPM").as[Double]
    .orderBy($"gray_whale_liver_TPM".desc)
    .take(500).toVector
    .map(math.log)
val kidney_expressions = gray_whale_expressions
    .select("gray_whale_kidney_TPM").as[Double]
    .orderBy($"gray_whale_kidney_TPM".desc)
    .take(500).toVector
    .map(math.log)

In [15]:
// Emit the empty container element with id 'myDiv' that the transcript chart cell
// targets via Plotly.newPlot('myDiv', ...).
html("<div id='myDiv'></div><b>it works!</b>")

In [8]:
// Draw the log-scaled top-500 liver and kidney transcript values as two Plotly bar
// traces inside the element with id 'myDiv'.
// The Scala interpolator expands ${...} even inside JavaScript comments, so the old
// commented-out x1/x2 lines embedded both arrays a second time; they are removed.
// The <script> element is now explicitly closed.
html(
    s"""<script>
var data = [
  {
    y: ${liver_expressions.mkString("[", ", ", "]")},
    type: 'bar',
    name: 'gray_whale_liver_transcripts'
  },
  {
    y: ${kidney_expressions.mkString("[", ", ", "]")},
    type: 'bar',
    name: 'gray_whale_kidney_transcripts'
  }
];

Plotly.newPlot('myDiv', data);
</script>
    """
)

In [9]:
// Derives a gene identifier from a transcript identifier by dropping everything from
// the last underscore onwards (e.g. "c19144_g2_i1" -> "c19144_g2").
// A null input yields null, and an identifier without any underscore is returned unchanged;
// previously both cases threw inside the UDF and failed the whole job.
val make_gene = udf[String, String] { transcript =>
  Option(transcript).map { id =>
    val cut = id.lastIndexOf('_')
    if (cut < 0) id else id.substring(0, cut)
  }.orNull
}
// Gene-level expression table: per-gene sums of the transcript TPM values for each tissue,
// plus the mean of the two sums, sorted by that mean with the highest value first.
// NOTE(review): the second readTSV argument is presumably the "has header" flag — confirm.
val gray_whale_genes= spark.readTSV("/data/results/gray-whale/Expressions/Transcripts/raw/gray_whale/gray_whale_unannotated.tsv", true)
    .withColumn("gene",  make_gene($"transcript"))
    .select("gene","gray_whale_liver_TPM", "gray_whale_kidney_TPM")
    .groupBy($"gene")
    .agg(
      sum($"gray_whale_liver_TPM").as("gray_whale_liver_TPM"),
      sum($"gray_whale_kidney_TPM").as("gray_whale_kidney_TPM")
    )
    .withColumn("gray_whale_TPM_average", ($"gray_whale_liver_TPM" + $"gray_whale_kidney_TPM" ) / 2.0)
    .orderBy($"gray_whale_TPM_average".desc)
// Print the first 30 rows, truncating cell text at 100 characters.
gray_whale_genes.show(30,100)

+---------+--------------------+---------------------+----------------------+
|     gene|gray_whale_liver_TPM|gray_whale_kidney_TPM|gray_whale_TPM_average|
+---------+--------------------+---------------------+----------------------+
|c19144_g2|         5073.310514|          7749.827459|          6411.5689865|
|c31983_g1|         1919.685425|           913.876522|          1416.7809735|
| c3812_g2|         2714.853292|             8.304572|           1361.578932|
| c4211_g1|         1748.832417|           902.416853|    1325.6246350000001|
|c19059_g3|  1710.0973800000002|           901.457727|          1305.7775535|
|c71306_g1|          983.748362|          1110.571776|           1047.160069|
| c8949_g1|         1499.576298|           173.077664|     836.3269809999999|
|c71268_g1|         1539.723902|           120.987325|           830.3556135|
|c18980_g3|         1104.751215|           286.767514|           695.7593645|
|c32069_g1|          206.599366|          1119.584019|     663.0

In [10]:
// The 500 highest gene-level TPM sums per tissue, in descending order (raw TPM, no log
// transform). `take(500)` after `orderBy` fetches only the top rows; the previous
// `.collect.take(500)` brought the whole column to the driver first.
val liver_genes_expressions = gray_whale_genes
    .select("gray_whale_liver_TPM").as[Double]
    .orderBy($"gray_whale_liver_TPM".desc)
    .take(500).toVector
val kidney_genes_expressions = gray_whale_genes
    .select("gray_whale_kidney_TPM").as[Double]
    .orderBy($"gray_whale_kidney_TPM".desc)
    .take(500).toVector

In [11]:
// Emit the empty container element with id 'genes' that the gene chart cell
// targets via Plotly.newPlot('genes', ...).
html("<div id='genes'></div><b>it works!</b>")

In [12]:
// Draw the top-500 liver and kidney gene-level TPM values as two Plotly bar traces
// inside the element with id 'genes'. The <script> element is now explicitly closed.
html(
    s"""<script>

var data = [
  {
    y: ${liver_genes_expressions.mkString("[", ", ", "]")},
    type: 'bar',
    name: 'gray whale top 500 liver genes (TPM)'
  },
  {
    y: ${kidney_genes_expressions.mkString("[", ", ", "]")},
    type: 'bar',
    name: 'gray whale top 500 kidney genes (TPM)'
  }
];

Plotly.newPlot('genes', data);
</script>
    """
)

## Contigs




In [16]:
import org.bdgenomics.adam._
import org.bdgenomics.adam.rdd.ADAMContext._
// Load the Trinity-assembled contigs from FASTA as DNA sequences through the ADAM
// context extensions, then preview them.
val transcriptome =
  spark.sparkContext.loadDnaSequences("/data/indexes/GRAY_WHALE/Trin_Mitya.Trinity.fasta")
// First 10 records, truncating cell text at 1000 characters.
transcriptome.dataset.show(numRows = 10, truncate = 1000)

+------------+-------------------------------------------------------------------------------------------------------------------------------+--------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [17]:
// Load the TransDecoder peptide FASTA as protein sequences, then preview them.
val proteome =
  spark.sparkContext.loadFastaProtein("/data/results/gray-whale/diamond/blastx/transdecoder/previous/Trinity.fasta.transdecoder.pep")
// First 10 records, truncating cell text at 100 characters.
proteome.dataset.show(numRows = 10, truncate = 100)

+-------------------------------------------------------+----------------------------------------------------------------------------------------------------+--------+----------------------------------------------------------------------------------------------------+------+----------+
|                                                   name|                                                                                         description|alphabet|                                                                                            sequence|length|attributes|
+-------------------------------------------------------+----------------------------------------------------------------------------------------------------+--------+----------------------------------------------------------------------------------------------------+------+----------+
|  comp558155_c0::comp558155_c0_seq1::g.129975::m.129975|comp558155_c0::comp558155_c0_seq1::g.129975  ORF type:complete len:316 (+),score=7

In [18]:
// Pair of counts over records whose description contains "ORF type:complete":
// (all such sequences, those shorter than 90 characters).
locally {
  val completeOrfSequences = proteome.dataset
    .where($"description".contains("ORF type:complete"))
    .select("sequence")

  val total = completeOrfSequences.count
  val shorterThan90 = completeOrfSequences.as[String].filter(seq => seq.size < 90).count
  (total, shorterThan90)
}

(83922,41)

In [19]:
// Protein sequences shorter than 201 characters.
// `count` on a string requires a predicate argument, so `str.count < 201` does not
// compile; the comparison is on the sequence length.
proteome.dataset.select("sequence").as[String].filter(str => str.length < 201)