# Well-annotated species workflow

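This notebook loads the UniProt ID-mapping table, filters it to cow (NCBI taxon 9913), and links cow liver transcript IDs to their UniRef90 clusters.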

In [3]:
import org.apache.spark._
import org.apache.spark.sql.types._
import scala.reflect.runtime.universe._
import org.apache.spark.storage.StorageLevel
import org.apache.spark.rdd._
import org.apache.spark.sql._
import org.apache.spark.sql.functions._
import org.apache.spark.sql.expressions._
import group.research.aging.spark.extensions._
import group.research.aging.spark.extensions.functions._
import kernel.display.html

In [6]:
// Root folder of the gray-whale results; every other path below is built from it.
val whalePath = "/data/results/gray-whale/"

// Expression data and its sub-folders.
val expressionsPath = s"${whalePath}Expressions/"
val unirefPath = s"${expressionsPath}uniref90/"
val transcriptsPath = s"${expressionsPath}Transcripts/"
val codingPath = s"${transcriptsPath}coding/"

// Comparison folders located under the expressions directory.
val comparisonsPath = s"${expressionsPath}Comparisons/"
val comparisonsUniref = s"${comparisonsPath}uniref90_comparisons/"
val annotationsPath = s"${comparisonsPath}annotations/"

In [4]:
/**
  * Reads a TSV file with a header row and keeps its `Name`, `NumReads` and `TPM`
  * columns under new names.
  *
  * @param subpath file location; used as-is when it starts with "/", otherwise it is
  *                appended to `transcriptsPath`
  * @param prefix  label placed in front of the `_reads` and `_TPM` column names
  * @return the cached selection with columns `transcript`, `<prefix>_reads`, `<prefix>_TPM`
  */
def loadTranscripts(subpath: String, prefix: String) = {
    val isAbsolute = subpath.startsWith("/")
    val location = if (isAbsolute) subpath else transcriptsPath + subpath
    val quantification = spark.readTSV(location, header = true)
    quantification.select(
        $"Name".as("transcript"),
        $"NumReads".as(prefix + "_reads"),
        $"TPM".as(prefix + "_TPM")
    ).cache
}

In [9]:
import org.apache.spark.sql.functions.udf
/**
  * Drops everything from the first '.' onwards, e.g. "ENSBTAT00000064726.1"
  * becomes "ENSBTAT00000064726".
  *
  * @param str identifier that may carry a dotted suffix; may be null when the
  *            function is invoked through a Spark UDF on a NULL cell
  * @return the part of `str` before the first '.', `str` itself when it has no
  *         '.', or null when `str` is null
  */
def undot(str: String): String =
  if (str == null) null // let SQL NULLs pass through the UDF instead of throwing an NPE
  else str.indexOf('.') match {
    // No dot: indexOf yields -1 and substring(0, -1) would throw, so keep the id as is.
    case -1 => str
    case dotIndex => str.substring(0, dotIndex)
  }
/** Returns the distinct values of the `uniref90` column of `df`. */
def uni(df: DataFrame) = {
  val unirefOnly = df.select("uniref90")
  unirefOnly.distinct
}
// Spark UDF wrapper around `undot`, so it can be applied to a string column.
val undotFun = udf[String, String]((identifier: String) => undot(identifier))

In [5]:
// Load the UniProt idmapping_selected.tab table and name its 22 columns in file order.
val mapping = {
    val columnNames = Seq(
        "UniProtKB-AC", "UniProtKB-ID", "Entrez", "RefSeq", "GI", "PDB", "GO",
        "UniRef100", "UniRef90", "UniRef50", "UniParc", "PIR",
        "NCBI-taxon", "MIM", "UniGene", "PubMed",
        "EMBL", "EMBL-CDS", "Ensembl", "Ensembl_TRS", "Ensembl_PRO", "Additional PubMed"
    )
    spark.readTSV("/data/indexes/uniprot/idmapping_selected.tab").toDF(columnNames: _*)
}
// Preview the first 20 rows, truncating cells longer than 1000 characters.
mapping.limit(20).show(numRows = 20, truncate = 1000)

+------------+------------+-------+-----------+-------------------------------+----+----------------------------------+----------------+---------------+---------------+-------------+----+----------+----+-------+------------------+--------+----------+-------+-----------+-----------+-----------------+
|UniProtKB-AC|UniProtKB-ID| Entrez|     RefSeq|                             GI| PDB|                                GO|       UniRef100|       UniRef90|       UniRef50|      UniParc| PIR|NCBI-taxon| MIM|UniGene|            PubMed|    EMBL|  EMBL-CDS|Ensembl|Ensembl_TRS|Ensembl_PRO|Additional PubMed|
+------------+------------+-------+-----------+-------------------------------+----+----------------------------------+----------------+---------------+---------------+-------------+----+----------+----+-------+------------------+--------+----------+-------+-----------+-----------+-----------------+
|      Q6GZX4|  001R_FRG3G|2947773|YP_031579.1|             81941549; 49237298|null|             

In [1]:
// Keep only the mapping rows whose NCBI-taxon equals "9913" and cache the result.
val cow_mapping = {
    val isCowTaxon = $"NCBI-taxon" === "9913"
    mapping.filter(isCowTaxon).cache
}
// Preview the first 20 rows.
cow_mapping.limit(20).show(numRows = 20)

+------------+------------+------+--------------------+--------------------+--------------------+--------------------+----------------+-----------------+-----------------+-------------+--------------+----------+----+-------+--------------------+--------------------+--------------------+------------------+------------------+------------------+--------------------+
|UniProtKB-AC|UniProtKB-ID|Entrez|              RefSeq|                  GI|                 PDB|                  GO|       UniRef100|         UniRef90|         UniRef50|      UniParc|           PIR|NCBI-taxon| MIM|UniGene|              PubMed|                EMBL|            EMBL-CDS|           Ensembl|       Ensembl_TRS|       Ensembl_PRO|   Additional PubMed|
+------------+------------+------+--------------------+--------------------+--------------------+--------------------+----------------+-----------------+-----------------+-------------+--------------+----------+----+-------+--------------------+--------------------+--

In [2]:
// Load the cow liver and kidney quant.sf files; both paths are relative to transcriptsPath.
val cow_liver_tr = loadTranscripts(subpath = "raw/cow/liver/GSM1020724/quant.sf", prefix = "cow_liver")
val cow_kidney_tr = loadTranscripts(subpath = "raw/cow/kidney/GSM1020723/quant.sf", prefix = "cow_kidney")
// Row counts of the two tables, as a (liver, kidney) pair.
(cow_liver_tr.count, cow_kidney_tr.count)

(22904,22904)

In [7]:
// Print the first 10 liver rows, truncating cells longer than 1000 characters.
cow_liver_tr.show(numRows = 10, truncate = 1000)

+--------------------+---------------+-------------+
|          transcript|cow_liver_reads|cow_liver_TPM|
+--------------------+---------------+-------------+
|ENSBTAT00000064726.1|         2168.0|    41.310949|
|ENSBTAT00000030504.3|         1474.0|    33.032692|
|ENSBTAT00000004603.5|            3.0|     0.031527|
|ENSBTAT00000066297.1|         1197.0|    22.513724|
|ENSBTAT00000054517.2|            0.0|          0.0|
|ENSBTAT00000052281.2|            0.0|          0.0|
|ENSBTAT00000056197.2|            0.0|          0.0|
|ENSBTAT00000052768.1|            0.0|          0.0|
|ENSBTAT00000015780.3|         1333.0|    11.552155|
|ENSBTAT00000049620.3|          159.0|     0.804058|
+--------------------+---------------+-------------+
only showing top 10 rows



In [8]:
// Cut each liver transcript id at its first '.', join the ids to the cow mapping on
// Ensembl_TRS, and keep the distinct (Ensembl_TRS, Uniref90) pairs.
val cow_trans = {
    val liverTranscriptIds = cow_liver_tr
        .withColumn("Ensembl_TRS", undotFun($"transcript"))
        .select($"Ensembl_TRS")
    liverTranscriptIds
        .join(cow_mapping, Seq("Ensembl_TRS"))
        .select("Ensembl_TRS", "Uniref90")
        .distinct
}
cow_trans.limit(20).show(numRows = 10, truncate = 1000)

+------------------+---------------+
|       Ensembl_TRS|       Uniref90|
+------------------+---------------+
|ENSBTAT00000018700|UniRef90_Q5E9P1|
|ENSBTAT00000018595|UniRef90_P41500|
|ENSBTAT00000000651|UniRef90_Q2HJE0|
|ENSBTAT00000025670|UniRef90_Q1LZG6|
|ENSBTAT00000025465|UniRef90_A0JNF3|
|ENSBTAT00000008771|UniRef90_Q8TAL6|
|ENSBTAT00000016996|UniRef90_Q2KIF1|
|ENSBTAT00000003959|UniRef90_P59768|
|ENSBTAT00000001782|UniRef90_Q0P5M8|
|ENSBTAT00000003033|UniRef90_O08908|
+------------------+---------------+
only showing top 10 rows



In [10]:
// Rename the column to the lower-case name `uni` selects on, then cache the distinct ids.
val cow_uni = {
    val renamed = cow_trans.withColumnRenamed("Uniref90", "uniref90")
    uni(renamed).cache
}
cow_uni.limit(10).show(numRows = 10, truncate = 1000)

+---------------+
|       uniref90|
+---------------+
|UniRef90_Q14872|
|UniRef90_F1MQK6|
|UniRef90_Q1RMI9|
|UniRef90_G3N1M0|
|UniRef90_O15379|
|UniRef90_O75899|
|UniRef90_F1MZB7|
|UniRef90_A6QNR9|
|UniRef90_O43692|
|UniRef90_Q2KHU0|
+---------------+



In [11]:
// Declares cow_mapping again: the mapping rows whose NCBI-taxon equals "9913", cached.
val cow_mapping = {
    val isCowTaxon = $"NCBI-taxon" === "9913"
    mapping.filter(isCowTaxon).cache
}
//cow_mapping.writeTSV("/data/indexes/uniprot/species/cow_mapping.tsv", header = true)
// Preview the first 20 rows.
cow_mapping.limit(20).show(numRows = 20)

+------------+------------+------+--------------------+--------------------+--------------------+--------------------+----------------+-----------------+-----------------+-------------+--------------+----------+----+-------+--------------------+--------------------+--------------------+------------------+------------------+------------------+--------------------+
|UniProtKB-AC|UniProtKB-ID|Entrez|              RefSeq|                  GI|                 PDB|                  GO|       UniRef100|         UniRef90|         UniRef50|      UniParc|           PIR|NCBI-taxon| MIM|UniGene|              PubMed|                EMBL|            EMBL-CDS|           Ensembl|       Ensembl_TRS|       Ensembl_PRO|   Additional PubMed|
+------------+------------+------+--------------------+--------------------+--------------------+--------------------+----------------+-----------------+-----------------+-------------+--------------+----------+----+-------+--------------------+--------------------+--