# lags protein base

This is a text cell. Start editing!

In [1]:
import org.apache.spark._
import org.apache.spark.sql.types._
import scala.reflect.runtime.universe._
import org.apache.spark.storage.StorageLevel
import org.apache.spark.rdd._
import org.apache.spark.sql._
import org.apache.spark.sql.functions._
import org.apache.spark.sql.expressions._
import group.research.aging.spark.extensions._
import group.research.aging.spark.extensions.functions._
import kernel.display.html
import org.bdgenomics.adam.rdd.ADAMContext._

In [70]:
import org.apache.spark.sql.functions.udf
def undot(str: String): String = str.substring(0, str.lastIndexOf("."))
def uni(df: DataFrame) = df.select("uniref90").distinct
val undotFun = udf[String, String](undot)

In [2]:
"qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore".split(" ").mkString("[\"","\",\"","\"]")

In [3]:
val columns = List(
        "qseqid",  // Query Seq - id\n",
        "qlen",  // Query sequence length\n",
        "sseqid",  // Subject Seq - id\n","    //"sallseqid",  // All subject Seq - id(s), separated by a ';'\n",
        "slen",  // Subject sequence length\n",
        "qstart",  // Start of alignment in query\n",
        "qend",  // End of alignment in query\n",
        "sstart",  // Start of alignment in subject\n",
        "send",  // End of alignment in subject\n",
        "qseq",  // Aligned part of query sequence\n",
        "sseq",  // Aligned part of subject sequence\n",
        "evalue",  // Expect value\n",
        "bitscore",  // Bit score\n",
        "score",  // Raw score\n",
        "length",  // Alignment length\n",
        "pident",  // Percentage of identical matches\n",
        "nident",  // Number of identical matches\n",
        "mismatch",  // Number of mismatches\n",
        "positive",  // Number of positive - scoring matches\n",
        "gapopen",  // Number of gap openings\n",
        "gaps",  // Total number of gaps\n",
        "ppos",  // Percentage of positive - scoring matches\n",
        "qframe",  // Query frame\n",
        "btop"  // Blast traceback operations(BTOP)\n",
          //  "staxids",  // unique Subject Taxonomy ID(s), separated by a ';' (in numerical order)\n",
          //  "stitle",  // Subject Title\n",
          //  "salltitles",  // All Subject Title(s), separated by a '<>'\n",
          //  "qcovhsp",  // Query Coverage Per HSP\n",
          //  "qtitle" // Query title\n",
        )

In [4]:
columns.size

23

In [5]:
//val columns = List("gene", "transcript", "identity", "aligment_length", "mismatches", "gaps", "start_query", "end_query", "start_target", "end_target") //last two non valid

val lags_path = "/data/results/gray-whale/diamond/blastp/lags"
val lags_in_bowhead = spark.readTSV(lags_path + "/lags_in_bowhead.m8").toDF(columns :_*)
val lags_in_graywhale = spark.readTSV(lags_path + "/lags_in_graywhale.m8").toDF(columns:_*)
val lags_in_minkewhale = spark.readTSV(lags_path + "/lags_in_minkewhale.m8").toDF(columns:_*)
val lags_in_NMR  = spark.readTSV(lags_path + "/lags_in_NMR.m8").toDF(columns:_*)
val lags_in_bat  = spark.readTSV(lags_path + "/lags_in_bat.m8").toDF(columns:_*)

val lags_in_cow  = spark.readTSV(lags_path + "/lags_in_cow.m8").toDF(columns:_*)
val lags_in_mouse  = spark.readTSV(lags_path + "/lags_in_mouse.m8").toDF(columns:_*)
val lags_in_human  = spark.readTSV(lags_path + "/lags_in_human.m8")  .toDF(columns:_*)


In [7]:
val genage_conversion = spark.readTSV("/data/databases/genage/genage_conversion.tsv", header = true).distinct
genage_conversion.show(10,1000)

+------+------------------------+------------------+------------------+-----------+---------------+-------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|Entrez|                Organism|           Ensembl|           Protein|Gene Symbol|Lifespan Effect|Longevity Influence|                                                                                                                                                                            Method|
+------+------------------------+------------------+------------------+-----------+---------------+-------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|852260|Saccharomyces cerevisiae|           YBL021C|           YBL021C|       HAP3|       decrease|    

In [8]:
val conv = genage_conversion.where(length($"Longevity Influence") <100).groupBy("Organism", "Longevity Influence")
.agg(countDistinct("Ensembl").as("Total Genes"), countDistinct("Protein").as("Total Proteins")).orderBy($"Organism".asc, $"Longevity Influence".asc)
conv.show(10,1000)

+-----------------------+-------------------+-----------+--------------+
|               Organism|Longevity Influence|Total Genes|Total Proteins|
+-----------------------+-------------------+-----------+--------------+
| Caenorhabditis elegans|               anti|        576|          1444|
| Caenorhabditis elegans|               none|          3|            10|
| Caenorhabditis elegans|                pro|        288|           743|
|            Danio rerio|                pro|          1|             3|
|Drosophila melanogaster|               anti|         74|           245|
|Drosophila melanogaster|               none|          2|             5|
|Drosophila melanogaster|                pro|        122|           381|
|           Mus musculus|               anti|         45|           152|
|           Mus musculus|               none|          2|             6|
|           Mus musculus|                pro|         78|           310|
+-----------------------+-------------------+------

In [9]:
def byMaxScore(df: DataFrame, threshold: Double = 30.0) = {    
    val d = df.where($"ppos" >=  threshold)//.select("qseqid", "sseqid", "score").distinct
    //val windowSequence = Window.partitionBy(d("sseqid")).orderBy(d("score").desc)
    //val ds = d.withColumn("rn", row_number.over(windowSequence)).where($"rn" === 1).drop("rn")
    val ds = d
    val windowQuery = Window.partitionBy(ds("qseqid")).orderBy(ds("score").desc)
    ds.withColumn("rn", row_number.over(windowQuery)).where($"rn" === 1).drop("rn")
}

def byMaxScoreExt(df: DataFrame, threshold: Double = 30.0) = byMaxScore(df, threshold)
 .withColumnRenamed("qseqid", "Protein")
 .join(genage_conversion, Seq("Protein"))
 .select($"Protein",$"Ensembl", $"Entrez", $"Gene Symbol",$"Longevity Influence", $"Organism", $"Method", $"score", 
 $"length", $"mismatch", $"positive", $"ppos".as("percent aligned")).distinct.orderBy($"Organism", $"Longevity Influence",$"Method", $"score".desc)
 
 def count_genes(df: DataFrame, name: String) = df.groupBy($"Organism", $"Longevity Influence")
 .agg(countDistinct("Ensembl").as(name + " genes"),countDistinct("Protein").as(name + " proteins")).filter(row=>
 row.getAs[String]("Longevity Influence") match 
 { 
     case null => true 
     case str if str.length < 200 => true
     case _ => false
     }
 ).orderBy("Organism", "Longevity Influence")

def summary(df: DataFrame, name: String, threshold: Double = 30.0) = count_genes(byMaxScoreExt(df, threshold), name)

In [30]:
val threshold = 50.0

def renameTarget(species: String) = udf[String, String]{
    case str if species.toLowerCase.contains("bat") => undot(undot(str))
    
    case str if species.toLowerCase.contains("cow") =>
     str.indexOf("gene:") match {
            case -1 => undot(str.replace("TAP", "TAT"))
            case i => undot(str.substring(i+5, str.indexOf(" ", i)).replace("TAP", "TAT"))            
        } 

    case str if species.toLowerCase.contains("nmr") || species.toLowerCase.contains("naked") =>
        str.indexOf("GENE.") match {
            case -1 => undot(undot(str))
            case i => 
                val a= i + "GENE.".length
                undot(undot(str.substring(a, str.indexOf("~",a)))) //double undot
        }

    case str if species.contains("mink") =>
        undot(str.replace("XP", "XM"))
    case str if (species.toLowerCase.contains("mouse") || species.toLowerCase.contains("human")) && str.contains("|") =>   
        undot(str.split('|')(1))
       
    case str if str.contains("::") => str.split("::")(1)    
    case str => str
}

def filterLags(df: DataFrame, name: String = "transcript",  threshold: Double = 50.0) = {
    val ren = renameTarget(name)
    byMaxScore(df, threshold)
    .withColumnRenamed("qseqid", "Protein")
    .withColumn(name, ren($"sseqid"))
    .join(genage_conversion, Seq("Protein")).select("Protein",  "Organism", "Longevity Influence", name)
    .filter(row=>
        row.getAs[String]("Longevity Influence") match 
        { 
            case null => true 
            case str if str.length < 200 => true
            case _ => false
            }
        )
    .distinct()
}

val mw_filered = filterLags(lags_in_minkewhale, "minke whale", threshold)
println(mw_filered.count)
mw_filered.show(10,1000)

2779
+------------+------------------------+-------------------+------------+
|     Protein|                Organism|Longevity Influence| minke whale|
+------------+------------------------+-------------------+------------+
| FBpp0082863| Drosophila melanogaster|               anti|XM_007169043|
|     YPL040C|Saccharomyces cerevisiae|            fitness|XM_007172131|
|   E01B7.1.2|  Caenorhabditis elegans|                pro|XM_007170473|
| F26D12.1l.1|  Caenorhabditis elegans|               anti|XM_028018577|
|  F01F1.8a.2|  Caenorhabditis elegans|               anti|XM_007186549|
|Y71H2AM.23.2|  Caenorhabditis elegans|               anti|XM_007186412|
| FBpp0078399| Drosophila melanogaster|               anti|XM_007190556|
| FBpp0307142| Drosophila melanogaster|                pro|XM_007168672|
|   LLC1.3b.1|  Caenorhabditis elegans|               none|XM_007187833|
|  F28C1.2b.1|  Caenorhabditis elegans|               anti|XM_007184459|
+------------+------------------------+-------

In [89]:
def filterLagsSimple(df: DataFrame, name: String = "transcript",  threshold: Double = 50.0) = {

    val ren = renameTarget(name)
    byMaxScore(df, threshold)
    .withColumnRenamed("qseqid", "Protein")
    .withColumn(name, ren($"sseqid"))
    .select("Protein", name).distinct
    .join(genage_conversion.select("Protein").distinct, Seq("Protein")).select("Protein", name)
}

val mw_filered_simple = filterLagsSimple(lags_in_minkewhale, "minke whale", threshold)
println(mw_filered_simple.count)
mw_filered_simple.show(10,1000)

2644
+-------------+------------+
|      Protein| minke whale|
+-------------+------------+
|      YPR021C|XM_007195790|
|Y69A2AR.19a.1|XM_007173362|
|      YNL202W|XM_007181859|
|   F13G3.7b.2|XM_028019676|
|      YGL227W|XM_007198201|
|   F10D11.1.2|XM_007186167|
|  Y55D5A.5c.2|XM_007194427|
|  FBpp0305430|XM_007179402|
|   C47E12.2.2|XM_007177934|
|   C27B7.7a.1|XM_007197313|
+-------------+------------+
only showing top 10 rows



In [19]:
val threshold = 50.0
val all_lags = filterLags(lags_in_graywhale, "gray whale", threshold)
  .join(filterLags(lags_in_bowhead, "bowhead whale", threshold), Seq("Protein",  "Organism", "Longevity Influence"), "outer")
  .join(filterLags(lags_in_minkewhale, "minke whale", threshold), Seq("Protein",  "Organism", "Longevity Influence"), "outer")
  .join(filterLags(lags_in_human, "human", threshold), Seq("Protein",  "Organism", "Longevity Influence"), "outer")
  .join(filterLags(lags_in_cow, "cow", threshold), Seq("Protein",  "Organism", "Longevity Influence"), "outer")
  .join(filterLags(lags_in_mouse, "mouse", threshold), Seq("Protein",  "Organism", "Longevity Influence"), "outer")
  .orderBy($"Organism".asc, $"Longevity Influence".desc).cache()
all_lags.show(20,10000)

+------------+----------------------+-------------------+------------+-------------+------------+---------------+------------------+------------------+
|     Protein|              Organism|Longevity Influence|  gray whale|bowhead whale| minke whale|          human|               cow|             mouse|
+------------+----------------------+-------------------+------------+-------------+------------+---------------+------------------+------------------+
|  F42G9.6a.1|Caenorhabditis elegans|                pro|        null|         null|        null|ENST00000297056|              null|ENSMUST00000125567|
|  T19E7.2b.1|Caenorhabditis elegans|                pro|c17238_g1_i1|  bmy_15253T0|XM_007171592|ENST00000362042|ENSBTAT00000005413|ENSMUST00000005103|
|   F44G3.2.1|Caenorhabditis elegans|                pro|c13258_g1_i2|  bmy_05894T0|XM_007168193|ENST00000348956|ENSBTAT00000018492|ENSMUST00000208710|
|  K10B2.1c.1|Caenorhabditis elegans|                pro| c1881_g1_i1|  bmy_04507T0|XM_0

In [12]:
val threshold = 50.0
val all_lags_summary = summary(lags_in_bowhead, "bowhead whale", threshold)
  .join(summary(lags_in_graywhale, "gray whale", threshold), Seq("Organism", "Longevity Influence"))
  .join(summary(lags_in_minkewhale, "minke whale", threshold), Seq("Organism", "Longevity Influence"))
  .join(summary(lags_in_human, "human", threshold), Seq("Organism", "Longevity Influence"))
  .join(summary(lags_in_cow, "cow", threshold), Seq("Organism", "Longevity Influence"))
  .join(summary(lags_in_mouse, "mouse", threshold), Seq("Organism", "Longevity Influence")).orderBy($"Organism".asc).cache()
all_lags.show(20,10000)

+----------+----------------------+-------------------+------------------------------------------+-------------+--------------+----------------------------------------------------------------------------------------------------------------------+--------------------+--------------------------------------------------------------------------------------------------------------------------------+
|   Protein|              Organism|Longevity Influence|                                gray whale|bowhead whale|   minke whale|                                                                                                                 human|                 cow|                                                                                                                           mouse|
+----------+----------------------+-------------------+------------------------------------------+-------------+--------------+-------------------------------------------------------------------------------

## Expressions



In [21]:
val whalePath = "/data/results/gray-whale/"
val expressionsPath = whalePath + "Expressions/"
val unirefPath = expressionsPath + "uniref90/"
val transcriptsPath = expressionsPath + "Transcripts/"
val codingPath = transcriptsPath + "coding/"

val comparisonsPath = expressionsPath + "Comparisons/"
val comparisonsUniref = comparisonsPath + "uniref90_comparisons/"
val annotationsPath = comparisonsPath + "annotations/"

In [17]:
def loadTranscripts(subpath: String, prefix: String, undot: Boolean = true) = {
    val path = if(subpath.startsWith("/")) subpath else transcriptsPath + subpath
    val res = spark.readTSV(path, header=true).select($"Name".as("transcript"), $"NumReads".as(prefix + "_reads"), $"TPM".as(prefix + "_TPM"))
    (if(undot) res.withColumn("transcript", undotFun($"transcript")) else res).cache 
}

In [22]:
val gray_liver_tr = loadTranscripts("raw/gray_whale/liver/transcripts_quant", "gray_whale_liver", false)
val gray_kidney_tr =  loadTranscripts("raw/gray_whale/kidney/transcripts_quant", "gray_whale_kidney", false)
println((gray_liver_tr.count, gray_kidney_tr.count))
gray_liver_tr.show(10,1000)

(114263,114263)
+----------+----------------------+--------------------+
|transcript|gray_whale_liver_reads|gray_whale_liver_TPM|
+----------+----------------------+--------------------+
|  c0_g1_i1|               2.35775|             1.56396|
|  c0_g1_i2|               2.64225|            1.878858|
|  c3_g1_i1|                   2.0|            1.208575|
|  c4_g1_i1|             45.184412|             1.57192|
|  c4_g1_i2|              4.815588|             0.33569|
|  c5_g1_i1|                   6.0|            1.565579|
|  c6_g1_i1|                   5.0|             0.80294|
|  c7_g1_i1|                  24.0|            3.393696|
|  c8_g1_i1|                  33.0|            3.809589|
|  c9_g1_i1|             14.105694|            0.718535|
+----------+----------------------+--------------------+
only showing top 10 rows



In [32]:
def expressionsForLags(df: DataFrame, dfs: Seq[DataFrame], species: String, threshold: Double = 50.0) = {
 
 val trs = filterLagsSimple(df, species, threshold).withColumnRenamed(species, "transcript")
 val cols = dfs.foldLeft(List("Protein")){ case (acc, el) => acc ++ el.columns.filter(f=> !f.contains("reads"))}.distinct
 dfs.foldLeft(trs){ case (acc, el) => acc.join(el, Seq("transcript"))}.select(cols.head, cols.tail:_*).withColumnRenamed("transcript", species+"_transcript").distinct
}

In [35]:
val gray_whale_lags_expression = expressionsForLags(lags_in_graywhale, Seq(gray_liver_tr, gray_kidney_tr), "gray_whale")
gray_whale_lags_expression.show(10,1000)

+-----------+---------------------+--------------------+---------------------+
|    Protein|gray_whale_transcript|gray_whale_liver_TPM|gray_whale_kidney_TPM|
+-----------+---------------------+--------------------+---------------------+
|    YGL064C|         c14623_g1_i1|            0.186365|             0.995612|
|FBpp0401456|          c7733_g2_i1|          310.515847|           473.203765|
|FBpp0288908|          c4752_g1_i1|            0.875987|             4.148228|
| D2023.2a.2|          c7240_g1_i1|          107.172115|            38.781339|
|  Y37E3.9.1|          c9180_g2_i1|           57.983029|           184.539534|
|FBpp0070917|         c18269_g1_i4|           40.079991|            54.738772|
|FBpp0308778|         c10264_g1_i1|          103.186991|           113.593692|
| ZK897.1m.1|         c38480_g1_i1|            1.310951|             2.450916|
|    YLR371W|         c15530_g1_i2|            0.230202|             0.703873|
| F38A6.3d.1|         c15780_g1_i1|            1.627

In [43]:
val minke_liver_tr = loadTranscripts("raw/minke_whale/liver/transcripts_quant/quant.sf", "minke_liver",true)
val minke_kidney_tr = loadTranscripts("raw/minke_whale/kidney/transcripts_quant/quant.sf", "minke_kidney", true)
println((minke_liver_tr.count, minke_kidney_tr.count))
minke_liver_tr.show(10,1000)

(37868,37868)
+------------+-----------------+---------------+
|  transcript|minke_liver_reads|minke_liver_TPM|
+------------+-----------------+---------------+
|XM_007163932|              0.0|            0.0|
|XM_007163933|        71.627944|       1.775091|
|XM_007163934|        67.498857|       1.770562|
|XM_007163935|         3.158546|       0.138508|
|XM_007163936|         6.065339|       0.160186|
|XM_007163937|       113.811004|       2.588256|
|XM_007163938|              4.0|       0.084003|
|XM_007163939|         0.069572|       0.001121|
|XM_007163940|        18.574946|       0.304864|
|XM_007163941|              0.0|            0.0|
+------------+-----------------+---------------+
only showing top 10 rows



In [42]:
val minke_lags_expression = expressionsForLags(lags_in_minkewhale, Seq(minke_liver_tr, minke_kidney_tr), "minke_whale")
minke_lags_expression.show(10,1000)

+-----------+----------------------+---------------+----------------+
|    Protein|minke_whale_transcript|minke_liver_TPM|minke_kidney_TPM|
+-----------+----------------------+---------------+----------------+
|FBpp0080787|          XM_007174215|            0.0|             0.0|
|  K12H4.8.1|          XM_007192312|        5.41E-4|             0.0|
|FBpp0312092|          XM_007172625|       0.991899|        6.341447|
|FBpp0290803|          XM_007165477|            0.0|             0.0|
|    YJL062W|          XM_007181529|       5.288851|        7.577712|
| C09H10.3.1|          XM_007171145|       0.825031|        1.776809|
|FBpp0087929|          XM_007168239|       0.772431|             0.0|
| C56G2.1b.2|          XM_007176218|       3.042068|          1.0E-6|
|    YHR164C|          XM_007167851|       5.023247|        8.368648|
| F28B12.3.2|          XM_007178940|       2.135544|             0.0|
+-----------+----------------------+---------------+----------------+
only showing top 10 

In [27]:
val cow_liver_tr = loadTranscripts("raw/cow/liver/GSM1020724/quant.sf", "cow_liver")
val cow_kidney_tr = loadTranscripts("raw/cow/kidney/GSM1020723/quant.sf", "cow_kidney")
println((cow_liver_tr.count, cow_kidney_tr.count))
cow_liver_tr.show(10,1000)

(22904,22904)
+------------------+---------------+-------------+
|        transcript|cow_liver_reads|cow_liver_TPM|
+------------------+---------------+-------------+
|ENSBTAT00000064726|         2168.0|    41.310949|
|ENSBTAT00000030504|         1474.0|    33.032692|
|ENSBTAT00000004603|            3.0|     0.031527|
|ENSBTAT00000066297|         1197.0|    22.513724|
|ENSBTAT00000054517|            0.0|          0.0|
|ENSBTAT00000052281|            0.0|          0.0|
|ENSBTAT00000056197|            0.0|          0.0|
|ENSBTAT00000052768|            0.0|          0.0|
|ENSBTAT00000015780|         1333.0|    11.552155|
|ENSBTAT00000049620|          159.0|     0.804058|
+------------------+---------------+-------------+
only showing top 10 rows



In [71]:
val cow_lags_expression = expressionsForLags(lags_in_cow, Seq(cow_liver_tr, cow_kidney_tr), "cow")
cow_lags_expression.show(10,1000)

+-----------+------------------+-------------+--------------+
|    Protein|    cow_transcript|cow_liver_TPM|cow_kidney_TPM|
+-----------+------------------+-------------+--------------+
| T21H8.1h.1|ENSBTAT00000021483|    12.507328|       9.11233|
|FBpp0078779|ENSBTAT00000021683|   192.906893|    294.096266|
|    YKL056C|ENSBTAT00000013402|  4409.029239|    3507.87296|
|F26D12.1l.2|ENSBTAT00000018905|    11.105422|     14.609954|
|FBpp0074822|ENSBTAT00000011976|    26.797805|     37.119717|
| ZK897.1c.1|ENSBTAT00000006737|      4.42228|      26.88029|
| F55A8.1a.1|ENSBTAT00000026265|     0.463981|     25.359775|
|FBpp0078890|ENSBTAT00000022388|     6.899469|      7.918285|
|H28O16.1d.7|ENSBTAT00000003259|   269.814607|    809.285228|
|FBpp0310640|ENSBTAT00000008535|     2.926829|      7.194492|
+-----------+------------------+-------------+--------------+
only showing top 10 rows



In [59]:
val nmr_liver_tr = loadTranscripts("raw/NMR/liver/transcripts_quant/quant.sf", "nmr_liver")
val nmr_kidney_tr = loadTranscripts("raw/NMR/kidney/transcripts_quant/quant.sf", "nmr_kidney")
println((nmr_liver_tr.count, nmr_kidney_tr.count))
nmr_liver_tr.show(10,1000)

(7713,7713)
+------------+---------------+-------------+
|  transcript|nmr_liver_reads|nmr_liver_TPM|
+------------+---------------+-------------+
|GEBF01000001|           48.0|    13.173864|
|GEBF01000002|            0.0|          0.0|
|GEBF01000003|          182.0|    60.981855|
|GEBF01000004|            6.0|     1.872549|
|GEBF01000005|           11.0|     2.070928|
|GEBF01000006|            0.0|          0.0|
|GEBF01000007|           83.0|    14.059258|
|GEBF01000008|            3.0|     0.299681|
|GEBF01000009|         1742.0|  2316.448613|
|GEBF01000010|            0.0|          0.0|
+------------+---------------+-------------+
only showing top 10 rows



In [73]:
val nmr_lags_expression = expressionsForLags(lags_in_NMR, Seq(nmr_liver_tr, nmr_kidney_tr), "nmr")
nmr_lags_expression.show(10,1000)

+-----------+--------------+-------------+--------------+
|    Protein|nmr_transcript|nmr_liver_TPM|nmr_kidney_TPM|
+-----------+--------------+-------------+--------------+
|    YEL012W|  GEBF01005412|   575.659697|    180.697831|
| F01F1.8b.2|  GEBF01007302|   240.739216|    204.222425|
|    YGL064C|  GEBF01000111|   109.757993|    116.718397|
|  D2030.1.2|  GEBF01007678|    14.411751|      31.33811|
| T05A1.5a.1|  GEBF01001255|     0.413249|   1131.234836|
|  YER056C-A|  GEBF01006363|   459.824647|     404.48731|
|    YGR281W|  GEBF01004679|    40.105963|       5.42533|
|FBpp0078396|  GEBF01000167|   784.838957|    141.989843|
|FBpp0082821|  GEBF01003225|   101.694353|     91.327127|
| W10G6.2a.1|  GEBF01001171|    90.951186|    662.162908|
+-----------+--------------+-------------+--------------+
only showing top 10 rows



In [61]:
val mouse_liver_tr = loadTranscripts("raw/mouse/liver/GSM1400574/quant.sf", "mouse_liver")
val mouse_kidney_tr = loadTranscripts("raw/mouse/kidney/GSM2195188/quant.sf", "mouse_kidney")
println((mouse_liver_tr.count, mouse_kidney_tr.count) )
mouse_liver_tr.show(10,1000)

(133618,133618)
+-----------------------------------------------------------------------------------------------------------------+-----------------+---------------+
|                                                                                                       transcript|mouse_liver_reads|mouse_liver_TPM|
+-----------------------------------------------------------------------------------------------------------------+-----------------+---------------+
|ENSMUST00000193812.1|ENSMUSG00000102693.1|OTTMUSG00000049935.1|OTTMUST00000127109.1|RP23-271O17.1-001|RP23-271O17|              0.0|            0.0|
|                                                                          ENSMUST00000082908.1|ENSMUSG00000064842|              0.0|            0.0|
|                     ENSMUST00000162897.1|ENSMUSG00000051951.5|OTTMUSG00000026353.2|OTTMUST00000086625.1|AC157543|              0.0|            0.0|
|                     ENSMUST00000159265.1|ENSMUSG00000051951.5|OTTMUSG00000026353.2

In [74]:
filterLags(lags_in_mouse,"mouse").show(10, 1000)

+-----------+------------------------+-------------------+------------------+
|    Protein|                Organism|Longevity Influence|             mouse|
+-----------+------------------------+-------------------+------------------+
|FBpp0089374| Drosophila melanogaster|                pro|ENSMUST00000030243|
| F11A1.3c.1|  Caenorhabditis elegans|               null|ENSMUST00000090543|
|    YGR204W|Saccharomyces cerevisiae|            fitness|ENSMUST00000021443|
| F56B3.8a.1|  Caenorhabditis elegans|               anti|ENSMUST00000002844|
| T21H8.1f.1|  Caenorhabditis elegans|                pro|ENSMUST00000040056|
|    YCR009C|Saccharomyces cerevisiae|               anti|ENSMUST00000022680|
|T05G5.10b.2|  Caenorhabditis elegans|               anti|ENSMUST00000164359|
|FBpp0288676| Drosophila melanogaster|                pro|ENSMUST00000106603|
|F23B12.8b.1|  Caenorhabditis elegans|                pro|ENSMUST00000012587|
|FBpp0074902| Drosophila melanogaster|               anti|ENSMUS

In [75]:
val unEncodeFunction = udf[String, String]{ str => undot(str.substring(0, str.indexOf("|")))}
def unEncode(df: DataFrame) = df.withColumn("transcript", unEncodeFunction($"transcript"))

In [62]:
val mouse_lags_expression = expressionsForLags(lags_in_mouse, Seq(mouse_liver_tr, mouse_kidney_tr).map( unEncode), "mouse")
mouse_lags_expression.show(10,1000)

+-----------+------------------+---------------+----------------+
|    Protein|  mouse_transcript|mouse_liver_TPM|mouse_kidney_TPM|
+-----------+------------------+---------------+----------------+
|    YOL126C|ENSMUST00000019323|        93.4509|      278.187632|
|  F10B5.1.1|ENSMUST00000076364|     909.446342|      850.953447|
|F52D10.3a.3|ENSMUST00000022894|       31.02732|       66.251696|
|Y71H2AR.2.1|ENSMUST00000223778|            0.0|             0.0|
| D2021.1b.1|ENSMUST00000044484|       1.968125|        2.519344|
|F13B10.2d.2|ENSMUST00000081650|     245.310788|      425.781834|
| ZK524.2g.1|ENSMUST00000030170|       0.351997|        0.165473|
|    YLR382C|ENSMUST00000038863|       8.090383|       39.762013|
| K07A12.3.1|ENSMUST00000043675|     252.987738|       986.14427|
|    YBL103C|ENSMUST00000203884|            0.0|        0.682382|
+-----------+------------------+---------------+----------------+
only showing top 10 rows



In [63]:
val human_liver_tr = loadTranscripts("raw/human/liver/GSM1698568/quant.sf", "human_liver")
val human_kidney_tr = loadTranscripts("raw/human/kidney/GSM1698570/quant.sf", "human_kidney")
println((human_liver_tr.count, human_kidney_tr.count) )
human_liver_tr.show(10,1000)

(203027,203027)
+---------------------------------------------------------------------------------------------------------+-----------------+---------------+
|                                                                                               transcript|human_liver_reads|human_liver_TPM|
+---------------------------------------------------------------------------------------------------------+-----------------+---------------+
|                 ENST00000456328.2|ENSG00000223972.5|OTTHUMG00000000961.2|OTTHUMT00000362751.1|RP11-34P13|              0.0|            0.0|
|                 ENST00000450305.2|ENSG00000223972.5|OTTHUMG00000000961.2|OTTHUMT00000002844.2|RP11-34P13|              0.0|            0.0|
|                 ENST00000488147.1|ENSG00000227232.5|OTTHUMG00000000958.1|OTTHUMT00000002839.1|RP11-34P13|         4.883404|       0.057526|
|                                                                        ENST00000619216.1|ENSG00000278267|              0.0|       

In [76]:
val human_lags_expression = expressionsForLags(lags_in_human, Seq(human_liver_tr, human_kidney_tr).map( unEncode), "human")
human_lags_expression.show(10,1000)

+------------+----------------+---------------+----------------+
|     Protein|human_transcript|human_liver_TPM|human_kidney_TPM|
+------------+----------------+---------------+----------------+
|   B0365.3.2| ENST00000295598|      10.797749|      102.233543|
|  T13F2.1a.1| ENST00000525094|            0.0|             0.0|
|   F44G3.2.1| ENST00000348956|       0.086218|        2.837912|
| FBpp0072621| ENST00000419333|       0.051248|             0.0|
|  F01F1.8a.1| ENST00000275603|       2.615517|        6.362973|
|  T02H6.11.1| ENST00000287022|       0.380222|        1.063411|
|     YGR078C| ENST00000286428|       0.913526|        2.711772|
|  K08E5.2b.2| ENST00000433363|       3.997244|             0.0|
|   C01F1.2.1| ENST00000255390|       0.634887|        0.808296|
|Y54E10BL.6.1| ENST00000307102|       1.132638|        1.053082|
+------------+----------------+---------------+----------------+
only showing top 10 rows



In [23]:
//val quants_base = "/data/samples/de_novo/bat/quants/ours"
val bat_liver_1 = loadTranscripts("raw/bat/quant_bat_liver_active_1/quant.sf", "bat_liver")
val bat_kidney_1 = loadTranscripts("raw/bat/quant_bat_kidney_active_1/quant.sf", "bat_kidney")
val bat_liver_2 = loadTranscripts("raw/bat/quant_bat_liver_active_2/quant.sf", "bat_liver_2")
val bat_kidney_2 = loadTranscripts("raw/bat/quant_bat_kidney_active_2/quant.sf", "bat_kidney_2")
println((bat_liver_1.count, bat_kidney_2.count))
bat_liver_1.show(10,1000)

(408411,408411)
+----------------------------+---------------+-------------+
|                  transcript|bat_liver_reads|bat_liver_TPM|
+----------------------------+---------------+-------------+
| NODE_1_length_17988_cov_580|       2131.249|     2.849032|
|  NODE_2_length_17947_cov_75|       3916.057|     5.768584|
|  NODE_3_length_17149_cov_48|       1329.291|     2.802971|
| NODE_4_length_16953_cov_143|       5417.553|    11.308167|
|  NODE_5_length_16927_cov_28|        667.796|     1.503753|
|  NODE_6_length_16768_cov_80|        135.463|     0.206226|
|  NODE_7_length_16714_cov_42|        493.287|     1.240009|
|  NODE_8_length_16515_cov_85|        623.855|     0.971752|
|  NODE_9_length_16383_cov_85|        183.466|     0.285428|
|NODE_10_length_16149_cov_140|         348.75|     0.777149|
+----------------------------+---------------+-------------+
only showing top 10 rows



In [66]:
val bat_lags_expression = expressionsForLags(lags_in_bat, Seq(bat_liver_1, bat_kidney_1), "bat")
bat_lags_expression.show(10,1000)

+-----------+------------------------------+-------------+--------------+
|    Protein|                bat_transcript|bat_liver_TPM|bat_kidney_TPM|
+-----------+------------------------------+-------------+--------------+
| F11A1.3a.1| NODE_2187_length_5899_cov_269|     1.488213|     18.967196|
| F58B3.5c.1|NODE_35539_length_1399_cov_144|     8.683969|     20.666786|
|  K05C4.1.1| NODE_28969_length_1697_cov_81|     6.887901|      4.182846|
| R13H8.1b.1| NODE_1040_length_7086_cov_213|      3.38475|      6.885604|
|FBpp0082680|  NODE_3606_length_5087_cov_21|     0.534704|      2.468411|
|F26F4.10a.2|NODE_18533_length_2406_cov_151|          0.0|      0.628313|
|FBpp0100079|NODE_30038_length_1642_cov_355|    13.638469|     16.102531|
|  F14F4.3.1|  NODE_1592_length_6403_cov_89|     8.595544|       6.36736|
|  YBR084C-A|NODE_53737_length_889_cov_2912|   244.671627|    405.914312|
|FBpp0289741| NODE_2880_length_5453_cov_106|     0.299941|      0.132011|
+-----------+-------------------------

In [78]:
val bowhead_liver_tr = loadTranscripts("raw/bowhead/liver/transcripts_quant/quant.sf", "bowhead_whale_liver", false)
val bowhead_kidney_tr = loadTranscripts("raw/bowhead/kidney/transcripts_quant/quant.sf", "bowhead_whale_kidney", false)
println((bowhead_liver_tr.count, bowhead_kidney_tr.count) )
bowhead_liver_tr.show(10,1000)

(423657,1059024)
+----------------------------------+-------------------------+-----------------------+
|                        transcript|bowhead_whale_liver_reads|bowhead_whale_liver_TPM|
+----------------------------------+-------------------------+-----------------------+
|000872-000883_All_comp1777_c1_seq1|                  1.03144|                1.01615|
|000872-000883_All_comp1777_c1_seq2|                      0.0|                    0.0|
|000872-000883_All_comp1777_c1_seq3|                  15.5346|                31.2996|
|000872-000883_All_comp1777_c3_seq1|                      6.0|                18.8227|
|000872-000883_All_comp1852_c0_seq1|                      8.0|                2.13903|
|000872-000883_All_comp1852_c1_seq1|                  397.264|                 25.142|
|000872-000883_All_comp1852_c1_seq2|                     59.0|                4.72196|
|000872-000883_All_comp1852_c1_seq3|                  28.8846|                2.45446|
|000872-000883_All_comp185

In [77]:
val human_liver_tr = loadTranscripts("raw/human/liver/GSM1698568/quant.sf", "human_liver")
val human_kidney_tr = loadTranscripts("raw/human/kidney/GSM1698570/quant.sf", "human_kidney")
println((human_liver_tr.count, human_kidney_tr.count) )
human_liver_tr.show(10,1000)

(203027,203027)
+---------------------------------------------------------------------------------------------------------+-----------------+---------------+
|                                                                                               transcript|human_liver_reads|human_liver_TPM|
+---------------------------------------------------------------------------------------------------------+-----------------+---------------+
|                 ENST00000456328.2|ENSG00000223972.5|OTTHUMG00000000961.2|OTTHUMT00000362751.1|RP11-34P13|              0.0|            0.0|
|                 ENST00000450305.2|ENSG00000223972.5|OTTHUMG00000000961.2|OTTHUMT00000002844.2|RP11-34P13|              0.0|            0.0|
|                 ENST00000488147.1|ENSG00000227232.5|OTTHUMG00000000958.1|OTTHUMT00000002839.1|RP11-34P13|         4.883404|       0.057526|
|                                                                        ENST00000619216.1|ENSG00000278267|              0.0|       

In [79]:
val human_lags_expression = expressionsForLags(lags_in_human, Seq(human_liver_tr, human_kidney_tr).map( unEncode), "human")
human_lags_expression.show(10,1000)

+------------+----------------+---------------+----------------+
|     Protein|human_transcript|human_liver_TPM|human_kidney_TPM|
+------------+----------------+---------------+----------------+
|   B0365.3.2| ENST00000295598|      10.797749|      102.233543|
|  T13F2.1a.1| ENST00000525094|            0.0|             0.0|
|   F44G3.2.1| ENST00000348956|       0.086218|        2.837912|
| FBpp0072621| ENST00000419333|       0.051248|             0.0|
|  F01F1.8a.1| ENST00000275603|       2.615517|        6.362973|
|  T02H6.11.1| ENST00000287022|       0.380222|        1.063411|
|     YGR078C| ENST00000286428|       0.913526|        2.711772|
|  K08E5.2b.2| ENST00000433363|       3.997244|             0.0|
|   C01F1.2.1| ENST00000255390|       0.634887|        0.808296|
|Y54E10BL.6.1| ENST00000307102|       1.132638|        1.053082|
+------------+----------------+---------------+----------------+
only showing top 10 rows



In [81]:
val lag_fields = Seq("Protein", "Organism", "Longevity Influence")

In [80]:
val fields = Seq("Protein")
val allLagsExpression = gray_whale_lags_expression
    .join(minke_lags_expression, fields, "outer")
    .join(human_lags_expression, fields, "outer")
    .join(nmr_lags_expression, fields, "outer")
    .join(bat_lags_expression, fields, "outer")
    .join(cow_lags_expression, fields, "outer")
    .join(mouse_lags_expression, fields, "outer")
    println(allLagsExpression.count)
    allLagsExpression.show(10,10000)

2895
+-----------+---------------------+--------------------+---------------------+----------------------+---------------+----------------+----------------+---------------+----------------+--------------+-------------+--------------+------------------------------+-------------+--------------+------------------+-------------+--------------+------------------+---------------+----------------+
|    Protein|gray_whale_transcript|gray_whale_liver_TPM|gray_whale_kidney_TPM|minke_whale_transcript|minke_liver_TPM|minke_kidney_TPM|human_transcript|human_liver_TPM|human_kidney_TPM|nmr_transcript|nmr_liver_TPM|nmr_kidney_TPM|                bat_transcript|bat_liver_TPM|bat_kidney_TPM|    cow_transcript|cow_liver_TPM|cow_kidney_TPM|  mouse_transcript|mouse_liver_TPM|mouse_kidney_TPM|
+-----------+---------------------+--------------------+---------------------+----------------------+---------------+----------------+----------------+---------------+----------------+--------------+-------------+----

In [82]:
val con = new Concatenate(" ", unique = true)
val genage_agg =genage_conversion.na.fill("null").select(lag_fields.head, lag_fields.tail:_*).groupBy($"Protein")
.agg(con($"Organism").as("Organism"), con($"Longevity Influence").as("Longevity Influence"))
.filter(row=>row.getAs[String]("Longevity Influence").size < 100)
.orderBy(lag_fields(1), lag_fields(2))
genage_agg.show(10,1000)


+-----------+----------------------+-------------------+
|    Protein|              Organism|Longevity Influence|
+-----------+----------------------+-------------------+
| C27B7.7a.1|Caenorhabditis elegans|               anti|
| C51E3.7c.1|Caenorhabditis elegans|               anti|
| D2030.9b.2|Caenorhabditis elegans|               anti|
|F26D12.1k.1|Caenorhabditis elegans|               anti|
| F55A8.2c.2|Caenorhabditis elegans|               anti|
|F55G11.5a.1|Caenorhabditis elegans|               anti|
| K07A3.1a.1|Caenorhabditis elegans|               anti|
| T01A4.1a.1|Caenorhabditis elegans|               anti|
|T25C12.1b.1|Caenorhabditis elegans|               anti|
| W10G6.2a.1|Caenorhabditis elegans|               anti|
+-----------+----------------------+-------------------+
only showing top 10 rows



In [91]:
val allLags_t = genage_agg.join(allLagsExpression, "Protein").orderBy(lag_fields(1), lag_fields(2))
val cols = allLags_t.columns
val trs = cols.filter(_.contains("transcript"))
val reordered = cols.diff(trs)++trs
val allLags = allLags_t.select(allLags_t.columns.head, reordered.tail:_*).cache
allLags.show(10, 1000)

+-----------+----------------------+-------------------+--------------------+---------------------+---------------+----------------+---------------+----------------+-------------+--------------+-------------+--------------+-------------+--------------+---------------+----------------+---------------------+----------------------+----------------+--------------+------------------------------+------------------+------------------+
|    Protein|              Organism|Longevity Influence|gray_whale_liver_TPM|gray_whale_kidney_TPM|minke_liver_TPM|minke_kidney_TPM|human_liver_TPM|human_kidney_TPM|nmr_liver_TPM|nmr_kidney_TPM|bat_liver_TPM|bat_kidney_TPM|cow_liver_TPM|cow_kidney_TPM|mouse_liver_TPM|mouse_kidney_TPM|gray_whale_transcript|minke_whale_transcript|human_transcript|nmr_transcript|                bat_transcript|    cow_transcript|  mouse_transcript|
+-----------+----------------------+-------------------+--------------------+---------------------+---------------+----------------+----

In [92]:
allLags.writeTSV("/data/results/gray-whale/lags/all_lags_comparison.tsv")

parts of /data/results/gray-whale/lags/all_lags_comparison.tsv merged!


/data/results/gray-whale/lags/all_lags_comparison.tsv