# well-annotated species workflow


Mapping well-sequenced genomes to uniref90<br>




In [1]:
import org.apache.spark._
import org.apache.spark.sql.types._
import scala.reflect.runtime.universe._
import org.apache.spark.storage.StorageLevel
import org.apache.spark.rdd._
import org.apache.spark.sql._
import org.apache.spark.sql.functions._
import org.apache.spark.sql.expressions._
import group.research.aging.spark.extensions._
import group.research.aging.spark.extensions.functions._
import kernel.display.html

In [2]:
//setting up pathes
val projectPath = "/data/results/gray-whale/"
val expressionsPath = projectPath + "Expressions/"
val unirefPath = expressionsPath + "uniref90/"
val transcriptsPath = expressionsPath + "Transcripts/"
val codingPath = transcriptsPath + "coding/"

val comparisonsPath = expressionsPath + "Comparisons/"
val comparisonsUniref = comparisonsPath + "uniref90_comparisons/"
val annotationsPath = comparisonsPath + "annotations/"

In [3]:
def loadTranscripts(subpath: String, prefix: String) = {
    val path = if(subpath.startsWith("/")) subpath else transcriptsPath + subpath
    spark.readTSV(path, header=true).select($"Name".as("transcript"), $"NumReads".as(prefix + "_reads"), $"TPM".as(prefix + "_TPM")).cache 
}

In [4]:
import org.apache.spark.sql.functions.udf
def undot(str: String): String = str.substring(0, str.indexOf("."))
def uni(df: DataFrame) = df.select("uniref90").distinct
val undotFun = udf[String, String](undot)

In [5]:
val mapping_cols = List("UniProtKB-AC","UniProtKB-ID","Entrez","RefSeq","GI","PDB","GO",
"UniRef100","UniRef90","UniRef50","UniParc","PIR",
"NCBI-taxon","MIM","UniGene","PubMed",
"EMBL","EMBL-CDS","Ensembl","Ensembl_TRS","Ensembl_PRO","Additional PubMed")


In [6]:
val mapping = spark.readTSV("/data/indexes/uniprot/idmapping_selected.tab").toDF(mapping_cols: _*)
mapping.limit(20).show(20, 1000)

+------------+------------+-------+-----------+-------------------------------+----+----------------------------------+----------------+---------------+---------------+-------------+----+----------+----+-------+------------------+--------+----------+-------+-----------+-----------+-----------------+
|UniProtKB-AC|UniProtKB-ID| Entrez|     RefSeq|                             GI| PDB|                                GO|       UniRef100|       UniRef90|       UniRef50|      UniParc| PIR|NCBI-taxon| MIM|UniGene|            PubMed|    EMBL|  EMBL-CDS|Ensembl|Ensembl_TRS|Ensembl_PRO|Additional PubMed|
+------------+------------+-------+-----------+-------------------------------+----+----------------------------------+----------------+---------------+---------------+-------------+----+----------+----+-------+------------------+--------+----------+-------+-----------+-----------+-----------------+
|      Q6GZX4|  001R_FRG3G|2947773|YP_031579.1|             81941549; 49237298|null|             

In [7]:

def processMapping(df: DataFrame) = {
    df.select("Uniref90", "Ensembl_TRS").as[(String, String)].where($"Ensembl_TRS".isNotNull).flatMap{
        case (u, ens) => ens.split(";").map(e=>u->e)
    }.toDF("Uniref90","Ensembl_TRS")
}


In [8]:
def map_species(name: String, liver: String, kidney: String, mapping: DataFrame) = {
    val liver_tr = loadTranscripts(liver, s"${name}_liver")
    val kidney_tr = loadTranscripts(kidney, s"${name}_kidney")
    val liver_kidney_tr = liver_tr.join(kidney_tr, "transcript").withColumn("Ensembl_TRS", undotFun($"transcript"))
    val mp = mapping.select("Uniref90","Ensembl_TRS").distinct
    val col_liver = new ColumnName(s"${name}_liver_TPM")
    val col_kidney = new ColumnName(s"${name}_kidney_TPM")
    liver_kidney_tr
        .join(mp, Seq("Ensembl_TRS")).select($"Uniref90",col_liver,col_kidney)
        .groupBy("Uniref90").agg(sum(col_liver).as(s"${name}_liver"), sum(col_kidney).as(s"${name}_kidney"))
        .withColumnRenamed("Uniref90", "uniref90")
        .orderBy(new ColumnName(s"${name}_liver").desc_nulls_last)
}


Mapping animals to uniref90<br>

In [10]:
val cow_mapping = processMapping(mapping.where($"NCBI-taxon" === "9913")).cache
cow_mapping.limit(20).show(20)

+-----------------+-------------------+
|         Uniref90|        Ensembl_TRS|
+-----------------+-------------------+
|  UniRef90_P62258| ENSBTAT00000007442|
|  UniRef90_Q04917| ENSBTAT00000044059|
|UniRef90_P31947-2| ENSBTAT00000012154|
|  UniRef90_P27348| ENSBTAT00000032851|
|  UniRef90_P63104| ENSBTAT00000000289|
|  UniRef90_Q16537| ENSBTAT00000026361|
|  UniRef90_Q00005| ENSBTAT00000002427|
|  UniRef90_P14893| ENSBTAT00000087114|
|  UniRef90_Q0V8K7| ENSBTAT00000077799|
|  UniRef90_P31937| ENSBTAT00000001374|
|  UniRef90_Q9N179| ENSBTAT00000008760|
|  UniRef90_Q0P5A7| ENSBTAT00000039828|
|  UniRef90_O46411| ENSBTAT00000068138|
|  UniRef90_P12725| ENSBTAT00000004927|
|  UniRef90_P12725| ENSBTAT00000072441|
|  UniRef90_Q2KJF1| ENSBTAT00000012837|
|  UniRef90_P28800| ENSBTAT00000027793|
|  UniRef90_P28190| ENSBTAT00000015230|
|  UniRef90_P28190| ENSBTAT00000072690|
|  UniRef90_Q1LZD0| ENSBTAT00000009974|
+-----------------+-------------------+



In [11]:
val cow_liver_tr = loadTranscripts("raw/cow/liver/GSM1020724/quant.sf", "cow_liver")
val cow_kidney_tr = loadTranscripts("raw/cow/kidney/GSM1020723/quant.sf", "cow_kidney")
(cow_liver_tr.count, cow_kidney_tr.count)

(22904,22904)

In [12]:
val cow_trans = cow_liver_tr.withColumn("Ensembl_TRS", undotFun($"transcript")).select($"Ensembl_TRS").join(cow_mapping, Seq("Ensembl_TRS")).select("Ensembl_TRS", "Uniref90").distinct
cow_trans.limit(20).show(10,1000)

+------------------+-------------------+
|       Ensembl_TRS|           Uniref90|
+------------------+-------------------+
|ENSBTAT00000025670|    UniRef90_Q1LZG6|
|ENSBTAT00000009295|    UniRef90_F1N0H5|
|ENSBTAT00000027120|    UniRef90_F1MP34|
|ENSBTAT00000008078|    UniRef90_Q0VCP2|
|ENSBTAT00000053481|    UniRef90_A6QL99|
|ENSBTAT00000021390|    UniRef90_A5PJW9|
|ENSBTAT00000001824|UniRef90_A0A452FIE5|
|ENSBTAT00000048703|    UniRef90_F1N1Z4|
|ENSBTAT00000061407|UniRef90_A0A3Q1MUR8|
|ENSBTAT00000025465|    UniRef90_A0JNF3|
+------------------+-------------------+
only showing top 10 rows



In [13]:
val cow_uni = uni(cow_trans.withColumnRenamed("Uniref90", "uniref90")).cache
cow_uni.limit(10).show(10,1000)

+---------------+
|       uniref90|
+---------------+
|UniRef90_A7MB57|
|UniRef90_Q14872|
|UniRef90_F1MQK6|
|UniRef90_Q1RMI9|
|UniRef90_F1MET4|
|UniRef90_G3N1M0|
|UniRef90_O15379|
|UniRef90_O75899|
|UniRef90_F1MZB7|
|UniRef90_A6QNR9|
+---------------+



In [14]:
val cow_uniref = map_species("cow", "raw/cow/liver/GSM1020724/quant.sf", "raw/cow/kidney/GSM1020723/quant.sf", cow_mapping)
println(cow_uniref.count())
cow_uniref.show(10,1000)

18909
+---------------+------------+------------+
|       uniref90|   cow_liver|  cow_kidney|
+---------------+------------+------------+
|UniRef90_P02769|39122.608508|    5.549983|
|UniRef90_P81644|23846.294138|    4.429746|
|UniRef90_P00396|23023.581018|51112.719511|
|UniRef90_P03898|17905.614012| 28120.59444|
|UniRef90_P00415|16212.689475|31884.322025|
|UniRef90_P00847|14432.628554|27960.731306|
|UniRef90_F6QND5|12790.230203|    5.847987|
|UniRef90_O46375|12706.865718|    24.48667|
|UniRef90_P19035|11847.007201|  612.123366|
|UniRef90_F1MMK9|10769.811066|   10.514576|
+---------------+------------+------------+
only showing top 10 rows



In [15]:
cow_uniref.writeTSV("/data/results/gray-whale/Expressions/uniref90/established/cow_uniref90.tsv", true)

parts of /data/results/gray-whale/Expressions/uniref90/established/cow_uniref90.tsv merged!


/data/results/gray-whale/Expressions/uniref90/established/cow_uniref90.tsv

In [16]:
val mapping_root = "/data/indexes/uniprot/species/"
val human_mapping = processMapping(spark.readTSV(mapping_root  + "HUMAN_9606_idmapping_selected.tab").toDF(mapping_cols:_*))
human_mapping.limit(20).show(20) ///data/indexes/uniprot/species

+---------------+----------------+
|       Uniref90|     Ensembl_TRS|
+---------------+----------------+
|UniRef90_P31946| ENST00000353703|
|UniRef90_P31946| ENST00000372839|
|UniRef90_P62258| ENST00000264335|
|UniRef90_P62258| ENST00000571732|
|UniRef90_P62258| ENST00000616643|
|UniRef90_P62258| ENST00000627231|
|UniRef90_Q04917| ENST00000248975|
|UniRef90_P61981| ENST00000307630|
|UniRef90_P31947| ENST00000339276|
|UniRef90_P27348| ENST00000238081|
|UniRef90_P27348| ENST00000381844|
|UniRef90_P63104| ENST00000353245|
|UniRef90_P63104| ENST00000395951|
|UniRef90_P63104| ENST00000395953|
|UniRef90_P63104| ENST00000395956|
|UniRef90_P63104| ENST00000395957|
|UniRef90_P63104| ENST00000395958|
|UniRef90_P63104| ENST00000419477|
|UniRef90_P63104| ENST00000457309|
|UniRef90_P63104| ENST00000522542|
+---------------+----------------+



In [17]:
val mouse_mapping = processMapping(spark.readTSV(mapping_root + "MOUSE_10090_idmapping_selected.tab").toDF(mapping_cols:_*))
mouse_mapping.count()

67401

In [18]:
val human_uniref = map_species("human", "raw/human/liver/GSM1698568/quant.sf", "raw/human/kidney/GSM1698570/quant.sf", human_mapping)
println(human_uniref.count())
human_uniref.show(10,1000)

50878
+-----------------+------------------+------------------+
|         uniref90|       human_liver|      human_kidney|
+-----------------+------------------+------------------+
|  UniRef90_P02768|       8623.782791|         12.144448|
|  UniRef90_P02652|       3532.126448|               0.0|
|  UniRef90_P02763|       1843.415159|               0.0|
|  UniRef90_P00414|       1689.593287|       3711.708944|
|UniRef90_P02768-2|       1687.159951|         12.765588|
|  UniRef90_P00395|       1636.603468|       6318.267945|
|  UniRef90_P02656|1486.8948070000001|          2.610827|
|  UniRef90_P02654|       1437.849913|0.5099290000000001|
|  UniRef90_P00325|       1370.567789| 8.642149999999999|
|  UniRef90_P00738|       1144.102532|               0.0|
+-----------------+------------------+------------------+
only showing top 10 rows



In [19]:
human_uniref.writeTSV("/data/results/gray-whale/Expressions/uniref90/established/human_uniref90.tsv", true)

parts of /data/results/gray-whale/Expressions/uniref90/established/human_uniref90.tsv merged!


/data/results/gray-whale/Expressions/uniref90/established/human_uniref90.tsv

In [20]:
val mouse_uniref = map_species("mouse", "raw/mouse/liver/GSM1400574/quant.sf", "raw/mouse/kidney/GSM2195188/quant.sf", mouse_mapping)
mouse_uniref.show(10,1000)

+---------------+------------------+------------------+
|       uniref90|       mouse_liver|      mouse_kidney|
+---------------+------------------+------------------+
|UniRef90_P11588| 69975.97204299999|          0.653462|
|UniRef90_P07724|      59405.218009|          4.655618|
|UniRef90_P22599|23652.941147999998|        272.303171|
|UniRef90_P03930|      17481.818076|       9248.265999|
|UniRef90_P00848|      16692.802391|      20312.635609|
|UniRef90_Q00623|       15715.55198|1.4392260000000001|
|UniRef90_P00396|      14679.308736|      27944.465293|
|UniRef90_P00416|      14490.795055|      12610.242599|
|UniRef90_P00406|      14251.757045|        3798.72075|
|UniRef90_P02692|       14211.28666|          2.884441|
+---------------+------------------+------------------+
only showing top 10 rows



In [21]:
mouse_uniref.writeTSV("/data/results/gray-whale/Expressions/uniref90/established/mouse_uniref90.tsv", true)

parts of /data/results/gray-whale/Expressions/uniref90/established/mouse_uniref90.tsv merged!


/data/results/gray-whale/Expressions/uniref90/established/mouse_uniref90.tsv