In [ ]:
import org.apache.spark.sql.{DataFrame, Encoders, SparkSession}
import org.apache.spark.sql.types.StructType
import org.bdgenomics.adam.rdd.ADAMContext._
import org.bdgenomics.adam.rdd.ADAMContextExtensions._
import scala.reflect.runtime.universe._
import comp.bio.aging.playground.extras.uniprot._
import org.apache.spark.storage.StorageLevel

import org.apache.spark.sql.{DataFrame, Encoders, SparkSession}
import org.apache.spark.sql.types.StructType
import org.bdgenomics.adam.rdd.ADAMContext._
import org.bdgenomics.adam.rdd.ADAMContextExtensions._
import scala.reflect.runtime.universe._
import comp.bio.aging.playground.extras.uniprot._
import org.apache.spark.storage.StorageLevel


In [ ]:
// Reuse the active SparkSession if there is one, otherwise create it under the app name "annotations".
val spark = SparkSession.builder().appName("annotations").getOrCreate()

spark: org.apache.spark.sql.SparkSession = org.apache.spark.sql.SparkSession@282f89a3


1. UniProtKB-AC
2. UniProtKB-ID
3. GeneID (EntrezGene)
4. RefSeq
5. GI
6. PDB
7. GO
8. UniRef100
9. UniRef90
10. UniRef50
11. UniParc
12. PIR
13. NCBI-taxon
14. MIM
15. UniGene
16. PubMed
17. EMBL
18. EMBL-CDS
19. Ensembl
20. Ensembl_TRS
21. Ensembl_PRO
22. Additional PubMed

In [ ]:
// Root of the pipeline data on HDFS and the DIAMOND result folders beneath it.
val base = "hdfs://namenode/pipelines"
val diamond = s"$base/diamond"
val blastp = s"$diamond/blastp"
val blastx = s"$diamond/blastx"

base: String = hdfs://namenode/pipelines
diamond: String = hdfs://namenode/pipelines/diamond
blastp: String = hdfs://namenode/pipelines/diamond/blastp
blastx: String = hdfs://namenode/pipelines/diamond/blastx


In [ ]:
// Folder under `base` for this data set, and the file-name stem shared by the
// assembly-derived files referenced in later cells.
val target = s"$base/GRAY_WHALE"
val transdecoderName = "Trin_Mitya.Trinity"


In [ ]:
// Input locations for this data set. The three file names are derived from `transdecoderName`
// and each value is defined exactly once, so the cell compiles as a single unit and there is
// no ambiguity about which definition is in effect.
// NOTE(review): `graywhale` is not defined in any cell shown here (only `target` is) —
// confirm it is in scope, or that `target` was the intended directory.
val trinity = graywhale + s"/${transdecoderName}.fasta"
val proteins = graywhale + s"/${transdecoderName}.fasta.transdecoder.pep"
val gff = graywhale + s"/${transdecoderName}.fasta.transdecoder.gff3"

// ADAM data sets: `query` is read from proteins.adam and `transcripts` from transcripts.adam.
val proteinsAdam = graywhale + "/proteins.adam"
val transcriptsAdam = graywhale + "/transcripts.adam"
val transcripts = sparkContext.loadParquetContigFragments(transcriptsAdam)
val query = sparkContext.loadParquetContigFragments(proteinsAdam)


In [ ]:
// Turn every record of `query` into a ProteinPrediction (built from its description, sequence
// and contig name), drop predictions without any DIAMOND hit, and key the rest by transcript.
// Disabled alternative filter that would also keep Pfam-only predictions:
//   .filter(p=>p.diamondHits.nonEmpty || p.pfamHits.nonEmpty)
val predictions = query.rdd
  .map { fragment =>
    comp.bio.aging.playground.extras.diamond.ProteinPrediction
      .extractPrediction(fragment.getDescription, fragment.getSequence, fragment.getContigName)
  }
  .filter(_.diamondHits.nonEmpty)
  .map(prediction => (prediction.transcript, prediction))

predictions: org.apache.spark.rdd.RDD[(String, comp.bio.aging.playground.extras.diamond.ProteinPrediction)] = MapPartitionsRDD[4] at map at <console>:91


In [ ]:
// Read the id-mapping table as a typed Dataset[UniprotMapping].
val mapping = spark
  .readTypedTSV[UniprotMapping]("/pipelines/uniprot/idmapping_selected.tab")

mapping: org.apache.spark.sql.Dataset[comp.bio.aging.playground.extras.uniprot.UniprotMapping] = [uniprot_ac: string, uniprot_id: string ... 20 more fields]


In [ ]:
import spark.implicits._

// Only predictions whose ORF type is "complete".
val complete = predictions.filter { case (_, prediction) => prediction.orf_type == "complete" }

// One row per kept prediction: transcript, id of its highest-scoring DIAMOND hit, and sequence.
// `maxBy` cannot fail here because predictions with no hits were dropped when `predictions` was built.
val completeDF = complete
  .map { case (transcript, prediction) =>
    val bestHit = prediction.diamondHits.maxBy(_.score)
    (transcript, bestHit.id, prediction.sequence)
  }
  .toDF("transcript", "uniref", "sequence")


import spark.implicits._
complete: org.apache.spark.rdd.RDD[(String, comp.bio.aging.playground.extras.diamond.ProteinPrediction)] = MapPartitionsRDD[5] at filter at <console>:92
completeDF: org.apache.spark.sql.DataFrame = [transcript: string, uniref: string ... 1 more field]


In [ ]:
// Output location for the joined mapping table, assembled from a fixed suffix and data-set name.
val suffix = "_common_mapped_redundant.tab"
val name = s"/graywhale$suffix"
val filename = s"/pipelines/mappings/complete$name"

suffix: String = _common_mapped_redundant.tab
name: String = /graywhale_common_mapped_redundant.tab
filename: String = /pipelines/mappings/complete/graywhale_common_mapped_redundant.tab


In [ ]:
// Subset of the id-mapping columns that are carried into the join.
val mappingDF = mapping
  .toDF()
  .select(
    "uniprot_ac",
    "uniprot_id",
    "entrez",
    "refSeq",
    "gi",
    "go",
    "uniref90",
    "uniparc",
    "taxon",
    "pubmed",
    "embl"
  )

// Attach mapping rows to each complete prediction by matching its best hit id (`uniref`)
// against the mapping's `uniref90` column; the result is cached for reuse.
val completedMapped = mappingDF
  .join(completeDF, completeDF("uniref") === mappingDF("uniref90"))
  .cache()

mappingDF: org.apache.spark.sql.DataFrame = [uniprot_ac: string, uniprot_id: string ... 9 more fields]
completedMapped: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [uniprot_ac: string, uniprot_id: string ... 12 more fields]


In [ ]:
// Persist the joined table as tab-separated text with a header row.
completedMapped.write
  .option("sep", "\t")
  .option("header", "true")
  .csv(filename)

In [ ]:
/**
  * One transcript joined with a subset of its id-mapping columns.
  *
  * `taxon`, `pubmed` and `go` hold ';'-separated lists; the lazy sets below expose them
  * as de-duplicated collections.
  */
case class MappedJoined(transcript: String, uniref90: String, taxon: String, pubmed: String, go: String, entrez: String, uniprot_id: String)
  {
    self =>

    lazy val pubmeds: Set[String] = pubmed.split(";").toSet
    lazy val goes: Set[String] = go.split(";").toSet
    lazy val taxons: Set[String] = taxon.split(";").toSet

    /**
      * Combines this record with `other`.
      *
      * `taxon`, `pubmed` and `go` become "this;other"; every remaining field
      * (`transcript`, `uniref90`, `entrez`, `uniprot_id`) is taken from `other`.
      *
      * @param other record to merge with this one
      * @return a copy of `other` with the three list fields concatenated
      */
    def merge(other: MappedJoined): MappedJoined = other.copy(
      // Named arguments are essential: positional arguments to `copy` would bind to the
      // first three fields (transcript, uniref90, taxon) and corrupt the record.
      taxon = self.taxon + ";" + other.taxon,
      pubmed = self.pubmed + ";" + other.pubmed,
      go = self.go + ";" + other.go
    )
  }


defined class MappedJoined


In [ ]:
// Collapse the joined rows to one MappedJoined per transcript: each row becomes a
// (transcript, MappedJoined) pair and pairs sharing a transcript are folded with `merge`.
// The result is persisted in memory, spilling to disk when needed.
// NOTE(review): `joined` is not defined in the cells shown here — confirm which DataFrame it refers to.
val smallJoined = joined.rdd
  .map { row =>
    def field(column: String): String = row.getAs[String](column)

    val record = MappedJoined(
      field("transcript"),
      field("uniref90"),
      field("taxon"),
      field("pubmed"),
      field("go"),
      field("entrez"),
      field("uniprot_id")
    )
    (field("transcript"), record)
  }
  .reduceByKey((left, right) => left.merge(right))
  .persist(StorageLevel.MEMORY_AND_DISK)
  

smallJoined: org.apache.spark.rdd.RDD[(String, MappedJoined)] = ShuffledRDD[57] at reduceByKey at <console>:119


In [ ]:
// Row counts before the join, after the join, and after collapsing to one record per transcript.
(completeDF.count(), joined.count(), smallJoined.count())

org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 28.0 failed 4 times, most recent failure: Lost task 0.3 in stage 28.0 (TID 437, 10.0.2.3, executor 0): java.io.IOException: org.apache.spark.SparkException: Failed to get broadcast_25_piece0 of broadcast_25
	at org.apache.spark.util.Utils$.tryOrIOException(Utils.scala:1310)
	at org.apache.spark.broadcast.TorrentBroadcast.readBroadcastBlock(TorrentBroadcast.scala:206)
	at org.apache.spark.broadcast.TorrentBroadcast._value$lzycompute(TorrentBroadcast.scala:66)
	at org.apache.spark.broadcast.TorrentBroadcast._value(TorrentBroadcast.scala:66)
	at org.apache.spark.broadcast.TorrentBroadcast.getValue(TorrentBroadcast.scala:96)
	at org.apache.spark.broadcast.Broadcast.value(Broadcast.scala:70)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:86)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:53)
	at org.apache.spark.scheduler.Task.run(Task.scala:108)
	at org

In [ ]:
// Peek at the first ten records of `features`.
// NOTE(review): `features` is not defined in the cells shown here — confirm where it is loaded.
features.rdd.take(10)

res11: Array[org.bdgenomics.formats.avro.Feature] = Array({"featureId": "Gene.10639::c10000_g1_i1::g.10639", "name": "ORF%20type%3A5prime_partial%20len%3A112%20%28-%29%2Cscore%3D25.49%2CUniRef90_UPI000440677B%7C100.0%7C2.8e-56%2CUniRef90_UPI0009517C74%7C99.1%7C6.2e-56%2CUniRef90_L5JU14%7C99.1%7C8.1e-56%2CUniRef90_UPI0007689164%7C99.1%7C8.1e-56%2CUniRef90_UPI00042CD8A7%7C98.2%7C2.3e-55%2CUniRef90_K9KD70%7C97.3%7C3.1e-55%2CUniRef90_F7DFP8%7C97.3%7C3.1e-55%2CUniRef90_UPI0003290704%7C96.4%7C3.1e-55%2CUniRef90_UPI00033436D2%7C96.4%7C4.0e-55%2CUniRef90_UPI00057BC7E4%7C97.3%7C5.2e-55%2CUniRef90_UPI0007047ADE%7C97.3%7C5.2e-55%2CUniRef90_UPI0009453B6C%7C96.4%7C5.2e-55%2CUniRef90_E2QZ15%7C96.4%7C8.9e-55%2CUniRef90_E1BDD1%7C97.3%7C1.5e-54%2CUniRef90_G3MXA6%7C97.3%7C1.5e-54%2CUniRef90_A0A287BJY9%7C...

In [ ]:
// List the distinct feature types present in `features`.
features.rdd
  .map(feature => feature.getFeatureType)
  .distinct()
  .collect()
  .toList

res16: List[String] = List(mRNA, CDS, three_prime_UTR, exon, gene, five_prime_UTR)
