In [ ]:
import org.apache.spark.sql.{DataFrame, Encoders, SparkSession}
import org.apache.spark.sql.types.StructType
import org.apache.spark.storage.StorageLevel

import scala.reflect.runtime.universe._
import comp.bio.aging.playground.extras.uniprot._
import org.bdgenomics.adam.rdd.ADAMContext._
import org.bdgenomics.adam.rdd.ADAMContextExtensions._

import org.apache.spark.sql.{DataFrame, Encoders, SparkSession}
import org.apache.spark.sql.types.StructType
import org.bdgenomics.adam.rdd.ADAMContext._
import org.bdgenomics.adam.rdd.ADAMContextExtensions._
import scala.reflect.runtime.universe._
import comp.bio.aging.playground.extras.uniprot._
import org.apache.spark.storage.StorageLevel


In [ ]:
def sparkHadoopConf(sc: SparkContext, acountName: String, accountKey: String) = {
  sc.hadoopConfiguration.set("fs.azure", "org.apache.hadoop.fs.azure.NativeAzureFileSystem")
  sc.hadoopConfiguration.set("fs.azure.account.key." + acountName + ".blob.core.windows.net", accountKey)
  sc
}

sparkHadoopConf: (sc: org.apache.spark.SparkContext, acountName: String, accountKey: String)org.apache.spark.SparkContext


In [ ]:
def azurize(container: String, accountName: String, blobFile: String): String = "wasbs://"+container+"@"+accountName+".blob.core.windows.net/"+blobFile 

def writeText2Azure[T]( rdd: RDD[T], container: String, accountName: String, blobFile: String ): String =
{
  val url = azurize(container, accountName, blobFile)
  rdd.saveAsTextFile(url)
  url
}

def writeTsv2Azure( df: DataFrame, container: String, accountName: String, blobFile: String ): String =
{
  val url = azurize(container, accountName, blobFile)
  df.write.option("sep","\t").option("header","true").csv(url)
  url
}

azurize: (container: String, accountName: String, blobFile: String)String
writeText2Azure: [T](rdd: org.apache.spark.rdd.RDD[T], container: String, accountName: String, blobFile: String)String
writeTsv2Azure: (df: org.apache.spark.sql.DataFrame, container: String, accountName: String, blobFile: String)String


In [ ]:
val connString = "DefaultEndpointsProtocol=https;AccountName=pipelines1;AccountKey=;EndpointSuffix=core.windows.net"
val account = "pipelines1"
val key = ""
def az(path: String): String = azurize("storage", account, path)

connString: String = DefaultEndpointsProtocol=https;AccountName=pipelines1;AccountKey=;EndpointSuffix=core.windows.net
account: String = pipelines1
key: String = 
az: (path: String)String


In [ ]:
sparkHadoopConf(sparkContext, account, key)
  
val spark = SparkSession
  .builder()
  .appName("mapping_models")
  .getOrCreate()

spark: org.apache.spark.sql.SparkSession = org.apache.spark.sql.SparkSession@10db3fa5


In [ ]:
import org.apache.spark.sql.functions._
import spark.implicits._

val toDouble = udf[Double, String]( _.toDouble)

import org.apache.spark.sql.functions._
import spark.implicits._
toDouble: org.apache.spark.sql.expressions.UserDefinedFunction = UserDefinedFunction(<function1>,DoubleType,Some(List(StringType)))


In [ ]:
val base = "/batch/quant"

base: String = /batch/quant


In [ ]:
val mouse_liver_path = az(base + "/Mus musculus_totalRNA_GSM2927683_quant.sf")
val mouse_kidney_path = az(base + "/Mus musculus_totalRNA_GSM2927750_quant.sf")

val human_liver_path = az(base + "/Homo sapiens_totalRNA_GSM1698568_quant.sf")
val human_kidney_path = az(base + "/Homo sapiens_totalRNA_GSM1698570_quant.sf")

val cow_liver_path = az(base + "/Bos taurus_totalRNA_GSM2042593_quant.sf")
val cow_kidney_path = az(base + "/Bos taurus_totalRNA_GSM2042596_quant.sf")


mouse_liver_path: String = wasbs://storage@pipelines1.blob.core.windows.net//batch/quant/Mus musculus_totalRNA_GSM2927683_quant.sf
mouse_kidney_path: String = wasbs://storage@pipelines1.blob.core.windows.net//batch/quant/Mus musculus_totalRNA_GSM2927750_quant.sf
human_liver_path: String = wasbs://storage@pipelines1.blob.core.windows.net//batch/quant/Homo sapiens_totalRNA_GSM1698568_quant.sf
human_kidney_path: String = wasbs://storage@pipelines1.blob.core.windows.net//batch/quant/Homo sapiens_totalRNA_GSM1698570_quant.sf
cow_liver_path: String = wasbs://storage@pipelines1.blob.core.windows.net//batch/quant/Bos taurus_totalRNA_GSM2042593_quant.sf
cow_kidney_path: String = wasbs://storage@pipelines1.blob.core.windows.net//batch/quant/Bos taurus_totalRNA_GSM2042596_quant.sf


In [ ]:
def load_liver(path: String) = spark.readTSV(path, true).withColumn("liver", toDouble($"TPM")).drop("TPM").withColumnRenamed("Name","transcript")
def load_kidney(path: String) = spark.readTSV(path, true).withColumn("kidney", toDouble($"TPM")).drop("TPM").withColumnRenamed("Name","transcript")
def join_liver_kidney(liver: DataFrame, kidney: DataFrame): DataFrame =
    liver.join(kidney.withColumnRenamed("transcript", "kidney_transcript"), $"transcript" === $"kidney_transcript")
    .select($"transcript", $"liver", $"kidney", ( ($"liver" + $"kidney") / 2.0 ).as("avg_expression"))    
    .sort($"avg_expression".desc)

load_liver: (path: String)org.apache.spark.sql.DataFrame
load_kidney: (path: String)org.apache.spark.sql.DataFrame
join_liver_kidney: (liver: org.apache.spark.sql.DataFrame, kidney: org.apache.spark.sql.DataFrame)org.apache.spark.sql.DataFrame


In [ ]:
val mouse = join_liver_kidney(load_liver(mouse_liver_path), load_kidney(mouse_kidney_path))
val human = join_liver_kidney(load_liver(human_liver_path), load_kidney(human_kidney_path))
val cow = join_liver_kidney(load_liver(cow_liver_path), load_kidney(cow_kidney_path))

mouse: org.apache.spark.sql.DataFrame = [transcript: string, liver: double ... 2 more fields]
human: org.apache.spark.sql.DataFrame = [transcript: string, liver: double ... 2 more fields]
cow: org.apache.spark.sql.DataFrame = [transcript: string, liver: double ... 2 more fields]


In [ ]:
/*
def simplify(dataFrame: DataFrame): Dataset[(String, Double, Double, Double)] = dataFrame.map{ row=>
  val tran = row.getAs[String]("transcript")
  val tr = tran.substring(0,  Math.min(tran.indexOf('|'), tran.length))
  (tr.substring(0, Math.min(tr.indexOf('.'),tr.length)),	row.getAs[Double]("liver"), row.getAs[Double]("kidney"), row.getAs[Double]("avg_expression"))
}
*/

def simplify(dataFrame: DataFrame): Dataset[(String, Double, Double, Double)] = dataFrame.map{ row=>
  val tran = row.getAs[String]("transcript")
  val tr = tran.substring(0, tran.indexOf('|'))
  val transcript =  tr.substring(0, Math.min(tr.indexOf('.'),tr.length))
  (transcript,	(row.getAs[Double]("liver"), row.getAs[Double]("kidney"), row.getAs[Double]("avg_expression")))
}.rdd.reduceByKey{ case ((a1, b1, c1), (a2, b2,c2)) => (a1+a2, b1+b2, c1 + c2)}
    .map{ case (a, (b ,c, d)) => (a, b ,c ,d)}
  .toDS()

val human_simple = simplify(human).toDF("transcript", "liver", "kidney", "avg_expression")
val mouse_simple = simplify(mouse).toDF("transcript", "liver", "kidney", "avg_expression")
val cow_simple = simplify(cow).toDF("transcript", "liver", "kidney", "avg_expression")

simplify: (dataFrame: org.apache.spark.sql.DataFrame)org.apache.spark.sql.Dataset[(String, Double, Double, Double)]
human_simple: org.apache.spark.sql.DataFrame = [transcript: string, liver: double ... 2 more fields]
mouse_simple: org.apache.spark.sql.DataFrame = [transcript: string, liver: double ... 2 more fields]
cow_simple: org.apache.spark.sql.DataFrame = [transcript: string, liver: double ... 2 more fields]


In [ ]:
val headers = List("uniprot_ac", "uniprot_id", "entrez", "refSeq", "gi", "pdb", "go", 
  "uniref100", "uniref90", "uniref50", "uniparc", "pir", 
  "taxon", "mim", "unigene", "pubmed", "embl", "embl_cds", 
  "ensembl", /*"ensembl_trs"*/ "transcript", "ensembl_pro", "additional_pubmed") 

headers: List[String] = List(uniprot_ac, uniprot_id, entrez, refSeq, gi, pdb, go, uniref100, uniref90, uniref50, uniparc, pir, taxon, mim, unigene, pubmed, embl, embl_cds, ensembl, transcript, ensembl_pro, additional_pubmed)


In [ ]:
val columns = List("transcript", "uniref90", "go", "liver","kidney", "avg_expression", "uniprot_ac", "taxon", "uniprot_id", "entrez", "refSeq", "gi", "uniparc", "pubmed", "embl")

def joinMapping(mapping: Dataset[UniprotMapping], df: DataFrame): DataFrame = {
  val mdf = mapping.flatMap{ u=> if(u.ensembl_trs == null) Nil else  u.ensembl_trs.split(';').map(trs=>u.copy(ensembl_trs = trs)) }.toDF(headers:_*)
  mdf.join(df, "transcript").select(columns.head, columns.tail:_*)
}

columns: List[String] = List(transcript, uniref90, go, liver, kidney, avg_expression, uniprot_ac, taxon, uniprot_id, entrez, refSeq, gi, uniparc, pubmed, embl)
joinMapping: (mapping: org.apache.spark.sql.Dataset[comp.bio.aging.playground.extras.uniprot.UniprotMapping], df: org.apache.spark.sql.DataFrame)org.apache.spark.sql.DataFrame


In [ ]:
val human_tax_id = "9606"
val mouse_tax_id = "10090"
val cow_tax_id = "9913"

val ind = az("/indexes/uniprot")
val human_mapping = spark.readTypedTSV[UniprotMapping](ind + "/HUMAN_9606_idmapping_selected.tab")
val mouse_mapping = spark.readTypedTSV[UniprotMapping](ind + "/MOUSE_10090_idmapping_selected.tab")
val cow_mapping = spark.readTypedTSV[UniprotMapping](ind + s"/COW_${cow_tax_id}_idmapping_selected.tab")

val all_mapping =  spark.readTypedTSV[UniprotMapping]("file:///pipelines/indexes/uniprot/idmapping_selected.tab")

human_tax_id: String = 9606
mouse_tax_id: String = 10090
cow_tax_id: String = 9913
ind: String = wasbs://storage@pipelines1.blob.core.windows.net//indexes/uniprot
human_mapping: org.apache.spark.sql.Dataset[comp.bio.aging.playground.extras.uniprot.UniprotMapping] = [uniprot_ac: string, uniprot_id: string ... 20 more fields]
mouse_mapping: org.apache.spark.sql.Dataset[comp.bio.aging.playground.extras.uniprot.UniprotMapping] = [uniprot_ac: string, uniprot_id: string ... 20 more fields]
cow_mapping: org.apache.spark.sql.Dataset[comp.bio.aging.playground.extras.uniprot.UniprotMapping] = [uniprot_ac: string, uniprot_id: string ... 20 more fields]
all_mapping: org.apache.spark.sql.Dataset[comp.bio.aging.playground.extras.uniprot.UniprotMapping] = [uniprot_ac: string, uniprot_id: string ... 20...

In [ ]:
(human_mapping.count, mouse_mapping.count, cow_mapping.count)

res15: (Long, Long, Long) = (162191,83601,32206)


In [ ]:
import org.bdgenomics.adam.rdd.ADAMContextExtensions._
import comp.bio.aging.playground.extras.uniprot._
def writeTSV(dataFrame: DataFrame, path: String, header: Boolean = true, sep: String = "\t"): Unit =
      dataFrame.write.option("sep", sep).option("header",header).csv(path)
//val cow_mapping = all_mapping.filter(u=>u.taxon == cow_tax_id)
//writeTSV(cow_mapping.toDF().coalesce(1), ind + s"/COW_${cow_tax_id}_idmapping_selected.tab")

import org.bdgenomics.adam.rdd.ADAMContextExtensions._
import comp.bio.aging.playground.extras.uniprot._
writeTSV: (dataFrame: org.apache.spark.sql.DataFrame, path: String, header: Boolean, sep: String)Unit


In [ ]:
//val cow_mapping = all_mapping.filter(u=>u.taxon == cow_tax_id)
//writeTSV(cow_mapping.toDF().coalesce(1), ind + s"/COW_${cow_tax_id}_idmapping_selected.tab")

cow_mapping: org.apache.spark.sql.Dataset[comp.bio.aging.playground.extras.uniprot.UniprotMapping] = [uniprot_ac: string, uniprot_id: string ... 20 more fields]


In [ ]:
cow

res42: org.apache.spark.sql.DataFrame = [transcript: string, liver: double ... 2 more fields]


In [ ]:
val humanExpressions = joinMapping(human_mapping, human_simple)
val mouseExpressions = joinMapping(mouse_mapping, mouse_simple)
//val cowExpressions = joinMapping(cow_mapping, cow_simple)
//(68611,51014)
(humanExpressions.count, mouseExpressions.count)


humanExpressions: org.apache.spark.sql.DataFrame = [transcript: string, uniref90: string ... 13 more fields]
mouseExpressions: org.apache.spark.sql.DataFrame = [transcript: string, uniref90: string ... 13 more fields]
res48: (Long, Long) = (68611,51014)


In [ ]:
human_simple

res31: org.apache.spark.sql.DataFrame = [transcript: string, liver: double ... 2 more fields]


In [ ]:
writeTSV(humanExpressions.coalesce(1), az("/expressions/transcripts/" + "human"+"_transcripts_all.tab"), true)
writeTSV(mouseExpressions.coalesce(1), az("/expressions/transcripts/" + "mouse"+"_transcripts_all.tab"), true)

In [ ]:
//val go_inner = spark.readTSV(az("/expressions/go/gray_whale_with_bowhead_with_minke_with_NMR_small_full_inner.tsv"), true)
//val go_outer = spark.readTSV(az("/expressions/go/gray_whale_with_bowhead_with_minke_with_NMR_small_full_outer.tsv"), true)

go_inner: org.apache.spark.sql.DataFrame = [go: string, label: string ... 10 more fields]
go_outer: org.apache.spark.sql.DataFrame = [go: string, label: string ... 10 more fields]


In [ ]:
val go_outer_full = spark.readTSV(az("/expressions/go/gray_whale_with_bowhead_with_minke_with_NMR_large_full_outer.tsv"), true)

go_outer_full: org.apache.spark.sql.DataFrame = [go: string, label: string ... 14 more fields]


In [ ]:
val go_outer_small = spark.readTSV(az("/expressions/go/gray_whale_with_bowhead_with_minke_with_NMR_small_full_outer.tsv"), true)

go_outer_small: org.apache.spark.sql.DataFrame = [go: string, label: string ... 10 more fields]


In [ ]:
go_outer_counts.columns.map(v=> "\"" + v +"\"").mkString(", ")

res64: String = "go", "label", "gray_whale_liver", "bowhead_whale_liver", "minke_liver", "NMR_liver", "gray_whale_kidney", "bowhead_whale_kidney", "minke_kidney", "NMR_kidney", "type", "transcripts_count", "uniref90_count", "taxons_count", "gray_whale_avg"


In [ ]:
val count = udf[Int, String](_.split(";").size)
val go_outer_counts = go_outer_full.na.fill("")
    .withColumn("transcripts_count", count($"transcript"))
    .withColumn("uniref90_count", count($"uniref90"))
    .withColumn("taxons_count", count($"taxon"))
    .drop($"uniprot_ac").drop($"taxon").drop($"uniref90").drop($"transcript")
    .withColumn("gray_whale_avg", toDouble($"gray_whale_avg_expression"))
    .drop($"gray_whale_avg_expression")    
    .sort($"gray_whale_avg".desc)
    .select("go", "label", "gray_whale_avg", "transcripts_count", "uniref90_count", "taxons_count",  "type", "gray_whale_liver", 
            "bowhead_whale_liver", "minke_liver", "NMR_liver", "gray_whale_kidney", 
            "bowhead_whale_kidney", "minke_kidney", "NMR_kidney"
            )
    
    //$"gray_whale_avg_expression"
go_outer_counts


count: org.apache.spark.sql.expressions.UserDefinedFunction = UserDefinedFunction(<function1>,IntegerType,Some(List(StringType)))
go_outer_counts: org.apache.spark.sql.DataFrame = [go: string, label: string ... 13 more fields]
res67: org.apache.spark.sql.DataFrame = [go: string, label: string ... 13 more fields]


In [ ]:
go_outer_counts.coalesce(1).write.option("sep","\t").option("header","true")
.csv(az("/expressions/go/gray_whale_with_bowhead_with_minke_with_NMR_full_outer_counts.tsv"))


In [ ]:
def byGo(df: DataFrame, species: String) =
  df.select("go", "liver", "kidney").flatMap{row=>
    val g = row.getAs[String]("go")
    val goes = if(g==null) Array[String]() else g.split(";").map(go=>go.trim).filter(go=>go!="")
    goes.map{ go =>
      go -> (row.getAs[Double]("liver"), row.getAs[Double]("kidney"))
    }
  }.rdd.reduceByKey{ case ((l1, k1), (l2, k2)) => (l1 + l2, k1 + k2)}
    .map{ case (go, (liver, kidney)) => (go, liver, kidney)}
      .toDF("go", s"${species}_liver", s"${species}_kidney")


byGo: (df: org.apache.spark.sql.DataFrame, species: String)org.apache.spark.sql.DataFrame


In [ ]:
val goHuman = byGo(humanExpressions, "human")
val goMouse = byGo(mouseExpressions, "mouse")

goHuman: org.apache.spark.sql.DataFrame = [go: string, human_liver: double ... 1 more field]
goMouse: org.apache.spark.sql.DataFrame = [go: string, mouse_liver: double ... 1 more field]


In [ ]:
val go_inner_ext = go_inner.join(goHuman, Seq("go"), "inner").join(goMouse, Seq("go"), "inner").sort($"gray_whale_avg_expression".desc)
val go_outer_ext = go_outer.join(goHuman, Seq("go"), "full_outer").join(goMouse, Seq("go"), "full_outer").sort($"gray_whale_avg_expression".desc)


go_inner_ext: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [go: string, label: string ... 14 more fields]
go_outer_ext: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [go: string, label: string ... 14 more fields]


In [ ]:
writeTSV(go_inner_ext.coalesce(1), az("/expressions/go/gray_whale_with_bowhead_with_minke_with_NMR_with_human_with_mouse_small_full_inner.tsv"), true)
writeTSV(go_outer_ext.coalesce(1), az("/expressions/go/gray_whale_with_bowhead_with_minke_with_NMR_with_human_with_mouse_small_full_outer.tsv"), true)


In [ ]:
//SPLIT GO

In [ ]:
import net.sansa_stack.rdf.spark.io._
import org.apache.jena.riot.Lang

val input = az("/go/go.owl")

val lang = Lang.RDFXML
val triples = spark.rdf(lang)(input)
triples.take(5).foreach(println(_))

org.apache.spark.SparkException: Job aborted due to stage failure: Exception while getting task result: com.esotericsoftware.kryo.KryoException: java.lang.IndexOutOfBoundsException: Index: 102, Size: 27
Serialization trace:
fTargetNamespace (org.apache.xerces.impl.dv.xs.XSSimpleTypeDecl)
fBase (org.apache.xerces.impl.dv.xs.XSSimpleTypeDecl)
typeDeclaration (org.apache.jena.datatypes.xsd.impl.XSDBaseStringType)
dtype (org.apache.jena.graph.impl.LiteralLabelImpl)
label (org.apache.jena.graph.Node_Literal)
obj (org.apache.jena.graph.Triple)
  at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1517)
  at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1505)
  at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1504)
  at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
  at scala.collection.mutable.ArrayBu