# Transcripts<br>


This notebook converts transcript-to-gene mappings and transcript expression values (TPM) into RDF quads and saves them as files for import into GraphDB.<br>




In [1]:
import org.apache.spark.rdd._
import org.apache.spark.sql._
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.functions._
import group.research.aging.spark.extensions._
import group.research.aging.spark.extensions.functions._
import group.research.aging.spark.extensions.functions.ConcatenateString
import group.research.aging.spark.extensions.functions.Concatenate
import ammonite.ops._
import ammonite.ops.ImplicitWd._

In [2]:
import better.files._
import File._
import java.io.{File => JFile}

RDF

In [4]:
// Base namespaces used to build IRIs for project resources, Ensembl entities and samples.
val prefix = "http://aging-research.group/resource/"
val prefix_ensembl = "http://rdf.ebi.ac.uk/resource/ensembl/"
val prefix_samples = "http://aging-research.group/samples/"

// Each helper replaces spaces in the local name with underscores, prepends
// the matching namespace and wraps the result in angle brackets.
def samp(str: String) = s"<${prefix_samples}${str.replace(" ", "_")}>"
def pref(str: String) = s"<${prefix}${str.replace(" ", "_")}>"
def ens(str: String) = s"<${prefix_ensembl}${str.replace(" ", "_")}>"

// Taxon IRI: the id is appended after "taxon#" inside the Ensembl namespace.
def tax(str: String) = ens(s"taxon#${str}")
// Renders `s` as a double-quoted RDF string literal; a null input yields the empty string.
// The backslash is escaped FIRST so that the backslashes introduced by the later
// replacements are not escaped a second time. Quotes, line feeds and carriage returns
// are escaped as well, since none of them may appear raw inside a quoted literal.
def str(s: String) =
  if (s == null) ""
  else "\"" + s
    .replace("\\", "\\\\")
    .replace("\"", "\\\"")
    .replace("\n", "\\n")
    .replace("\r", "\\r") + "\""

In [5]:
// Builds an RDF term from a prefix and a value:
//  - prefix starting with "xsd:"  -> typed literal   "value"^^xsd:type
//  - prefix containing "http"     -> full IRI        <prefixvalue>
//  - anything else                -> prefixed name   prefixvalue
// The typed-literal branch previously emitted a stray closing quote after the
// datatype ("value"^^xsd:type"), which is not a valid literal; it is removed here.
def pre(prefix: String, str: String) =
    if (prefix.startsWith("xsd:")) "\"" + str + "\"^^" + prefix
    else if (prefix.contains("http")) "<" + prefix + str + ">"
    else prefix + str

In [6]:
// Spark UDF factory: prepends `pref` to the value after replacing spaces with underscores.
def u_uri(pref: String) = udf[String, String] { str => pref + str.replace(" ", "_") }

// Spark UDF: renders the value as an xsd:string typed literal.
// The backslash must be escaped before anything else: the previous order escaped "\n"
// first and then doubled the backslash it had just inserted, turning a line feed
// into a literal backslash followed by 'n'.
val u_string = udf[String, String] { str =>
  val escaped = str
    .replace("\\", "\\\\")
    .replace("\"", "\\\"")
    .replace("\n", "\\n")
    .replace("\r", "\\r")
  "\"" + escaped + "\"" + "^^<http://www.w3.org/2001/XMLSchema#string>"
}

// Spark UDFs: wrap the value, unchanged, in an xsd:double / xsd:integer typed literal.
val u_double = udf[String, String] { str => "\"" + str + "\"" + "^^<http://www.w3.org/2001/XMLSchema#double>" }
val u_integer = udf[String, String] { str => "\"" + str + "\"" + "^^<http://www.w3.org/2001/XMLSchema#integer>" }

// Spark UDFs: build an IRI in the Ensembl / project resource namespace,
// replacing spaces in the value with underscores.
val u_ens = udf[String, String] { str => s"<http://rdf.ebi.ac.uk/resource/ensembl/${str.replace(" ", "_")}>" }
val u_base = udf[String, String] { str => s"<http://aging-research.group/resource/${str.replace(" ", "_")}>" }

## Load transcript maps




In [8]:
//val anage = spark.readTSV("/data/databases/anage/anage_data.tsv", header = true).withColumn("scientific_name", concat($"Genus", lit(" "), $"Species"))
//anage.show()
//val anage_animals = spark.readTSV("/data/ensembl/99/view_animals_anage.tsv", header = true).withColumnRenamed("latin_name", "scientific_name")
//anage_animals.show(10,10000)
// Load the species table (with header) and expose its "latin_name" column
// as "scientific_name"; then preview the first rows.
val species = spark
  .readTSV("/data/ensembl/99/ensembl_anage_vertebrates.tsv", header = true)
  .withColumnRenamed(existingName = "latin_name", newName = "scientific_name")
species.show(numRows = 10, truncate = 10000)

+--------------------+--------------------------------+-----------+-----------+------------------+-----------------------+-----+-----+------+-------+----------------+-----------+--------------+-----------+--------------+--------------------+------------------+-------+-----------+----------------+--------------------+--------------+----------------+--------------+-----------+
|         common_name|                 scientific_name|taxonomy_id|   assembly|assembly_accession|              genebuild|class|order|family|species|maximum_lifespan|body_mass_g|metabolic_rate|temperature|gestation_days|female_maturity_days|male_maturity_days|weaning|litter_size|litters_per_year|inter_birth_interval|birth_weight_g|weaning_weight_g|adult_weight_g|growth_rate|
+--------------------+--------------------------------+-----------+-----------+------------------+-----------------------+-----+-----+------+-------+----------------+-----------+--------------+-----------+--------------+--------------------+---

In [9]:
// Root folder whose sub-folders are scanned for GTF annotation files.
val path = File("/data/ensembl/99/species")

// Map: capitalized name of the folder holding the GTF -> DataFrame(transcript, gene),
// read from the "_tx2gene.tsv" file sitting next to each GTF.
// GTF files whose name contains "abinitio" or "hapl_scaff" are skipped.
val tx2gene = path.children
  .flatMap(_.children)
  .filter { c =>
    c.name.endsWith(".gtf") && !c.name.contains("abinitio") && !c.name.contains("hapl_scaff")
  }
  .map { gtf =>
    val mappingFile = gtf.pathAsString.replace(".gtf", "_tx2gene.tsv")
    gtf.parent.name.capitalize -> spark.readTSV(mappingFile).toDF("transcript", "gene")
  }
  .toMap[String, DataFrame]

// Keep one entry around as a sample and preview it.
val h = tx2gene.head
h match {
  case (speciesName, mappingDf) =>
    println(speciesName)
    mappingDf.show(10, 1000)
}

Otolemur_garnettii
+------------------+------------------+
|        transcript|              gene|
+------------------+------------------+
|ENSOGAT00000012158|ENSOGAG00000012155|
|ENSOGAT00000031338|ENSOGAG00000033509|
|ENSOGAT00000026645|ENSOGAG00000030153|
|ENSOGAT00000031633|ENSOGAG00000034131|
|ENSOGAT00000030922|ENSOGAG00000033908|
|ENSOGAT00000015494|ENSOGAG00000015488|
|ENSOGAT00000030131|ENSOGAG00000029906|
|ENSOGAT00000004627|ENSOGAG00000004624|
|ENSOGAT00000031760|ENSOGAG00000029208|
|ENSOGAT00000029851|ENSOGAG00000032266|
+------------------+------------------+
only showing top 10 rows



In [10]:
// Turns a (transcript, gene) table into quads of the form
//   <ensembl gene> <.../resource/has_transcript> <ensembl transcript> <.../samples/> .
// The `species` argument is accepted but not used by the body.
def tx2quad(df: DataFrame, species: String): DataFrame =
    df.select(
      u_ens($"gene").as("subject"),
      u_base(lit("has_transcript")).as("property"),
      u_ens($"transcript").as("object"),
      lit("<http://aging-research.group/samples/> .").as("context")
    )

// Convenience overload for a (species name, mapping table) pair.
def tx2quad(tp: (String, DataFrame)): DataFrame = tp match {
    case (speciesName, mappingDf) => tx2quad(mappingDf, speciesName)
}

tx2quad(h).show(10, 1000)

+----------------------------------------------------------+-----------------------------------------------------+----------------------------------------------------------+----------------------------------------+
|                                                   subject|                                             property|                                                    object|                                 context|
+----------------------------------------------------------+-----------------------------------------------------+----------------------------------------------------------+----------------------------------------+
|<http://rdf.ebi.ac.uk/resource/ensembl/ENSOGAG00000012155>|<http://aging-research.group/resource/has_transcript>|<http://rdf.ebi.ac.uk/resource/ensembl/ENSOGAT00000012158>|<http://aging-research.group/samples/> .|
|<http://rdf.ebi.ac.uk/resource/ensembl/ENSOGAG00000033509>|<http://aging-research.group/resource/has_transcript>|<http://rdf.ebi.ac.uk/reso

# Expressions



In [12]:
import org.apache.spark.sql._

// Context (graph) term for the samples namespace, followed by the " ." statement terminator.
val cont = s"<${prefix_samples}> ."
// Sample IRI; unlike `samp`, the local name is used as-is (spaces are not replaced).
def sam(str: String) = "<" + prefix_samples + str + ">"
// Renders a Double as an xsd:double typed literal using its default string form.
def num(n: Double) = {
  val lexical = n.toString
  s"${'"'}${lexical}${'"'}^^<http://www.w3.org/2001/XMLSchema#double>"
}


In [13]:
// Sample index: nulls in string columns become "N/A"; only rows that have an index
// and are RNA-Seq libraries are kept, ordered by organism (descending) and library layout.
val samples = spark
  .readTSV("/data/samples/species/samples_index.tsv", header=true)
  .na.fill("N/A")
  .filter($"index" =!= "N/A" && $"library_strategy" === "RNA-Seq")
  .orderBy($"organism".desc, $"library_layout")
samples.show(numRows = 10, truncate = 10000)

+-----------+-----------+----------+----------------------+-----+------------+-------------------+----------------+--------------+-----------------+---------+---------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------+----------------+----+-----+--------+--------------+--------------------------------------------+---------------------------------------------------------

In [14]:
// One Dataset(run, transcript, TPM) per sample: the (run, transcripts) pairs are collected
// to the driver, de-duplicated by run via toMap, and each file named in the "transcripts"
// column is read and tagged with its run id.
// NOTE(review): the explicit genericArrayOps call is retained as written — it may be
// there to side-step an implicit ambiguity; confirm before simplifying.
val runMap = genericArrayOps(
    samples.select("run", "transcripts").as[(String, String)].collect
  ).toMap.map { case (run, quantPath) =>
    spark.readTSV(quantPath)
      .toDF("transcript", "TPM")
      .withColumn("run", lit(run))
      .select("run", "transcript", "TPM")
      .as[(String, String, Double)]
  }

java.lang.InterruptedException: Execution was interrupted by the user

In [15]:
// Preview the first per-run expression table.
runMap.head.show(numRows = 10, truncate = 1000)

+----------+--------------------+---------+
|       run|          transcript|      TPM|
+----------+--------------------+---------+
|SRR2040645|ENSCCET00000000012.1|15.003977|
|SRR2040645|ENSCCET00000000003.1|75.039963|
|SRR2040645|ENSCCET00000000007.1| 0.203121|
|SRR2040645|ENSCCET00000000047.1|77.176479|
|SRR2040645|ENSCCET00000000005.1|      0.0|
|SRR2040645|ENSCCET00000000092.1|      0.0|
|SRR2040645|ENSCCET00000000174.1| 0.771706|
|SRR2040645|ENSCCET00000000010.1|      0.0|
|SRR2040645|ENSCCET00000000154.1| 0.038792|
|SRR2040645|ENSCCET00000000006.1|72.543704|
+----------+--------------------+---------+
only showing top 10 rows



In [16]:
// Concatenate all per-run tables into one Dataset and write it out as a single TSV.
val joinedRuns = runMap.reduce(_ union _)
joinedRuns
  .toDF()
  .writeTSV("/data/databases/graphdb/import/transcript_expressions.tsv")

parts of /data/databases/graphdb/import/transcript_expressions.tsv merged!


/data/databases/graphdb/import/transcript_expressions.tsv

In [17]:
// Same union-and-write step as the previous cell (the cell appears twice in the notebook).
val joinedRuns = runMap.reduce((left, right) => left.union(right))
joinedRuns
  .toDF()
  .writeTSV("/data/databases/graphdb/import/transcript_expressions.tsv")

In [18]:
import org.apache.spark.storage.StorageLevel
// Reload the merged expression table as typed (run, transcript, TPM) rows.
// NOTE(review): the positional `true` is presumably the header flag — confirm against readTSV.
val transcriptExpressions = spark
  .readTSV("/data/databases/graphdb/import/transcript_expressions.tsv", true)
  .as[(String, String, Double)]
transcriptExpressions.show(numRows = 10)

+----------+--------------------+---------+
|       run|          transcript|      TPM|
+----------+--------------------+---------+
|SRR2040645|ENSCCET00000000012.1|15.003977|
|SRR2040645|ENSCCET00000000003.1|75.039963|
|SRR2040645|ENSCCET00000000007.1| 0.203121|
|SRR2040645|ENSCCET00000000047.1|77.176479|
|SRR2040645|ENSCCET00000000005.1|      0.0|
|SRR2040645|ENSCCET00000000092.1|      0.0|
|SRR2040645|ENSCCET00000000174.1| 0.771706|
|SRR2040645|ENSCCET00000000010.1|      0.0|
|SRR2040645|ENSCCET00000000154.1| 0.038792|
|SRR2040645|ENSCCET00000000006.1|72.543704|
+----------+--------------------+---------+
only showing top 10 rows



In [19]:
   
// Expands every (run, transcript, TPM) row into four quads:
//   1. the transcript IRI typed as Transcript
//   2. the per-transcript expression property typed as Transcript_expression
//   3. expression property --expression_of--> transcript
//   4. SRA run IRI --expression property--> TPM as an xsd:double literal
// All quads carry the shared `cont` context term.
def transcript_expression_quads(df: Dataset[(String,String, Double)]): Dataset[(String, String, String, String)] = {
    val rdfType = "<http://www.w3.org/1999/02/22-rdf-syntax-ns#type>"
    val quads = df.flatMap { row =>
        val (run, transcript, tpm) = row
        val exp = samp("has_" + transcript + "_transcript_expression")
        val transcriptIri = ens(transcript)
        val runIri = s"<https://www.ncbi.nlm.nih.gov/sra/${run}>"
        List(
            (transcriptIri, rdfType, "<http://aging-research.group/resource/Transcript>", cont),
            (exp, rdfType, "<http://aging-research.group/resource/Transcript_expression>", cont),
            (exp, "<http://aging-research.group/resource/expression_of>", transcriptIri, cont),
            (runIri, exp, num(tpm), cont)
        )
    }
    quads
      .toDF("subject", "property", "object", "context")
      .as[(String, String, String, String)]
}

In [20]:
// Build the quads for every expression row and preview them.
val all_expressions = transcript_expression_quads(transcriptExpressions)
all_expressions.show(numRows = 10, truncate = 1000)

+------------------------------------------------------------------------------------+------------------------------------------------------------------------------------+------------------------------------------------------------+----------------------------------------+
|                                                                             subject|                                                                            property|                                                      object|                                 context|
+------------------------------------------------------------------------------------+------------------------------------------------------------------------------------+------------------------------------------------------------+----------------------------------------+
|                        <http://rdf.ebi.ac.uk/resource/ensembl/ENSCCET00000000012.1>|                                   <http://www.w3.org/1999/02/22-rdf-syntax-ns#type>|       

In [21]:
// Write the expression quads to the GraphDB import folder as a .nq file without a header row.
// NOTE(review): `rewrite = true` presumably overwrites an existing file — confirm against writeTSV.
all_expressions.toDF.writeTSV("/data/databases/graphdb/import/transcript_expressions.nq", header = false, rewrite = true)

parts of /data/databases/graphdb/import/transcript_expressions.nq merged!


/data/databases/graphdb/import/transcript_expressions.nq

### **Bugfix in transcript names**



In [23]:
// Drops everything from the last '.' onwards (e.g. a trailing ".3");
// strings without a dot are returned unchanged.
def undot(s: String): String = {
  val cut = s.lastIndexOf(".")
  if (cut == -1) s else s.substring(0, cut)
}
// Spark UDF wrapper around `undot`.
val u_undot = udf[String, String](undot _)

// For every distinct transcript id in the expression table, emit
//   <ensembl id-without-suffix> owl:sameAs <ensembl id-as-recorded> <.../samples/> .
val tids = transcriptExpressions
  .select($"transcript")
  .distinct()
  .select(
    u_ens(u_undot($"transcript")).as("subject"),
    lit("<http://www.w3.org/2002/07/owl#sameAs>").as("property"),
    u_ens($"transcript").as("object"),
    lit("<http://aging-research.group/samples/> .").as("context")
  )
tids.show(numRows = 20, truncate = 1000)

+----------------------------------------------------------+--------------------------------------+------------------------------------------------------------+----------------------------------------+
|                                                   subject|                              property|                                                      object|                                 context|
+----------------------------------------------------------+--------------------------------------+------------------------------------------------------------+----------------------------------------+
|<http://rdf.ebi.ac.uk/resource/ensembl/ENSCCET00000000058>|<http://www.w3.org/2002/07/owl#sameAs>|<http://rdf.ebi.ac.uk/resource/ensembl/ENSCCET00000000058.1>|<http://aging-research.group/samples/> .|
|<http://rdf.ebi.ac.uk/resource/ensembl/ENSCCET00000000231>|<http://www.w3.org/2002/07/owl#sameAs>|<http://rdf.ebi.ac.uk/resource/ensembl/ENSCCET00000000231.1>|<http://aging-research.group/sam

In [24]:
// Write the owl:sameAs quads to the GraphDB import folder as a .nq file without a header row.
// NOTE(review): `rewrite = true` presumably overwrites an existing file — confirm against writeTSV.
tids.writeTSV("/data/databases/graphdb/import/transcript_stable.nq", header = false, rewrite = true)

parts of /data/databases/graphdb/import/transcript_stable.nq merged!


/data/databases/graphdb/import/transcript_stable.nq

Bugfix in transcript names<br>

Extracting sequences<br>

In [27]:
// Spot check: list recorded ids that contain this transcript id.
transcriptExpressions
  .select("transcript")
  .filter($"transcript".contains("ENSGGOT00000016465"))
  .show(numRows = 10, truncate = 1000)

+--------------------+
|          transcript|
+--------------------+
|ENSGGOT00000016465.3|
|ENSGGOT00000016465.3|
|ENSGGOT00000016465.3|
|ENSGGOT00000016465.3|
|ENSGGOT00000016465.3|
|ENSGGOT00000016465.3|
|ENSGGOT00000016465.3|
|ENSGGOT00000016465.3|
|ENSGGOT00000016465.3|
|ENSGGOT00000016465.3|
+--------------------+
only showing top 10 rows



In [28]:
// Column names applied, in order, to the id-mapping table, which is read without a header.
val mapping_cols = Seq(
  "UniProtKB-AC", "UniProtKB-ID", "Entrez", "RefSeq", "GI", "PDB", "GO",
  "UniRef100", "UniRef90", "UniRef50", "UniParc", "PIR",
  "NCBI-taxon", "MIM", "UniGene", "PubMed",
  "EMBL", "EMBL-CDS", "Ensembl", "Ensembl_TRS", "Ensembl_PRO", "Additional PubMed"
)
val mapping = spark
  .readTSV("/data/indexes/uniprot/idmapping_selected.tab")
  .toDF(mapping_cols: _*)
mapping.limit(20).show(numRows = 20, truncate = 1000)

+------------+------------+-------+-----------+-------------------------------+----+----------------------------------+----------------+---------------+---------------+-------------+----+----------+----+-------+------------------+--------+----------+-------+-----------+-----------+-----------------+
|UniProtKB-AC|UniProtKB-ID| Entrez|     RefSeq|                             GI| PDB|                                GO|       UniRef100|       UniRef90|       UniRef50|      UniParc| PIR|NCBI-taxon| MIM|UniGene|            PubMed|    EMBL|  EMBL-CDS|Ensembl|Ensembl_TRS|Ensembl_PRO|Additional PubMed|
+------------+------------+-------+-----------+-------------------------------+----+----------------------------------+----------------+---------------+---------------+-------------+----+----------+----+-------+------------------+--------+----------+-------+-----------+-----------+-----------------+
|      Q6GZX4|  001R_FRG3G|2947773|YP_031579.1|             81941549; 49237298|null|             

In [29]:
// Distinct transcript ids with the part after the last dot removed; cached, then counted.
val stable = transcriptExpressions
  .select(u_undot($"transcript"))
  .distinct()
  .cache()
stable.count()

2578056

In [30]:
// Spark UDF: strips the Ensembl namespace prefix and every '>' from an IRI string,
// leaving the bare transcript id.
val u_trs = udf[String, String] { iri =>
  iri.replace("<http://rdf.ebi.ac.uk/resource/ensembl/", "").replace(">", "")
}

// Distinct bare transcript ids taken from the "?transcript" column of each isoform table.
val pro = spark.readTSV("/data/species/pro_isoforms.tsv", true)
  .select(u_trs($"?transcript").as("Ensembl_TRS"))
  .distinct()
val anti = spark.readTSV("/data/species/anti_isoforms.tsv", true)
  .select(u_trs($"?transcript").as("Ensembl_TRS"))
  .distinct()

println(anti.count())
anti.show(numRows = 10, truncate = 1000)

1778
+----------------------+
|           Ensembl_TRS|
+----------------------+
|MGP_CAROLIEiJ_T0093290|
|    ENSPCIT00000004040|
|    ENSPMJT00000007695|
|    ENSPPAT00000058388|
|    ENSMNET00000041990|
|MGP_CAROLIEiJ_T0057794|
|    ENSOGAT00000008354|
|    ENSGGOT00000042588|
|    ENSMMUT00000102137|
|    ENSMICT00000047385|
+----------------------+
only showing top 10 rows



In [31]:
// Attach the id-mapping columns to the "pro" transcript ids (inner join on Ensembl_TRS).
// NOTE(review): other columns of the mapping table (e.g. GI) hold "; "-separated lists;
// if Ensembl_TRS does too, this equality join misses multi-id rows — confirm.
val pro_uni = pro.join(mapping, usingColumns = Seq("Ensembl_TRS"))
print(pro_uni.count())
pro_uni.show(numRows = 10, truncate = 1000)

1149+------------------+------------+------------+---------+--------------------------------------------------------------+---------------------------------------------------------------+----+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------------+---------------+---------------+-------------+----+----------+----+-------+----------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [32]:
// Attach the id-mapping columns to the "anti" transcript ids (inner join on Ensembl_TRS).
// NOTE(review): other columns of the mapping table (e.g. GI) hold "; "-separated lists;
// if Ensembl_TRS does too, this equality join misses multi-id rows — confirm.
val anti_uni = anti.join(mapping, usingColumns = Seq("Ensembl_TRS"))
anti_uni.show(numRows = 10, truncate = 1000)

+------------------+------------+------------+---------+------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------------+---------------+---------------+-------------+------+----------+----+-------+----------------------------------------------------------+----------------------------------------------------

In [33]:
// Save both joined tables next to the isoform inputs.
// NOTE(review): the positional `true` is presumably the header flag — confirm against writeTSV.
pro_uni.writeTSV("/data/species/pro_isoforms_uniprot_mapping.tsv", true)
anti_uni.writeTSV("/data/species/anti_isoforms_uniprot_mapping.tsv", true)

parts of /data/species/pro_isoforms_uniprot_mapping.tsv merged!
parts of /data/species/anti_isoforms_uniprot_mapping.tsv merged!


/data/species/anti_isoforms_uniprot_mapping.tsv