# **n-quads**


Saving data to GraphDB as N-Quads.<br>




In [1]:
import org.apache.spark.rdd._
import org.apache.spark.sql._
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.functions._
import group.research.aging.spark.extensions._
import group.research.aging.spark.extensions.functions._
import group.research.aging.spark.extensions.functions.ConcatenateString
import group.research.aging.spark.extensions.functions.Concatenate
import ammonite.ops._
import ammonite.ops.ImplicitWd._

## Species loading



In [3]:
// NOTE(review): the read call below looks corrupted — `spark.read.pa samples.columnssa_lookup.parquet")`
// is not valid Scala. Presumably it was `spark.read.parquet("<...>_lookup.parquet")`; restore the real
// path from the original notebook before re-running this cell.
// The loaded frame gets null string cells replaced by "" and is typed as a 12-field tuple.
val species_n = spark.read.pa samples.columnssa_lookup.parquet").na.fill("")
.as[(java.lang.Integer, String, String, String, String, java.lang.Double, java.lang.Double,java.lang.Double, java.lang.Double,String,String,String)]
// Preview the first 10 rows, truncating cells only beyond 10000 characters.
species_n.show(10,10000)

+--------+------------------------+--------------------+------------------------------------------------+-----+--------+------+--------------+-------------+---------------+-----------+------------+
|taxon_id|         scientific_name|         common_name|                                             url|class|lifespan|mass_g|metabolic_rate|temperature_k|specimen_origin|sample_size|data_quality|
+--------+------------------------+--------------------+------------------------------------------------+-----+--------+------+--------------+-------------+---------------+-----------+------------+
| 2489341|     Strigops habroptila|              Kakapo|     https://www.ensembl.org/Strigops_habroptila| Aves|    60.0|  null|          null|         null|      captivity|      small|  acceptable|
|  132585|    Anser brachyrhynchus|   Pink-footed goose|    https://www.ensembl.org/Anser_brachyrhynchus| Aves|    40.9|  null|          null|         null|           wild|     medium|  acceptable|
|    9031|

In [4]:
// Namespace roots for the IRIs minted in this notebook.
val prefix = "http://aging-research.group/resource/"
val prefix_ensembl = "http://rdf.ebi.ac.uk/resource/ensembl/"
val prefix_samples = "http://aging-research.group/samples/"

// IRI builders: prepend a namespace, turn spaces into underscores and wrap the result in <...>.
def samp(str: String) = "<" + prefix_samples + str.replace(" ", "_") + ">"
def pref(str: String) = "<" + prefix + str.replace(" ", "_") + ">"
def ens(str: String) = "<" + prefix_ensembl + str.replace(" ", "_") + ">"

// Taxon IRI inside the Ensembl namespace, e.g. tax("9606") ends in "taxon#9606>".
def tax(str: String) = ens("taxon#" + str)

/** Renders `s` as a double-quoted literal; `null` becomes the empty string (no literal at all).
  *
  * Backslashes are escaped first so that the escapes added afterwards for quotes and
  * line breaks are not doubled; an unescaped backslash or raw line break would otherwise
  * corrupt the line-oriented output file.
  */
def str(s: String) =
  if (s == null) ""
  else {
    val escaped = s
      .replace("\\", "\\\\")
      .replace("\"", "\\\"")
      .replace("\n", "\\n")
      .replace("\r", "\\r")
    "\"" + escaped + "\""
  }

In [5]:
/** Builds at most one quad (subject, property, object, context).
  *
  * Returns `Nil` when any component is null or empty, when the subject is the `\N`
  * placeholder (bare or quoted), or when the object is `\N` or starts with the quoted
  * `"\N"`. Otherwise returns a single quad in the fixed "confidence/high" context,
  * whose last field already carries the terminating " .".
  */
def trip(s: String, p: String, o: String):List[(String, String, String, String)] = {
  val quotedPlaceholder = "\"\\N\""
  val anyAbsent = Seq(s, p, o).exists(v => v == null || v == "")
  if (anyAbsent) Nil
  else if (s == "\\N" || s == quotedPlaceholder) Nil
  else if (o == "\\N" || o.startsWith(quotedPlaceholder)) Nil
  else List((s, p, o, "<http://rdf.ebi.ac.uk/resource/ensembl/confidence/high> ."))
}

/** Species fact helper: states property `p` with value `o` about both the taxon IRI
  * and the scientific-name IRI.
  *
  * Returns `Nil` when the taxon id or the value is null; otherwise the concatenation
  * of the (possibly empty) quads produced by `trip` for each of the two subjects.
  */
def sp(taxon_id: java.lang.Integer, scientific_name: String)(p: String, o: Any) =
  if (taxon_id != null && o != null) {
    val aboutTaxon = trip(tax(taxon_id.toString), pref(p), o.toString)
    val aboutName = trip(pref(scientific_name), pref(p), o.toString)
    aboutTaxon ++ aboutName
  } else Nil

In [6]:
// Turns every species row into quads: the taxon <-> scientific-name links plus one pair
// of quads (taxon subject, name subject) for each available attribute.
val trs = species_n.flatMap{
    case (taxon_id,scientific_name,common_name,url,animal_class,
    lifespan,mass_g,metabolic_rate,temperature_k,
    specimen_origin,sample_size,data_quality) =>
        // String columns were null-filled with "", so "missing" means null or empty.
        def present(v: String): Option[String] = Option(v).filter(_.nonEmpty)
        // N-Quads has no bare numeric tokens: emit measurements as quoted xsd:double literals.
        def measured(v: java.lang.Double): Option[String] =
            Option(v).map(d => "\"" + d + "\"^^<http://www.w3.org/2001/XMLSchema#double>")
        // Quads for one property; nothing when the value is missing.
        def fact(p: String, o: Option[String]) =
            o.toList.flatMap(v => sp(taxon_id, scientific_name)(p, v))

        // Without a taxon id there is no subject to attach anything to
        // (calling taxon_id.toString would throw).
        if (taxon_id == null) Nil
        else
            trip(tax(taxon_id.toString), pref("has_scientific_name"), pref(scientific_name)) ++
            trip(pref(scientific_name), pref("has_taxon"), tax(taxon_id.toString)) ++
            fact("has_common_name", present(common_name).map(str)) ++
            fact("has_lifespan", measured(lifespan)) ++
            fact("has_temperature_kelvin", measured(temperature_k)) ++
            fact("has_mass_g", measured(mass_g)) ++
            fact("has_metabolic_rate", measured(metabolic_rate)) ++
            // Wrap only real values: an empty class/url would otherwise yield a bare
            // namespace IRI or "<>" that slips past trip's emptiness check.
            fact("is_animal_class", present(animal_class).map(ens)) ++
            fact("has_ensembl_url", present(url).map(u => "<" + u + ">"))
}.toDF("subject", "property", "object", "context")
trs.show(100,1000)

+--------------------------------------------------------------+-------------------------------------------------------------+--------------------------------------------------------------+---------------------------------------------------------+
|                                                       subject|                                                     property|                                                        object|                                                  context|
+--------------------------------------------------------------+-------------------------------------------------------------+--------------------------------------------------------------+---------------------------------------------------------+
|         <http://rdf.ebi.ac.uk/resource/ensembl/taxon#2489341>|   <http://aging-research.group/resource/has_scientific_name>|    <http://aging-research.group/resource/Strigops_habroptila>|<http://rdf.ebi.ac.uk/resource/ensembl/confidence/high> .|
|    <ht

In [7]:
// Writes the species quads to the GraphDB import folder. `writeTSV` is not a stock Spark method
// (presumably from the imported group.research.aging extensions); the positional `false` is
// presumably the `header` flag, as in the named-argument call used for samples.nq — confirm.
trs.writeTSV("/data/databases/graphdb/import/species.nq", false)

parts of /data/databases/graphdb/import/species.nt merged!


/data/databases/graphdb/import/species.nt

# Processing samples



In [9]:
// Sample index: nulls become "N/A", rows without an index or with a library strategy
// other than RNA-Seq are dropped, and the result is ordered by organism (descending)
// and then by library layout.
val samples = {
  val indexed = spark.readTSV("/data/samples/species/samples_index.tsv", header=true).na.fill("N/A")
  val rnaSeqOnly = indexed.filter($"index" =!= "N/A" && $"library_strategy" === "RNA-Seq")
  rnaSeqOnly.orderBy($"organism".desc, $"library_layout")
}
samples.show(10, 10000)

+-----------+-----------+----------+----------------------+-----+-----------+----------------------------+----------------+--------------+-----------------+---------+---------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------+----------------+----+-----+--------+--------------+---------------------------------------+-----------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------+--------------+---

In [63]:
// Number of sample rows that survived the filters; shown as the cell result.
samples.count()

617

In [12]:
// NCBI landing pages used as IRI namespaces.
val bioproject_prefix = "https://www.ncbi.nlm.nih.gov/bioproject/"
val experiment_prefix = "https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc="
val sra_prefix = "https://www.ncbi.nlm.nih.gov/sra/"

// BioProject accession -> IRI.
def bioproject(str: String) = s"<${bioproject_prefix}${str}>"
// Accessions containing "GSE" resolve against GEO, everything else against BioProject.
def experiment(str: String) = if(str.contains("GSE")) s"<${experiment_prefix}$str>" else bioproject(str)
// Run accession -> SRA IRI. This previously used the GEO prefix, leaving `sra_prefix`
// unused and disagreeing with the `u_run` UDF, which mints run IRIs under /sra/.
def run(str: String) = s"<${sra_prefix}${str}>"
// Predicate IRIs in the resource namespace.
val has_experiment: String = pref("has_experiment")
val has_run: String = pref("has_run")
val has_sample_name: String = pref("has_sample_name")
// NOTE(review): same IRI as `has_experiment` (singular) — possibly meant to be a distinct
// "has_experiments" predicate; confirm before relying on it.
val has_experiments: String = has_experiment


In [13]:
/** Renders `str` as an RDF term according to the kind of `prefix`:
  *  - a prefix starting with "xsd:" gives a typed literal `"str"^^prefix`;
  *  - a prefix containing "http" gives an IRI `<prefix + str>`;
  *  - anything else (including "") gives the plain concatenation `prefix + str`,
  *    which lets already-formatted terms pass through unchanged.
  *
  * The typed-literal branch used to end its triple-quoted string with four quote
  * characters, which appended a stray `"` after the datatype.
  */
def pre(prefix: String, str: String) =
    if (prefix.startsWith("xsd:")) "\"" + str + "\"^^" + prefix
    else if (prefix.contains("http")) "<" + prefix + str + ">"
    else prefix + str

In [14]:
// Column-level encoders (Spark UDFs) that turn raw cell text into RDF terms.

// Prefixes the value with `pref` after turning spaces into underscores (no angle brackets added).
def u_uri(pref: String) = udf[String, String]{ str => pref + str.replace(" ", "_")}

// xsd:string literal. Backslashes must be escaped before quotes and newlines: escaping the
// newline first and the backslash afterwards turned every line break into a literal "\\n".
val u_string = udf[String, String]{ str =>
    val escaped = str.replace("\\", "\\\\").replace("\"", "\\\"").replace("\n", "\\n")
    "\"" + escaped + "\"" + "^^<http://www.w3.org/2001/XMLSchema#string>"
}
// Typed numeric literals; the text is passed through as is.
val u_double = udf[String, String]{ str => "\"" + str +  "\"" +"^^<http://www.w3.org/2001/XMLSchema#double>" }
val u_integer= udf[String, String]{ str => "\"" + str +  "\"" +"^^<http://www.w3.org/2001/XMLSchema#integer>" }
// Accessions containing "GSE" become GEO IRIs, everything else BioProject IRIs.
val u_exp = udf[String, String]{ str => if(str.contains("GSE")) s"<https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=$str>" else  s"<https://www.ncbi.nlm.nih.gov/bioproject/${str}>"}
// IRI encoders for the remaining namespaces; spaces become underscores.
val u_study = udf[String, String]{  str => s"<https://trace.ncbi.nlm.nih.gov/Traces/sra/?study=${str.replace(" ", "_")}>" }
val u_ens = udf[String, String] { str => s"<http://rdf.ebi.ac.uk/resource/ensembl/${str.replace(" ", "_")}>" }
val u_samples = udf[String, String] { str => s"<http://aging-research.group/samples/${str.replace(" ", "_")}>" }
val u_base = udf[String, String] { str => s"<http://aging-research.group/resource/${str.replace(" ", "_")}>" }
val u_run = udf[String, String] { str => s"<https://www.ncbi.nlm.nih.gov/sra/${str.replace(" ", "_")}>" }

In [15]:
// Renders the sample column names as a bracketed, comma-separated list of quoted strings.
genericArrayOps(samples.columns).toList.map(l=>"\"" + l + "\"").mkString("[",",", "]")

["bioproject","series","run","organism","taxid","sample_name","sequencer","library_strategy","library_layout","library_selection","study","study_title","characteristics","source","age","sex","tumor","protocol","salmon_version","index","genes","transcript

In [80]:
/** Keeps only the rows whose generated id is strictly greater than `n`.
  *
  * A temporary "Index" column is filled with `monotonically_increasing_id`, used for
  * the filter and dropped again.
  * NOTE(review): those ids are increasing but not guaranteed consecutive across
  * partitions, and `> n` also removes the row with id `n` — confirm this matches the
  * intended "skip the first n rows". An existing column named "Index" would presumably
  * be overwritten and dropped as well.
  */
def skip(df: DataFrame, n: Int) = {
  val numbered = df.withColumn("Index",monotonically_increasing_id)
  val remaining = numbered.where($"Index" > n)
  remaining.drop("Index")
}

In [16]:
import org.apache.spark.sql.types.StringType
// Sample table with every exported column already rendered as an RDF term
// (IRI or typed literal), ready to be fed to the quad builders below.
val samples_prefixed = {
    // "modified" is cast to text first; remaining nulls become "N/A" (strings) or 0 (numbers).
    val cleaned = samples.withColumn("modified",$"modified".cast(StringType)).na.fill("N/A").na.fill(0)
    // Column -> encoder, applied in this order; each encoder only reads its own column.
    val encoders = Seq(
      "bioproject" -> u_exp,
      "series" -> u_exp,
      "run" -> u_run,
      "organism" -> u_base,
      "sample_name" -> u_string,
      "sequencer" -> u_samples,
      "library_strategy" -> u_samples,
      "library_layout" -> u_samples,
      "library_selection" -> u_samples,
      "study" -> u_study,
      "study_title" -> u_string,
      "characteristics" -> u_string,
      "source" -> u_string,
      "age" -> u_string,
      "sex" -> u_string,
      "tumor" -> u_samples,
      "protocol" -> u_string,
      "salmon_version" -> u_string,
      // "percent_mapped" -> u_double,  (disabled in the original chain as well)
      "libType" -> u_string,
      "numBootstraps" -> u_integer,
      "modified" -> u_string
    )
    val encoded = encoders.foldLeft(cleaned){ case (df, (name, encode)) =>
      df.withColumn(name, encode(df(name)))
    }
    encoded.select(
      "bioproject","series","run","organism","taxid","sample_name","sequencer","library_strategy","library_layout","library_selection",
      "study","study_title","characteristics","source","age","sex","tumor","protocol",
      "salmon_version", "libType","numBootstraps","modified"
    )
}

samples_prefixed.show(5, 10000)

+-----------------------------------------------------+-----------------------------------------------------+---------------------------------------------+-------------------------------------------------------------+-----+---------------------------------------------------+---------------------------------------------------------+---------------------------------------------+--------------------------------------------+-------------------------------------------+------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

## Quads functions



In [17]:
  /** Expands every row of `dataFrame` into one quad per object column.
    *
    * The subject is `pre(subject_prefix, <subject_column value>)`; for the i-th object
    * column the property is `properties(i)` (used verbatim) and the object is
    * `pre(prefixes(i), <cell value as text>)`. Every quad ends with `<context> .`.
    *
    * Rows with a null subject and individual null cells are skipped instead of
    * throwing a NullPointerException or emitting the text "null".
    *
    * @throws IllegalArgumentException if `properties`, `prefixes` and `object_columns`
    *                                  differ in length
    */
  def toQuads(dataFrame: DataFrame, subject_prefix: String, subject_column: String,
  properties: Seq[String], prefixes: Seq[String], object_columns: Seq[String], context: String = prefix_samples):
  org.apache.spark.sql.Dataset[(String, String, String, String)] = {
    require(properties.length == prefixes.length &&
    prefixes.length == object_columns.length,
    s"not same number of properties(${properties.length}), prefixes(${prefixes.length}) and columns(${object_columns.length})")
    // Row-independent pieces are computed once, outside the per-row closure.
    val graph = "<" + context + ">" + " ."
    val spec = properties.zip(prefixes.zip(object_columns))
    val df = dataFrame.select(subject_column, object_columns:_*)
    df.flatMap{ case row =>
            Option(row.getAs[String](subject_column)).toSeq.flatMap { subject =>
                val sub = pre(subject_prefix, subject)
                spec.flatMap {
                    case (prop, (objPrefix, column)) =>
                        Option(row.get(row.fieldIndex(column)))
                          .map(value => (sub, prop, pre(objPrefix, value.toString), graph))
                }
            }
    }.toDF("subject", "property", "object", "context").as[(String, String, String, String)]

}

In [68]:
/** Variant of `toQuads` that derives the properties from the column names:
  * column `c` is exported under `pre(property_prefix, "has_" + c)`, with the caller
  * supplying one object prefix per column.
  */
def toHasQuads(dataFrame: DataFrame, subject_prefix: String, subject_column: String,
   prefixes: Seq[String], columns: Seq[String], context: String = prefix_samples, property_prefix: String = prefix_samples) =
  toQuads(
    dataFrame, subject_prefix, subject_column,
    properties = for (name <- columns) yield pre(property_prefix, "has_" + name),
    prefixes = prefixes,
    object_columns = columns,
    context = context
  )

In [18]:
/** Variant of `toQuads` where every object column shares the same `object_prefix`
  * and column `c` is exported under `pre(property_prefix, c)`.
  * NOTE(review): unlike `toHasQuads`/`toHasRawQuads`, no "has_" is prepended to the
  * property name despite the method name — confirm this is intended.
  */
def toHasPrefixedQuads(dataFrame: DataFrame, subject_prefix: String, subject_column: String,
columns: Seq[String], context: String = prefix_samples, property_prefix: String = prefix_samples, object_prefix: String = prefix_samples) =
    toQuads(
      dataFrame, subject_prefix, subject_column,
      properties = columns.map(pre(property_prefix, _)),
      prefixes = Seq.fill(columns.length)(object_prefix),
      object_columns = columns,
      context = context
    )

/** Exports every column except `subject_column` as `has_<column>` quads, taking both
  * the subject and the objects verbatim (empty prefixes), i.e. the cells are expected
  * to hold ready-made RDF terms.
  */
def toHasRawQuads(dataFrame: DataFrame, subject_column: String, context: String = prefix_samples, property_prefix: String = prefix_samples) = {
    val objectColumns = genericArrayOps(dataFrame.columns).toList.filter(name => name != subject_column)
    val noPrefixes = List.fill(objectColumns.length)("")
    toQuads(
      dataFrame, "", subject_column,
      properties = for (name <- objectColumns) yield pre(property_prefix, "has_" + name),
      prefixes = noPrefixes,
      object_columns = objectColumns,
      context = context
    )
}

In [19]:
/** One rdfs:domain quad per column: property `has_<column>` has domain `subject`.
  * `subject` is used as is when it already contains `subject_prefix`, otherwise it is
  * expanded with `pre(subject_prefix, subject)`.
  */
def has_domains(subject: String, columns: Seq[String], context: String = prefix_samples, subject_prefix: String = prefix_samples, property_prefix: String = prefix_samples) = {
    val rdfsDomain = "<http://www.w3.org/2000/01/rdf-schema#domain>"
    val graph = "<" + context +">" +" ."
    val domain = if (!subject.contains(subject_prefix)) pre(subject_prefix, subject) else subject
    val rows = for (name <- columns) yield (pre(property_prefix, "has_" + name), rdfsDomain, domain, graph)
    spark.createDataset[(String, String, String, String)](rows)
}
/** Class membership quads: one declaration quad for the class followed by an rdf:type
  * quad for every distinct value of `column` in `df`.
  * `subject` is used as is when it already contains `subject_prefix`, otherwise it is
  * expanded with `pre(subject_prefix, subject)`.
  * NOTE(review): the class is declared via rdfs:subClassOf owl:Class rather than
  * rdf:type — looks unintended; confirm before changing, since it alters exported data.
  */
def of_class(df:  org.apache.spark.sql.Dataset[_], subject: String, column: String = "subject", subject_prefix: String = prefix_samples, context: String = prefix_samples) ={
    val graph = "<" + context +">" +" ."
    val members = df.select(column).distinct.as[String]
    val classIri = if (!subject.contains(subject_prefix)) pre(subject_prefix, subject) else subject
    val declaration = spark.createDataset[(String, String, String, String)](
      Seq((classIri, "<http://www.w3.org/2000/01/rdf-schema#subClassOf>", "<http://www.w3.org/2002/07/owl#Class>", graph))
    )
    val typed = members
      .withColumn("property", lit("<http://www.w3.org/1999/02/22-rdf-syntax-ns#type>"))
      .withColumn("object", lit(classIri))
      .withColumn("context", lit(graph))
      .as[(String, String, String, String)]
    declaration.union(typed)
}

In [20]:
// Bioproject quads: class membership, property domains and the per-row facts.
val biopro_prep = samples_prefixed.select("bioproject", "series", "run", "study")
val biopro = toHasRawQuads(biopro_prep, "bioproject")
val all_biopro = {
  val classQuads = of_class(biopro, "Bioproject")
  val domainQuads = has_domains("Bioproject", genericArrayOps(biopro_prep.columns).toList)
  classQuads.union(domainQuads).union(biopro).toDF("subject", "property", "object", "context")
}
println(all_biopro.count)
all_biopro.show(100,10000)

1982
+-----------------------------------------------------+-------------------------------------------------+------------------------------------------------+----------------------------------------+
|                                              subject|                                         property|                                          object|                                 context|
+-----------------------------------------------------+-------------------------------------------------+------------------------------------------------+----------------------------------------+
|     <http://aging-research.group/samples/Bioproject>|<http://www.w3.org/2000/01/rdf-schema#subClassOf>|           <http://www.w3.org/2002/07/owl#Class>|<http://aging-research.group/samples/> .|
|<https://www.ncbi.nlm.nih.gov/bioproject/PRJNA200320>|<http://www.w3.org/1999/02/22-rdf-syntax-ns#type>|<http://aging-research.group/samples/Bioproject>|<http://aging-research.group/samples/> .|
|<https://www.n

In [21]:
// Experiment-series quads: class membership, property domains and the per-row facts.
val bioseries_prep = samples_prefixed.select("series", "run", "study")
val bioseries = toHasRawQuads(bioseries_prep, "series")
val all_bioseries = {
  val classQuads = of_class(bioseries, "Experiment_series")
  val domainQuads = has_domains("Experiment_series", genericArrayOps(bioseries_prep.columns).toList)
  classQuads.union(domainQuads).union(bioseries).toDF("subject", "property", "object", "context")
}
println(all_bioseries.count())
all_bioseries.show(100,100)

1368
+--------------------------------------------------------------------------------+-------------------------------------------------+-------------------------------------------------------+----------------------------------------+
|                                                                         subject|                                         property|                                                 object|                                 context|
+--------------------------------------------------------------------------------+-------------------------------------------------+-------------------------------------------------------+----------------------------------------+
|                         <http://aging-research.group/samples/Experiment_series>|<http://www.w3.org/2000/01/rdf-schema#subClassOf>|                  <http://www.w3.org/2002/07/owl#Class>|<http://aging-research.group/samples/> .|
|                           <https://www.ncbi.nlm.nih.gov/bioproject/PRJNA2

In [22]:
// Per-run facts: every remaining sample attribute is attached to its sequencing run.
val biorun_prep = samples_prefixed.select("run","organism","sample_name","sequencer","library_strategy","library_layout","library_selection",
      "study","study_title","characteristics","source","age","sex","tumor","protocol",
      "salmon_version", "libType","numBootstraps","modified")
val biorun = toHasRawQuads(biorun_prep, "run")
// Print the count like the sibling cells do; a bare `count()` in the middle of the
// cell ran a full job and silently discarded the result.
println(biorun.count())
biorun.show(10,100)

+--------------------------------------------+-----------------------------------------------------------+----------------------------------------------------------------------------------------------------+----------------------------------------+
|                                     subject|                                                   property|                                                                                              object|                                 context|
+--------------------------------------------+-----------------------------------------------------------+----------------------------------------------------------------------------------------------------+----------------------------------------+
|<https://www.ncbi.nlm.nih.gov/sra/SRR960570>|         <http://aging-research.group/samples/has_organism>|                                       <http://aging-research.group/resource/Zonotrichia_albicollis>|<http://aging-research.group/samples/> .|
|<ht

In [66]:
// Sequencing-run quads: class membership, property domains and the per-run facts.
val all_biorun =  of_class(biorun, "Sequencing_run")
  .union(has_domains("Sequencing_run", genericArrayOps(biorun_prep.columns).toList))
  .union(biorun).toDF("subject", "property", "object", "context")
// println (not print) so the count gets its own line instead of being glued to the table header.
println(all_biorun.count)
all_biorun.show(10)

11728+--------------------+--------------------+--------------------+--------------------+
|             subject|            property|              object|             context|
+--------------------+--------------------+--------------------+--------------------+
|<http://aging-res...|<http://www.w3.or...|<http://www.w3.or...|<http://aging-res...|
|<https://www.ncbi...|<http://www.w3.or...|<http://aging-res...|<http://aging-res...|
|<https://www.ncbi...|<http://www.w3.or...|<http://aging-res...|<http://aging-res...|
|<https://www.ncbi...|<http://www.w3.or...|<http://aging-res...|<http://aging-res...|
|<https://www.ncbi...|<http://www.w3.or...|<http://aging-res...|<http://aging-res...|
|<https://www.ncbi...|<http://www.w3.or...|<http://aging-res...|<http://aging-res...|
|<https://www.ncbi...|<http://www.w3.or...|<http://aging-res...|<http://aging-res...|
|<https://www.ncbi...|<http://www.w3.or...|<http://aging-res...|<http://aging-res...|
|<https://www.ncbi...|<http://www.w3.or...|<http:

In [23]:
// All sample-related quads in one frame: bioprojects, then series, then runs.
val all_samples = Seq(all_biopro, all_bioseries, all_biorun)
  .reduceLeft(_ union _)
  .as[(String, String, String, String)]
  .toDF("subject", "property", "object", "context")
println(all_samples.count())
all_samples.show(10)

15078
+--------------------+--------------------+--------------------+--------------------+
|             subject|            property|              object|             context|
+--------------------+--------------------+--------------------+--------------------+
|<http://aging-res...|<http://www.w3.or...|<http://www.w3.or...|<http://aging-res...|
|<https://www.ncbi...|<http://www.w3.or...|<http://aging-res...|<http://aging-res...|
|<https://www.ncbi...|<http://www.w3.or...|<http://aging-res...|<http://aging-res...|
|<https://www.ncbi...|<http://www.w3.or...|<http://aging-res...|<http://aging-res...|
|<https://www.ncbi...|<http://www.w3.or...|<http://aging-res...|<http://aging-res...|
|<https://www.ncbi...|<http://www.w3.or...|<http://aging-res...|<http://aging-res...|
|<https://www.ncbi...|<http://www.w3.or...|<http://aging-res...|<http://aging-res...|
|<https://www.ncbi...|<http://www.w3.or...|<http://aging-res...|<http://aging-res...|
|<https://www.ncbi...|<http://www.w3.or...|<http

In [24]:
// Writes the sample quads to the GraphDB import folder without a header row.
// `writeTSV` is not a stock Spark method (presumably from the imported extensions);
// `rewrite = true` presumably overwrites an existing file — confirm.
all_samples.toDF.writeTSV("/data/databases/graphdb/import/samples.nq", header = false, rewrite = true)

parts of /data/databases/graphdb/import/samples.nq merged!


/data/databases/graphdb/import/samples.nq

# **Gene expressions to quads**<br>



In [26]:
// Run accession -> expression table (gene, TPM), read from the path in the "genes" column.
// Built with a strict `map` instead of `mapValues`: the latter returns a lazy view that
// would call `spark.readTSV` again every time an entry is accessed or the map is traversed.
val runMap = genericArrayOps(samples.select("run","genes").as[(String, String)].collect)
  .toMap
  .map { case (run, path) => run -> spark.readTSV(path).toDF("gene", "TPM").as[(String, Double)] }
runMap

In [27]:
// Take one (run, expression table) entry as a sample; `r` and `e` are reused by a later cell.
// `head` throws on an empty map, so this assumes at least one run was loaded.
val (r, e) = runMap.head
println(r)
// Preview 10 rows of that run's expression table.
e.show(10,1000)

SRR2040645
+------------------+-----------+
|              gene|        TPM|
+------------------+-----------+
|ENSCCEG00000000007|  15.003977|
|ENSCCEG00000000008|  75.039963|
|ENSCCEG00000000009|   0.203121|
|ENSCCEG00000000012|  77.176479|
|ENSCCEG00000000013|   0.771706|
|ENSCCEG00000000014|   0.038792|
|ENSCCEG00000000015|  72.543704|
|ENSCCEG00000000018|1519.296913|
|ENSCCEG00000000019|   3.455105|
|ENSCCEG00000000020| 100.753391|
+------------------+-----------+
only showing top 10 rows



In [28]:
import org.apache.spark.sql._

// Context field shared by all expression quads: the samples graph plus the terminating " .".
val cont = s"<${prefix_samples}> ."
// IRI in the samples namespace; the text is appended verbatim (spaces are not replaced).
def sam(str: String) = "<" + prefix_samples + str + ">"
// xsd:double typed literal for `n`, using the default Double text form.
def num(n: Double) = Seq("\"", n.toString, "\"", "^^<http://www.w3.org/2001/XMLSchema#double>").mkString
    
/** Turns one run's (gene, TPM) table into four quads per gene:
  *  - the gene IRI is typed as Gene;
  *  - the per-gene expression IRI is typed as Gene_expression;
  *  - the expression IRI is linked to the gene via expression_of;
  *  - the run IRI carries the TPM value, with the expression IRI as predicate.
  * The expression IRI depends on the gene only (not on the run), so it is shared by
  * all runs and serves both as a subject and as a predicate.
  */
def expression_quads(run: String, df: Dataset[(String, Double)]): Dataset[(String, String, String, String)] = {
    val rdfType = "<http://www.w3.org/1999/02/22-rdf-syntax-ns#type>"
    val runIri = s"<https://www.ncbi.nlm.nih.gov/sra/${run}>"
    val quads = df.flatMap{ case (gene, tpm) =>
        val geneIri = ens(gene)
        val expressionIri = sam(s"has_${gene}_expression")
        List(
            (geneIri, rdfType, "<http://aging-research.group/resource/Gene>", cont),
            (expressionIri, rdfType, "<http://aging-research.group/resource/Gene_expression>", cont),
            (expressionIri, "<http://aging-research.group/resource/expression_of>", geneIri, cont),
            (runIri, expressionIri, num(tpm), cont)
        )
    }
    quads.toDF("subject", "property", "object", "context").as[(String, String, String, String)]
}

In [29]:
// Expression quads of every run, unioned into a single dataset
// (the reduction throws if no run was loaded).
val all_expressions = {
  val perRun = for ((run, df) <- runMap) yield expression_quads(run, df)
  perRun.reduce((left, right) => left.union(right))
}
all_expressions.show(100,10000)

+-----------------------------------------------------------------------+-----------------------------------------------------------------------+----------------------------------------------------------+----------------------------------------+
|                                                                subject|                                                               property|                                                    object|                                 context|
+-----------------------------------------------------------------------+-----------------------------------------------------------------------+----------------------------------------------------------+----------------------------------------+
|             <http://rdf.ebi.ac.uk/resource/ensembl/ENSCCEG00000000007>|                      <http://www.w3.org/1999/02/22-rdf-syntax-ns#type>|               <http://aging-research.group/resource/Gene>|<http://aging-research.group/samples/> .|
|<http://aging-r

In [30]:
// Writes the expression quads to the GraphDB import folder without a header row
// (`writeTSV` is presumably provided by the imported extensions; `rewrite = true`
// presumably overwrites an existing file — confirm).
all_expressions.toDF.writeTSV("/data/databases/graphdb/import/all_expressions.nq", header = false, rewrite = true)

parts of /data/databases/graphdb/import/all_expressions.nq merged!


/data/databases/graphdb/import/all_expressions.nq

In [31]:
// Spot check: quads generated for the single run `r` / table `e` picked earlier.
expression_quads(r, e).show(200,1000)

+-----------------------------------------------------------------------+-----------------------------------------------------------------------+----------------------------------------------------------+----------------------------------------+
|                                                                subject|                                                               property|                                                    object|                                 context|
+-----------------------------------------------------------------------+-----------------------------------------------------------------------+----------------------------------------------------------+----------------------------------------+
|             <http://rdf.ebi.ac.uk/resource/ensembl/ENSBTAG00000000005>|                      <http://www.w3.org/1999/02/22-rdf-syntax-ns#type>|               <http://aging-research.group/resource/Gene>|<http://aging-research.group/samples/> .|
|<http://aging-r

# **Genes**<br>




In [33]:
// Compara gene_member dump; the file's columns are named positionally here.
val genes = {
    val geneColumns = Seq(
        "gene_member_id", "stable_id", "version", "source_name",
        "taxon_id", "genome_db_id", "biotype_group", "canonical_member_id",
        "description", "dnafrag_id", "dnafrag_start",
        "dnafrag_end", "dnafrag_strand", "display_label"
    )
    spark.readTSV("/data/ensembl/99/compara/dump/gene_member.txt").toDF(geneColumns: _*)
}
genes.show(50)

+--------------+------------------+-------+-----------+--------+------------+-------------+-------------------+--------------------+----------+-------------+-----------+--------------+-------------+
|gene_member_id|         stable_id|version|source_name|taxon_id|genome_db_id|biotype_group|canonical_member_id|         description|dnafrag_id|dnafrag_start|dnafrag_end|dnafrag_strand|display_label|
+--------------+------------------+-------+-----------+--------+------------+-------------+-------------------+--------------------+----------+-------------+-----------+--------------+-------------+
|             1|ENSTNIG00000004376|      1|ENSEMBLGENE|   99883|          65|       coding|                  1|NADH-ubiquinone o...|   4514312|         2838|       3812|             1|       mt-nd1|
|             2|ENSTNIG00000004377|      1|ENSEMBLGENE|   99883|          65|       coding|                  2|NADH-ubiquinone o...|   4514312|         4025|       5070|             1|       mt-nd2|
|    

## tx2gene




In [35]:
// Compara seq_member dump; the file's columns are named positionally here.
val sequence_members = {
    val memberColumns = Seq(
        "seq_member_id", "stable_id", "version", "source_name", "taxon_id",
        "genome_db_id", "sequence_id", "gene_member_id", "has_transcript_edits", "has_translation_edits",
        "description", "dnafrag_id", "dnafrag_start", "dnafrag_end", "dnafrag_strand", "display_label"
    )
    spark.readTSV("/data/ensembl/99/compara/dump/seq_member.txt").toDF(memberColumns: _*)
}
sequence_members.show(50,1000)

+-------------+------------------+-------+-----------+--------+------------+-----------+--------------+--------------------+---------------------+-----------+----------+-------------+-----------+--------------+-------------+
|seq_member_id|         stable_id|version|source_name|taxon_id|genome_db_id|sequence_id|gene_member_id|has_transcript_edits|has_translation_edits|description|dnafrag_id|dnafrag_start|dnafrag_end|dnafrag_strand|display_label|
+-------------+------------------+-------+-----------+--------+------------+-----------+--------------+--------------------+---------------------+-----------+----------+-------------+-----------+--------------+-------------+
|            1|ENSTNIP00000007010|      1| ENSEMBLPEP|   99883|          65|          1|             1|                   0|                    0|         \N|   4514312|         2838|       3812|             1|   mt-nd1-201|
|            2|ENSTNIP00000007011|      1| ENSEMBLPEP|   99883|          65|          2|            

In [36]:
// Distinct (stable_id, version) pairs vs. distinct stable_ids; equal counts mean that
// no stable_id occurs with more than one version.
(sequence_members.select("stable_id", "version").distinct.count(),sequence_members.select("stable_id").distinct.count())

(37167718,37167718)

In [37]:
// Distinct sequence members for taxon 9606. The cell used to select a "gene" column,
// which `sequence_members` (as defined above) does not have; "stable_id" is the member
// identifier — NOTE(review): confirm this is the column the recorded count referred to.
sequence_members.where($"taxon_id" === 9606).select("stable_id").distinct.count()

320377

In [38]:
// Sequence-member columns needed for the transcript/protein -> gene mapping.
val seq_mem = sequence_members.select(
    "gene_member_id", "stable_id", "source_name",
    "has_transcript_edits", "has_translation_edits", "display_label"
)
// Each gene (its stable_id exposed as "gene") joined with its sequence members
// on gene_member_id.
val tx2gene = genes
    .withColumnRenamed("stable_id", "gene")
    .select("gene", "taxon_id", "gene_member_id")
    .join(seq_mem, "gene_member_id")
tx2gene.show(200,1000)


+--------------+-------------------+--------+-------------------+-----------+--------------------+---------------------+--------------------+
|gene_member_id|               gene|taxon_id|          stable_id|source_name|has_transcript_edits|has_translation_edits|       display_label|
+--------------+-------------------+--------+-------------------+-----------+--------------------+---------------------+--------------------+
|           148| ENSTNIG00000005907|   99883| ENSTNIP00000002825| ENSEMBLPEP|                   0|                    0|          ints6l-201|
|           148| ENSTNIG00000005907|   99883| ENSTNIP00000008634| ENSEMBLPEP|                   0|                    0|          ints6l-202|
|           148| ENSTNIG00000005907|   99883| ENSTNIP00000001015| ENSEMBLPEP|                   0|                    0|          ints6l-203|
|           471| ENSPPYG00000017450|    9601| ENSPPYP00000019561| ENSEMBLPEP|                   0|                    0|           MYL10-201|
|     

In [39]:
// Number of distinct genes for taxon 9606 in the joined mapping.
tx2gene.where($"taxon_id" ===9606).select("gene").distinct.count()

67996

In [40]:
// Number of distinct sequence-member stable_ids in the joined mapping.
tx2gene.select("stable_id").distinct.count()

12159873

In [41]:
// Lists the distinct member sources present in the mapping.
tx2gene.select("source_name").distinct().show(1000)

+------------+
| source_name|
+------------+
|ENSEMBLTRANS|
|  ENSEMBLPEP|
+------------+



In [43]:
// Gene lookup table read from parquet and typed as an 8-field tuple
// (the first field is a nullable Integer, the rest are strings).
val genes_n = spark.read.parquet("/data/ensembl/99/website/genes_lookup.parquet").as[(java.lang.Integer,String, String, String, String, String, String, String)]
// Preview 20 rows, truncating cells only beyond 1000 characters.
genes_n.show(20,1000)

+--------+------------------+-------------+-------------+------------------------------------------------------------------------------------+---------------+--------------------+-----+
|taxon_id|         stable_id|display_label|biotype_group|                                                                         description|scientific_name|         common_name|class|
+--------+------------------+-------------+-------------+------------------------------------------------------------------------------------+---------------+--------------------+-----+
|  211598|ENSANIG00000000002|        SSTR4|       coding|                         somatostatin receptor 4 [Source:HGNC Symbol;Acc:HGNC:11333]|Accipiter nisus|Eurasian sparrowhawk| Aves|
|  211598|ENSANIG00000000003|           \N|       coding|                                                                                  \N|Accipiter nisus|Eurasian sparrowhawk| Aves|
|  211598|ENSANIG00000000004|         CD93|       coding|             

In [44]:
// UDF: maps a null value or the literal two-character marker `\N` to the empty
// string and leaves every other value unchanged.
val not_null = udf[String, String](str => if (str == null || str == "\\N") "" else str)
// UDF: replaces every space with an underscore. A null input is passed through
// as null instead of throwing a NullPointerException inside the executor.
val underscored = udf[String, String](str => if (str == null) null else str.replace(" ", "_"))

In [45]:
// Build the transcript -> gene table used by the website:
//   gene, transcript (renamed from stable_id), display_label, scientific_name.
// scientific_name comes from an inner join with genes_n on the gene id and has
// its spaces replaced by underscores; null / "\N" labels become empty strings.
// The former trailing `.withColumnRenamed("label", "display_label")` was removed:
// no column named "label" exists at that point, so Spark treated it as a no-op.
val tx2gene_n = tx2gene /*.where($"source_name" === "ENSEMBLTRANS")*/
  .select($"gene", $"stable_id".as("transcript"), $"display_label")
  .join(genes_n.select($"stable_id".as("gene"), $"scientific_name"), "gene")
  .withColumn("scientific_name", underscored($"scientific_name"))
  .withColumn("display_label", not_null($"display_label"))
tx2gene_n.show(100, 10000)

+------------------+------------------+-------------+--------------------+
|              gene|        transcript|display_label|     scientific_name|
+------------------+------------------+-------------+--------------------+
|ENSABRG00000000073|ENSABRP00000000057|     SPG7-201|Anser_brachyrhynchus|
|ENSABRG00000000092|ENSABRP00000000085|    CPNE7-201|Anser_brachyrhynchus|
|ENSABRG00000000381|ENSABRP00000000341|             |Anser_brachyrhynchus|
|ENSABRG00000000571|ENSABRT00000000770|             |Anser_brachyrhynchus|
|ENSABRG00000000980|ENSABRP00000000919|             |Anser_brachyrhynchus|
|ENSABRG00000001204|ENSABRP00000001162|   RNF121-201|Anser_brachyrhynchus|
|ENSABRG00000001692|ENSABRT00000002456|             |Anser_brachyrhynchus|
|ENSABRG00000002062|ENSABRP00000002057|             |Anser_brachyrhynchus|
|ENSABRG00000002368|ENSABRP00000002427|     GLG1-201|Anser_brachyrhynchus|
|ENSABRG00000002482|ENSABRT00000003692|             |Anser_brachyrhynchus|
|ENSABRG00000002512|ENSAB

In [46]:
// Persist tx2gene_n to /data/ensembl/99/website/transcripts.parquet.
// NOTE(review): writeParquet is not a stock Spark method — presumably it comes from the
// imported group.research.aging.spark.extensions; the meaning of the second argument
// (`true`) is not visible here (possibly an overwrite flag) — confirm against its signature.
tx2gene_n.writeParquet("/data/ensembl/99/website/transcripts.parquet", true)

parts of /data/ensembl/99/website/transcripts.parquet merged!


/data/ensembl/99/website/transcripts.parquet

In [47]:
// Rebind tx2gene_n to the contents of transcripts.parquet on disk and preview 100 rows.
val tx2gene_n = spark.read
  .parquet("/data/ensembl/99/website/transcripts.parquet")
tx2gene_n.show(numRows = 100, truncate = 1000)

+------------------+------------------+-------------+--------------------+
|              gene|        transcript|display_label|     scientific_name|
+------------------+------------------+-------------+--------------------+
|ENSABRG00000000073|ENSABRP00000000057|     SPG7-201|Anser_brachyrhynchus|
|ENSABRG00000000092|ENSABRP00000000085|    CPNE7-201|Anser_brachyrhynchus|
|ENSABRG00000000381|ENSABRP00000000341|             |Anser_brachyrhynchus|
|ENSABRG00000000571|ENSABRT00000000770|             |Anser_brachyrhynchus|
|ENSABRG00000000980|ENSABRP00000000919|             |Anser_brachyrhynchus|
|ENSABRG00000001204|ENSABRP00000001162|   RNF121-201|Anser_brachyrhynchus|
|ENSABRG00000001692|ENSABRT00000002456|             |Anser_brachyrhynchus|
|ENSABRG00000002062|ENSABRP00000002057|             |Anser_brachyrhynchus|
|ENSABRG00000002368|ENSABRP00000002427|     GLG1-201|Anser_brachyrhynchus|
|ENSABRG00000002482|ENSABRT00000003692|             |Anser_brachyrhynchus|
|ENSABRG00000002512|ENSAB

In [48]:
// Sanity check: the number of distinct gene ids for taxon 9606 / "Homo_sapiens"
// is computed from three tables (genes, genes_n, tx2gene_n) and shown side by side.
val human_count = genes
  .where($"taxon_id" === 9606)
  .select("stable_id")
  .distinct
  .count()
val human_count_n = genes_n
  .where($"taxon_id" === 9606)
  .select("stable_id")
  .distinct
  .count()
val human_count_tx2gene_n = tx2gene_n
  .where($"scientific_name" === "Homo_sapiens")
  .select("gene")
  .distinct
  .count()
(human_count, human_count_n, human_count_tx2gene_n)

(67996,67996,67996)

In [50]:
// Load the release-97 Homo_sapiens tx2gene TSV and name its two columns
// "transcript" and "gene"; then preview it.
val human_old = spark
  .readTSV("/data/ensembl/97/tx2gene/Homo_sapiens.tsv")
  .toDF("transcript", "gene")
human_old.show()

+---------------+---------------+
|     transcript|           gene|
+---------------+---------------+
|ENST00000387314|ENSG00000210049|
|ENST00000389680|ENSG00000211459|
|ENST00000387342|ENSG00000210077|
|ENST00000387347|ENSG00000210082|
|ENST00000386347|ENSG00000209082|
|ENST00000361390|ENSG00000198888|
|ENST00000387365|ENSG00000210100|
|ENST00000387372|ENSG00000210107|
|ENST00000387377|ENSG00000210112|
|ENST00000361453|ENSG00000198763|
|ENST00000387382|ENSG00000210117|
|ENST00000387392|ENSG00000210127|
|ENST00000387400|ENSG00000210135|
|ENST00000387405|ENSG00000210140|
|ENST00000387409|ENSG00000210144|
|ENST00000361624|ENSG00000198804|
|ENST00000387416|ENSG00000210151|
|ENST00000387419|ENSG00000210154|
|ENST00000361739|ENSG00000198712|
|ENST00000387421|ENSG00000210156|
+---------------+---------------+
only showing top 20 rows



In [51]:
// Transcripts listed in human_old that have no matching stable_id in tx2gene
// (left anti join on "transcript").
val tx = {
  val currentTranscripts = tx2gene.select($"stable_id".as("transcript")).distinct
  human_old
    .select("transcript")
    .join(currentTranscripts, Seq("transcript"), "leftanti")
}
tx.show(numRows = 200, truncate = 1000)

+---------------+
|     transcript|
+---------------+
|ENST00000057513|
|ENST00000201647|
|ENST00000216039|
|ENST00000216489|
|ENST00000230381|
|ENST00000245618|
|ENST00000255613|
|ENST00000257264|
|ENST00000257857|
|ENST00000257863|
|ENST00000262817|
|ENST00000268042|
|ENST00000272223|
|ENST00000279249|
|ENST00000279804|
|ENST00000280467|
|ENST00000283122|
|ENST00000283243|
|ENST00000283635|
|ENST00000285599|
|ENST00000290015|
|ENST00000291503|
|ENST00000293502|
|ENST00000295899|
|ENST00000295962|
|ENST00000296277|
|ENST00000297504|
|ENST00000298492|
|ENST00000299106|
|ENST00000300134|
|ENST00000300458|
|ENST00000300658|
|ENST00000301295|
|ENST00000302548|
|ENST00000302622|
|ENST00000304081|
|ENST00000305476|
|ENST00000308167|
|ENST00000308987|
|ENST00000309519|
|ENST00000311170|
|ENST00000311575|
|ENST00000311875|
|ENST00000313050|
|ENST00000314340|
|ENST00000315170|
|ENST00000316853|
|ENST00000317296|
|ENST00000317538|
|ENST00000317615|
|ENST00000317749|
|ENST00000319555|
|ENST00000

In [52]:
// (transcript, gene) pairs of tx2gene whose transcript id does not occur in human_old
// (left anti join on "transcript").
// NOTE(review): tx2gene is not restricted to a single taxon here, so rows of every
// other species are returned as well — confirm whether a human-only filter was intended.
val tx2 = {
  val currentPairs = tx2gene.select($"stable_id".as("transcript"), $"gene").distinct
  val oldPairs = human_old.select("transcript", "gene")
  currentPairs.join(oldPairs, Seq("transcript"), "leftanti")
}
tx2.show()

+-------------------+-------------------+
|         transcript|               gene|
+-------------------+-------------------+
|ENSCSAVP00000011994|ENSCSAVG00000007046|
| ENSLACP00000009729| ENSLACG00000008578|
| ENSPMAP00000007844| ENSPMAG00000007123|
| ENSPFOP00000030165| ENSPFOG00000014066|
| ENSLACP00000022658| ENSLACG00000013531|
| ENSOGAP00000003412| ENSOGAG00000003835|
| ENSGACP00000014131| ENSGACG00000010673|
| ENSCSAP00000014597| ENSCSAG00000003454|
| ENSACAP00000022078| ENSACAG00000028460|
| ENSPSIP00000014553| ENSPSIG00000012901|
| ENSACAP00000023323| ENSACAG00000028473|
| ENSOPRP00000013334| ENSOPRG00000014613|
| ENSOARP00000018461| ENSOARG00000017189|
| ENSOANP00000014034| ENSOANG00000008805|
| ENSEEUP00000010269| ENSEEUG00000011243|
| ENSEEUP00000009234| ENSEEUG00000010098|
| ENSMEUP00000013073| ENSMEUG00000014319|
| ENSSART00000016069| ENSSARG00000016064|
| ENSHGLT00100039399| ENSHGLG00100030806|
| ENSCAPT00000004486| ENSCAPG00000004065|
+-------------------+-------------

In [53]:
// Inner-join the current (transcript, gene) pairs with the old human pairs on "gene",
// then print the distinct gene count of human_old followed by that of the join result.
// Both sides contribute a "transcript" column, so the result carries that name twice.
val j = {
  val currentPairs = tx2gene.select($"stable_id".as("transcript"), $"gene").distinct
  val oldPairs = human_old.select("transcript", "gene")
  currentPairs.join(oldPairs, Seq("gene"))
}
Seq(human_old, j).foreach { df =>
  println(df.select("gene").distinct.count())
}
j

66832
66663


[gene: string, transcript: string ... 1 more field]

In [54]:
(human_old.select("gene").distinct().count(), tx2gene.

In [55]:
// Show up to 200 species rows ordered by scientific_name.
species_n
  .orderBy($"scientific_name")
  .show(numRows = 200, truncate = 1000)

+--------+--------------------------+------------------------------+--------------------------------------------------+------------------+--------+--------+--------------+-------------+---------------+-----------+------------+
|taxon_id|           scientific_name|                   common_name|                                               url|             class|lifespan|  mass_g|metabolic_rate|temperature_k|specimen_origin|sample_size|data_quality|
+--------+--------------------------+------------------------------+--------------------------------------------------+------------------+--------+--------+--------------+-------------+---------------+-----------+------------+
|  211598|           Accipiter nisus|          Eurasian sparrowhawk|           https://www.ensembl.org/Accipiter_nisus|              Aves|    20.2|   135.0|        0.9516|         null|           wild|     medium|  acceptable|
|    9646|    Ailuropoda melanoleuca|                   Giant panda|    https://www.ensembl.

In [56]:
import ammonite.ops._
// Remove every entry directly under /data/ensembl/99/tx2gene whose name contains
// "scientific_name=". WARNING: this deletes data on disk.
val t = Path("/data/ensembl/99/tx2gene")
// java.io.File.listFiles() returns null (not an empty array) when the path does not
// exist or is not a directory; treat that case as "nothing to delete" instead of
// failing with a NullPointerException.
val dirs = Option(t.toIO.listFiles())
  .map(_.toList)
  .getOrElse(Nil)
  .filter(_.getName.contains("scientific_name="))
dirs.foreach(f => ammonite.ops.rm(Path(f.getAbsolutePath)))
// Disabled alternative kept for reference: rename the .csv files inside each entry to <name>.tsv.
//.map(f=>f.listFiles().toList.filter(_.getName.endsWith(".csv")).foreach(ff=>ff.renameTo(Path(f.getAbsolutePath.replace("scientific_name=","")+".tsv").toIO)))

## Producing genes n-quads




In [58]:
// Load the gene lookup table as a typed Dataset of 8-field tuples
// (one Integer followed by seven Strings) and preview the first 20 rows.
val genes_n = spark.read
  .parquet("/data/ensembl/99/website/genes_lookup.parquet")
  .as[(java.lang.Integer, String, String, String, String, String, String, String)]
genes_n.show(numRows = 20, truncate = 1000)

In [59]:
/** Renders `s` as a double-quoted N-Quads string literal.
  *
  * Characters that may not appear raw inside a quoted literal are written as
  * escape sequences: backslash, double quote, line feed and carriage return.
  * Tabs are escaped as well. The backslash is handled first so that the
  * escapes added afterwards are not escaped a second time.
  *
  * @param s the raw text; may be null
  * @return the quoted, escaped literal, or the empty string when `s` is null
  */
def str2(s: String): String =
  if (s == null) ""
  else {
    val escaped = s
      .replace("\\", "\\\\")
      .replace("\"", "\\\"")
      .replace("\n", "\\n")
      .replace("\r", "\\r")
      .replace("\t", "\\t")
    "\"" + escaped + "\""
  }

In [60]:
// For every gene row, emit the statements produced by sp(...) and trip(...) for:
//   species --has_gene--> gene, gene label, gene description and gene biotype.
// The flattened result is exposed as a DataFrame with the four columns
// subject / property / object / context, and the first 40 rows are previewed.
// NOTE(review): "rdf-schema#description" is not a term defined by RDF Schema
// (rdfs:comment is) — confirm the predicate is intentional.
val genes_trs = genes_n.flatMap {
  case (taxon_id, stable_id, display_label, biotype_group, description, scientific_name, common_name, cl) =>
    val gene = ens(stable_id)
    val rdfsLabel = "<http://www.w3.org/2000/01/rdf-schema#label>"
    val rdfsDescription = "<http://www.w3.org/2000/01/rdf-schema#description>"

    val speciesToGene = sp(taxon_id, scientific_name)("has_gene", gene)
    val label = trip(gene, rdfsLabel, str2(display_label))
    val descr = trip(gene, rdfsDescription, str2(description))
    val biotype = trip(gene, ens("has_biotype"), ens("biotype/" + biotype_group))

    speciesToGene ++ label ++ descr ++ biotype ++ Nil
}.toDF("subject", "property", "object", "context")
genes_trs.show(40, 1000)

+----------------------------------------------------------+---------------------------------------------------+------------------------------------------------------------------------------------+---------------------------------------------------------+
|                                                   subject|                                           property|                                                                              object|                                                  context|
+----------------------------------------------------------+---------------------------------------------------+------------------------------------------------------------------------------------+---------------------------------------------------------+
|      <http://rdf.ebi.ac.uk/resource/ensembl/taxon#211598>|    <http://aging-research.group/resource/has_gene>|                          <http://rdf.ebi.ac.uk/resource/ensembl/ENSANIG00000000002>|<http://rdf.ebi.ac.uk/resource/ense

In [61]:

// Write the four-column genes_trs table, without a header row, to the GraphDB import folder.
// NOTE(review): writeTSV is not a stock Spark method — presumably it comes from the imported
// group.research.aging.spark.extensions; how it quotes or escapes field values is not visible
// here — confirm the output is still valid N-Quads.
genes_trs.writeTSV("/data/databases/graphdb/import/genes_lookup.nq", header = false)

parts of /data/databases/graphdb/import/genes_lookup.nt merged!


/data/databases/graphdb/import/genes_lookup.nt