# N-Triples


Saving the species lookup as N-Triples<br>




In [1]:
import org.apache.spark.rdd._
import org.apache.spark.sql._
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.functions._
import group.research.aging.spark.extensions._
import group.research.aging.spark.extensions.functions._
import group.research.aging.spark.extensions.functions.ConcatenateString
import group.research.aging.spark.extensions.functions.Concatenate
import ammonite.ops._
import ammonite.ops.ImplicitWd._

In [2]:
// Load the species lookup table as a typed Dataset. `na.fill("")` replaces null values in the
// string columns with "", so the string columns of the tuple below are never null.
val species_n = spark.read
  .parquet("/data/ensembl/99/website/species_lookup.parquet")
  .na.fill("")
  .as[(
    java.lang.Integer, String, String, String, String,
    java.lang.Double, java.lang.Double, java.lang.Double, java.lang.Double,
    String, String, String
  )]
// Preview the first rows; cells are only truncated beyond 10000 characters.
species_n.show(numRows = 10, truncate = 10000)

+--------+------------------------+--------------------+------------------------------------------------+-----+--------+------+--------------+-------------+---------------+-----------+------------+
|taxon_id|         scientific_name|         common_name|                                             url|class|lifespan|mass_g|metabolic_rate|temperature_k|specimen_origin|sample_size|data_quality|
+--------+------------------------+--------------------+------------------------------------------------+-----+--------+------+--------------+-------------+---------------+-----------+------------+
| 2489341|     Strigops habroptila|              Kakapo|     https://www.ensembl.org/Strigops_habroptila| Aves|    60.0|  null|          null|         null|      captivity|      small|  acceptable|
|  132585|    Anser brachyrhynchus|   Pink-footed goose|    https://www.ensembl.org/Anser_brachyrhynchus| Aves|    40.9|  null|          null|         null|           wild|     medium|  acceptable|
|    9031|

In [3]:
// Base IRIs used when minting resources: the Ensembl RDF namespace and this project's namespace.
val prefix_ens = "http://rdf.ebi.ac.uk/resource/ensembl/"
val prefix = "http://aging-research.group/resource/"

/** IRI term `<prefix_ens + str>` with every space in `str` replaced by an underscore. */
def ens(str: String) = s"<${prefix_ens}${str.replace(' ', '_')}>"

/** IRI term for a taxon: `taxon#` followed by `str`, under the Ensembl namespace. */
def tax(str: String) = ens(s"taxon#${str}")

/** IRI term `<prefix + str>` with every space in `str` replaced by an underscore. */
def pref(str: String) = s"<${prefix}${str.replace(' ', '_')}>"
/** Renders `s` as a double-quoted literal for the object position of a statement.
  *
  * Backslash, double quote, line feed and carriage return are escaped (backslash first, so the
  * escapes added afterwards are not escaped again); these are the characters that may not appear
  * raw inside a quoted N-Triples literal.
  *
  * @param s raw text; may be null
  * @return "" for null; the unescaped literal `"\N"` for the `\N` null marker, so that filters
  *         looking for exactly that text still recognise it; otherwise the escaped, quoted text
  */
def str(s: String): String =
  if (s == null) ""
  else if (s == "\\N") "\"\\N\""
  else {
    val escaped = s
      .replace("\\", "\\\\")
      .replace("\"", "\\\"")
      .replace("\n", "\\n")
      .replace("\r", "\\r")
    "\"" + escaped + "\""
  }

In [4]:
/** Builds zero or one statement row `(subject, predicate, object, context)`.
  *
  * The row is dropped (Nil) when any term is null or empty, when the object is `\N` or starts
  * with the quoted literal `"\N"`, or when the subject is `\N` or exactly `"\N"`. Otherwise a
  * single row is returned whose fourth element is the fixed confidence IRI followed by " .".
  */
def trip(s: String, p: String, o: String):List[(String, String, String, String)] = {
  val incomplete = List(s, p, o).exists(term => term == null || term.isEmpty)
  // Evaluated only after the null check above has passed, so `o.startsWith` is safe.
  def nullMarker = o == "\\N" || o.startsWith("\"\\N\"") || s == "\\N" || s == "\"\\N\""
  if (incomplete || nullMarker) Nil
  else List((s, p, o, "<http://rdf.ebi.ac.uk/resource/ensembl/confidence/high> ."))
}
/** Statements attaching property `p` with value `o` to a species, once with the taxon IRI as
  * subject and once with the scientific-name IRI as subject.
  *
  * @param taxon_id        taxon identifier; when null nothing is produced
  * @param scientific_name species name used for the second subject; when it is null, empty or the
  *                        `\N` null marker, only the taxon-keyed statement is produced
  * @param p               local name of the property, expanded with `pref`
  * @param o               value, rendered with `toString`; when null nothing is produced
  * @return the rows that `trip` accepts for the two subjects
  */
def sp(taxon_id: java.lang.Integer, scientific_name: String)(p: String, o: Any) = if(taxon_id==null || o == null) Nil else {
      val predicate = pref(p)
      val value = o.toString
      // Without this guard a null name throws inside pref, and an empty name produces a statement
      // whose subject is the bare prefix IRI.
      val nameMissing = scientific_name == null || scientific_name.isEmpty || scientific_name == "\\N"
      val byName = if (nameMissing) Nil else trip(pref(scientific_name), predicate, value)
      trip(tax(taxon_id.toString), predicate, value) ++ byName
}

In [8]:
// Species statements: links each taxon IRI and scientific-name IRI to one another and attaches
// common name, lifespan, temperature, mass, metabolic rate, class and Ensembl URL through `sp`.
// specimen_origin, sample_size and data_quality are read but not exported.
val trs = species_n.flatMap{
    case (taxon_id,scientific_name,common_name,url,animal_class,
    lifespan,mass_g,metabolic_rate,temperature_k,
    specimen_origin,sample_size,data_quality) =>
        // na.fill("") only replaces null strings: taxon_id is a boxed Integer and may still be
        // null, in which case taxon_id.toString would throw. Such rows produce no statements.
        if (taxon_id == null) List.empty[(String, String, String, String)] else {
            // na.fill("") turns missing strings into "", which would otherwise be exported as
            // "" literals, "<>" or bare-prefix IRIs. Passing "" as the term makes trip drop it.
            def blank(v: String) = v == null || v.isEmpty
            val taxon = tax(taxon_id.toString)
            val name = if (blank(scientific_name)) "" else pref(scientific_name)
            trip(taxon, pref("has_scientific_name"), name)++
            trip(name, pref("has_taxon"), taxon)++
            sp(taxon_id, scientific_name)("has_common_name", if (blank(common_name)) "" else str(common_name))++
            sp(taxon_id, scientific_name)("has_lifespan", lifespan)++
            sp(taxon_id, scientific_name)("has_temperature_kelvin", temperature_k)++
            sp(taxon_id, scientific_name)("has_mass_g", mass_g) ++
            sp(taxon_id, scientific_name)("has_metabolic_rate", metabolic_rate) ++
            sp(taxon_id, scientific_name)("is_animal_class", if (blank(animal_class)) "" else ens(animal_class)) ++
            sp(taxon_id, scientific_name)("has_ensembl_url", if (blank(url)) "" else "<"+url+">") ++
            Nil
        }
}.toDF("subject", "property", "object", "context")
trs.show(100,1000)

+--------------------------------------------------------------+-------------------------------------------------------------+--------------------------------------------------------------+---------------------------------------------------------+
|                                                       subject|                                                     property|                                                        object|                                                  context|
+--------------------------------------------------------------+-------------------------------------------------------------+--------------------------------------------------------------+---------------------------------------------------------+
|         <http://rdf.ebi.ac.uk/resource/ensembl/taxon#2489341>|   <http://aging-research.group/resource/has_scientific_name>|    <http://aging-research.group/resource/Strigops_habroptila>|<http://rdf.ebi.ac.uk/resource/ensembl/confidence/high> .|
|    <ht

In [6]:
// Persist the species statements through the project's `writeTSV` extension; the cell output
// reports that the written parts are merged into a single file at this path.
// The positional `false` is presumably the header flag — TODO confirm against the extension's signature.
// NOTE(review): every row carries four terms (subject, property, object, context + " ."), which is
// the N-Quads shape, while the target file uses the .nt extension — confirm the importer accepts it.
trs.writeTSV("/data/databases/graphdb/import/species.nt", false)

parts of /data/databases/graphdb/import/species.nt merged!


/data/databases/graphdb/import/species.nt

Producing gene N-Triples<br>

In [9]:
// Load the gene lookup table as a typed Dataset of
// (taxon_id, stable_id, display_label, biotype_group, description, scientific_name, common_name, class).
// No null replacement is applied here; the preview output shows missing values encoded as \N.
val genes_n = spark.read
  .parquet("/data/ensembl/99/website/genes_lookup.parquet")
  .as[(
    java.lang.Integer,
    String, String, String, String, String, String, String
  )]
// Preview the first rows; cells are only truncated beyond 1000 characters.
genes_n.show(numRows = 20, truncate = 1000)

+--------+------------------+-------------+-------------+------------------------------------------------------------------------------------+---------------+--------------------+-----+
|taxon_id|         stable_id|display_label|biotype_group|                                                                         description|scientific_name|         common_name|class|
+--------+------------------+-------------+-------------+------------------------------------------------------------------------------------+---------------+--------------------+-----+
|  211598|ENSANIG00000000002|        SSTR4|       coding|                         somatostatin receptor 4 [Source:HGNC Symbol;Acc:HGNC:11333]|Accipiter nisus|Eurasian sparrowhawk| Aves|
|  211598|ENSANIG00000000003|           \N|       coding|                                                                                  \N|Accipiter nisus|Eurasian sparrowhawk| Aves|
|  211598|ENSANIG00000000004|         CD93|       coding|             

In [13]:
/** Renders `s` as a double-quoted literal for the object position of a statement.
  *
  * Backslash, double quote, line feed and carriage return are escaped (backslash first, so the
  * escapes added afterwards are not escaped again); these are the characters that may not appear
  * raw inside a quoted N-Triples literal.
  *
  * NOTE(review): `str` from an earlier cell serves the same purpose; consider keeping only one.
  *
  * @param s raw text; may be null
  * @return "" for null; the unescaped literal `"\N"` for the `\N` null marker, so that filters
  *         looking for exactly that text still recognise it; otherwise the escaped, quoted text
  */
def str2(s: String): String =
  if (s == null) ""
  else if (s == "\\N") "\"\\N\""
  else {
    val escaped = s
      .replace("\\", "\\\\")
      .replace("\"", "\\\"")
      .replace("\n", "\\n")
      .replace("\r", "\\r")
    "\"" + escaped + "\""
  }

In [11]:
// Gene statements: links each species to its genes through `sp` and attaches label, description
// and biotype to the gene IRI. common_name and cl are read but not exported.
val genes_trs = genes_n.flatMap{
    case (taxon_id,stable_id,display_label,biotype_group,description,scientific_name,common_name,cl) =>
    // Missing values arrive as null, "" or the \N marker seen in the table preview.
    def absent(v: String) = v == null || v.isEmpty || v == "\\N"
    // "" as a term makes trip drop the statement, so absent text never becomes a "" literal.
    def literal(v: String) = if (absent(v)) "" else str2(v)
    // Without a stable id there is no gene IRI to talk about; ens(null) would throw and
    // ens("\\N") would mint an IRI containing a backslash.
    if (absent(stable_id)) List.empty[(String, String, String, String)] else {
        val gene = ens(stable_id)
        // Avoids minting biotype/null or biotype/\N IRIs.
        val biotype = if (absent(biotype_group)) "" else ens("biotype/"+biotype_group)
        sp(taxon_id, scientific_name)("has_gene", gene) ++
        trip(gene, "<"+"http://www.w3.org/2000/01/rdf-schema#label" +">", literal(display_label))++
        // NOTE(review): RDF Schema defines rdfs:label and rdfs:comment but no `description` term.
        // The IRI is left unchanged because existing queries may rely on it; consider
        // rdfs:comment or dcterms:description.
        trip(gene, "<"+"http://www.w3.org/2000/01/rdf-schema#description" +">",  literal(description))++
        trip(gene, ens("has_biotype"), biotype)++
        Nil
    }
}.toDF("subject", "property", "object", "context")
genes_trs.show(40, 1000)

+----------------------------------------------------------+---------------------------------------------------+------------------------------------------------------------------------------------+---------------------------------------------------------+
|                                                   subject|                                           property|                                                                              object|                                                  context|
+----------------------------------------------------------+---------------------------------------------------+------------------------------------------------------------------------------------+---------------------------------------------------------+
|      <http://rdf.ebi.ac.uk/resource/ensembl/taxon#211598>|    <http://aging-research.group/resource/has_gene>|                          <http://rdf.ebi.ac.uk/resource/ensembl/ENSANIG00000000002>|<http://rdf.ebi.ac.uk/resource/ense

In [10]:
// Persist the gene statements through the project's `writeTSV` extension; the cell output
// reports that the written parts are merged into a single file at this path.
// `header = false` presumably suppresses a column-name row — TODO confirm against the extension.
// NOTE(review): every row carries four terms (subject, property, object, context + " ."), which is
// the N-Quads shape, while the target file uses the .nt extension — confirm the importer accepts it.
genes_trs.writeTSV("/data/databases/graphdb/import/genes_lookup.nt", header = false)

parts of /data/databases/graphdb/import/genes_lookup.nt merged!


/data/databases/graphdb/import/genes_lookup.nt