# **n-quads**

Saving the species and gene lookup tables as N-Quads files for import into GraphDB.<br>




In [1]:
import org.apache.spark.rdd._
import org.apache.spark.sql._
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.functions._
import group.research.aging.spark.extensions._
import group.research.aging.spark.extensions.functions._
import group.research.aging.spark.extensions.functions.ConcatenateString
import group.research.aging.spark.extensions.functions.Concatenate
import ammonite.ops._
import ammonite.ops.ImplicitWd._

In [2]:
// Species lookup table, read as a typed 12-column tuple.
// `na.fill("")` only touches string columns, so the boxed numeric columns can still be null.
val species_n = spark.read
  .parquet("/data/ensembl/99/website/species_lookup.parquet")
  .na.fill("")
  .as[(java.lang.Integer, String, String, String, String,
       java.lang.Double, java.lang.Double, java.lang.Double, java.lang.Double,
       String, String, String)]
species_n.show(10, 10000)

+--------+------------------------+--------------------+------------------------------------------------+-----+--------+------+--------------+-------------+---------------+-----------+------------+
|taxon_id|         scientific_name|         common_name|                                             url|class|lifespan|mass_g|metabolic_rate|temperature_k|specimen_origin|sample_size|data_quality|
+--------+------------------------+--------------------+------------------------------------------------+-----+--------+------+--------------+-------------+---------------+-----------+------------+
| 2489341|     Strigops habroptila|              Kakapo|     https://www.ensembl.org/Strigops_habroptila| Aves|    60.0|  null|          null|         null|      captivity|      small|  acceptable|
|  132585|    Anser brachyrhynchus|   Pink-footed goose|    https://www.ensembl.org/Anser_brachyrhynchus| Aves|    40.9|  null|          null|         null|           wild|     medium|  acceptable|
|    9031|

In [3]:
// Base IRIs of the three namespaces used when building resource identifiers.
val prefix = "http://aging-research.group/resource/"
val prefix_ensembl = "http://rdf.ebi.ac.uk/resource/ensembl/"
val prefix_samples = "http://aging-research.group/samples/"

/** `<prefix_samples + str>` with every space in `str` replaced by an underscore. */
def samp(str: String) = s"<$prefix_samples${str.replace(' ', '_')}>"

/** `<prefix + str>` with every space in `str` replaced by an underscore. */
def pref(str: String) = s"<$prefix${str.replace(' ', '_')}>"

/** `<prefix_ensembl + str>` with every space in `str` replaced by an underscore. */
def ens(str: String) = s"<$prefix_ensembl${str.replace(' ', '_')}>"

/** Ensembl-namespace IRI of the form `<...ensembl/taxon#str>`. */
def tax(str: String) = ens(s"taxon#$str")
/**
 * Renders `s` as a double-quoted RDF string literal.
 *
 * Returns "" for null and for the bare NULL marker `\N`; callers that drop empty terms
 * (such as `trip`) therefore skip the statement. Backslash, double quote, newline,
 * carriage return and tab are escaped so the literal stays on one line and cannot
 * terminate early.
 */
def str(s: String) =
  if (s == null || s == "\\N") ""
  else {
    // Backslash must be escaped first, otherwise the escapes added below would be doubled.
    val escaped = s
      .replace("\\", "\\\\")
      .replace("\"", "\\\"")
      .replace("\n", "\\n")
      .replace("\r", "\\r")
      .replace("\t", "\\t")
    "\"" + escaped + "\""
  }

In [4]:
/**
 * Builds at most one statement row `(subject, property, object, context)`.
 *
 * Returns Nil when any term is null or empty, when the subject or object is the bare
 * NULL marker `\N`, when the subject equals the quoted marker `"\N"`, or when the object
 * starts with it. The context cell is the fixed "confidence/high" graph IRI followed by " .".
 */
def trip(s: String, p: String, o: String):List[(String, String, String, String)] = {
  val nullMarker = "\\N"
  val quotedNullMarker = "\"\\N\""
  def blank(term: String) = term == null || term == ""

  val unusable =
    blank(s) || blank(p) || blank(o) ||
      o == nullMarker || o.startsWith(quotedNullMarker) ||
      s == nullMarker || s == quotedNullMarker

  if (unusable) Nil
  else List((s, p, o, "<http://rdf.ebi.ac.uk/resource/ensembl/confidence/high> ."))
}

/**
 * Builds the statements describing one property of a species, once with the taxon IRI
 * as subject and once with the scientific-name IRI as subject.
 *
 * @param taxon_id        taxon id; a null id yields Nil
 * @param scientific_name species name; when null or empty only the taxon-keyed statement
 *                        is produced, since a blank name would otherwise become the bare
 *                        namespace IRI (or throw on null)
 * @param p               local name of the property, expanded with `pref`
 * @param o               object; null yields Nil. Doubles and integral numbers are rendered
 *                        as typed literals because N-Quads does not allow bare numbers;
 *                        anything else is used via `toString` and must already be a
 *                        well-formed term.
 */
def sp(taxon_id: java.lang.Integer, scientific_name: String)(p: String, o: Any) = if(taxon_id==null || o == null) Nil else {
      def typed(lexical: String, datatype: String) =
        "\"" + lexical + "\"^^<http://www.w3.org/2001/XMLSchema#" + datatype + ">"

      val obj = o match {
        // NaN/Infinity print as "NaN"/"Infinity", which is not the XSD lexical form; "" is dropped by trip.
        case d: java.lang.Double if d.isNaN || d.isInfinite => ""
        case d: java.lang.Double => typed(d.toString, "double")
        case _: java.lang.Integer | _: java.lang.Long | _: java.lang.Short | _: java.lang.Byte =>
          typed(o.toString, "integer")
        case other => other.toString
      }

      val byTaxon = trip(tax(taxon_id.toString), pref(p), obj)
      val byName =
        if (scientific_name == null || scientific_name.isEmpty) Nil
        else trip(pref(scientific_name), pref(p), obj)
      byTaxon ++ byName
}

In [5]:
// Turns every species row into statement rows (subject, property, object, context).
// Rows without a taxon id produce nothing (previously `taxon_id.toString` threw on null),
// and blank string cells are skipped instead of becoming `<>`, a bare namespace IRI or "".
val trs = species_n.flatMap{
    case (taxon_id,scientific_name,common_name,url,animal_class,
    lifespan,mass_g,metabolic_rate,temperature_k,
    specimen_origin,sample_size,data_quality) =>
        // Renders only non-blank values; the "" returned otherwise is discarded by trip.
        def present(value: String)(render: String => String): String =
            if (value == null || value.isEmpty) "" else render(value)

        if (taxon_id == null) List.empty[(String, String, String, String)] else {
            val taxon = tax(taxon_id.toString)
            val name_iri = present(scientific_name)(n => pref(n))
            trip(taxon, pref("has_scientific_name"), name_iri)++
            trip(name_iri, pref("has_taxon"), taxon)++
            sp(taxon_id, scientific_name)("has_common_name", present(common_name)(n => str(n)))++
            sp(taxon_id, scientific_name)("has_lifespan", lifespan)++
            sp(taxon_id, scientific_name)("has_temperature_kelvin", temperature_k)++
            sp(taxon_id, scientific_name)("has_mass_g", mass_g) ++
            sp(taxon_id, scientific_name)("has_metabolic_rate", metabolic_rate) ++
            sp(taxon_id, scientific_name)("is_animal_class", present(animal_class)(c => ens(c))) ++
            sp(taxon_id, scientific_name)("has_ensembl_url", present(url)(u => "<"+u+">")) ++
            Nil
        }
}.toDF("subject", "property", "object", "context")
trs.show(100,1000)

+--------------------------------------------------------------+-------------------------------------------------------------+--------------------------------------------------------------+---------------------------------------------------------+
|                                                       subject|                                                     property|                                                        object|                                                  context|
+--------------------------------------------------------------+-------------------------------------------------------------+--------------------------------------------------------------+---------------------------------------------------------+
|         <http://rdf.ebi.ac.uk/resource/ensembl/taxon#2489341>|   <http://aging-research.group/resource/has_scientific_name>|    <http://aging-research.group/resource/Strigops_habroptila>|<http://rdf.ebi.ac.uk/resource/ensembl/confidence/high> .|
|    <ht

In [6]:
// Writes the species statement rows to the GraphDB import directory without a header row.
trs.writeTSV("/data/databases/graphdb/import/species.nq", header = false)

parts of /data/databases/graphdb/import/species.nt merged!


/data/databases/graphdb/import/species.nt

# Processing samples



In [8]:
// Sample metadata: nulls in string columns become "N/A", then only indexed RNA-Seq runs
// are kept, ordered by organism (descending) and library layout.
val samples = spark
  .readTSV("/data/samples/species/species.tsv", header=true)
  .na.fill("N/A")
  .filter($"index" =!= "N/A")
  .filter($"library_strategy" === "RNA-Seq")
  .orderBy($"organism".desc, $"library_layout")
samples.show(10, 10000)

+-----------+-----------+----------+----------------+-----+-----------+----------------------------+----------------+--------------+-----------------+---------+-------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------+-------------+-----------+---+-----+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+--------------+-----------+-------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------+------------------------

In [9]:
// Column names of the samples table, in schema order.
samples.schema.fieldNames.toList

List(bioproject, series, run, organism, taxid, sample_name, sequencer, library_strategy, library_layout, library_selection, study, study_title, characteristics, source, age, sex, tumor, protocol, salmon_version, index, genes, transcripts, quant, libType,

In [10]:
// Prints column names, types and nullability of the samples table.
samples.printSchema()

root
 |-- bioproject: string (nullable = false)
 |-- series: string (nullable = false)
 |-- run: string (nullable = false)
 |-- organism: string (nullable = false)
 |-- taxid: integer (nullable = true)
 |-- sample_name: string (nullable = false)
 |-- sequencer: string (nullable = false)
 |-- library_strategy: string (nullable = false)
 |-- library_layout: string (nullable = false)
 |-- library_selection: string (nullable = false)
 |-- study: string (nullable = false)
 |-- study_title: string (nullable = false)
 |-- characteristics: string (nullable = false)
 |-- source: string (nullable = false)
 |-- age: string (nullable = false)
 |-- sex: string (nullable = false)
 |-- tumor: string (nullable = false)
 |-- protocol: string (nullable = false)
 |-- salmon_version: string (nullable = false)
 |-- index: string (nullable = false)
 |-- genes: string (nullable = false)
 |-- transcripts: string (nullable = false)
 |-- quant: string (nullable = false)
 |-- libType: string (nullable = false)
 

In [13]:
// Base URLs of the NCBI pages that sample identifiers are linked to.
val bioproject_prefix = "https://www.ncbi.nlm.nih.gov/bioproject/"
val experiment_prefix = "https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc="
val sra_prefix = "https://www.ncbi.nlm.nih.gov/sra/"

/** IRI `<bioproject_prefix + str>`. */
def bioproject(str: String) = s"<${bioproject_prefix}${str}>"

/** IRI under `experiment_prefix` when `str` contains "GSE", otherwise the bioproject IRI. */
def experiment(str: String) = if(str.contains("GSE")) s"<${experiment_prefix}$str>" else bioproject(str)

/**
 * IRI `<sra_prefix + str>` for a sequencing run.
 * Previously this used `experiment_prefix`, which left `sra_prefix` unused and disagreed
 * with the prefix applied to the `run` column when bioproject quads are built.
 */
def run(str: String) = s"<${sra_prefix}${str}>"
// Property IRIs in the resource namespace, built once and reused.
val has_experiment = pref("has_experiment")
val has_run = pref("has_run")
val has_sample_name = pref("has_sample_name")
// NOTE(review): plural name but the same IRI as `has_experiment` — confirm this is intended.
val has_experiments = has_experiment


In [23]:
/**
 * Combines a prefix and a value into one RDF term.
 *
 *  - prefix starting with "xsd:": a typed literal `"str"^^prefix`
 *  - prefix containing "http":    an IRI `<prefix + str>`
 *  - anything else:               plain concatenation `prefix + str`
 *
 * The typed-literal branch used to end with an extra `"` after the datatype because the
 * triple-quoted interpolation was closed with four quotes; that stray quote is removed.
 * NOTE(review): `str` is inserted without escaping, and an `xsd:` prefixed name is not a
 * legal datatype in N-Quads (a full IRI is required) — confirm how these files are loaded.
 */
def pre(prefix: String, str: String) =
    if(prefix.startsWith("xsd:")) "\"" + str + "\"^^" + prefix else
    if(prefix.contains("http")) "<" + prefix + str + ">" else
    prefix + str

In [24]:
  /**
   * Expands every row of `dataFrame` into one statement row per object column.
   *
   * Note: the first selected column is the subject, all others are objects.
   *
   * @param dataFrame      source rows
   * @param subject_prefix prefix passed to `pre` together with the subject cell
   * @param subject_column column holding the subject value
   * @param properties     property terms, one per object column (used as given)
   * @param prefixes       prefixes passed to `pre` together with each object cell
   * @param object_columns columns holding the object values
   * @param context        graph IRI without angle brackets; emitted as `<context> .`
   * @return DataFrame with columns subject, property, object, context
   * @throws IllegalArgumentException if properties, prefixes and object_columns differ in length
   */
  def toQuads(dataFrame: DataFrame, subject_prefix: String, subject_column: String,
  properties: Seq[String], prefixes: Seq[String], object_columns: Seq[String], context: String = prefix_samples) = {
    require(properties.length == prefixes.length && prefixes.length == object_columns.length, s"not same number of properties(${properties.length}), prefixes(${prefixes.length}) and columns(${object_columns.length})")
    val df = dataFrame.select(subject_column, object_columns:_*)
    // Row-independent values are computed once instead of per row.
    val graph = "<" + context + ">" + " ."
    val specs = properties.zip(prefixes.zip(object_columns))
    df.flatMap{ case row =>
            // String.valueOf leaves string cells unchanged and renders null as "null" (what
            // string concatenation produced before), while also accepting non-string
            // columns such as an integer id, where getAs[String] threw ClassCastException.
            val sub = pre(subject_prefix, String.valueOf(row.getAs[Any](subject_column)))
            specs.map{
                case (prop, (pref, col)) =>
                    val o = pre(pref, String.valueOf(row.getAs[Any](col)))
                    (sub, prop, o, graph)
            }
    }.toDF("subject", "property", "object", "context")
  }

In [25]:
/**
 * Variant of `toQuads` that derives each property from its column name:
 * column `c` gets the property `pre(property_prefix, "has_" + c)`.
 * `prefixes` supplies the object prefix for each entry of `columns`.
 */
def toHasQuads(dataFrame: DataFrame, subject_prefix: String, subject_column: String,
   prefixes: Seq[String], columns: Seq[String], context: String = prefix_samples, property_prefix: String = prefix_samples) = {
      val has_properties = for (column <- columns) yield pre(property_prefix, "has_" + column)
      toQuads(
        dataFrame,
        subject_prefix,
        subject_column,
        properties = has_properties,
        prefixes = prefixes,
        object_columns = columns,
        context = context)
  }

/**
 * Variant of `toQuads` in which every object column shares `object_prefix` and each
 * property is `pre(property_prefix, c)` for column `c` (no "has_" is added here).
 */
def toHasPrefixedQuads(dataFrame: DataFrame, subject_prefix: String, subject_column: String,
columns: Seq[String], context: String = prefix_samples, property_prefix: String = prefix_samples, object_prefix: String = prefix_samples) = {
    val column_properties = for (column <- columns) yield pre(property_prefix, column)
    val shared_prefixes = for (_ <- columns) yield object_prefix
    toQuads(
      dataFrame,
      subject_prefix,
      subject_column,
      properties = column_properties,
      prefixes = shared_prefixes,
      object_columns = columns,
      context = context)
}

In [15]:
// Samples split by whether their series is a GEO series ("GSE...") or not.
// (`samples.partition()` is not part of the Spark Dataset API, so the split is done with filters.)
val is_gse = $"series".contains("GSE")
val (samples_gse, samples_main) = (samples.where(is_gse), samples.where(!is_gse))

// Object prefixes for the series, run and study columns, in that order.
val bioproject_object_prefixes = Seq(experiment_prefix, sra_prefix, "https://trace.ncbi.nlm.nih.gov/Traces/sra/?study=")
val bioproject_object_columns = Seq("series", "run", "study")

// The two halves are stacked with `union`. A `join` without a condition would instead pair
// every row of one half with every row of the other and duplicate all four column names.
// NOTE(review): both halves use the same prefixes; the non-GSE half may have been meant to
// link `series` via `bioproject_prefix` (as `experiment` does) — confirm.
val trip_bioprojects = toHasQuads(samples_gse, bioproject_prefix, "bioproject",
    bioproject_object_prefixes, bioproject_object_columns)
  .union(toHasQuads(samples_main, bioproject_prefix, "bioproject",
    bioproject_object_prefixes, bioproject_object_columns))
trip_bioprojects.show(1000, 10000)

+-----------------------------------------------------+------------------------------------------------+------------------------------------------------------------------------+----------------------------------------+
|                                              subject|                                        property|                                                                  object|                                 context|
+-----------------------------------------------------+------------------------------------------------+------------------------------------------------------------------------+----------------------------------------+
|<https://www.ncbi.nlm.nih.gov/bioproject/PRJNA184055>|<http://aging-research.group/samples/has_series>|           <https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE43013>|<http://aging-research.group/samples/> .|
|<https://www.ncbi.nlm.nih.gov/bioproject/PRJNA184055>|   <http://aging-research.group/samples/has_run>|                    

In [16]:
// Rebuilds the species statement rows (subject, property, object, context).
// Rows without a taxon id produce nothing (previously `taxon_id.toString` threw on null),
// and blank string cells are skipped instead of becoming `<>`, a bare namespace IRI or "".
val trs = species_n.flatMap{
    case (taxon_id,scientific_name,common_name,url,animal_class,
    lifespan,mass_g,metabolic_rate,temperature_k,
    specimen_origin,sample_size,data_quality) =>
        // Renders only non-blank values; the "" returned otherwise is discarded by trip.
        def present(value: String)(render: String => String): String =
            if (value == null || value.isEmpty) "" else render(value)

        if (taxon_id == null) List.empty[(String, String, String, String)] else {
            val taxon = tax(taxon_id.toString)
            val name_iri = present(scientific_name)(n => pref(n))
            trip(taxon, pref("has_scientific_name"), name_iri)++
            trip(name_iri, pref("has_taxon"), taxon)++
            sp(taxon_id, scientific_name)("has_common_name", present(common_name)(n => str(n)))++
            sp(taxon_id, scientific_name)("has_lifespan", lifespan)++
            sp(taxon_id, scientific_name)("has_temperature_kelvin", temperature_k)++
            sp(taxon_id, scientific_name)("has_mass_g", mass_g) ++
            sp(taxon_id, scientific_name)("has_metabolic_rate", metabolic_rate) ++
            sp(taxon_id, scientific_name)("is_animal_class", present(animal_class)(c => ens(c))) ++
            sp(taxon_id, scientific_name)("has_ensembl_url", present(url)(u => "<"+u+">")) ++
            Nil
        }
}.toDF("subject", "property", "object", "context")
trs.show(100,1000)

+--------------------------------------------------------------+-------------------------------------------------------------+--------------------------------------------------------------+---------------------------------------------------------+
|                                                       subject|                                                     property|                                                        object|                                                  context|
+--------------------------------------------------------------+-------------------------------------------------------------+--------------------------------------------------------------+---------------------------------------------------------+
|         <http://rdf.ebi.ac.uk/resource/ensembl/taxon#2489341>|   <http://aging-research.group/resource/has_scientific_name>|    <http://aging-research.group/resource/Strigops_habroptila>|<http://rdf.ebi.ac.uk/resource/ensembl/confidence/high> .|
|    <ht

# **Producing genes n-quads**<br>




In [18]:
// Gene lookup table, read as a typed 8-column tuple (no null filling is applied here).
val genes_n = spark.read
  .parquet("/data/ensembl/99/website/genes_lookup.parquet")
  .as[(java.lang.Integer, String, String, String, String, String, String, String)]
genes_n.show(20, 1000)

+--------+------------------+-------------+-------------+------------------------------------------------------------------------------------+---------------+--------------------+-----+
|taxon_id|         stable_id|display_label|biotype_group|                                                                         description|scientific_name|         common_name|class|
+--------+------------------+-------------+-------------+------------------------------------------------------------------------------------+---------------+--------------------+-----+
|  211598|ENSANIG00000000002|        SSTR4|       coding|                         somatostatin receptor 4 [Source:HGNC Symbol;Acc:HGNC:11333]|Accipiter nisus|Eurasian sparrowhawk| Aves|
|  211598|ENSANIG00000000003|           \N|       coding|                                                                                  \N|Accipiter nisus|Eurasian sparrowhawk| Aves|
|  211598|ENSANIG00000000004|         CD93|       coding|             

In [19]:
/**
 * Renders `s` as a double-quoted RDF string literal.
 *
 * Returns "" for null and for the bare NULL marker `\N` (which appears in the gene table);
 * callers that drop empty terms (such as `trip`) therefore skip the statement. Backslash,
 * double quote, newline, carriage return and tab are escaped so the literal stays on one
 * line and cannot terminate early.
 */
def str2(s: String) =
  if (s == null || s == "\\N") ""
  else {
    // Backslash must be escaped first, otherwise the escapes added below would be doubled.
    val escaped = s
      .replace("\\", "\\\\")
      .replace("\"", "\\\"")
      .replace("\n", "\\n")
      .replace("\r", "\\r")
      .replace("\t", "\\t")
    "\"" + escaped + "\""
  }

In [20]:
// One set of statement rows per gene: species -> gene link, label, description and biotype.
// NOTE(review): the RDFS vocabulary defines `comment` and `label` but no `description`
// term — confirm the description predicate is the intended one.
val genes_trs = genes_n.flatMap{
    case (taxon_id, stable_id, display_label, biotype_group, description, scientific_name, _, _) =>
    val gene = ens(stable_id)
    val rdfs = "http://www.w3.org/2000/01/rdf-schema#"

    val species_has_gene = sp(taxon_id, scientific_name)("has_gene", gene)
    val label = trip(gene, s"<${rdfs}label>", str2(display_label))
    val gene_description = trip(gene, s"<${rdfs}description>", str2(description))
    val biotype = trip(gene, ens("has_biotype"), ens(s"biotype/$biotype_group"))

    species_has_gene ++ label ++ gene_description ++ biotype
}.toDF("subject", "property", "object", "context")
genes_trs.show(40, 1000)

+----------------------------------------------------------+---------------------------------------------------+------------------------------------------------------------------------------------+---------------------------------------------------------+
|                                                   subject|                                           property|                                                                              object|                                                  context|
+----------------------------------------------------------+---------------------------------------------------+------------------------------------------------------------------------------------+---------------------------------------------------------+
|      <http://rdf.ebi.ac.uk/resource/ensembl/taxon#211598>|    <http://aging-research.group/resource/has_gene>|                          <http://rdf.ebi.ac.uk/resource/ensembl/ENSANIG00000000002>|<http://rdf.ebi.ac.uk/resource/ense

In [21]:
// Writes the gene statement rows to the GraphDB import directory without a header row.
genes_trs.writeTSV("/data/databases/graphdb/import/genes_lookup.nq", header = false)

parts of /data/databases/graphdb/import/genes_lookup.nt merged!


/data/databases/graphdb/import/genes_lookup.nt