# ortho graph

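This notebook groups the Ensembl 99 ortholog table by homology, concatenates the member gene ids, and exports the relations as Parquet, TSV and pairwise `.nt` statements.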

In [1]:
import org.apache.spark.rdd._
import org.apache.spark.sql._
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.functions._
import group.research.aging.spark.extensions._
import group.research.aging.spark.extensions.functions._
import group.research.aging.spark.extensions.functions.ConcatenateString
import group.research.aging.spark.extensions.functions.Concatenate

In [2]:
// Load the intermediate orthology table (Parquet) and list its column names.
val ortho_table2 = spark.read
  .format("parquet")
  .load("/data/ensembl/99/website/intermediates/ortho_table2.parquet")
ortho_table2.schema.fieldNames.toList

List(homology_id, gene_member_id, stable_id, taxon_id, description, is_high_confidence)

In [3]:
// Aggregator from the project's spark extensions, constructed with "," and `true`.
// It is applied to `stable_id` in the groupBy cell below to produce the `genes` column.
// NOTE(review): "," is presumably the separator placed between values; the meaning of the
// boolean flag is not visible here — confirm against Concatenate's definition.
val con = new Concatenate(",", true)

In [4]:
// Keep only the four columns needed for the relations and view them as a typed 4-tuple
// (homology_id, description, is_high_confidence, stable_id); then list the column names.
val ortho_short = ortho_table2
  .select($"homology_id", $"description", $"is_high_confidence", $"stable_id")
  .as[(String, String, String, String)]
ortho_short.schema.fieldNames.toList

List(homology_id, description, is_high_confidence, stable_id)

In [5]:
// One row per (homology_id, description, is_high_confidence) group; the group's
// stable_id values are folded by the `con` aggregator into a single `genes` column.
val ortho_relations = ortho_short
  .groupBy(col("homology_id"), col("description"), col("is_high_confidence"))
  .agg(con(col("stable_id")).alias("genes"))
ortho_relations.printSchema()

root
 |-- homology_id: string (nullable = true)
 |-- description: string (nullable = true)
 |-- is_high_confidence: string (nullable = true)
 |-- genes: string (nullable = true)



In [6]:
// Persist the grouped relations as Parquet through the project's `writeParquet` extension.
// NOTE(review): the positional `true` presumably controls merging/overwriting of the output
// (the cell output reports "parts of ... merged!") — confirm against the extension's signature.
ortho_relations.writeParquet("/data/ensembl/99/website/intermediates/ortho_relations.parquet", true)

parts of /data/ensembl/99/website/intermediates/ortho_relations.parquet merged!


/data/ensembl/99/website/intermediates/ortho_relations.parquet

In [7]:
// Export the same relations as TSV through the project's `writeTSV` extension, with
// `header = true`.
// NOTE(review): presumably this emits a header row with the column names — confirm against
// the extension's definition.
ortho_relations.writeTSV("/data/ensembl/99/website/intermediates/ortho_relations.tsv", header = true)

parts of /data/ensembl/99/website/intermediates/ortho_relations.tsv merged!


/data/ensembl/99/website/intermediates/ortho_relations.tsv

In [8]:
// Reload the relations that were written to Parquet above and list the columns of the
// reloaded frame, so the round trip through disk is actually checked.
// NOTE(review): the val is spelled `orto_relations` (missing "h"). The name is kept as-is,
// so the cells below still read the in-memory `ortho_relations`, not this reloaded copy.
val orto_relations = spark.read.parquet("/data/ensembl/99/website/intermediates/ortho_relations.parquet")
orto_relations.columns.toList

List(homology_id, description, is_high_confidence, genes)

In [9]:
// Preview 10 rows. The second argument to show (10000) is the truncation width, chosen large
// enough that the comma-joined `genes` values are printed without being cut.
ortho_relations.limit(10).show(10, 10000)

+-----------+------------------+------------------+-------------------------------------+
|homology_id|       description|is_high_confidence|                                genes|
+-----------+------------------+------------------+-------------------------------------+
|  100000110|ortholog_many2many|                 0|ENSSDAG00000024266,ENSGACG00000020194|
|  100000413| ortholog_one2many|                 1|ENSPSTG00000007108,ENSATEG00000005345|
|  100000479|  ortholog_one2one|                 1|ENSSTOG00000026618,ENSCGRG00001024961|
|  100000517|ortholog_many2many|                 0|ENSSFAG00005026694,ENSKMAG00000003246|
|   10000055|  ortholog_one2one|                 1|ENSONIG00000002251,ENSEBUG00000006460|
|  100000691|  ortholog_one2one|                 0|ENSSARG00000014213,ENSCAPG00000011430|
|   10000093| ortholog_one2many|                 1|ENSPREG00000010639,ENSSANG00000048093|
|  100001019|ortholog_many2many|                 0|ENSNBRG00000007654,ENSSFAG00005017551|
|  1000011

In [10]:
// A 10-row sample of `ortho_relations`, marked for caching; it is read again below when
// building `nTriples10` and `n4`.
val lim10 = ortho_relations.limit(10).cache()

In [11]:
// Print the sample with show's defaults; long cell values are shortened with "..." as seen
// in the `genes` column of the output below.
lim10.show()

+-----------+------------------+------------------+--------------------+
|homology_id|       description|is_high_confidence|               genes|
+-----------+------------------+------------------+--------------------+
|  100000110|ortholog_many2many|                 0|ENSSDAG0000002426...|
|  100000413| ortholog_one2many|                 1|ENSPSTG0000000710...|
|  100000479|  ortholog_one2one|                 1|ENSSTOG0000002661...|
|  100000517|ortholog_many2many|                 0|ENSSFAG0000502669...|
|   10000055|  ortholog_one2one|                 1|ENSONIG0000000225...|
|  100000691|  ortholog_one2one|                 0|ENSSARG0000001421...|
|   10000093| ortholog_one2many|                 1|ENSPREG0000001063...|
|  100001019|ortholog_many2many|                 0|ENSNBRG0000000765...|
|  100001162|  ortholog_one2one|                 0|ENSUPAG0001000240...|
|  100001313|  ortholog_one2one|                 0|ENSSPAG0000001248...|
+-----------+------------------+------------------+--------------------+

In [12]:
// Expand every homology group into pairwise statements between its genes.
// For each unordered pair of gene ids in the comma-separated `genes` column, two rows are
// produced (one per direction). Each row holds the subject IRI, the property IRI built from
// the relation description, the object IRI, and a confidence IRI followed by " .".
val nTriples = ortho_relations
  .as[(String, String, String, String)]
  .flatMap { case (_, description, highFlag, geneList) =>
    val prefix = "http://rdf.ebi.ac.uk/resource/ensembl/"
    val predicate = s"<$prefix$description>"
    // Only the literal "0" maps to low confidence; any other value is treated as high.
    val level = if (highFlag == "0") "low" else "high"
    val context = s"<${prefix}confidence/$level> ."
    val pairs = geneList.split(",").toList.combinations(2).toList
    pairs.flatMap { pair =>
      // combinations(2) only yields two-element lists, so both indices are always present.
      val first = s"<$prefix${pair(0)}>"
      val second = s"<$prefix${pair(1)}>"
      List(
        (first, predicate, second, context),
        (second, predicate, first, context)
      )
    }
  }
  .toDF("subject", "property", "object", "context")

In [13]:
// Write all pairwise statements to a single `.nt` file through the project's `writeTSV`
// extension.
// NOTE(review): the positional `false` is presumably the `header` flag used by name in the
// TSV export above (no header row wanted here) — confirm the parameter order.
nTriples.writeTSV("/data/ensembl/99/website/intermediates/ortho_relations/all_orthologs.nt", false)

parts of /data/ensembl/99/website/intermediates/ortho_relations/all_orthologs.nt merged!


/data/ensembl/99/website/intermediates/ortho_relations/all_orthologs.nt

In [14]:
// Same pairwise expansion as for the full data set, applied to the cached 10-row sample so
// the result can be inspected quickly. Every unordered pair of gene ids in a row's `genes`
// value yields two rows, one per direction.
val nTriples10 = lim10
  .as[(String, String, String, String)]
  .flatMap { case (_, description, highFlag, geneList) =>
    val prefix = "http://rdf.ebi.ac.uk/resource/ensembl/"
    // Only the literal "0" maps to low confidence; any other value is treated as high.
    val level = if (highFlag == "0") "low" else "high"
    val context = s"<${prefix}confidence/$level> ."
    val predicate = s"<$prefix$description>"
    val iri = (gene: String) => s"<$prefix$gene>"
    geneList.split(",").toList.combinations(2).toList.flatMap { pair =>
      // combinations(2) only yields two-element lists, so head and last are the two genes.
      val (left, right) = (iri(pair.head), iri(pair.last))
      List(
        (left, predicate, right, context),
        (right, predicate, left, context)
      )
    }
  }
  .toDF("subject", "property", "object", "context")
nTriples10.show(1000, 1000)

+----------------------------------------------------------+----------------------------------------------------------+----------------------------------------------------------+---------------------------------------------------------+
|                                                   subject|                                                  property|                                                    object|                                                  context|
+----------------------------------------------------------+----------------------------------------------------------+----------------------------------------------------------+---------------------------------------------------------+
|<http://rdf.ebi.ac.uk/resource/ensembl/ENSSDAG00000024266>|<http://rdf.ebi.ac.uk/resource/ensembl/ortholog_many2many>|<http://rdf.ebi.ac.uk/resource/ensembl/ENSGACG00000020194>| <http://rdf.ebi.ac.uk/resource/ensembl/confidence/low> .|
|<http://rdf.ebi.ac.uk/resource/ensembl/ENSGACG00000

In [15]:
// Write the sample statements to their own `.nt` file through the project's `writeTSV`
// extension.
// NOTE(review): the positional `false` is presumably the `header` flag — confirm the
// parameter order in the extension's signature.
nTriples10.writeTSV("/data/ensembl/99/website/intermediates/ortho_relations/orthology_graph_10.nt", false)

parts of /data/ensembl/99/website/intermediates/ortho_relations/orthology_graph_10.nt merged!


/data/ensembl/99/website/intermediates/ortho_relations/orthology_graph_10.nt

In [17]:
// For each sampled homology group, emit one (gene, description, gene) row per gene id in the
// comma-separated `genes` value. The resulting columns keep the default tuple names _1.._3.
// The stray `gs.nNnn` expression that used to sit here was removed: it names no member of
// Array[String] and would stop the cell from compiling.
val n4 = lim10.as[(String, String, String, String)].flatMap { case (_, des, _, genes) =>
  val gs = genes.split(",")
  gs.map(g => (g, des, g))
}
n4.show()

+------------------+------------------+------------------+
|                _1|                _2|                _3|
+------------------+------------------+------------------+
|ENSSDAG00000024266|ortholog_many2many|ENSSDAG00000024266|
|ENSGACG00000020194|ortholog_many2many|ENSGACG00000020194|
|ENSPSTG00000007108| ortholog_one2many|ENSPSTG00000007108|
|ENSATEG00000005345| ortholog_one2many|ENSATEG00000005345|
|ENSSTOG00000026618|  ortholog_one2one|ENSSTOG00000026618|
|ENSCGRG00001024961|  ortholog_one2one|ENSCGRG00001024961|
|ENSSFAG00005026694|ortholog_many2many|ENSSFAG00005026694|
|ENSKMAG00000003246|ortholog_many2many|ENSKMAG00000003246|
|ENSONIG00000002251|  ortholog_one2one|ENSONIG00000002251|
|ENSEBUG00000006460|  ortholog_one2one|ENSEBUG00000006460|
|ENSSARG00000014213|  ortholog_one2one|ENSSARG00000014213|
|ENSCAPG00000011430|  ortholog_one2one|ENSCAPG00000011430|
|ENSPREG00000010639| ortholog_one2many|ENSPREG00000010639|
|ENSSANG00000048093| ortholog_one2many|ENSSANG0000004809

In [18]:
// Scratch cell: the same literal that the triple-building cells above bind to `base` and
// prepend to gene ids, descriptions and confidence levels.
"http://rdf.ebi.ac.uk/resource/ensembl/"