# Onto2Vec

In this part of the tutorial, we run two ontology-based methods that produce vector representations of biological entities: Onto2Vec and OPA2Vec.

## Imports

In [None]:
# Start the JVM before importing the rest of mOWL.
# NOTE(review): "20g" is presumably the JVM memory setting, and the later
# mowl / org.semanticweb imports presumably need a running JVM — confirm
# against the mOWL documentation.
import mowl
mowl.init_jvm("20g")
from mowl.datasets.ppi_yeast import PPIYeastSlimDataset, PPIYeastDataset


In [None]:
from mowl.reasoning.base import MOWLReasoner
from org.semanticweb.elk.owlapi import ElkReasonerFactory
from org.semanticweb.HermiT import Reasoner

In [None]:

# Load the yeast PPI (slim) dataset and run the ELK reasoner over its ontology.
ds = PPIYeastSlimDataset()

reasonerFactory = ElkReasonerFactory()
reasoner = reasonerFactory.createReasoner(ds.ontology)
reasoner.precomputeInferences()

# Report whether the reasoner considers the ontology consistent.
consistent = reasoner.isConsistent()
print(f"consistent: {consistent}")

# Wrap the reasoner in mOWL's helper and run subclass and equivalent-class
# inference on the ontology.
# NOTE(review): the return values are discarded, so these calls presumably
# add the inferred axioms to ds.ontology in place — confirm for the mOWL
# version in use.
mreasoner = MOWLReasoner(reasoner)
mreasoner.infer_subclass_axioms(ds.ontology)
mreasoner.infer_equiv_class_axioms(ds.ontology)



## Onto2Vec

Onto2Vec produces vector representations based on the logical axioms of an ontology and the known associations between ontology classes and biological entities. In the case study below, we use Onto2Vec to produce vector representations of proteins based on their GO annotations and the GO logical axioms.

In [None]:
from mowl.corpus.base import extract_axiom_corpus, extract_and_save_axiom_corpus
# Extract the axiom corpus from the ontology and keep it in memory.
corpus = extract_axiom_corpus(ds.ontology)
# Extract the corpus again and write it to the given output file.
extract_and_save_axiom_corpus(ds.ontology, out_file = "data/onto2vec_axiom_corpus")

In [None]:
# Display the number of entries in the in-memory axiom corpus.
len(corpus)

In [None]:
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence

# Stream the training sentences from the corpus file on disk. The path must
# match the out_file passed to extract_and_save_axiom_corpus, which is the
# only corpus file this notebook writes.
sentences = LineSentence("data/onto2vec_axiom_corpus")

# Train the embeddings with gensim's Word2Vec.
#   sg=1           selects the skip-gram training algorithm
#   min_count=1    keeps every token, including those seen only once
#   vector_size=20 sets the embedding dimensionality
#   window=5       sets the context window size
#   epochs=20      sets the number of passes over the corpus
#   workers=4      sets the number of training threads
model = Word2Vec(
    sentences,
    sg=1,
    min_count=1,
    vector_size=20,
    window=5,
    epochs=20,
    workers=4,
)

# Keep only the trained keyed vectors for the evaluation and plotting cells.
vectors = model.wv


# Evaluation

In [None]:
from mowl.projection.edge import Edge
from mowl.projection.factory import projector_factory

# Build a projector for the evaluation data.
# NOTE(review): presumably taxonomy=False drops subclass edges so that only
# the listed "interacts_with" relation is projected — confirm against the
# mOWL projector documentation.
eval_projector = projector_factory('taxonomy_rels', taxonomy=False, relations=["http://interacts_with"])

# Project the training ontology and the held-out testing ontology.
training_set = eval_projector.project(ds.ontology)
testing_set = eval_projector.project(ds.testing)

# Collect the entities appearing in each projection; the second returned
# value (the relations) is not needed here.
training_entities,_ = Edge.getEntitiesAndRelations(training_set)
testing_entities,_ = Edge.getEntitiesAndRelations(testing_set)

# Deduplicated union of the training and testing entities.
entities = list(set(training_entities) | set(testing_entities))


In [None]:
from mowl.evaluation.base import RankBasedEvaluator, CosineSimilarity

# Set up a rank-based evaluation of the embeddings on the testing set, using
# cosine similarity as the scoring method. The same entity list is used for
# both heads and tails.
# NOTE(review): training_set is presumably used to filter known training
# pairs out of the ranking — confirm against the mOWL evaluator documentation.
evaluator = RankBasedEvaluator(
        vectors, 
        testing_set, 
        CosineSimilarity, 
        training_set = training_set, 
        head_entities=entities,
        tail_entities=entities)

# Run the evaluation; show=True presumably prints the resulting metrics.
evaluator.evaluate(show=True)

# TSNE

In [None]:
# Map entity IRIs of the form "http://<id>" to the leading component of the
# value in the fifth tab-separated column of data/yeast_ec.tab.
# NOTE(review): going by the names, column 4 presumably holds
# ';'-separated protein identifiers and column 5 an EC number — confirm
# against the file.
ec_numbers = {}
with open('data/yeast_ec.tab') as handle:
    next(handle)  # discard the first line (presumably a header row)
    for row in handle:
        fields = row.strip().split('\t')
        # Ignore rows with fewer than five columns or an empty fourth column.
        if len(fields) < 5 or not fields[3]:
            continue
        # Only the first ';'-separated identifier is used as the key.
        prot_id = fields[3].split(';')[0]
        # Keep the part of the fifth column before the first '.'.
        ec_numbers[f"http://{prot_id}"] = fields[4].split(".")[0]

In [None]:
from mowl.visualization.base import TSNE as MTSNE

# Set up mOWL's t-SNE helper with the embeddings, the entity -> label mapping
# built from the EC file, and the entities to consider.
tsne = MTSNE(vectors, ec_numbers, entities = entities)
# NOTE(review): 5000 is presumably the number of t-SNE iterations and
# workers the degree of parallelism — confirm against the mOWL docs.
tsne.generate_points(5000, workers = 16, verbose = 1)

In [None]:
# Display the t-SNE visualization of the generated points.
tsne.show()