# ELEmbeddings

## Imports

In [None]:
import sys
import os
# Extend the module search path three directories up — presumably so the
# local mowl checkout is importable; confirm against the repository layout.
sys.path.append('../../../')
import mowl
# init_jvm is called before the mowl submodules below are imported.
# NOTE(review): "5g" is presumably the JVM memory setting — confirm.
mowl.init_jvm("5g")
from mowl.datasets.ppi_yeast import PPIYeastDataset, PPIYeastSlimDataset
from mowl.datasets.base import PathDataset
from mowl.embeddings.elembeddings.model import ELEmbeddings

import numpy as np
import torch as th
import logging
# Configure the root logger to emit records at DEBUG level and above.
logging.basicConfig(level = logging.DEBUG)

## Loading the dataset

In [None]:
# Instantiate the PPIYeastSlimDataset imported from mowl.datasets.ppi_yeast.
dataset = PPIYeastSlimDataset()
# Alternative kept for reference: build the dataset from local OWL files.
#root = "data_old/"
#dataset = PathDataset(root+"yeast-classes.owl", root+"valid.owl", root+"test.owl")

## ELEmbeddings

In [None]:
# Build the ELEmbeddings model on the "cuda" device, pointing it at the
# model file under data/models/.
device = "cuda"
model = ELEmbeddings(
    dataset,
    epochs=5001,
    margin=-0.1,
    device=device,
    model_filepath="data/models/elmodel.th",
)
# Training is skipped in this cell (model.train() is not called);
# load_best_model() is invoked instead.
model.load_best_model()

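Train the model (the call below is commented out because the saved model is loaded in the previous cell; uncomment it to retrain).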

In [None]:
#model.train()

## Inference 

In [None]:
from mowl.inference.el import GCI0Score

# Classes handed to the GCI0 scorer.
gci0_classes = [
    "http://4932.Q0045",
    "http://4932.Q0050",
    "http://4932.Q0055",
]
gci0_score = GCI0Score(model.gci0_loss, gci0_classes)
print(gci0_score.patterns)
# Last expression of the cell, so the notebook displays the returned value.
gci0_score.score("not c?http.*? or c?http.*?  SubClassOf owl:Nothing")

In [None]:
from mowl.inference.el import GCI2Score

# Build the GCI2 scorer over every key of the model's class embeddings and
# every key of its relation embeddings.
embeddings, rel_embs = model.get_embeddings()
class_list = list(embeddings)
gci2_score = GCI2Score(
    model.gci2_loss,
    class_list,
    property_list=list(rel_embs.keys()),
)
# A narrower variant used three 4932.Q* classes and only
# "http://interacts_with" as the property list.
print(gci2_score.patterns)
preds_elem = gci2_score.score(
    "c?http://4932.Q0017? SubClassOf p?.*? some  c?http://4932.Q0032?"
)
# Last expression of the cell, so the notebook displays it.
preds_elem

In [None]:

from mowl.projection.factory import projector_factory
from mowl.walking.factory import walking_factory

from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence

# Pipeline: project the ontology to edges, write walks over those edges to a
# text file, then train a Word2Vec model on the walk file.
walks_path = "/tmp/walks.txt"

projector = projector_factory("dl2vec", bidirectional_taxonomy=True)
edges = projector.project(dataset.ontology)

walker = walking_factory(
    "deepwalk",
    alpha=0.1,
    walk_length=10,
    num_walks=10,
    outfile=walks_path,
)
walker.walk(edges)

# The walker's output file is read back as the training corpus.
corpus = LineSentence(walks_path)

w2v_params = {
    "sg": 1,
    "min_count": 1,
    "vector_size": 10,
    "window": 10,
    "epochs": 10,
    "workers": 16,
}
w2v_model = Word2Vec(corpus, **w2v_params)






In [None]:
from gensim.models import Word2Vec
# NOTE(review): the save call below is commented out, so the load depends on
# a /tmp/w2v_model file written by some earlier run — confirm it exists, or
# re-enable the save after training.
#w2v_model.save("/tmp/w2v_model")
w2v_model = Word2Vec.load("/tmp/w2v_model")
# `embeddings` now refers to the loaded model's `wv` attribute.
embeddings = w2v_model.wv

In [None]:
from mowl.inference.cosine import CosineSimilarityInfer

# Score "interacts_with" axiom candidates with CosineSimilarityInfer, using
# the current `embeddings` object.
print(len(embeddings))
cos_infer = CosineSimilarityInfer(embeddings, "http://interacts_with")
# Raw string literal: the pattern contains the regex escape `\.`, which is an
# invalid escape sequence in a normal string literal and triggers a
# DeprecationWarning/SyntaxWarning. The runtime value of the string is
# unchanged.
preds = cos_infer.score(
    r"c?.*?4932\.(Q|Q).*? SubClassOf http://interacts_with some  c?.*?4932.*?"
)
# Last expression of the cell, so the notebook displays the count.
len(preds)

In [None]:
#preds

In [None]:
from mowl.evaluation.predictions import evaluate_predictions
from mowl.corpus.base import extract_axiom_corpus

# Extract the axiom corpus of the testing ontology and evaluate the GCI2
# predictions (`preds_elem`) against it.
corpus = extract_axiom_corpus(dataset.testing)

metrics = evaluate_predictions(
    corpus,
    preds_elem,
    [1, 10, 100, 1000, 10000, 160000, 162918, 3336802],
    pos_label=0,
)
print(metrics)

In [None]:
import pickle as pkl

# Serialize `preds` to predictions.pkl in the current working directory.
with open("predictions.pkl", "wb") as out_file:
    pkl.dump(preds, out_file)

In [None]:
# Print every element of `preds`, one per line.
for p in preds:
    print(p)

In [None]:
def exclude_gos(x):
    """Return True when the substring "GO" does not occur in ``x``."""
    return "GO" not in x


def subclass_condition(x):
    """Return True when ``x`` is one of the three selected 4932.Q* IRIs."""
    return x in ["http://4932.Q0045", "http://4932.Q0050", "http://4932.Q0055"]


def relation_condition(x):
    """Return True when ``x`` is the "http://interacts_with" IRI."""
    return x in ["http://interacts_with"]


# This call used to be indented without any enclosing block, which made the
# whole cell fail with an IndentationError; it now sits at top level.
# Other modes previously tried here (kept commented out in the original cell)
# were "infer_subclass" and "infer_filler" with top_k = 10 and
# property_condition = relation_condition.
model.infer_gci2(
    "infer_property",
    top_k=float("inf"),
    subclass_condition=subclass_condition,
    filler_condition=exclude_gos,
    axioms_to_filter=dataset.ontology,
)


In [None]:
# Keep only the entries of model.property_inferences whose value is strictly
# greater than zero, then show how many remain.
property_inferences = model.property_inferences
non_zero_inferences = {
    key: value
    for key, value in property_inferences.items()
    if value > 0.0
}
len(non_zero_inferences)

In [None]:
# Display the first ten (key, value) pairs of the filtered inferences.
list(non_zero_inferences.items())[:10]

In [None]:
# Run GCI2 inference in "infer_filler" mode with top_k = 1000, then read the
# result back from the model's `filler_inferences` attribute.
model.infer_gci2(
    "infer_filler",
    top_k=1000,
    subclass_condition=subclass_condition,
    property_condition=relation_condition,
    filler_condition=exclude_gos,
    axioms_to_filter=dataset.ontology,
)
filler_inferences = model.filler_inferences

In [None]:
# Display the number of filler inferences.
len(filler_inferences)

In [None]:
# Display the filler inferences themselves.
filler_inferences

In [None]:

# The existing `model` is reused; re-creating it is left commented out.
#model = ELEmbeddings(dataset, epochs = 5000, margin = -0.1, device = "cuda")
# Run the model's evaluate_ppi() method.
model.evaluate_ppi()


In [None]:
# First element of get_entities_index_dict() is kept; the second is ignored.
cls_dict, _ = model.get_entities_index_dict()
# Invert the mapping so that values of cls_dict become the keys.
qwe = {v: k for k,v in cls_dict.items()}
# Look up the entry stored under 34793 in the inverted mapping.
# NOTE(review): 34793 is a hard-coded key — presumably a class index; confirm.
qwe[34793]

In [None]:
# Keep the first element returned by get_embeddings(); the second is ignored.
cls_embs, _ = model.get_embeddings()

In [None]:
# Collect the keys of the class embeddings and check that GO_0008040 is
# among them.
clskeys = set(cls_embs)
assert 'http://purl.obolibrary.org/obo/GO_0008040' in clskeys

In [None]:
# Print the full set of class-embedding keys.
print(clskeys)

In [None]:
# Run the model's evaluate_ppi() method again (same call as an earlier cell).
model.evaluate_ppi()

## Evaluation

In [None]:
from mowl.evaluation.rank_based import ModelRankBasedEvaluator

# Wrap the already-built `model` in a ModelRankBasedEvaluator on the "cuda"
# device; the model is reused rather than re-created.
device = "cuda"
evaluator = ModelRankBasedEvaluator(model, device=device)


In [None]:
# Run the evaluator with show=True.
evaluator.evaluate(show=True)

In [None]:
# Run the evaluator with show=True again (same call as the previous cell).
evaluator.evaluate(show=True)

## TSNE

In [None]:
# Build a mapping "http://<id>" -> <label> from the tab-separated file
# data/yeast_ec.tab. The id is the part of column 3 before the first ';'
# and the label is the part of column 4 before the first '.'.
# NOTE(review): presumably column 3 holds protein ids and column 4 EC
# numbers — confirm against the file.
ec_numbers = {}
with open('data/yeast_ec.tab') as ec_file:
    next(ec_file)  # skip the first line of the file
    for line in ec_file:
        fields = line.strip().split('\t')
        # Rows with fewer than five columns or an empty column 3 are ignored.
        if len(fields) < 5 or not fields[3]:
            continue
        prot_id = fields[3].split(';')[0]
        ec_numbers[f"http://{prot_id}"] = fields[4].split(".")[0]


In [None]:
from mowl.projection.edge import Edge
from mowl.projection.factory import projector_factory

# Project the training and testing ontologies with a 'taxonomy_rels'
# projector restricted to "http://interacts_with", then gather the distinct
# entities appearing in either projection.
eval_projector = projector_factory(
    'taxonomy_rels',
    taxonomy=False,
    relations=["http://interacts_with"],
)

training_set = eval_projector.project(dataset.ontology)
testing_set = eval_projector.project(dataset.testing)

# Only the entities are kept; the relations returned alongside are ignored.
training_entities, _ = Edge.getEntitiesAndRelations(training_set)
testing_entities, _ = Edge.getEntitiesAndRelations(testing_set)

entities = list(set(training_entities) | set(testing_entities))
print(len(entities))

In [None]:
from mowl.visualization.base import TSNE

# Feed the model's class embeddings, the ec_numbers mapping and the entity
# list to mowl's TSNE helper, then generate its points.
cls_embeddings, _ = model.get_embeddings()

tsne = TSNE(
    cls_embeddings,
    ec_numbers,
    entities=entities,
)
tsne.generate_points(
    5000,
    workers=16,
    verbose=1,
)

In [None]:
# Call the TSNE helper's show() method.
tsne.show()

In [None]:
# Call the TSNE helper's show() method again (same call as the previous cell).
tsne.show()

In [None]:
# NOTE(review): the only TSNE imported in the visible cells is
# mowl.visualization.base.TSNE, which an earlier cell constructs with
# (embeddings, labels, entities=...). The n_components / n_iter /
# fit_transform usage here looks like scikit-learn's TSNE — confirm which
# class is intended and import it explicitly.
# NOTE(review): `embeddings` was last rebound to `w2v_model.wv` in an earlier
# cell — confirm that object is an acceptable input here.
X = TSNE(n_components=2, verbose=1, n_iter=2500).fit_transform(embeddings)