# DL2Vec

In [None]:
import sys
# Add the directory three levels up to the import path.
# NOTE(review): presumably the repository root, so that the local `mowl`
# package is importable from this notebook — confirm.
sys.path.append("../../../")

import torch as th


import pickle as pkl
import numpy as np
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import logging

# Emit log records of level INFO and above.
logging.basicConfig(level=logging.INFO)

In [None]:
import mowl
# Initialise the JVM before the remaining mowl imports below.
# NOTE(review): "10g" is presumably the JVM memory limit, and the call
# presumably has to precede the other mowl imports — confirm against mOWL docs.
mowl.init_jvm("10g")
from mowl.datasets.ppi_yeast import PPIYeastSlimDataset
from mowl.model import EmbeddingModel
from mowl.projection.dl2vec.model import DL2VecProjector
from mowl.projection.factory import projector_factory
from mowl.walking.factory import walking_factory
import mowl.evaluation.base as ev
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence


## DL2Vec

In [None]:
# Step 1: project the dataset's ontology into a list of graph edges with the
# "dl2vec" projector.
dataset = PPIYeastSlimDataset()
projector = projector_factory("dl2vec", bidirectional_taxonomy=True)
edges = projector.project(dataset.ontology)

# Step 2: run the "node2vec" walker over the edges; the walks go to
# "data/walks", which is read back below.
walker = walking_factory(
    "node2vec",
    alpha=0.1,
    walk_length=20,
    num_walks=10,
    outfile="data/walks",
)
walker.walk(edges)

# Step 3: train a skip-gram (sg=1) Word2Vec model on the walk file.
corpus = LineSentence("data/walks")
w2v_model = Word2Vec(
    corpus,
    sg=1,
    min_count=1,
    vector_size=10,
    window=5,
    epochs=2,
    workers=16,
)

# Persist the trained model and keep its keyed vectors for the cells below.
w2v_model.save("data/dl2vec")
vectors = w2v_model.wv

In [None]:

#model.train()

## Evaluation

In [None]:
from mowl.projection.edge import Edge

# Rebuild the dataset and read the embeddings back from the model file
# written by the training cell ("data/dl2vec").
dataset = PPIYeastSlimDataset()
vectors = Word2Vec.load("data/dl2vec").wv

# Projector restricted to the "http://interacts_with" relation, with the
# taxonomy option switched off.
eval_projector = projector_factory(
    "taxonomy_rels",
    taxonomy=False,
    relations=["http://interacts_with"],
)

# Project the training ontology and the testing ontology with the same rules.
training_set = eval_projector.project(dataset.ontology)
testing_set = eval_projector.project(dataset.testing)

# Only the entity lists are needed; the relation lists are discarded.
training_entities, _ = Edge.getEntitiesAndRelations(training_set)
testing_entities, _ = Edge.getEntitiesAndRelations(testing_set)

# Every entity that occurs in either edge set, without duplicates.
entities = list(set(training_entities) | set(testing_entities))


In [None]:
from mowl.evaluation.rank_based import EmbeddingsRankBasedEvaluator
from mowl.evaluation.base import CosineSimilarity

# Use the GPU when one is available and fall back to the CPU otherwise, so the
# cell also runs on machines without CUDA. `th` is torch, imported in the
# first cell of the notebook.
device = "cuda" if th.cuda.is_available() else "cpu"

# Rank-based evaluation of the Word2Vec embeddings on the testing edges,
# scoring candidate pairs with cosine similarity.
evaluator = EmbeddingsRankBasedEvaluator(
    vectors,
    testing_set,
    CosineSimilarity,
    training_set=training_set,
    head_entities=entities,
    device=device,
)

evaluator.evaluate(show=True)

In [None]:
from mowl.embeddings.graph_based.dl2vec.model import DL2Vec

dataset = PPIYeastSlimDataset()

# Use the GPU when one is available and fall back to the CPU otherwise, so the
# cell also runs on machines without CUDA. `th` is torch, imported in the
# first cell of the notebook.
device = "cuda" if th.cuda.is_available() else "cpu"

# Same hyper-parameters as the manual projector/walker/Word2Vec pipeline
# earlier in the notebook, passed to the DL2Vec model class in one call.
model = DL2Vec(
    dataset,
    "data/dl2vectoolkit",
    bidirectional_taxonomy=True,
    alpha=0.1,
    walk_length=20,
    num_walks=10,
    workers=16,
    wv_epochs=2,
    vector_size=10,
    window=5,
    device=device,
)
model.train()

In [None]:
from mowl.evaluation.rank_based import ModelRankBasedEvaluator
from mowl.evaluation.base import  CosineSimilarity

In [None]:
# Evaluate on the GPU when one is available and fall back to the CPU
# otherwise, so the cell also runs on machines without CUDA. `th` is torch,
# imported in the first cell of the notebook.
device = "cuda" if th.cuda.is_available() else "cpu"
evaluator = ModelRankBasedEvaluator(model, device=device)

In [None]:
# Run the rank-based evaluation of the trained model.
evaluator.evaluate()

In [None]:
# Display the results stored on the evaluator's `metrics` attribute.
evaluator.metrics

In [None]:
Hits@1:   0.00 Filtered:   0.01
Hits@10:  0.02 Filtered:   0.10
Hits@100: 0.23 Filtered:   0.37
MR:       909.21 Filtered: 853.36
AUC:      0.85 Filtered:   0.86
Evaluation finished. Access the results using the "metrics" attribute.

In [None]:
# Display the evaluator's `metrics` attribute again.
evaluator.metrics

## TSNE

In [None]:
# Build a mapping "http://<protein id>" -> leading component of the value in
# the fifth column of the tab-separated file.
# NOTE(review): presumably the fifth column is an EC number and the leading
# component its top-level enzyme class — confirm against the file's header.
ec_numbers = {}
with open('data/yeast_ec.tab') as f:
    next(f)  # skip the first line (treated as a header)
    for line in f:
        fields = line.strip().split('\t')
        # Ignore rows with fewer than five columns or an empty fourth column.
        if len(fields) < 5 or not fields[3]:
            continue
        # The fourth column may hold several ';'-separated ids; use the first.
        prot_id = fields[3].split(';')[0]
        ec_numbers[f"http://{prot_id}"] = fields[4].split(".")[0]

In [None]:
# Imported under an alias because the notebook also imports sklearn's TSNE.
from mowl.visualization.base import TSNE as MTSNE

# Label the embeddings with the values from `ec_numbers`, restricted to the
# entities collected for the evaluation.
tsne = MTSNE(vectors, ec_numbers, entities=entities)

# NOTE(review): 5000 is presumably the number of t-SNE iterations — confirm.
tsne.generate_points(5000, workers=16, verbose=1)


In [None]:
# Display the t-SNE plot.
tsne.show()

In [None]:

# Write the t-SNE figure to disk.
tsne.savefig('data/mowl_tsne.jpg')