# Translational Embeddings

### Knowledge Graph

Let $KG = (V, E, L; \vdash)$ be a knowledge graph with a set of
    vertices $V$, a set of edges $E \subseteq V \times V$, a label
    function $L: V \cup E \mapsto Lab$ that assigns labels from a set
    of labels $Lab$ to vertices and edges, and an inference relation
    $\vdash$.
    
**A knowledge graph embedding is a function** $f_\eta : L(V) \cup L(E) \mapsto \mathbb{R}^n$. That is, the function takes elements from the set $L(V) \cup L(E) \subseteq Lab$ and maps them to vectors in $\mathbb{R}^n$, where $n$ is the _embedding size_.

### TransE idea
TransE aims to model multirelational data by representing relationships as **translations** in the following way:

Consider an edge in the graph of the form $(h, \ell, t)$, where $h$ is the head of the edge, $\ell$ is the type of relation and $t$ is the tail of the edge. Let's denote the corresponding embeddings as $\boldsymbol{h}$, $\boldsymbol{\ell}$ and $\boldsymbol{t}$. TransE learns the embeddings such that:
$$\boldsymbol{h} + \boldsymbol{\ell} \approx \boldsymbol{t}$$

### Objective function
TransE minimizes the following objective function: $$
\mathcal{L}=\sum_{(h, \ell, t) \in S} \sum_{\left(h^{\prime}, \ell, t^{\prime}\right) \in S_{(h, \ell, t)}^{\prime}}\left[\gamma+d(\boldsymbol{h}+\boldsymbol{\ell}, \boldsymbol{t})-d\left(\boldsymbol{h}^{\prime}+\boldsymbol{\ell}, \boldsymbol{t}^{\prime}\right)\right]_{+}
$$

Where $d(\boldsymbol{h}+\boldsymbol{\ell}, \boldsymbol{t})$ is the _dissimilarity_ score of a positive edge. Furthermore, $d\left(\boldsymbol{h}^{\prime}+\boldsymbol{\ell}, \boldsymbol{t}^{\prime}\right)$ is the _dissimilarity_ score for a negative triple obtained by corrupting either the head or tail (but not both) of a positive triple. In this way, TransE favors lower scores for positive edges and higher scores for negative edges.

Regarding the parameter $\gamma$, it is used to enforce that the score of a positive edge is different (lower) than the score of a negative edge by at least $\gamma$.

In [None]:
import sys
sys.path.append("../../../")

import mowl
mowl.init_jvm("4g")
import torch as th
#import logging
import numpy as np
import pickle as pkl
from mowl.visualization.base import TSNE
import matplotlib.pyplot as plt

from mowl.datasets.ppi_yeast import PPIYeastSlimDataset

from mowl.embeddings.translational.model import TranslationalOnt
from mowl.projection.factory import projector_factory

In [None]:
# Instantiate the dataset and project its ontology into edges with the
# "dl2vec" projector (bidirectional_taxonomy enabled).
dataset = PPIYeastSlimDataset()
projector = projector_factory("dl2vec", bidirectional_taxonomy=True)
edges = projector.project(dataset.ontology)

# Settings for the TransE variant of the translational model.
# NOTE(review): model_filepath is presumably where the model is
# saved to / loaded from -- confirm against TranslationalOnt.
transE_options = {
    "trans_method": "transE",
    "embedding_dim": 100,
    "epochs": 32,
    "batch_size": 256,
    "model_filepath": "/tmp/trans_model.th",
}
modelE = TranslationalOnt(edges, **transE_options)
    


In [None]:
# Training is disabled in this cell; load_best_model() is called instead.
# NOTE(review): presumably this restores a model written by an earlier
# training run to the configured model_filepath -- confirm. Uncomment the
# next line to retrain from scratch.
#modelE.train()
modelE.load_best_model()

## Inference

In [None]:
# Scoring method exposed by the model; it is handed to GCI2Score later on.
method = modelE.score_method
# get_embeddings() returns a pair, unpacked here as class embeddings and
# relation embeddings; both are used as mappings (their keys are read).
cls_embs, rel_embs = modelE.get_embeddings()
# Show which relation names have an embedding.
print(rel_embs.keys())

In [None]:
from mowl.inference.el import GCI2Score

# Scorer built from the model's scoring method plus the names of all embedded
# classes and relations.
scorer = GCI2Score(method, list(cls_embs.keys()), list(rel_embs.keys()))

# Alternative query patterns kept for reference:
#   r"c?.*?4932\.Q.*? SubClassOf p?.*?            some  c?.*?4932.*?"
#   r"c?.*?4932\.Q.*? SubClassOf p?http://interacts_with? some c?.*?"
#
# The pattern is written as a raw string: it contains the regex-like escape
# `\.`, which is not a valid Python string escape. In a normal string literal
# it only works by accident and triggers a DeprecationWarning/SyntaxWarning on
# recent Python versions. The resulting string value is unchanged.
preds = scorer.score(r"c?.*?4932\.Q.*? SubClassOf p?http://interacts_with? some c?.*?4932.*?")
print(len(preds))

In [None]:
from mowl.corpus.base import extract_axiom_corpus
from mowl.evaluation.predictions import evaluate_predictions

# Axioms extracted from the testing part of the dataset serve as the
# reference corpus for the predictions computed by the scorer.
corpus = extract_axiom_corpus(dataset.testing)

# NOTE(review): the integer list looks like the cut-offs (k values) at which
# the metrics are computed, and pos_label=0 the label of a positive
# prediction -- confirm against evaluate_predictions.
metrics = evaluate_predictions(
    corpus,
    preds,
    [1, 10, 100, 1000, 10000, 160000, 162918, 3336802],
    pos_label=0,
)
print(metrics)

In [None]:
# Pull the entity embeddings out of the trained model as a NumPy array
# (moved to CPU and detached from the autograd graph first).
entity_repr_E = modelE.model.entity_representations[0]
transE_embeddings = entity_repr_E(indices=None).cpu().detach().numpy()

# Keep only the entities whose name starts with "4932" (presumably the yeast
# protein identifiers), keyed by entity name.
protE_embeddings = {
    name: transE_embeddings[position]
    for name, position in modelE.entities_idx.items()
    if name.startswith("4932")
}

# Persist the selected embeddings so later cells can reload them.
with open("data/protE_emb", "wb") as out_file:
    pkl.dump(protE_embeddings, out_file)

In [None]:
# Reload the protein embeddings pickled by the TransE extraction cell.
with open("data/protE_emb", "rb") as file:
    protE_embeddings = pkl.load(file)

# Map protein id -> EC annotation, read from a tab-separated file.
ec_numbers = {}
with open('data/yeast_ec.tab') as f:
    # Skip the first line (presumably a header). The default argument keeps an
    # empty file from raising StopIteration.
    next(f, None)
    for line in f:
        it = line.strip().split('\t')
        # Ignore rows with fewer than five columns or an empty fourth column.
        if len(it) < 5 or not it[3]:
            continue
        # The fourth column may hold several ';'-separated ids; only the first
        # one is used as the key. The fifth column is stored as the value.
        prot_id = it[3].split(';')[0]
        ec_numbers[prot_id] = it[4]

In [None]:
# Restrict the protein embeddings to proteins that have an EC annotation.
ec_dict = {
    protein: protE_embeddings[protein]
    for protein in ec_numbers
    if protein in protE_embeddings
}

# Stack the selected vectors into one float32 matrix (one row per protein)
# and remember which protein each row belongs to.
size = modelE.embedding_dim
embeds = np.zeros((len(ec_dict), size), dtype=np.float32)
nodemap = {}
for row, (protein, vector) in enumerate(ec_dict.items()):
    embeds[row, :] = vector
    nodemap[row] = protein

# Reduce the embeddings to two dimensions for plotting.
X = TSNE(n_components=2, verbose=1, n_iter=5000, n_jobs=8).fit_transform(embeds)

In [None]:
# Group the 2-D coordinates by the part of the EC annotation that precedes
# the first '.'; each group holds a list of x values and a list of y values.
classes = {'0': [[], []]}
for row, protein in nodemap.items():
    if protein not in ec_numbers:
        continue
    ec = ec_numbers[protein].split('.')[0]
    xs, ys = classes.setdefault(ec, [[], []])
    xs.append(X[row, 0])
    ys.append(X[row, 1])

colors = iter(plt.cm.rainbow(np.linspace(0, 1, len(classes))))
fig, ax = plt.subplots(figsize=(20, 20))

# One scatter series per group; the placeholder group '0' is not drawn.
for ec, (xs, ys) in classes.items():
    if ec != '0':
        ax.scatter(xs, ys, color=next(colors), label=ec)

ax.legend()
ax.grid(True)

plt.savefig('data/fig_transE.jpg')
plt.show()

## TransH

In [None]:
# Build the TransH model the same way the TransE model is built earlier in
# this notebook: TranslationalOnt is constructed from projected edges, so the
# ontology is first projected with the "dl2vec" projector
# (bidirectional_taxonomy=True). The previous call passed the dataset together
# with parsing_method / bidirectional_taxonomy keyword arguments, which does
# not match the constructor usage of the TransE cell.
dataset = PPIYeastSlimDataset()
projector = projector_factory("dl2vec", bidirectional_taxonomy=True)
edges = projector.project(dataset.ontology)

# NOTE(review): no model_filepath is passed, so the class default applies --
# confirm it does not collide with the file used by the TransE model.
modelH = TranslationalOnt(
    edges,
    trans_method="transH",
    embedding_dim=100,
    epochs=32,
    batch_size=256,
)

In [None]:
# Train the TransH model with the settings given at construction time.
modelH.train()

In [None]:
# Pull the entity embeddings out of the TransH model as a NumPy array
# (moved to CPU and detached from the autograd graph first).
entity_repr_H = modelH.model.entity_representations[0]
transH_embeddings = entity_repr_H(indices=None).cpu().detach().numpy()

# Keep only the entities whose name starts with "4932" (presumably the yeast
# protein identifiers), keyed by entity name.
protH_embeddings = {
    name: transH_embeddings[position]
    for name, position in modelH.entities_idx.items()
    if name.startswith("4932")
}

# Persist the selected embeddings on disk.
with open("data/protH_emb", "wb") as out_file:
    pkl.dump(protH_embeddings, out_file)

In [None]:
# Restrict the TransH protein embeddings to proteins with an EC annotation.
ec_dict_H = {
    protein: protH_embeddings[protein]
    for protein in ec_numbers
    if protein in protH_embeddings
}

# Stack the selected vectors into one float32 matrix (one row per protein)
# and remember which protein each row belongs to.
size = modelH.embedding_dim
embedsH = np.zeros((len(ec_dict_H), size), dtype=np.float32)
nodemapH = {}
for row, (protein, vector) in enumerate(ec_dict_H.items()):
    embedsH[row, :] = vector
    nodemapH[row] = protein

# Reduce the embeddings to two dimensions for plotting.
XH = TSNE(n_components=2, verbose=1, n_iter=5000, n_jobs=8).fit_transform(embedsH)

In [None]:
# Group the 2-D coordinates by the part of the EC annotation that precedes
# the first '.'; each group holds a list of x values and a list of y values.
classes = {'0': [[], []]}
for row, protein in nodemapH.items():
    if protein not in ec_numbers:
        continue
    ec = ec_numbers[protein].split('.')[0]
    xs, ys = classes.setdefault(ec, [[], []])
    xs.append(XH[row, 0])
    ys.append(XH[row, 1])

colors = iter(plt.cm.rainbow(np.linspace(0, 1, len(classes))))
fig, ax = plt.subplots(figsize=(20, 20))

# One scatter series per group; the placeholder group '0' is not drawn.
for ec, (xs, ys) in classes.items():
    if ec != '0':
        ax.scatter(xs, ys, color=next(colors), label=ec)

ax.legend()
ax.grid(True)

plt.savefig('data/fig_transH.jpg')
plt.show()

## TransR

In [None]:
# Build the TransR model the same way the TransE model is built earlier in
# this notebook: TranslationalOnt is constructed from projected edges, so the
# ontology is first projected with the "dl2vec" projector
# (bidirectional_taxonomy=True). The previous call passed the dataset together
# with parsing_method / bidirectional_taxonomy keyword arguments, which does
# not match the constructor usage of the TransE cell.
dataset = PPIYeastSlimDataset()
projector = projector_factory("dl2vec", bidirectional_taxonomy=True)
edges = projector.project(dataset.ontology)

# NOTE(review): no model_filepath is passed, so the class default applies --
# confirm it does not collide with the file used by the TransE model.
modelR = TranslationalOnt(
    edges,
    trans_method="transR",
    embedding_dim=100,
    epochs=32,
    batch_size=256,
)

In [None]:
# Train the TransR model with the settings given at construction time.
modelR.train()

In [None]:
# Pull the entity embeddings out of the TransR model as a NumPy array
# (moved to CPU and detached from the autograd graph first).
entity_repr_R = modelR.model.entity_representations[0]
transR_embeddings = entity_repr_R(indices=None).cpu().detach().numpy()

# Keep only the entities whose name starts with "4932" (presumably the yeast
# protein identifiers), keyed by entity name.
protR_embeddings = {
    name: transR_embeddings[position]
    for name, position in modelR.entities_idx.items()
    if name.startswith("4932")
}

# Persist the selected embeddings on disk.
with open("data/protR_emb", "wb") as out_file:
    pkl.dump(protR_embeddings, out_file)

In [None]:
# Restrict the TransR protein embeddings to proteins with an EC annotation.
ec_dict_R = {
    protein: protR_embeddings[protein]
    for protein in ec_numbers
    if protein in protR_embeddings
}

# Stack the selected vectors into one float32 matrix (one row per protein)
# and remember which protein each row belongs to.
size = modelR.embedding_dim
embedsR = np.zeros((len(ec_dict_R), size), dtype=np.float32)
nodemapR = {}
for row, (protein, vector) in enumerate(ec_dict_R.items()):
    embedsR[row, :] = vector
    nodemapR[row] = protein

# Reduce the embeddings to two dimensions for plotting.
XR = TSNE(n_components=2, verbose=1, n_iter=5000, n_jobs=8).fit_transform(embedsR)

In [None]:
# Group the 2-D coordinates by the part of the EC annotation that precedes
# the first '.'; each group holds a list of x values and a list of y values.
classes = {'0': [[], []]}
for row, protein in nodemapR.items():
    if protein not in ec_numbers:
        continue
    ec = ec_numbers[protein].split('.')[0]
    xs, ys = classes.setdefault(ec, [[], []])
    xs.append(XR[row, 0])
    ys.append(XR[row, 1])

colors = iter(plt.cm.rainbow(np.linspace(0, 1, len(classes))))
fig, ax = plt.subplots(figsize=(20, 20))

# One scatter series per group; the placeholder group '0' is not drawn.
for ec, (xs, ys) in classes.items():
    if ec != '0':
        ax.scatter(xs, ys, color=next(colors), label=ec)

ax.legend()
ax.grid(True)

plt.savefig('data/fig_transR.jpg')
plt.show()

In [None]:
# Run the model's own evaluation and keep the result object for inspection.
# NOTE(review): the other models in this notebook are not evaluated this way;
# confirm evaluate() is available on TranslationalOnt.
resultsR = modelR.evaluate()

In [None]:
# Bare expression: the notebook shows the hits_at_k attribute of the
# evaluation results as the cell output.
resultsR.hits_at_k