In [26]:
import click as ck
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.python.framework import function
import re
import math
import matplotlib.pyplot as plt
import logging
from tensorflow.keras.layers import (
    Input,
)
from tensorflow.keras import optimizers
from tensorflow.keras import constraints
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, CSVLogger
from tensorflow.keras import backend as K
from scipy.stats import rankdata

from elembeddings.elembedding import (
    ELModel, load_data, load_valid_data, Generator, MyModelCheckpoint)

# Parameters
# Training hyperparameters shared by the cells below.
batch_size = 32  # passed to Generator and ELModel; also used to derive train_steps
embedding_size = 50  # passed to ELModel; re-derived from the saved embeddings in the t-SNE cell
margin = -0.01  # passed to ELModel
reg_norm = 1  # passed to ELModel
learning_rate = 3e-4  # passed to the Adam optimizer
epochs = 128  # number of epochs requested from fit_generator

In [27]:
# Load the training data together with the class/relation -> index maps,
# then the validation data, which is resolved against those same maps.
train_data, classes, relations = load_data('data/data-train/yeast-classes-normalized.owl')
valid_data = load_valid_data('data/data-valid/4932.protein.links.v10.5.txt', classes, relations)

# Every class whose identifier does not carry the GO prefix is kept as a protein.
proteins = {
    name: index
    for name, index in classes.items()
    if not name.startswith('<http://purl.obolibrary.org/obo/GO_')
}

nb_classes = len(classes)
nb_relations = len(relations)

# Steps per epoch are sized by the largest entry of train_data (0 when empty).
nb_train_data = max([0] + [len(samples) for samples in train_data.values()])
train_steps = int(math.ceil(nb_train_data / float(batch_size)))
train_generator = Generator(train_data, batch_size, steps=train_steps)

# Inverse maps (index -> name) and the names listed in index order.
cls_dict = {index: name for name, index in classes.items()}
rel_dict = {index: name for name, index in relations.items()}

cls_list = [cls_dict[index] for index in range(nb_classes)]
rel_list = [rel_dict[index] for index in range(nb_relations)]

print(nb_classes, nb_relations)

51671 10


In [28]:
# Seven int32 index inputs, created in this fixed order; each width is the
# number of indices per sample for that input.
# (presumably one input per axiom normal form expected by ELModel -- confirm)
nf1, nf2, nf3, nf4, dis, top, nf3_neg = [
    Input(shape=(width,), dtype=np.int32)
    for width in (2, 3, 3, 3, 3, 1, 3)
]
model_inputs = [nf1, nf2, nf3, nf4, dis, top, nf3_neg]

# Wrap the ELModel layer in a Keras model over those inputs.
el_model = ELModel(nb_classes, nb_relations, embedding_size, batch_size, margin, reg_norm)
out = el_model(list(model_inputs))
model = tf.keras.Model(inputs=model_inputs, outputs=out)

# NOTE(review): `lr=` and `fit_generator` are deprecated in newer TensorFlow
# releases; confirm the target version before migrating to
# `learning_rate=` / `model.fit`.
optimizer = optimizers.Adam(lr=learning_rate)
model.compile(optimizer=optimizer, loss='mse')

# Destination files handed to the checkpoint callback.
out_classes_file = 'data/cls_embeddings.pkl'
out_relations_file = 'data/rel_embeddings.pkl'

checkpoint_options = dict(
    out_classes_file=out_classes_file,
    out_relations_file=out_relations_file,
    cls_list=cls_list,
    rel_list=rel_list,
    valid_data=valid_data,
    proteins=proteins,
    monitor='loss',
)
checkpointer = MyModelCheckpoint(**checkpoint_options)

training_callbacks = [checkpointer]
model.fit_generator(
    train_generator,
    steps_per_epoch=train_steps,
    epochs=epochs,
    workers=12,
    callbacks=training_callbacks,
)


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 1/128
 Saving embeddings 1 1772.2059329120982

Epoch 2/128
 Saving embeddings 2 1768.0118937000557

Epoch 3/128
 Saving embeddings 3 1729.2357600817693

Epoch 4/128
 Saving embeddings 4 1604.8418974168371

Epoch 5/128
 Saving embeddings 5 1201.4964922876788

Epoch 6/128
 Saving embeddings 6 770.5193737223565

Epoch 7/128
 Saving embeddings 7 592.4931471845382

Epoch 8/128
 Saving embeddings 8 516.6255807470731

Epoch 9/128
 Saving embeddings 9 440.44041535030664

Epoch 10/128
 Saving embeddings 10 400.3001765471102

Epoch 11/128
 Saving embeddings 11 377.556448615499

Epoch 12/128
 Saving embeddings 12 346.17055380040887

Epoch 13/128
 Saving embeddings 13 335.59726351979185

Epoch 14/128
 Saving embeddings 14 321.42296970823264

Epoch 15/128
 Saving embeddings 15 319.568272625906

Epoch 16/128
Epoch 17/128
 Saving embeddings 17 296.80201170786097

Epoch 18/128
Epoch 19/128
Epoch 20/128
Epoch 21/128
Epoch 22/128
Epoch 23/128
Epoch 24/128
 Saving embeddings 24 292.96443504924736



KeyboardInterrupt: 

### t-SNE projection of the protein embeddings

In [None]:
from matplotlib import pyplot as plt
from sklearn.manifold import TSNE

# t-SNE is stochastic; a fixed seed keeps the layout (and the plot built from
# it) reproducible across kernel restarts.
TSNE_SEED = 0

# Same file paths that were handed to MyModelCheckpoint in the training cell.
cls_df = pd.read_pickle('data/cls_embeddings.pkl')
rel_df = pd.read_pickle('data/rel_embeddings.pkl')

cls_embeddings = cls_df['embeddings'].values
rel_embeddings = rel_df['embeddings'].values

n = len(proteins)
# The last component of every class vector is left out of the projection
# (presumably the radius learned by ELModel -- TODO confirm).
embedding_size = cls_embeddings[0].shape[0] - 1
embeds = np.zeros((n, embedding_size))
# Row i of `embeds` belongs to the i-th entry of `proteins`; the plotting cell
# walks `proteins` in the same order to pair rows of X with protein ids.
for i, v in enumerate(proteins.values()):
    embeds[i, :] = cls_embeddings[v][:-1]

X = TSNE(n_components=2, verbose=1, random_state=TSNE_SEED).fit_transform(embeds)



[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 5504 samples in 0.009s...
[t-SNE] Computed neighbors for 5504 samples in 2.937s...
[t-SNE] Computed conditional probabilities for sample 1000 / 5504
[t-SNE] Computed conditional probabilities for sample 2000 / 5504
[t-SNE] Computed conditional probabilities for sample 3000 / 5504
[t-SNE] Computed conditional probabilities for sample 4000 / 5504
[t-SNE] Computed conditional probabilities for sample 5000 / 5504
[t-SNE] Computed conditional probabilities for sample 5504 / 5504
[t-SNE] Mean sigma: 0.304443
[t-SNE] KL divergence after 250 iterations with early exaggeration: 89.340729


### Plot: proteins coloured by top-level EC class

In [None]:
from matplotlib.pyplot import cm

# Load EC numbers
# Maps '<http://PROTEIN_ID>' to the raw EC annotation string of that row.
ec_numbers = {}
with open('data/yeast_ec.tab') as f:
    next(f, None)  # skip the header row; tolerate an empty file
    for line in f:
        it = line.strip().split('\t', -1)
        if len(it) < 5:
            continue
        # Rows need both a protein id and an EC annotation; an empty
        # annotation would otherwise produce a class labelled ''.
        if it[3] and it[4]:
            prot_id = it[3].split(';')[0]
            prot_id = '<http://{0}>'.format(prot_id)
            ec_numbers[prot_id] = it[4]

# Group the t-SNE coordinates by the first component of the EC number;
# '0' collects the proteins without an EC annotation.
# Kept under its own name so the class -> index map in `classes` is not
# overwritten by this cell.
ec_groups = {'0': [[], []]}
for i, k in enumerate(proteins):
    ec = ec_numbers[k].split('.')[0] if k in ec_numbers else '0'
    xs, ys = ec_groups.setdefault(ec, [[], []])
    xs.append(X[i, 0])
    ys.append(X[i, 1])

# Only annotated classes are drawn. Sorting fixes the legend order, and the
# colour ramp is sized to the classes that are actually plotted.
plotted = sorted(ec for ec in ec_groups if ec != '0')
colors = cm.rainbow(np.linspace(0, 1, len(plotted)))
fig, ax = plt.subplots()

for ec, color in zip(plotted, colors):
    xs, ys = ec_groups[ec]
    ax.scatter(xs, ys, color=color, label=ec)

ax.set_title('Protein embeddings (t-SNE) by top-level EC class')
ax.set_xlabel('t-SNE 1')
ax.set_ylabel('t-SNE 2')
if plotted:
    # legend() without any labelled artists only emits a warning.
    ax.legend(title='EC class')
ax.grid(True)

plt.show()
