# VAE on proteins in SA representation

In [1]:
import glob
import os
from collections import Counter
import string
from keras import Input
from keras.layers import Dense, Lambda
import keras.backend as K
from keras.models import Model
from keras.objectives import binary_crossentropy, mse
import random
import numpy as np
from keras.optimizers import RMSprop, Adam
from keras.utils.np_utils import to_categorical

Using TensorFlow backend.


### Load dataset

In [2]:
family_path = "Dataset/families_reduced/fam_1"

In [3]:
proteins = glob.glob(os.path.join(family_path, "*.out"))
for p in proteins:
    print(p)

Dataset/families_reduced/fam_1/1DLW.lf_str.out
Dataset/families_reduced/fam_1/1ECA.lf_str.out
Dataset/families_reduced/fam_1/1ASH.lf_str.out


In [4]:
proteins_conf = []
for p in proteins:
    with open(p) as in_file:
        for line in in_file:
            proteins_conf.append(line.strip())
len(proteins_conf)

30000

In [5]:
l = [len(p) for p in proteins_conf]
print(Counter(l))

Counter({113: 10000, 133: 10000, 144: 10000})


### Preprocess dataset

In [6]:
max_length = 144
num_classes = 27 # including padding 0

In [7]:
letters_di=dict(zip(string.ascii_letters,[ord(c)%32 for c in string.ascii_letters]))

In [8]:
# trasnform letters to integers
proteins_processed = [[letters_di[l] for l in p] for p in proteins_conf]

In [9]:
# pad sequences if less than max length
proteins_processed = [p if len(p) == max_length else p + [0] * (max_length - len(p)) for p in proteins_processed]

In [10]:
# tranforms data to one hot encodings
proteins_processed = [to_categorical(p, num_classes=num_classes) for p in proteins_processed]

In [11]:
random.shuffle(proteins_processed)

In [12]:
train_set = proteins_processed[:(len(proteins_processed)*75//100)]
test_set = proteins_processed[(len(proteins_processed)*75//100):]

In [13]:
train_np = np.array([np.array(x) for x in train_set])
test_np = np.array([np.array(x) for x in test_set])
train_np = train_np.astype('float32') / letters_di['Z'] * 1.0
test_np = test_np.astype('float32') / letters_di['Z'] * 1.0

In [14]:
train_np.shape

(22500, 144, 27)

In [15]:
test_np.shape

(7500, 144, 27)

### Build VAE

In [16]:
batch_size = 64
original_dim = 144 * 27
intermediate_dim = 300
latent_dim = 2
epochs = 100

In [17]:
train_np = np.reshape(train_np, [-1, original_dim])
test_np = np.reshape(test_np,[-1, original_dim])

In [18]:
#encoder
inputs = Input(shape=(original_dim,), name='encoder_input')
x = Dense(intermediate_dim, activation='relu')(inputs)
z_mean = Dense(latent_dim, name='z_mean')(x)
z_log_var = Dense(latent_dim, name='z_log_var')(x)

Instructions for updating:
Colocations handled automatically by placer.


In [19]:
def sampling(args):
    """Reparameterization trick by sampling from an isotropic unit Gaussian.
    # Arguments
        args (tensor): mean and log of variance of Q(z|X)
    # Returns
        z (tensor): sampled latent vector
    """

    z_mean, z_log_var = args
    batch = K.shape(z_mean)[0]
    dim = K.int_shape(z_mean)[1]
    # by default, random_normal has mean = 0 and std = 1.0
    epsilon = K.random_normal(shape=(batch, dim))
    return z_mean + K.exp(0.5 * z_log_var) * epsilon

z = Lambda(sampling, output_shape=(latent_dim,))([z_mean, z_log_var])

In [20]:
encoder = Model(inputs, [z_mean, z_log_var, z], name='encoder')
encoder.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
encoder_input (InputLayer)      (None, 3888)         0                                            
__________________________________________________________________________________________________
dense_1 (Dense)                 (None, 300)          1166700     encoder_input[0][0]              
__________________________________________________________________________________________________
z_mean (Dense)                  (None, 2)            602         dense_1[0][0]                    
__________________________________________________________________________________________________
z_log_var (Dense)               (None, 2)            602         dense_1[0][0]                    
__________________________________________________________________________________________________
lambda_1 (

In [21]:
# decoder
latent_inputs = Input(shape=(latent_dim,), name='z_sampling')
x = Dense(intermediate_dim, activation='relu')(latent_inputs)
outputs = Dense(original_dim, activation='sigmoid')(x)

In [22]:
decoder = Model(latent_inputs, outputs, name='decoder')
decoder.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
z_sampling (InputLayer)      (None, 2)                 0         
_________________________________________________________________
dense_2 (Dense)              (None, 300)               900       
_________________________________________________________________
dense_3 (Dense)              (None, 3888)              1170288   
Total params: 1,171,188
Trainable params: 1,171,188
Non-trainable params: 0
_________________________________________________________________


In [23]:
# end-to-end autoencoder
outputs = decoder(encoder(inputs)[2])
vae = Model(inputs, outputs, name='vae_mlp')

In [24]:
reconstruction_loss = binary_crossentropy(inputs,outputs)
reconstruction_loss *= original_dim
kl_loss = 1 + z_log_var - K.square(z_mean) - K.exp(z_log_var)
kl_loss = K.sum(kl_loss, axis=-1)
kl_loss *= -0.5
vae_loss = K.mean(reconstruction_loss + kl_loss)
vae.add_loss(vae_loss)
opt = Adam(lr=0.001)
vae.compile(optimizer=opt)
vae.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
encoder_input (InputLayer)   (None, 3888)              0         
_________________________________________________________________
encoder (Model)              [(None, 2), (None, 2), (N 1167904   
_________________________________________________________________
decoder (Model)              (None, 3888)              1171188   
Total params: 2,339,092
Trainable params: 2,339,092
Non-trainable params: 0
_________________________________________________________________


In [25]:
vae.fit(x=train_np,
        shuffle=True,
        epochs=epochs,
        batch_size=batch_size,
        validation_data=(test_np, None),
        verbose=1)

Instructions for updating:
Use tf.cast instead.
Train on 22500 samples, validate on 7500 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Ep

Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<keras.callbacks.History at 0x12d7a3c18>

In [27]:
test_res = vae.predict(test_np)

In [32]:
np.argmax(np.reshape(test_np[0],[-1,27]),axis=1)

array([22, 20, 21, 21, 21, 21, 21, 22, 23, 20, 18, 23, 23, 18,  8,  5, 11,
       22, 15, 13, 22, 22, 21, 21, 22, 21, 21, 21, 21, 21, 21, 21, 21, 22,
       23, 14, 13, 22, 20, 22, 20, 22, 22, 13, 22, 23, 17, 23, 17,  8, 12,
       21, 22, 19, 22, 22, 14, 10, 20, 21, 21, 21, 21, 21, 21, 21, 21, 21,
       21, 21, 21, 21, 21, 21, 21, 21, 21, 20, 22, 23, 13, 22, 21, 21, 21,
       21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 22, 23, 16, 17,  5,  2,
        9, 11, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 19,
       24, 20, 14, 11,  5,  5, 11, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
       21, 21, 21, 22, 21, 21, 21, 23])

In [31]:
np.argmax(np.reshape(test_res[0],[-1,27]),axis=1)

array([22, 20, 21, 21, 21, 21, 21, 21, 21, 22, 19, 23, 23, 18,  8,  5, 11,
       18, 10, 19, 22, 22, 23, 22, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
       23, 14, 13, 20, 20, 22, 22, 21, 22, 13, 22, 23, 17, 23, 17,  8, 11,
       21, 23, 19, 22, 22, 14, 10, 20, 21, 21, 21, 23, 21, 21, 21, 21, 21,
       21, 21, 21, 21, 21, 21, 21, 21, 21, 20, 20, 23, 12, 21, 21, 21, 21,
       21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 22, 22, 23, 16, 13,  5,  2,
        9, 11, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 22,
       21, 23, 14, 11, 13,  4, 11, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
       21, 21, 21, 21, 21, 21, 21, 23])

In [33]:
np.argmax(np.reshape(test_np[1],[-1,27]),axis=1)

array([11, 22, 21, 21, 21, 21, 21, 21, 21, 21, 21, 22, 22, 23, 19, 20, 23,
       13, 22, 21, 21, 21, 21, 21, 21, 21, 21, 22, 14, 10, 23, 19, 20, 22,
       21, 22, 13, 22, 23, 14, 13, 24, 14,  7, 20, 21, 23, 19, 23, 17, 11,
       21, 21, 21, 21, 21, 21, 21, 21, 21, 23, 21, 21, 21, 21, 21, 23, 20,
       21, 22, 17, 24, 22, 20, 19, 21, 21, 21, 21, 21, 21, 21, 21, 21, 23,
       19, 22, 21,  5,  4,  9, 11, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
       21, 22, 21, 21, 21, 21, 23, 18,  8, 13, 19, 23, 22, 20, 21, 21, 21,
       21, 23, 24, 24, 21, 21, 21, 21, 22, 21, 21, 22, 21, 23,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0])

In [34]:
np.argmax(np.reshape(test_res[1],[-1,27]),axis=1)

array([11, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 22, 22, 23, 19, 20, 23,
       13, 20, 21, 21, 21, 21, 21, 21, 21, 21, 20, 14, 10, 23, 19, 22, 22,
       21, 22, 19, 20, 23, 14,  2, 14, 14,  7, 22, 21, 23, 19, 23, 17, 11,
       21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 22, 21, 21, 21, 21, 21, 21,
       21, 23, 17, 24, 22, 20, 19, 21, 21, 21, 21, 21, 21, 21, 22, 21, 23,
       19, 22, 20, 15,  5,  9, 11, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
       21, 22, 21, 21, 21, 21, 23, 14, 11, 13, 22, 21, 23, 19, 21, 21, 21,
       21, 21, 21, 21, 21, 21, 21, 21, 22, 21, 21, 22, 22, 23,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0])

In [53]:
test_encoded = encoder.predict(test_np)

In [47]:
len(test_encoded[2])

7500

In [48]:
test_encoded[2]

array([[-0.3593759 ,  0.74483067],
       [ 0.07032961, -0.8594195 ],
       [ 0.30905998, -1.6250093 ],
       ...,
       [ 0.5164246 ,  1.1261108 ],
       [ 1.3644242 ,  0.63759685],
       [-0.8656409 , -0.5959314 ]], dtype=float32)

In [52]:
test_encoded[1]

array([[-2.122706 , -1.2144934],
       [-1.9336755, -1.9306399],
       [-1.0487787, -1.4388236],
       ...,
       [-2.0057704, -2.0212417],
       [-1.7248856, -2.2287254],
       [-1.6537818, -0.0442635]], dtype=float32)

In [54]:
test_encoded[1]

array([[-2.122706 , -1.2144934],
       [-1.9336755, -1.9306399],
       [-1.0487787, -1.4388236],
       ...,
       [-2.0057704, -2.0212417],
       [-1.7248856, -2.2287254],
       [-1.6537818, -0.0442635]], dtype=float32)