# AE on proteins in SA representation

In [None]:
import glob
import os
from collections import Counter
import string
from keras import Input
from keras.layers import Dense, Lambda, Conv1D
import keras.backend as K
from keras.models import Model
from keras.objectives import binary_crossentropy, mse

import random
import numpy as np
from keras.optimizers import RMSprop, Adam
from keras.utils.np_utils import to_categorical
from sklearn.model_selection import train_test_split
from keras.callbacks import ModelCheckpoint, TensorBoard
from keras.models import load_model

### Load dataset

Experiment on a subset of families to test different model capabilities.

In [None]:
# Families used in this experiment and the directory holding each one's files.
ds_path = "Dataset/families_reduced"
families = ["fam_1", "fam_2", "fam_3"]
family_paths = {name: os.path.join(ds_path, name) for name in families}

In [None]:
def load_family(f):
    """Load every protein configuration stored for family ``f``.

    Reads all ``*.out`` files found in ``family_paths[f]`` and collects each
    line, stripped of surrounding whitespace, as one configuration string.
    The list of files and a histogram of configuration lengths are printed
    for inspection.

    Args:
        f: Family name; must be a key of the module-level ``family_paths``.

    Returns:
        list of str: one entry per line read, with files visited in sorted
        path order.

    Raises:
        KeyError: if ``f`` is not a key of ``family_paths``.
    """
    # glob returns paths in arbitrary, filesystem-dependent order; sorting
    # makes the sample order (and any seeded split built on it) reproducible.
    proteins = sorted(glob.glob(os.path.join(family_paths[f], "*.out")))
    print("Proteins for family %s" % f)
    for p in proteins:
        print(p)
    proteins_conf = []
    for p in proteins:
        with open(p) as in_file:
            proteins_conf.extend(line.strip() for line in in_file)
    # histogram of sequence lengths, useful for choosing the padding length
    print(Counter(len(conf) for conf in proteins_conf))
    return proteins_conf

In [None]:
# Read the raw configuration strings of every family, keyed by family name.
families_conf = {name: load_family(name) for name in families}

### Preprocess dataset

In [None]:
# Map each ASCII letter, lower or upper case, to its 1-based position in the
# alphabet (a/A -> 1, ..., z/Z -> 26).
letters_di = {c: ord(c) % 32 for c in string.ascii_letters}

In [None]:
# Preprocessing switches read by the encoding and model-building cells.
padding = True
# Letter codes are 1-based (A -> 1 ... Z -> 26), so a one-hot vector needs 27
# slots whether or not padding is enabled: index 0 is the padding symbol, and
# a width of 26 would leave code 26 out of range.
num_classes = 27
categorical = True
normalize = False
max_length = 144
flatten = True

In [None]:
def process_conf(configurations, categorical, padding, max_length, normalize, flatten):
    """Encode protein configuration strings as a numeric array.

    Args:
        configurations: iterable of strings made of ASCII letters.
        categorical: if true, one-hot encode every position using the
            module-level ``num_classes``.
        padding: if true, right-pad every sequence with 0 up to
            ``max_length``; sequences already that long or longer are left
            unchanged.
        max_length: target sequence length used for padding and flattening.
        normalize: if true, cast to float32 and divide by the code of ``'Z'``.
        flatten: if true and ``categorical`` is set, merge the position and
            class axes into one axis of ``num_classes * max_length`` values.
            Integer-coded (non-categorical) output already has one flat row
            per sequence and is left as is.

    Returns:
        numpy.ndarray with one row per configuration.

    Raises:
        KeyError: if a configuration contains a character missing from
            ``letters_di``.
        ValueError: if flattening is requested but the encoded sequences are
            not all exactly ``max_length`` long.
    """
    proteins_processed = [[letters_di[ch] for ch in p] for p in configurations]
    if padding:
        # pad sequences shorter than max_length with the padding code 0
        proteins_processed = [p + [0] * (max_length - len(p)) for p in proteins_processed]
    # transform data to one-hot encodings
    if categorical:
        proteins_processed = [to_categorical(p, num_classes=num_classes) for p in proteins_processed]
    proteins_processed = np.array([np.array(x) for x in proteins_processed])
    if flatten and categorical:
        # reshape(-1, ...) would silently regroup values across proteins if a
        # sequence had a different length, so check the per-sample shape first
        expected = (max_length, num_classes)
        if proteins_processed.size and proteins_processed.shape[1:] != expected:
            raise ValueError(
                "cannot flatten: expected every encoded sequence to have shape %s, got array of shape %s"
                % (expected, proteins_processed.shape)
            )
        proteins_processed = proteins_processed.reshape(-1, num_classes * max_length)
    if normalize:
        proteins_processed = proteins_processed.astype('float32') / (letters_di['Z'] * 1.0)
    return proteins_processed

In [None]:
# Replace each family's raw strings with its encoded array, using the
# module-level preprocessing switches.
encoding_options = dict(
    categorical=categorical,
    padding=padding,
    max_length=max_length,
    normalize=normalize,
    flatten=flatten,
)
for f in families:
    families_conf[f] = process_conf(families_conf[f], **encoding_options)

In [None]:
# Notebook check: display the shape of the encoded array for fam_1.
families_conf['fam_1'].shape

### Build the autoencoder

In [None]:
# Settings passed to fit() for every autoencoder trained in this notebook.
epochs, batch_size = 30, 64
# Number of units in the encoded layer of the autoencoder.
intermediate_dim = 2

In [None]:
# Prepare output directories: the two roots first, then one checkpoint
# directory and one log directory per family.
output_dirs = ["models", "logs"]
for f in families:
    output_dirs += [os.path.join("models", f), os.path.join("logs", f)]
for directory in output_dirs:
    if not os.path.exists(directory):
        os.mkdir(directory)

In [None]:
def create_checkpoints(f):
    """Build the Keras callbacks used while training the model of family ``f``.

    Returns a two-element list: a ``ModelCheckpoint`` writing to
    ``models/<f>/model.<epoch>.hdf5`` that monitors ``val_loss`` with
    ``save_best_only`` enabled, followed by a ``TensorBoard`` callback
    logging to ``logs/<f>``.
    """
    checkpoint_file = os.path.join("models", f, "model.{epoch:02d}.hdf5")
    checkpoint_cb = ModelCheckpoint(
        filepath=checkpoint_file,
        monitor='val_loss',
        save_best_only=True,
    )
    tensorboard_cb = TensorBoard(log_dir=os.path.join("logs", f))
    return [checkpoint_cb, tensorboard_cb]

In [None]:
#autoencoder
def get_ae():
    """Build, compile and summarize a single-hidden-layer dense autoencoder.

    The input/output width follows the module-level preprocessing switches:
    ``max_length`` for integer-coded data, ``max_length * num_classes`` for
    flattened one-hot data, and ``(None, num_classes)`` in / ``num_classes``
    out for unflattened one-hot data. The encoded layer has
    ``intermediate_dim`` sigmoid units; the output layer is sigmoid as well.

    Returns:
        The compiled Keras ``Model`` (RMSprop, binary cross-entropy loss).
    """
    if not categorical:
        input_shape, output_units = (max_length,), max_length
    elif flatten:
        input_shape = (max_length * num_classes,)
        output_units = max_length * num_classes
    else:
        input_shape, output_units = (None, num_classes), num_classes

    model_input = Input(shape=input_shape)
    # A Conv1D encoder/decoder (kernel_size=3, padding='same') was tried here
    # in place of the Dense layers.
    encoded = Dense(intermediate_dim, activation='sigmoid')(model_input)
    decoded = Dense(output_units, activation='sigmoid')(encoded)

    ae = Model(inputs=model_input, outputs=[decoded])
    ae.compile(optimizer=RMSprop(lr=0.01), loss='binary_crossentropy')
    ae.summary()
    return ae

#### Notes

Using convolutional layers shows poor convergence.

An intermediate dimension of 2 seems to work for 3 families, but not for more.

### Train the autoencoder for specific classes

In [None]:
# Split every family's encoded array 75/25 with a fixed seed and keep both
# parts, keyed by family name.
train_data, test_data = {}, {}
for f in families:
    train, test = train_test_split(families_conf[f], test_size=0.25, random_state=42)
    train_data[f], test_data[f] = train, test
    for subset in (train, test):
        print(subset.shape)

In [None]:
# Fit a fresh autoencoder per family. Input and target are the same array,
# and the family's held-out split is passed as validation data.
for f in families:
    print("Training for family %s" %f)
    x_train, x_val = train_data[f], test_data[f]
    ae = get_ae()
    ae.fit(
        x_train, x_train,
        validation_data=(x_val, x_val),
        batch_size=batch_size,
        epochs=epochs,
        shuffle=True,
        callbacks=create_checkpoints(f),
        verbose=1,
    )

In [None]:
# Checkpoint file to load for each family when evaluating.
checkpoint_epochs = (("fam_1", 28), ("fam_2", 30), ("fam_3", 30))
models = {
    name: "models/%s/model.%02d.hdf5" % (name, epoch)
    for name, epoch in checkpoint_epochs
}

In [None]:
def evaluate_for_fam(f):
    """Score the saved autoencoder of family ``f`` as a one-class detector.

    The checkpoint listed in ``models[f]`` is loaded and the largest
    per-sample loss over ``train_data[f]`` becomes the decision threshold.
    Every test sample of every family is then evaluated: a loss above the
    threshold is read as "other family", anything else as "family ``f``".

    Returns:
        tuple: ``(tp, tn, fp, fn)`` counts with family ``f`` as the positive
        class.
    """
    ae = load_model(models[f])
    ae.summary()

    def sample_loss(sample):
        # evaluate() works on batches, so wrap the sample in a batch of one
        batch = np.array([sample])
        return ae.evaluate(batch, batch, verbose=0)

    max_l = max(sample_loss(t) for t in train_data[f])

    counts = {"tp": 0, "tn": 0, "fp": 0, "fn": 0}
    for ft in families:
        same_family = ft == f
        for t in test_data[ft]:
            if sample_loss(t) > max_l:
                # predict other family
                outcome = "fn" if same_family else "tn"
            else:
                # predict current family
                outcome = "tp" if same_family else "fp"
            counts[outcome] += 1
    return counts["tp"], counts["tn"], counts["fp"], counts["fn"]

In [None]:
# Confusion counts for the fam_1 model over the test samples of all families.
tp, tn, fp, fn = evaluate_for_fam("fam_1")
report = (
    ("True positives  %d", tp),
    ("True negatives  %d", tn),
    ("False positives %d", fp),
    ("False negatives %d", fn),
)
for line_format, count in report:
    print(line_format % count)

In [None]:
# Confusion counts for the fam_2 model over the test samples of all families.
tp, tn, fp, fn = evaluate_for_fam("fam_2")
report = (
    ("True positives  %d", tp),
    ("True negatives  %d", tn),
    ("False positives %d", fp),
    ("False negatives %d", fn),
)
for line_format, count in report:
    print(line_format % count)

In [None]:
# Confusion counts for the fam_3 model over the test samples of all families.
tp, tn, fp, fn = evaluate_for_fam("fam_3")
report = (
    ("True positives  %d", tp),
    ("True negatives  %d", tn),
    ("False positives %d", fp),
    ("False negatives %d", fn),
)
for line_format, count in report:
    print(line_format % count)