# Autoencoder (AE) on proteins in SA representation - protein level

In [None]:
import glob
import os
from collections import Counter
import string
from keras import Input
from keras.layers import Dense, Lambda, Conv1D
import keras.backend as K
from keras.models import Model
from keras.objectives import binary_crossentropy, mse
import os
import random
import numpy as np
import NotebookLoader
from keras.optimizers import RMSprop, Adam
from keras.utils.np_utils import to_categorical
from sklearn.model_selection import train_test_split
from keras.callbacks import ModelCheckpoint, TensorBoard
from keras.models import load_model
from tempfile import TemporaryFile
import csv

In [None]:
import Preprocessing as pre

### Configs

In [None]:
# Number of protein files ("*.out") found for each family, plus their grand
# total. These are used as the weights of the weighted-average metrics
# computed at the end of every iteration.
lengths = {}
for f in pre.families:
    proteins = glob.glob(os.path.join(pre.family_paths[f], "*.out"))
    print("Proteins for family %s" %f)
    for p in proteins:
        print(p)
    # Count the files of the family. The previous `len(p)` measured the
    # character length of the last path string instead, and raised a
    # NameError (or reused a stale value) for a family without files.
    lengths[f] = len(proteins)
total = sum(lengths[f] for f in pre.families)

In [None]:
# Whether preprocessing pads the sequences. Fully convolutional networks do
# not need padding; every other architecture requires True.
padding = True
# Whether the angle representation is used instead of the SA representation.
angles = False
# Size of the alphabet used for encoding (3 when angles are used).
num_classes = 3 if angles else 25
# Categorical encoding of the classes: True for the SA representation,
# False for angles.
categorical = not angles
# Data normalization is only applied to the angle representation.
normalize = bool(angles)
# Maximum length of a sequence (tripled for the angle representation).
max_length = 668 * 3 if angles else 668
flatten = True

In [None]:
# Training hyper-parameters of the autoencoder.
epochs = 20
batch_size = 64
learning_rate = 0.01
# Width of the bottleneck layer; wider when the angle representation is used.
intermediate_dim = 25 if angles else 10

In [None]:
def read_set_for_family(f, set_type, base_path=None):
    """Load and stack every serialized ``.npy`` array of one family split.

    Args:
        f: Family name; used as the sub-directory below the data root.
        set_type: Split sub-directory to read (the callers in this file
            pass "train" and "val").
        base_path: Root directory of the serialized data. When ``None`` the
            module-level ``path`` is used, i.e. the directory the
            serialization step of this file writes to.

    Returns:
        One array holding the contents of all files concatenated along the
        first axis, in sorted file-name order.

    Raises:
        IOError: If the split directory contains no ``.npy`` file.
    """
    if base_path is None:
        # The original read an undefined `ds_serialized_path`; the data root
        # used everywhere else in this file is the global `path`.
        base_path = path
    ds_path = os.path.join(base_path, f, set_type)
    # Sorted so the row order does not depend on the file-system listing.
    files = sorted(glob.glob(os.path.join(ds_path, "*.npy")))
    if not files:
        raise IOError("No .npy files found in %s" % ds_path)
    # A single concatenate avoids re-copying the growing array per file.
    return np.concatenate([np.load(file_name) for file_name in files])

### Build the autoencoder

In [None]:
def create_callbacks(f):
    """Create the Keras training callbacks for the autoencoder of family ``f``.

    Args:
        f: Family name; selects the checkpoint and TensorBoard sub-directories
            and is embedded in the checkpoint file name.

    Returns:
        A list ``[ModelCheckpoint, TensorBoard]``. The checkpoint callback
        monitors ``val_loss`` and keeps only the best model.
    """
    checkpoints_path = os.path.join("models_proteins", f)
    tensorboard_path = os.path.join("logs", f)
    # Make sure the checkpoint directory exists before training starts;
    # depending on the Keras version ModelCheckpoint may not create it and
    # the first save would then fail.
    if not os.path.isdir(checkpoints_path):
        os.makedirs(checkpoints_path)
    cp_cb = ModelCheckpoint(filepath=os.path.join(checkpoints_path, "model_protein_level_" + f + ".hdf5"), monitor='val_loss',
                            save_best_only=True)
    tb_cb = TensorBoard(log_dir=tensorboard_path)
    return [cp_cb, tb_cb]

In [None]:
#autoencoder
def get_ae():
    """Build, compile and summarize the dense autoencoder.

    The input shape and the width of the reconstruction layer are derived
    from the module-level settings ``categorical``, ``flatten``,
    ``max_length`` and ``num_classes``; the bottleneck width comes from
    ``intermediate_dim`` and the optimizer step size from ``learning_rate``.

    Returns:
        The compiled Keras ``Model``. Its summary is printed as a side effect.
    """
    # Choose the input shape together with the matching output width.
    if not categorical:
        input_shape, output_units = (max_length,), max_length
    elif flatten:
        flat_dim = max_length * num_classes
        input_shape, output_units = (flat_dim,), flat_dim
    else:
        input_shape, output_units = (None, num_classes), num_classes

    model_input = Input(shape=input_shape)
    # Disabled convolutional variant of the encoder/decoder:
    #x=Conv1D(intermediate_dim, activation='sigmoid', kernel_size=3, padding='same', dilation_rate=1)(model_input)
    #encoded=Conv1D(intermediate_dim, activation='sigmoid', kernel_size=3, padding='same', dilation_rate=1, name="encoded")(x)
    #x=Conv1D(num_classes, activation='sigmoid', kernel_size=3, padding='same', dilation_rate=1)(encoded)
    bottleneck = Dense(intermediate_dim, activation='sigmoid')(model_input)
    reconstruction = Dense(output_units, activation='sigmoid')(bottleneck)

    autoencoder = Model(inputs=model_input, outputs=[reconstruction])
    autoencoder.compile(
        optimizer=RMSprop(lr=learning_rate),
        loss='binary_crossentropy',
        metrics=['mean_absolute_error'],
    )
    autoencoder.summary()
    return autoencoder

### Train and evaluate the autoencoder for specific classes

In [None]:
# Checkpoint file of the trained autoencoder for each family, fam_1 .. fam_9.
models = {
    "fam_%d" % idx: "models_proteins/fam_%d/model_protein_level_fam_%d.hdf5" % (idx, idx)
    for idx in range(1, 10)
}

In [None]:
# Root directory of the serialized (.npy) train/val/test arrays.
path = "data_serialized_proteins_prot"
# One-off creation of the per-family folders (kept disabled):
# for r in range(1, 10):
#     os.makedirs(os.path.join(path, 'fam_%d' % r))

In [None]:
def _scalar_loss(result):
    """Return the loss value from the result of a Keras ``evaluate`` call.

    ``evaluate`` yields a bare scalar for a model without metrics and a list
    ``[loss, metric, ...]`` otherwise; in the latter case the first entry is
    the loss.
    """
    if np.ndim(result) > 0:
        return float(result[0])
    return float(result)


def _sample_loss(ae, sample):
    """Return the reconstruction loss of ``ae`` on one single sample."""
    batch = np.array([sample])
    return _scalar_loss(ae.evaluate(batch, batch, verbose=0))


def _tally(counts, predicted_other, same_family):
    """Increment the matching tp/tn/fp/fn entry of ``counts`` in place."""
    if predicted_other:
        counts["fn" if same_family else "tn"] += 1
    else:
        counts["tp" if same_family else "fp"] += 1


def evaluate_for_fam(f):
    """Evaluate the saved autoencoder of family ``f`` on every test file.

    The largest per-sample loss on the training set of ``f`` is used as the
    threshold. Each ``.npy`` file in the ``test`` directory of every family
    is then classified twice:

    * majority vote: the file is assigned to another family when at least
      half of its samples have a loss above the threshold;
    * mean loss: the mean loss of the file is mapped to a score ``pr`` and
      the file is assigned to another family when ``pr >= 0.5``.

    Args:
        f: Family whose model (looked up in ``models``) is evaluated.

    Returns:
        Two lists ``[tp, tn, fp, fn]``: the confusion counts of the majority
        vote, followed by those of the mean-loss rule. "Positive" means the
        file was predicted to belong to ``f``.
    """
    print("Test for autoencoder on fam %s" %f)
    train = read_set_for_family(f,"train")
    ae = load_model(models[f])
    ae.summary()
    # The model is compiled with a metric, so evaluate() may return a list;
    # _sample_loss reduces it to the loss so comparisons stay scalar.
    max_l = max(_sample_loss(ae, t) for t in train)
    print("Max loss is %f" %max_l)
    del train
    vote_counts = {"tp": 0, "tn": 0, "fp": 0, "fn": 0}
    mean_counts = {"tp": 0, "tn": 0, "fp": 0, "fn": 0}
    for ft in pre.families:
        print("Test for fam %s" %ft)
        ds_path = os.path.join(path, ft, "test")
        same_family = ft == f
        for file_name in glob.glob(os.path.join(ds_path, "*.npy")):
            test = np.load(file_name)
            total_nr = test.shape[0]
            if total_nr == 0:
                # Nothing to classify; also avoids dividing by zero below.
                print("Skipping empty test file %s" % file_name)
                continue
            # check for each configuration the losses
            losses = [_sample_loss(ae, t) for t in test]
            gt = sum(1 for loss in losses if loss > max_l)
            ls = total_nr - gt
            # gt >= ls: predict other family, else predict current family
            _tally(vote_counts, gt >= ls, same_family)
            # compute the probability
            total_loss = sum(losses) / total_nr
            if total_loss > max_l:
                pr = 1 - max_l / (2 * total_loss)
            else:
                pr = total_loss / (2 * max_l)
            # pr >= 0.5: predict other family, else predict current family
            _tally(mean_counts, pr >= 0.5, same_family)
    order = ("tp", "tn", "fp", "fn")
    return [vote_counts[k] for k in order], [mean_counts[k] for k in order]

In [None]:
def _safe_ratio(numerator, denominator):
    """Return ``numerator / denominator`` as a float, or 0.0 if the denominator is 0."""
    if denominator == 0:
        return 0.0
    return 1.0 * numerator / denominator


def _ensure_parent_dir(filename):
    """Create the directory that will contain ``filename`` if it is missing."""
    parent = os.path.dirname(filename)
    if parent and not os.path.isdir(parent):
        os.makedirs(parent)


# Experiment driver: for 20 differently seeded splits, serialize the data,
# train one autoencoder per family, evaluate it and log the metrics to CSV.
# The `with` block guarantees both CSV files are closed even on failure.
with open('res_fam_conf_prot.csv', mode='w') as res_fam, \
        open('res_avg_conf_prot.csv', mode='w') as res_avg:
    writer_fam = csv.writer(res_fam, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    writer_fam.writerow(['Iteration', 'Superfamily', 'TP', 'TN', 'FP', 'FN', 'Prec', 'Recall', 'Spec', 'AUC'])
    writer_avg = csv.writer(res_avg, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    writer_avg.writerow(['Iteration', 'Prec', 'Recall', 'Spec', 'AUC'])
    for i in range(0,20):

        # serialize data
        for f in pre.families:
            proteins = glob.glob(os.path.join(pre.family_paths[f], "*.out"))
            for p in proteins:
                print(p)
                with open(p) as in_file:
                    proteins_conf = [line.strip() for line in in_file]
                print(len(proteins_conf))
                # 25% of the lines go to test, 15% to validation; the
                # iteration number seeds both splits.
                test_size = int(0.25 * len(proteins_conf))
                val_size = int(0.15 * len(proteins_conf))
                train_all_p, test_p = train_test_split(proteins_conf, test_size=test_size, random_state=i)
                train_p, val_p = train_test_split(train_all_p, test_size = val_size, random_state=i)

                #preprocess
                # `use_angles` takes the config flag `angles`; the original
                # passed an undefined name `use_angles`.
                process_args = dict(categorical=categorical, use_angles=angles, padding=padding,
                                    max_length=max_length, normalize=normalize, flatten=flatten)
                train_p = pre.process_conf(train_p, **process_args)
                val_p = pre.process_conf(val_p, **process_args)
                test_p = pre.process_conf(test_p, **process_args)
                print("train: " + repr(train_p.shape))
                print("val: " + repr(val_p.shape))
                print("test: " + repr(test_p.shape))
                del train_all_p
                del proteins_conf

                p_name = os.path.basename(p).split('.')[0]
                train_filename = os.path.join(path, f, "train", "train_"+ p_name +".npy")
                val_filename = os.path.join(path, f, "val", "val_"+ p_name +".npy")
                test_filename = os.path.join(path, f, "test", "test_"+ p_name +".npy")
                # np.save does not create missing directories.
                for out_filename in (train_filename, val_filename, test_filename):
                    _ensure_parent_dir(out_filename)
                np.save(train_filename, train_p)
                np.save(val_filename, val_p)
                np.save(test_filename, test_p)

                del train_p
                del test_p
                del val_p

        # train autoencoders
        for f in pre.families:
            print("Training for family %s" %f)
            train = read_set_for_family(f,"train")
            # NOTE: `test` holds the validation split here.
            test = read_set_for_family(f,"val")
            print("train: " + repr(train.shape))
            print("test" + repr(test.shape))
            ae = get_ae()
            ae.fit(train, train,
                   shuffle=True,
                   epochs=epochs,
                   batch_size=batch_size,
                   validation_data=(test, test),
                   callbacks=create_callbacks(f),
                   verbose=1)
            del train
            del test

        # evaluate autoencoders
        # Majority-vote counts (tp..fn) are collected but only the mean-loss
        # counts (*_p) are turned into metrics and written out.
        tp, tn, fp, fn = {}, {}, {}, {}
        tp_p, tn_p, fp_p, fn_p = {}, {}, {}, {}
        prec, recall, spec, auc = {}, {}, {}, {}
        prec_p, recall_p, spec_p, auc_p = {}, {}, {}, {}
        for f in pre.families:
            print("Evaluating family %s" %f)
            [tp[f], tn[f], fp[f], fn[f]], [tp_p[f], tn_p[f], fp_p[f], fn_p[f]] = evaluate_for_fam(f)
            # _safe_ratio yields 0.0 instead of raising when a family has no
            # predicted positives / actual positives / actual negatives.
            prec_p[f] = _safe_ratio(tp_p[f], tp_p[f] + fp_p[f])
            recall_p[f] = _safe_ratio(tp_p[f], tp_p[f] + fn_p[f])
            spec_p[f] = _safe_ratio(tn_p[f], tn_p[f] + fp_p[f])
            # Reported as 'AUC': the mean of recall and specificity.
            auc_p[f] = (recall_p[f] + spec_p[f]) / 2
            # write to csv
            # The 'FN' column now carries fn_p (it used to repeat tn_p).
            row_fam = [i, f, tp_p[f], tn_p[f], fp_p[f], fn_p[f], prec_p[f], recall_p[f], spec_p[f], auc_p[f]]
            print(row_fam)
            writer_fam.writerow(row_fam)
        # Average the metrics over the families, weighted by lengths[f] / total.
        prec_wavg_p, recall_wavg_p, spec_wavg_p, auc_wavg_p = 0, 0, 0, 0
        for f in pre.families:
            prec_wavg_p += lengths[f] * prec_p[f] / total
            recall_wavg_p += lengths[f] * recall_p[f] / total
            spec_wavg_p += lengths[f] * spec_p[f] / total
            auc_wavg_p += lengths[f] * auc_p[f] / total
        # write to csv
        row_avg = [i, prec_wavg_p, recall_wavg_p, spec_wavg_p, auc_wavg_p]
        print(row_avg)
        writer_avg.writerow(row_avg)