In [None]:
import os

import dill
import numpy as np
import pandas as pd
from absl import flags
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord

import keras.backend as K
from keras import regularizers
from keras.layers import Activation, BatchNormalization, Convolution1D, Dense, Flatten, GlobalMaxPooling1D, Input,MaxPooling1D, Reshape
from keras.losses import binary_crossentropy, categorical_crossentropy
from keras.metrics import binary_accuracy, categorical_accuracy
from keras.utils import plot_model
from keras.utils.np_utils import to_categorical
from tensorflow.python.ops.nn_ops import conv1d_transpose

from evolutron.engine import Model, load_model
from evolutron.templates import callback_templates as cb

In [None]:
##Tensorflow Classification Model
def protein_classification_model(input_shape=None, output_dim=None, saved_model=None):
    """Build (or reload) and compile the protein sequence classifier.

    When ``saved_model`` is truthy the network is restored through
    ``load_model(..., compile=False)`` and flagged with
    ``classification = True``. Otherwise a new network is assembled:
    a single 1D convolution (128 filters, kernel size 50, no bias) followed
    by batch normalisation and ReLU, a max-pool spanning the whole sequence
    length, two 128-unit ReLU dense layers and a softmax layer with
    ``output_dim`` units. In both cases the model is compiled with the
    "nadam" optimizer, categorical cross-entropy and categorical accuracy.

    Args:
        input_shape: Two-element shape ``(seq_length, alphabet)`` of a single
            example. Only read when a new model is built.
        output_dim: Number of units of the softmax output layer. Only read
            when a new model is built.
        saved_model: Forwarded to ``load_model`` when truthy.

    Returns:
        The compiled model.

    Raises:
        NotImplementedError: If the sequence length taken from
            ``input_shape`` is falsy (e.g. ``None``).
    """
    if not saved_model:
        seq_length, _ = input_shape

        # Input layer
        inp = Input(shape=input_shape, name='aa_seq')

        # Convolution -> batch normalisation -> ReLU
        conv_settings = dict(filters=128,
                             kernel_size=50,
                             strides=1,
                             padding='same',
                             use_bias=False,
                             kernel_initializer='glorot_uniform',
                             activation='linear',
                             name="Conv_Layer_1")
        features = Convolution1D(**conv_settings)(inp)
        features = BatchNormalization()(features)
        features = Activation(activation='relu')(features)

        # Max-pooling over the full sequence; needs a known sequence length.
        if not seq_length:
            raise NotImplementedError('Sequence length must be known at this point. Pad and use mask.')
        pooled = MaxPooling1D(pool_size=seq_length)(features)
        hidden = Flatten()(pooled)

        # Two fully-connected encoding layers with identical settings.
        for layer_name in ('FCEnc1', 'FCEnc2'):
            hidden = Dense(128,
                           kernel_initializer='glorot_uniform',
                           activation='relu',
                           name=layer_name)(hidden)

        # Softmax classification head.
        classifier = Dense(output_dim,
                           kernel_initializer='glorot_uniform',
                           activation='softmax',
                           name='Classifier')(hidden)

        model = Model(inputs=inp, outputs=classifier, name='CoFAM', classification=True)
    else:
        model = load_model(saved_model, custom_objects=custom_layers, compile=False)
        model.classification = True

    # Compilation: loss and metric are passed as single-element lists,
    # and lr=0.002 is forwarded as an extra keyword to compile().
    model.compile(optimizer="nadam",
                  loss=[categorical_crossentropy],
                  metrics=[categorical_accuracy],
                  lr=0.002)
    return model

In [None]:
##Functions for Padding of the dataset
def pad_or_clip_seq(x, n):
    """Force the first axis of ``x`` to length ``n`` by zero-padding or clipping.

    Works for any array-like with at least one dimension; trailing
    dimensions are carried over unchanged.

    Args:
        x: Array-like (converted with ``np.asarray``) whose first axis is the
            one being padded or clipped.
        n (int): Target length of the first axis.

    Returns:
        np.ndarray: When ``n >= x.shape[0]``, a new zero-filled array of shape
        ``(n,) + x.shape[1:]`` (default ``np.zeros`` dtype) with ``x`` copied
        into its leading rows. Otherwise ``x[:n]``, i.e. the first ``n``
        entries with the dtype of ``x``.
    """
    x = np.asarray(x)
    current_len = x.shape[0]
    if n >= current_len:
        # Build the padded shape from the trailing dims so 1-D and N-D inputs
        # are handled the same way as the 2-D case.
        padded = np.zeros((n,) + x.shape[1:])
        padded[:current_len] = x
        return padded
    return x[:n]
    
def preprocess_dataset(x_data, y_data=None, one_hot='x', padded=True, pad_y_data=False, nb_aa=20, min_aa=None,
                       max_aa=None):
    """Pad/clip every sequence to a common length and stack them into one array.

    The common length is the 99th percentile of the sequence lengths when
    ``max_aa`` is falsy, otherwise ``max_aa`` capped at the longest sequence.
    Each sequence is passed through ``pad_or_clip_seq`` and the results are
    stacked into a single ``float32`` array.

    Args:
        x_data (pd.Series): Iterable of per-sequence arrays; ``len()`` of each
            element is taken as its sequence length.
        y_data (list or np.ndarray): Optional targets. Converted with
            ``np.asarray`` and checked against the number of sequences;
            ``None`` skips the check.
        one_hot (str): Currently unused by this function.
        padded (bool): Currently unused by this function.
        pad_y_data (bool): Currently unused by this function.
        nb_aa: Currently unused by this function.
        min_aa: Currently unused by this function.
        max_aa: Upper bound on the common sequence length; falsy selects the
            99th-percentile length instead.

    Returns:
        tuple: ``(x_data, y_data)`` where ``x_data`` is the stacked ``float32``
        array and ``y_data`` is an ``np.ndarray`` (or ``None`` if no targets
        were given).

    Raises:
        ValueError: If ``x_data`` contains no sequences.
        AssertionError: If neither ``len(y_data)`` nor ``len(y_data[0])``
            matches the number of sequences.
    """
    # Materialise once so one-shot iterables survive the two passes below.
    sequences = list(x_data)
    if not sequences:
        raise ValueError('x_data must contain at least one sequence.')
    lengths = [len(seq) for seq in sequences]

    if not max_aa:
        max_aa = int(np.percentile(lengths, 99))  # pad so that 99% of datapoints are complete
    else:
        max_aa = min(max_aa, int(np.max(lengths)))
    x_data = np.asarray([pad_or_clip_seq(seq, max_aa) for seq in sequences], dtype=np.float32)
    data_size = len(x_data)

    if y_data is not None:
        y_data = np.asarray(y_data)
        matches_samples = len(y_data) == data_size
        # Second accepted layout: the first entry of y_data has one value per
        # sample. Guard against empty and scalar entries so a mismatch
        # surfaces as the assertion below rather than an IndexError/TypeError.
        matches_first_entry = (not matches_samples
                               and len(y_data) > 0
                               and np.ndim(y_data[0]) > 0
                               and len(y_data[0]) == data_size)
        assert matches_samples or matches_first_entry, \
            'x_data and y_data do not describe the same number of samples.'

    print('Dataset size: {0}'.format(data_size))
    return x_data, y_data

In [None]:
# Pad/clip the encoded sequences to a common length (capped at 1353) and
# one-hot encode the labels.
# NOTE(review): y_data is read and then overwritten here, so re-running this
# cell feeds the already one-hot labels back in. Run it once per fresh kernel.
x_data, y_data = preprocess_dataset(filteredEncodingTensors, y_data, max_aa=1353)
y_data = to_categorical(y_data)

# The network is sized from one example's shape and the number of label columns.
input_shape, output_dim = x_data[0].shape, y_data.shape[1]
conv_net = protein_classification_model(input_shape, output_dim)

# Write the architecture diagram to model.png and print the network info.
plot_model(conv_net, to_file='model.png')
conv_net.display_network_info()

# Train for 10 epochs, holding out 15% of the data for validation.
callbacks = cb.standard(patience=20, reduce_factor=0.5)
conv_net.fit(
    x_data,
    y_data,
    epochs=10,
    batch_size=64,
    validation_split=0.15,
    callbacks=callbacks,
)

In [None]:
##Save Model
# Everything written by this cell shares one random 9-digit key so the training
# history, weights, architecture and flag values of a run can be matched up.
outputFolder="output"
file_key = str(np.random.randint(10 ** 8, 10 ** 9))
conv_net.save_train_history(file_key, data_dir=outputFolder)
conv_net.save(file_key, data_dir=outputFolder)
conv_net.save_architecture(file_key, data_dir=outputFolder)

# Dump the flag values next to the model files.
# NOTE(review): FLAGS is not defined in the visible cells. Presumably it is
# absl's flags.FLAGS; confirm it is set before this cell runs.
flags_path = os.path.join(outputFolder, 'models', file_key + '.flags')
# Make sure the target directory exists before opening the file for writing.
os.makedirs(os.path.dirname(flags_path), exist_ok=True)
# The context manager closes the handle even if dill.dump raises.
with open(flags_path, 'wb') as flags_file:
    dill.dump(FLAGS.flag_values_dict(), flags_file)