# Experiment on PFAM dataset

In [1]:
import glob
import os
from collections import Counter
import string
from keras import Input
from keras.layers import Dense, Lambda, Conv1D
import keras.backend as K
from keras.models import Model
from keras.objectives import binary_crossentropy, mse
import os
import random
import numpy as np
import NotebookLoader
from keras.optimizers import RMSprop, Adam
from keras.utils.np_utils import to_categorical
from sklearn.model_selection import train_test_split
from keras.callbacks import ModelCheckpoint, TensorBoard
from keras.models import load_model
from tempfile import TemporaryFile
import csv
import Preprocessing as pre
import random

Using TensorFlow backend.


importing Jupyter notebook from Preprocessing.ipynb


In [2]:
dataset_path = "../dataset_pfam"

In [3]:
glob.glob(dataset_path + "/*.txt")

['../dataset_pfam/HCV(RdRP_3)_203.txt',
 '../dataset_pfam/TET(_JBP)_645.txt',
 '../dataset_pfam/NAD(_binding_1)_37979.txt',
 '../dataset_pfam/RVP_791.txt',
 '../dataset_pfam/Rub(redoxin)_4692.txt']

In [4]:
lengths = []
conformations = []

In [5]:
for fl in glob.glob(dataset_path + "/*.txt"):
    with open(fl) as f:
        current_conf = []
        f.readline() # skip first ">"
        for line in f:
            if line.startswith(">"):
                conformations += [current_conf]
                lengths += [len(current_conf)]
                current_conf = []
            else:
                current_conf += line.strip()
        conformations += [current_conf]
        print("In %s there are %d conf sequences" %(fl, len(conformations)) )
del current_conf
all_letters = []
print(len(conformations))
for c in conformations:
    all_letters +=c
cnt = Counter(all_letters)
print(cnt)
print(len(cnt))
del conformations
del cnt

In ../dataset_pfam/HCV(RdRP_3)_203.txt there are 203 conf sequences
In ../dataset_pfam/TET(_JBP)_645.txt there are 848 conf sequences
In ../dataset_pfam/NAD(_binding_1)_37979.txt there are 38827 conf sequences
In ../dataset_pfam/RVP_791.txt there are 39618 conf sequences
In ../dataset_pfam/Rub(redoxin)_4692.txt there are 44310 conf sequences
44310
Counter({'L': 1900655, 'A': 1741473, 'G': 1528121, 'V': 1394313, 'E': 1315518, 'S': 1260737, 'P': 1182150, 'R': 1167869, 'D': 1145802, 'T': 1102951, 'I': 989159, 'K': 954768, 'F': 804154, 'Q': 750081, 'N': 638931, 'Y': 609620, 'H': 505377, 'M': 447121, 'C': 321851, 'W': 261180, 'X': 9658, 'Z': 1, 'O': 1})
23


In [6]:
max_length = max(lengths)

In [7]:
letters = ['L', 'A', 'G', 'V', 'E', 'S', 'P', 'R', 'D', 'T', 'I', 'K', 'F', 'Q', 'N', 'Y', 'H', 'M', 'C', 'W', 'X', 'Z', 'O']

In [8]:
letters = sorted(letters)

In [9]:
# overwrite this
pre.letters_di= dict(zip(letters,range(0, len(letters))))

In [10]:
pre.letters_di

{'A': 0,
 'C': 1,
 'D': 2,
 'E': 3,
 'F': 4,
 'G': 5,
 'H': 6,
 'I': 7,
 'K': 8,
 'L': 9,
 'M': 10,
 'N': 11,
 'O': 12,
 'P': 13,
 'Q': 14,
 'R': 15,
 'S': 16,
 'T': 17,
 'V': 18,
 'W': 19,
 'X': 20,
 'Y': 21,
 'Z': 22}

In [11]:
import matplotlib.mlab as mlab
import matplotlib.pyplot as plt
num_bins = 1000
n, bins, patches = plt.hist(lengths, num_bins, facecolor='blue', alpha=0.5)
plt.show()

<Figure size 640x480 with 1 Axes>

In [12]:
padding = True
num_classes = 23
categorical = True
use_angles = False
max_length = 2000
flatten = True

In [13]:
def read_set(nr):
    train = []
    validation = []
    test= []
    for i, fl in enumerate(glob.glob(dataset_path + "/*.txt")[:2]):
        with open(fl) as f:
            conformations = []
            current_conf = []
            f.readline() # skip first ">"
            for line in f:
                if line.startswith(">"):
                    conformations += [current_conf[:max_length]]
                    current_conf = []
                else:
                    current_conf += line.strip()
            conformations += [current_conf[:max_length]]
            # decide set
            if i == nr:
                test = conformations
                print("%d added to test from %s" % (len(test), fl))
            else:
                # split val train 90 - 10
                train_f, val_f = train_test_split(conformations, test_size=0.1, random_state=42)
                print("%d added to train from %s" % (len(train_f), fl))
                print("%d added to validation from %s" % (len(val_f), fl))
                train += train_f
                validation += val_f
                del train_f
                del val_f
            del conformations
    random.shuffle(train)
    random.shuffle(validation)
    return train, validation, test

In [14]:
batch_size = 64
intermediate_dim = 25
epochs = 20
learning_rate = 0.005

In [15]:
def create_checkpoints(nr):
    checkpoints_path = "models"
    cp_cb = ModelCheckpoint(filepath=os.path.join(checkpoints_path, "model" + str(nr) + ".hdf5"), monitor='val_loss',
                            save_best_only=True)
    return [cp_cb]

In [16]:
#autoencoder
def get_ae():
    if categorical:
        if not flatten:
            model_input = Input(shape=(None,num_classes))
        else:
            model_input = Input(shape=(max_length*num_classes,))
    elif use_angles:
        model_input = Input(shape=(max_length*3,))
    else:
        model_input = Input(shape=(max_length,))
    #x=Conv1D(intermediate_dim, activation='sigmoid', kernel_size=3, padding='same', dilation_rate=1)(model_input)
    #encoded=Conv1D(intermediate_dim, activation='sigmoid', kernel_size=3, padding='same', dilation_rate=1, name="encoded")(x)
    #x=Conv1D(num_classes, activation='sigmoid', kernel_size=3, padding='same', dilation_rate=1)(encoded)
    encoded= Dense(intermediate_dim, activation='sigmoid')(model_input)
    if categorical:
        if not flatten:
            x = Dense(num_classes, activation='sigmoid')(encoded)
        else:
            x = Dense(max_length*num_classes, activation='sigmoid')(encoded)
    elif use_angles:
        x = Dense(max_length*3, activation='sigmoid')(encoded)
    else:
        x = Dense(max_length, activation='sigmoid')(encoded)
    ae=Model(inputs=model_input, outputs=[x])
    opt=RMSprop(lr=learning_rate)
    ae.compile(optimizer=opt, loss='binary_crossentropy', metrics=['mean_absolute_error'])
    ae.summary()
    return ae

In [17]:
for idx in range(0,5):
    print("Training %d" %idx)
    train, validation, test = read_set(idx)
    train = pre.process_conf(train, categorical=categorical, use_angles=use_angles, padding=padding, max_length=max_length, flatten=flatten, num_classes=num_classes)  
    validation = pre.process_conf(validation, categorical=categorical, use_angles=use_angles, padding=padding, max_length=max_length, flatten=flatten, num_classes=num_classes)
    print("Train set" + repr(train.shape))
    print("Validation set" + repr(validation.shape))
    ae = get_ae()
    ae.fit(train, train,
           shuffle=True,
           epochs=epochs,
           batch_size=batch_size,
           validation_data=(validation, validation),
           callbacks=create_checkpoints(idx),
           verbose=1)
    del validation
    
    print("Evaluation %d" %idx)
    
    ae = load_model(os.path.join("models", "model" + str(idx) + ".hdf5"))
    losses_train = []
    for t in train:
        losses_train.append(ae.evaluate(np.array([t]),np.array([t]), verbose=0)[0])
    max_l = max(losses_train)
    del train
    del losses_train
    tn = 0
    total = len(test)
    test = pre.process_conf(test, categorical=categorical, use_angles=use_angles, padding=padding, max_length=max_length, flatten=flatten, num_classes=num_classes)  
    for t in test:
        loss=ae.evaluate(np.array([t]),np.array([t]), verbose=0)[0]
        if loss > max_l:
            tn+=1
    del test
    print("Result for %d" %idx)
    print("True negatives:     %d" %tn)
    print("Total:              %d" %total)
    print("True negative rate: %f" %(tn*1.0/total))

Training 0
203 added to test from ../dataset_pfam/HCV(RdRP_3)_203.txt
580 added to train from ../dataset_pfam/TET(_JBP)_645.txt
65 added to validation from ../dataset_pfam/TET(_JBP)_645.txt
(580, 2000, 23)
(580, 46000)
(65, 2000, 23)
(65, 46000)
Train set(580, 46000)
Validation set(65, 46000)
Instructions for updating:
Colocations handled automatically by placer.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 46000)             0         
_________________________________________________________________
dense_1 (Dense)              (None, 25)                1150025   
_________________________________________________________________
dense_2 (Dense)              (None, 46000)             1196000   
Total params: 2,346,025
Trainable params: 2,346,025
Non-trainable params: 0
_________________________________________________________________
Instructions for updating:
Use

Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Evaluation 1
(645, 2000, 23)
(645, 46000)
Result for 1
True negatives:     187
Total:              645
True negative rate: 0.289922
Training 2
182 added to train from ../dataset_pfam/HCV(RdRP_3)_203.txt
21 added to validation from ../dataset_pfam/HCV(RdRP_3)_203.txt
580 added to train from ../dataset_pfam/TET(_JBP)_645.txt
65 added to validation from ../dataset_pfam/TET(_JBP)_645.txt
(762, 2000, 23)
(762, 46000)
(86, 2000, 23)
(86, 46000)
Train set(762, 46000)
Validation set(86, 46000)
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         (None, 46000)             0         
_________________________________________________________________
dense_5 (Dense)              (None, 25)                1150025   
_________________________________________________________________
dense_6 (Dense)              

ZeroDivisionError: float division by zero