In [3]:
from __future__ import print_function

In [4]:
import numpy as np
np.random.seed(1337)

import json
import os
from keras.preprocessing import sequence
import pandas as pd

In [5]:
# Both input files live in the sibling ``data`` directory.
data_dir = os.path.join('..', 'data')
seq_file = os.path.join(data_dir, 'protein-seqs_20180714-095845.txt')
funct_file = os.path.join(data_dir, 'protein_fun_20180714-091630.txt')

In [8]:
# Parse the JSON annotation file; the result is used below for
# ``protein_id in has_function`` membership tests.
with open(funct_file) as annotations_handle:
    has_function = json.load(annotations_handle)

In [9]:
# Sequences with this many residues or more are skipped when loading;
# the remaining ones are padded to exactly this length.
max_seq_size = 500

In [10]:
# Accumulators filled while reading the sequence file:
# raw sequences, their 0/1 labels, and a running count per class.
X, Y = [], []
pos_examples = neg_examples = 0

In [11]:
# Build the dataset from the "<protein_id>,<sequence>" lines of seq_file.
# NOTE: this cell appends to X / Y and increments the counters, so re-run the
# initialisation cell above before re-running it; otherwise examples are duplicated.
with open(seq_file) as f:
    for line in f:
        ln = line.split(',')
        if len(ln) < 2:
            # Blank or malformed line (no comma): skip it instead of
            # crashing with IndexError on ln[1].
            continue

        protein_id = ln[0].strip()
        seq = ln[1].strip()

        # Keep only sequences strictly shorter than max_seq_size.
        if len(seq) >= max_seq_size:
            continue

        print(line)

        X.append(seq)

        # Label is 1 when the protein id appears in has_function, otherwise 0.
        if protein_id in has_function:
            Y.append(1)
            pos_examples += 1
        else:
            Y.append(0)
            neg_examples += 1

P27361,MAAAAAQGGGGGEPRRTEGVGPGVPGEVEMVKGQPFDVGPRYTQLQYIGEGAYGMVSSAYDHVRKTRVAIKKISPFEHQTYCQRTLREIQILLRFRHENVIGIRDILRASTLEAMRDVYIVQDLMETDLYKLLKSQQLSNDHICYFLYQILRGLKYIHSANVLHRDLKPSNLLINTTCDLKICDFGLARIADPEHDHTGFLTEYVATRWYRAPEIMLNSKGYTKSIDIWSVGCILAEMLSNRPIFPGKHYLDQLNHILGILGSPSQEDLNCIINMKARNYLQSLPSKTKVAWAKLFPKSDSKALDLLDRMLTFNPNKRITVEEALAHPYLEQYYDPTDEPVAEEPFTFAMELDDLPKERLKELIFQETARFQPGVLEAP

P53779,MSLHFLYYCSEPTLDVKIAFCQGFDKQVDVSYIAKHYNMSKSKVDNQFYSVEVGDSTFTVLKRYQNLKPIGSGAQGIVCAAYDAVLDRNVAIKKLSRPFQNQTHAKRAYRELVLMKCVNHKNIISLLNVFTPQKTLEEFQDVYLVMELMDANLCQVIQMELDHERMSYLLYQMLCGIKHLHSAGIIHRDLKPSNIVVKSDCTLKILDFGLARTAGTSFMMTPYVVTRYYRAPEVILGMGYKENVDIWSVGCIMGEMVRHKILFPGRDYIDQWNKVIEQLGTPCPEFMKKLQPTVRNYVENRPKYAGLTFPKLFPDSLFPADSEHNKLKASQARDLLSKMLVIDPAKRISVDDALQHPYINVWYDPAEVEAPPPQIYDKQLDEREHTIEEWKELIYKEVMNSEEKTKNGVVKGQPSPSGAAVNSSESLPPSSSVNDISSMSTDQTLASDTDSSLEASAGPLGCCR

Q15049,MTQEPFREELAYDRMPTLERGRQDPASYAPDAKPSDLQLSKRLPPCFSHKTWVFSVLMGSCLLVTSGFSLYLGNVFPAEMDYLRCAAGSCIPSAIVSFTVSRRNANVIPNFQILFVSTFAVTTTCLIWFGCK

In [12]:
# Class balance of the loaded examples.
print("Pos examples : %d" % pos_examples)
print("Neg examples : %d" % neg_examples)

Pos examples : 2
Neg examples : 5


In [13]:
# Residue alphabet; a letter's position in the tuple is its integer code.
# '_' sits at index 0.
_ACID_LETTERS = ('_', 'A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M',
                 'N', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y')
# Built once so each residue is a constant-time lookup instead of a list scan.
_ACID_TO_INDEX = {letter: idx for idx, letter in enumerate(_ACID_LETTERS)}


def sequence_to_index(sequence):
    """Convert an amino-acid string into a list of integer codes.

    Args:
        sequence: iterable of single-character residue codes (e.g. ``"AX"``).

    Returns:
        A list with one integer per residue, each being the position of the
        letter in the residue alphabet (``"AX"`` -> ``[1, 21]``). An empty
        sequence yields an empty list.

    Raises:
        ValueError: if the sequence contains a letter that is not in the
            alphabet. The message names the offending letter and the sequence.
    """
    try:
        return [_ACID_TO_INDEX[c] for c in sequence]
    except KeyError as err:
        # Report which letter failed rather than raising a message-less Exception.
        raise ValueError(
            "Unknown amino-acid letter %r in sequence %r" % (err.args[0], sequence))

In [14]:
# Sanity check on a two-letter sequence.
sequence_to_index("AX")

[1, 21]

In [15]:
# Integer-encode every kept sequence (one list of residue codes per protein).
X_all = [sequence_to_index(raw_seq) for raw_seq in X]

In [16]:
# The encoded sequences have different lengths, so request an object array
# explicitly (one Python list per element). Without dtype=object, recent NumPy
# releases refuse to build an array from ragged nested lists.
X_all = np.array(X_all, dtype=object)
y_all = np.array(Y)

In [17]:
# Peek at the first example: its label, its integer-encoded sequence,
# and the (unpadded) sequence length.
print(Y[0])
print(X_all[0])
print(len(X_all[0]))

1
[11, 1, 1, 1, 1, 1, 14, 6, 6, 6, 6, 6, 4, 13, 15, 15, 17, 4, 6, 19, 6, 13, 6, 19, 13, 6, 4, 19, 4, 11, 19, 9, 6, 14, 13, 5, 3, 19, 6, 13, 15, 22, 17, 14, 10, 14, 22, 8, 6, 4, 6, 1, 22, 6, 11, 19, 16, 16, 1, 22, 3, 7, 19, 15, 9, 17, 15, 19, 1, 8, 9, 9, 8, 16, 13, 5, 4, 7, 14, 17, 22, 2, 14, 15, 17, 10, 15, 4, 8, 14, 8, 10, 10, 15, 5, 15, 7, 4, 12, 19, 8, 6, 8, 15, 3, 8, 10, 15, 1, 16, 17, 10, 4, 1, 11, 15, 3, 19, 22, 8, 19, 14, 3, 10, 11, 4, 17, 3, 10, 22, 9, 10, 10, 9, 16, 14, 14, 10, 16, 12, 3, 7, 8, 2, 22, 5, 10, 22, 14, 8, 10, 15, 6, 10, 9, 22, 8, 7, 16, 1, 12, 19, 10, 7, 15, 3, 10, 9, 13, 16, 12, 10, 10, 8, 12, 17, 17, 2, 3, 10, 9, 8, 2, 3, 5, 6, 10, 1, 15, 8, 1, 3, 13, 4, 7, 3, 7, 17, 6, 5, 10, 17, 4, 22, 19, 1, 17, 15, 20, 22, 15, 1, 13, 4, 8, 11, 10, 12, 16, 9, 6, 22, 17, 9, 16, 8, 3, 8, 20, 16, 19, 6, 2, 8, 10, 1, 4, 11, 10, 16, 12, 15, 13, 8, 5, 13, 6, 9, 7, 22, 10, 3, 14, 10, 12, 7, 8, 10, 6, 8, 10, 6, 16, 13, 16, 14, 4, 3, 10, 12, 2, 8, 8, 12, 11, 9, 1, 15, 12, 22, 10, 14,

In [18]:
# Pad every encoded sequence to max_seq_size so the data becomes a rectangular
# array; the output of the next cell shows the zeros being added on the left.
X_all = sequence.pad_sequences(X_all,maxlen=max_seq_size)

In [19]:
# First example after padding.
X_all[0]

array([ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0, 11,  1,  1,  1,  1,  1, 14,  6,  6,  6,  6,  6,  4, 13, 15,
       15, 17,  4,  6, 19,  6, 13,  6, 19, 13,  6,  4, 19,  4, 11, 19,  9,
        6, 14, 13,  5,  3, 19,  6, 13, 15, 22, 17, 14, 10, 14, 22,  8,  6,
        4,  6,  1, 22,  6, 11, 19, 16, 16,  1, 22,  3,  7, 19, 15,  9, 17,
       15, 19,  1,  8,  9,  9,  8, 16, 13,  5,  4,  7, 14, 17, 22,  2, 14,
       15, 17, 10, 15,  4,  8, 14,  8, 10, 10, 15,  5, 15,  7,  4, 12, 19,
        8,  6,  8, 15,  3

# Split into train/test data

In [20]:
# Features and labels must have the same number of rows.
print(X_all.shape)
print(y_all.shape)

(7, 500)
(7,)


Manual split: shuffle the examples, then use the first 2/3 for training and the remaining 1/3 for testing.

In [21]:
# Total number of examples (rows of X_all).
n = X_all.shape[0]

In [22]:
# Build the row indices 0..n-1 and shuffle them in place; the same permutation
# is later applied to both X_all and y_all so features and labels stay aligned.
# The global NumPy RNG was seeded with np.random.seed(1337) in the import cell.
randomize = np.arange(n)
np.random.shuffle(randomize)

In [23]:
# Show the shuffled index order.
randomize

array([6, 2, 1, 3, 0, 4, 5])

In [24]:
# Reorder features and labels with the same permutation so rows stay paired.
# Both names are rebound, so re-running this cell permutes the data again.
X_all, y_all = X_all[randomize], y_all[randomize]

In [25]:
# Index of the first test row: everything before it (about 2/3 of the
# shuffled data) is used for training, the rest for testing.
test_split = round(n * 2 / 3)
X_train, X_test = X_all[:test_split], X_all[test_split:]
y_train, y_test = y_all[:test_split], y_all[test_split:]

In [26]:
# Shapes of the training and test partitions.
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(5, 500)
(5,)
(2, 500)
(2,)


In [27]:
# Labels that ended up in the training partition.
y_train

array([0, 0, 1, 0, 1])

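Alternative: splitting with scikit-learn's `train_test_split` (currently commented out; as written it would hold out 25% of the data for testing rather than 1/3)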

In [None]:
#from sklearn.model_selection import train_test_split

In [None]:
#X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size = (0.25))

In [None]:
#print(X_train.shape)
#print(y_train.shape)
#print(X_test.shape)
#print(y_test.shape)

In [28]:
from keras.layers import Embedding, Input, Flatten, Dense, Activation, Input
from keras.models import Model, Sequential
from keras.optimizers import SGD

In [29]:
# Embedding vocabulary size: the 23 symbols handled by sequence_to_index
# ('_' plus 22 residue letters).
num_amino_acids = 23
# Length of the embedding vector for each symbol.
embedding_dims = 10
# Number of training epochs passed to fit().
nb_epoch = 2
# Mini-batch size passed to fit().
batch_sz = 2

In [30]:
# Embedding -> flatten -> 25-unit sigmoid hidden layer -> single sigmoid output,
# declared as a layer list handed to the Sequential constructor.
model = Sequential([
    Embedding(num_amino_acids, embedding_dims, input_length=max_seq_size),
    Flatten(),
    Dense(25, activation='sigmoid'),
    Dense(1, activation='sigmoid'),
])

In [31]:
# Layer-by-layer output shapes and parameter counts.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 500, 10)           230       
_________________________________________________________________
flatten_1 (Flatten)          (None, 5000)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 25)                125025    
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 26        
Total params: 125,281
Trainable params: 125,281
Non-trainable params: 0
_________________________________________________________________


In [32]:
# Binary cross-entropy loss with plain SGD. The metrics list has to be passed
# to compile(); left as a stand-alone assignment it has no effect on training
# and accuracy is never reported.
metrics = ['accuracy']
model.compile(loss='binary_crossentropy', optimizer=SGD(), metrics=metrics)

In [33]:
# Summary again after compiling.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 500, 10)           230       
_________________________________________________________________
flatten_1 (Flatten)          (None, 5000)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 25)                125025    
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 26        
Total params: 125,281
Trainable params: 125,281
Non-trainable params: 0
_________________________________________________________________


In [34]:
# Train the Sequential model, evaluating on the held-out test rows after
# each epoch; the returned History object is kept in `hist`.
hist = model.fit(
    X_train,
    y_train,
    batch_size=batch_sz,
    epochs=nb_epoch,
    validation_data=(X_test, y_test),
    verbose=1,
)

Train on 5 samples, validate on 2 samples
Epoch 1/2
Epoch 2/2


# Changing to the Functional API

In [36]:
# Symbolic input: one padded, integer-encoded sequence of length max_seq_size
# per sample.
# NOTE(review): the name `input` shadows Python's built-in input(). Later cells
# reference it, so a rename has to be applied to all of them together.
input = Input(shape=(max_seq_size,))

In [40]:
# Look up an embedding_dims-long vector for every integer code in the input.
embedding = Embedding(num_amino_acids, embedding_dims)(input)

In [45]:
# Flatten the per-position embeddings into a single vector, then apply the
# 25-unit sigmoid hidden layer.
x = Flatten()(embedding)
x = Dense(25, activation='sigmoid')(x)
# Single output unit with no activation here; the sigmoid is applied
# as a separate Activation layer in the next cell.
x = Dense(1)(x)

In [46]:
# Apply a sigmoid to the single output unit.
output = Activation('sigmoid')(x)

In [48]:
# Wrap the graph from `input` to `output` in a Model. This rebinds `model`,
# replacing the Sequential model built earlier.
model = Model([input], output)
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 500)               0         
_________________________________________________________________
embedding_3 (Embedding)      (None, 500, 10)           230       
_________________________________________________________________
flatten_5 (Flatten)          (None, 5000)              0         
_________________________________________________________________
dense_6 (Dense)              (None, 25)                125025    
_________________________________________________________________
dense_7 (Dense)              (None, 1)                 26        
_________________________________________________________________
activation_2 (Activation)    (None, 1)                 0         
Total params: 125,281
Trainable params: 125,281
Non-trainable params: 0
_________________________________________________________________


In [49]:
# Binary cross-entropy loss with the Adam optimizer; accuracy is reported as a metric.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [51]:
# Train the functional-API model with the same data and settings,
# validating on the held-out test rows; `hist` is rebound to the new History.
hist = model.fit(
    X_train,
    y_train,
    batch_size=batch_sz,
    epochs=nb_epoch,
    validation_data=(X_test, y_test),
    verbose=1,
)

Train on 5 samples, validate on 2 samples
Epoch 1/2
Epoch 2/2


In [52]:
# Per-epoch training and validation loss/accuracy recorded by the last fit() call.
hist.history

{'acc': [0.4, 0.4],
 'loss': [0.7973412036895752, 0.679305624961853],
 'val_acc': [0.0, 0.5],
 'val_loss': [0.8742895722389221, 0.6626597046852112]}