In [1]:
# import libraries
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import sklearn.model_selection
from sklearn.utils import shuffle
#from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Embedding
import math
from tensorflow.keras.models import load_model
from tensorflow.keras.models import model_from_json
import re
from transformers import TokenAndPositionEmbedding, TransformerBlock

In [2]:
# import detected and undetected datasets
detected_peptides = pd.read_table('../data/detected_peptides_all_aaindex1.tsv')
undetected_peptides = pd.read_table('../data/undetected_peptides_all_aaindex1.tsv')

In [3]:
# keep detected and undetected peptides less than or equal to 40 aa AND greater than or equal to 7 aa in length
detected_peptides = detected_peptides.loc[(detected_peptides["Peptide"].str.len()>=7) & 
                                          (detected_peptides["Peptide"].str.len()<=40)].reset_index(drop=True)

undetected_peptides = undetected_peptides.loc[(undetected_peptides["Peptide"].str.len()>=7) & 
                                              (undetected_peptides["Peptide"].str.len()<=40)].reset_index(drop=True)

print(detected_peptides.shape)
print(undetected_peptides.shape)

(37837, 571)
(134282, 571)


In [4]:
# double check 0 peptides in undetected peptides are present in detected peptides
len(undetected_peptides[undetected_peptides["Peptide"].isin(detected_peptides["Peptide"])])

0

In [5]:
detected_peptides.head()

Unnamed: 0,Protein,Peptide,PEP,Protein_length,Quantification,ANDN920101,ARGP820101,ARGP820102,ARGP820103,BEGF750101,...,KARS160113,KARS160114,KARS160115,KARS160116,KARS160117,KARS160118,KARS160119,KARS160120,KARS160121,KARS160122
0,P35579,DFSALESQLQDTQELLQEENR,1.2363999999999999e-275,1960.0,0.204082,92.33,12.36,22.69,21.8,13.17,...,241.0,380.122,270.0,454.0,713.007,133.94,537.463,-4.227,208.869,60.3
1,P63167,NADMSEEMQQDSVECATQALEK,1.2646e-275,89.0,0.146067,97.11,12.27,20.85,21.58,14.09,...,228.0,375.531,265.0,459.0,684.014,147.94,545.66,-4.406,216.602,61.57
2,P02545,ASASGSGAQVGGPISSGSSASSVTVTR,3.0099e-251,664.0,0.195783,115.44,12.07,23.95,25.26,13.99,...,168.0,297.981,194.0,390.0,545.021,159.77,482.653,-10.095,207.27,105.422
3,Q14257,LSEEEILENPDLFLTSEATDYGR,8.5859e-249,317.0,0.069401,100.45,19.0,23.85,24.63,14.16,...,254.0,387.734,277.0,472.0,741.007,141.73,562.782,-11.055,223.858,77.698
4,P10809,IMQSSSEVGYDAMAGDFVNMVEK,3.1541000000000002e-248,573.0,0.335079,100.34,18.2,22.63,25.14,14.71,...,236.0,375.687,259.0,469.0,687.028,147.42,537.126,-2.78,219.424,66.769


In [6]:
# drop unnecessary columns
detected_peptides = detected_peptides.drop(['Protein', 'PEP', 'Protein_length'], axis=1)
undetected_peptides = undetected_peptides.drop(['Protein', 'PEP', 'Protein_length'], axis=1)

print(detected_peptides.shape)
print(undetected_peptides.shape)

(37837, 568)
(134282, 568)


In [7]:
detected_peptides.head()

Unnamed: 0,Peptide,Quantification,ANDN920101,ARGP820101,ARGP820102,ARGP820103,BEGF750101,BEGF750102,BEGF750103,BHAR880101,...,KARS160113,KARS160114,KARS160115,KARS160116,KARS160117,KARS160118,KARS160119,KARS160120,KARS160121,KARS160122
0,DFSALESQLQDTQELLQEENR,0.204082,92.33,12.36,22.69,21.8,13.17,14.65,13.88,9.563,...,241.0,380.122,270.0,454.0,713.007,133.94,537.463,-4.227,208.869,60.3
1,NADMSEEMQQDSVECATQALEK,0.146067,97.11,12.27,20.85,21.58,14.09,15.42,14.26,9.634,...,228.0,375.531,265.0,459.0,684.014,147.94,545.66,-4.406,216.602,61.57
2,ASASGSGAQVGGPISSGSSASSVTVTR,0.195783,115.44,12.07,23.95,25.26,13.99,18.85,19.32,12.75,...,168.0,297.981,194.0,390.0,545.021,159.77,482.653,-10.095,207.27,105.422
3,LSEEEILENPDLFLTSEATDYGR,0.069401,100.45,19.0,23.85,24.63,14.16,16.16,15.75,10.467,...,254.0,387.734,277.0,472.0,741.007,141.73,562.782,-11.055,223.858,77.698
4,IMQSSSEVGYDAMAGDFVNMVEK,0.335079,100.34,18.2,22.63,25.14,14.71,17.38,15.33,10.0,...,236.0,375.687,259.0,469.0,687.028,147.42,537.126,-2.78,219.424,66.769


In [8]:
# add detectability column
detected_peptides.insert(loc=1, column='Detectability', value=1)
undetected_peptides.insert(loc=1, column='Detectability', value=0)

print(detected_peptides.shape)
print(undetected_peptides.shape)

(37837, 569)
(134282, 569)


In [9]:
detected_peptides.head()

Unnamed: 0,Peptide,Detectability,Quantification,ANDN920101,ARGP820101,ARGP820102,ARGP820103,BEGF750101,BEGF750102,BEGF750103,...,KARS160113,KARS160114,KARS160115,KARS160116,KARS160117,KARS160118,KARS160119,KARS160120,KARS160121,KARS160122
0,DFSALESQLQDTQELLQEENR,1,0.204082,92.33,12.36,22.69,21.8,13.17,14.65,13.88,...,241.0,380.122,270.0,454.0,713.007,133.94,537.463,-4.227,208.869,60.3
1,NADMSEEMQQDSVECATQALEK,1,0.146067,97.11,12.27,20.85,21.58,14.09,15.42,14.26,...,228.0,375.531,265.0,459.0,684.014,147.94,545.66,-4.406,216.602,61.57
2,ASASGSGAQVGGPISSGSSASSVTVTR,1,0.195783,115.44,12.07,23.95,25.26,13.99,18.85,19.32,...,168.0,297.981,194.0,390.0,545.021,159.77,482.653,-10.095,207.27,105.422
3,LSEEEILENPDLFLTSEATDYGR,1,0.069401,100.45,19.0,23.85,24.63,14.16,16.16,15.75,...,254.0,387.734,277.0,472.0,741.007,141.73,562.782,-11.055,223.858,77.698
4,IMQSSSEVGYDAMAGDFVNMVEK,1,0.335079,100.34,18.2,22.63,25.14,14.71,17.38,15.33,...,236.0,375.687,259.0,469.0,687.028,147.42,537.126,-2.78,219.424,66.769


In [10]:
# take random sample of undetected_peptides, with equal number of rows to detected_peptides
# random_state is used for reproducibility
undetected_peptides_balanced = undetected_peptides.sample(n=detected_peptides.shape[0], 
                                                         random_state=42).reset_index(drop=True)
print(detected_peptides.shape)
print(undetected_peptides_balanced.shape)

(37837, 569)
(37837, 569)


In [11]:
undetected_peptides_balanced

Unnamed: 0,Peptide,Detectability,Quantification,ANDN920101,ARGP820101,ARGP820102,ARGP820103,BEGF750101,BEGF750102,BEGF750103,...,KARS160113,KARS160114,KARS160115,KARS160116,KARS160117,KARS160118,KARS160119,KARS160120,KARS160121,KARS160122
0,IWRPPMYQR,0,0.001679,39.78,13.03,7.92,9.32,4.50,6.82,7.08,...,146.0,206.135,147.0,250.0,371.000,54.387,236.732,-2.390,104.006,43.921
1,NGTHLDAGALTTTFEELHFEIKPHDDCTVEQIYEILK,0,0.043841,161.34,32.57,35.55,37.03,22.43,27.78,24.66,...,395.0,613.817,449.0,741.0,1200.014,220.840,893.243,-32.578,356.324,112.647
2,MAEESER,0,0.005042,30.62,3.85,5.35,6.47,5.06,4.67,4.28,...,85.0,140.974,94.0,176.0,239.000,46.340,172.880,0.000,70.935,18.322
3,AVGGAMR,0,0.024155,29.49,4.46,7.29,8.91,5.04,5.52,4.53,...,57.0,88.144,58.0,108.0,147.007,38.000,116.687,0.000,52.267,12.856
4,NIVHNYSEAEIK,0,0.003472,52.37,11.12,7.57,8.97,7.43,8.67,7.66,...,135.0,211.410,156.0,266.0,404.021,74.960,293.794,-6.155,116.990,32.966
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37832,CGTVGSR,0,0.003530,29.77,3.23,5.96,5.78,2.89,5.03,5.58,...,45.0,91.344,56.0,118.0,160.007,40.060,133.162,-4.227,55.667,28.200
37833,ALVIAPLFGIAQVVYFLGIAESLLGLLQDPQA,0,0.006192,136.01,35.39,46.69,47.57,22.30,25.29,18.79,...,315.0,440.511,330.0,512.0,866.021,189.020,693.766,-5.883,284.956,95.899
37834,YQEDFNPLVR,0,0.016867,44.37,10.29,8.73,9.14,5.40,7.36,7.19,...,133.0,195.641,143.0,232.0,373.014,61.790,264.513,-0.960,107.056,38.578
37835,EESELQR,0,0.000984,30.29,3.59,5.45,5.39,4.50,4.47,4.44,...,85.0,144.541,97.0,178.0,256.000,44.840,182.388,0.000,71.435,20.628


In [12]:
unused_undetected = undetected_peptides[~undetected_peptides["Peptide"].isin
                                        (undetected_peptides_balanced["Peptide"])]
unused_undetected.shape

(96445, 569)

# Create training, validation and test sets

## Split into train and test sets

In [13]:
# detected peptides
X_trainP, X_testP, y_trainP, y_testP = sklearn.model_selection.train_test_split(
    detected_peptides, detected_peptides['Detectability'], test_size=0.3, random_state=1)

In [14]:
# undetected peptides
X_trainN, X_testN, y_trainN, y_testN = sklearn.model_selection.train_test_split(
    undetected_peptides_balanced, undetected_peptides_balanced['Detectability'], test_size=0.3, random_state=1)

In [15]:
print(X_trainP.shape)
print(X_testP.shape)
print('')
print(X_trainN.shape)
print(X_testN.shape)

(26485, 569)
(11352, 569)

(26485, 569)
(11352, 569)


## split training into train and validation sets

In [16]:
# detected peptides
X_trainP, X_valP, y_trainP, y_valP = sklearn.model_selection.train_test_split(
    X_trainP, y_trainP, test_size=0.25, random_state=1)

In [17]:
# undetected peptides
X_trainN, X_valN, y_trainN, y_valN = sklearn.model_selection.train_test_split(
    X_trainN, y_trainN, test_size=0.25, random_state=1)

In [18]:
print(X_trainP.shape)
print(X_valP.shape)
print('')
print(X_trainN.shape)
print(X_valN.shape)

(19863, 569)
(6622, 569)

(19863, 569)
(6622, 569)


## create final training and validation sets

In [19]:
# create final training and validation sets
X_train = pd.concat([X_trainP, X_trainN])
X_val = pd.concat([X_valP] + [X_valN])
y_train = pd.concat([pd.Series(y_trainP)] + [pd.Series(y_trainN)])
y_val = pd.concat([pd.Series(y_valP)] + [pd.Series(y_valN)])

In [20]:
print(X_train.shape)
print(X_val.shape)
print('')
print(y_train.shape)
print(y_val.shape)

(39726, 569)
(13244, 569)

(39726,)
(13244,)


In [21]:
# check validation set is not in train
print(len(X_val[X_val["Peptide"].isin(X_train["Peptide"])]))

0


## create final test set

In [22]:
# create final test set
X_test = pd.concat([X_testP, X_testN])
y_test = pd.concat([pd.Series(y_testP)] + [pd.Series(y_testN)])
print(X_test.shape)
print(y_test.shape)

(22704, 569)
(22704,)


In [23]:
# check test is not in train or validation
print(len(X_test[X_test["Peptide"].isin(X_val["Peptide"])]))
print(len(X_test[X_test["Peptide"].isin(X_train["Peptide"])]))

0
0


## Pre-processing of train, validation and test sets

In [24]:
# integer-encode peptides
maxLength = 40

aaDict = {"-": 0, "A": 1, "R": 2, "N": 3, "D": 4, "C": 5, "Q": 6, "E": 7, "G": 8, "H": 9, "I": 10, "L": 11, 
          "K": 12, "M": 13, "F": 14, "P": 15, "S": 16, "T": 17, "W": 18, "Y": 19, "V": 20, "U": 21}

def convertPeptide(peptide, maxLength):
    j = 0
    hotPeptide = []
    for aa in peptide:
        hotPeptide.append(aaDict[aa])
        j = j + 1
    for k in range(maxLength - j):
        hotPeptide.append(0)

    return np.array(hotPeptide)

### Separate out each feature

In [25]:
# training set
X_train = shuffle(X_train, random_state=1).reset_index(drop=True)
y_train = shuffle(y_train, random_state=1).reset_index(drop=True)

X_train_peptide = X_train['Peptide'].apply(convertPeptide, args=(maxLength,))
X_train_nsaf = X_train['Quantification']
X_train_aaindex1 = X_train.iloc[:, 3:]

In [26]:
# validation set
X_val = shuffle(X_val, random_state=1).reset_index(drop=True)
y_val = shuffle(y_val, random_state=1).reset_index(drop=True)

X_val_peptide = X_val['Peptide'].apply(convertPeptide, args=(maxLength,))
X_val_nsaf = X_val['Quantification']
X_val_aaindex1 = X_val.iloc[:, 3:]

In [27]:
# test set
X_test = shuffle(X_test, random_state=1).reset_index(drop=True)
y_test = shuffle(y_test, random_state=1).reset_index(drop=True)

X_test_peptide = X_test['Peptide'].apply(convertPeptide, args=(maxLength,))
X_test_nsaf = X_test['Quantification']
X_test_aaindex1 = X_test.iloc[:, 3:]

In [28]:
np.asarray(X_test_aaindex1)

array([[ 43.14 ,   9.01 ,  14.93 , ...,  -1.139,  84.023,  24.31 ],
       [ 39.62 ,   5.64 ,   8.09 , ...,  -6.047,  86.368,  33.68 ],
       [ 29.98 ,   7.38 ,   8.19 , ...,  -7.509,  62.2  ,  22.946],
       ...,
       [ 56.43 ,  11.46 ,  14.49 , ...,  -3.554, 121.601,  38.449],
       [ 47.4  ,  12.79 ,  12.67 , ...,  -2.601, 105.922,  44.107],
       [ 99.06 ,  20.5  ,  22.84 , ...,  -2.694, 212.29 ,  79.143]])

In [29]:
np.asarray(X_test_aaindex1).shape

(22704, 566)

## Scale features

In [30]:
# from sklearn.preprocessing import StandardScaler
# # scale quantification column for detected and undetected
# scaler = StandardScaler()
# print(scaler.fit([X_train_nsaf]))
# print(scaler.mean_)
# print(scaler.transform([X_train_nsaf]))

# scaler = StandardScaler().fit([X_train_nsaf])
# scaler.mean_
# scaler.scale_

# scale AAIndex1 columns for detected and undetected
# OPTION 1: Convert AAIndex1 features to vector
# OPTION 2: Apply PCA to AAIndex1 features

## Further preprocessing

In [31]:
# convert to arrays
X_train_peptide = np.array(X_train_peptide.to_list())
X_val_peptide = np.array(X_val_peptide.to_list())
X_test_peptide = np.array(X_test_peptide.to_list())

In [42]:
X_test_peptide.shape

(22704, 40)

In [114]:
# OLD CODE
# y_train_aaindex1 = tf.one_hot(np.asarray(y_train), depth=566)
# y_test_aaindex1 = tf.one_hot(np.asarray(y_test), depth=566)
# # not sure about this - trying to convert to match every feature column

# Build model (Keras)

In [34]:
embed_dim = 32  # Embedding size for each token
num_heads = 2  # Number of attention heads
ff_dim = 32  # Hidden layer size in feed forward network inside transformer

main_input = tf.keras.layers.Input(shape=(40,))
embedding_layer = TokenAndPositionEmbedding(40, 41, embed_dim)
x = embedding_layer(main_input)
transformer_block = TransformerBlock(embed_dim, num_heads, ff_dim)
x = transformer_block(bx)
lstm_out = tf.keras.layers.GlobalAveragePooling1D()(x)

auxiliary_output = tf.keras.layers.Dense(1, activation='sigmoid', name='aux_output')(lstm_out)

auxiliary_input = tf.keras.layers.Input(shape=(1,), name='aux_input')
x = tf.keras.layers.concatenate([auxiliary_output, auxiliary_input])
x = tf.keras.layers.Dense(64, activation='relu')(x)
x = tf.keras.layers.Dense(64, activation='relu')(x)
x = tf.keras.layers.Dense(64, activation='relu')(x)

main_output = tf.keras.layers.Dense(1, activation='sigmoid', name='main_output')(x)

model = tf.keras.Model(inputs=[main_input, auxiliary_input], outputs=[main_output, auxiliary_output])

optimiser = tf.keras.optimizers.Adam(learning_rate=0.0001, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0, amsgrad=False)

model.compile(loss='binary_crossentropy', optimizer=optimiser, metrics=['accuracy'], loss_weights=[1., 0.2])

print(model.summary())

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            [(None, 40)]         0                                            
__________________________________________________________________________________________________
token_and_position_embedding_1  (None, 40, 32)       2592        input_2[0][0]                    
__________________________________________________________________________________________________
transformer_block_1 (Transforme (None, 40, 32)       6464        token_and_position_embedding_1[0]
__________________________________________________________________________________________________
global_average_pooling1d_1 (Glo (None, 32)           0           transformer_block_1[0][0]        
____________________________________________________________________________________________

In [None]:
embed_dim = 32  # Embedding size for each token
num_heads = 2  # Number of attention heads
ff_dim = 32  # Hidden layer size in feed forward network inside transformer

# define inputs
main_input = tf.keras.layers.Input(shape=(40,))
quant_input = tf.keras.layers.Input(shape=(1,), name='quant_input')
aaindex_input = tf.keras.layers.Input(shape=(566,), name='aaindex_input')

embedding_layer = TokenAndPositionEmbedding(40, 41, embed_dim)
x = embedding_layer(main_input)
transformer_block = TransformerBlock(embed_dim, num_heads, ff_dim)
x = transformer_block(x)
lstm_out = tf.keras.layers.GlobalAveragePooling1D()(x)

quant_output = tf.keras.layers.Dense(1, activation='sigmoid', name='quant_output')(lstm_out)
aaindex_output = tf.keras.layers.Dense(1, activation='sigmoid', name='aaindex_output')(quant_output)

x = tf.keras.layers.concatenate([aaindex_output, auxiliary_input])
x = tf.keras.layers.Dense(64, activation='relu')(x)
x = tf.keras.layers.Dense(64, activation='relu')(x)
x = tf.keras.layers.Dense(64, activation='relu')(x)

main_output = tf.keras.layers.Dense(1, activation='sigmoid', name='main_output')(x)

model = tf.keras.Model(inputs=[main_input, auxiliary_input], outputs=[main_output, auxiliary_output])

optimiser = tf.keras.optimizers.Adam(learning_rate=0.0001, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0, amsgrad=False)

model.compile(loss='binary_crossentropy', optimizer=optimiser, metrics=['accuracy'], loss_weights=[1., 0.2])

print(model.summary())

In [None]:
embed_dim = 32  # Embedding size for each token
num_heads = 2  # Number of attention heads
ff_dim = 32  # Hidden layer size in feed forward network inside transformer

main_input = tf.keras.layers.Input(shape=(40,))
embedding_layer = TokenAndPositionEmbedding(40, 41, embed_dim)
x = embedding_layer(main_input)
transformer_block = TransformerBlock(embed_dim, num_heads, ff_dim)
x = transformer_block(x)
lstm_out = tf.keras.layers.GlobalAveragePooling1D()(x)

auxiliary_output = tf.keras.layers.Dense(1, activation='sigmoid', name='aux_output')(lstm_out)

auxiliary_input = tf.keras.layers.Input(shape=(1,), name='aux_input')
x = tf.keras.layers.concatenate([auxiliary_output, auxiliary_input])

aaindex_output = tf.keras.layers.Dense(1, activation='sigmoid', name='aaindex_output')(lstm_out)
aaindex_input = tf.keras.layers.Input(shape=(1,), name='aaindex_input')
x = tf.keras.layers.concatenate([aaindex_output, aaindex_input])

x = tf.keras.layers.Dense(64, activation='relu')(x)
x = tf.keras.layers.Dense(64, activation='relu')(x)
x = tf.keras.layers.Dense(64, activation='relu')(x)

main_output = tf.keras.layers.Dense(1, activation='sigmoid', name='main_output')(x)

model = tf.keras.Model(inputs=[main_input, auxiliary_input], outputs=[main_output, auxiliary_output])

optimiser = tf.keras.optimizers.Adam(learning_rate=0.0001, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0, amsgrad=False)

model.compile(loss='binary_crossentropy', optimizer=optimiser, metrics=['accuracy'], loss_weights=[1., 0.2])

print(model.summary())

In [172]:
embed_dim = 32  # Embedding size for each token
num_heads = 2  # Number of attention heads
ff_dim = 32  # Hidden layer size in feed forward network inside transformer

main_input = tf.keras.layers.Input(shape=(40,))
quant_input = tf.keras.layers.Input(shape=(1,), name='quant_input')
aaindex_input = tf.keras.layers.Input(shape=(566,), name='aaindex_input')

embedding_layer = TokenAndPositionEmbedding(40, 41, embed_dim)
x = embedding_layer(main_input)
transformer_block = TransformerBlock(embed_dim, num_heads, ff_dim)
x = transformer_block(x)
lstm_out = tf.keras.layers.GlobalAveragePooling1D()(x)

quant_output = tf.keras.layers.Dense(1, activation='sigmoid', name='quant_output')(lstm_out)
aaindex_output = tf.keras.layers.Dense(1, activation='sigmoid', name='aaindex_output')(lstm_out)

x = tf.keras.layers.concatenate([quant_input, aaindex_input])
x = tf.keras.layers.Dense(64, activation='relu')(x)
x = tf.keras.layers.Dense(64, activation='relu')(x)
x = tf.keras.layers.Dense(64, activation='relu')(x)

main_output = tf.keras.layers.Dense(1, activation='sigmoid', name='main_output')(x)

model = tf.keras.Model(inputs=[main_input, quant_input, aaindex_input], outputs=[main_output, quant_output, aaindex_output])

optimiser = tf.keras.optimizers.Adam(learning_rate=0.0001, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0, amsgrad=False)

model.compile(loss='binary_crossentropy', optimizer=optimiser, metrics=['accuracy'], loss_weights=[1., 0.2])

print(model.summary())

Model: "model_3"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_5 (InputLayer)            [(None, 40)]         0                                            
__________________________________________________________________________________________________
token_and_position_embedding_4  (None, 40, 32)       2592        input_5[0][0]                    
__________________________________________________________________________________________________
transformer_block_4 (Transforme (None, 40, 32)       6464        token_and_position_embedding_4[0]
__________________________________________________________________________________________________
global_average_pooling1d_4 (Glo (None, 32)           0           transformer_block_4[0][0]        
____________________________________________________________________________________________

# Train and validate model

In [170]:
np.asanyarray(X_train_aaindex1).shape

(39726, 566)

In [154]:
import time

In [173]:
start_time = time.time()

# model.fit([x_train, y_train], 
#           [y_train, x_train], 
#           batch_size=args.batch_size, epochs=args.epochs, 
#           validation_data=([x_test, y_test], 
#                            [y_test, x_test]), 
#           callbacks=[log, tb, checkpoint, lr_decay])


history = model.fit([np.asanyarray(X_train_peptide), np.asanyarray(X_train_nsaf), np.asanyarray(X_train_aaindex1)], 
                    [np.asanyarray(y_train), np.asanyarray(y_train), np.asanyarray(y_train)], 
                    validation_data = ([np.asarray(X_val_peptide), np.asarray(X_val_nsaf), np.asarray(X_val_aaindex1)], 
                                       [np.asarray(y_val), np.asarray(y_val), np.asarray(y_val)]), 
                    epochs=150, batch_size=200, verbose=2)

print("")
print("Time taken for model to run: ", time.time() - start_time)

Epoch 1/150
199/199 - 9s - loss: nan - main_output_loss: nan - quant_output_loss: nan - aaindex_output_loss: 0.0000e+00 - main_output_accuracy: 0.5000 - quant_output_accuracy: 0.4997 - aaindex_output_accuracy: 0.4998 - val_loss: nan - val_main_output_loss: nan - val_quant_output_loss: nan - val_aaindex_output_loss: 0.0000e+00 - val_main_output_accuracy: 0.5000 - val_quant_output_accuracy: 0.5000 - val_aaindex_output_accuracy: 0.5000
Epoch 2/150
199/199 - 4s - loss: nan - main_output_loss: nan - quant_output_loss: nan - aaindex_output_loss: 0.0000e+00 - main_output_accuracy: 0.5000 - quant_output_accuracy: 0.5000 - aaindex_output_accuracy: 0.5000 - val_loss: nan - val_main_output_loss: nan - val_quant_output_loss: nan - val_aaindex_output_loss: 0.0000e+00 - val_main_output_accuracy: 0.5000 - val_quant_output_accuracy: 0.5000 - val_aaindex_output_accuracy: 0.5000
Epoch 3/150
199/199 - 4s - loss: nan - main_output_loss: nan - quant_output_loss: nan - aaindex_output_loss: 0.0000e+00 - main

KeyboardInterrupt: 

In [158]:
start_time = time.time()

history = model.fit([np.asanyarray(X_train_peptide), np.asanyarray(X_train_nsaf), np.asanyarray(X_train_aaindex1)], 
                    [np.asanyarray(y_train), np.asanyarray(y_train), np.asanyarray(y_train)], 
                    validation_data = ([np.asarray(X_val_peptide), np.asarray(X_val_nsaf), np.asarray(X_val_aaindex1)], 
                                       [np.asarray(y_val), np.asarray(y_val), np.asarray(y_val)]), 
                    epochs=150, batch_size=200, verbose=2)
print("")
print("Time taken for model to run: ", time.time() - start_time)

Epoch 1/150


ValueError: in user code:

    D:\Users\Anima\AppData\Roaming\Python\Python38\site-packages\tensorflow\python\keras\engine\training.py:855 train_function  *
        return step_function(self, iterator)
    D:\Users\Anima\AppData\Roaming\Python\Python38\site-packages\tensorflow\python\keras\engine\training.py:845 step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    D:\Users\Anima\AppData\Roaming\Python\Python38\site-packages\tensorflow\python\distribute\distribute_lib.py:1285 run
        return self._extended.call_for_each_replica(fn, args=args, kwargs=kwargs)
    D:\Users\Anima\AppData\Roaming\Python\Python38\site-packages\tensorflow\python\distribute\distribute_lib.py:2833 call_for_each_replica
        return self._call_for_each_replica(fn, args, kwargs)
    D:\Users\Anima\AppData\Roaming\Python\Python38\site-packages\tensorflow\python\distribute\distribute_lib.py:3608 _call_for_each_replica
        return fn(*args, **kwargs)
    D:\Users\Anima\AppData\Roaming\Python\Python38\site-packages\tensorflow\python\keras\engine\training.py:838 run_step  **
        outputs = model.train_step(data)
    D:\Users\Anima\AppData\Roaming\Python\Python38\site-packages\tensorflow\python\keras\engine\training.py:795 train_step
        y_pred = self(x, training=True)
    D:\Users\Anima\AppData\Roaming\Python\Python38\site-packages\tensorflow\python\keras\engine\base_layer.py:1013 __call__
        input_spec.assert_input_compatibility(self.input_spec, inputs, self.name)
    D:\Users\Anima\AppData\Roaming\Python\Python38\site-packages\tensorflow\python\keras\engine\input_spec.py:200 assert_input_compatibility
        raise ValueError('Layer ' + layer_name + ' expects ' +

    ValueError: Layer model expects 2 input(s), but it received 3 input tensors. Inputs received: [<tf.Tensor 'IteratorGetNext:0' shape=(None, 40) dtype=int32>, <tf.Tensor 'ExpandDims:0' shape=(None, 1) dtype=float32>, <tf.Tensor 'IteratorGetNext:2' shape=(None, 566) dtype=float32>]


In [None]:
start_time = time.time()

history = model.fit([np.asanyarray(X_train1_peptide), np.asanyarray(X_train1_pep)], 
                    [np.asanyarray(y_train), np.asanyarray(y_train)], 
                    validation_data = ([np.asarray(X_test1_peptide), np.asarray(X_test1_pep)], 
                                       [np.asarray(y_test),np.asarray(y_test)]), 
                    epochs=150, batch_size=1342, verbose=2)
print("")
print("Time taken for model to run: ", time.time() - start_time)

Epoch 1/150
51/51 - 10s - loss: 0.8292 - main_output_loss: 0.6929 - aux_output_loss: 0.6811 - main_output_accuracy: 0.5109 - aux_output_accuracy: 0.5568 - val_loss: 0.8272 - val_main_output_loss: 0.6926 - val_aux_output_loss: 0.6728 - val_main_output_accuracy: 0.5000 - val_aux_output_accuracy: 0.5807
Epoch 2/150
51/51 - 8s - loss: 0.8258 - main_output_loss: 0.6922 - aux_output_loss: 0.6680 - main_output_accuracy: 0.5008 - aux_output_accuracy: 0.5865 - val_loss: 0.8235 - val_main_output_loss: 0.6913 - val_aux_output_loss: 0.6609 - val_main_output_accuracy: 0.5090 - val_aux_output_accuracy: 0.6070
Epoch 3/150
51/51 - 8s - loss: 0.8213 - main_output_loss: 0.6901 - aux_output_loss: 0.6558 - main_output_accuracy: 0.5250 - aux_output_accuracy: 0.6130 - val_loss: 0.8185 - val_main_output_loss: 0.6884 - val_aux_output_loss: 0.6507 - val_main_output_accuracy: 0.5379 - val_aux_output_accuracy: 0.6197
Epoch 4/150
51/51 - 8s - loss: 0.8147 - main_output_loss: 0.6857 - aux_output_loss: 0.6452 - mai

In [None]:
history_dict = history.history
history_dict.keys()

In [None]:
# save to df
df_acc = pd.DataFrame({'Loss': history_dict['loss'], 'main_output_loss': history_dict['main_output_loss'], 'aux_output_loss': history_dict['aux_output_loss'], 
                      'main_output_accuracy': history_dict['main_output_accuracy'], 'aux_output_accuracy': history_dict['aux_output_accuracy'], 
                      'val_loss': history_dict['val_loss'], 'val_main_output_loss': history_dict['val_main_output_loss'], 'val_aux_output_loss': history_dict['val_aux_output_loss'],
                      'val_main_output_accuracy': history_dict['val_main_output_accuracy'], 'val_aux_output_accuracy': history_dict['val_aux_output_accuracy']})
df_acc

In [None]:
df_acc.to_csv('M3_seq_pep/M3_train_val_acc_40aa_150_epochs.tsv', sep='\t', index=False)

# old code

# Test model

In [38]:
df = pd.DataFrame({'Peptide': X_testFinal_peptide, 'Quantification': X_testFinal_quant, 'Detectability': y_testFinal})

In [40]:
predictions = model.predict([np.array([
    convertPeptide(pep, maxLength) for pep in df['Peptide']]), np.array(X_testFinal_quant)])

In [77]:
confusion_matrix = sklearn.metrics.confusion_matrix(y_testFinal, np.rint(df['Predictions']))
confusion_matrix

array([[75633, 20812],
       [  493,  3291]], dtype=int64)

In [72]:
# Accuracy
from sklearn.metrics import accuracy_score
accuracy_score(y_testFinal, np.rint(df['Predictions']))

0.7874367697971645

In [103]:
# Recall
from sklearn.metrics import recall_score
recall_score(y_testFinal, round(df['Predictions']), average='binary')

0.8697145877378436

In [101]:
# Precision
from sklearn.metrics import precision_score
precision_score(y_testFinal, np.rint(df['Predictions']), average='binary')

0.1365390200389993

In [102]:
# F1 score
from sklearn.metrics import f1_score
f1_score(y_testFinal, np.rint(df['Predictions']), average='binary')

0.2360239538136049

In [None]:
# disp = ConfusionMatrixDisplay(confusion_matrix=confusion_matrix)
# disp.plot()

In [46]:
predictions

[array([[0.00086892],
        [0.00147071],
        [0.74521136],
        ...,
        [0.75317943],
        [0.14952269],
        [0.00810751]], dtype=float32),
 array([[0.00093165],
        [0.02130318],
        [0.49680048],
        ...,
        [0.69334596],
        [0.40080753],
        [0.2815854 ]], dtype=float32)]

In [47]:
df['Predictions'] = predictions[0].flatten()

In [105]:
df['Pred_rounded'] = round(df['Predictions'])
df

Unnamed: 0,Peptide,Quantification,Detectability,Predictions,Pred_rounded
0,ALVYYTFGALGGNLIAHMVLGYR,0.006297,0,0.000869,0.0
1,CHFSALVANIIQNVPVHQR,0.009625,0,0.001471,0.0
2,EDSAYGSQSVEQEAEK,0.035556,0,0.745211,1.0
3,SEVEMEGPEECLGR,0.004199,0,0.317184,0.0
4,LQNCTGIPLFETQPTYAPLYELITQFELSK,0.004428,0,0.297828,0.0
...,...,...,...,...,...
100224,IPLGNDNIQQEGDR,0.006294,0,0.367778,0.0
100225,AEPPLNASASDQGEK,0.116116,0,0.916781,1.0
100226,YLQCILGVDNIK,0.014675,0,0.753179,1.0
100227,GSTALHYCCLTDNAECLK,0.007952,0,0.149523,0.0


In [106]:
df.to_csv('40/M2_test_pred_40aa_800_epochs.tsv', sep='\t', index=False)