# Import libraries

In [30]:
from tensorflow import keras
import Bio.SeqIO
import numpy as np
import pandas as pd

# Define functions

In [2]:
import tensorflow.keras.backend as K

def mcc(label_real, label_predicted):
    true_pos = K.sum(K.round(K.clip(label_real * label_predicted, 0, 1)))
    true_neg = K.sum(K.round(K.clip((1 - label_real) * (1 - label_predicted), 0, 1)))
    false_pos = K.sum(K.round(K.clip((1 - label_real) * label_predicted, 0, 1)))
    false_neg = K.sum(K.round(K.clip(label_real * (1 - label_predicted), 0, 1)))
    number = true_pos*true_neg - false_pos * false_neg
    denominator = K.sqrt((true_pos + false_pos) * 
                        (true_pos + false_neg) * 
                        (true_neg + false_pos) * 
                        (true_neg + false_neg))
    return number / (denominator + K.epsilon())

In [3]:
from keras_preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

def one_hot_encoding_aa(dataset):
    
    integer_encoder = LabelEncoder()
    one_hot_encoder = OneHotEncoder(categories='auto')
    amino_acids = "ARNDCQEGHILKMFPSTWYV*"
    input_features = []
    # fix the encoded categories
    ie = integer_encoder.fit_transform(list(amino_acids))
    ie = np.array(ie).reshape(-1, 1)
    oe = one_hot_encoder.fit_transform(ie)
    for sequence in dataset:
        if type(sequence) == str:
            integer_encoded = integer_encoder.transform(list(sequence))
            integer_encoded = np.array(integer_encoded).reshape(-1,1)
            one_hot_encoded = one_hot_encoder.transform(integer_encoded)
            input_features.append(one_hot_encoded.toarray())
            
    np.set_printoptions(threshold=40)
    input_features = pad_sequences(input_features, padding="post")
    input_features = np.stack(input_features)
    
    return input_features

In [43]:
from keras_preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

def one_hot_encoding(dataset):
    
    integer_encoder = LabelEncoder()
    one_hot_encoder = OneHotEncoder(categories='auto')
    input_features = []
    for sequence in dataset:
        if type(sequence) == str:
            integer_encoded = integer_encoder.fit_transform(list(sequence))
            integer_encoded = np.array(integer_encoded).reshape(-1,1)
            one_hot_encoded = one_hot_encoder.fit_transform(integer_encoded)
            input_features.append(one_hot_encoded.toarray())

    np.set_printoptions(threshold=40)
    input_features = pad_sequences(input_features, padding="post", maxlen=1000)
    input_features = np.stack(input_features)
    
    return input_features

# Load models and mutant library

In [52]:
candidates = []
description = []
for record in Bio.SeqIO.parse("Insilico/mph_dna.fasta", "fasta"):
    candidat = str(record.seq)
    candidates.append(candidat)
    description.append(record.description)
candidate = one_hot_encoding(candidates)

In [53]:
len(description)

246793

In [7]:
np.save('encoded/mph_aa_1.npy', candidate) # save
#candidate = np.load('CNN_MPH/DmpR_bam/CP_HHH.npy')

In [26]:
candidate = np.resize(candidate, (len(candidate), 1000, 4))
candidate.shape

(11, 1000, 4)

In [46]:
wt = one_hot_encoding("Insilico/MPH.fasta")

In [47]:
wt = np.resize(wt, (1, 1000, 4))
wt.shape

(1, 1000, 4)

In [8]:
# Build the models: simple 1D CNN model
# https://colab.research.google.com/drive/17E4h5aAOioh5DiTo7MZg4hpL6Z_0FyWr#scrollTo=eiiwjw4yhX0P

from tensorflow.keras.layers import Conv1D, Dense, MaxPooling1D, Flatten
from tensorflow.keras.models import Sequential
from tensorflow import keras as k

def cnn_1d_multiclass():
    model = Sequential()
    model.add(Conv1D(filters=400, kernel_size=3, input_shape=(1000, 4)))
    model.add(MaxPooling1D(pool_size=4))
    model.add(Conv1D(filters=400, kernel_size=3, input_shape=(1000, 4)))
    model.add(MaxPooling1D(pool_size=4))
    # model.add(Flatten())
    model.add(Conv1D(filters=400, kernel_size=3, input_shape=(1000, 4)))
    model.add(MaxPooling1D(pool_size=4))
    #model.add(Conv1D(filters=400, kernel_size=3, input_shape=(train_features.shape[1], 4)))
    #model.add(MaxPooling1D(pool_size=4))
    model.add(Flatten())
    model.add(Dense(16, activation='elu'))
    model.add(Dense(1, activation='sigmoid'))

    model.compile(loss='binary_crossentropy', optimizer='sgd', metrics=['binary_accuracy'])

    #model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['categorical_accuracy'])

    model.summary()
    return model

In [9]:
model = cnn_1d_multiclass()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv1d (Conv1D)             (None, 998, 400)          5200      
                                                                 
 max_pooling1d (MaxPooling1D  (None, 249, 400)         0         
 )                                                               
                                                                 
 conv1d_1 (Conv1D)           (None, 247, 400)          480400    
                                                                 
 max_pooling1d_1 (MaxPooling  (None, 61, 400)          0         
 1D)                                                             
                                                                 
 conv1d_2 (Conv1D)           (None, 59, 400)           480400    
                                                                 
 max_pooling1d_2 (MaxPooling  (None, 14, 400)          0

In [33]:
dependencies = {
    'mcc': mcc
}

import tensorflow as tf
tf.debugging.set_log_device_placement(True)

# try:
#   # Specify an invalid GPU device
#   with tf.device('/device:GPU:1'):
#         cp_model = model.load_weights('CNN_MPH/weights/manual_cp_HHHLL_240102_4.h5')
# except RuntimeError as e:
#     print(e)

epn_model = keras.models.load_model('CNN_MPH/weights/manual_epn_HHHLLL_.h5', compile=False)
#bkg_model = keras.models.load_model('CNN_aa/trained_models/mcc_BC_opt_aaprop_bkg_v0.2.h5', compile=False)

# Predict samples

In [48]:
import tensorflow as tf
tf.debugging.set_log_device_placement(True)

try:
  # Specify an invalid GPU device
  with tf.device('/device:GPU:2'):
        wt_bkg = epn_model.predict(wt)
except RuntimeError as e:
    print(e)



In [49]:
wt_bkg

array([[0.3529437]], dtype=float32)

In [15]:
print(wt_bkg)

[[0.27269903]]


In [54]:
import tensorflow as tf
tf.debugging.set_log_device_placement(True)

try:
  # Specify an invalid GPU device
    with tf.device('/device:GPU:0'):
        epn_prediction = epn_model.predict(candidate)
except RuntimeError as e:
    print(e)



In [51]:
epn_prediction

array([[0.07413818],
       [0.05017554],
       [0.05340827],
       [0.0838486 ],
       [0.07206739],
       [0.06336418],
       [0.06570514],
       [0.06214825],
       [0.05586591],
       [0.04945847],
       [0.05253494]], dtype=float32)

In [None]:
#signal_prediction = signal_model.predict(candidates)#.flatten().tolist()

#cp_prediction = background_model.predict(candidate)#.flatten().tolist()seq

In [28]:
epn_prediction

array([[0.60740167],
       [0.38283113],
       [0.09940393],
       [0.5451945 ],
       [0.20894882],
       [0.64754343],
       [0.10491136],
       [0.19242334],
       [0.55850965],
       [0.09068023],
       [0.9559227 ]], dtype=float32)

In [29]:
description

['MPH_insilico_mutant D47H [0.99999917]',
 'MPH_insilico_mutant A12S [0.99999905]',
 'MPH_insilico_mutant N198D [0.99999905]',
 'MPH_insilico_mutant A108V [0.9999987]',
 'MPH_insilico_mutant G302A [0.99999857]',
 'MPH_insilico_mutant N177H [0.99999845]',
 'MPH_insilico_mutant A84V [0.99999845]',
 'MPH_insilico_mutant H145D [0.9999982]',
 'MPH_insilico_mutant Q19E [0.9999981]',
 'MPH_insilico_mutant V321L [0.999998]',
 'MPH_insilico_mutant K271I [0.99999785]']

In [56]:
epn_predi = []
#sig_predi = []
for i in range(len(epn_prediction)):
    epn_predi.append(epn_prediction[i])
 #   sig_predi.append(signal_prediction[i])
epn_pred = pd.DataFrame(epn_predi, columns=["EPN"])#, index=description)
#sig_pred = pd.DataFrame(sig_predi, columns=["Signal"])#, index=description)
#bkg_pred

In [57]:
epn_pred = epn_pred.sort_values(["EPN"], ascending=False)

In [58]:
epn_pred

Unnamed: 0,EPN
56089,0.372745
44605,0.349462
103303,0.339564
73271,0.337265
136059,0.335145
...,...
131159,0.018653
79501,0.018055
126032,0.017840
110841,0.017225


In [59]:
samples=[]
for index, row in epn_pred.iterrows():
    samples.append(index)

In [60]:
sample = samples[:11]
des = []
epn = []
for i in sample:
    des.append(description[i])
    #print(candidates[i])
    epn.append(epn_prediction[i])

In [61]:
des

['MPH_insilico_mutant Q132L',
 'MPH_insilico_mutant T233S',
 'MPH_insilico_mutant L53R',
 'MPH_insilico_mutant A78T',
 'MPH_insilico_mutant A156T',
 'MPH_insilico_mutant T213S',
 'MPH_insilico_mutant P71S',
 'MPH_insilico_mutant L266P',
 'MPH_insilico_mutant L266P',
 'MPH_insilico_mutant V133A',
 'MPH_insilico_mutant R277H']

In [23]:
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq

fasta_dna = []
fasta_aa = []
for i in sample:
    epn = epn_prediction[i]
    #bkg = bkg_predi[i]
    dna_read = SeqRecord(Seq(candidates[i]), id=description[i], description=f"{epn}")
    fasta_dna.append(dna_read)
Bio.SeqIO.write(fasta_dna, "CNN_MPH/candidates_epn_1.fasta", "fasta")

11

In [32]:
cp_predi = []
#sig_predi = []
for i in range(len(cp_prediction)):
    cp_predi.append(cp_prediction[i])
 #   sig_predi.append(signal_prediction[i])
cp_pred = pd.DataFrame(cp_predi, columns=["CP"])#, index=description)
#sig_pred = pd.DataFrame(sig_predi, columns=["Signal"])#, index=description)
#bkg_pred

In [33]:
cp_pred = cp_pred.sort_values(["CP"], ascending=False)

In [34]:
cp_pred

Unnamed: 0,CP
33972,9.999992e-01
124168,9.999990e-01
93550,9.999990e-01
120466,9.999987e-01
149683,9.999986e-01
...,...
131408,7.474927e-08
85481,6.737775e-08
202850,5.022250e-08
110996,2.703679e-08


In [35]:
samples=[]
for index, row in cp_pred.iterrows():
    samples.append(index)

In [42]:
sample = samples[:11]

In [43]:
sample

[33972,
 124168,
 93550,
 120466,
 149683,
 108364,
 112773,
 229947,
 36922,
 192165,
 86592]

In [44]:
des = []
cp = []
for i in sample:
    des.append(description[i])
    #print(candidates[i])
    cp.append(cp_prediction[i])

In [45]:
cp

[array([0.99999917], dtype=float32),
 array([0.99999905], dtype=float32),
 array([0.99999905], dtype=float32),
 array([0.9999987], dtype=float32),
 array([0.99999857], dtype=float32),
 array([0.99999845], dtype=float32),
 array([0.99999845], dtype=float32),
 array([0.9999982], dtype=float32),
 array([0.9999981], dtype=float32),
 array([0.999998], dtype=float32),
 array([0.99999785], dtype=float32)]

In [46]:
des

['MPH_insilico_mutant D47H',
 'MPH_insilico_mutant A12S',
 'MPH_insilico_mutant N198D',
 'MPH_insilico_mutant A108V',
 'MPH_insilico_mutant G302A',
 'MPH_insilico_mutant N177H',
 'MPH_insilico_mutant A84V',
 'MPH_insilico_mutant H145D',
 'MPH_insilico_mutant Q19E',
 'MPH_insilico_mutant V321L',
 'MPH_insilico_mutant K271I']

In [47]:
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq

fasta_dna = []
fasta_aa = []
for i in sample:
    cp = cp_prediction[i]
    #bkg = bkg_predi[i]
    dna_read = SeqRecord(Seq(candidates[i]), id=description[i], description=f"{cp}")
    fasta_dna.append(dna_read)
Bio.SeqIO.write(fasta_dna, "CNN_MPH/candidates_cp_2.fasta", "fasta")

11