### Importing Libraries and data

In [3]:
import tensorflow as tf
from tensorflow.keras.layers import Conv1D, GlobalMaxPooling1D, Dense, Dropout, BatchNormalization, Input, Concatenate
from tensorflow.keras.models import Model
from tensorflow.keras.utils import to_categorical
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder

print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))


Num GPUs Available:  0


In [4]:
from tensorflow.python.client import device_lib

def get_available_devices():
    local_device_protos = device_lib.list_local_devices()
    return [x.name for x in local_device_protos]

print(get_available_devices())

['/device:CPU:0']


In [5]:
df = pd.read_csv('D:/mlbc-research/CNN-HSP-Classification/data/processed/sequences_with_labels.csv')
df.head()

Unnamed: 0,Sequence,Label
0,MMINYWNPIEEIDTVRRQLDHLFEDAIDTGKSSNYPSWAPAVELWD...,HSP20
1,MMSIVLRDPFRSFERMYPLGWEPFQELESWRREMDRMFGRLMPISK...,HSP20
2,MLSLLNKNRSFFDDFFEDFNVLNPVTTSNLMRTDIKETQNGYSLSV...,HSP20
3,MALMKWEPLREIDDMFDRYVMSMGWPSRRQELITAGDWSPRVDISE...,HSP20
4,MANEVSRPVVKSVRQVEPLENLIETVWPGVFSPFITRQATQPQIAH...,HSP20


In [6]:
df['Label'].unique()

array(['HSP20', 'HSP40', 'HSP60', 'HSP70', 'HSP90', 'HSP100', 'NON_HSP'],
      dtype=object)

### One-Hot Encoding the sequences

In [7]:
amino_acids = "ACDEFGHIKLMNPQRSTVWY"
aa_to_int = {aa: idx + 1 for idx, aa in enumerate(amino_acids)}
num_amino_acids = len(amino_acids) + 1

sequences = df['Sequence']

int_sequences = [[aa_to_int.get(aa, 0) for aa in seq] for seq in sequences]

In [8]:
#Get Max Sequence Length
max_sequence_length = df['Sequence'].apply(len).max()
max_sequence_length

3321

In [9]:
padded_sequences = pad_sequences(int_sequences, maxlen=max_sequence_length, padding='post')


one_hot_encoded = np.zeros((len(padded_sequences), max_sequence_length, num_amino_acids), dtype=np.float32)
for i, seq in enumerate(padded_sequences):
    for j, aa_index in enumerate(seq):
        if aa_index != 0:  # Ignore padding
            one_hot_encoded[i, j, aa_index] = 1.0

# Verify the result
print("Padded Sequences:\n", padded_sequences)
print("One-Hot Encoded Shape:", one_hot_encoded.shape)


Padded Sequences:
 [[11 11  8 ...  0  0  0]
 [11 11 16 ...  0  0  0]
 [11 10 16 ...  0  0  0]
 ...
 [11  4 16 ...  0  0  0]
 [11 18 20 ...  0  0  0]
 [11 16 16 ...  0  0  0]]
One-Hot Encoded Shape: (12146, 3321, 21)


In [10]:
y = df['Label']
print(f"Number of classes: {len(set(y))}")

label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

y_one_hot_encoded = to_categorical(y_encoded, num_classes=7)

X_train, X_test, y_train, y_test = train_test_split(one_hot_encoded, y_one_hot_encoded, test_size=0.2, random_state=42)


Number of classes: 7


In [11]:
y_binary = np.where(df['Label'] == 'Non-HSP', 0, 1)  # 0 for Non-HSP, 1 for HSP
y_categories = y_one_hot_encoded  # Existing one-hot encoded labels for HSP categories

X_train, X_test, y_train_binary, y_test_binary, y_train_categories, y_test_categories = train_test_split(
    one_hot_encoded, y_binary, y_categories, test_size=0.2, random_state=42
)


In [15]:
input_shape = (max_sequence_length, num_amino_acids)

input_layer = Input(shape=input_shape)

conv1 = Conv1D(64, kernel_size=256, activation='relu')(input_layer)
bn1 = BatchNormalization()(conv1)
pool1 = GlobalMaxPooling1D()(bn1)
drop1 = Dropout(0.5)(pool1)

conv2 = Conv1D(32, kernel_size=256, activation='relu')(input_layer)
bn2 = BatchNormalization()(conv2)
pool2 = GlobalMaxPooling1D()(bn2)
drop2 = Dropout(0.5)(pool2)

conv3 = Conv1D(16, kernel_size=256, activation='relu')(input_layer)
bn3 = BatchNormalization()(conv3)
pool3 = GlobalMaxPooling1D()(bn3)
drop3 = Dropout(0.5)(pool3)

concatenated = Concatenate()([drop1, drop2, drop3])

dense1 = Dense(128, activation='relu')(concatenated)
output_multiclass = Dense(7, activation='softmax', name='multiclass_output')(dense1)  # HSP types
output_binary = Dense(1, activation='sigmoid', name='binary_output')(dense1)

model = Model(inputs=input_layer, outputs=[output_multiclass, output_binary])

model.compile(optimizer='adam',
              loss={'multiclass_output': 'categorical_crossentropy', 'binary_output': 'binary_crossentropy'},
              metrics={'multiclass_output': 'accuracy', 'binary_output': 'accuracy'})

history = model.fit(X_train, 
                    {'multiclass_output': y_train_categories, 'binary_output': y_train_binary},
                    validation_data=(X_test, {'multiclass_output': y_test_categories, 'binary_output': y_test_binary}),
                    epochs=15, 
                    batch_size=32)



Epoch 1/15
[1m304/304[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m236s[0m 759ms/step - binary_output_accuracy: 0.9799 - binary_output_loss: 0.1114 - loss: 1.8529 - multiclass_output_accuracy: 0.7283 - multiclass_output_loss: 1.7416 - val_binary_output_accuracy: 1.0000 - val_binary_output_loss: 0.0263 - val_loss: 1.0686 - val_multiclass_output_accuracy: 0.8193 - val_multiclass_output_loss: 1.0423
Epoch 2/15
[1m304/304[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m228s[0m 751ms/step - binary_output_accuracy: 1.0000 - binary_output_loss: 0.0012 - loss: 0.7795 - multiclass_output_accuracy: 0.8009 - multiclass_output_loss: 0.7784 - val_binary_output_accuracy: 1.0000 - val_binary_output_loss: 0.0029 - val_loss: 0.7031 - val_multiclass_output_accuracy: 0.8267 - val_multiclass_output_loss: 0.7001
Epoch 3/15
[1m304/304[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m229s[0m 754ms/step - binary_output_accuracy: 1.0000 - binary_output_loss: 8.8588e-04 - loss: 0.5689 - multiclass_output_a

In [17]:
model.save('Mul_CNN_model_multioutput_new.keras')