In [53]:
import tensorflow as tf
from tensorflow.keras.layers import Conv1D, GlobalMaxPooling1D, Dense, Dropout, BatchNormalization, Input, Concatenate
from tensorflow.keras.models import Model
from tensorflow.keras.utils import to_categorical
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder


In [5]:
# Read the labelled sequence table from disk; the trailing expression
# previews the first rows as the cell output.
csv_path = 'D:/mlbc-research/CNN-HSP-Classification/data/processed/sequences_with_labels.csv'
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,Sequence,Label
0,MMINYWNPIEEIDTVRRQLDHLFEDAIDTGKSSNYPSWAPAVELWD...,HSP20
1,MMSIVLRDPFRSFERMYPLGWEPFQELESWRREMDRMFGRLMPISK...,HSP20
2,MLSLLNKNRSFFDDFFEDFNVLNPVTTSNLMRTDIKETQNGYSLSV...,HSP20
3,MALMKWEPLREIDDMFDRYVMSMGWPSRRQELITAGDWSPRVDISE...,HSP20
4,MANEVSRPVVKSVRQVEPLENLIETVWPGVFSPFITRQATQPQIAH...,HSP20


In [7]:
# Distinct class labels, in order of first appearance in the table.
pd.unique(df['Label'])

array(['HSP20', 'HSP40', 'HSP60', 'HSP70', 'HSP90', 'HSP100', 'NON_HSP'],
      dtype=object)

In [39]:
# Integer-encode the sequences: each of the 20 listed residue letters gets an
# index in 1..20; index 0 is left free for padding and for any other symbol.
amino_acids = "ACDEFGHIKLMNPQRSTVWY"
aa_to_int = dict(zip(amino_acids, range(1, len(amino_acids) + 1)))
num_amino_acids = len(amino_acids) + 1  # +1 for the reserved index 0

sequences = df['Sequence']

# Any character missing from the lookup table falls back to 0.
int_sequences = [
    [aa_to_int.get(residue, 0) for residue in protein]
    for protein in sequences
]

In [31]:
# Length of the longest sequence in the table; shown as the cell output.
max_sequence_length = df['Sequence'].map(len).max()
max_sequence_length

3321

In [33]:
# Right-pad every integer-encoded sequence with zeros up to the longest length.
padded_sequences = pad_sequences(int_sequences, maxlen=max_sequence_length, padding='post')

# One-hot encode into (n_sequences, max_sequence_length, num_amino_acids).
# Positions holding index 0 (padding) stay all-zero, so channel 0 is never set.
one_hot_encoded = np.zeros((len(padded_sequences), max_sequence_length, num_amino_acids), dtype=np.float32)

# Vectorised scatter: look up the coordinates of every non-padding position
# once and set the matching channel in a single assignment, instead of
# visiting each (sequence, position) cell in a nested Python loop.
seq_idx, pos_idx = np.nonzero(padded_sequences)
one_hot_encoded[seq_idx, pos_idx, padded_sequences[seq_idx, pos_idx]] = 1.0

# Verify the result
print("Padded Sequences:\n", padded_sequences)
print("One-Hot Encoded Shape:", one_hot_encoded.shape)


Padded Sequences:
 [[11 11  8 ...  0  0  0]
 [11 11 16 ...  0  0  0]
 [11 10 16 ...  0  0  0]
 ...
 [11  4 16 ...  0  0  0]
 [11 18 20 ...  0  0  0]
 [11 16 16 ...  0  0  0]]
One-Hot Encoded Shape: (12146, 3321, 21)


In [61]:
# Encode the string labels as integers, then as one-hot rows, and split the
# data into train and test partitions.
y = df['Label']
print(f"Number of classes: {len(set(y))}")

label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Derive the class count from the fitted encoder instead of hard-coding it, so
# the one-hot width always matches the labels actually present in the data.
# label_encoder.classes_ also holds the index -> label mapping needed to decode
# predictions later.
num_classes = len(label_encoder.classes_)
y_one_hot_encoded = to_categorical(y_encoded, num_classes=num_classes)

# Stratify on the integer labels so every class keeps the same proportion in
# both partitions; random_state keeps the split reproducible.
# NOTE(review): stratification requires at least two samples per class.
X_train, X_test, y_train, y_test = train_test_split(
    one_hot_encoded,
    y_one_hot_encoded,
    test_size=0.2,
    random_state=42,
    stratify=y_encoded,
)


Number of classes: 7


In [65]:
# Each sample is a (sequence position, one-hot channel) matrix.
input_shape = (max_sequence_length, num_amino_acids)

# Size the softmax from the one-hot label matrix used for training, so the
# output layer cannot drift out of sync with the label encoding.
num_classes = y_train.shape[1]


def _conv_branch(inputs, filters, kernel_size=256, dropout_rate=0.5):
    """Build one Conv1D -> BatchNormalization -> GlobalMaxPooling1D -> Dropout branch.

    Args:
        inputs: Keras tensor the convolution is applied to.
        filters: Number of convolution filters in this branch.
        kernel_size: Width of the 1D convolution window.
        dropout_rate: Fraction of pooled features dropped during training.

    Returns:
        The Keras tensor produced by the branch's final Dropout layer.
    """
    features = Conv1D(filters, kernel_size=kernel_size, activation='relu')(inputs)
    features = BatchNormalization()(features)
    features = GlobalMaxPooling1D()(features)
    return Dropout(dropout_rate)(features)


# Input Layer
input_layer = Input(shape=input_shape)

# Three parallel convolutional branches over the same input; they differ only
# in the number of filters (64, 32, 16).
drop1 = _conv_branch(input_layer, 64)
drop2 = _conv_branch(input_layer, 32)
drop3 = _conv_branch(input_layer, 16)

# Concatenate the pooled features of all branches into one vector.
concatenated = Concatenate()([drop1, drop2, drop3])

# Fully Connected Layers
dense1 = Dense(128, activation='relu')(concatenated)
output_multiclass = Dense(num_classes, activation='softmax', name='multiclass_output')(dense1)  # HSP types

# Model
model = Model(inputs=input_layer, outputs=[output_multiclass])

# Compile the Model; the dict keys refer to the output layer by its name.
model.compile(optimizer='adam',
              loss={'multiclass_output': 'categorical_crossentropy'},
              metrics={'multiclass_output': 'accuracy'})

# Train the Model
# NOTE(review): the test split doubles as validation data here, so the
# reported val_* metrics should not drive model selection if X_test is also
# meant to give the final held-out estimate.
history = model.fit(X_train, y_train, validation_data=(X_test, y_test),
                    epochs=30, batch_size=32)




Epoch 1/30


Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [71]:
# Persist the trained model to a .keras file in the current working directory.
model_path = 'Mul_CNN_model.keras'
model.save(model_path)