In [12]:
from tensorflow.keras.models import load_model
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

In [30]:
# Restore the saved Keras model from disk (path is relative to the working directory).
model_path = 'Mul_CNN_model.keras'
model = load_model(model_path)

In [6]:
# Read the processed test CSV; later cells use its 'Sequence' and 'Label' columns.
test_csv_path = 'D:/mlbc-research/CNN-HSP-Classification/data/testing/processed/sequences_with_labels_test.csv'
test_df = pd.read_csv(test_csv_path)

In [24]:
# One-hot encode the test sequences.
# Channel 0 is reserved for padding; the 20 listed amino acids map to channels 1..20.
amino_acids = "ACDEFGHIKLMNPQRSTVWY"
aa_to_int = {aa: idx + 1 for idx, aa in enumerate(amino_acids)}
num_amino_acids = len(amino_acids) + 1  # +1 for the padding channel at index 0

sequences = test_df['Sequence']

# Any character not in `amino_acids` falls back to 0, i.e. it is encoded exactly like padding.
int_sequences = [[aa_to_int.get(aa, 0) for aa in seq] for seq in sequences]

# Fixed sequence length fed to the model.
# Might need to fix this later: 3321 is the max length of the sequences in the train datasets.
max_sequence_length = 3321

# Pad with zeros at the end of each sequence.
# NOTE(review): `truncating` is left at its default, which Keras documents as 'pre'
# (sequences longer than `maxlen` lose their leading residues) — confirm this matches training.
padded_sequences = pad_sequences(int_sequences, maxlen=max_sequence_length, padding='post')

# Vectorised one-hot fill: locate every non-padding cell once and set its channel to 1.
# Padding cells (index 0) are left as all-zero vectors, same as the previous per-element loop.
one_hot_encoded = np.zeros((len(padded_sequences), max_sequence_length, num_amino_acids), dtype=np.float32)
seq_idx, pos_idx = np.nonzero(padded_sequences)
one_hot_encoded[seq_idx, pos_idx, padded_sequences[seq_idx, pos_idx]] = 1.0

# Verify the result
print("Padded Sequences:\n", padded_sequences)
print("One-Hot Encoded Shape:", one_hot_encoded.shape)


Padded Sequences:
 [[11 16  7 ...  0  0  0]
 [11 15  9 ...  0  0  0]
 [11 14  8 ...  0  0  0]
 ...
 [ 1  3  3 ...  0  0  0]
 [18 15 11 ...  0  0  0]
 [17 18  1 ...  0  0  0]]
One-Hot Encoded Shape: (680, 3321, 21)


In [26]:
# Turn the ground-truth labels into one-hot vectors for model.evaluate().
y_test_labels = test_df['Label']

# Width of the one-hot target vectors.
# NOTE(review): presumably equal to the model's number of output classes — confirm.
num_classes = 7

# NOTE(review): this encoder is fit on the *test* labels. LabelEncoder assigns indices by
# sorted label order, so the mapping only matches training if the same set of labels is
# present here — ideally reuse the encoder fitted at training time instead.
label_encoder = LabelEncoder()
y_test_encoded = label_encoder.fit_transform(y_test_labels)

# Fail loudly instead of evaluating against silently shifted class indices.
if len(label_encoder.classes_) != num_classes:
    raise ValueError(
        f"Expected {num_classes} distinct labels in the test set but found "
        f"{len(label_encoder.classes_)}: {list(label_encoder.classes_)}. "
        "Class indices would not line up with the model outputs; "
        "reuse the label encoder fitted on the training data."
    )

y_test_one_hot = to_categorical(y_test_encoded, num_classes=num_classes)

In [32]:
# Score the model on the encoded test set; the result is unpacked as (loss, accuracy).
evaluation = model.evaluate(one_hot_encoded, y_test_one_hot)
loss, accuracy = evaluation

# Report loss to 4 decimals and accuracy as a percentage.
print(f"Test Loss: {loss:.4f}")
print(f"Test Accuracy: {accuracy * 100:.2f}%")


Test Loss: 0.9479
Test Accuracy: 93.68%


In [None]:
# Run inference and take, for each row, the index of the highest-scoring output.
predictions = model.predict(one_hot_encoded)
predicted_classes = predictions.argmax(axis=1)

# Map class indices back to label values using the encoder fitted in the labels cell.
predicted_labels = label_encoder.inverse_transform(predicted_classes)

# Attach the predictions as a new column, then persist the whole table (without the index).
test_df['Predicted_Label'] = predicted_labels
test_df.to_csv('test_results.csv', index=False)
print("Predictions saved to 'test_results.csv'")
