In [None]:
# Step1: Load The Data
from Bio import SeqIO


# Function to load data from a FASTA file
def load_fasta_file(file_path):
    """Read every record of a FASTA file and return its sequences and labels.

    Args:
        file_path: Location of the FASTA file; passed unchanged to
            ``SeqIO.parse`` together with the ``"fasta"`` format name.

    Returns:
        A ``(sequences, virus_types)`` tuple of two equally long lists.
        ``sequences[i]`` is the i-th record's sequence converted to ``str``
        and ``virus_types[i]`` is the first whitespace-separated token of
        that record's description.
    """
    sequences = []
    virus_types = []
    for record in SeqIO.parse(file_path, "fasta"):
        sequences.append(str(record.seq))
        # Assumes the virus type is the first token of the header line —
        # TODO confirm against the header layout of the dataset in use.
        virus_types.append(record.description.split()[0])
    return sequences, virus_types


# Path to your FASTA file
fasta_file_path = ""  # Enter Fasta File Path

# Load data from the FASTA file
sequences, virus_types = load_fasta_file(fasta_file_path)

# Preview the first few records. The count is capped at the number of loaded
# sequences so a file with fewer than five records does not raise IndexError.
for i in range(min(5, len(sequences))):  # Raise the 5 to see more sequences
    print(f"Sequence {i+1}: {sequences[i]}")
    print(f"Virus Type: {virus_types[i]}")
    print("------")


In [None]:
# Step2: Unique Characters
# Collect the distinct symbols that occur anywhere in the loaded sequences
unique_chars = {symbol for sequence in sequences for symbol in sequence}

# Show the resulting alphabet
print("Unique Characters:", unique_chars)


In [None]:
# Step3: Mapping
# Give each symbol its rank in the sorted alphabet, then build the inverse table
char_to_int = dict(zip(sorted(unique_chars), range(len(unique_chars))))
int_to_char = dict(zip(char_to_int.values(), char_to_int.keys()))

# Show both lookup tables
print("Character to Integer Mapping:", char_to_int)
print("Integer to Character Mapping:", int_to_char)


In [None]:
# Step4: Encoding Of Sequence
import numpy as np


# Function to convert sequences to numerical representation (one-hot encoding)
def sequences_to_one_hot(sequences, char_to_int):
    """One-hot encode a collection of sequences into a single 3-D array.

    Args:
        sequences: Sized collection of strings (or other iterables of
            symbols) to encode. May be empty.
        char_to_int: Mapping from each symbol to its column index.

    Returns:
        An ``int8`` array of shape
        ``(len(sequences), longest_sequence_length, len(char_to_int))``.
        Position ``[i, j, k]`` is 1 when symbol ``j`` of sequence ``i`` maps
        to index ``k``; rows past the end of a shorter sequence stay all-zero.
        An empty ``sequences`` yields an array of shape
        ``(0, 0, len(char_to_int))``.

    Raises:
        KeyError: If a sequence contains a symbol missing from ``char_to_int``.
    """
    num_sequences = len(sequences)
    # default=0 keeps an empty input from raising ValueError inside max().
    max_seq_length = max((len(seq) for seq in sequences), default=0)
    num_chars = len(char_to_int)

    # Start from all zeros so padding positions need no further work.
    one_hot_sequences = np.zeros(
        (num_sequences, max_seq_length, num_chars), dtype=np.int8
    )

    # Switch on exactly one column per symbol position.
    for i, seq in enumerate(sequences):
        for j, char in enumerate(seq):
            one_hot_sequences[i, j, char_to_int[char]] = 1

    return one_hot_sequences


# Encode the whole dataset using the character-to-index mapping
one_hot_sequences = sequences_to_one_hot(
    sequences=sequences, char_to_int=char_to_int
)

# Report the dimensions of the encoded array
print("Shape of One-Hot Encoded Sequences:", one_hot_sequences.shape)


In [None]:
# Step5: Encoding of Label
from sklearn.preprocessing import LabelEncoder

# Fit a LabelEncoder on the virus types and map each one to an integer label
label_encoder = LabelEncoder()
encoded_virus_types = label_encoder.fit_transform(y=virus_types)

# Show the integer labels
print("Encoded Virus Types:", encoded_virus_types)


In [None]:
# Step6: Splitting into Training and Testing Sets
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    one_hot_sequences,
    encoded_virus_types,
    random_state=42,
    test_size=0.2,
)

# Report the dimensions of each partition
print("Training set - Sequences:", X_train.shape)
print("Training set - Virus Types:", y_train.shape)
print("Testing set - Sequences:", X_test.shape)
print("Testing set - Virus Types:", y_test.shape)


In [None]:
#Step7: Make and Train Model
from keras.models import Sequential
from keras.layers import Dense, Flatten

# Training hyperparameters. These are runnable starting values — adjust them
# according to your data.
EPOCHS = 10
BATCH_SIZE = 32
VALIDATION_SPLIT = 0.2  # fraction of the training data held out for validation
VERBOSE = 1  # 0 = silent, 1 = progress bar, 2 = one line per epoch

# Create a simple feedforward neural network model: flatten each one-hot
# encoded sequence, pass it through one hidden layer, and emit one
# probability per known virus type.
model = Sequential()
model.add(Flatten(input_shape=(X_train.shape[1:])))
model.add(Dense(128, activation='relu'))
model.add(Dense(len(label_encoder.classes_), activation='softmax'))

# Compile the model. The labels are integer class ids (from the label
# encoder), which is what the sparse categorical cross-entropy loss expects.
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
history = model.fit(
    X_train,
    y_train,
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    validation_split=VALIDATION_SPLIT,
    verbose=VERBOSE,
)

In [None]:
# Step8: Testing
# Score the trained model on the held-out test split without progress output
loss, accuracy = model.evaluate(x=X_test, y=y_test, verbose=0)

# Report the test accuracy as a percentage with two decimals
print(f"Accuracy on Testing Set: {accuracy * 100:.2f}%")


In [None]:
# Step9: Predict Types on Testing Data
import numpy as np

# Per-class probabilities for every test sample
decoded_predictions = model.predict(X_test)

# The most probable class index is the predicted label
predicted_labels = np.argmax(decoded_predictions, axis=1)

# Decode the first (up to 100) predicted and true labels in two batched calls,
# then print them side by side. Raise the cap to display more predictions.
display_count = min(100, len(X_test))
label_pairs = zip(
    label_encoder.inverse_transform(predicted_labels[:display_count]),
    label_encoder.inverse_transform(y_test[:display_count]),
)
for i, (pred_label, true_label) in enumerate(label_pairs):
    print(f"Sequence {i+1}: Predicted - {pred_label}, Actual - {true_label}")


In [None]:
# Step10: Report
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# Compare the predicted labels with the true test labels. Precision, recall
# and F1 are requested with the "weighted" averaging mode.
accuracy = accuracy_score(y_true=y_test, y_pred=predicted_labels)
precision = precision_score(
    y_true=y_test, y_pred=predicted_labels, average="weighted"
)
recall = recall_score(y_true=y_test, y_pred=predicted_labels, average="weighted")
f1 = f1_score(y_true=y_test, y_pred=predicted_labels, average="weighted")

# Print each metric rounded to four decimals
print("Classification Report:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
# Step11 : Predict Type on user entered Data
import numpy as np


# Function to preprocess a single sequence
def preprocess_sequence(sequence, char_to_int, max_seq_length):
    """One-hot encode a single sequence into a batch of one for the model.

    Args:
        sequence: The symbols to encode; anything beyond ``max_seq_length``
            symbols is dropped.
        char_to_int: Mapping from each known symbol to its column index.
        max_seq_length: Number of rows in the encoded output. Shorter
            sequences are padded with all-zero rows.

    Returns:
        An ``int8`` array of shape ``(1, max_seq_length, len(char_to_int))``.
        Symbols that are not keys of ``char_to_int`` are encoded as all-zero
        rows, the same as padding, rather than raising ``KeyError``.
    """
    num_chars = len(char_to_int)
    one_hot_sequence = np.zeros((max_seq_length, num_chars), dtype=np.int8)
    for i, char in enumerate(sequence[:max_seq_length]):
        index = char_to_int.get(char)
        # Unknown symbols (e.g. typos in user input) keep their all-zero row.
        if index is not None:
            one_hot_sequence[i, index] = 1
    # Add the leading batch axis; unlike reshape(1, -1, n) this also works
    # when the mapping is empty.
    return one_hot_sequence[np.newaxis, ...]


# Function to take user input, preprocess, and predict virus type
def _resolve_max_seq_length(model, default=10273):
    """Return the sequence length read from ``model.input_shape``, else ``default``."""
    try:
        input_shape = model.input_shape
    except (AttributeError, ValueError, RuntimeError):
        return default
    # Assumes a single input laid out as (batch, sequence_length, num_chars);
    # any other layout falls back to the default.
    if (
        isinstance(input_shape, (tuple, list))
        and len(input_shape) >= 2
        and isinstance(input_shape[1], int)
    ):
        return input_shape[1]
    return default


def predict_new_sequence(char_to_int, model, max_seq_length=None):
    """Interactively read sequences from the user and print the predicted type.

    The loop keeps asking whether to predict until the user answers "no".
    Each entered sequence is encoded with ``preprocess_sequence``, passed to
    ``model.predict``, and the highest-probability class index is decoded with
    the module-level ``label_encoder``.

    Args:
        char_to_int: Mapping from each symbol to its one-hot column index.
        model: Object exposing ``predict``; if ``None`` an error is printed
            and the function returns immediately.
        max_seq_length: Length every entered sequence is padded or truncated
            to. When ``None`` it is read from ``model.input_shape`` and falls
            back to 10273 if the model does not expose a usable shape.

    Returns:
        None.
    """
    if model is None:
        print("Error: Model not loaded.")
        return

    # The encoded length must match what the model accepts, so prefer the
    # model's own input shape over a fixed number.
    if max_seq_length is None:
        max_seq_length = _resolve_max_seq_length(model)

    while True:
        # Strip surrounding whitespace so answers like " yes " are accepted.
        choice = (
            input("Do you want to predict the virus type? (yes/no): ").strip().lower()
        )
        if choice == "no":
            print("Thank you for using the service!")
            break
        elif choice == "yes":
            # Take user input for the sequence
            new_sequence = input("Enter the Sequence: ").strip()

            # Encode the sequence as a batch containing one sample
            processed_input = preprocess_sequence(
                new_sequence, char_to_int, max_seq_length
            )

            # Use the trained model to predict the virus type
            predicted_type = model.predict(processed_input)
            # Index of the maximum probability over the flattened output
            predicted_type_index = np.argmax(predicted_type)
            # label_encoder is read from the enclosing module namespace
            predicted_type_name = label_encoder.inverse_transform(
                [predicted_type_index]
            )[0]
            print(
                f"The predicted virus type for the sequence is: {predicted_type_name}"
            )
        else:
            print("Please type 'yes' or 'no'.")


# Replace load_trained_model() with your actual function to load the trained model
def load_trained_model(model_path=None):
    """Return the model to use for interactive prediction.

    Args:
        model_path: Optional path to a saved Keras model. When given, the
            model is loaded from that path with ``keras.models.load_model``.
            When omitted, the ``model`` object already present in the module
            namespace is returned.

    Returns:
        The loaded model, or the in-memory ``model`` when no path is given.

    Raises:
        NameError: If no path is given and ``model`` has not been defined.
    """
    if model_path:
        # Imported here so the function works even if only this cell is run.
        from keras.models import load_model

        return load_model(model_path)
    return model  # in-memory model from the module namespace


# Obtain the model that will serve the predictions
trained_model = load_trained_model()

# Start the interactive prompt, passing the character mapping and the model
predict_new_sequence(char_to_int=char_to_int, model=trained_model)
