<a href="https://colab.research.google.com/github/angelaxli/S-MARs-Detection/blob/main/CNN_S_MARs.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install pandas numpy scikit-learn
!pip install tensorflow==2.17.0



In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
# Short module-level names for the Keras pieces used by the cells below.
layers = tf.keras.layers

# Text preprocessing helpers.
Tokenizer = tf.keras.preprocessing.text.Tokenizer
pad_sequences = tf.keras.preprocessing.sequence.pad_sequences

# Model container and the layer classes the CNN is assembled from.
Sequential = tf.keras.Sequential
Embedding, Conv1D, MaxPooling1D = layers.Embedding, layers.Conv1D, layers.MaxPooling1D
Dropout, Flatten, Dense = layers.Dropout, layers.Flatten, layers.Dense

In [None]:
## Load and preprocess data
# Tunable inputs for this cell, gathered in one place.
DATA_PATH = "/content/rstudio smar data.csv"  # path inside the Colab runtime
TEST_FRACTION = 0.2
SPLIT_SEED = 42

data = pd.read_csv(DATA_PATH)

# Discard rows that have no sequence or no label. Without this, astype(str)
# turns a missing sequence into the literal text "nan", which the
# character-level tokenizer would then encode as if it were DNA.
data = data.dropna(subset=['Sequence', 'SMAR']).reset_index(drop=True)

# Keep only the four upper-case base letters; every other character is removed.
# NOTE(review): lower-case bases are stripped as well - confirm the CSV is upper-case.
data['Sequence'] = data['Sequence'].astype(str).str.replace(r'[^AGTC]', '', regex=True)

# Extract features and labels
sequences = data['Sequence'].tolist()
labels = data['SMAR'].values

# Tokenize the DNA sequences: with char_level=True every character is a token.
tokenizer = Tokenizer(char_level=True)
tokenizer.fit_on_texts(sequences)
encoded_sequences = tokenizer.texts_to_sequences(sequences)

# Pad the sequences (zeros appended at the end) so every row has the length
# of the longest sequence in the data set.
max_sequence_length = max(len(seq) for seq in encoded_sequences)
padded_sequences = pad_sequences(encoded_sequences, maxlen=max_sequence_length, padding='post')

# Split the data into train and test sets. The split is stratified on the
# label so both sets keep the class ratio of the full data set; the fixed
# seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    padded_sequences,
    labels,
    test_size=TEST_FRACTION,
    random_state=SPLIT_SEED,
    stratify=labels,
)


In [None]:
# Define the CNN model
# Seed the Python, NumPy and TensorFlow random generators before any weights
# are created, so initialisation, dropout masks and batch shuffling repeat
# from one "Restart & Run All" to the next on the same setup.
MODEL_SEED = 42
tf.keras.utils.set_random_seed(MODEL_SEED)

model = Sequential([
    # +1 because the tokenizer's indices start at 1 and 0 is used for padding.
    Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=128),
    # Two convolution / pooling / dropout stages.
    Conv1D(filters=64, kernel_size=5, activation='relu'),
    MaxPooling1D(pool_size=2),
    Dropout(rate=0.3),
    Conv1D(filters=128, kernel_size=5, activation='relu'),
    MaxPooling1D(pool_size=2),
    Dropout(rate=0.3),
    # Dense classification head ending in a single sigmoid unit (binary output).
    Flatten(),
    Dense(units=128, activation='relu'),
    Dropout(rate=0.3),
    Dense(units=1, activation='sigmoid')
])

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model; 20% of the training rows are held out for validation.
history = model.fit(
    X_train, y_train,
    epochs=10,
    batch_size=32,
    validation_split=0.2,
    verbose=1
)

Epoch 1/10
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m249s[0m 18s/step - accuracy: 0.5789 - loss: 7.0295 - val_accuracy: 0.6535 - val_loss: 0.6832
Epoch 2/10
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m248s[0m 17s/step - accuracy: 0.7989 - loss: 0.7413 - val_accuracy: 0.8119 - val_loss: 0.5010
Epoch 3/10
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m220s[0m 17s/step - accuracy: 0.7726 - loss: 0.5257 - val_accuracy: 0.8119 - val_loss: 0.4758
Epoch 4/10
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m265s[0m 17s/step - accuracy: 0.7661 - loss: 0.5376 - val_accuracy: 0.8119 - val_loss: 0.4543
Epoch 5/10
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m270s[0m 18s/step - accuracy: 0.7749 - loss: 0.4571 - val_accuracy: 0.8119 - val_loss: 0.4007
Epoch 6/10
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m255s[0m 17s/step - accuracy: 0.7730 - loss: 0.3835 - val_accuracy: 0.8119 - val_loss: 0.3408
Epoch 7/10
[1m13/13[0m [3

In [None]:
# Evaluate the model
# evaluate() returns [loss, accuracy], matching the loss and the single
# 'accuracy' metric passed to compile().
metrics = model.evaluate(X_test, y_test, verbose=0)
print(f"Test Loss: {metrics[0]:.4f}")
print(f"Test Accuracy: {metrics[1]:.4f}")

# Generate predictions and classification report
# The sigmoid output is thresholded at 0.5. predict() yields one column per
# output unit, so the result is flattened to 1-D explicitly for the metric
# functions instead of relying on scikit-learn's implicit reshaping.
y_pred = (model.predict(X_test) > 0.5).astype(int)
y_pred_flat = y_pred.ravel()

# Fix the label order so the report and the matrix always have both classes,
# even if one of them happens to be absent from the test split.
# NOTE(review): assumes the SMAR column is encoded as 0 / 1 - confirm.
CLASS_LABELS = [0, 1]
CLASS_NAMES = ['Non-SMAR', 'SMAR']
print(classification_report(y_test, y_pred_flat, labels=CLASS_LABELS, target_names=CLASS_NAMES))

# Confusion matrix (rows: true class, columns: predicted class)
conf_matrix = confusion_matrix(y_test, y_pred_flat, labels=CLASS_LABELS)
print("Confusion Matrix:")
print(conf_matrix)

Test Loss: 0.1847
Test Accuracy: 0.9843
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 4s/step
              precision    recall  f1-score   support

    Non-SMAR       1.00      0.93      0.96        27
        SMAR       0.98      1.00      0.99       100

    accuracy                           0.98       127
   macro avg       0.99      0.96      0.98       127
weighted avg       0.98      0.98      0.98       127

Confusion Matrix:
[[ 25   2]
 [  0 100]]
