In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
def _encode_sequence(seq, code_table):
    """Map every symbol of ``seq`` to its integer code in ``code_table``.

    Missing values (``None`` or a float such as the NaN pandas uses for an
    empty cell) yield an empty list. Symbols are matched case-insensitively
    and anything not in the table is encoded as 0.
    """
    # A float can never be a sequence; pandas represents empty cells as NaN,
    # which is truthy and would otherwise fail when iterated.
    if seq is None or isinstance(seq, float):
        return []
    return [
        code_table.get(symbol.upper() if isinstance(symbol, str) else symbol, 0)
        for symbol in seq
    ]


def sequence_to_features(dna_seq=None, aa_seq=None, nu_seq=None):
    """Encode biological sequences as one flat list of integer codes.

    Every sequence that is provided is encoded symbol by symbol and the
    encodings are concatenated in the order DNA, amino acid, nucleotide.

    Args:
        dna_seq: DNA sequence (IUPAC nucleotide letters), or None / NaN.
        aa_seq: Amino acid sequence (one-letter codes), or None / NaN.
        nu_seq: Nucleotide sequence (IUPAC nucleotide letters), or None / NaN.

    Returns:
        list[int]: One integer per input symbol. Unknown symbols, gaps
        ('-') and the ambiguity placeholders 'N' / 'X' are encoded as 0.
        An empty list is returned when no sequence is provided.
    """
    # DNA base mapping (also used for the nucleotide sequence)
    base_dict = {
        'A': 1, 'C': 2, 'G': 3, 'T': 4,
        'N': 0, '-': 0,
        'R': 5, 'Y': 6, 'M': 7, 'K': 8,
        'S': 9, 'W': 10, 'H': 11, 'B': 12,
        'V': 13, 'D': 14
    }

    # Amino acid mapping
    aa_dict = {
        'A': 1, 'R': 2, 'N': 3, 'D': 4, 'C': 5,
        'Q': 6, 'E': 7, 'G': 8, 'H': 9, 'I': 10,
        'L': 11, 'K': 12, 'M': 13, 'F': 14, 'P': 15,
        'S': 16, 'T': 17, 'W': 18, 'Y': 19, 'V': 20,
        '-': 0, 'X': 0
    }

    features = []
    # dna_seq was previously accepted but never encoded; include it so the
    # DNA column actually contributes features.
    features.extend(_encode_sequence(dna_seq, base_dict))
    features.extend(_encode_sequence(aa_seq, aa_dict))
    features.extend(_encode_sequence(nu_seq, base_dict))
    return features

In [None]:
def plot_confusion_matrix(y_true, y_pred, title):
    """Render a confusion-matrix heatmap and hand back the matrix.

    Args:
        y_true: Ground-truth labels.
        y_pred: Labels predicted by a model.
        title: Text shown as the figure title.

    Returns:
        The array produced by ``confusion_matrix(y_true, y_pred)``.
    """
    # Explicit figure/axes pair instead of the implicit pyplot state machine.
    _, ax = plt.subplots(figsize=(8, 6))
    matrix = confusion_matrix(y_true, y_pred)
    sns.heatmap(matrix, annot=True, fmt='d', cmap='Blues', ax=ax)
    ax.set_title(title)
    ax.set_ylabel('True Label')
    ax.set_xlabel('Predicted Label')
    plt.show()
    return matrix

In [None]:
def prepare_data(data_path):
    """Load the sample workbook and build the feature matrix and binary labels.

    Sequence columns are optional and looked up by name: 'Sequence' (or,
    failing that, 'DNA_SEQ') for DNA, 'Amino_Acid' and 'Nucleotide'. Adjust
    these names to match the actual Excel file. If a 'DAAs' column exists
    it is appended as a single 0/1 feature.

    Args:
        data_path: Path of the Excel file, passed to ``pd.read_excel``.

    Returns:
        tuple: ``(X, y)`` where ``X`` is a NumPy array with one row of
        encoded features per sample and ``y`` is an integer Series that is
        1 where the 'Classification' text contains 'Case' and 0 otherwise.

    Raises:
        KeyError: If none of the sequence columns, or the 'Classification'
            column, is present in the file.
        ValueError: If the rows do not all yield the same number of features.
    """
    data = pd.read_excel(data_path)
    columns = data.columns

    # Resolve the column names once instead of on every row. The DNA column
    # is optional: the old unconditional fallback to row['DNA_SEQ'] raised
    # KeyError whenever neither DNA column existed.
    if 'Sequence' in columns:
        dna_col = 'Sequence'
    elif 'DNA_SEQ' in columns:
        dna_col = 'DNA_SEQ'
    else:
        dna_col = None
    aa_col = 'Amino_Acid' if 'Amino_Acid' in columns else None
    nu_col = 'Nucleotide' if 'Nucleotide' in columns else None
    has_daas = 'DAAs' in columns

    if dna_col is None and aa_col is None and nu_col is None:
        raise KeyError(
            "No sequence column found; expected one of 'Sequence', 'DNA_SEQ', "
            f"'Amino_Acid', 'Nucleotide'. Available columns: {list(columns)}"
        )
    if 'Classification' not in columns:
        raise KeyError(
            "Target column 'Classification' not found. "
            f"Available columns: {list(columns)}"
        )

    def cell_or_none(row, column):
        # Empty Excel cells arrive as NaN; pass them on as None so the
        # encoder skips them instead of trying to iterate a float.
        if column is None:
            return None
        value = row[column]
        return None if pd.isna(value) else value

    # Create features combining DNA, amino acid sequences, and DAAs
    X = []
    for _, row in data.iterrows():
        features = sequence_to_features(
            cell_or_none(row, dna_col),
            cell_or_none(row, aa_col),
            cell_or_none(row, nu_col)
        )

        # Add DAAs feature if present; a missing value counts as 0
        # (NaN is truthy, so a plain truth test would encode it as 1).
        if has_daas:
            daas_value = row['DAAs']
            features.append(1 if pd.notna(daas_value) and daas_value else 0)

        X.append(features)

    # Features are positional, so every row must have the same width;
    # fail with a clear message rather than an opaque NumPy/sklearn error.
    row_lengths = {len(features) for features in X}
    if len(row_lengths) > 1:
        raise ValueError(
            "Samples produced feature vectors of different lengths "
            f"{sorted(row_lengths)}; sequences must be aligned to equal length "
            "and present in every row."
        )

    X = np.array(X)
    # na=False: rows without a classification are labelled 0 instead of
    # producing NaN, which cannot be cast to int.
    y = data['Classification'].str.contains('Case', na=False).astype(int)

    return X, y

In [None]:
def train_and_evaluate_models(X, y, test_size=0.3, random_state=42):
    """Train a fixed set of classifiers and evaluate each on a held-out split.

    The data is split once, standardised with a scaler fitted on the
    training part only, and every model is fitted and scored on the same
    split. A confusion-matrix figure is shown and a text summary is printed
    for each model.

    Args:
        X: Feature matrix, one row per sample.
        y: Class labels aligned with the rows of ``X``.
        test_size: Fraction (or count) of samples held out for testing.
        random_state: Seed used for the split and for every model that
            accepts one.

    Returns:
        dict: Maps model name to a dict with the keys 'accuracy',
        'report' (text classification report), 'confusion_matrix' and
        'model' (the fitted estimator, which expects scaled features).
    """
    # Stratify so train and test keep the class proportions; on a small
    # dataset a purely random split can skew or drop a class. Stratifying
    # needs at least two samples in every class, so fall back to a plain
    # split otherwise.
    _, class_counts = np.unique(y, return_counts=True)
    can_stratify = len(class_counts) > 1 and class_counts.min() >= 2
    X_train, X_test, y_train, y_test = train_test_split(
        X, y,
        test_size=test_size,
        random_state=random_state,
        stratify=y if can_stratify else None
    )

    # Scale features (fit on the training split only to avoid leakage)
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Define models
    models = {
        'SVM': SVC(kernel='rbf', C=1.0, random_state=random_state),
        'Neural Network': MLPClassifier(hidden_layer_sizes=(100, 50), max_iter=1000, random_state=random_state),
        'KNN': KNeighborsClassifier(n_neighbors=7),
        'Logistic Regression': LogisticRegression(random_state=random_state, max_iter=1000),
        'Random Forest': RandomForestClassifier(n_estimators=100, random_state=random_state),
        # Reported under the label 'FDA'; the estimator is linear discriminant analysis.
        'FDA': LinearDiscriminantAnalysis(),
        'GBM': GradientBoostingClassifier(n_estimators=100, random_state=random_state),
        'Decision Tree': DecisionTreeClassifier(random_state=random_state),
        'Naive Bayes': GaussianNB()
    }

    results = {}
    for name, model in models.items():
        # Train model
        model.fit(X_train_scaled, y_train)

        # Make predictions
        y_pred = model.predict(X_test_scaled)

        # Calculate metrics. zero_division=0 reports the same 0.0 as the
        # default for a class that is never predicted, without the warning.
        accuracy = accuracy_score(y_test, y_pred)
        report = classification_report(y_test, y_pred, zero_division=0)
        cm = plot_confusion_matrix(y_test, y_pred, f'Confusion Matrix - {name}')

        results[name] = {
            'accuracy': accuracy,
            'report': report,
            'confusion_matrix': cm,
            'model': model
        }

        print(f"\nResults for {name}:")
        print(f"Accuracy: {accuracy:.4f}")
        print("Classification Report:")
        print(report)

    return results

In [None]:
# Excel workbook with the samples, given relative to the working directory.
data_path = "sample_data/134-samples-aligned-cutted-Nu-and-a.a-17.12.24.xlsx"
# Build the feature matrix X and the label vector y from the workbook.
X, y = prepare_data(data_path=data_path)

KeyError: 'DNA_SEQ'

In [None]:
# Fit and score every candidate model on the prepared data.
results = train_and_evaluate_models(X, y)
# Pick the (name, metrics) entry with the highest test accuracy;
# on a tie, max() keeps the first one in dictionary order.
best_model = max(results.items(), key=lambda entry: entry[1]['accuracy'])
best_name, best_metrics = best_model
print(f"\nBest performing model: {best_name}")
print(f"Accuracy: {best_metrics['accuracy']:.4f}")