In [1]:
import pandas as pd
import re
import random
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold, cross_val_score
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from collections import Counter
import itertools

In [2]:
# Location of the Excel workbook read by the next cell.
# NOTE(review): this is an absolute path inside one user's Windows profile;
# the notebook will not run on another machine without editing it.
file_path = (
    "C:\\Users\\Berkay\\PycharmProjects\\MLClassification\\data\\"
    "output 2024-08-09 pmt 14.25.07 2024-08-09 pmt 14.26.08.xlsx"
)

In [3]:
# Load the workbook at `file_path` into a DataFrame (pandas defaults for
# sheet selection and header handling are used).
df = pd.read_excel(io=file_path)

In [4]:
# Clean DNA sequences: normalise case and replace invalid characters with random bases
def clean_sequence(sequence):
    """Return `sequence` with only uppercase A/T/G/C characters.

    Each character is handled in place, so the result has the same length as
    the input:

    * a valid base in either case (``ATGCatgc``) is kept and upper-cased;
    * any other character is replaced by a base drawn with ``random.choice``.

    Upper-casing matters because the feature functions in this notebook
    (k-mer, dinucleotide and GC content) only count uppercase bases, so a
    lowercase base left as-is would silently contribute nothing to them.

    Parameters
    ----------
    sequence : str
        Raw DNA sequence.

    Returns
    -------
    str
        Cleaned sequence of the same length, made of 'A', 'T', 'G', 'C' only.

    Notes
    -----
    Replacement bases come from the global ``random`` generator, one draw per
    invalid character in left-to-right order; seed it beforehand for
    reproducible output.
    """
    # Single left-to-right pass instead of "strip, then re-insert one by one",
    # which rebuilt the whole string for every invalid character.
    return ''.join(
        char.upper() if char in 'ATGCatgc' else random.choice('ATGC')
        for char in sequence
    )

# Seed the global RNG before cleaning: clean_sequence draws its replacement
# bases from `random`, so without a fixed seed the cleaned sequences (and
# every feature computed from them) would differ from run to run.
random.seed(42)
df['Sequence'] = df['Sequence'].apply(clean_sequence)

In [5]:
# Calculate k-mer frequencies (including all possible k-mers)
def get_kmer_frequencies(sequence, k):
    """Return the relative frequency of every possible k-mer over 'ATGC'.

    The sequence is scanned with a sliding window of width `k` and step 1.
    Each count is divided by the number of windows, ``len(sequence) - k + 1``.
    Windows containing characters outside 'ATGC' still count towards that
    denominator but have no key in the result.

    Parameters
    ----------
    sequence : str
        DNA sequence to scan.
    k : int
        Window width.

    Returns
    -------
    dict[str, float]
        One entry per k-mer in ``itertools.product('ATGC', repeat=k)`` order.
        K-mers that never occur map to 0, so every sequence yields the same
        set of keys. If the sequence is shorter than `k`, every value is 0.0.
    """
    all_kmers = [''.join(bases) for bases in itertools.product('ATGC', repeat=k)]
    total_kmers = len(sequence) - k + 1
    if total_kmers <= 0:
        # No window fits: avoid dividing by zero (or by a negative count)
        # and report "nothing observed" for every k-mer.
        return {kmer: 0.0 for kmer in all_kmers}
    kmer_counts = Counter(sequence[i:i + k] for i in range(total_kmers))
    # Counter returns 0 for missing keys without inserting them.
    return {kmer: kmer_counts[kmer] / total_kmers for kmer in all_kmers}

In [6]:
# Calculate GC content
def get_gc_content(sequence):
    """Return the share of characters in `sequence` that are 'G' or 'C'.

    Only uppercase 'G' and 'C' are counted. An empty sequence yields 0
    instead of dividing by zero.
    """
    length = len(sequence)
    if length == 0:
        return 0
    return sum(map(sequence.count, 'GC')) / length

In [7]:
# Calculate dinucleotide frequencies
def get_dinucleotide_frequencies(sequence):
    """
    Calculates the dinucleotide frequencies in a DNA sequence.

    Overlapping pairs ``sequence[i:i+2]`` are counted and divided by the
    number of pairs, ``len(sequence) - 1``. All 16 dinucleotides over 'ATGC'
    are always present in the result (absent ones map to 0), which prevents
    NaN values when the per-sequence dicts are expanded into columns.

    Sequences with fewer than two characters contain no pair at all; for
    them every frequency is 0.0 rather than raising ZeroDivisionError.

    Parameters
    ----------
    sequence : str
        DNA sequence to scan.

    Returns
    -------
    dict[str, float]
        Frequency per dinucleotide, keyed in
        ``itertools.product('ATGC', repeat=2)`` order.
    """
    all_dinucleotides = [''.join(pair) for pair in itertools.product('ATGC', repeat=2)]
    total_dinucleotides = len(sequence) - 1
    if total_dinucleotides <= 0:
        # Nothing to count: a zero or negative denominator would be meaningless.
        return {dinucleotide: 0.0 for dinucleotide in all_dinucleotides}
    dinucleotide_counts = Counter(sequence[i:i + 2] for i in range(total_dinucleotides))
    return {
        dinucleotide: dinucleotide_counts[dinucleotide] / total_dinucleotides
        for dinucleotide in all_dinucleotides
    }

In [8]:
# Build the k-mer and GC-content features and attach them to the DataFrame.
k = 3  # k-mer size
# One dict of k-mer frequencies per row; `k` is forwarded positionally.
df['kmer_frequencies'] = df['Sequence'].apply(get_kmer_frequencies, args=(k,))
df['gc_content'] = df['Sequence'].apply(get_gc_content)
# Expand each frequency dict into one column per k-mer, then append those
# columns to df (the dict column itself is kept).
kmer_df = df['kmer_frequencies'].apply(pd.Series)
df = pd.concat([df, kmer_df], axis='columns')

In [9]:
# Dinucleotide features: one frequency dict per sequence, expanded into one
# column per dinucleotide and appended to df.
df['dinucleotide_frequencies'] = df['Sequence'].apply(get_dinucleotide_frequencies)
dinucleotide_df = df['dinucleotide_frequencies'].apply(pd.Series)
df = pd.concat([df, dinucleotide_df], axis='columns')

In [10]:
# Positional one-hot encoding: split every sequence into one column per
# position, then create an indicator column for each (position, base) pair.
df_nucleotides = df['Sequence'].apply(lambda seq: pd.Series([*seq]))
df_encoded = pd.get_dummies(data=df_nucleotides)
df = pd.concat([df, df_encoded], axis='columns')

In [11]:
# Replace the 'Class' labels with integer codes; the fitted encoder is kept
# in `label_encoder`.
label_encoder = LabelEncoder().fit(df['Class'])
df['Class'] = label_encoder.transform(df['Class'])

In [12]:
# Feature matrix: every remaining column (one-hot, k-mer, dinucleotide and
# GC-content features) once the target, the identifier, the raw sequence and
# the two intermediate dict columns are removed.
X = df.drop(
    columns=['Class', 'id', 'Sequence', 'kmer_frequencies', 'dinucleotide_frequencies']
)
# Target vector: the encoded class labels.
y = df['Class']

In [13]:
# Standardise the continuous features (GC content, k-mer and dinucleotide
# frequencies); the one-hot indicator columns are left untouched.
# NOTE(review): the scaler is fitted on every row of X, i.e. before the
# train/test split and the cross-validation folds are formed, so statistics
# of held-out rows leak into the training features. Consider fitting it on
# training data only (e.g. inside a Pipeline).
scaler = StandardScaler()
scaled_columns = ['gc_content', *kmer_df.columns, *dinucleotide_df.columns]
X[scaled_columns] = scaler.fit_transform(X[scaled_columns])

In [14]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the
# partition repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=78,
)

In [15]:
# Define the models (default hyper-parameters unless stated otherwise)
knn = KNeighborsClassifier()
svm = SVC()
# Random forests draw bootstrap samples and feature subsets at random; fixing
# the seed makes the cross-validation score repeatable between runs.
rf = RandomForestClassifier(random_state=42)
xgb = XGBClassifier()
# Iteration cap raised from scikit-learn's default to 1000.
lr = LogisticRegression(max_iter=1000)

In [16]:
# Score every model with the same shuffled 5-fold cross-validation on the
# training split and print mean accuracy with its standard deviation.
models = [
    ('KNN', knn),
    ('SVM', svm),
    ('Random Forest', rf),
    ('XGBoost', xgb),
    ('Logistic Regression', lr),
]
# One splitter is enough: with an integer random_state it yields identical
# folds every time it is used.
kfold = KFold(n_splits=5, shuffle=True, random_state=42)
for name, model in models:
    cv_results = cross_val_score(model, X_train, y_train, cv=kfold, scoring='accuracy')
    print(f'{name}: Accuracy: {cv_results.mean():.3f} (std: {cv_results.std():.3f})')

KNN: Accuracy: 0.846 (std: 0.027)
SVM: Accuracy: 0.940 (std: 0.037)
Random Forest: Accuracy: 0.953 (std: 0.024)
XGBoost: Accuracy: 0.870 (std: 0.068)
Logistic Regression: Accuracy: 0.976 (std: 0.029)
