# Machine Learning-Based Transcriptomic Biomarker Discovery in Huntington's Disease

**Project:** Discovery of Key Biomarkers for Huntington's Disease Using Meta-Analysis and Machine Learning
**Data Source:** GSE64810 (Pre-processed & Filtered)
**Methodology:** **K-Fold Cross-Validation with SMOTE and MCC** for robust evaluation.
**Models:** Random Forest (Feature Selection) & Support Vector Machine (Classification)

---

## 1. Setup and Data Loading
Import necessary libraries and upload the `HD_ML_Ready_Data.csv` file.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import os

from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score, roc_curve, auc, matthews_corrcoef
from imblearn.over_sampling import SMOTE # For handling imbalance
from imblearn.pipeline import Pipeline # For cross-validation integrity

# Notebook-wide plot defaults: seaborn "whitegrid" theme and a (10, 6)
# default figure size for every figure that does not override it.
sns.set(style="whitegrid")
plt.rcParams.update({'figure.figsize': (10, 6)})

In [None]:
from google.colab import files

# Ask for the dataset through the Colab upload widget. The returned object is
# keyed by the uploaded file name(s).
print("Please upload 'HD_ML_Ready_Data.csv'")
uploaded = files.upload()

# Stop with a clear message when nothing was uploaded (e.g. the dialog was
# cancelled) instead of failing on the file-name lookup below.
if not uploaded:
    raise ValueError(
        "No file was uploaded. Re-run this cell and select 'HD_ML_Ready_Data.csv'."
    )

# Only the first uploaded file is used.
filename = next(iter(uploaded))
print(f"Loaded file: {filename}")

# Load the expression table and preview the first rows (the bare expression on
# the last line is what the notebook renders as the cell output).
data = pd.read_csv(filename)
data.head()

## 2. Data Preprocessing
Encoding the target variable and setting up the Stratified K-Fold strategy for robust evaluation.

In [None]:
# Separate Features (Genes) and Target
X = data.drop('Target_Class', axis=1)
y = data['Target_Class']

# Encode the target as integer codes. LabelEncoder numbers the classes by
# their position in `le.classes_`, so for the labels "Control" and "HD" this
# yields Control -> 0, HD -> 1.
le = LabelEncoder()
y_encoded = le.fit_transform(y)
class_names = le.classes_

# Build the name -> code mapping from the encoder itself (plain Python ints,
# so the printed dict stays readable) and report the per-class sample counts
# using the real class names rather than assuming which label got which code.
class_mapping = {name: code for code, name in enumerate(class_names)}
print(f"Class Mapping: {class_mapping}")
class_counts = ", ".join(
    f"{name} Samples: {np.sum(y_encoded == code)}"
    for name, code in class_mapping.items()
)
print(f"{class_counts}\n")

# Define K-Fold Strategy (K=5 is a common choice for smaller datasets).
# Stratification keeps the class ratio similar in every fold; the fixed seed
# makes the split reproducible.
K = 5
skf = StratifiedKFold(n_splits=K, shuffle=True, random_state=42)

print(f"Total Samples: {X.shape[0]}")
print(f"Number of Features (Genes): {X.shape[1]}")
print(f"Using {K}-Fold Stratified Cross-Validation with SMOTE for balance.")

## 3. Cross-Validation and Model Training
We use **Pipelines** to ensure that **SMOTE** (Synthetic Minority Over-sampling Technique) and **Standardization** are fitted only on the training data within each fold, preventing data leakage. We evaluate using the **Matthews Correlation Coefficient (MCC)** for a balanced performance assessment.

In [None]:
# Per-fold metric stores and interpolated ROC curves for each model.
rf_metrics = {'accuracy': [], 'auc': [], 'mcc': []}
svm_metrics = {'accuracy': [], 'auc': [], 'mcc': []}
rf_tpr_list = []
svm_tpr_list = []
# Common FPR grid onto which every fold's ROC curve is interpolated so the
# curves can be averaged point-wise.
mean_fpr = np.linspace(0, 1, 100)

# Artifacts from the LAST fold, reused by the later cells (confusion matrices,
# feature importance, model export).
final_rf_pipeline = None
final_svm_pipeline = None
y_test_final = None
y_pred_rf_final = None
y_pred_svm_final = None


def _evaluate_fold(pipeline, X_tr, y_tr, X_te, y_te, metrics, tpr_list, fpr_grid):
    """Fit `pipeline` on one training split and score it on the held-out split.

    Appends accuracy, ROC-AUC and MCC to the lists in `metrics`, appends the
    fold's ROC curve (interpolated onto `fpr_grid`) to `tpr_list`, and returns
    the hard class predictions for the held-out samples.
    """
    pipeline.fit(X_tr, y_tr)
    y_pred = pipeline.predict(X_te)
    # Column 1 holds the probability of the class encoded as 1.
    y_prob = pipeline.predict_proba(X_te)[:, 1]

    metrics['accuracy'].append(accuracy_score(y_te, y_pred))
    metrics['auc'].append(roc_auc_score(y_te, y_prob))
    metrics['mcc'].append(matthews_corrcoef(y_te, y_pred))

    fpr, tpr, _ = roc_curve(y_te, y_prob)
    interp_tpr = np.interp(fpr_grid, fpr, tpr)
    interp_tpr[0] = 0.0  # pin the curve to the origin
    tpr_list.append(interp_tpr)
    return y_pred


def _mean_roc(tpr_list, fpr_grid):
    """Return the point-wise mean TPR curve and the area under that curve."""
    mean_tpr = np.mean(tpr_list, axis=0)
    mean_tpr[-1] = 1.0  # pin the curve to (1, 1)
    return mean_tpr, auc(fpr_grid, mean_tpr)


def _print_cv_summary(model_name, metrics, curve_auc):
    """Print mean (+/- std) accuracy and MCC plus both AUC summaries."""
    acc, mcc, fold_auc = metrics['accuracy'], metrics['mcc'], metrics['auc']
    print(f"{model_name} Mean Accuracy: {np.mean(acc):.4f} (+/- {np.std(acc):.4f})")
    print(f"{model_name} Mean MCC: {np.mean(mcc):.4f} (+/- {np.std(mcc):.4f})")
    # Area under the averaged ROC curve (the value shown in the ROC plot).
    print(f"{model_name} Mean ROC-AUC: {curve_auc:.4f}")
    # Mean and spread of the AUCs computed separately on each fold.
    print(f"{model_name} Per-Fold ROC-AUC: {np.mean(fold_auc):.4f} (+/- {np.std(fold_auc):.4f})")


print(f"\n--- Running {K}-Fold Cross-Validation ---")

for fold, (train_index, test_index) in enumerate(skf.split(X, y_encoded)):
    print(f"--- Fold {fold+1}/{K} ---")

    # Split data for the current fold
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y_encoded[train_index], y_encoded[test_index]

    # --- Random Forest with SMOTE Pipeline ---
    # RF does not require scaling, so the pipeline is just SMOTE -> RF.
    rf_pipeline = Pipeline([
        ('smote', SMOTE(random_state=42)),  # oversample the minority class
        ('classifier', RandomForestClassifier(n_estimators=100, random_state=42))
    ])
    y_pred_rf = _evaluate_fold(
        rf_pipeline, X_train, y_train, X_test, y_test,
        rf_metrics, rf_tpr_list, mean_fpr,
    )

    # --- SVM with Scaler and SMOTE Pipeline ---
    # SVM needs scaled inputs. Because the pipeline is fitted on X_train only,
    # the scaler statistics and the SMOTE samples come from training data alone.
    svm_pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('smote', SMOTE(random_state=42)),
        ('classifier', SVC(kernel='linear', probability=True, random_state=42))
    ])
    y_pred_svm = _evaluate_fold(
        svm_pipeline, X_train, y_train, X_test, y_test,
        svm_metrics, svm_tpr_list, mean_fpr,
    )

    # Keep this fold's pipelines and predictions; after the loop these hold
    # the values from the last fold.
    final_rf_pipeline = rf_pipeline
    final_svm_pipeline = svm_pipeline
    y_test_final = y_test
    y_pred_rf_final = y_pred_rf
    y_pred_svm_final = y_pred_svm

# Calculate Mean ROC Curves and AUC for visualization and summary
mean_tpr_rf, mean_auc_rf = _mean_roc(rf_tpr_list, mean_fpr)
mean_tpr_svm, mean_auc_svm = _mean_roc(svm_tpr_list, mean_fpr)


print("\n--- Cross-Validation Summary ---")
_print_cv_summary("Random Forest", rf_metrics, mean_auc_rf)
_print_cv_summary("SVM", svm_metrics, mean_auc_svm)

## 4. Evaluation and Visualization
Plotting the mean ROC curves from the K-Fold CV runs and presenting the Confusion Matrix and Classification Report from the last fold as an example.

In [None]:
# --- Plot Mean ROC Curves ---
plt.figure(figsize=(8, 6))
plt.plot(mean_fpr, mean_tpr_rf, label=f'Random Forest (Mean AUC = {mean_auc_rf:.2f})', lw=2)
plt.plot(mean_fpr, mean_tpr_svm, label=f'SVM (Mean AUC = {mean_auc_svm:.2f})', lw=2)
plt.plot([0, 1], [0, 1], 'k--', lw=2)  # chance-level diagonal for reference
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
# Use the configured fold count so the title stays correct if K is changed.
plt.title(f'Mean ROC Curves ({K}-Fold CV with SMOTE)')
plt.legend(loc='lower right')
plt.show()

# --- Plot Confusion Matrices (Last Fold Example) ---
cm_rf = confusion_matrix(y_test_final, y_pred_rf_final)
cm_svm = confusion_matrix(y_test_final, y_pred_svm_final)

# One entry per model: display name, last-fold predictions, matrix, colormap.
last_fold_results = [
    ('Random Forest', y_pred_rf_final, cm_rf, 'Blues'),
    ('SVM', y_pred_svm_final, cm_svm, 'Greens'),
]

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

for ax, (model_name, _, cm, cmap) in zip(axes, last_fold_results):
    sns.heatmap(cm, annot=True, fmt='d', cmap=cmap, ax=ax,
                xticklabels=class_names, yticklabels=class_names)
    ax.set_title(f'{model_name} Confusion Matrix (Last Fold)')
    ax.set_ylabel('Actual')
    ax.set_xlabel('Predicted')

plt.tight_layout()
plt.show()

# Print detailed Classification Report for the last fold
print("\n--- Detailed Classification Report (Last Fold) ---")
for model_name, y_pred, _, _ in last_fold_results:
    print(f"{model_name} MCC: {matthews_corrcoef(y_test_final, y_pred):.4f}")
    print(classification_report(y_test_final, y_pred, target_names=class_names))

## 5. Feature Importance (Biomarker Identification)
We extract the feature importance from the Random Forest model trained on the final fold, which identifies the top candidate biomarkers.

In [None]:
# Pull the fitted Random Forest out of the last-fold pipeline and read its
# per-feature importance scores.
rf_classifier = final_rf_pipeline.named_steps['classifier']
importances = rf_classifier.feature_importances_

# Rank features from most to least important and keep the first 20.
indices = np.flip(np.argsort(importances))[:20]
top_genes = X.columns[indices]
top_importances = importances[indices]

# Horizontal bar chart of the selected genes.
# NOTE(review): newer seaborn releases emit a deprecation warning when
# `palette` is given without `hue` - confirm against the installed version.
plt.figure(figsize=(10, 8))
ax = sns.barplot(x=top_importances, y=top_genes, palette='viridis')
ax.set_title('Top 20 Candidate Biomarkers (Random Forest Importance)')
ax.set_xlabel('Importance Score')
ax.set_ylabel('Gene ID')
plt.show()

# Text listing of the ten highest-ranked genes with their scores.
print("Top 10 Genes:")
ranked_pairs = list(zip(top_genes, top_importances))
for gene, score in ranked_pairs[:10]:
    print(f"{gene}: {score:.4f}")

## 6. Save and Download Models
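We save the pipelines fitted during the last cross-validation fold (the Random Forest pipeline contains the SMOTE and classifier steps; the SVM pipeline additionally contains the scaler), along with the feature names and label encoder, for future use and deployment.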

In [None]:
import shutil

# Create a directory for artifacts (no error if it already exists).
os.makedirs('hd_models', exist_ok=True)

# NOTE: the pipelines saved below are the ones fitted in the last
# cross-validation fold, i.e. on that fold's training split only.

# 1. Save Random Forest Pipeline (includes SMOTE and RF model)
joblib.dump(final_rf_pipeline, 'hd_models/rf_pipeline.pkl')

# 2. Save SVM Pipeline (includes Scaler, SMOTE, and SVM model)
joblib.dump(final_svm_pipeline, 'hd_models/svm_pipeline.pkl')

# 3. Save Feature Names (To ensure future input order matches training)
feature_names = list(X.columns)
joblib.dump(feature_names, 'hd_models/feature_names.pkl')

# 4. Save Label Encoder (To decode 0/1 back to the original class labels)
joblib.dump(le, 'hd_models/label_encoder.pkl')

print("Artifacts saved locally in 'hd_models/' folder.")

# Zip the folder into hd_models.zip, keeping the 'hd_models/' prefix inside
# the archive. The archive is rebuilt from scratch on every run.
shutil.make_archive('hd_models', 'zip', root_dir='.', base_dir='hd_models')

# Download the zip file through the Colab browser helper.
files.download('hd_models.zip')