Traditional Machine Learning for SV Classification

This notebook trains Random Forest, XGBoost, and Logistic Regression models
on 15 genomic features to classify structural variant (SV) calls as true positives (TP) or false positives (FP).

Input:
- A CSV file of computed features with a `label` column (`../data/processed/SV_Features_Dataset.csv`)

Output:
- Model performance comparison
- Feature importance analysis
- Classification results

Models tested:
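- Random Forest, XGBoost, and Logistic Regression, each evaluated on all features, on the top-k features chosen by univariate selection (SelectKBest, k = 6/8/10), and on PCA components (6/8/10)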

In [None]:
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score, precision_recall_curve, confusion_matrix
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
warnings.filterwarnings('ignore')

In [None]:
# Read the table of computed features (one row per call, including a 'label' column)
df = pd.read_csv('../data/processed/SV_Features_Dataset.csv')

# Names of the columns used as model inputs; their order fixes the column order of X
FEATURES = [
    'log_svlen',
    'depth_ratio',
    'depth_mad',
    'ab',
    'cn_slop',
    'mq_drop',
    'clip_frac',
    'split_reads',
    'read_len_med',
    'strand_bias',
    'gc_frac',
    'homopolymer_max',
    'lcr_mask',
    'support_read',
    'svtype_DEL',
]

# Feature matrix: missing entries are replaced by the median of their column.
# NOTE(review): the medians are computed over every row of df, so if the data is
# split into train/test afterwards, test rows contribute to the fill values.
feature_frame = df[FEATURES]
X = feature_frame.fillna(feature_frame.median())

# Binary target: 1 where the label is exactly 'TP', 0 for every other value
y = df['label'].eq('TP').astype(int)

# Quick sanity summary of what was loaded
print(f"Data shape: {X.shape}")
print(f"Class balance: {y.mean():.3f} TP, {1-y.mean():.3f} FP")
print(f"Features: {len(FEATURES)}")

In [None]:
# Split data, then evaluate every model on the full feature set.
#
# Leakage fixes relative to the naive version:
#   * Missing values are imputed with medians learned from the TRAINING rows
#     only (the split is done on the raw df[FEATURES], using the same split
#     parameters), so test rows never influence the fill values.
#   * For Logistic Regression, cross-validation runs on a scaler+model pipeline
#     so the scaler is re-fit inside each fold instead of on all training rows.
X_train, X_test, y_train, y_test = train_test_split(
    df[FEATURES], y, test_size=0.2, random_state=42, stratify=y
)

# Impute with training-set medians only
train_medians = X_train.median()
X_train = X_train.fillna(train_medians)
X_test = X_test.fillna(train_medians)

# Scale features (statistics from the training rows only).
# The scaled arrays are kept as notebook-level variables because the
# feature-selection and PCA cells reuse them.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Define models (shared with the later feature-selection / PCA cells)
models = {
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'XGBoost': xgb.XGBClassifier(random_state=42, eval_metric='logloss'),
    'Logistic Regression': LogisticRegression(random_state=42)
}

# One dict per (model, feature-set) combination; later cells append to this list
results = []

print("All Features Results:")
print(f"{'Model':<20} {'CV AUC':<10} {'Test AUC':<10} {'Test Precision':<15} {'Test Recall':<12} {'Test F1':<10}")
print("-" * 80)

for name, model in models.items():
    # Use scaled data for LR, unscaled for tree methods
    if 'Regression' in name:
        X_train_use = X_train_scaled
        X_test_use = X_test_scaled
        # For CV, scale inside each fold; cross_val_score clones the pipeline,
        # so the shared `model` object is not fitted by this call.
        cv_estimator = make_pipeline(StandardScaler(), model)
    else:
        X_train_use = X_train
        X_test_use = X_test
        cv_estimator = model

    # Cross-validation on the unscaled training rows (scaling, where needed,
    # happens inside the pipeline per fold)
    cv_scores = cross_val_score(cv_estimator, X_train, y_train, cv=StratifiedKFold(5), scoring='roc_auc')

    # Fit on the full training set and predict on the held-out test set
    model.fit(X_train_use, y_train)
    y_pred = model.predict(X_test_use)
    y_pred_proba = model.predict_proba(X_test_use)[:, 1]

    # Metrics for the positive class (label 1 == 'TP')
    test_auc = roc_auc_score(y_test, y_pred_proba)
    report = classification_report(y_test, y_pred, output_dict=True)

    precision = report['1']['precision']
    recall = report['1']['recall']
    f1 = report['1']['f1-score']

    results.append({
        'Model': f'{name} (All)',
        'Features': len(FEATURES),
        'CV_AUC': cv_scores.mean(),
        'CV_Std': cv_scores.std(),
        'Test_AUC': test_auc,
        'Test_Precision': precision,
        'Test_Recall': recall,
        'Test_F1': f1
    })

    print(f"{name:<20} {cv_scores.mean():<10.3f} {test_auc:<10.3f} {precision:<15.3f} {recall:<12.3f} {f1:<10.3f}")

In [None]:
# Test different numbers of top features (univariate ANOVA F-test selection).
#
# The selector uses the labels, so fitting it on the whole training set and
# then cross-validating on the already-selected columns lets every validation
# fold influence which features were chosen, which inflates CV AUC.
# Cross-validation therefore runs on a scaler -> selector -> model pipeline
# that is re-fit inside each fold. The test-set evaluation is unchanged.
for k in [6, 8, 10]:
    if k > len(FEATURES):
        continue

    print(f"\nTop {k} Features Results:")

    # Select top k features on the full training set (used for the final fit
    # and for reporting which features were chosen)
    selector = SelectKBest(score_func=f_classif, k=k)
    X_train_selected = selector.fit_transform(X_train_scaled, y_train)
    X_test_selected = selector.transform(X_test_scaled)

    # Get selected feature names
    selected_features = [FEATURES[i] for i in selector.get_support(indices=True)]
    print(f"Selected features: {selected_features}")

    print(f"{'Model':<20} {'CV AUC':<10} {'Test AUC':<10} {'Test Precision':<15} {'Test Recall':<12} {'Test F1':<10}")
    print("-" * 80)

    for name, model in models.items():
        # Leak-free CV: scaling and feature selection are learned per fold.
        # cross_val_score clones the pipeline, so `model` itself is untouched.
        cv_pipeline = make_pipeline(
            StandardScaler(),
            SelectKBest(score_func=f_classif, k=k),
            model,
        )
        cv_scores = cross_val_score(cv_pipeline, X_train, y_train, cv=StratifiedKFold(5), scoring='roc_auc')

        # Final fit on the selected (scaled) training features, evaluated on test
        model.fit(X_train_selected, y_train)
        y_pred = model.predict(X_test_selected)
        y_pred_proba = model.predict_proba(X_test_selected)[:, 1]

        test_auc = roc_auc_score(y_test, y_pred_proba)
        report = classification_report(y_test, y_pred, output_dict=True)

        # Metrics for the positive class (label 1)
        precision = report['1']['precision']
        recall = report['1']['recall']
        f1 = report['1']['f1-score']

        results.append({
            'Model': f'{name} (Top {k})',
            'Features': k,
            'CV_AUC': cv_scores.mean(),
            'CV_Std': cv_scores.std(),
            'Test_AUC': test_auc,
            'Test_Precision': precision,
            'Test_Recall': recall,
            'Test_F1': f1
        })

        print(f"{name:<20} {cv_scores.mean():<10.3f} {test_auc:<10.3f} {precision:<15.3f} {recall:<12.3f} {f1:<10.3f}")

In [None]:
# Test PCA with different numbers of components.
#
# Changes relative to the naive version:
#   * PCA gets an explicit random_state so results are reproducible even when
#     scikit-learn picks a randomized SVD solver.
#   * Cross-validation runs on a scaler -> PCA -> model pipeline, so the
#     projection is learned inside each fold rather than on all training rows.
# The test-set evaluation path is unchanged.
for n_components in [6, 8, 10]:
    if n_components > len(FEATURES):
        continue

    print(f"\nPCA {n_components} Components Results:")

    # Projection learned on the full (scaled) training set; used for the final fit
    pca = PCA(n_components=n_components, random_state=42)
    X_train_pca = pca.fit_transform(X_train_scaled)
    X_test_pca = pca.transform(X_test_scaled)

    print(f"Explained variance ratio: {pca.explained_variance_ratio_.sum():.3f}")

    print(f"{'Model':<20} {'CV AUC':<10} {'Test AUC':<10} {'Test Precision':<15} {'Test Recall':<12} {'Test F1':<10}")
    print("-" * 80)

    for name, model in models.items():
        # Leak-free CV: scaler and PCA are re-fit per fold.
        # cross_val_score clones the pipeline, so `model` itself is untouched.
        cv_pipeline = make_pipeline(
            StandardScaler(),
            PCA(n_components=n_components, random_state=42),
            model,
        )
        cv_scores = cross_val_score(cv_pipeline, X_train, y_train, cv=StratifiedKFold(5), scoring='roc_auc')

        # Final fit on the projected training data, evaluated on the test set
        model.fit(X_train_pca, y_train)
        y_pred = model.predict(X_test_pca)
        y_pred_proba = model.predict_proba(X_test_pca)[:, 1]

        test_auc = roc_auc_score(y_test, y_pred_proba)
        report = classification_report(y_test, y_pred, output_dict=True)

        # Metrics for the positive class (label 1)
        precision = report['1']['precision']
        recall = report['1']['recall']
        f1 = report['1']['f1-score']

        results.append({
            'Model': f'{name} (PCA {n_components})',
            'Features': n_components,
            'CV_AUC': cv_scores.mean(),
            'CV_Std': cv_scores.std(),
            'Test_AUC': test_auc,
            'Test_Precision': precision,
            'Test_Recall': recall,
            'Test_F1': f1
        })

        print(f"{name:<20} {cv_scores.mean():<10.3f} {test_auc:<10.3f} {precision:<15.3f} {recall:<12.3f} {f1:<10.3f}")

In [None]:
# Collect every evaluation record into one table, highest Test F1 first.
# NOTE(review): ranking and the "best model" below are chosen on the held-out
# test split, so the reported test scores of the winner are optimistic;
# CV_AUC is the selection-safe column.
results_df = pd.DataFrame(results).sort_values('Test_F1', ascending=False)

print("\nFinal Results Ranking (by Test F1):")
print(f"{'Rank':<4} {'Model':<25} {'Test F1':<8} {'Test AUC':<10} {'CV AUC':<10} {'Features':<8}")
print("-" * 70)

# Show at most the ten top-ranked configurations, numbered from 1
top_rows = results_df.head(10)
rank = 0
for _, row in top_rows.iterrows():
    rank += 1
    print(f"{rank:<4} {row['Model']:<25} {row['Test_F1']:<8.3f} {row['Test_AUC']:<10.3f} {row['CV_AUC']:<10.3f} {row['Features']:<8}")

# First row after sorting == configuration with the highest Test F1
best = results_df.iloc[0]
for line in (
    f"\nBest Model: {best['Model']}",
    f"Test F1: {best['Test_F1']:.3f}",
    f"Test AUC: {best['Test_AUC']:.3f}",
    f"CV AUC: {best['CV_AUC']:.3f} ± {best['CV_Std']:.3f}",
    f"Features: {best['Features']}",
):
    print(line)

# Persist the full (sorted) table, without the index column
results_df.to_csv('../data/processed/traditional_ml_results.csv', index=False)

In [None]:
# Refit a Random Forest on all (unscaled) training features and report its
# impurity-based feature importances. This uses the same hyperparameters as
# the 'Random Forest' entry of `models`; it is not selected from the ranking.
best_rf = RandomForestClassifier(n_estimators=100, random_state=42)
best_rf.fit(X_train, y_train)

# Feature importance, most important first (FEATURES order matches X_train columns)
importance_df = pd.DataFrame({
    'feature': FEATURES,
    'importance': best_rf.feature_importances_
}).sort_values('importance', ascending=False)

print("\nFeature Importance (Random Forest):")
for _, row in importance_df.head(10).iterrows():
    print(f"{row['feature']:<18} {row['importance']:.4f}")

# Output locations; create the directories up front so saving cannot fail
# with FileNotFoundError on a fresh checkout
figure_path = '../figures/feature_importance.png'
importance_csv_path = '../data/processed/feature_importance.csv'
for out_path in (figure_path, importance_csv_path):
    os.makedirs(os.path.dirname(out_path), exist_ok=True)

# Plot feature importance (horizontal bars, most important feature at the top)
fig, ax = plt.subplots(figsize=(10, 8))
positions = range(len(importance_df))
ax.barh(positions, importance_df['importance'])
ax.set_yticks(positions)
ax.set_yticklabels(importance_df['feature'])
ax.set_xlabel('Feature Importance')
ax.set_title('Random Forest Feature Importance')
ax.invert_yaxis()
fig.tight_layout()
fig.savefig(figure_path, dpi=300, bbox_inches='tight')
plt.show()

# Save feature importance
importance_df.to_csv(importance_csv_path, index=False)

print("\nTraditional ML analysis complete.")
print("Results saved to:")
print("  - ../data/processed/traditional_ml_results.csv")
print("  - ../data/processed/feature_importance.csv")
print("  - ../figures/feature_importance.png")