Let's build a sample pipeline for gene-expression-based classification of Parkinson’s Disease, using simulated data shaped like a GEO expression dataset.
We’ll simulate the pipeline as if you have already:
- Downloaded and preprocessed the gene expression data
- Built a matrix X (samples × genes, shape: 100 × 10000)
- Built a label vector y (0 = control, 1 = PD)

We’ll go with:
- MRMR for initial filtering
- RFE for fine-tuning feature selection
- SVM for classification
- Evaluate with cross-validation

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.svm import SVC
from sklearn.feature_selection import RFE
import pymrmr

In [2]:
# Simulated example: uniform random values stand in for a preprocessed
# expression matrix (100 samples x 10000 genes) with binary labels.
# Seed NumPy's global RNG so every run produces the same X and y.
np.random.seed(42)
X = pd.DataFrame(np.random.rand(100, 10000), columns=[f'gene{i}' for i in range(10000)])
y = pd.Series(np.random.randint(0, 2, size=100), name='label')

# Build the mRMR input frame. pymrmr expects
#   (a) the class label in the FIRST column, and
#   (b) discretised feature values,
# so each gene is binned into three states relative to its own mean +/- one
# standard deviation: 0 = low, 1 = middle, 2 = high.
expr = X.to_numpy()
gene_mean = expr.mean(axis=0)
gene_std = expr.std(axis=0)
low_cut = gene_mean - gene_std
high_cut = gene_mean + gene_std
states = (expr > low_cut).astype(int) + (expr > high_cut).astype(int)

df = pd.DataFrame(states, columns=X.columns, index=X.index)
df.insert(0, 'label', y)  # label must lead the frame, not trail it

In [None]:
# MRMR Feature Filtering (Top 200 Genes)
# pymrmr treats the first column of the frame as the class variable, so pass it
# a label-first column order regardless of how df was assembled.
# NOTE(review): pymrmr's documentation also asks for discretised feature
# values -- confirm df has been binned before relying on the ranking.
mrmr_columns = ['label'] + [col for col in df.columns if col != 'label']
top_genes = pymrmr.mRMR(df[mrmr_columns], 'MIQ', 200)
X_mrmr = X[top_genes]

# RFE with SVM to Select Final 30 Genes
# step is the number of genes dropped per iteration. A step larger than the
# surplus (200 - 30 = 170) removes everything in a single pass, so use a small
# step to get an actual recursive elimination.
svm = SVC(kernel='linear')
rfe = RFE(estimator=svm, n_features_to_select=30, step=10)
rfe.fit(X_mrmr, y)

# Get reduced feature set (rfe.support_ is a boolean mask over X_mrmr's columns)
X_rfe = X_mrmr.loc[:, rfe.support_]

# NOTE: both filters above are fitted on every sample, labels included.
# Cross-validating a model on X_rfe afterwards therefore gives an
# optimistically biased score; refit the selection inside each training fold
# for an honest estimate.

In [None]:
# Train & Evaluate Final Model
# 5-fold cross-validation
from sklearn.pipeline import Pipeline

# Keep RFE inside the pipeline so the 30-gene subset is re-selected from the
# training part of every fold. Scoring an SVM on a gene set that was chosen
# using all samples leaks the held-out labels into the model and inflates
# the reported accuracy.
final_model = Pipeline([
    ('rfe', RFE(estimator=SVC(kernel='linear'), n_features_to_select=30, step=10)),
    ('svm', SVC(kernel='linear')),
])
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# NOTE: X_mrmr was itself filtered with mRMR on the full dataset, so some
# selection bias remains. For a fully unbiased estimate the mRMR step must
# also be fitted per fold.
scores = cross_val_score(final_model, X_mrmr, y, cv=cv, scoring='accuracy')
print(f"Mean Accuracy: {scores.mean():.4f} ± {scores.std():.4f}")


In [None]:
# Optional: L1-penalised logistic regression as the gene filter, then a linear SVM
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectFromModel
from sklearn.pipeline import Pipeline

# C is the inverse regularisation strength. A very small C (e.g. 0.01) can
# shrink every coefficient to zero, leaving no genes for the SVM.
# Treat this value as a hyperparameter to tune.
lasso = LogisticRegression(penalty='l1', solver='liblinear', C=1.0)

# threshold=-np.inf together with max_features keeps the max_features genes
# with the largest |coefficient|, so a fold can never end up with an empty
# feature set.
selector = SelectFromModel(lasso, threshold=-np.inf, max_features=30)

# Selection sits inside the pipeline so it is refit on each training fold.
# Fitting it once on all of X before cross-validating would leak the
# held-out labels into the chosen gene set.
lasso_svm = Pipeline([
    ('select', selector),
    ('svm', SVC(kernel='linear')),
])

scores = cross_val_score(lasso_svm, X, y, cv=cv)
print(f"LASSO + SVM Accuracy: {scores.mean():.4f}")

# Refit on all samples only to expose the final gene subset. This is for
# inspection and is not used in the score above (cross_val_score works on clones).
X_lasso = selector.fit(X, y).transform(X)


In [None]:
# Alternative to mRMR: univariate filter scored by mutual information

from sklearn.feature_selection import SelectKBest, mutual_info_classif

# Fit the filter first, then project X onto the 200 best-scoring genes.
selector = SelectKBest(mutual_info_classif, k=200).fit(X, y)
X_selected = selector.transform(X)

# Boolean mask over X's columns marking the genes that were kept.
kept_mask = selector.get_support()
selected_gene_names = X.columns[kept_mask]
