# 🧠 Parkinson's ML Project – Full Pipeline with GridSearchCV

In [None]:
!pip install pandas numpy matplotlib seaborn scikit-learn xgboost lightgbm catboost tensorflow joblib shap plotly openpyxl

In [None]:
import os, json, joblib, shap
import pandas as pd, numpy as np, matplotlib.pyplot as plt, seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, learning_curve
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, roc_curve, precision_recall_curve
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.neural_network import MLPClassifier
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import plotly.graph_objects as go

In [None]:
# Create the output folders used by the rest of the notebook.
# os.makedirs is used instead of a shell `mkdir -p dir/{a,b,...}` because
# brace expansion is a shell-specific feature: where the shell does not
# support it, the command does not create the four folders. exist_ok=True
# keeps the `-p` behaviour of not failing when a folder already exists.
for _subdir in ("data", "models", "assets", "eda"):
    os.makedirs(os.path.join("parkinsons_project", _subdir), exist_ok=True)

In [None]:
# Load the dataset from the UCI repository, tidy the column names, keep a
# local CSV copy, and split the frame into features (X) and target (y).
uci_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/parkinsons/parkinsons.data"
df = pd.read_csv(uci_url)

# The "name" column is not used as a feature; drop it only when it exists.
df = df.drop(columns=["name"], errors="ignore")

# Collapse every run of characters outside [A-Za-z0-9_] into a single
# underscore, then trim underscores left at either end of the name.
_sanitized = df.columns.str.replace("[^A-Za-z0-9_]+", "_", regex=True)
df.columns = _sanitized.str.strip("_")

df.to_csv("parkinsons_project/data/parkinsons.csv", index=False)

y = df["status"]
X = df.drop(columns="status")

# Last expression of the cell: shows the first rows in the notebook output.
df.head()

In [None]:
# Write three small EDA tables (CSV) into the project's eda folder.
stats_dir = "parkinsons_project/eda"

# Descriptive statistics, transposed so each original column becomes a row.
summary_table = df.describe().T
summary_table.to_csv(os.path.join(stats_dir, "summary_stats.csv"))

# Class counts with the 0/1 index replaced by readable labels.
class_table = y.value_counts().rename({0:"Healthy",1:"Parkinson’s"})
class_table.to_csv(os.path.join(stats_dir, "target_distribution.csv"))

# Absolute correlation of every column with the target, strongest first.
target_corr = df.corr()["status"].abs()
target_corr = target_corr.sort_values(ascending=False)
target_corr.to_csv(os.path.join(stats_dir, "correlation_with_target.csv"))

print("EDA stats saved")

In [None]:
# Exploratory plots saved as PNG files in stats_dir: class balance,
# correlation heatmap, and 2-D PCA / t-SNE projections of the features.
sns.set_theme(style="whitegrid", palette="muted")

# Target distribution
# value_counts() orders its result by frequency, not by class label, so the
# counts are re-indexed as [0, 1] to guarantee that the fixed labels below
# ("Healthy" for 0, "Parkinson’s" for 1) are attached to the right wedges.
class_counts = y.value_counts().reindex([0, 1], fill_value=0)
fig, ax = plt.subplots(1, 2, figsize=(12, 5))
# Passing `palette` without `hue` is deprecated in seaborn; assigning the x
# variable to hue and disabling the legend is the documented replacement.
sns.countplot(x=y, hue=y, palette="Set2", legend=False, ax=ax[0])
ax[0].set_title("Target Distribution (Count)")
ax[1].pie(
    class_counts,
    labels=["Healthy", "Parkinson’s"],
    autopct="%1.1f%%",
    colors=["#66c2a5", "#fc8d62"],
)
plt.savefig(os.path.join(stats_dir, "target_distribution_combo.png"))
plt.close()

# Heatmap
plt.figure(figsize=(12, 8))
sns.heatmap(df.corr(), cmap="coolwarm", center=0)
plt.title("Correlation Heatmap")
plt.savefig(os.path.join(stats_dir, "corr_heatmap.png"))
plt.close()

# PCA
# PCA is variance-based, so features are standardized first; otherwise the
# columns with the largest numeric range dominate the projection. This also
# matches the StandardScaler step used by the scaled model pipelines.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(StandardScaler().fit_transform(X))
plt.scatter(X_pca[:, 0], X_pca[:, 1], c=y, cmap="Set2")
plt.title("PCA Projection")
plt.savefig(os.path.join(stats_dir, "pca.png"))
plt.close()

# t-SNE
# At most 300 rows are embedded; y_sample is aligned through the sampled index.
X_sample = X.sample(min(300, len(X)), random_state=42)
y_sample = y.loc[X_sample.index]
tsne = TSNE(n_components=2, random_state=42, perplexity=30, max_iter=500)
# t-SNE works on pairwise distances, so it gets standardized inputs as well.
X_tsne = tsne.fit_transform(StandardScaler().fit_transform(X_sample))
plt.scatter(X_tsne[:, 0], X_tsne[:, 1], c=y_sample, cmap="Set2")
plt.title("t-SNE Projection (sample)")
plt.savefig(os.path.join(stats_dir, "tsne.png"))
plt.close()

print("EDA plots saved")

In [None]:
# Hold out 20% of the rows as a test set. Stratifying on y keeps the class
# proportions the same in both parts; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [None]:
# Hyper-parameter search spaces, keyed by model name. Each inner dict maps an
# estimator parameter to the list of candidate values to try.
_random_forest_grid = {
    "n_estimators": [100, 200, 300],
    "max_depth": [None, 10, 20],
    "min_samples_split": [2, 5],
    "min_samples_leaf": [1, 2],
}

_xgboost_grid = {
    "n_estimators": [100, 200, 300],
    "learning_rate": [0.01, 0.1, 0.2],
    "max_depth": [3, 5, 7],
    "subsample": [0.8, 1.0],
    "colsample_bytree": [0.8, 1.0],
}

param_grids = {
    "RandomForest": _random_forest_grid,
    "XGBoost": _xgboost_grid,
}

In [None]:
def _scaled(estimator):
    """Return a Pipeline that standardizes the features before *estimator*.

    The step names ("scaler", "clf") are the ones every scaled model in
    this notebook uses.
    """
    return Pipeline([("scaler", StandardScaler()), ("clf", estimator)])


# Candidate classifiers, keyed by display name. The linear, SVM, KNN and
# neural-network models are wrapped with a StandardScaler step; the
# tree-based models are used on the raw features.
models = {
    "LogisticRegression": _scaled(LogisticRegression(max_iter=500)),
    "RandomForest": RandomForestClassifier(random_state=42),
    # random_state is pinned like every other seedable model here: with
    # probability=True the SVC's probability estimates involve randomness,
    # so without a seed its predict_proba output (and therefore its
    # ROC AUC) can differ between runs.
    "SVM": _scaled(SVC(probability=True, kernel="rbf", random_state=42)),
    "KNN": _scaled(KNeighborsClassifier(n_neighbors=5)),
    "XGBoost": xgb.XGBClassifier(eval_metric="logloss", random_state=42),
    "LightGBM": lgb.LGBMClassifier(random_state=42),
    "CatBoost": CatBoostClassifier(verbose=0, random_state=42),
    "NeuralNet": _scaled(
        MLPClassifier(hidden_layer_sizes=(64, 32), max_iter=500, random_state=42)
    ),
}

In [None]:
# Fit every candidate model and score it on the held-out test set.
#   results        : model name -> dict of test metrics
#   trained_models : model name -> fitted estimator
results = {}
trained_models = {}

# Metrics computed from hard class predictions; ROC AUC is handled separately
# because it is computed from the positive-class probability.
label_metrics = {
    "accuracy": accuracy_score,
    "precision": precision_score,
    "recall": recall_score,
    "f1": f1_score,
}

for name, model in models.items():
    if name not in param_grids:
        # No search space registered for this model: fit it as configured.
        best_model = model.fit(X_train, y_train)
    else:
        # 5-fold grid search over the registered space, selecting by ROC AUC.
        print(f"Running GridSearchCV for {name}...")
        grid = GridSearchCV(
            estimator=model,
            param_grid=param_grids[name],
            scoring="roc_auc",
            cv=5,
            n_jobs=-1,
        )
        grid.fit(X_train, y_train)
        best_model = grid.best_estimator_
        print("Best params for", name, ":", grid.best_params_)
    trained_models[name] = best_model

    y_pred = best_model.predict(X_test)
    y_proba = best_model.predict_proba(X_test)[:, 1]

    scores = {"roc_auc": roc_auc_score(y_test, y_proba)}
    for metric_name, metric_fn in label_metrics.items():
        scores[metric_name] = metric_fn(y_test, y_pred)
    results[name] = scores

# Last expression of the cell: one row per model, one column per metric.
pd.DataFrame(results).T

In [None]:
# Select the model with the highest test ROC AUC, persist it with joblib,
# and write the metrics of all models to a JSON file.
best_name, best_scores = max(
    results.items(),
    key=lambda entry: entry[1]["roc_auc"],
)
best_model = trained_models[best_name]

joblib.dump(best_model, "parkinsons_project/models/best_model.joblib")

with open("parkinsons_project/assets/metrics.json", "w") as metrics_file:
    metrics_file.write(json.dumps(results, indent=4))

print("Best model:", best_name, best_scores)

In [None]:
# Diagnostic plots for the selected model, each saved as a PNG in the assets
# folder: confusion matrix, ROC curve, and precision-recall curve.

# Confusion matrix of the test-set predictions, drawn as a colour map.
cm = confusion_matrix(y_test, best_model.predict(X_test))
cm_fig, cm_ax = plt.subplots()
cm_image = cm_ax.imshow(cm, cmap="Blues")
cm_ax.set_title("Confusion Matrix")
cm_fig.colorbar(cm_image, ax=cm_ax)
cm_fig.savefig("parkinsons_project/assets/confusion_matrix.png")
plt.close(cm_fig)

# Positive-class probabilities feed both threshold curves below.
y_proba = best_model.predict_proba(X_test)[:,1]

# ROC curve with the diagonal drawn as a dashed black line.
fpr, tpr, _ = roc_curve(y_test, y_proba)
roc_fig, roc_ax = plt.subplots()
roc_ax.plot(fpr, tpr, label=f"{best_name} (AUC={results[best_name]['roc_auc']:.2f})")
roc_ax.plot([0,1],[0,1],'k--')
roc_ax.legend()
roc_fig.savefig("parkinsons_project/assets/roc_curve.png")
plt.close(roc_fig)

# Precision-recall curve (recall on x, precision on y).
prec, rec, _ = precision_recall_curve(y_test, y_proba)
pr_fig, pr_ax = plt.subplots()
pr_ax.plot(rec, prec, label=best_name)
pr_ax.legend()
pr_fig.savefig("parkinsons_project/assets/pr_curve.png")
plt.close(pr_fig)

print("Plots saved")

In [None]:
# Best-effort SHAP bar summary for the selected model. Any failure is
# reported and the notebook carries on, so the broad except is deliberate.
try:
    explainer = shap.Explainer(best_model, X_train)
    shap_values = explainer(X_test)
    # show=False keeps the figure open so it can be written to disk.
    shap.summary_plot(shap_values, X_test, plot_type="bar", show=False)
    # bbox_inches="tight" sizes the saved image to the drawn content, which
    # guards against long feature-name labels being cut off.
    plt.savefig("parkinsons_project/assets/shap_summary.png", bbox_inches="tight")
except Exception as e:
    print("SHAP failed:", e)
finally:
    # Close the figure on the failure path too; otherwise a half-drawn SHAP
    # figure would stay current and later pyplot calls would draw onto it.
    # With no open figure this call does nothing.
    plt.close()

In [None]:
import shutil

# Zip the contents of the project folder; make_archive appends the ".zip"
# extension to base_name itself.
shutil.make_archive(
    base_name="parkinsons_project",
    format="zip",
    root_dir="parkinsons_project",
)
print("ZIP created")