In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, learning_curve
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             f1_score, roc_auc_score, confusion_matrix,
                             classification_report)
import time
import psutil
import os
import pickle
import warnings
warnings.filterwarnings('ignore')

# Configuration
# Module-level settings read by the functions defined in later cells.
# CSV file handed to load_data() by the main block.
DATASET_PATH = "parkinsons.data"
# Name of the label column; validate_target() prints 0 as "Healthy" and 1 as "Parkinson's".
TARGET_COLUMN = "status"
# Directory that receives the plots, CSV reports and pickles written by this notebook.
OUTPUT_DIR = "parkinsons_outputs"
# Create the output directory up front; exist_ok=True makes re-running this cell harmless.
os.makedirs(OUTPUT_DIR, exist_ok=True)

print("PARKINSON'S DISEASE CLASSIFICATION - COMPLETE PIPELINE\n")


# SECTION 1: DATA LOADING
def load_data(path):
    """Read the CSV at ``path`` and print a short overview of it.

    Prints the frame's shape, its column count (labelled "Features"), a tally
    of column dtypes and the first five rows.

    Args:
        path: Location of the CSV file, passed straight to ``pd.read_csv``.

    Returns:
        The loaded ``pandas.DataFrame``, unmodified.
    """
    frame = pd.read_csv(path)
    dims = frame.shape

    print("Dataset loaded successfully")
    print(f"Shape: {dims} (rows x columns)")
    print(f"Features: {dims[1]}")

    print("\nData Types:")
    print(frame.dtypes.value_counts())

    print("\nFirst 5 rows:")
    print(frame.head())

    return frame


PARKINSON'S DISEASE CLASSIFICATION - COMPLETE PIPELINE



In [21]:
# SECTION 2: DATA CLEANING WITH FEATURE SELECTION
def clean_data(df, target=None, top_k=15):
    """Clean the frame and keep the features most correlated with the target.

    Steps, in order:
      1. drop the ``name`` identifier column when present;
      2. drop exact duplicate rows;
      3. fill missing values in numeric feature columns with the column median
         (the target column is never filled);
      4. rank numeric features by absolute Pearson correlation with the target
         and keep the ``top_k`` strongest.

    The input frame is not modified.

    NOTE(review): the correlation ranking uses every row passed in. If this is
    called before the train/test split, test rows influence which features are
    selected - confirm whether that is acceptable for the reported metrics.

    Args:
        df: Input DataFrame containing the target column.
        target: Name of the target column. Defaults to the module-level
            ``TARGET_COLUMN``.
        top_k: Maximum number of features to keep (default 15). Fewer are kept
            when the frame has fewer usable features.

    Returns:
        DataFrame with the selected feature columns (strongest first) followed
        by the target column.

    Raises:
        KeyError: if ``target`` is not a column of ``df``.
        ValueError: if ``top_k`` is smaller than 1.
    """
    if target is None:
        target = TARGET_COLUMN
    if top_k < 1:
        raise ValueError(f"top_k must be at least 1, got {top_k}")

    print("\n2. DATA CLEANING WITH FEATURE SELECTION\n")

    print(f"Before cleaning: {df.shape}")

    # Drop identifier column
    if 'name' in df.columns:
        df = df.drop(columns=['name'])
        print("Removed 'name' column (identifier)")

    if target not in df.columns:
        raise KeyError(f"Target column '{target}' not found in dataset.")

    # Remove duplicates
    original_rows = len(df)
    df = df.drop_duplicates()
    print(f"Removed {original_rows - len(df)} duplicate rows")

    # Handle missing values
    missing = df.isnull().sum()
    if missing.sum() > 0:
        print("\nMissing values found:")
        print(missing[missing > 0])

        # "number" covers every numeric dtype, not just int64/float64.
        numeric_cols = [col for col in df.select_dtypes(include="number").columns
                        if col != target]
        # fillna with a Series fills each column with its own median and
        # returns a new frame, so the caller's data is left untouched.
        df = df.fillna(df[numeric_cols].median())
        print("Filled missing values with median")
    else:
        print("No missing values found")

    print(f"\nAfter basic cleaning: {df.shape}")

    print("CORRELATION-BASED FEATURE SELECTION")

    original_feature_count = df.shape[1] - 1  # every column except the target

    # Calculate correlations with target
    print("\nCalculating correlations with target variable...")
    # Restrict to numeric columns: DataFrame.corr() raises on non-numeric
    # columns in recent pandas versions.
    numeric_df = df.select_dtypes(include="number")
    correlations = numeric_df.corr()[target].abs().sort_values(ascending=False)

    # Remove the target itself and features whose correlation is undefined
    # (e.g. constant columns), then keep the strongest top_k.
    correlations = correlations[correlations.index != target].dropna()
    selected_features = correlations.head(top_k).index.tolist()
    n_selected = len(selected_features)

    print(f"\nTop {n_selected} Features by Correlation with '{target}':")
    print("-" * 60)
    for idx, feat in enumerate(selected_features, 1):
        corr_val = correlations[feat]
        print(f"   {idx:2d}. {feat:30s} | Correlation: {corr_val:.4f}")

    # Create final dataframe with selected features
    df_final = df[selected_features + [target]]

    removed = original_feature_count - n_selected
    removed_pct = removed / original_feature_count * 100 if original_feature_count else 0.0

    print("FEATURE SELECTION SUMMARY")

    print(f"Original features:  {original_feature_count}")
    print(f"Selected features:  {n_selected}")
    print(f"Reduction:  {removed} features removed ({removed_pct:.1f}%)")
    print(f"\nFinal shape: {df_final.shape}")
    print(f"Final features: {df_final.shape[1] - 1} (excluding target)")

    return df_final

In [16]:
# SECTION 3: EXPLORATORY DATA ANALYSIS
def run_eda(df, target_col='status'):
    """Write exploratory plots to ``{OUTPUT_DIR}/eda_plots`` and print summaries.

    Produces, at 300 dpi:
      * ``01_target_distribution.png`` - bar chart of the target classes;
      * ``02_ppe_scatterplot.png`` - PPE by row index, coloured by class
        (only when a ``PPE`` column exists), plus per-class PPE mean/std;
      * ``03_feature_distributions.png`` - histograms of six key features;
      * ``04_features_vs_target.png`` - per-class box plots of the same features;
      * ``05_correlation_heatmap.png`` - correlations among the target and the
        numeric columns most correlated with it (only when the target is numeric).

    Key features that are absent from ``df`` have their panel hidden.

    Args:
        df: DataFrame containing ``target_col``; the plots treat the target
            values 0 and 1 as "Healthy" and "Parkinson's".
        target_col: Name of the target column.

    Returns:
        None. All results are written to disk or printed.
    """
    print("\n3. EXPLORATORY DATA ANALYSIS")

    eda_dir = f"{OUTPUT_DIR}/eda_plots"
    os.makedirs(eda_dir, exist_ok=True)

    # Target distribution
    plt.figure(figsize=(10, 6))
    target_counts = df[target_col].value_counts().sort_index()
    colors = ['#3498db', '#e74c3c']
    bars = plt.bar(target_counts.index, target_counts.values, color=colors,
                   edgecolor='black', width=0.6)
    plt.title('Parkinson\'s Disease Status Distribution', fontsize=14, fontweight='bold')
    plt.xlabel('Status (0=Healthy, 1=Parkinson\'s)')
    plt.ylabel('Count')
    plt.xticks([0, 1], ['Healthy', 'Parkinson\'s'])
    for bar, (_, v) in zip(bars, target_counts.items()):
        # Place the count/percentage label slightly above each bar.
        plt.text(bar.get_x() + bar.get_width()/2, v + len(df)*0.01,
                f'{v}\n({v/len(df)*100:.1f}%)',
                ha='center', fontweight='bold')
    plt.tight_layout()
    plt.savefig(f"{eda_dir}/01_target_distribution.png", dpi=300, bbox_inches='tight')
    plt.close()
    print("Saved: 01_target_distribution.png")

    # PPE Scatterplot (Most Important Feature)
    if 'PPE' in df.columns:
        plt.figure(figsize=(12, 6))

        # Create scatterplot with index on x-axis
        healthy = df[df[target_col] == 0]
        parkinsons = df[df[target_col] == 1]

        plt.scatter(healthy.index, healthy['PPE'], c='#3498db', label='Healthy',
                   alpha=0.6, s=50, edgecolors='black', linewidth=0.5)
        plt.scatter(parkinsons.index, parkinsons['PPE'], c='#e74c3c', label='Parkinson\'s',
                   alpha=0.6, s=50, edgecolors='black', linewidth=0.5)

        plt.title('PPE (Pitch Period Entropy) - Most Important Feature',
                 fontsize=14, fontweight='bold')
        plt.xlabel('Sample Index')
        plt.ylabel('PPE Value')
        plt.legend()
        plt.grid(alpha=0.3)
        plt.tight_layout()
        plt.savefig(f"{eda_dir}/02_ppe_scatterplot.png", dpi=300, bbox_inches='tight')
        plt.close()
        print("Saved: 02_ppe_scatterplot.png (PPE scatterplot)")

        # Additional PPE analysis
        print(f"\nPPE Statistics:")
        print(f"Healthy - Mean: {healthy['PPE'].mean():.4f}, Std: {healthy['PPE'].std():.4f}")
        print(f"Parkinson's - Mean: {parkinsons['PPE'].mean():.4f}, Std: {parkinsons['PPE'].std():.4f}")

    # Key Features Distribution
    key_features = ['MDVP:Fo(Hz)', 'MDVP:Jitter(%)', 'MDVP:Shimmer',
                    'HNR', 'RPDE', 'DFA']

    fig, axes = plt.subplots(2, 3, figsize=(15, 8))
    axes = axes.flatten()

    for i, col in enumerate(key_features):
        if col in df.columns:
            axes[i].hist(df[col], bins=30, color='#3498db', alpha=0.7, edgecolor='black')
            axes[i].set_title(col, fontweight='bold')
            axes[i].set_xlabel('Value')
            axes[i].set_ylabel('Frequency')
            axes[i].grid(alpha=0.3)
        else:
            # Hide the panel instead of leaving an empty set of axes.
            axes[i].set_visible(False)

    plt.tight_layout()
    plt.savefig(f"{eda_dir}/03_feature_distributions.png", dpi=300, bbox_inches='tight')
    plt.close()
    print("Saved: 03_feature_distributions.png")

    # Features vs Target (Box plots)
    fig, axes = plt.subplots(2, 3, figsize=(15, 8))
    axes = axes.flatten()

    for i, col in enumerate(key_features):
        if col in df.columns:
            data_to_plot = [df[df[target_col] == 0][col].dropna(),
                           df[df[target_col] == 1][col].dropna()]
            bp = axes[i].boxplot(data_to_plot, patch_artist=True)
            # Label the ticks separately: the boxplot `labels` keyword is
            # deprecated in Matplotlib 3.9, while set_xticklabels works on
            # every version.
            axes[i].set_xticklabels(['Healthy', 'Parkinson\'s'])
            for patch, color in zip(bp['boxes'], ['#3498db', '#e74c3c']):
                patch.set_facecolor(color)
                patch.set_alpha(0.7)
            axes[i].set_title(col, fontweight='bold')
            axes[i].set_ylabel('Value')
            axes[i].grid(alpha=0.3, axis='y')
        else:
            axes[i].set_visible(False)

    plt.tight_layout()
    plt.savefig(f"{eda_dir}/04_features_vs_target.png", dpi=300, bbox_inches='tight')
    plt.close()
    print("Saved: 04_features_vs_target.png")

    # Correlation Heatmap
    numeric_df = df.select_dtypes(include=[np.number])

    if target_col in numeric_df.columns and len(numeric_df.columns) > 1:
        correlations = numeric_df.corr()[target_col].abs().sort_values(ascending=False)
        # The ranking includes the target itself (correlation 1.0 with itself),
        # so the heatmap shows the target plus the strongest features.
        top_n = min(16, len(correlations))
        top_features = correlations.head(top_n).index.tolist()

        plt.figure(figsize=(12, 10))
        sns.heatmap(numeric_df[top_features].corr(), annot=True, fmt='.2f',
                    cmap='RdYlBu_r', center=0, square=True, linewidths=1,
                    cbar_kws={"shrink": 0.8})
        plt.title(f'Correlation Heatmap - Top {top_n} Features',
                 fontsize=14, fontweight='bold')
        plt.tight_layout()
        plt.savefig(f"{eda_dir}/05_correlation_heatmap.png", dpi=300, bbox_inches='tight')
        plt.close()
        print("Saved: 05_correlation_heatmap.png")

        # Print top correlations. Drop the target before numbering so the
        # ranks start at 1 (numbering the raw ranking skipped rank 1).
        feature_corrs = correlations.drop(labels=[target_col]).head(10)
        print(f"\nTop {len(feature_corrs)} features correlated with {target_col}:")
        for idx, (feat, corr) in enumerate(feature_corrs.items(), 1):
            print(f" {idx}. {feat}: {corr:.4f}")

    print(f"\nAll EDA plots saved to: {eda_dir}")

In [5]:
# SECTION 4: FEATURE ENGINEERING
def create_engineered_features(df):
    """Return a copy of ``df`` extended with derived voice features.

    Columns whose name contains "jitter" or "shimmer" (case-insensitive) are
    averaged row-wise; the other features are built from specific columns and
    are only added when every column they need is present. Each new column is
    announced with a "Created: ..." line. The input frame is not modified.

    Args:
        df: DataFrame of voice measurements.

    Returns:
        A new DataFrame with the original columns followed by the derived ones.
    """
    print("\n4. FEATURE ENGINEERING\n")

    enriched = df.copy()
    available = df.columns

    jitter_names = [name for name in available if 'jitter' in name.lower()]
    shimmer_names = [name for name in available if 'shimmer' in name.lower()]

    # Row-wise averages are reused by several derived features below.
    jitter_avg = df[jitter_names].mean(axis=1) if jitter_names else None
    shimmer_avg = df[shimmer_names].mean(axis=1) if shimmer_names else None
    have_both = jitter_avg is not None and shimmer_avg is not None

    # Jitter-Shimmer interaction
    if have_both:
        enriched['jitter_shimmer_product'] = jitter_avg * shimmer_avg
        print("Created: jitter_shimmer_product")

    if jitter_avg is not None:
        enriched['avg_jitter'] = jitter_avg
        print("Created: avg_jitter")

    if shimmer_avg is not None:
        enriched['avg_shimmer'] = shimmer_avg
        print("Created: avg_shimmer")

    if 'HNR' in available and 'NHR' in available:
        # The small constant keeps the division finite when NHR is zero.
        enriched['hnr_nhr_ratio'] = df['HNR'] / (df['NHR'] + 1e-10)
        print("Created: hnr_nhr_ratio")

    if 'MDVP:Fhi(Hz)' in available and 'MDVP:Flo(Hz)' in available:
        freq_span = df['MDVP:Fhi(Hz)'] - df['MDVP:Flo(Hz)']
        enriched['frequency_range'] = freq_span
        print("Created: frequency_range")

        if 'MDVP:Fo(Hz)' in available:
            enriched['frequency_cv'] = freq_span / (df['MDVP:Fo(Hz)'] + 1e-10)
            print("Created: frequency_cv")

    if have_both:
        enriched['voice_perturbation_index'] = (jitter_avg + shimmer_avg) / 2
        print("Created: voice_perturbation_index")

    if 'spread1' in available and 'spread2' in available:
        enriched['spread_interaction'] = df['spread1'] * df['spread2']
        print("Created: spread_interaction")

    if 'DFA' in available and 'PPE' in available:
        enriched['dfa_ppe_product'] = df['DFA'] * df['PPE']
        print("Created: dfa_ppe_product")

    print(f"\nTotal features after engineering: {enriched.shape[1] - 1}")

    return enriched

In [6]:
def validate_target(df, target):
    """Check that ``target`` is usable for classification and print its balance.

    Args:
        df: DataFrame expected to contain the target column.
        target: Name of the target column.

    Raises:
        Exception: if the column is missing, or if it holds fewer than two
            distinct values.
    """
    print("\n5. TARGET VALIDATION\n")

    if target not in df.columns:
        raise Exception(f"Target column '{target}' not found in dataset.")

    class_total = df[target].nunique()
    if class_total < 2:
        raise Exception("Target has less than 2 classes. Classification impossible.")

    print(f"Target variable: {target}")
    print(f"Number of classes: {class_total}")
    print("\nClass distribution:")

    display_names = {0: "Healthy", 1: "Parkinson's"}
    row_total = len(df)
    per_class = df[target].value_counts().sort_index()
    for value, n_rows in per_class.items():
        # Values other than 0/1 are printed as-is.
        shown = display_names.get(value, value)
        print(f"  {shown}: {n_rows} ({n_rows/row_total*100:.1f}%)")

In [7]:
# SECTION 6: FEATURE SCALING
def encode_and_scale(df, target):
    """Split ``df`` into features and target, and standardize the features.

    The scaler is fit on every row of ``df``. No categorical encoding is
    performed here despite the function name.

    Args:
        df: DataFrame holding the feature columns and the target column.
        target: Name of the target column.

    Returns:
        Tuple ``(X, X_scaled, y, scaler)``: the unscaled feature frame, the
        standardized feature array, the target column, and the fitted
        ``StandardScaler``.
    """
    print("\n6. FEATURE SCALING\n")

    features = df.drop(columns=[target])
    labels = df[target]

    fitted_scaler = StandardScaler().fit(features)
    features_scaled = fitted_scaler.transform(features)

    print(f"Final feature count: {features.shape[1]}")
    print("Features standardized using StandardScaler")

    return features, features_scaled, labels, fitted_scaler

In [8]:
# SECTION 7: TRAIN-TEST SPLIT
def safe_train_test_split(X, y, test_size=0.2):
    """Stratified train/test split with a printed summary.

    Splits with ``stratify=y`` and ``random_state=42`` so the result is
    reproducible, then prints the split ratio, the set sizes and the class
    balance of the training labels.

    Args:
        X: Feature matrix (array or DataFrame).
        y: Target labels aligned with ``X``.
        test_size: Forwarded to ``train_test_split``.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)``.
    """
    print("\n7. TRAIN-TEST SPLIT\n")

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, stratify=y, random_state=42
    )

    if isinstance(test_size, float) and 0 < test_size < 1:
        # round(), not int(): 0.29 * 100 == 28.999999999999996 would truncate to 28.
        test_pct = round(test_size * 100)
    else:
        # test_size given as a row count (or None): report the realized share.
        test_pct = round(len(X_test) / (len(X_train) + len(X_test)) * 100)

    print(f"Split ratio: {100 - test_pct}/{test_pct}")
    print(f"Training set: {len(X_train)} samples")
    print(f"Test set: {len(X_test)} samples")
    print(f"\nClass distribution in training set:")
    # Wrap in a Series so array-like labels are summarized as well.
    train_counts = pd.Series(y_train).value_counts().sort_index()
    labels = {0: "Healthy", 1: "Parkinson's"}
    for label, count in train_counts.items():
        print(f"  {labels.get(label, label)}: {count} ({count/len(y_train)*100:.1f}%)")

    return X_train, X_test, y_train, y_test

In [24]:
# SECTION 8: AGGRESSIVE HYPERPARAMETER TUNING TO PREVENT OVERFITTING
def _run_grid_search(model_key, short_name, estimator, param_grid, X_train, y_train):
    """Run a 5-fold, F1-scored grid search and print/return the winning setup."""
    search = GridSearchCV(estimator, param_grid, cv=5, scoring='f1', n_jobs=-1, verbose=0)

    started = time.perf_counter()
    search.fit(X_train, y_train)
    elapsed = time.perf_counter() - started

    print(f"Best {short_name} parameters: {search.best_params_}")
    print(f"Best CV score: {search.best_score_:.4f}")
    print(f"Training time: {elapsed:.2f} seconds\n")

    summary = {
        'Model': model_key,
        'Best_Params': search.best_params_,
        'Best_CV_Score': search.best_score_,
        'Training_Time': elapsed
    }
    return search.best_estimator_, summary


def tune_hyperparameters(X_train, y_train):
    """Grid-search a regularized Random Forest and Logistic Regression.

    Both searches use 5-fold cross-validation scored by F1. The per-model
    summaries are also written to ``{OUTPUT_DIR}/hyperparameter_tuning.csv``.

    Args:
        X_train: Training feature matrix.
        y_train: Training labels.

    Returns:
        Tuple ``(tuned_models, tuning_results)``: a dict mapping
        ``'Random_Forest'`` / ``'Logistic_Regression'`` to the best fitted
        estimator, and a list of dicts with keys ``Model``, ``Best_Params``,
        ``Best_CV_Score`` and ``Training_Time``.
    """
    print("\n8.HYPERPARAMETER TUNING\n")

    tuned_models = {}
    tuning_results = []

    # Random Forest with VERY STRONG regularization
    print("Tuning Random Forest with regularization...")
    rf_param_grid = {
        'n_estimators': [50, 100],  # Fewer trees
        'max_depth': [3, 5, 7],  # Very shallow trees
        'min_samples_split': [20, 30, 40],  # Much higher split requirement
        'min_samples_leaf': [10, 15, 20],  # Much higher leaf requirement
        'max_features': ['sqrt', 'log2'],  # Limit features per split
        'min_impurity_decrease': [0.01, 0.02],  # Require minimum improvement
        'class_weight': ['balanced']
    }
    rf = RandomForestClassifier(random_state=42, n_jobs=-1)
    tuned_models['Random_Forest'], rf_summary = _run_grid_search(
        'Random_Forest', 'RF', rf, rf_param_grid, X_train, y_train
    )
    tuning_results.append(rf_summary)

    # Logistic Regression with STRONG regularization
    print("Tuning Logistic Regression with strong regularization...")
    lr_shared = {
        'C': [0.001, 0.01, 0.1, 1.0],  # Strong regularization (lower C)
        'solver': ['saga'],  # Supports all penalties
        'class_weight': ['balanced'],
        'max_iter': [2000]
    }
    # l1_ratio only has an effect with the elasticnet penalty, so it is
    # searched in its own sub-grid. Crossing it with 'l1'/'l2' would refit
    # identical models three times each.
    lr_param_grid = [
        {**lr_shared, 'penalty': ['l1', 'l2']},
        {**lr_shared, 'penalty': ['elasticnet'], 'l1_ratio': [0.3, 0.5, 0.7]},
    ]
    lr = LogisticRegression(random_state=42)
    tuned_models['Logistic_Regression'], lr_summary = _run_grid_search(
        'Logistic_Regression', 'LR', lr, lr_param_grid, X_train, y_train
    )
    tuning_results.append(lr_summary)

    # Save tuning results
    df_tuning = pd.DataFrame(tuning_results)
    df_tuning.to_csv(f"{OUTPUT_DIR}/hyperparameter_tuning.csv", index=False)
    print(f"Hyperparameter tuning results saved to: {OUTPUT_DIR}/hyperparameter_tuning.csv")

    return tuned_models, tuning_results

In [28]:
# SECTION 9: MODEL EVALUATION WITH OVERFITTING ANALYSIS
def evaluate_models(models, X_train, X_test, y_train, y_test):
    """Score each fitted model on the train and test sets and compare them.

    For every model this prints accuracy on both sets, their difference (the
    "overfitting gap"), test precision/recall/F1, ROC AUC when the model
    exposes ``predict_proba``, and the test confusion matrix, then saves a
    learning curve via ``plot_learning_curve``. All metrics are written to
    ``{OUTPUT_DIR}/model_metrics.csv``.

    The "best" model is the one with the highest test accuracy.

    Args:
        models: Dict mapping model name to a fitted estimator.
        X_train, X_test: Feature matrices.
        y_train, y_test: Matching label vectors (binary, positive class 1).

    Returns:
        Tuple ``(df_results, best_model_name)``: one row of metrics per model,
        and the name of the model with the highest test accuracy. ``ROC_AUC``
        is NaN for models without ``predict_proba``.
    """
    print("\n9. MODEL EVALUATION WITH OVERFITTING ANALYSIS\n")

    results = []

    for name, model in models.items():
        print(f"\nEvaluating {name}...")

        # Training set performance
        y_train_pred = model.predict(X_train)
        train_acc = accuracy_score(y_train, y_train_pred)

        # Test set performance
        y_test_pred = model.predict(X_test)
        test_acc = accuracy_score(y_test, y_test_pred)

        # Calculate overfitting metric
        overfitting_gap = train_acc - test_acc

        # Other metrics
        prec = precision_score(y_test, y_test_pred, zero_division=0)
        rec = recall_score(y_test, y_test_pred, zero_division=0)
        f1 = f1_score(y_test, y_test_pred, zero_division=0)

        # ROC AUC
        if hasattr(model, "predict_proba"):
            y_prob = model.predict_proba(X_test)[:, 1]
            roc_auc = roc_auc_score(y_test, y_prob)
        else:
            roc_auc = None

        cm = confusion_matrix(y_test, y_test_pred)

        # Print metrics
        print(f"Training Accuracy:  {train_acc:.4f}")
        print(f"Test Accuracy:      {test_acc:.4f}")
        print(f"Overfitting Gap:    {overfitting_gap:.4f}")
        print(f"Precision:          {prec:.4f}")
        print(f"Recall:             {rec:.4f}")
        print(f"F1 Score:           {f1:.4f}")
        # Compare against None: a legitimate AUC of 0.0 is falsy and would
        # otherwise be treated as "not available".
        if roc_auc is not None:
            print(f"ROC AUC:            {roc_auc:.4f}")
        print(f"\nConfusion Matrix:")
        print(f"                 Predicted")
        print(f"                 Healthy  Parkinson's")
        print(f"Actual Healthy      {cm[0][0]:3d}       {cm[0][1]:3d}")
        print(f"       Parkinson's  {cm[1][0]:3d}       {cm[1][1]:3d}")

        # Learning curves
        plot_learning_curve(model, X_train, y_train, name)

        results.append({
            "Model": name,
            "Train_Accuracy": train_acc,
            "Test_Accuracy": test_acc,
            "Overfitting_Gap": overfitting_gap,
            "Precision": prec,
            "Recall": rec,
            "F1_Score": f1,
            # NaN rather than 0 so an unavailable AUC is not mistaken for the
            # worst possible score.
            "ROC_AUC": roc_auc if roc_auc is not None else float("nan"),
            "True_Negatives": int(cm[0][0]),
            "False_Positives": int(cm[0][1]),
            "False_Negatives": int(cm[1][0]),
            "True_Positives": int(cm[1][1])
        })

    df_results = pd.DataFrame(results)
    df_results.to_csv(f"{OUTPUT_DIR}/model_metrics.csv", index=False)
    print(f"\nDetailed metrics saved to: {OUTPUT_DIR}/model_metrics.csv")

    best_model_name = df_results.loc[df_results['Test_Accuracy'].idxmax(), 'Model']
    best_accuracy = df_results['Test_Accuracy'].max()

    print("\n\nMODEL COMPARISON SUMMARY\n")
    print(f"{'Model':<25} {'Train_Acc':<12} {'Test_Acc':<12} {'Gap':<10} {'F1':<10}")
    print("-" * 75)
    for _, row in df_results.iterrows():
        print(f"{row['Model']:<25} {row['Train_Accuracy']:<12.4f} {row['Test_Accuracy']:<12.4f} "
              f"{row['Overfitting_Gap']:<10.4f} {row['F1_Score']:<10.4f}")

    print(f"\nBEST MODEL: {best_model_name} (Test Accuracy: {best_accuracy:.4f})")

    return df_results, best_model_name

In [11]:
def plot_learning_curve(model, X_train, y_train, model_name):
    """Save a learning-curve plot for ``model`` to ``OUTPUT_DIR``.

    Accuracy is measured with 5-fold cross-validation at ten training-set
    sizes from 10% to 100% of the data. Each curve shows the mean across
    folds with a band of one standard deviation either side.

    Args:
        model: Estimator handed to ``learning_curve``.
        X_train: Training feature matrix.
        y_train: Training labels.
        model_name: Used in the plot title and the output file name.
    """
    sizes, fold_train_scores, fold_val_scores = learning_curve(
        model, X_train, y_train, cv=5, n_jobs=-1,
        train_sizes=np.linspace(0.1, 1.0, 10), scoring='accuracy'
    )

    # (legend label, per-fold scores, colour, marker) for each curve, in draw order.
    curves = (
        ('Training score', fold_train_scores, 'blue', 'o'),
        ('Cross-validation score', fold_val_scores, 'red', 's'),
    )

    fig, ax = plt.subplots(figsize=(10, 6))
    for legend_label, fold_scores, shade, marker_style in curves:
        centre = np.mean(fold_scores, axis=1)
        spread = np.std(fold_scores, axis=1)
        ax.plot(sizes, centre, label=legend_label, color=shade, marker=marker_style)
        ax.fill_between(sizes, centre - spread, centre + spread, alpha=0.15, color=shade)

    ax.set_xlabel('Training Set Size')
    ax.set_ylabel('Accuracy Score')
    ax.set_title(f'Learning Curve - {model_name}', fontweight='bold')
    ax.legend(loc='best')
    ax.grid(alpha=0.3)
    fig.tight_layout()
    fig.savefig(f"{OUTPUT_DIR}/learning_curve_{model_name}.png", dpi=300, bbox_inches='tight')
    plt.close(fig)
    print(f"Saved learning curve for {model_name}")


In [12]:
# SECTION 10: RESOURCE USAGE ANALYSIS
def measure_resource_usage(models, X_test):
    """Time one prediction pass per model and record process memory.

    The memory figure is the resident set size of the whole Python process,
    sampled after each model's prediction. It is therefore not a per-model
    measurement.

    Results are printed and written to ``{OUTPUT_DIR}/resource_usage.csv``.

    Args:
        models: Dict mapping model name to a fitted estimator.
        X_test: Feature matrix used for the timed ``predict`` call.

    Returns:
        DataFrame with columns ``Model``, ``Inference_Time_Seconds`` and
        ``Memory_MB``.
    """
    print("\n10. RESOURCE USAGE ANALYSIS\n")

    process = psutil.Process()
    resource_results = []

    for name, model in models.items():
        # Measure inference time. perf_counter is a monotonic high-resolution
        # timer; time.time() can be too coarse for a call this short.
        start_time = time.perf_counter()
        model.predict(X_test)
        inference_time = time.perf_counter() - start_time

        # Measure memory (process-wide RSS, converted from bytes to MB)
        mem_info = process.memory_info()
        memory_mb = mem_info.rss / (1024 * 1024)

        print(f"{name}:")
        print(f"  Inference time (test set): {inference_time:.4f} seconds")
        print(f"  Memory usage: {memory_mb:.2f} MB")

        resource_results.append({
            'Model': name,
            'Inference_Time_Seconds': inference_time,
            'Memory_MB': memory_mb
        })

    df_resources = pd.DataFrame(resource_results)
    df_resources.to_csv(f"{OUTPUT_DIR}/resource_usage.csv", index=False)
    print(f"\nResource usage saved to: {OUTPUT_DIR}/resource_usage.csv")

    return df_resources

In [13]:
# SECTION 11: FEATURE IMPORTANCE
def plot_feature_importance(model, feature_names, model_name, top_n=15):
    """Save a horizontal bar chart of the model's largest feature importances.

    Does nothing when ``model`` has no ``feature_importances_`` attribute.

    Args:
        model: Fitted estimator.
        feature_names: Sequence of names indexed in the same order as
            ``model.feature_importances_``.
        model_name: Used in the plot title and the output file name.
        top_n: Maximum number of features to show (default 15).

    Raises:
        ValueError: if ``top_n`` is smaller than 1.
    """
    if top_n < 1:
        raise ValueError(f"top_n must be at least 1, got {top_n}")

    if hasattr(model, 'feature_importances_'):
        importances = np.asarray(model.feature_importances_)
        # argsort is ascending, so the tail holds the largest importances and
        # the most important feature ends up at the top of the chart.
        indices = np.argsort(importances)[-top_n:]

        plt.figure(figsize=(10, 8))
        plt.barh(range(len(indices)), importances[indices], color='#3498db', alpha=0.7)
        plt.yticks(range(len(indices)), [feature_names[i] for i in indices])
        plt.xlabel('Feature Importance')
        # Report the number of bars actually drawn, which can be below top_n.
        plt.title(f'Top {len(indices)} Feature Importances - {model_name}', fontweight='bold')
        plt.tight_layout()
        plt.savefig(f"{OUTPUT_DIR}/feature_importance_{model_name}.png", dpi=300, bbox_inches='tight')
        plt.close()
        print(f"Saved feature importance plot for {model_name}")

In [14]:
# SECTION 12: SAVE ARTIFACTS
def save_artifacts(models, scaler, best_model_name, feature_names):
    """Pickle the models, scaler and feature names into ``OUTPUT_DIR``.

    Writes one ``<name>.pkl`` per model, then ``scaler.pkl``,
    ``best_model.pkl`` and ``feature_names.pkl``. A feature-importance plot is
    also produced for the model named ``'Random_Forest'``.

    Args:
        models: Dict mapping model name to a fitted estimator.
        scaler: Fitted scaler to persist.
        best_model_name: Key in ``models`` of the model saved as ``best_model.pkl``.
        feature_names: List of feature names to persist and to label the plot.
    """
    print("\n12. SAVING ARTIFACTS\n")

    def _write_pickle(filename, payload):
        # Serialize one object to OUTPUT_DIR/<filename>.
        with open(f"{OUTPUT_DIR}/{filename}", "wb") as handle:
            pickle.dump(payload, handle)

    for model_key, fitted in models.items():
        _write_pickle(f"{model_key}.pkl", fitted)
        print(f"Saved: {model_key}.pkl")

        if model_key == 'Random_Forest':
            plot_feature_importance(fitted, feature_names, model_key)

    _write_pickle("scaler.pkl", scaler)
    print("Saved: scaler.pkl")

    # Written inline so the file is opened before the model lookup happens.
    with open(f"{OUTPUT_DIR}/best_model.pkl", "wb") as handle:
        pickle.dump(models[best_model_name], handle)
    print(f"Saved: best_model.pkl ({best_model_name})")

    _write_pickle("feature_names.pkl", feature_names)
    print("Saved: feature_names.pkl")

In [25]:
# MAIN EXECUTION
if __name__ == "__main__":
    # Load data
    df = load_data(DATASET_PATH)

    # Feature engineering
    df = create_engineered_features(df)

    # EDA (runs on the engineered frame, before cleaning)
    run_eda(df, TARGET_COLUMN)

    # Clean data and select features
    # NOTE(review): feature selection inside clean_data still ranks features
    # using every row, including rows that later land in the test set.
    df = clean_data(df)

    # Validate target
    validate_target(df, TARGET_COLUMN)

    # Separate features and target. The scaler returned here was fit on all
    # rows; it is refit on the training rows below.
    X, X_scaled, y, scaler = encode_and_scale(df, TARGET_COLUMN)

    # Train-test split on the UNSCALED features, so the scaling statistics can
    # be learned from the training rows only (no test-set leakage).
    X_train_raw, X_test_raw, y_train, y_test = safe_train_test_split(X, y)

    scaler.fit(X_train_raw)
    X_train = scaler.transform(X_train_raw)
    X_test = scaler.transform(X_test_raw)
    # Keep X_scaled consistent with the refit scaler for any later use.
    X_scaled = scaler.transform(X)

    # Hyperparameter tuning
    models, tuning_results = tune_hyperparameters(X_train, y_train)

    # Evaluate models with overfitting analysis
    results, best_model_name = evaluate_models(models, X_train, X_test, y_train, y_test)

    # Resource usage analysis
    resource_df = measure_resource_usage(models, X_test)

    # Save artifacts
    save_artifacts(models, scaler, best_model_name, X.columns.tolist())

    print(f"\nAll outputs saved to: {OUTPUT_DIR}/")
    print(f"  - EDA plots in: {OUTPUT_DIR}/eda_plots/")
    print(f"  - Model metrics: {OUTPUT_DIR}/model_metrics.csv")
    print(f"  - Hyperparameter tuning: {OUTPUT_DIR}/hyperparameter_tuning.csv")
    print(f"  - Resource usage: {OUTPUT_DIR}/resource_usage.csv")
    print(f"  - Trained models: {OUTPUT_DIR}/*.pkl")



Dataset loaded successfully
Shape: (195, 24) (rows x columns)
Features: 24

Data Types:
float64    22
object      1
int64       1
Name: count, dtype: int64

First 5 rows:
             name  MDVP:Fo(Hz)  MDVP:Fhi(Hz)  MDVP:Flo(Hz)  MDVP:Jitter(%)  \
0  phon_R01_S01_1      119.992       157.302        74.997         0.00784   
1  phon_R01_S01_2      122.400       148.650       113.819         0.00968   
2  phon_R01_S01_3      116.682       131.111       111.555         0.01050   
3  phon_R01_S01_4      116.676       137.871       111.366         0.00997   
4  phon_R01_S01_5      116.014       141.781       110.655         0.01284   

   MDVP:Jitter(Abs)  MDVP:RAP  MDVP:PPQ  Jitter:DDP  MDVP:Shimmer  ...  \
0           0.00007   0.00370   0.00554     0.01109       0.04374  ...   
1           0.00008   0.00465   0.00696     0.01394       0.06134  ...   
2           0.00009   0.00544   0.00781     0.01633       0.05233  ...   
3           0.00009   0.00502   0.00698     0.01505       0.0549