In [1]:
# Cell 2: Generate 50k cVAE-Augmented Dataset

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, TensorDataset
import numpy as np
import pandas as pd
import joblib
import os
import sys
import json
import importlib
from sklearn.model_selection import train_test_split, GroupKFold, cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from tqdm.notebook import tqdm
import copy
import warnings

# Suppress warnings
warnings.filterwarnings('ignore', category=UserWarning, module='sklearn')

print("=== Step 2: Generating 50k cVAE-Augmented Dataset ===")

# ==============================================================================
# --- 1. DEFINE ALL DATA CLASSES (Self-Contained) ---
# ==============================================================================

# --- Data Generation Classes ---
class UltraRealisticDataGenerator:
    def __init__(self, random_state=42):
        self.random_state = random_state; np.random.seed(random_state)
        self.voice_features = ['Jitter(%)', 'Jitter(Abs)', 'Jitter:RAP', 'Jitter:PPQ5', 'Shimmer', 'Shimmer(dB)', 'NHR', 'HNR', 'RPDE', 'DFA', 'PPE']
        self.gait_features = ['stride_time_mean', 'stride_time_std', 'step_time_mean', 'step_time_std', 'cadence', 'velocity', 'step_length', 'stride_length', 'force_asymmetry', 'step_time_cv', 'stride_time_cv', 'left_force_mean', 'right_force_mean', 'left_force_std', 'right_force_std', 'swing_time_ratio', 'stance_time_ratio']

    def generate_voice_data(self, n_controls=500, n_parkinsons=500, n_recordings=5):
        print(f"🎤 Generating {n_controls+n_parkinsons} 'base' voice subjects..."); n_subjects = n_controls + n_parkinsons
        voice_params = {'Jitter(%)': (1.0, 0.6), 'Jitter(Abs)': (0.0001, 0.00005), 'Jitter:RAP': (0.7, 0.4), 'Jitter:PPQ5': (0.75, 0.45), 'Shimmer': (4.5, 2.0), 'Shimmer(dB)': (0.35, 0.15), 'NHR': (0.13, 0.08), 'HNR': (19, 6), 'RPDE': (0.55, 0.2), 'DFA': (0.63, 0.18), 'PPE': (0.16, 0.08)}
        voice_data = []
        for subject_id in tqdm(range(1, n_subjects + 1), desc="Generating Base Voice", leave=False):
            is_parkinson = 1 if subject_id > n_controls else 0
            for _ in range(n_recordings):
                row = {'subject#': subject_id, 'true_label': is_parkinson}
                for feature in self.voice_features:
                    shared_mean, shared_std = voice_params[feature]; base_value = np.random.normal(shared_mean, shared_std * 1.8); class_bias = np.random.normal(0.1, 0.05) if is_parkinson else np.random.normal(-0.1, 0.05)
                    row[feature] = max(0, base_value * (1 + class_bias * 0.1))
                voice_data.append(row)
        return pd.DataFrame(voice_data)

    def generate_gait_data(self, n_controls=500, n_parkinsons=500):
        print(f"🚶 Generating {n_controls+n_parkinsons} 'base' gait subjects..."); n_subjects = n_controls + n_parkinsons
        gait_params = {'stride_time_mean': (1.2, 0.15), 'stride_time_std': (0.10, 0.04), 'step_time_mean': (0.6, 0.08), 'step_time_std': (0.06, 0.03), 'cadence': (105, 12), 'velocity': (1.05, 0.3), 'step_length': (0.58, 0.12), 'stride_length': (1.15, 0.22), 'force_asymmetry': (30, 20), 'step_time_cv': (0.10, 0.04), 'stride_time_cv': (0.08, 0.04), 'left_force_mean': (425, 80), 'right_force_mean': (420, 85), 'left_force_std': (110, 35), 'right_force_std': (115, 38), 'swing_time_ratio': (0.39, 0.05), 'stance_time_ratio': (0.61, 0.05)}
        gait_data = []
        for i in tqdm(range(n_subjects), desc="Generating Base Gait", leave=False):
            is_parkinson = 1 if i >= n_controls else 0
            row = {'subject_id': f"Sub_{i+1:04d}", 'true_label': is_parkinson}
            for feature in self.gait_features:
                shared_mean, shared_std = gait_params[feature]; base_value = np.random.normal(shared_mean, shared_std * 1.5); class_bias = np.random.normal(0.08, 0.03) if is_parkinson else np.random.normal(-0.08, 0.03)
                row[feature] = max(0, base_value * (1 + class_bias * 0.05))
            gait_data.append(row)
        return pd.DataFrame(gait_data)

class RealisticFederatedDataset:
    def __init__(self, label_noise=0.10, feature_noise=0.10, random_state=42):
        self.label_noise = label_noise; self.feature_noise = feature_noise; self.voice_scaler = StandardScaler(); self.gait_scaler = StandardScaler(); np.random.seed(random_state)

    def create_realistic_pairs(self, voice_df, gait_df, voice_features, gait_features):
        print("Creating realistic multimodal pairs...");
        # Data is already aggregated
        subject_voice = voice_df.copy()
        gait_processed = gait_df.copy()
        print(f"Available subjects: Voice={len(subject_voice)}, Gait={len(gait_processed)}"); voice_scaled_data = self.voice_scaler.fit_transform(subject_voice[voice_features]); gait_scaled_data = self.gait_scaler.fit_transform(gait_processed[gait_features])
        voice_scaled = pd.DataFrame(voice_scaled_data, columns=voice_features); voice_scaled['subject#'] = subject_voice['subject#']; voice_scaled['true_label'] = subject_voice['true_label']
        gait_scaled = pd.DataFrame(gait_scaled_data, columns=gait_features); gait_scaled['subject_id'] = gait_processed['subject_id']; gait_scaled['true_label'] = gait_processed['true_label']
        multimodal_data, multimodal_labels, pair_info = [], [], []; n_pairs = min(len(voice_scaled), len(gait_scaled)); print(f"Creating {n_pairs} pairs with {self.label_noise*100:.0f}% label noise...")
        gait_controls = gait_scaled[gait_scaled['true_label'] == 0]; gait_parkinsons = gait_scaled[gait_scaled['true_label'] == 1]
        for i in tqdm(range(n_pairs), desc=f"Generating {n_pairs} Pairs", leave=False):
            voice_row = voice_scaled.iloc[i]; voice_label = voice_row['true_label']; voice_data_scaled = voice_row[voice_features].values
            if np.random.random() < self.label_noise:
                pair_type = "MISMATCHED"; gait_row = gait_parkinsons.sample(1, replace=True).iloc[0] if voice_label == 0 else gait_controls.sample(1, replace=True).iloc[0]
            else:
                pair_type = "MATCHED"; gait_row = gait_controls.sample(1, replace=True).iloc[0] if voice_label == 0 else gait_parkinsons.sample(1, replace=True).iloc[0]
            gait_data_scaled = gait_row[gait_features].values
            voice_noisy = voice_data_scaled + np.random.normal(0, self.feature_noise, voice_data_scaled.shape); gait_noisy = gait_data_scaled + np.random.normal(0, self.feature_noise, gait_data_scaled.shape)
            multimodal_data.append(np.concatenate([voice_noisy, gait_noisy])); multimodal_labels.append(voice_label)
            pair_info.append({'voice_subject': voice_row['subject#'], 'voice_label': int(voice_label), 'gait_subject_id': gait_row['subject_id'], 'gait_label': int(gait_row['true_label']), 'label_match': int(voice_label == gait_row['true_label'])})
        multimodal_data = np.array(multimodal_data, dtype=np.float32); multimodal_labels = np.array(multimodal_labels, dtype=np.float32); pair_info_df = pd.DataFrame(pair_info)
        match_rate = pair_info_df['label_match'].mean(); print(f"Pairing Analysis: Matched={match_rate*100:.1f}%, Mismatched={(1-match_rate)*100:.1f}%")
        return multimodal_data, multimodal_labels, pair_info_df, self.voice_scaler, self.gait_scaler

# ==============================================================================
# --- 2. Import cVAE Class ---
# ==============================================================================
print("\n--- Importing cVAE ---")
scripts_dir = "/content/drive/MyDrive/Parkinsons_Research_Project/scripts/"
if scripts_dir not in sys.path: sys.path.append(scripts_dir)

try:
    import cVAE_augmentor
    importlib.reload(cVAE_augmentor)
    from cVAE_augmentor import cVAE, train_cvae, generate_synthetic_samples_conditional
    print("✅ cVAE classes imported successfully.")
except ImportError:
    print("❌ Error: cVAE_augmentor.py not found. Please run Cell 1 first.")
    raise SystemExit("Import failed.")
except Exception as e:
    print(f"❌ Error importing cVAE: {e}")
    raise SystemExit("Import failed.")

# ==============================================================================
# --- 3. Stage 1: Generate "Base" Clean Data ---
# ==============================================================================
print("\n--- Generating 1k 'Base' Tabular Data (0% Noise) ---")
base_data_gen = UltraRealisticDataGenerator()
# Generate 1k subjects (500/500)
base_voice_df = base_data_gen.generate_voice_data(n_controls=500, n_parkinsons=500, n_recordings=5)
base_gait_df = base_data_gen.generate_gait_data(n_controls=500, n_parkinsons=500)
voice_features = base_data_gen.voice_features
gait_features = base_data_gen.gait_features

# --- Process and Scale "Base" Data ---
# We use a *0% noise* federator to get clean, scaled, subject-level data
print("\n--- Processing and Scaling 'Base' Data ---")
base_creator = RealisticFederatedDataset(label_noise=0.0, feature_noise=0.0) # NO NOISE
(X_1k_clean, y_1k_clean, _,
 voice_scaler_clean, gait_scaler_clean
 ) = base_creator.create_realistic_pairs(
    base_voice_df, base_gait_df,
    voice_features, gait_features
)

X_voice_clean = X_1k_clean[:, :len(voice_features)]
X_gait_clean = X_1k_clean[:, len(voice_features):]
y_1k_clean_tensor = torch.FloatTensor(y_1k_clean).unsqueeze(1) # Add dim for cVAE

print(f"✅ 'Base' data processed: Voice ({X_voice_clean.shape}), Gait ({X_gait_clean.shape})")

# ==============================================================================
# --- 4. Stage 2: Train cVAEs on "Base" Clean Data ---
# ==============================================================================
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# --- Train cVAE for Voice ---
voice_latent_dim = 4 # Small latent space for 11 features
vae_voice_dataset = TensorDataset(torch.FloatTensor(X_voice_clean), y_1k_clean_tensor)
vae_voice_loader = DataLoader(vae_voice_dataset, batch_size=64, shuffle=True)
cVAE_voice = cVAE(input_dim=len(voice_features), latent_dim=voice_latent_dim).to(device)
cVAE_voice = train_cvae(cVAE_voice, vae_voice_loader, device, epochs=100)

# --- Train cVAE for Gait ---
gait_latent_dim = 6 # Slightly larger for 17 features
vae_gait_dataset = TensorDataset(torch.FloatTensor(X_gait_clean), y_1k_clean_tensor)
vae_gait_loader = DataLoader(vae_gait_dataset, batch_size=64, shuffle=True)
cVAE_gait = cVAE(input_dim=len(gait_features), latent_dim=gait_latent_dim).to(device)
cVAE_gait = train_cvae(cVAE_gait, vae_gait_loader, device, epochs=100)

# ==============================================================================
# --- 5. Stage 3: Generate 50,000 New "Clean" Samples ---
# ==============================================================================
NUM_SAMPLES_PER_CLASS = 25000  # Changed from 2500 to 25000 for 50k total
print(f"\n--- Generating 50,000 new synthetic samples (25,000 per class) ---")

# --- Generate Voice Samples ---
X_voice_synth_control = generate_synthetic_samples_conditional(cVAE_voice, NUM_SAMPLES_PER_CLASS, 0.0, len(voice_features), voice_latent_dim, device)
X_voice_synth_pd = generate_synthetic_samples_conditional(cVAE_voice, NUM_SAMPLES_PER_CLASS, 1.0, len(voice_features), voice_latent_dim, device)
X_voice_synthetic = np.concatenate([X_voice_synth_control, X_voice_synth_pd])
# Create synthetic voice DF
voice_synth_df = pd.DataFrame(X_voice_synthetic, columns=voice_features)
voice_synth_df['true_label'] = np.concatenate([np.zeros(NUM_SAMPLES_PER_CLASS), np.ones(NUM_SAMPLES_PER_CLASS)])
voice_synth_df['subject#'] = [f"synth_v_{i}" for i in range(len(voice_synth_df))] # Add dummy subject IDs
print(f"✅ Generated 50,000 synthetic voice samples: {voice_synth_df.shape}")

# --- Generate Gait Samples ---
X_gait_synth_control = generate_synthetic_samples_conditional(cVAE_gait, NUM_SAMPLES_PER_CLASS, 0.0, len(gait_features), gait_latent_dim, device)
X_gait_synth_pd = generate_synthetic_samples_conditional(cVAE_gait, NUM_SAMPLES_PER_CLASS, 1.0, len(gait_features), gait_latent_dim, device)
X_gait_synthetic = np.concatenate([X_gait_synth_control, X_gait_synth_pd])
# Create synthetic gait DF
gait_synth_df = pd.DataFrame(X_gait_synthetic, columns=gait_features)
gait_synth_df['true_label'] = np.concatenate([np.zeros(NUM_SAMPLES_PER_CLASS), np.ones(NUM_SAMPLES_PER_CLASS)])
gait_synth_df['subject_id'] = [f"synth_g_{i}" for i in range(len(gait_synth_df))] # Add dummy subject IDs
print(f"✅ Generated 50,000 synthetic gait samples: {gait_synth_df.shape}")

# ==============================================================================
# --- 6. Stage 4: Federate 50k Synthetic Data with 10% Noise ---
# ==============================================================================
print("\n--- Creating FINAL 50k Federated Dataset (10% noise) from Synthetic Data ---")
# Use the *original* scalers (voice_scaler_clean, gait_scaler_clean) from the base data
# to re-scale the new synthetic data.
# First, we need to "unscale" the VAE output, then let the federator re-scale it.
voice_synth_df[voice_features] = voice_scaler_clean.inverse_transform(voice_synth_df[voice_features])
gait_synth_df[gait_features] = gait_scaler_clean.inverse_transform(gait_synth_df[gait_features])
print("   Un-scaled synthetic data to original range.")

# Now, create the federated dataset
federated_creator_50k = RealisticFederatedDataset(label_noise=0.10, feature_noise=0.12) # 10% label noise
(X_50k_cVAE, y_50k_cVAE, pair_info_50k_cVAE,
 voice_scaler_50k_cVAE, gait_scaler_50k_cVAE
 ) = federated_creator_50k.create_realistic_pairs(
    voice_synth_df, gait_synth_df,
    voice_features, gait_features
)
print(f"✅ 50k cVAE Federated Dataset Created: Shape={X_50k_cVAE.shape}")

# --- 7. Save Final 50k cVAE Dataset ---
project_root = "/content/drive/MyDrive/Parkinsons_Research_Project"
data_save_dir_50k_cvae = os.path.join(project_root, "data", "federated_50k_cvae_10noise")
os.makedirs(data_save_dir_50k_cvae, exist_ok=True)
print(f"\n--- Saving 50k cVAE (10% noise) Dataset Files to {data_save_dir_50k_cvae} ---")

try:
    np.save(os.path.join(data_save_dir_50k_cvae, 'X_50k_cVAE_dataset.npy'), X_50k_cVAE)
    np.save(os.path.join(data_save_dir_50k_cvae, 'y_50k_cVAE_labels.npy'), y_50k_cVAE)
    pair_info_50k_cVAE.to_csv(os.path.join(data_save_dir_50k_cvae, 'pair_info_50k_cVAE.csv'), index=False)
    joblib.dump(voice_scaler_50k_cVAE, os.path.join(data_save_dir_50k_cvae, 'voice_scaler.pkl'))
    joblib.dump(gait_scaler_50k_cVAE, os.path.join(data_save_dir_50k_cvae, 'gait_scaler.pkl'))
    with open(os.path.join(data_save_dir_50k_cvae, 'voice_features.txt'), 'w') as f: f.write('\n'.join(voice_features))
    with open(os.path.join(data_save_dir_50k_cvae, 'gait_features.txt'), 'w') as f: f.write('\n'.join(gait_features))

    print("✅ All 50k cVAE-augmented data files saved successfully.")
    print("\n--- Verification ---")
    !ls -lh $data_save_dir_50k_cvae

except Exception as e:
    print(f"❌ Error saving 50k cVAE data: {e}")

print("\n🎉🎉🎉 50k C-VAE AUGMENTED DATASET CREATED! 🎉🎉🎉")
print(f"   Ready to train models on the data in: {data_save_dir_50k_cvae}")

=== Step 2: Generating 50k cVAE-Augmented Dataset ===

--- Importing cVAE ---
✅ cVAE classes imported successfully.

--- Generating 1k 'Base' Tabular Data (0% Noise) ---
🎤 Generating 1000 'base' voice subjects...


Generating Base Voice:   0%|          | 0/1000 [00:00<?, ?it/s]

🚶 Generating 1000 'base' gait subjects...


Generating Base Gait:   0%|          | 0/1000 [00:00<?, ?it/s]


--- Processing and Scaling 'Base' Data ---
Creating realistic multimodal pairs...
Available subjects: Voice=5000, Gait=1000
Creating 1000 pairs with 0% label noise...


Generating 1000 Pairs:   0%|          | 0/1000 [00:00<?, ?it/s]

Pairing Analysis: Matched=100.0%, Mismatched=0.0%
✅ 'Base' data processed: Voice ((1000, 11)), Gait ((1000, 17))
Using device: cpu
--- Training cVAE for 100 epochs ---


Training cVAE:   0%|          | 0/100 [00:00<?, ?it/s]

✅ cVAE Training Complete.
--- Training cVAE for 100 epochs ---


Training cVAE:   0%|          | 0/100 [00:00<?, ?it/s]

✅ cVAE Training Complete.

--- Generating 50,000 new synthetic samples (25,000 per class) ---
✅ Generated 50,000 synthetic voice samples: (50000, 13)
✅ Generated 50,000 synthetic gait samples: (50000, 19)

--- Creating FINAL 50k Federated Dataset (10% noise) from Synthetic Data ---
   Un-scaled synthetic data to original range.
Creating realistic multimodal pairs...
Available subjects: Voice=50000, Gait=50000
Creating 50000 pairs with 10% label noise...


Generating 50000 Pairs:   0%|          | 0/50000 [00:00<?, ?it/s]

Pairing Analysis: Matched=90.2%, Mismatched=9.8%
✅ 50k cVAE Federated Dataset Created: Shape=(50000, 28)

--- Saving 50k cVAE (10% noise) Dataset Files to /content/drive/MyDrive/Parkinsons_Research_Project/data/federated_50k_cvae_10noise ---
✅ All 50k cVAE-augmented data files saved successfully.

--- Verification ---
total 7.2M
-rw------- 1 root root  247 Nov 26 18:56 gait_features.txt
-rw------- 1 root root 1.5K Nov 26 18:56 gait_scaler.pkl
-rw------- 1 root root 1.7M Nov 26 18:56 pair_info_50k_cVAE.csv
-rw------- 1 root root   85 Nov 26 18:56 voice_features.txt
-rw------- 1 root root 1.2K Nov 26 18:56 voice_scaler.pkl
-rw------- 1 root root 5.4M Nov 26 18:56 X_50k_cVAE_dataset.npy
-rw------- 1 root root 196K Nov 26 18:56 y_50k_cVAE_labels.npy

🎉🎉🎉 50k C-VAE AUGMENTED DATASET CREATED! 🎉🎉🎉
   Ready to train models on the data in: /content/drive/MyDrive/Parkinsons_Research_Project/data/federated_50k_cvae_10noise


In [1]:
# Cell 3: Establish Baseline on 50k cVAE Dataset

import sys
import os
import pandas as pd
import numpy as np
import importlib
import warnings
import json
from sklearn.model_selection import GroupKFold, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# Suppress scikit-learn warnings
warnings.filterwarnings('ignore', category=UserWarning, module='sklearn')

print("=== Step 3: Establishing Baseline on 50k cVAE-Augmented Data ===")

# ==============================================================================
# --- 1. DEFINE BASELINE FUNCTION ---
# ==============================================================================

def evaluate_baselines_subject_wise(X, y, pair_info_path):
    '''Evaluates tabular models using subject-wise CV.'''
    print("--- Running Baseline Evaluation ---")
    try:
        pair_info = pd.read_csv(pair_info_path)
        # We use 'voice_subject' as the group ID, as this is our "subject"
        subject_ids = pair_info['voice_subject'].values
    except Exception as e:
        print(f"Error loading pair_info: {e}"); return None

    group_kfold = GroupKFold(n_splits=5)
    models = {
        'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42, max_depth=10, n_jobs=-1),
        'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000, C=0.1, n_jobs=-1),
        'Linear SVM': SVC(kernel='linear', random_state=42, C=0.1)
    }
    baseline_results = {}
    print("Performing 5-fold subject-wise cross-validation...")
    for name, model in models.items():
        try:
            print(f"  Evaluating {name}...")
            cv_scores = cross_val_score(model, X, y, cv=group_kfold, groups=subject_ids, scoring='accuracy', n_jobs=2)
            mean_acc, std_acc = cv_scores.mean(), cv_scores.std()
            baseline_results[name] = {'mean': mean_acc, 'std': std_acc}
            print(f"    {name}: {mean_acc:.4f} ± {std_acc:.4f}")
        except Exception as e:
            print(f"    Could not evaluate {name}: {e}")
    print("--- Baseline Evaluation Finished ---")
    return baseline_results

# ==============================================================================
# --- 2. Run Baseline Experiment on 50k Data ---
# ==============================================================================

# Define Paths
data_load_dir = "/content/drive/MyDrive/Parkinsons_Research_Project/data/federated_50k_cvae_10noise"
results_save_dir = "/content/drive/MyDrive/Parkinsons_Research_Project/results/phase17_cvae_50k_results"
os.makedirs(results_save_dir, exist_ok=True)
print(f"Loading 50k cVAE data from: {data_load_dir}")
print(f"Saving 50k cVAE baseline summary to: {results_save_dir}")

# Load 50k cVAE Data
try:
    X_50k_cVAE = np.load(os.path.join(data_load_dir, 'X_50k_cVAE_dataset.npy'), allow_pickle=True).astype(np.float32)
    y_50k_cVAE = np.load(os.path.join(data_load_dir, 'y_50k_cVAE_labels.npy'), allow_pickle=True).astype(np.float32)
    pair_info_path_50k = os.path.join(data_load_dir, 'pair_info_50k_cVAE.csv')
    print(f"✅ 50k cVAE Data loaded: X={X_50k_cVAE.shape}, y={y_50k_cVAE.shape}")
except Exception as e:
    print(f"❌ Error loading 50k data files: {e}")
    raise SystemExit("Data loading failed.")

# Run Baseline Evaluation on 50k cVAE Data
print("\n--- Running Subject-wise CV on 50k cVAE Data (This will take a few minutes) ---")
baseline_results_50k_cvae = evaluate_baselines_subject_wise(X_50k_cVAE, y_50k_cVAE, pair_info_path_50k)

if baseline_results_50k_cvae:
    best_baseline_50k_cvae_acc = max(res['mean'] for res in baseline_results_50k_cvae.values())
    best_baseline_50k_cvae_model = max(baseline_results_50k_cvae, key=lambda k: baseline_results_50k_cvae[k]['mean'])
    print(f"\n🎯 New 50k cVAE Baseline Accuracy: {best_baseline_50k_cvae_acc:.4f} (from {best_baseline_50k_cvae_model})")

    summary_path = os.path.join(results_save_dir, 'baseline_summary_50k_cvae.txt')
    with open(summary_path, 'w') as f:
        f.write(f"Best Baseline Model (50k cVAE Data, 10% Noise): {best_baseline_50k_cvae_model}\n")
        f.write(f"Best Baseline Accuracy (50k cVAE Data): {best_baseline_50k_cvae_acc:.4f}\n\n")
        for name, res in baseline_results_50k_cvae.items():
            f.write(f"{name}: {res['mean']:.4f} +/- {res['std']:.4f}\n")
    print(f"✅ 50k cVAE baseline summary saved to: {summary_path}")
else:
    print("\n❌ Could not establish 50k cVAE baseline.")
    best_baseline_50k_cvae_acc = 0.50 # Default

print("\n--- 50k cVAE Baseline Evaluation Complete ---")
print("   Ready to train the CM-DAN model on this new dataset.")

=== Step 3: Establishing Baseline on 50k cVAE-Augmented Data ===
Loading 50k cVAE data from: /content/drive/MyDrive/Parkinsons_Research_Project/data/federated_50k_cvae_10noise
Saving 50k cVAE baseline summary to: /content/drive/MyDrive/Parkinsons_Research_Project/results/phase17_cvae_50k_results
✅ 50k cVAE Data loaded: X=(50000, 28), y=(50000,)

--- Running Subject-wise CV on 50k cVAE Data (This will take a few minutes) ---
--- Running Baseline Evaluation ---
Performing 5-fold subject-wise cross-validation...
  Evaluating Random Forest...
    Random Forest: 0.8044 ± 0.0059
  Evaluating Logistic Regression...
    Logistic Regression: 0.8346 ± 0.0018
  Evaluating Linear SVM...
    Linear SVM: 0.8351 ± 0.0019
--- Baseline Evaluation Finished ---

🎯 New 50k cVAE Baseline Accuracy: 0.8351 (from Linear SVM)
✅ 50k cVAE baseline summary saved to: /content/drive/MyDrive/Parkinsons_Research_Project/results/phase17_cvae_50k_results/baseline_summary_50k_cvae.txt

--- 50k cVAE Baseline Evaluation C

In [2]:
# Cell 4: Train Champion CM-DAN on 50k cVAE-Augmented Dataset

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
import pandas as pd
import os
import sys
import json
import joblib
import importlib
from sklearn.model_selection import train_test_split, GroupKFold, cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from tqdm.notebook import tqdm
import copy
import warnings

# Suppress warnings
warnings.filterwarnings('ignore', category=UserWarning, module='sklearn')

print("=== Final Experiment: Training Champion CM-DAN on 50k cVAE Dataset ===")
print("   Goal: Beat the new baseline")

# ==============================================================================
# --- 1. DEFINE ALL CLASSES (Self-Contained) ---
# ==============================================================================

class GradientReversalLayer(torch.autograd.Function):
    @staticmethod
    def forward(ctx, x, alpha): ctx.alpha = alpha; return x.view_as(x)
    @staticmethod
    def backward(ctx, grad_output): return grad_output.neg() * ctx.alpha, None

class CrossModalDAN_Regularized(nn.Module):
    def __init__(self, voice_dim, gait_dim, hidden_dim=128, latent_dim=64, dropout_rate=0.6):
        super(CrossModalDAN_Regularized, self).__init__(); dr_hid = max(0.0, dropout_rate); dr_hid_deep = max(0.0, dropout_rate - 0.1); dr_latent = max(0.0, dropout_rate - 0.2); dr_classifier = 0.3
        self.voice_encoder = nn.Sequential(nn.Linear(voice_dim, hidden_dim), nn.BatchNorm1d(hidden_dim), nn.ReLU(), nn.Dropout(dr_hid), nn.Linear(hidden_dim, hidden_dim // 2), nn.BatchNorm1d(hidden_dim // 2), nn.ReLU(), nn.Dropout(dr_hid_deep))
        self.gait_encoder = nn.Sequential(nn.Linear(gait_dim, hidden_dim), nn.BatchNorm1d(hidden_dim), nn.ReLU(), nn.Dropout(dr_hid), nn.Linear(hidden_dim, hidden_dim // 2), nn.BatchNorm1d(hidden_dim // 2), nn.ReLU(), nn.Dropout(dr_hid_deep))
        self.shared_projection = nn.Sequential(nn.Linear(hidden_dim // 2, latent_dim), nn.BatchNorm1d(latent_dim), nn.ReLU(), nn.Dropout(dr_latent))
        self.domain_discriminator = nn.Sequential(nn.Linear(latent_dim, 32), nn.ReLU(), nn.Dropout(dr_classifier), nn.Linear(32, 2), nn.LogSoftmax(dim=1))
        self.task_classifier = nn.Sequential(nn.Linear(latent_dim, 32), nn.ReLU(), nn.Dropout(dr_classifier), nn.Linear(32, 1), nn.Sigmoid())
        print(f"✅ Regularized CM-DAN Architecture Defined (Dropout: {dropout_rate})")
    def forward(self, voice_data, gait_data, alpha=1.0):
        voice_features = self.voice_encoder(voice_data); gait_features = self.gait_encoder(gait_data); voice_latent = self.shared_projection(voice_features); gait_latent = self.shared_projection(gait_features); voice_domain = self.domain_discriminator(GradientReversalLayer.apply(voice_latent, alpha)); gait_domain = self.domain_discriminator(GradientReversalLayer.apply(gait_latent, alpha)); voice_task = self.task_classifier(voice_latent); gait_task = self.task_classifier(gait_latent)
        return {'voice_task': voice_task, 'gait_task': gait_task, 'voice_domain': voice_domain, 'gait_domain': gait_domain}

class CMDANTrainer_Optimized:
    def __init__(self, model, device, lambda_domain=0.7, weight_decay=1e-3, learning_rate=0.0005, results_dir="."):
        self.model = model; self.device = device; self.lambda_domain = lambda_domain; self.results_dir = results_dir
        self.task_criterion = nn.BCELoss(); self.domain_criterion = nn.NLLLoss()
        self.optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
        self.scheduler = optim.lr_scheduler.StepLR(self.optimizer, step_size=30, gamma=0.5)
        self.history = {'train_task_loss': [], 'train_domain_loss': [], 'test_task_acc': []}
        print(f"✅ Optimized CM-DAN Trainer Initialized:")
        print(f"    λ_domain: {lambda_domain}, LR: {learning_rate}, Weight Decay: {weight_decay}")
    def train_epoch(self, train_loader, epoch, epochs):
        self.model.train(); avg_task_loss = 0.0; avg_domain_loss = 0.0
        pbar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs} Training", leave=False)
        for batch_idx, (voice, gait, labels) in enumerate(pbar):
            voice, gait, labels = voice.to(self.device), gait.to(self.device), labels.to(self.device); p = float(batch_idx + epoch * len(train_loader)) / (epochs * len(train_loader)); alpha = 2. / (1. + np.exp(-10 * p)) - 1
            self.optimizer.zero_grad(); outputs = self.model(voice, gait, alpha)
            task_loss = (self.task_criterion(outputs['voice_task'].squeeze(), labels.squeeze()) + self.task_criterion(outputs['gait_task'].squeeze(), labels.squeeze())) / 2
            domain_labels_voice = torch.zeros(len(voice)).long().to(self.device); domain_labels_gait = torch.ones(len(gait)).long().to(self.device)
            domain_loss = (self.domain_criterion(outputs['voice_domain'], domain_labels_voice) + self.domain_criterion(outputs['gait_domain'], domain_labels_gait)) / 2
            loss = task_loss + self.lambda_domain * domain_loss
            loss.backward(); self.optimizer.step()
            avg_task_loss += task_loss.item(); avg_domain_loss += domain_loss.item()
            pbar.set_postfix({'Task Loss': f"{task_loss.item():.4f}", 'Domain Loss': f"{domain_loss.item():.4f}"})
        avg_task_loss /= (len(train_loader) if len(train_loader) > 0 else 1); avg_domain_loss /= (len(train_loader) if len(train_loader) > 0 else 1)
        self.history['train_task_loss'].append(avg_task_loss); self.history['train_domain_loss'].append(avg_domain_loss)
    def evaluate(self, test_loader):
        self.model.eval(); test_correct = 0; total_samples = 0
        with torch.no_grad():
            for voice, gait, labels in test_loader:
                voice, gait, labels = voice.to(self.device), gait.to(self.device), labels.to(self.device)
                outputs = self.model(voice, gait, alpha=0); combined_preds = (outputs['voice_task'].squeeze() + outputs['gait_task'].squeeze()) / 2
                predicted = (combined_preds > 0.5).float(); test_correct += (predicted == labels.squeeze()).sum().item(); total_samples += len(labels)
        test_acc = 100. * test_correct / total_samples if total_samples > 0 else 0.0
        self.history['test_task_acc'].append(test_acc); return test_acc
    def train(self, train_loader, test_loader, epochs=100, patience=15, min_delta=0.1):
        print(f"🚀 Starting CM-DAN training..."); print(f"    Early Stopping: Patience={patience} epochs, Min Delta={min_delta}%"); best_accuracy = 0.0; epochs_no_improve = 0; best_model_state = None; best_epoch = 0; best_model_path = os.path.join(self.results_dir, 'cm_dan_best_model_50k.pth')
        for epoch in range(epochs):
            self.train_epoch(train_loader, epoch, epochs); current_accuracy = self.evaluate(test_loader); self.scheduler.step()
            print(f"Epoch {epoch+1}/{epochs} | Test Accuracy: {current_accuracy:.2f}% ", end="")
            if current_accuracy > best_accuracy + min_delta:
                best_accuracy = current_accuracy; epochs_no_improve = 0; best_model_state = copy.deepcopy(self.model.state_dict()); best_epoch = epoch + 1; print(f"(New Best! 🎉)"); torch.save(best_model_state, best_model_path)
            else:
                epochs_no_improve += 1; print(f"(No improvement for {epochs_no_improve}/{patience})")
            if epochs_no_improve >= patience:
                print(f"\n✋ Early stopping triggered after {epoch+1} epochs."); break
        if best_model_state:
            print(f"\nLoading best model from Epoch {best_epoch} with Accuracy: {best_accuracy:.2f}%"); self.model.load_state_dict(best_model_state)
        else:
             print("\nWarning: No improvement observed during training."); best_accuracy = self.history['test_task_acc'][-1] if self.history.get('test_task_acc') else 0.0
        print(f"🏆 Training Finished. Best Validated Test Accuracy: {best_accuracy:.2f}%"); return self.history, best_accuracy

class CMDAN_Dataset(Dataset):
    def __init__(self, X, y, voice_dim):
        self.voice_data = torch.FloatTensor(X[:, :voice_dim])
        self.gait_data = torch.FloatTensor(X[:, voice_dim:])
        self.labels = torch.FloatTensor(y)
    def __len__(self): return len(self.labels)
    def __getitem__(self, idx): return self.voice_data[idx], self.gait_data[idx], self.labels[idx]

# ==============================================================================
# --- 2. Run the CM-DAN Training ---
# ==============================================================================

# Define Dirs and Load 50k cVAE Data
data_load_dir = "/content/drive/MyDrive/Parkinsons_Research_Project/data/federated_50k_cvae_10noise"
results_save_dir = "/content/drive/MyDrive/Parkinsons_Research_Project/results/phase17_cvae_50k_results"
print(f"\nLoading 50k cVAE data from: {data_load_dir}")
print(f"CM-DAN results will be saved to: {results_save_dir}")

try:
    X_50k_cVAE = np.load(os.path.join(data_load_dir, 'X_50k_cVAE_dataset.npy'), allow_pickle=True).astype(np.float32)
    y_50k_cVAE = np.load(os.path.join(data_load_dir, 'y_50k_cVAE_labels.npy'), allow_pickle=True).astype(np.float32)
    pair_info_50k = pd.read_csv(os.path.join(data_load_dir, 'pair_info_50k_cVAE.csv'))

    with open(os.path.join(data_load_dir, 'voice_features.txt'), 'r') as f:
        voice_features = f.read().splitlines()
    with open(os.path.join(data_load_dir, 'gait_features.txt'), 'r') as f:
        gait_features = f.read().splitlines()
    voice_dim = len(voice_features)
    gait_dim = len(gait_features)

    # Load the 50k baseline score
    print("Loading 50k baseline score from file...")
    with open(os.path.join(results_save_dir, 'baseline_summary_50k_cvae.txt'), 'r') as f:
        lines = f.readlines()
        for line in lines:
            if "Best Baseline Accuracy" in line:
                best_baseline_50k_cvae_acc = float(line.split(':')[-1].strip().split(' ')[0])
                break

    print(f"✅ 50k cVAE Dataset (X={X_50k_cVAE.shape}) loaded.")
    print(f"   Baseline to Beat: {best_baseline_50k_cvae_acc*100:.2f}%")

except Exception as e:
    print(f"❌ Error loading data files: {e}")
    raise SystemExit("Data loading failed.")

# Create Data Splits and Loaders
print("\n--- Preparing 50k Data Splits (40k Train, 10k Test) ---")
unique_subjects = pair_info_50k['voice_subject'].unique()
subject_labels = [pair_info_50k[pair_info_50k['voice_subject'] == s]['voice_label'].iloc[0] for s in unique_subjects]
train_subjects, test_subjects = train_test_split(unique_subjects, test_size=0.2, random_state=42, stratify=subject_labels)
train_mask = pair_info_50k['voice_subject'].isin(train_subjects)
test_mask = pair_info_50k['voice_subject'].isin(test_subjects)

X_train, y_train = X_50k_cVAE[train_mask].astype(np.float32), y_50k_cVAE[train_mask].astype(np.float32)
X_test, y_test = X_50k_cVAE[test_mask].astype(np.float32), y_50k_cVAE[test_mask].astype(np.float32)

train_dataset = CMDAN_Dataset(X_train, y_train, voice_dim)
test_dataset = CMDAN_Dataset(X_test, y_test, voice_dim)
batch_size = 512
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
print(f"DataLoaders Created: Train={len(train_dataset)}, Test={len(test_dataset)}")

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Train the Champion CM-DAN on 50k cVAE Data
print("\n--- Training Champion CM-DAN on 50k cVAE Dataset ---")
champion_params = {'lr': 0.0005, 'wd': 0.001, 'lambda': 1.0, 'dropout': 0.6}
print(f"Using Champion Params: {champion_params}")

model_50k_cvae = CrossModalDAN_Regularized(voice_dim, gait_dim, dropout_rate=champion_params['dropout']).to(device)
trainer_50k_cvae = CMDANTrainer_Optimized(model_50k_cvae, device,
                                        lambda_domain=champion_params['lambda'],
                                        weight_decay=champion_params['wd'],
                                        learning_rate=champion_params['lr'],
                                        results_dir=results_save_dir)

history_50k_cvae, best_accuracy_50k_cvae = trainer_50k_cvae.train(train_loader, test_loader,
                                                               epochs=150, patience=20, min_delta=0.05)

# Final Results
print("\n--- Final 50k cVAE Experiment Complete ---")
print(f"  50k cVAE Baseline Accuracy:   {best_baseline_50k_cvae_acc*100:.2f}%")
print(f"  50k cVAE CM-DAN Accuracy:     {best_accuracy_50k_cvae:.2f}%")
improvement_50k_cvae = best_accuracy_50k_cvae - (best_baseline_50k_cvae_acc * 100)
print(f"  Performance Improvement: {improvement_50k_cvae:+.2f}%")

# Save Results
print("\n--- Saving Final Results ---")
final_metrics_50k_cvae = {
    'best_baseline_accuracy_percent': best_baseline_50k_cvae_acc * 100,
    'cm_dan_best_accuracy_percent': best_accuracy_50k_cvae,
    'improvement_over_baseline_percent': improvement_50k_cvae,
    'training_params': champion_params,
    'dataset_info': { 'samples': 50000, 'noise_level': 0.10, 'features': 28, 'augmentation': 'cVAE' }
}
metrics_save_path = os.path.join(results_save_dir, 'final_performance_metrics_50k_cvae.json')
with open(metrics_save_path, 'w') as f: json.dump(final_metrics_50k_cvae, f, indent=4)
print(f"✅ Final metrics saved to: {metrics_save_path}")

history_save_path = os.path.join(results_save_dir, 'final_training_history_50k_cvae.pkl')
joblib.dump(history_50k_cvae, history_save_path)
print(f"✅ Training history saved to: {history_save_path}")

if improvement_50k_cvae > 0.5:
    print("\n🎉🎉🎉 SUCCESS: The CM-DAN model significantly outperforms the baseline on the 50k cVAE dataset!")
else:
    print("\n⚠️ FINDING: At 50k cVAE samples, the CM-DAN did not provide a significant boost over the baseline.")

=== Final Experiment: Training Champion CM-DAN on 50k cVAE Dataset ===
   Goal: Beat the new baseline

Loading 50k cVAE data from: /content/drive/MyDrive/Parkinsons_Research_Project/data/federated_50k_cvae_10noise
CM-DAN results will be saved to: /content/drive/MyDrive/Parkinsons_Research_Project/results/phase17_cvae_50k_results
Loading 50k baseline score from file...
✅ 50k cVAE Dataset (X=(50000, 28)) loaded.
   Baseline to Beat: 83.51%

--- Preparing 50k Data Splits (40k Train, 10k Test) ---
DataLoaders Created: Train=40000, Test=10000
Using device: cuda

--- Training Champion CM-DAN on 50k cVAE Dataset ---
Using Champion Params: {'lr': 0.0005, 'wd': 0.001, 'lambda': 1.0, 'dropout': 0.6}
✅ Regularized CM-DAN Architecture Defined (Dropout: 0.6)
✅ Optimized CM-DAN Trainer Initialized:
    λ_domain: 1.0, LR: 0.0005, Weight Decay: 0.001
🚀 Starting CM-DAN training...
    Early Stopping: Patience=20 epochs, Min Delta=0.05%


Epoch 1/150 Training:   0%|          | 0/79 [00:00<?, ?it/s]

Epoch 1/150 | Test Accuracy: 71.39% (New Best! 🎉)


Epoch 2/150 Training:   0%|          | 0/79 [00:00<?, ?it/s]

Epoch 2/150 | Test Accuracy: 86.85% (New Best! 🎉)


Epoch 3/150 Training:   0%|          | 0/79 [00:00<?, ?it/s]

Epoch 3/150 | Test Accuracy: 88.55% (New Best! 🎉)


Epoch 4/150 Training:   0%|          | 0/79 [00:00<?, ?it/s]

Epoch 4/150 | Test Accuracy: 88.89% (New Best! 🎉)


Epoch 5/150 Training:   0%|          | 0/79 [00:00<?, ?it/s]

Epoch 5/150 | Test Accuracy: 89.31% (New Best! 🎉)


Epoch 6/150 Training:   0%|          | 0/79 [00:00<?, ?it/s]

Epoch 6/150 | Test Accuracy: 89.48% (New Best! 🎉)


Epoch 7/150 Training:   0%|          | 0/79 [00:00<?, ?it/s]

Epoch 7/150 | Test Accuracy: 89.48% (No improvement for 1/20)


Epoch 8/150 Training:   0%|          | 0/79 [00:00<?, ?it/s]

Epoch 8/150 | Test Accuracy: 89.60% (New Best! 🎉)


Epoch 9/150 Training:   0%|          | 0/79 [00:00<?, ?it/s]

Epoch 9/150 | Test Accuracy: 89.69% (New Best! 🎉)


Epoch 10/150 Training:   0%|          | 0/79 [00:00<?, ?it/s]

Epoch 10/150 | Test Accuracy: 89.88% (New Best! 🎉)


Epoch 11/150 Training:   0%|          | 0/79 [00:00<?, ?it/s]

Epoch 11/150 | Test Accuracy: 89.59% (No improvement for 1/20)


Epoch 12/150 Training:   0%|          | 0/79 [00:00<?, ?it/s]

Epoch 12/150 | Test Accuracy: 89.42% (No improvement for 2/20)


Epoch 13/150 Training:   0%|          | 0/79 [00:00<?, ?it/s]

Epoch 13/150 | Test Accuracy: 89.38% (No improvement for 3/20)


Epoch 14/150 Training:   0%|          | 0/79 [00:00<?, ?it/s]

Epoch 14/150 | Test Accuracy: 89.67% (No improvement for 4/20)


Epoch 15/150 Training:   0%|          | 0/79 [00:00<?, ?it/s]

Epoch 15/150 | Test Accuracy: 89.67% (No improvement for 5/20)


Epoch 16/150 Training:   0%|          | 0/79 [00:00<?, ?it/s]

Epoch 16/150 | Test Accuracy: 89.74% (No improvement for 6/20)


Epoch 17/150 Training:   0%|          | 0/79 [00:00<?, ?it/s]

Epoch 17/150 | Test Accuracy: 89.73% (No improvement for 7/20)


Epoch 18/150 Training:   0%|          | 0/79 [00:00<?, ?it/s]

Epoch 18/150 | Test Accuracy: 89.80% (No improvement for 8/20)


Epoch 19/150 Training:   0%|          | 0/79 [00:00<?, ?it/s]

Epoch 19/150 | Test Accuracy: 89.79% (No improvement for 9/20)


Epoch 20/150 Training:   0%|          | 0/79 [00:00<?, ?it/s]

Epoch 20/150 | Test Accuracy: 89.92% (No improvement for 10/20)


Epoch 21/150 Training:   0%|          | 0/79 [00:00<?, ?it/s]

Epoch 21/150 | Test Accuracy: 89.63% (No improvement for 11/20)


Epoch 22/150 Training:   0%|          | 0/79 [00:00<?, ?it/s]

Epoch 22/150 | Test Accuracy: 89.51% (No improvement for 12/20)


Epoch 23/150 Training:   0%|          | 0/79 [00:00<?, ?it/s]

Epoch 23/150 | Test Accuracy: 89.89% (No improvement for 13/20)


Epoch 24/150 Training:   0%|          | 0/79 [00:00<?, ?it/s]

Epoch 24/150 | Test Accuracy: 89.68% (No improvement for 14/20)


Epoch 25/150 Training:   0%|          | 0/79 [00:00<?, ?it/s]

Epoch 25/150 | Test Accuracy: 89.70% (No improvement for 15/20)


Epoch 26/150 Training:   0%|          | 0/79 [00:00<?, ?it/s]

Epoch 26/150 | Test Accuracy: 90.02% (New Best! 🎉)


Epoch 27/150 Training:   0%|          | 0/79 [00:00<?, ?it/s]

Epoch 27/150 | Test Accuracy: 89.73% (No improvement for 1/20)


Epoch 28/150 Training:   0%|          | 0/79 [00:00<?, ?it/s]

Epoch 28/150 | Test Accuracy: 89.44% (No improvement for 2/20)


Epoch 29/150 Training:   0%|          | 0/79 [00:00<?, ?it/s]

Epoch 29/150 | Test Accuracy: 89.78% (No improvement for 3/20)


Epoch 30/150 Training:   0%|          | 0/79 [00:00<?, ?it/s]

Epoch 30/150 | Test Accuracy: 89.77% (No improvement for 4/20)


Epoch 31/150 Training:   0%|          | 0/79 [00:00<?, ?it/s]

Epoch 31/150 | Test Accuracy: 89.63% (No improvement for 5/20)


Epoch 32/150 Training:   0%|          | 0/79 [00:00<?, ?it/s]

Epoch 32/150 | Test Accuracy: 89.82% (No improvement for 6/20)


Epoch 33/150 Training:   0%|          | 0/79 [00:00<?, ?it/s]

Epoch 33/150 | Test Accuracy: 89.53% (No improvement for 7/20)


Epoch 34/150 Training:   0%|          | 0/79 [00:00<?, ?it/s]

Epoch 34/150 | Test Accuracy: 89.68% (No improvement for 8/20)


Epoch 35/150 Training:   0%|          | 0/79 [00:00<?, ?it/s]

Epoch 35/150 | Test Accuracy: 89.53% (No improvement for 9/20)


Epoch 36/150 Training:   0%|          | 0/79 [00:00<?, ?it/s]

Epoch 36/150 | Test Accuracy: 89.84% (No improvement for 10/20)


Epoch 37/150 Training:   0%|          | 0/79 [00:00<?, ?it/s]

Epoch 37/150 | Test Accuracy: 89.70% (No improvement for 11/20)


Epoch 38/150 Training:   0%|          | 0/79 [00:00<?, ?it/s]

Epoch 38/150 | Test Accuracy: 89.88% (No improvement for 12/20)


Epoch 39/150 Training:   0%|          | 0/79 [00:00<?, ?it/s]

Epoch 39/150 | Test Accuracy: 89.81% (No improvement for 13/20)


Epoch 40/150 Training:   0%|          | 0/79 [00:00<?, ?it/s]

Epoch 40/150 | Test Accuracy: 89.64% (No improvement for 14/20)


Epoch 41/150 Training:   0%|          | 0/79 [00:00<?, ?it/s]

Epoch 41/150 | Test Accuracy: 89.87% (No improvement for 15/20)


Epoch 42/150 Training:   0%|          | 0/79 [00:00<?, ?it/s]

Epoch 42/150 | Test Accuracy: 89.82% (No improvement for 16/20)


Epoch 43/150 Training:   0%|          | 0/79 [00:00<?, ?it/s]

Epoch 43/150 | Test Accuracy: 89.57% (No improvement for 17/20)


Epoch 44/150 Training:   0%|          | 0/79 [00:00<?, ?it/s]

Epoch 44/150 | Test Accuracy: 89.68% (No improvement for 18/20)


Epoch 45/150 Training:   0%|          | 0/79 [00:00<?, ?it/s]

Epoch 45/150 | Test Accuracy: 89.81% (No improvement for 19/20)


Epoch 46/150 Training:   0%|          | 0/79 [00:00<?, ?it/s]

Epoch 46/150 | Test Accuracy: 89.72% (No improvement for 20/20)

✋ Early stopping triggered after 46 epochs.

Loading best model from Epoch 26 with Accuracy: 90.02%
🏆 Training Finished. Best Validated Test Accuracy: 90.02%

--- Final 50k cVAE Experiment Complete ---
  50k cVAE Baseline Accuracy:   83.51%
  50k cVAE CM-DAN Accuracy:     90.02%
  Performance Improvement: +6.51%

--- Saving Final Results ---
✅ Final metrics saved to: /content/drive/MyDrive/Parkinsons_Research_Project/results/phase17_cvae_50k_results/final_performance_metrics_50k_cvae.json
✅ Training history saved to: /content/drive/MyDrive/Parkinsons_Research_Project/results/phase17_cvae_50k_results/final_training_history_50k_cvae.pkl

🎉🎉🎉 SUCCESS: The CM-DAN model significantly outperforms the baseline on the 50k cVAE dataset!
