In [1]:
# Cell 1: Generate and Save 50k Voice and 50k Gait Datasets

import numpy as np
import pandas as pd
import os
import sys

print("=== Step 1: Generating 50k Voice and 50k Gait Datasets ===")

# --- 1. Define Project Paths ---
project_root = "/content/drive/MyDrive/Parkinsons_Research_Project"
# We will save these large datasets in a new, specific folder
data_save_dir = os.path.join(project_root, "data", "generated_50k")
os.makedirs(data_save_dir, exist_ok=True)
print(f"Data will be saved to: {data_save_dir}")

# --- 2. Define the Data Generator Class ---
class UltraRealisticDataGenerator:
    """Creates a challenging dataset with substantial class overlap."""

    def __init__(self, random_state=42):
        self.random_state = random_state
        np.random.seed(random_state)
        # Define feature lists
        self.voice_features = ['Jitter(%)', 'Jitter(Abs)', 'Jitter:RAP', 'Jitter:PPQ5', 'Shimmer',
                               'Shimmer(dB)', 'NHR', 'HNR', 'RPDE', 'DFA', 'PPE']
        self.gait_features = ['stride_time_mean', 'stride_time_std', 'step_time_mean', 'step_time_std',
                              'cadence', 'velocity', 'step_length', 'stride_length', 'force_asymmetry',
                              'step_time_cv', 'stride_time_cv', 'left_force_mean', 'right_force_mean',
                              'left_force_std', 'right_force_std', 'swing_time_ratio', 'stance_time_ratio']

    def generate_ultra_realistic_voice_data(self, n_controls=2500, n_parkinsons=2500, n_recordings=10):
        print(f"\nüé§ Generating {n_controls * n_recordings + n_parkinsons * n_recordings} realistic voice samples...")
        n_subjects = n_controls + n_parkinsons

        # MAJOR OVERLAP parameters
        voice_params = {
            'Jitter(%)': (1.0, 0.6), 'Jitter(Abs)': (0.0001, 0.00005), 'Jitter:RAP': (0.7, 0.4),
            'Jitter:PPQ5': (0.75, 0.45), 'Shimmer': (4.5, 2.0), 'Shimmer(dB)': (0.35, 0.15),
            'NHR': (0.13, 0.08), 'HNR': (19, 6), 'RPDE': (0.55, 0.2),
            'DFA': (0.63, 0.18), 'PPE': (0.16, 0.08)
        }

        voice_data = []
        for subject_id in range(1, n_subjects + 1):
            is_parkinson = 1 if subject_id > n_controls else 0
            for recording_id in range(1, n_recordings + 1):
                subject_data = {
                    'subject#': subject_id,
                    'recording_id': recording_id,
                    'true_label': is_parkinson
                }
                for feature in self.voice_features:
                    shared_mean, shared_std = voice_params[feature]
                    base_value = np.random.normal(shared_mean, shared_std * 1.8)
                    class_bias = np.random.normal(0.1, 0.05) if is_parkinson else np.random.normal(-0.1, 0.05)
                    adjusted_value = max(0, base_value * (1 + class_bias * 0.1))
                    subject_data[feature] = adjusted_value
                voice_data.append(subject_data)

        voice_df = pd.DataFrame(voice_data)
        print(f"‚úÖ Generated voice data: {voice_df.shape}")
        return voice_df

    def generate_ultra_realistic_gait_data(self, n_controls=25000, n_parkinsons=25000):
        print(f"\nüö∂ Generating {n_controls + n_parkinsons} realistic gait subjects...")
        n_subjects = n_controls + n_parkinsons

        # MAJOR overlap parameters
        gait_params = {
            'stride_time_mean': (1.2, 0.15), 'stride_time_std': (0.10, 0.04), 'step_time_mean': (0.6, 0.08),
            'step_time_std': (0.06, 0.03), 'cadence': (105, 12), 'velocity': (1.05, 0.3),
            'step_length': (0.58, 0.12), 'stride_length': (1.15, 0.22), 'force_asymmetry': (30, 20),
            'step_time_cv': (0.10, 0.04), 'stride_time_cv': (0.08, 0.04), 'left_force_mean': (425, 80),
            'right_force_mean': (420, 85), 'left_force_std': (110, 35), 'right_force_std': (115, 38),
            'swing_time_ratio': (0.39, 0.05), 'stance_time_ratio': (0.61, 0.05)
        }

        gait_data = []
        for i in range(n_subjects):
            is_parkinson = 1 if i >= n_controls else 0
            subject_data = {
                'subject_id': f"Sub_{i+1:05d}",
                'true_label': is_parkinson
            }
            for feature in self.gait_features:
                shared_mean, shared_std = gait_params[feature]
                base_value = np.random.normal(shared_mean, shared_std * 1.5)
                class_bias = np.random.normal(0.08, 0.03) if is_parkinson else np.random.normal(-0.08, 0.03)
                adjusted_value = max(0, base_value * (1 + class_bias * 0.05))
                subject_data[feature] = adjusted_value
            gait_data.append(subject_data)

        gait_df = pd.DataFrame(gait_data)
        print(f"‚úÖ Generated gait data: {gait_df.shape}")
        return gait_df

# --- 3. Execute Generation and Save Files ---

# Instantiate the generator
data_generator_50k = UltraRealisticDataGenerator()

# --- Generate and Save VOICE Data ---
# We generate 5,000 subjects (2500/2500) with 10 recordings each = 50,000 samples
try:
    voice_df_50k = data_generator_50k.generate_ultra_realistic_voice_data(
        n_controls=2500, n_parkinsons=2500, n_recordings=10
    )
    voice_save_path = os.path.join(data_save_dir, "synthetic_voice_data_50k.csv")
    voice_df_50k.to_csv(voice_save_path, index=False)
    print(f"‚úÖ Voice dataset (50,000 samples) saved to: {voice_save_path}")
    # Verify
    print(f"   Voice class distribution: \n{voice_df_50k['true_label'].value_counts()}")
except Exception as e:
    print(f"‚ùå Error generating or saving voice data: {e}")

# --- Generate and Save GAIT Data ---
# We generate 50,000 subjects (25000/25000) with 1 recording each = 50,000 samples
try:
    gait_df_50k = data_generator_50k.generate_ultra_realistic_gait_data(
        n_controls=25000, n_parkinsons=25000
    )
    gait_save_path = os.path.join(data_save_dir, "synthetic_gait_data_50k.csv")
    gait_df_50k.to_csv(gait_save_path, index=False)
    print(f"‚úÖ Gait dataset (50,000 samples) saved to: {gait_save_path}")
    # Verify
    print(f"   Gait class distribution: \n{gait_df_50k['true_label'].value_counts()}")
except Exception as e:
    print(f"‚ùå Error generating or saving gait data: {e}")

print("\n--- Step 1 Complete ---")
print("You now have two separate, balanced 50k-sample CSV files in your Google Drive.")

=== Step 1: Generating 50k Voice and 50k Gait Datasets ===
Data will be saved to: /content/drive/MyDrive/Parkinsons_Research_Project/data/generated_50k

üé§ Generating 50000 realistic voice samples...
‚úÖ Generated voice data: (50000, 14)
‚úÖ Voice dataset (50,000 samples) saved to: /content/drive/MyDrive/Parkinsons_Research_Project/data/generated_50k/synthetic_voice_data_50k.csv
   Voice class distribution: 
true_label
0    25000
1    25000
Name: count, dtype: int64

üö∂ Generating 50000 realistic gait subjects...
‚úÖ Generated gait data: (50000, 19)
‚úÖ Gait dataset (50,000 samples) saved to: /content/drive/MyDrive/Parkinsons_Research_Project/data/generated_50k/synthetic_gait_data_50k.csv
   Gait class distribution: 
true_label
0    25000
1    25000
Name: count, dtype: int64

--- Step 1 Complete ---
You now have two separate, balanced 50k-sample CSV files in your Google Drive.


In [2]:
# Cell 3: Create 100k Federated Dataset & Save

import numpy as np
import pandas as pd
import joblib
import os
import sys
from sklearn.preprocessing import StandardScaler
from tqdm.notebook import tqdm # Import tqdm for a progress bar

print("=== Step 3: Creating 100k Federated Dataset ===")

# --- 1. Define Paths ---
project_root = "/content/drive/MyDrive/Parkinsons_Research_Project"
data_load_dir = os.path.join(project_root, "data", "generated_50k") # Load 50k CSVs
# Define a new directory for this 100k dataset
data_save_dir = os.path.join(project_root, "data", "federated_100k")
os.makedirs(data_save_dir, exist_ok=True)

print(f"Loading raw 50k data from: {data_load_dir}")
print(f"Final 100k federated dataset will be saved to: {data_save_dir}")

# --- 2. Load Raw Data ---
try:
    voice_df_50k = pd.read_csv(os.path.join(data_load_dir, "synthetic_voice_data_50k.csv"))
    gait_df_50k = pd.read_csv(os.path.join(data_load_dir, "synthetic_gait_data_50k.csv"))
    print(f"‚úÖ Loaded 50k voice data: {voice_df_50k.shape}")
    print(f"‚úÖ Loaded 50k gait data: {gait_df_50k.shape}")
except FileNotFoundError:
    print(f"‚ùå Error: Raw 50k CSV files not found. Please re-run Step 1.")
    raise SystemExit("Data loading failed.")

# --- 3. Define Feature Lists ---
voice_features = ['Jitter(%)', 'Jitter(Abs)', 'Jitter:RAP', 'Jitter:PPQ5', 'Shimmer',
                  'Shimmer(dB)', 'NHR', 'HNR', 'RPDE', 'DFA', 'PPE']
gait_features = ['stride_time_mean', 'stride_time_std', 'step_time_mean', 'step_time_std',
                 'cadence', 'velocity', 'step_length', 'stride_length', 'force_asymmetry',
                 'step_time_cv', 'stride_time_cv', 'left_force_mean', 'right_force_mean',
                 'left_force_std', 'right_force_std', 'swing_time_ratio', 'stance_time_ratio']

# --- 4. Process and Scale Voice Data (5,000 subjects) ---
print("\n--- Processing Voice Data ---")
subject_voice = voice_df_50k.groupby('subject#')[voice_features].mean().reset_index()
subject_voice_labels = voice_df_50k.groupby('subject#')['true_label'].first().values
subject_voice['true_label'] = subject_voice_labels
print(f"Aggregated voice data to {subject_voice.shape} subjects (2.5k C, 2.5k P)")
voice_scaler = StandardScaler()
voice_scaled = voice_scaler.fit_transform(subject_voice[voice_features])
print("Voice features scaled.")

# --- 5. Process and Scale Gait Data (50,000 subjects) ---
print("\n--- Processing Gait Data ---")
gait_processed = gait_df_50k.copy()
gait_processed['true_label'] = gait_df_50k['true_label']
gait_scaler = StandardScaler()
gait_scaled = gait_scaler.fit_transform(gait_processed[gait_features])
print("Gait features scaled.")

# --- 6. Create Federated Pairs (100k pairs) ---
n_pairs = 100000 # <-- This is our new target
label_noise = 0.20 # 20% label noise
feature_noise = 0.12 # 12% feature noise
np.random.seed(42)

multimodal_data = []
multimodal_labels = []
pair_info = []
print(f"\n--- Creating {n_pairs} Federated Pairs (with {label_noise*100:.0f}% label noise) ---")

# Pre-filter gait data for efficiency
gait_controls = gait_scaled[gait_processed['true_label'] == 0]
gait_parkinsons = gait_scaled[gait_processed['true_label'] == 1]
gait_controls_info = gait_processed[gait_processed['true_label'] == 0]
gait_parkinsons_info = gait_processed[gait_processed['true_label'] == 1]
n_voice_subjects = len(subject_voice)

# Use tqdm for a progress bar, as this will take some time
for i in tqdm(range(n_pairs), desc="Generating 100k Pairs"):
    # We re-use the 5,000 voice subjects by looping over them
    voice_idx = i % n_voice_subjects

    voice_label = subject_voice.iloc[voice_idx]['true_label']
    voice_data_scaled = voice_scaled[voice_idx]

    if np.random.random() < label_noise:
        # Mismatched pair
        pair_type = "MISMATCHED"
        if voice_label == 0: # Voice is Control, pair with PD Gait
            gait_idx = np.random.randint(len(gait_parkinsons))
            gait_data_scaled = gait_parkinsons[gait_idx]
            gait_info_row = gait_parkinsons_info.iloc[gait_idx]
        else: # Voice is PD, pair with Control Gait
            gait_idx = np.random.randint(len(gait_controls))
            gait_data_scaled = gait_controls[gait_idx]
            gait_info_row = gait_controls_info.iloc[gait_idx]
    else:
        # Matched pair
        pair_type = "MATCHED"
        if voice_label == 0: # Voice is Control, pair with Control Gait
            gait_idx = np.random.randint(len(gait_controls))
            gait_data_scaled = gait_controls[gait_idx]
            gait_info_row = gait_controls_info.iloc[gait_idx]
        else: # Voice is PD, pair with PD Gait
            gait_idx = np.random.randint(len(gait_parkinsons))
            gait_data_scaled = gait_parkinsons[gait_idx]
            gait_info_row = gait_parkinsons_info.iloc[gait_idx]

    # Add feature noise
    voice_noisy = voice_data_scaled + np.random.normal(0, feature_noise, voice_data_scaled.shape)
    gait_noisy = gait_data_scaled + np.random.normal(0, feature_noise, gait_data_scaled.shape)

    combined_features = np.concatenate([voice_noisy, gait_noisy])
    multimodal_data.append(combined_features)
    multimodal_labels.append(voice_label) # Use voice label as ground truth

    pair_info.append({
        'pair_id': i,
        'voice_subject': subject_voice.iloc[voice_idx]['subject#'],
        'voice_label': int(voice_label),
        'gait_subject_id': gait_info_row['subject_id'],
        'gait_label': int(gait_info_row['true_label']),
        'pair_type': pair_type,
        'label_match': int(voice_label == gait_info_row['true_label'])
    })

X_federated_100k = np.array(multimodal_data)
y_federated_100k = np.array(multimodal_labels)
pair_info_100k = pd.DataFrame(pair_info)

print(f"\n‚úÖ Final Federated Dataset Created:")
print(f"    X Shape: {X_federated_100k.shape}")
print(f"    y Shape: {y_federated_100k.shape}")
print(f"    Class Distribution: {dict(zip(*np.unique(y_federated_100k, return_counts=True)))}")
print(f"    Label Match Rate: {pair_info_100k['label_match'].mean():.3f}")

# --- 7. Save Final Federated Dataset and Scalers ---
print(f"\n--- Saving Federated 100k Dataset Files to {data_save_dir} ---")
try:
    np.save(os.path.join(data_save_dir, 'X_federated_100k.npy'), X_federated_100k)
    np.save(os.path.join(data_save_dir, 'y_federated_100k.npy'), y_federated_100k)
    pair_info_100k.to_csv(os.path.join(data_save_dir, 'pair_info_100k.csv'), index=False)

    # We save the scalers that were fit on the source data
    joblib.dump(voice_scaler, os.path.join(data_save_dir, 'voice_scaler.pkl'))
    joblib.dump(gait_scaler, os.path.join(data_save_dir, 'gait_scaler.pkl'))

    # Save feature lists
    with open(os.path.join(data_save_dir, 'voice_features.txt'), 'w') as f: f.write('\n'.join(voice_features))
    with open(os.path.join(data_save_dir, 'gait_features.txt'), 'w') as f: f.write('\n'.join(gait_features))

    print("‚úÖ All federated 100k data files saved successfully.")
    print("   Ready for next steps (Baseline Evaluation and CM-DAN Training).")
except Exception as e:
    print(f"‚ùå Error saving 100k federated data: {e}")

=== Step 3: Creating 100k Federated Dataset ===
Loading raw 50k data from: /content/drive/MyDrive/Parkinsons_Research_Project/data/generated_50k
Final 100k federated dataset will be saved to: /content/drive/MyDrive/Parkinsons_Research_Project/data/federated_100k
‚úÖ Loaded 50k voice data: (50000, 14)
‚úÖ Loaded 50k gait data: (50000, 19)

--- Processing Voice Data ---
Aggregated voice data to (5000, 13) subjects (2.5k C, 2.5k P)
Voice features scaled.

--- Processing Gait Data ---
Gait features scaled.

--- Creating 100000 Federated Pairs (with 20% label noise) ---


Generating 100k Pairs:   0%|          | 0/100000 [00:00<?, ?it/s]


‚úÖ Final Federated Dataset Created:
    X Shape: (100000, 28)
    y Shape: (100000,)
    Class Distribution: {np.float64(0.0): np.int64(50000), np.float64(1.0): np.int64(50000)}
    Label Match Rate: 0.802

--- Saving Federated 100k Dataset Files to /content/drive/MyDrive/Parkinsons_Research_Project/data/federated_100k ---
‚úÖ All federated 100k data files saved successfully.
   Ready for next steps (Baseline Evaluation and CM-DAN Training).
