# Data Exploration Notebook for Schizophrenia Detection

This notebook is optimized for Google Colab and provides tools for exploring fMRI and MEG data for schizophrenia detection.

## Features:
- GPU configuration and memory management
- Google Drive integration for data storage
- Interactive visualizations
- Data quality checks and validation
- Progress tracking for long operations

## Setup and Configuration

### 1. Environment Setup

In [None]:
# Detect the runtime by probing for the google.colab package
try:
    import google.colab
except ImportError:
    # Package missing: treat this as a local (non-Colab) session
    IN_COLAB = False
    print("Not running in Google Colab")
else:
    IN_COLAB = True
    print("Running in Google Colab")

In [None]:
# Resolve the project directory: a Google Drive folder on Colab,
# the parent of the current directory otherwise
import os

if not IN_COLAB:
    project_path = os.path.abspath('../')
    print(f"Local project directory: {project_path}")
else:
    from google.colab import drive

    # Mount only when the mount point does not exist yet
    if os.path.exists('/content/drive'):
        print("Google Drive already mounted")
    else:
        print("Mounting Google Drive...")
        drive.mount('/content/drive')

    # Data and model checkpoints live in a dedicated Drive folder
    project_path = '/content/drive/MyDrive/schizophrenia_detection'
    os.makedirs(project_path, exist_ok=True)
    print(f"Project directory: {project_path}")

In [None]:
# Install required packages specific to Colab environment
# The `!pip` lines are IPython shell escapes, so this cell only runs inside a
# notebook kernel. `-q` keeps pip's output quiet.
if IN_COLAB:
    print("Installing required packages...")
    
    # Core packages
    !pip install nibabel nilearn mne scikit-learn matplotlib seaborn tqdm -q
    
    # Interactive visualization packages
    !pip install plotly ipywidgets -q
    
    # Memory management packages
    !pip install psutil -q
    
    # Advanced neuroimaging packages
    !pip install dipy pydicom -q
    
    # NOTE(review): this message is printed unconditionally; the exit status
    # of the pip commands above is never checked.
    print("Packages installed successfully!")
else:
    # Local environments are expected to provide these packages already
    print("Skipping package installation in local environment")

### 2. GPU Configuration and Memory Management

In [None]:
# Check GPU availability and configure memory
if IN_COLAB:
    import tensorflow as tf
    from psutil import virtual_memory

    print(f"TensorFlow version: {tf.__version__}")

    # Enumerate GPUs without initializing them. The deprecated
    # tf.test.is_gpu_available() (and tf.test.gpu_device_name()) create the
    # devices, after which memory growth can no longer be changed.
    gpus = tf.config.list_physical_devices('GPU')
    gpu_available = bool(gpus)
    print(f"GPU available: {gpu_available}")

    if gpu_available:
        # Memory growth must be configured before the first GPU use,
        # otherwise TensorFlow raises RuntimeError.
        try:
            for gpu in gpus:
                tf.config.experimental.set_memory_growth(gpu, True)
            print("GPU memory growth enabled")
        except RuntimeError as e:
            print(f"Error setting GPU memory growth: {e}")

        # Safe to initialize the device now that growth is configured
        gpu_name = tf.test.gpu_device_name()
        print(f"GPU device: {gpu_name}")

    # Report installed RAM (virtual_memory().total is the machine total,
    # not the currently free amount)
    ram_gb = virtual_memory().total / 1e9
    print(f"Total RAM: {ram_gb:.2f} GB")

    if ram_gb < 12:
        print("WARNING: Low RAM detected. Consider reducing batch sizes and data loading.")
else:
    print("Skipping GPU configuration in local environment")

In [None]:
# Memory management utilities
import gc
import psutil
import time
from tqdm.notebook import tqdm

def check_memory_usage():
    """Print the resident set size of this process and return it.

    Returns:
        The value of ``psutil.Process().memory_info().rss`` divided by 1e6
        (the same number that is printed with the "MB" label).
    """
    rss_mb = psutil.Process().memory_info().rss / 1e6
    print(f"Memory usage: {rss_mb:.2f} MB")
    return rss_mb

def clear_memory():
    """Run the garbage collector and, if TensorFlow is loaded, reset Keras.

    TensorFlow is only imported (as ``tf``) by the Colab GPU cell, so it is
    looked up at call time rather than assumed to exist. This avoids a
    NameError when that cell was skipped or its import failed.
    """
    gc.collect()

    # `tf` is a notebook global bound only when the GPU cell has run
    tf_module = globals().get("tf")
    if tf_module is not None:
        tf_module.keras.backend.clear_session()

    print("Memory cleared")

def monitor_memory(func):
    """Decorator that reports memory usage before and after calling ``func``.

    The wrapper calls ``check_memory_usage()`` before and after the wrapped
    call, prints the difference, and returns the wrapped function's result
    unchanged.

    Args:
        func: The callable to wrap.

    Returns:
        A wrapper with the same call signature that keeps ``func``'s name and
        docstring (via ``functools.wraps``).
    """
    import functools  # local import: this cell does not import functools

    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        start_mem = check_memory_usage()
        result = func(*args, **kwargs)
        end_mem = check_memory_usage()
        print(f"Memory change: {end_mem - start_mem:.2f} MB")
        return result

    return wrapper

print("Memory management utilities loaded")

### 3. Import Libraries and Configuration

In [None]:
# Import core libraries
import os
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from pathlib import Path

# Import neuroimaging libraries
import nibabel as nib
from nilearn import plotting, image, datasets
try:
    import mne
    MNE_AVAILABLE = True
except ImportError:
    MNE_AVAILABLE = False
    print("MNE not available for MEG data processing")

# Import visualization libraries
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
import ipywidgets as widgets
from IPython.display import display, HTML

# Configure warnings and display
warnings.filterwarnings('ignore')
plt.rcParams['figure.figsize'] = (12, 8)
plt.rcParams['figure.dpi'] = 100
%matplotlib inline

print("Libraries imported successfully")

In [None]:
# Change to project directory and import project modules
# Guard the append so re-running this cell does not grow sys.path
if project_path not in sys.path:
    sys.path.append(project_path)
os.chdir(project_path)
print(f"Current working directory: {os.getcwd()}")

# Import project modules
try:
    from config import default_config
    from utils.file_utils import list_files, load_json, save_json
    from utils.data_utils import normalize_data, resize_data
    from data_processing.data_loader import create_data_generator
    from data_processing.fmri_preprocessing import preprocess_fmri
    from visualization.interactive_plots import InteractivePlotter
    from visualization.result_plots import ResultPlotter
    print("Project modules imported successfully")
except ImportError as e:
    print(f"Warning: Could not import some project modules: {e}")
    print("Using minimal configuration for exploration")

    # Create minimal configuration for testing
    class MinimalConfig:
        """Stand-in for the project config exposing only the attributes
        read by this notebook (``data.*`` and ``visualization.output_dir``)."""

        def __init__(self):
            self.data = type('DataConfig', (), {
                'data_root': './data',
                'fmri_data_dir': './data/fmri',
                'meg_data_dir': './data/meg',
                'fmri_shape': (96, 96, 96, 4),
                'meg_shape': (306, 100, 1000)
            })()
            self.visualization = type('VisualizationConfig', (), {
                'output_dir': './visualizations'
            })()

    default_config = MinimalConfig()

    # The file-scanning cells call list_files() unconditionally, so provide a
    # stand-in when the project helper could not be imported. It is defined
    # only if missing, so a successfully imported helper is never replaced.
    if 'list_files' not in globals():
        def list_files(directory, extension=''):
            """Return sorted paths in ``directory`` whose names end with
            ``extension``; an empty list when the directory does not exist.

            NOTE(review): assumed to mirror utils.file_utils.list_files
            (directory, suffix) -> list of paths; confirm against the
            project helper.
            """
            if not os.path.isdir(directory):
                return []
            return sorted(
                os.path.join(directory, name)
                for name in os.listdir(directory)
                if name.endswith(extension)
            )

### 4. Configuration for Data Exploration

In [None]:
# Central knobs for this exploration session; edit the values here
EXPLORATION_CONFIG = dict(
    # Where the data lives (taken from the active configuration)
    data_root=default_config.data.data_root,
    fmri_dir=default_config.data.fmri_data_dir,
    meg_dir=default_config.data.meg_data_dir,

    # How much to load and which outputs to produce
    max_subjects_to_load=10,  # Limit for memory management
    sample_voxels=10000,  # Number of voxels to sample for statistics
    visualize_slices=True,
    create_interactive_plots=True,

    # Figure appearance
    figure_dpi=150,
    figure_size=(12, 8),
    colormap='viridis',

    # Memory handling
    clear_memory_after_each_subject=True,
    use_memory_mapping=True,
)

# Push the figure settings into matplotlib's defaults
plt.rcParams.update({
    'figure.dpi': EXPLORATION_CONFIG['figure_dpi'],
    'figure.figsize': EXPLORATION_CONFIG['figure_size'],
})

# Make sure the data root and the visualization output folder exist
os.makedirs(EXPLORATION_CONFIG['data_root'], exist_ok=True)
os.makedirs(default_config.visualization.output_dir, exist_ok=True)

# Echo the effective settings
print("Exploration configuration set:")
for key, value in EXPLORATION_CONFIG.items():
    print(f"  {key}: {value}")

## Data Loading and Exploration

### 1. fMRI Data Exploration

In [None]:
# Discover fMRI volumes; fall back to one synthetic volume when none exist
print("Scanning for fMRI files...")
fmri_files = list_files(EXPLORATION_CONFIG['fmri_dir'], '.nii.gz')

if not fmri_files:
    print("No fMRI files found. Creating sample data for demonstration.")
    os.makedirs(EXPLORATION_CONFIG['fmri_dir'], exist_ok=True)

    # Random volume with the configured shape and an identity affine
    sample_shape = default_config.data.fmri_shape
    sample_data = np.random.randn(*sample_shape)
    sample_path = os.path.join(EXPLORATION_CONFIG['fmri_dir'], 'sample_fmri.nii.gz')
    sample_img = nib.Nifti1Image(sample_data, affine=np.eye(4))
    nib.save(sample_img, sample_path)

    fmri_files = [sample_path]
    print(f"Created sample fMRI file: {sample_path}")

print(f"Found {len(fmri_files)} fMRI files")
if fmri_files:
    print(f"Example file: {fmri_files[0]}")

    # Keep at most the configured number of subjects
    if EXPLORATION_CONFIG['max_subjects_to_load'] < len(fmri_files):
        fmri_files = fmri_files[:EXPLORATION_CONFIG['max_subjects_to_load']]
        print(f"Limited to {len(fmri_files)} files for memory management")

In [None]:
# Load and examine fMRI data with memory management
@monitor_memory
def load_fmri_data(file_path, use_memory_mapping=True):
    """Load an fMRI volume and return the image together with its data.

    Args:
        file_path: Path of the image file passed to ``nib.load``.
        use_memory_mapping: When true, request read-only memory mapping
            (``mmap='r'``); when false, disable memory mapping.

    Returns:
        ``(img, data)`` where ``data`` is ``img.get_fdata()``, or
        ``(None, None)`` if loading failed (the error is printed).
    """
    try:
        # nibabel accepts only True, False, 'c' or 'r' for `mmap`. The
        # previous value 'r+' was rejected, so every memory-mapped load fell
        # into the except branch below.
        img = nib.load(file_path, mmap='r' if use_memory_mapping else False)

        # NOTE(review): get_fdata() returns the whole array as floating
        # point, so memory mapping presumably does not reduce peak usage
        # past this call; confirm if memory is a concern.
        data = img.get_fdata()
        return img, data
    except Exception as e:
        print(f"Error loading {file_path}: {e}")
        return None, None

# Inspect the first fMRI file in detail
if not fmri_files:
    print("No fMRI files available for analysis")
else:
    print(f"\nLoading fMRI data from: {fmri_files[0]}")
    fmri_img, fmri_data = load_fmri_data(
        fmri_files[0],
        use_memory_mapping=EXPLORATION_CONFIG['use_memory_mapping'],
    )

    # load_fmri_data returns (None, None) on failure; report only on success
    if fmri_data is not None:
        print(f"fMRI data shape: {fmri_data.shape}")
        print(f"Data type: {fmri_data.dtype}")
        print(f"Data range: [{fmri_data.min():.4f}, {fmri_data.max():.4f}]")
        print(f"Affine matrix shape: {fmri_img.affine.shape}")
        print(f"Voxel size: {fmri_img.header.get_zooms()}")

        # In-memory size of the loaded array
        data_size_mb = fmri_data.nbytes / 1e6
        print(f"Data size: {data_size_mb:.2f} MB")

### 2. fMRI Data Visualization

In [None]:
# Visualize fMRI slices with interactive controls
# fmri_data is None when loading failed, so check it before plotting
if fmri_files and fmri_data is not None and EXPLORATION_CONFIG['visualize_slices']:
    def plot_fmri_slices(data, slice_indices=None):
        """Plot one sagittal, one coronal and one axial slice side by side.

        Args:
            data: 3D array, or 4D array whose first volume (index 0 on the
                last axis) is plotted.
            slice_indices: ``[sagittal, coronal, axial]`` indices along axes
                0, 1 and 2. Defaults to the middle slice of each axis.
        """
        if slice_indices is None:
            # Use middle slices
            slice_indices = [
                data.shape[0] // 2,  # Sagittal
                data.shape[1] // 2,  # Coronal
                data.shape[2] // 2   # Axial
            ]

        # Handle 4D data (take first time point)
        data_3d = data[..., 0] if len(data.shape) == 4 else data

        planes = [
            ('Sagittal', data_3d[slice_indices[0], :, :]),
            ('Coronal', data_3d[:, slice_indices[1], :]),
            ('Axial', data_3d[:, :, slice_indices[2]]),
        ]

        fig, axes = plt.subplots(1, 3, figsize=(15, 5))
        for ax, index, (label, plane) in zip(axes, slice_indices, planes):
            ax.imshow(plane, cmap=EXPLORATION_CONFIG['colormap'], origin='lower')
            ax.set_title(f'{label} slice {index}')
            ax.axis('off')

        plt.tight_layout()
        plt.show()

    # Plot slices
    plot_fmri_slices(fmri_data)

    # Interactive slice viewer
    if EXPLORATION_CONFIG['create_interactive_plots']:
        # Imported under a private alias so the viewer still renders if the
        # notebook-level name `display` has been rebound by another cell.
        from IPython.display import display as show_widget

        def interactive_slice_viewer(sagittal_idx, coronal_idx, axial_idx):
            """Redraw the three slices for the slider positions."""
            plot_fmri_slices(fmri_data, [sagittal_idx, coronal_idx, axial_idx])

        # Create interactive widgets
        sagittal_slider = widgets.IntSlider(
            min=0, max=fmri_data.shape[0]-1,
            value=fmri_data.shape[0]//2,
            description='Sagittal:'
        )
        coronal_slider = widgets.IntSlider(
            min=0, max=fmri_data.shape[1]-1,
            value=fmri_data.shape[1]//2,
            description='Coronal:'
        )
        axial_slider = widgets.IntSlider(
            min=0, max=fmri_data.shape[2]-1,
            value=fmri_data.shape[2]//2,
            description='Axial:'
        )

        # The widget is built inside an `if` block, so the notebook will not
        # render it automatically; it has to be displayed explicitly.
        slice_viewer = widgets.interactive(
            interactive_slice_viewer,
            sagittal_idx=sagittal_slider,
            coronal_idx=coronal_slider,
            axial_idx=axial_slider
        )
        show_widget(slice_viewer)

In [None]:
# Use nilearn for advanced visualization
# fmri_data is None when loading failed, so check it before plotting
if fmri_files and fmri_data is not None:
    try:
        print("Creating advanced nilearn visualizations...")

        # nilearn's plotting functions expect a 3D image, so reduce a 4D
        # series to its first time point and use that for both plots.
        if len(fmri_data.shape) == 4:
            stat_map = image.new_img_like(fmri_img, fmri_data[..., 0])
        else:
            stat_map = fmri_img

        # An integer cut_coords means "number of cuts" and needs a single-axis
        # display mode; the default ortho mode expects an (x, y, z) tuple.
        # The result is bound to `anat_display`, not `display`, so
        # IPython's display() stays usable in later cells.
        anat_display = plotting.plot_anat(
            stat_map,
            title="fMRI Anatomical View",
            display_mode='z',
            cut_coords=8
        )

        # Save before showing: once the inline backend has shown a figure it
        # is closed and can no longer be written out.
        output_dir = default_config.visualization.output_dir
        os.makedirs(output_dir, exist_ok=True)

        anat_path = os.path.join(output_dir, 'fmri_anatomical.png')
        anat_display.savefig(anat_path)
        print(f"Saved anatomical view to {anat_path}")
        plotting.show()

        # Create glass brain visualization
        plotting.plot_glass_brain(
            stat_map,
            title='Glass Brain View',
            display_mode='ortho',
            colorbar=True
        )
        plotting.show()

    except Exception as e:
        print(f"Error in nilearn visualization: {e}")

### 3. fMRI Data Statistics and Quality Checks

In [None]:
# Descriptive statistics for the loaded fMRI volume
if fmri_files:
    print("\n=== fMRI DATA STATISTICS ===")

    # For a 4D series, summarize the first volume and keep the full series
    # around for the time-series checks
    if fmri_data.ndim == 4:
        print(f"4D data detected with {fmri_data.shape[3]} time points")
        data_3d, time_series_data = fmri_data[..., 0], fmri_data
    else:
        data_3d, time_series_data = fmri_data, None

    # One-dimensional copy of the volume, then drop zero-valued voxels
    # (treated as background)
    data_flat = data_3d.flatten()
    data_nonzero = data_flat[data_flat != 0]

    # Counts
    print(f"Total voxels: {len(data_flat):,}")
    print(f"Non-zero voxels: {len(data_nonzero):,} ({len(data_nonzero)/len(data_flat)*100:.2f}%)")

    # Location and spread
    print(f"Mean intensity: {data_nonzero.mean():.4f} ± {data_nonzero.std():.4f}")
    print(f"Min intensity: {data_nonzero.min():.4f}")
    print(f"Max intensity: {data_nonzero.max():.4f}")
    print(f"Median: {np.median(data_nonzero):.4f}")
    print(f"25th percentile: {np.percentile(data_nonzero, 25):.4f}")
    print(f"75th percentile: {np.percentile(data_nonzero, 75):.4f}")

    # Shape of the distribution (pandas estimators)
    print(f"Skewness: {pd.Series(data_nonzero).skew():.4f}")
    print(f"Kurtosis: {pd.Series(data_nonzero).kurtosis():.4f}")

In [None]:
# Plot histogram of voxel intensities
if fmri_files:
    fig, (ax_linear, ax_log) = plt.subplots(1, 2, figsize=(12, 6))

    # Main histogram
    ax_linear.hist(data_nonzero, bins=100, alpha=0.7, density=True, color='skyblue', edgecolor='black')
    ax_linear.set_title('Distribution of Voxel Intensities')
    ax_linear.set_xlabel('Intensity')
    ax_linear.set_ylabel('Density')
    ax_linear.grid(True, alpha=0.3)

    # Log-scale histogram for better visualization of tail
    ax_log.hist(data_nonzero, bins=100, alpha=0.7, density=True, color='lightcoral', edgecolor='black')
    ax_log.set_title('Distribution of Voxel Intensities (Log Scale)')
    ax_log.set_xlabel('Intensity')
    ax_log.set_ylabel('Density (log scale)')
    ax_log.set_yscale('log')
    ax_log.grid(True, alpha=0.3)

    fig.tight_layout()

    # Save before showing: with the inline backend plt.show() closes the
    # figure, so a later savefig() would write an empty image.
    hist_path = os.path.join(default_config.visualization.output_dir, 'fmri_intensity_histogram.png')
    fig.savefig(hist_path, dpi=EXPLORATION_CONFIG['figure_dpi'], bbox_inches='tight')
    print(f"Saved histogram to {hist_path}")

    plt.show()

In [None]:
# Data quality checks
if fmri_files:
    print("\n=== DATA QUALITY CHECKS ===")

    # Check for NaN values
    nan_count = np.isnan(data_3d).sum()
    print(f"Number of NaN values: {nan_count:,}")

    # Check for infinite values
    inf_count = np.isinf(data_3d).sum()
    print(f"Number of infinite values: {inf_count:,}")

    # Check for outliers (values beyond 3 standard deviations)
    mean_val = data_nonzero.mean()
    std_val = data_nonzero.std()
    outliers = data_nonzero[(data_nonzero < mean_val - 3*std_val) | (data_nonzero > mean_val + 3*std_val)]
    print(f"Number of outliers (±3σ): {len(outliers):,} ({len(outliers)/len(data_nonzero)*100:.2f}%)")

    # Check for potential artifacts
    if len(outliers) > 0:
        print(f"Outlier range: [{outliers.min():.4f}, {outliers.max():.4f}]")

    # Check data distribution quality
    if nan_count > 0 or inf_count > 0:
        print("WARNING: Data contains NaN or infinite values!")
    else:
        print("Data quality check passed: No NaN or infinite values found.")

    # Time series analysis if 4D data
    if time_series_data is not None:
        print(f"\n=== TIME SERIES ANALYSIS ===")
        print(f"Number of time points: {time_series_data.shape[3]}")

        # Calculate temporal signal-to-noise ratio (tSNR)
        temporal_mean = np.mean(time_series_data, axis=3)
        temporal_std = np.std(time_series_data, axis=3)
        tsnr = temporal_mean / (temporal_std + 1e-8)  # Add small epsilon to avoid division by zero

        # Calculate tSNR for brain voxels only
        brain_mask = temporal_mean > temporal_mean.mean()  # Simple threshold for brain mask
        brain_tsnr = tsnr[brain_mask]

        # The threshold mask can be empty (e.g. constant data); min()/max()
        # of an empty array would raise, so report that case explicitly.
        if brain_tsnr.size:
            print(f"Mean tSNR: {brain_tsnr.mean():.2f} ± {brain_tsnr.std():.2f}")
            print(f"tSNR range: [{brain_tsnr.min():.2f}, {brain_tsnr.max():.2f}]")
        else:
            print("No voxels above the mean-intensity threshold; tSNR summary skipped")

        # Plot tSNR map
        plt.figure(figsize=(10, 8))
        plt.imshow(tsnr[:, :, tsnr.shape[2]//2], cmap=EXPLORATION_CONFIG['colormap'], origin='lower')
        plt.title(f'tSNR Map (Slice {tsnr.shape[2]//2})')
        plt.colorbar(label='tSNR')
        plt.axis('off')

        # Save before showing: with the inline backend plt.show() closes the
        # figure, so a later savefig() would write an empty image.
        tsnr_path = os.path.join(default_config.visualization.output_dir, 'fmri_tsnr_map.png')
        plt.savefig(tsnr_path, dpi=EXPLORATION_CONFIG['figure_dpi'], bbox_inches='tight')
        print(f"Saved tSNR map to {tsnr_path}")

        plt.show()

### 4. MEG Data Exploration

In [None]:
# Discover MEG recordings (.fif) in the configured directory
print("\nScanning for MEG files...")
meg_files = list_files(EXPLORATION_CONFIG['meg_dir'], '.fif')

if meg_files:
    print(f"Found {len(meg_files)} MEG files")
    print(f"Example file: {meg_files[0]}")

    # Keep at most the configured number of subjects
    if EXPLORATION_CONFIG['max_subjects_to_load'] < len(meg_files):
        meg_files = meg_files[:EXPLORATION_CONFIG['max_subjects_to_load']]
        print(f"Limited to {len(meg_files)} files for memory management")
else:
    print("No MEG files found. Creating sample data for demonstration.")
    # Only the directory is created here; no synthetic recording is written
    os.makedirs(EXPLORATION_CONFIG['meg_dir'], exist_ok=True)
    print("MEG data exploration requires actual .fif files for detailed analysis.")

In [None]:
# Load and examine MEG data if available
if meg_files and MNE_AVAILABLE:
    try:
        from collections import Counter

        print(f"\nLoading MEG data from: {meg_files[0]}")

        # Load MEG data with progress tracking
        with tqdm(total=1, desc="Loading MEG data") as pbar:
            raw = mne.io.read_raw_fif(meg_files[0], preload=True)
            pbar.update(1)

        # Extract basic information
        meg_data = raw.get_data()
        print(f"MEG data shape: {meg_data.shape}")
        print(f"Sampling frequency: {raw.info['sfreq']} Hz")
        print(f"Duration: {raw.times[-1]:.2f} seconds")
        print(f"Number of channels: {len(raw.ch_names)}")

        # Count channels per type. mne.channel_type() takes the channel's
        # integer index, not its name; passing names made this whole cell
        # fall into the except branch.
        channel_types = dict(Counter(
            mne.channel_type(raw.info, idx) for idx in range(len(raw.ch_names))
        ))

        print("\nChannel types:")
        for ch_type, count in channel_types.items():
            print(f"  {ch_type}: {count}")

        # Calculate memory usage
        data_size_mb = meg_data.nbytes / 1e6
        print(f"Data size: {data_size_mb:.2f} MB")

        # Basic statistics
        print(f"\nMEG Signal Statistics:")
        print(f"Mean: {meg_data.mean():.4e} ± {meg_data.std():.4e}")
        print(f"Range: [{meg_data.min():.4e}, {meg_data.max():.4e}]")

        # Visualize MEG data
        plt.figure(figsize=(15, 10))

        # Plot first few channels
        n_channels_to_plot = min(10, len(raw.ch_names))
        time_points = raw.times[:1000]  # First 1000 time points

        for i in range(n_channels_to_plot):
            plt.subplot(n_channels_to_plot, 1, i+1)
            plt.plot(time_points, meg_data[i, :len(time_points)])
            plt.title(f"Channel: {raw.ch_names[i]}")
            plt.ylabel('Amplitude')
            if i == n_channels_to_plot - 1:
                plt.xlabel('Time (s)')

        plt.tight_layout()

        # Save before showing: with the inline backend plt.show() closes the
        # figure, so a later savefig() would write an empty image.
        meg_plot_path = os.path.join(default_config.visualization.output_dir, 'meg_signals.png')
        plt.savefig(meg_plot_path, dpi=EXPLORATION_CONFIG['figure_dpi'], bbox_inches='tight')
        print(f"Saved MEG signals plot to {meg_plot_path}")

        plt.show()

    except Exception as e:
        print(f"Error loading MEG data: {e}")
else:
    if not MNE_AVAILABLE:
        print("MNE package not available for MEG data exploration")
    else:
        print("No MEG files available for analysis")

## Group Analysis and Metadata

In [None]:
# Load and analyze metadata if available
# Imported under a private alias so tables still render if the notebook-level
# name `display` has been rebound by another cell.
from IPython.display import display as show_table

metadata_path = os.path.join(EXPLORATION_CONFIG['data_root'], 'metadata.csv')

if os.path.exists(metadata_path):
    print(f"Loading metadata from: {metadata_path}")
    metadata = pd.read_csv(metadata_path)
    print(f"Metadata shape: {metadata.shape}")

    print("\nColumns:")
    for col in metadata.columns:
        print(f"  {col}")

    print("\nFirst few rows:")
    show_table(metadata.head())

    # NOTE: every figure below is saved BEFORE plt.show(). With the inline
    # backend plt.show() closes the figure, so a later savefig() would write
    # an empty image.

    # Check class distribution
    if 'diagnosis' in metadata.columns:
        print("\nDiagnosis distribution:")
        diagnosis_counts = metadata['diagnosis'].value_counts()
        print(diagnosis_counts)

        # Plot diagnosis distribution
        plt.figure(figsize=(8, 6))
        diagnosis_counts.plot(kind='bar', color=['skyblue', 'lightcoral'])
        plt.title('Diagnosis Distribution')
        plt.xlabel('Diagnosis')
        plt.ylabel('Count')
        plt.xticks(rotation=45)
        plt.grid(True, alpha=0.3)
        plt.tight_layout()

        # Save diagnosis distribution plot
        diagnosis_path = os.path.join(default_config.visualization.output_dir, 'diagnosis_distribution.png')
        plt.savefig(diagnosis_path, dpi=EXPLORATION_CONFIG['figure_dpi'], bbox_inches='tight')
        print(f"Saved diagnosis distribution to {diagnosis_path}")
        plt.show()

    # Plot age distribution by diagnosis if available
    if 'age' in metadata.columns and 'diagnosis' in metadata.columns:
        plt.figure(figsize=(10, 6))
        sns.boxplot(x='diagnosis', y='age', data=metadata)
        plt.title('Age Distribution by Diagnosis')
        plt.xlabel('Diagnosis')
        plt.ylabel('Age')
        plt.grid(True, alpha=0.3)

        # Save age distribution plot
        age_path = os.path.join(default_config.visualization.output_dir, 'age_distribution.png')
        plt.savefig(age_path, dpi=EXPLORATION_CONFIG['figure_dpi'], bbox_inches='tight')
        print(f"Saved age distribution to {age_path}")
        plt.show()

    # Gender distribution if available
    if 'gender' in metadata.columns:
        plt.figure(figsize=(8, 6))
        gender_counts = metadata['gender'].value_counts()
        gender_counts.plot(kind='pie', autopct='%1.1f%%')
        plt.title('Gender Distribution')
        plt.ylabel('')

        # Save gender distribution plot
        gender_path = os.path.join(default_config.visualization.output_dir, 'gender_distribution.png')
        plt.savefig(gender_path, dpi=EXPLORATION_CONFIG['figure_dpi'], bbox_inches='tight')
        print(f"Saved gender distribution to {gender_path}")
        plt.show()

else:
    print("No metadata file found. Creating sample metadata for demonstration.")

    # Create sample metadata (randomly generated, not real subjects)
    n_subjects = max(len(fmri_files), len(meg_files), 20)  # At least 20 subjects
    sample_metadata = pd.DataFrame({
        'subject_id': [f"sub-{i:03d}" for i in range(n_subjects)],
        'age': np.random.randint(18, 65, n_subjects),
        'gender': np.random.choice(['M', 'F'], n_subjects),
        'diagnosis': np.random.choice(['control', 'schizophrenia'], n_subjects, p=[0.6, 0.4]),
        'site': np.random.choice(['site1', 'site2', 'site3'], n_subjects)
    })

    # Save sample metadata
    os.makedirs(EXPLORATION_CONFIG['data_root'], exist_ok=True)
    sample_metadata.to_csv(metadata_path, index=False)
    print(f"Created sample metadata with {n_subjects} subjects: {metadata_path}")

    show_table(sample_metadata.head())

## Interactive Visualizations

In [None]:
# Create interactive 3D visualization of fMRI data
# fmri_data is None when loading failed, so check it before plotting
if fmri_files and fmri_data is not None and EXPLORATION_CONFIG['create_interactive_plots']:
    try:
        print("Creating interactive 3D visualization...")

        # Handle 4D data
        if len(fmri_data.shape) == 4:
            data_3d = fmri_data[..., 0]
        else:
            data_3d = fmri_data

        # Sample the data for better performance. The step is derived from
        # the longest axis so that no dimension exceeds ~50 samples.
        sample_step = max(1, max(data_3d.shape) // 50)
        sampled_data = data_3d[::sample_step, ::sample_step, ::sample_step]

        # go.Volume expects one (x, y, z) coordinate per voxel: x, y, z and
        # value must be flat arrays of equal length. 'ij' indexing plus
        # C-order ravel keeps coordinates aligned with sampled_data.ravel().
        grid_x, grid_y, grid_z = np.meshgrid(
            np.arange(sampled_data.shape[0]),
            np.arange(sampled_data.shape[1]),
            np.arange(sampled_data.shape[2]),
            indexing='ij'
        )

        # Create 3D volume plot
        fig = go.Figure(data=go.Volume(
            x=grid_x.ravel(),
            y=grid_y.ravel(),
            z=grid_z.ravel(),
            value=sampled_data.ravel(),
            isomin=float(sampled_data.min()),
            isomax=float(sampled_data.max()),
            opacity=0.1,
            surface_count=10,
            colorscale='Viridis'
        ))

        fig.update_layout(
            title='Interactive 3D fMRI Volume',
            scene=dict(
                xaxis_title='X',
                yaxis_title='Y',
                zaxis_title='Z'
            ),
            width=800,
            height=600
        )

        fig.show()

        # Save interactive plot
        interactive_path = os.path.join(default_config.visualization.output_dir, 'interactive_fmri.html')
        fig.write_html(interactive_path)
        print(f"Saved interactive visualization to {interactive_path}")

    except Exception as e:
        print(f"Error creating interactive visualization: {e}")
        print("Interactive visualization may not work in all environments.")

In [None]:
# Create interactive dashboard for data exploration
if EXPLORATION_CONFIG['create_interactive_plots']:
    try:
        # Imported under a private alias so the dashboard still renders if
        # the notebook-level name `display` has been rebound by another cell.
        from IPython.display import display as show_dashboard

        print("Creating interactive dashboard...")

        # Create dashboard tabs
        tab = widgets.Tab()

        # Tab 1: Data Overview
        overview_tab = widgets.VBox([
            widgets.HTML("<h3>Data Overview</h3>"),
            widgets.HTML(f"<p><b>fMRI files:</b> {len(fmri_files)}</p>"),
            widgets.HTML(f"<p><b>MEG files:</b> {len(meg_files)}</p>"),
            widgets.HTML(f"<p><b>Data root:</b> {EXPLORATION_CONFIG['data_root']}</p>")
        ])

        # Tab 2: Configuration
        config_tab = widgets.VBox([
            widgets.HTML("<h3>Configuration</h3>"),
            widgets.HTML(f"<p><b>Max subjects:</b> {EXPLORATION_CONFIG['max_subjects_to_load']}</p>"),
            widgets.HTML(f"<p><b>Sample voxels:</b> {EXPLORATION_CONFIG['sample_voxels']}</p>"),
            widgets.HTML(f"<p><b>Memory mapping:</b> {EXPLORATION_CONFIG['use_memory_mapping']}</p>")
        ])

        # Tab 3: Memory Usage
        # Output from a button callback is not attached to any cell, so the
        # readings are captured in an Output widget shown under the button.
        memory_button = widgets.Button(description="Check Memory", button_style='info')
        memory_output = widgets.Output()
        memory_tab = widgets.VBox([
            widgets.HTML("<h3>Memory Usage</h3>"),
            memory_button,
            memory_output
        ])

        def on_memory_button_clicked(b):
            """Free memory, then show the current usage inside the tab."""
            with memory_output:
                memory_output.clear_output()
                clear_memory()
                check_memory_usage()

        # Register the handler on the named button before the tab is shown
        memory_button.on_click(on_memory_button_clicked)

        # Set up tabs
        tab.children = [overview_tab, config_tab, memory_tab]
        tab.set_title(0, 'Overview')
        tab.set_title(1, 'Configuration')
        tab.set_title(2, 'Memory')

        show_dashboard(tab)

    except Exception as e:
        print(f"Error creating dashboard: {e}")

## Preprocessing Pipeline Visualization

In [None]:
# Demonstrate preprocessing pipeline steps
if fmri_files:
    print("\n=== PREPROCESSING PIPELINE DEMONSTRATION ===")

    try:
        # Step 1: Original data
        print("Step 1: Original data")
        original_data = fmri_data
        if len(original_data.shape) == 4:
            original_data = original_data[..., 0]

        # Step 2: Normalization
        print("Step 2: Normalization")
        normalized_data = normalize_data(original_data, method='standard')

        # Step 3: Smoothing (if available)
        print("Step 3: Smoothing")
        try:
            smoothed_img = image.smooth_img(fmri_img, fwhm=6)
            smoothed_data = smoothed_img.get_fdata()
            if len(smoothed_data.shape) == 4:
                smoothed_data = smoothed_data[..., 0]
        except Exception as smoothing_error:
            # `except Exception` instead of a bare except, so that
            # KeyboardInterrupt is not swallowed; the reason is reported.
            print("Skipping smoothing step")
            print(f"  Reason: {smoothing_error}")
            smoothed_data = normalized_data

        # Step 4: Resampling (if needed)
        print("Step 4: Resampling to standard space")
        try:
            target_shape = (64, 64, 64)
            resampled_data = resize_data(normalized_data, target_shape)
            resampling_done = True
        except Exception as resampling_error:
            print("Skipping resampling step")
            print(f"  Reason: {resampling_error}")
            resampled_data = normalized_data
            # Explicit flag: resampled_data is always bound at this point,
            # so checking for the name could never detect a skipped step.
            resampling_done = False

        # Visualize preprocessing steps
        fig, axes = plt.subplots(2, 2, figsize=(15, 12))

        # Original data
        mid_slice = original_data.shape[2] // 2
        axes[0, 0].imshow(original_data[:, :, mid_slice], cmap='gray', origin='lower')
        axes[0, 0].set_title('Original Data')
        axes[0, 0].axis('off')

        # Normalized data
        axes[0, 1].imshow(normalized_data[:, :, mid_slice], cmap='gray', origin='lower')
        axes[0, 1].set_title('Normalized Data')
        axes[0, 1].axis('off')

        # Smoothed data
        axes[1, 0].imshow(smoothed_data[:, :, mid_slice], cmap='gray', origin='lower')
        axes[1, 0].set_title('Smoothed Data')
        axes[1, 0].axis('off')

        # Resampled data (or a placeholder when the step was skipped)
        if resampling_done:
            resampled_slice = resampled_data.shape[2] // 2
            axes[1, 1].imshow(resampled_data[:, :, resampled_slice], cmap='gray', origin='lower')
            axes[1, 1].set_title('Resampled Data')
        else:
            axes[1, 1].text(0.5, 0.5, 'Resampling\nSkipped',
                           ha='center', va='center', transform=axes[1, 1].transAxes)
            axes[1, 1].set_title('Resampled Data')
        axes[1, 1].axis('off')

        plt.tight_layout()

        # Save before showing: with the inline backend plt.show() closes the
        # figure, so a later savefig() would write an empty image.
        prep_path = os.path.join(default_config.visualization.output_dir, 'preprocessing_pipeline.png')
        plt.savefig(prep_path, dpi=EXPLORATION_CONFIG['figure_dpi'], bbox_inches='tight')
        print(f"Saved preprocessing pipeline to {prep_path}")

        plt.show()

    except Exception as e:
        print(f"Error in preprocessing demonstration: {e}")
else:
    print("No fMRI data available for preprocessing demonstration")

## Summary and Export

In [None]:
# Create a comprehensive summary report
#
# Prints a plain-text recap of the exploration: file counts, fMRI statistics,
# MEG channel information, subject metadata, the active configuration, and the
# visualization files found in the output directory. This cell only reads
# values; the names it reports on (fmri_data, data_nonzero, metadata, ...) are
# expected to have been defined by earlier cells.
print("\n" + "=" * 60)
print("DATA EXPLORATION SUMMARY REPORT")
print("=" * 60)

print("\n📁 Data Overview:")
print(f"  - fMRI files found: {len(fmri_files)}")
print(f"  - MEG files found: {len(meg_files)}")
print(f"  - Data root directory: {EXPLORATION_CONFIG['data_root']}")

if fmri_files:
    print("\n🧠 fMRI Data Analysis:")
    # Having fMRI files on disk does not guarantee the statistics exist: the
    # cell that computes them may have been skipped or may have failed. Guard
    # the same way the MEG and metadata sections below do, instead of letting
    # a NameError abort the whole report.
    fmri_stat_names = ('fmri_data', 'data_nonzero', 'data_flat', 'nan_count', 'inf_count')
    # globals() is used here because locals() evaluated inside a comprehension
    # may refer to the comprehension's own scope rather than the cell's.
    missing_stats = [name for name in fmri_stat_names if name not in globals()]
    if missing_stats:
        print(f"  - Statistics unavailable (not computed: {', '.join(missing_stats)})")
    else:
        total_voxels = len(data_flat)
        nonzero_voxels = len(data_nonzero)
        # Avoid ZeroDivisionError when the flattened volume is empty.
        nonzero_pct = nonzero_voxels / total_voxels * 100 if total_voxels else 0.0
        print(f"  - Data shape: {fmri_data.shape}")
        print(f"  - Data type: {fmri_data.dtype}")
        print(f"  - Non-zero voxels: {nonzero_voxels:,} ({nonzero_pct:.2f}%)")
        print(f"  - Mean intensity: {data_nonzero.mean():.4f} ± {data_nonzero.std():.4f}")
        print(f"  - Data quality: {'✓ PASS' if nan_count == 0 and inf_count == 0 else '✗ FAIL'}")

if meg_files and MNE_AVAILABLE:
    print("\n🔊 MEG Data Analysis:")
    print(f"  - Files available: {len(meg_files)}")
    print(f"  - Channel types: {list(channel_types.keys()) if 'channel_types' in locals() else 'N/A'}")

if 'metadata' in locals():
    print("\n👥 Group Analysis:")
    print(f"  - Total subjects: {len(metadata)}")
    if 'diagnosis' in metadata.columns:
        print(f"  - Diagnosis distribution: {dict(metadata['diagnosis'].value_counts())}")
    if 'age' in metadata.columns:
        print(f"  - Age range: {metadata['age'].min()} - {metadata['age'].max()} years")

print("\n🔧 Configuration:")
print(f"  - Max subjects loaded: {EXPLORATION_CONFIG['max_subjects_to_load']}")
print(f"  - Memory mapping enabled: {EXPLORATION_CONFIG['use_memory_mapping']}")
print(f"  - Interactive plots: {EXPLORATION_CONFIG['create_interactive_plots']}")

print("\n📊 Generated Visualizations:")
output_dir = default_config.visualization.output_dir
# Always bind viz_files so later cells see an (empty) list rather than an
# undefined or stale name when the output directory is missing.
viz_files = []
if os.path.isdir(output_dir):
    # Sorted so the listing is stable between runs (os.listdir order is arbitrary).
    viz_files = sorted(f for f in os.listdir(output_dir) if f.endswith(('.png', '.html')))
    for viz_file in viz_files:
        print(f"  - {viz_file}")

print(f"\n💾 Output Directory: {output_dir}")
print("\n🚀 Ready for model training!")
print("=" * 60)

In [None]:
# Export exploration results to Google Drive
#
# When running in Colab, writes a JSON summary of the exploration into the
# project directory and copies the visualization output directory next to it.
# Both steps are best-effort: a failure is reported with a message and does
# not raise. Outside Colab only a completion message is printed.
if IN_COLAB:
    import shutil

    # Step 1: build and save the JSON summary.
    try:
        exploration_summary = {
            'timestamp': pd.Timestamp.now().isoformat(),
            'data_summary': {
                'fmri_files_count': len(fmri_files),
                'meg_files_count': len(meg_files),
                'data_root': EXPLORATION_CONFIG['data_root']
            },
            'configuration': EXPLORATION_CONFIG,
            'visualizations_generated': viz_files if 'viz_files' in locals() else []
        }

        if fmri_files:
            # The fMRI statistics come from earlier cells that may not have
            # run successfully; leave the section out rather than aborting
            # the whole export with a NameError.
            fmri_stat_names = ('fmri_data', 'data_nonzero', 'nan_count', 'inf_count')
            # globals() is used because locals() inside a comprehension may
            # refer to the comprehension's own scope.
            missing_stats = [name for name in fmri_stat_names if name not in globals()]
            if missing_stats:
                print(f"⚠️ fMRI statistics omitted from summary (not computed: {', '.join(missing_stats)})")
            else:
                exploration_summary['fmri_stats'] = {
                    'shape': list(fmri_data.shape),
                    'dtype': str(fmri_data.dtype),
                    'nonzero_voxels': int(len(data_nonzero)),
                    'mean_intensity': float(data_nonzero.mean()),
                    'std_intensity': float(data_nonzero.std()),
                    'data_quality': 'PASS' if nan_count == 0 and inf_count == 0 else 'FAIL'
                }

        # Serialize before opening the file so a non-serializable value in
        # the summary cannot leave a truncated JSON file on Google Drive.
        summary_json = json.dumps(exploration_summary, indent=2)

        # Save summary to Google Drive
        summary_path = os.path.join(project_path, 'exploration_summary.json')
        with open(summary_path, 'w', encoding='utf-8') as f:
            f.write(summary_json)

        print(f"\n📋 Exploration summary saved to Google Drive: {summary_path}")

    except Exception as e:
        print(f"Error exporting to Google Drive: {e}")

    # Step 2: copy visualizations to Google Drive. Kept in its own try block
    # so the figures are still exported when the summary step fails.
    try:
        # Resolved here instead of relying on the summary cell having run.
        output_dir = default_config.visualization.output_dir
        drive_viz_dir = os.path.join(project_path, 'exploration_visualizations')
        if os.path.exists(output_dir):
            shutil.copytree(output_dir, drive_viz_dir, dirs_exist_ok=True)
            print(f"📊 Visualizations copied to Google Drive: {drive_viz_dir}")

    except Exception as e:
        print(f"Error exporting to Google Drive: {e}")
else:
    print("\n📋 Exploration complete. Results saved locally.")

In [None]:
# Final housekeeping: release memory, report current usage, then print the
# closing message and the suggested follow-up notebooks.
print("\n🧹 Cleaning up memory...")
clear_memory()
check_memory_usage()

print("\n✅ Data exploration notebook completed successfully!")
print("\nNext steps:")
# One print call for the numbered list; output is identical to printing each
# entry on its own line.
print("\n".join([
    "1. Run the model_training.ipynb notebook to train the SSPNet model",
    "2. Use the results_analysis.ipynb notebook to evaluate and visualize results",
    "3. Check the generated visualizations in the output directory",
]))