# Data Preprocessing

Main output: `merged_data` (also exported to `merged_data.pkl`)

In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
from typing import Dict, Any, List, Tuple
from collections import defaultdict

In [None]:
# Sheet-name tags checked in this order; the first tag found in the name wins.
_SHEET_CONDITIONS = (
    ('AUD', 'Auditory'),
    ('VISampm', 'Visual_anteromedialposteromedial'),
    ('VISarl', 'Visual_angular_region'),
    ('rs2', 'Rest'),
)


def _hemisphere_labels(header_cells):
    """Forward-fill the hemisphere header row into one label per column.

    Merged header cells are read back as one value followed by NaNs, so the
    most recent non-NaN cell applies to every following column. Cells that
    mention "Right hemisphere ROIs" / "Left hemisphere ROIs" become "R" / "L";
    any other non-NaN cell is kept as-is. Columns that precede the first
    label get ``None``.
    """
    labels = []
    current = None
    for cell in header_cells:
        if pd.notna(cell):
            # str() so that a numeric header cell cannot raise TypeError on `in`
            text = str(cell)
            if "Right hemisphere ROIs" in text:
                current = "R"
            elif "Left hemisphere ROIs" in text:
                current = "L"
            else:
                current = cell  # Fallback: keep the raw header value
        labels.append(current)
    return labels


def _condition_from_sheet_name(sheet_name):
    """Map a sheet name to its condition label ('Unknown' if no tag matches)."""
    for tag, condition in _SHEET_CONDITIONS:
        if tag in sheet_name:
            return condition
    return 'Unknown'


def load_excel_sheets(excel_path) -> Dict[str, Dict[str, Any]]:
    """Load every sheet of a time-course workbook with a two-row header.

    Layout read from each sheet:
      - row 0, columns 1+: hemisphere labels (merged cells, forward-filled)
      - row 1, columns 1+: ROI names
      - rows 2+: column 0 is the time axis, columns 1+ are the ROI signals

    Args:
        excel_path: Path of the workbook (anything ``pd.ExcelFile`` accepts).

    Returns:
        ``{sheet_name: {...}}`` where each value holds:
          - 'condition': label derived from the sheet name
          - 'time': values of the first column
          - 'signals': array transposed to (ROIs × time)
          - 'roi_names': ROI names with quotes removed, suffixed with
            ' (L)' / ' (R)' when the hemisphere is known
          - 'brain_regions': per-column hemisphere label
          - 'base_roi_names': ROI names exactly as stored in the sheet
    """
    sheets_dict = {}

    # The context manager guarantees the workbook handle is released
    with pd.ExcelFile(excel_path) as xls:
        for sheet_name in xls.sheet_names:
            # Read first two rows for header processing
            header_df = pd.read_excel(xls, sheet_name=sheet_name, nrows=2, header=None)

            brain_regions = _hemisphere_labels(header_df.iloc[0, 1:].values)
            roi_names = header_df.iloc[1, 1:].values

            # Format ROI names as 'ROI (L/R)'; other columns keep the bare name
            full_roi_names = []
            for roi_cell, region in zip(roi_names, brain_regions):
                roi = str(roi_cell).replace("'", "")
                if region in ("L", "R"):
                    full_roi_names.append(f"{roi} ({region})")
                else:
                    full_roi_names.append(roi)

            # Read data (skip header rows)
            df = pd.read_excel(xls, sheet_name=sheet_name, header=None, skiprows=2)

            sheets_dict[sheet_name] = {
                'condition': _condition_from_sheet_name(sheet_name),
                'time': df.iloc[:, 0].values,
                'signals': df.iloc[:, 1:].values.T,  # Transpose to (ROIs × time)
                'roi_names': full_roi_names,
                'brain_regions': brain_regions,
                'base_roi_names': roi_names.tolist()
            }

    return sheets_dict

In [None]:
# Locate the time-course workbook inside the shared data directory
data_root = Path("../data/DMD_fMRI_data_sharing")
timecourse_excel_path = data_root / "Time course data (ketxyl).xlsx"

# Load and report only when the workbook is present; otherwise just say so
if timecourse_excel_path.exists():
    print(f"Loading Excel file: {timecourse_excel_path}")
    sheets_data = load_excel_sheets(timecourse_excel_path)

    # Overall summary
    print("\nLoaded sheets summary:")
    print(f"Total sheets: {len(sheets_data)}")

    # Tally sheets per condition (insertion order = first appearance)
    condition_counts = {}
    for loaded_sheet in sheets_data.values():
        label = loaded_sheet['condition']
        condition_counts.setdefault(label, 0)
        condition_counts[label] += 1

    print("\nSheets by condition:")
    for label, n_sheets in condition_counts.items():
        print(f"{label}: {n_sheets} sheet(s)")

    # Per-sheet details
    for name, loaded_sheet in sheets_data.items():
        n_time_points = len(loaded_sheet['time'])
        signal_shape = loaded_sheet['signals'].shape
        leading_rois = loaded_sheet['roi_names'][:5]

        print(f"\n\nSheet: {name}")
        print(f"Condition: {loaded_sheet['condition']}")
        print(f"Time points: {n_time_points}")
        print(f"Signal shape: {signal_shape} (ROIs × Time)")
        print(f"First 5 ROI names: {leading_rois}")
else:
    print(f"Error: Excel file not found at {timecourse_excel_path}")

In [None]:
def _parse_stim_times(cell_value) -> List[float]:
    """Parse a comma-separated cell into its non-negative onset times.

    Tokens that are not valid floats, and negative or NaN values, are dropped.
    """
    onsets = []
    for token in str(cell_value).split(','):
        try:
            onset = float(token.strip())
        except ValueError:
            continue
        if onset >= 0:  # Only accept non-negative times (NaN fails this too)
            onsets.append(onset)
    return onsets


def load_stimulation_paradigm(
    file_path: str,
    stim_block_duration: float = 60.0
) -> Tuple[Dict[str, Dict[str, Any]], List[str]]:
    """
    Load stimulation paradigm from Excel and organize by filename.

    Parameters:
    -----------
    file_path : str
        Path (string or path-like) to Excel file with structure:
        - First 2 rows: headers
        - Column A: Filename
        - Row 2, columns E onward: ROI names (blank cells are ignored)
        - Rows 3+: stimulation times, comma-separated within a cell
    stim_block_duration : float
        Length in seconds of the timeline block that starts at every
        stimulation onset. The default of 60 corresponds to
        10s stimulation + 50s recovery.

    Returns:
    --------
    (paradigm_data, stim_rois)
        paradigm_data : Dict[str, Dict[str, Any]] where each key is a
        filename and value contains:
            {
                'roi_stim_times': Dict[str, List[float]],  # ROI to stimulation times
                'timeline': List[Tuple[float, float, str]]  # Sorted (start, end, state)
            }
        The timeline starts with a "Rest" block from 0 to the first onset.
        A run without any onset gets the single placeholder block
        (0.0, 1.0, "Rest").
        stim_rois : List[str] with the ROI names from the header, in column
        order.

    Raises:
    -------
    ValueError
        If stim_block_duration is not positive.
    """
    if stim_block_duration <= 0:
        raise ValueError(
            f"stim_block_duration must be positive, got {stim_block_duration}"
        )

    # Open the workbook once and release the handle afterwards
    with pd.ExcelFile(file_path) as workbook:
        # Header rows (ROI names are in the second one, columns E onward)
        header_df = pd.read_excel(workbook, header=None, nrows=2)
        # Data starting from row 3 (0-based index 2)
        data_df = pd.read_excel(workbook, header=None, skiprows=2)

    # Keep each ROI name paired with the column it really sits in, so a blank
    # header cell cannot shift the following ROIs onto the wrong column.
    roi_header = header_df.iloc[1, 4:].dropna().astype(str)
    roi_columns = list(roi_header.items())  # [(column index, ROI name), ...]
    stim_rois = roi_header.tolist()

    paradigm_data = {}

    # Process each row (each experimental run)
    for _, row in data_df.iterrows():
        if pd.isna(row[0]):
            continue
        filename = str(row[0])
        if not filename:
            continue

        roi_stim_times = {roi: [] for roi in stim_rois}
        events = []  # (onset, roi) for every stimulation in this run

        for col_idx, roi in roi_columns:
            if col_idx not in row.index:
                continue

            cell_value = row[col_idx]
            if pd.isna(cell_value):
                continue

            onsets = _parse_stim_times(cell_value)
            roi_stim_times[roi] = onsets
            events.extend((onset, roi) for onset in onsets)

        # Chronological order; the sort is stable, so ties keep column order
        events.sort(key=lambda event: event[0])

        if events:
            # Initial rest period (0 to first event), then one block per onset
            timeline = [(0.0, events[0][0], "Rest")]
            timeline.extend(
                (onset, onset + stim_block_duration, roi) for onset, roi in events
            )
        else:
            # No events: the run is represented by a placeholder rest block
            timeline = [(0.0, 1.0, "Rest")]

        paradigm_data[filename] = {
            'roi_stim_times': roi_stim_times,
            'timeline': timeline
        }

    return paradigm_data, stim_rois

In [None]:
# Load the stimulation paradigm that sits next to the time-course workbook
stimpara_excel_path = data_root / "Stimulation_paradigm.xlsx"
stim_data, stim_rois = load_stimulation_paradigm(stimpara_excel_path)

# Horizontal rule reused by every section banner below
section_rule = "=" * 50

# --- Section 1: ROI names found in the paradigm header ---
print(f"\n{section_rule}")
print("ROI Names from Stimulation Paradigm")
print(section_rule)
print(f"\nTotal ROIs: {len(stim_rois)}")
print(f"ROI names: {stim_rois}")

# --- Section 2: structure of the per-file entries ---
print(f"\n{section_rule}")
print("Stimulation Data Structure Analysis")
print(section_rule)

if stim_data:
    # First and last keys in insertion order
    entry_keys = list(stim_data)
    first_key = entry_keys[0]
    last_key = entry_keys[-1]

    print(f"\nTotal entries: {len(stim_data)}")
    print(f"First entry key: '{first_key}'")
    print(f"Last entry key: '{last_key}'")

    # Look closer at the first entry only
    print("\nFirst entry details:")
    first_entry = stim_data[first_key]
    print(f"Entry type: {type(first_entry)}")

    if not isinstance(first_entry, dict):
        print(f"Unexpected entry type. Content: {first_entry}")
    else:
        print("Available keys:", first_entry.keys())

        # Show the first few timeline blocks
        if 'timeline' in first_entry:
            print("\nTimeline sample (first 3 events):")
            for block_start, block_end, block_state in first_entry['timeline'][:3]:
                print(f"  {block_start:.1f}s - {block_end:.1f}s: {block_state}")

        # Show stimulation onsets for the first few ROIs
        if 'roi_stim_times' in first_entry:
            print("\nROI Stimulation Times:")
            roi_items = list(first_entry['roi_stim_times'].items())
            for roi_label, onset_values in roi_items[:5]:  # First 5 ROIs only
                if isinstance(onset_values, (list, np.ndarray)):
                    times_sample = onset_values[:3]
                else:
                    times_sample = [onset_values]
                n_events = len(onset_values) if hasattr(onset_values, '__len__') else 1
                print(f"  {roi_label}: {times_sample}... (total {n_events} events)")

            # Header ROIs that have no key in the first entry
            unmatched_rois = set(stim_rois).difference(first_entry['roi_stim_times'])
            if unmatched_rois:
                print(f"\nWarning: {len(unmatched_rois)} ROIs in roi_names not found in stim_data:")
                print(list(unmatched_rois)[:5], "...")
else:
    print("Warning: stim_data is empty!")

# --- Section 3: do all entries agree with the header ROI list? ---
print(f"\n{section_rule}")
print("Data Consistency Check")
print(section_rule)

if stim_data and stim_rois:
    # Union of the ROI keys over every well-formed entry
    all_rois_in_stim = {
        roi_label
        for paradigm_entry in stim_data.values()
        if isinstance(paradigm_entry, dict) and 'roi_stim_times' in paradigm_entry
        for roi_label in paradigm_entry['roi_stim_times']
    }

    print(f"\nROIs in stim_data: {len(all_rois_in_stim)}")
    print(f"ROIs in roi_names: {len(stim_rois)}")

    header_rois = set(stim_rois)
    missing_in_stim = header_rois.difference(all_rois_in_stim)
    extra_in_stim = all_rois_in_stim.difference(header_rois)

    if missing_in_stim:
        print(f"\nWarning: {len(missing_in_stim)} ROIs in roi_names missing from stim_data:")
        print(list(missing_in_stim)[:5], "...")

    if extra_in_stim:
        print(f"\nWarning: {len(extra_in_stim)} ROIs in stim_data not in roi_names:")
        print(list(extra_in_stim)[:5], "...")

In [None]:
def line_up_and_merge_stimulation(
    sheets_data: Dict[str, Dict[str, Any]],
    stim_data: Dict[str, Dict[str, Any]],
    sampling_rate: float = 1.0
) -> Dict[str, Any]:
    """
    Align stimulation paradigm timeline with time series data and merge by state

    Every (start, end, state) block of a file's timeline is converted to sample
    indices (``int(time * sampling_rate)``), cut out of that file's signals and
    collected under its state. Files listed in ``stim_data`` but absent from
    ``sheets_data`` are skipped. Blocks that start beyond the recording, or
    that cover no sample at all, are skipped; blocks that run past the end of
    the recording are clipped.

    Args:
        sheets_data: Time series data
            {sheet_name: {'signals': np.ndarray (ROIs, time_points), 'roi_names': [...], ...}}
        stim_data: Stimulation paradigm data {filename: {'timeline': [(start, end, state)], ...}}
        sampling_rate: Sampling rate in Hz

    Returns:
        {
            'roi_names': List[str],  # Taken from the first sheet; [] if there is none
            'Rest': {
                'data': np.ndarray,  # shape: (ROIs, time_points, trials)
                'sources': List[str]
            },
            '<state>': {
                'data': np.ndarray,  # shape: (ROIs, time_points, trials)
                'stim_roi': str,     # Stimulated ROI name (same as the state key)
                'sources': List[str]
            },
            ...
        }

    Raises:
        ValueError: If ``sampling_rate`` is not positive, or if the segments
            collected for one state do not all have the same shape (e.g. a
            block was clipped at the end of a recording) and therefore cannot
            be stacked into one array.
    """
    if sampling_rate <= 0:
        raise ValueError(f"sampling_rate must be positive, got {sampling_rate}")

    # ROI names are shared across conditions; tolerate an empty input
    first_sheet = next(iter(sheets_data.values()), None)
    roi_names = first_sheet['roi_names'] if first_sheet is not None else []

    merged_data = defaultdict(lambda: {'data': [], 'sources': []})

    # Process each experimental file
    for filename, file_stim_data in stim_data.items():
        if filename not in sheets_data:
            continue

        timecourse_data = sheets_data[filename]['signals']  # (ROIs, time_points)
        n_samples = timecourse_data.shape[1]

        for start_time, end_time, state in file_stim_data['timeline']:
            start_idx = int(start_time * sampling_rate)
            end_idx = min(int(end_time * sampling_rate), n_samples)

            # Nothing to extract: block starts after the recording ended, or
            # it is shorter than one sample at this sampling rate.
            if start_idx >= n_samples or end_idx <= start_idx:
                continue

            merged_data[state]['data'].append(timecourse_data[:, start_idx:end_idx])
            # NOTE(review): only the third character of the filename is stored
            # (presumably a run/subject identifier — confirm). This raises
            # IndexError for filenames shorter than three characters.
            merged_data[state]['sources'].append(filename[2])

    # Convert to unified structure with shared roi_names
    final_data = {'roi_names': roi_names}
    for state, state_data in merged_data.items():
        segments = state_data['data']
        if not segments:
            continue

        # np.stack needs identical shapes; fail with a message naming the state
        shapes = {segment.shape for segment in segments}
        if len(shapes) > 1:
            raise ValueError(
                f"Cannot stack segments for state {state!r}: "
                f"segment shapes differ {sorted(shapes)}"
            )

        state_entry = {
            'data': np.stack(segments, axis=-1),
            'sources': state_data['sources']
        }
        if state != "Rest":
            state_entry['stim_roi'] = state

        final_data[state] = state_entry

    return final_data

In [None]:
# Cut the time courses into per-state segments (0.5 Hz sampling) and merge them
merged_data = line_up_and_merge_stimulation(
    sheets_data=sheets_data,
    stim_data=stim_data,
    sampling_rate=0.5,
)

# Report what ended up under each state key
print("\nMerged Data Structure:")
for state_key, state_entry in merged_data.items():
    # 'roi_names' is the shared ROI list, not a condition entry
    if state_key == 'roi_names':
        roi_names = state_entry
        continue

    segment_shape = state_entry['data'].shape
    print(f"\nCondition: {state_key}")
    print(f"Data shape: {segment_shape} (ROIs×time×trials)")
    print(f"Source: {state_entry['sources']}")

In [None]:
# Persist merged_data next to the notebook so later steps can unpickle it
import pickle

output_path = Path('merged_data.pkl')
with output_path.open('wb') as pickle_file:
    pickle.dump(merged_data, pickle_file)