# Debugging Stan Sampling Issues

This notebook helps diagnose and fix common Stan sampling errors in pydmc.

In [None]:
import numpy as np
import pandas as pd
import sys

# Check Stan installation
print("Python version:", sys.version)
print("\nChecking Stan backends...\n")

# Try CmdStanPy. The import and the version/path queries share one try block,
# so a failure while querying the installation is reported as an error line
# instead of aborting the cell.
try:
    import cmdstanpy
    print("✓ CmdStanPy installed")
    print(f"  Version: {cmdstanpy.__version__}")
    print(f"  CmdStan path: {cmdstanpy.cmdstan_path()}")
except ImportError:
    print("✗ CmdStanPy not installed")
except Exception as e:
    print(f"✗ CmdStanPy error: {e}")

# Try PyStan. PyStan 3.x is imported as `stan`; only the legacy 2.x releases
# use the module name `pystan`. Probe both before declaring it missing,
# otherwise an installed PyStan 3 is wrongly reported as "not installed".
try:
    try:
        import stan as pystan_module
    except ImportError:
        # An ImportError raised here propagates to the outer handler below.
        import pystan as pystan_module
    print("\n✓ PyStan installed")
    # Fall back to 'unknown' rather than failing if __version__ is absent.
    print(f"  Version: {getattr(pystan_module, '__version__', 'unknown')}")
except ImportError:
    print("\n✗ PyStan not installed")
except Exception as e:
    print(f"\n✗ PyStan error: {e}")

In [None]:
# Test data creation
def create_minimal_test_data():
    """Build a small, reproducible go-trial dataset for a single subject.

    Re-seeds NumPy's global random generator with 42 and then draws 50
    trials. Every trial is answered correctly (``response == stimulus + 1``)
    and carries no stop-signal delay (``ssd`` is NaN).

    Returns:
        pd.DataFrame: 50 rows with the columns ``subject``, ``stimulus``,
        ``response``, ``rt`` and ``ssd``.
    """
    np.random.seed(42)

    def _draw_trial():
        # Draw order matters for reproducibility: stimulus first, then RT.
        stim = np.random.choice([0, 1])
        latency = np.random.uniform(0.4, 0.8)
        return {
            'subject': 'S01',
            'stimulus': stim,
            'response': stim + 1,  # Always correct
            'rt': latency,
            'ssd': np.nan,  # All go trials
        }

    # One subject, 50 trials
    return pd.DataFrame([_draw_trial() for _ in range(50)])

# Build the synthetic dataset once; later cells reuse `test_data`.
test_data = create_minimal_test_data()

# Compute the preview pieces up front, then report them.
preview = test_data.head()
subject_ids = test_data['subject'].unique()

print("Test data created:")
print(preview)
print(f"\nShape: {test_data.shape}")
print(f"Subjects: {subject_ids}")

In [None]:
# Test data preparation
from pydmc import WaldStopSignalModel

import traceback

print("Testing data preparation...\n")

# The non-hierarchical (individual) variant is the simpler one to debug.
model = WaldStopSignalModel(use_hierarchical=False)
backend_label = model.backend.backend_name
print(f"Model created with backend: {backend_label}")

try:
    stan_data = model.prepare_data(test_data)
    print("\n✓ Data preparation successful")
    print("\nStan data structure:")
    for field, payload in stan_data.items():
        # Arrays are summarised by shape/dtype instead of being dumped in full.
        described = (
            f"array of shape {payload.shape}, dtype {payload.dtype}"
            if isinstance(payload, np.ndarray)
            else f"{payload}"
        )
        print(f"  {field}: {described}")
except Exception as exc:
    # Report the failure but keep the notebook running; the traceback shows where it broke.
    print(f"\n✗ Data preparation failed: {exc}")
    traceback.print_exc()

In [None]:
# Test model compilation
import traceback

print("Testing model compilation...\n")

try:
    compile_fn = model.backend.compile_model
    compiled = compile_fn(model.model_code)
    print("✓ Model compiled successfully")
    # Store the result on the model object.
    # NOTE(review): presumably this lets model.fit() reuse it — confirm against pydmc.
    model.compiled_model = compiled
except Exception as exc:
    # Report the failure with a full traceback, but do not abort the notebook.
    print(f"✗ Compilation failed: {exc}")
    traceback.print_exc()

In [None]:
# Test sampling with very short run
import traceback

print("Testing sampling (short run)...\n")

# Deliberately tiny run (one chain, iter=100, warmup=50) so failures surface quickly.
quick_run_options = dict(
    chains=1,
    iter=100,
    warmup=50,
    cores=1,
    show_progress=True,
)

try:
    fit = model.fit(test_data, **quick_run_options)
    print("\n✓ Sampling successful!")
    print("\nModel summary:")
    # NOTE(review): the return value of summary() is discarded here; if it
    # returns a table instead of printing one, nothing is displayed — confirm.
    model.summary()
except Exception as exc:
    # Report the failure with a full traceback, but do not abort the notebook.
    print(f"\n✗ Sampling failed: {exc}")
    traceback.print_exc()

## Common Issues and Solutions

### 1. Backend Not Installed
```bash
# Install CmdStanPy (recommended)
pip install cmdstanpy
python -m cmdstanpy.install_cmdstan

# OR install PyStan (note: PyStan 3.x is imported as `stan`, not `pystan`)
pip install pystan
```

### 2. Data Format Issues
- Ensure RT values are positive and in seconds (not milliseconds)
- Response codes: 0=no response, 1=left, 2=right
- Stimulus codes: 0=left, 1=right
- SSD should be NaN for go trials, numeric for stop trials

### 3. Insufficient Data
- Need at least ~50 trials per subject for individual model
- Need at least ~100 trials per subject for hierarchical model
- Should have both correct and incorrect responses

### 4. Numerical Issues
- Very short or very long RTs can cause problems
- Filter RTs to a reasonable range (e.g., 0.15-5.0 seconds)

### 5. Model Convergence
- Increase iterations and warmup
- Check for divergent transitions
- Consider adjusting priors

In [None]:
# Validate your actual data
def validate_data(df, min_rt=0.1, max_rt=5.0, min_trials=50):
    """Validate stop-signal task data and print a human-readable report.

    Each check is skipped when the column it needs is absent, so a missing
    column is reported as an issue instead of raising ``KeyError``.

    Args:
        df (pd.DataFrame): Trial-level data. Expected columns are
            ``subject``, ``stimulus``, ``response`` and ``rt``.
        min_rt (float): RTs below this value (seconds) are flagged.
        max_rt (float): RTs above this value (seconds) are flagged.
        min_trials (int): Minimum number of trials required per subject.

    Returns:
        bool: True when no issues were found, False otherwise.
    """
    issues = []

    # An empty table would otherwise pass every check below vacuously.
    if len(df) == 0:
        issues.append("Dataset contains no trials")

    # Check required columns
    required = ['subject', 'stimulus', 'response', 'rt']
    missing = [col for col in required if col not in df.columns]
    if missing:
        issues.append(f"Missing columns: {missing}")

    # Check RT range (NaN RTs are ignored)
    if 'rt' in df.columns:
        valid_rt = df['rt'].dropna()
        if len(valid_rt) > 0:
            if valid_rt.min() < min_rt:
                issues.append(f"Very short RTs detected (min: {valid_rt.min():.3f}s)")
            if valid_rt.max() > max_rt:
                issues.append(f"Very long RTs detected (max: {valid_rt.max():.3f}s)")

    # Check response codes
    if 'response' in df.columns:
        unique_responses = df['response'].unique()
        if not all(r in [0, 1, 2] for r in unique_responses):
            issues.append(f"Invalid response codes: {unique_responses}")

    # Check stimulus codes
    if 'stimulus' in df.columns:
        unique_stimuli = df['stimulus'].unique()
        if not all(s in [0, 1] for s in unique_stimuli):
            issues.append(f"Invalid stimulus codes: {unique_stimuli}")

    # Check data size
    if 'subject' in df.columns:
        trials_per_subject = df.groupby('subject').size()
        if len(trials_per_subject) > 0 and trials_per_subject.min() < min_trials:
            issues.append(
                f"Some subjects have < {min_trials} trials (min: {trials_per_subject.min()})"
            )

    # Report
    if issues:
        print("⚠ Data validation issues:")
        for issue in issues:
            print(f"  - {issue}")
    else:
        print("✓ Data validation passed")

    return not issues

# Test with our test data: validate_data prints its report, and because the
# call is the cell's last expression the returned boolean is shown as output.
validate_data(test_data)

In [None]:
# Use this cell to validate YOUR data
# Uncomment and modify:

# your_data = pd.read_csv('your_data.csv')
# validate_data(your_data)