# Resting-State EEG Preprocessing Tutorial with MNE-Python
This tutorial walks through a complete pipeline for preprocessing resting-state EEG data using MNE-Python and complementary libraries (PyPREP, MEEGkit, and MNE-ICALabel). Based on practices from the CleanEEG application, it covers downsampling and filtering, line-noise removal with DSS, automated bad-channel detection, ICA-based artifact removal with ICLabel, and channel interpolation.

## Table of Contents

1. Installation and Setup
2. Loading Sample Data (Optional)
3. Loading EEG Data with MNE-Python
4. Setting Up Channel Locations
5. Initial Data Inspection
6. Basic Preprocessing
7. Automatic Bad Channel Detection
8. Artifact Removal with ICA
9. Channel Interpolation
10. Saving the Processed Data

## Installation and Setup

In [None]:
# Install required packages for EEG processing and visualization
!pip install mne             # Core package for EEG/MEG data analysis
!pip install pyprep          # For automatic bad channel detection
!pip install meegkit         # For advanced denoising methods
!pip install mne-icalabel    # For automatic classification of ICA components
!pip install matplotlib      # For visualization
!pip install numpy           # For numerical operations

## Loading Sample Data (Optional)
If you don't have your own resting-state EEG data, you can download a sample dataset:

In [None]:
# Fetch an example recording so the rest of the tutorial can be run end to end
from pathlib import Path
from get_sample_data import download_sample_data

# Where the sample comes from: the raw (BIDS-organised) EEG recordings of the
# MPI-Leipzig LEMON dataset, and where to put it locally.
lemon_source = dict(
    ftp_host='ftp.gwdg.de',
    ftp_base='/pub/misc/MPI-Leipzig_Mind-Brain-Body-LEMON/EEG_MPILMBB_LEMON/EEG_Raw_BIDS_ID',
    local_base=Path('sample_data'),
    num_subjects=1,  # A single subject is enough for this tutorial
)

download_sample_data(**lemon_source)

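## Loading EEG Data with MNE-Python
MNE-Python can read many EEG file formats (.vhdr, .edf, .bdf, .set, etc.). The cell below picks a subject folder, finds the first file with a supported extension, and loads it.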

In [None]:
import mne
from pathlib import Path

# List of valid EEG file extensions that MNE can read.
# The order matters: files are searched extension by extension, so a file with
# an earlier extension is preferred over one with a later extension.
valid_eeg_formats = [".vhdr", ".edf", ".bdf", ".gdf", ".cnt", ".egi",
                    ".mff", ".set", ".fif", ".data", ".nxe", ".lay"]

# Define the base directory where your data is located
base_dir = Path('sample_data')
subject_id = None  # Set to None to use the first available subject

# Find subject directory
if subject_id is None:
    # List only directories, sorted alphabetically
    subdirs = sorted(d for d in base_dir.iterdir() if d.is_dir())
    if not subdirs:
        raise FileNotFoundError(f"No subject folders found under {base_dir}")
    subject_dir = subdirs[0]
    subject_id = subject_dir.name
    print(f"→ No subject_id given; using first available folder: {subject_id}")
else:
    subject_dir = base_dir / subject_id
    # Fail early with a clear message instead of silently finding no files
    if not subject_dir.is_dir():
        raise FileNotFoundError(f"Subject folder not found: {subject_dir}")

# Collect every file with a valid EEG format.
# rglob() yields paths in filesystem order, which differs between machines;
# sorting each extension's matches makes the selected file reproducible.
eeg_files = []
for ext in valid_eeg_formats:
    eeg_files.extend(sorted(subject_dir.rglob(f"*{ext}")))

if not eeg_files:
    raise FileNotFoundError(f"No valid EEG files found under {subject_dir}")

# Use the first valid EEG file found
eeg_path = eeg_files[0]
print(f"Loading {eeg_path}")

# Read the EEG data using MNE's io module; read_raw picks the reader from the
# file extension.
# preload=True loads all data into memory for faster processing
raw = mne.io.read_raw(eeg_path, preload=True)

# Print basic information about the loaded data
print("\nEEG Info:")
print(f"- File format: {eeg_path.suffix}")
print(f"- Sampling rate: {raw.info['sfreq']} Hz")
print(f"- Duration: {raw.times[-1]:.1f} seconds")
print(f"- Number of channels: {len(raw.ch_names)}")
print(f"- Channel Names: {raw.ch_names}")

## Setting Up Channel Locations

In [None]:
# Remove the vertical-EOG channel (VEOG), if the recording has one, so that
# only channels that belong to the montage remain
if 'VEOG' not in raw.ch_names:
    print("→ Channel 'VEOG' not found; no channels removed.")
else:
    raw.drop_channels(['VEOG'])
    print("→ Channel 'VEOG' dropped.")

# Show which channels are left
print("Current channels:", raw.ch_names)

# Build the standard BrainProducts RNP-BA-128 montage and attach it to the
# recording. Channel names are matched case-insensitively (match_case=False),
# and channels that are absent from the montage only trigger a warning
# (on_missing='warn') instead of an error.
montage = mne.channels.make_standard_montage('brainproducts-RNP-BA-128')
raw.set_montage(montage, match_case=False, on_missing='warn')

# Sanity-check the electrode positions visually:
# first as a flat 2D topographic layout ...
raw.plot_sensors(kind='topomap', show_names=True, title='2D EEG Sensor Montage')

# ... then as a 3D view
raw.plot_sensors(kind='3d', show_names=True, title='3D EEG Sensor Montage')

## Initial Data Inspection
Before processing, let's visualize the raw data to understand what we're working with:

### Plot the raw data

In [None]:
# This shows the time-domain signal for each EEG channel.
# scalings='auto' lets MNE derive the display scaling from the data instead of
# using its fixed per-channel-type defaults.
raw.plot(scalings='auto')

### Calculate and plot the power spectral density (PSD)

In [None]:
# Estimate the power spectral density between 1 and 40 Hz to inspect the
# frequency content of the recording, then draw it (show=False: the figure is
# created without calling the blocking show routine)
psd = raw.compute_psd(fmin=1, fmax=40)
psd.plot(show=False)

## Basic Preprocessing
Now let's perform the initial preprocessing steps: downsampling, removing power line noise with DSS, and band-pass filtering.

In [None]:
# Create a copy of the raw data for processing, so the untouched recording
# stays available in `raw`
raw_processed = raw.copy()

# Step 1: Downsample the data to reduce computational load
# High sampling rates aren't always necessary for EEG analysis
print('Original Sampling Rate:', raw_processed.info['sfreq'])
if raw_processed.info['sfreq'] > 1000:
    raw_processed = raw_processed.resample(1000)  # Downsample to 1000 Hz
    print('New Sampling Rate:', raw_processed.info['sfreq'])

# Step 2: Remove power line noise using DSS (Denoising Source Separation)
# DSS projects out the line-noise components instead of attenuating a whole
# frequency band the way a notch filter does
from meegkit import dss

# Define the line noise frequency (50 Hz in Europe, 60 Hz in US/Canada)
line_noise_frequency = 50

# Get the EEG data as a numpy array (channels x time)
eeg_data = raw_processed.get_data()

# Apply DSS algorithm to remove line noise
# dss_line returns the denoised data first; the second return value
# (`components`) is kept for inspection and is not used further in this cell
processed_data, components = dss.dss_line(
    eeg_data.T,                           # Data must be (time x channels)
    fline=line_noise_frequency,           # Line frequency to target
    sfreq=raw_processed.info['sfreq'],    # Sampling frequency
    show=False                            # Don't show the components
)

# Copy the denoised samples back into the existing buffer (channels x time).
# Writing in place (rather than rebinding `_data` to the transposed view)
# keeps the array's original memory layout and raises if the shape returned
# by dss_line does not match the recording.
raw_processed._data[:] = processed_data.T

# Step 3: Apply bandpass filter to focus on frequencies of interest
# This removes slow drifts and high-frequency noise.
# A 100 Hz low-pass is only valid when it lies below the Nyquist frequency;
# for recordings sampled at 200 Hz or less, skip the low-pass (h_freq=None)
# and apply the 1 Hz high-pass only instead of failing.
nyquist = raw_processed.info['sfreq'] / 2
low_pass_cutoff = 100 if nyquist > 100 else None
raw_processed.filter(
    l_freq=1,                 # High-pass filter cutoff at 1 Hz
    h_freq=low_pass_cutoff,   # Low-pass filter cutoff at 100 Hz (if possible)
    method='fir'              # Finite Impulse Response filter
)

## Automatic Bad Channel Detection
Bad channels can severely distort analyses and should be identified:

In [None]:
# Use PyPrep to automatically detect noisy/bad channels
from pyprep.find_noisy_channels import NoisyChannels

# Create a NoisyChannels object and run all of its detection methods.
# random_state is fixed so that repeated runs give the same result.
nd = NoisyChannels(raw_processed, random_state=1337)
nd.find_all_bads()

# Get the list of bad channels detected.
# The check is on the returned list: the NoisyChannels object itself is
# always truthy, so testing it would never reach the "none detected" branch.
bad_channels = nd.get_bads()
if bad_channels:
    print(f"Detected bad channels: {bad_channels}")

    # Mark these channels as bad in our data, keeping any channels that were
    # already marked as bad before detection
    already_marked = raw_processed.info['bads']
    raw_processed.info['bads'] = already_marked + [
        ch for ch in bad_channels if ch not in already_marked
    ]
else:
    print("No bad channels detected")

# Keep EEG channels only. exclude=[] keeps the channels marked as bad in the
# data: with exclude='bads' they would be dropped for good and could no
# longer be interpolated later. Channels listed in info['bads'] are left out
# of the average-reference computation and the ICA fit by default.
raw_processed.pick(picks='eeg', exclude=[])

## Artifact Removal with ICA
Independent Component Analysis (ICA) is a powerful method for identifying and removing artifacts:

In [None]:
# Apply ICA to decompose signals into independent components
from mne.preprocessing import ICA
from mne_icalabel import label_components

# First, re-reference to average: ICLabel expects average-referenced data
raw_processed.set_eeg_reference('average')

# Create an ICA object.
# ICLabel was designed for extended-Infomax decompositions, and mne-icalabel
# warns when given an ICA fitted with another algorithm, so extended Infomax
# is used here even though it is slower than FastICA.
# n_components=None lets MNE choose the number of components itself.
ica = ICA(
    n_components=None,
    random_state=97,                 # Fixed seed for reproducible components
    method="infomax",
    fit_params=dict(extended=True),  # Extended Infomax
)

# Fit the ICA model to our preprocessed data
ica.fit(raw_processed)

# Use ICLabel to automatically classify components
# ICLabel uses a neural network to classify components as:
# 'brain', 'muscle artifact', 'eye blink', 'heart beat', 'line noise', 'channel noise', or 'other'
ic_labels = label_components(raw_processed, ica, method='iclabel')

# Extract the labels for each component
labels = ic_labels["labels"]
print(f"Component classifications: {labels}")

# Determine which components to exclude (non-brain components)
# We want to keep components labeled as 'brain' or 'other' (uncertain)
exclude_idx = [
    idx for idx, label in enumerate(labels)
    if label not in ["brain", "other"]
]

print(f"Excluding these ICA components: {exclude_idx}")

# Set the components to exclude
ica.exclude = exclude_idx

# Visualize the spatial pattern (topography) of each ICA component
ica.plot_components()

# Apply ICA to remove the artifactual components
# This reconstructs the signal without the excluded components
raw_processed = ica.apply(raw_processed)

## Channel Interpolation
After artifact removal, we reconstruct the channels that were marked as bad by interpolating them from the surrounding good channels:

In [None]:
# Interpolate the bad channels to restore the full channel set
# This estimates the signal at bad channel locations using nearby good channels
# reset_bads=False leaves the interpolated channels listed in info['bads'],
# which keeps a record of the channels that were reconstructed.
# NOTE(review): only channels that are still present in the data and listed in
# info['bads'] can be interpolated — confirm earlier steps have not dropped them.
raw_processed.interpolate_bads(reset_bads=False)

## Saving the Processed Data
Finally, let's save our cleaned data for future analysis:

In [None]:
# Write the cleaned recording to disk, named after the subject.
# The .vhdr extension together with fmt='auto' lets MNE infer the export
# format from the file name; overwrite=True replaces a file left by an
# earlier run.
output_path = Path(f"{subject_id}_cleaned_resting_eeg.vhdr")
mne.export.export_raw(
    fname=output_path,
    raw=raw_processed,
    fmt='auto',
    overwrite=True,
)
print(f"Cleaned data saved to {output_path}")