# NeuroCluster:
<font size= 4> A Python toolbox for nonparametric cluster-based statistical testing of neurophysiological data with respect to continuous predictors 

First Authors: Alexandra Fink-Skular & Christina Maher  \
Updated: 11/10/2024 by AFS

In [1]:
# load required libraries
import numpy as np
import pandas as pd
from glob import glob
import datetime
from IPython import display

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
# If you are running NeuroCluster from a local repo (not through a virtual environment package install)
user_base_dir   = '/Users/christinamaher/Documents/GitHub/NeuroCluster/'
# user_base_dir   = '/Path/To/NeuroCluster/NeuroCluster/'
sample_data_dir = f'{user_base_dir}data/synthetic_validation_tfr_data/'

# Let's store the date so we can keep track of versions
date = datetime.date.today().strftime('%m%d%Y')

In [4]:
f'{user_base_dir}NeuroCluster/'

'/Users/christinamaher/Documents/GitHub/NeuroCluster/NeuroCluster/'

In [5]:
# load custom functions
import sys
sys.path.append(f'{user_base_dir}NeuroCluster/')
import NeuroCluster

# load the plotting utils 
import plotting_utils as plotting_utils

In [6]:
# Read the repo's requirements.txt (one package spec per line) so we can display
# the package versions NeuroCluster expects.
with open(f'{user_base_dir}requirements.txt') as f:
    required = f.read().splitlines()

print(f'Required packages: {required}')
# To build a matching environment from that file:
# conda create --name <env> --file requirements.txt

Required packages: ['joblib==1.0.1', 'matplotlib==3.4.0', 'numpy==1.21.0', 'pandas==1.3.0', 'scipy==1.7.0', 'seaborn==0.11.0', 'statsmodels==0.12.2']


# Step 1: Format input data (neural and behavioral)

The sample data for this notebook includes: 
- neural data: np.array (n_channels x n_trials x n_freqs x n_times)
- behavior data: pd.DataFrame (n_trials x n_variables)

These variables are extracted from an `mne.time_frequency.EpochsTFR` which is a spatiotemporal representation of neural data that includes power across trials, frequencies, and timepoints. Note, for this notebook we provide the neural and behavioral data in `np.array` *a priori*. These data can be found in the `sample_data_dir`.

Here is some example code, loading an `EpochsTFR` following preprocessing with  [MNE-Python](https://mne.tools/stable/index.html) and creating a neural data file (i.e., `channel_13.npy`) and our behavioral regressors (i.e., `sample_behavior.csv`):

```python
# load EpochsTFR data
power_epochs = mne.time_frequency.read_tfrs(fname=f'{tfr_dir}{subj_id}_tfr.h5')[0]

# save data from each channel to a .npy file
# (EpochsTFR.data is n_epochs x n_channels x n_freqs x n_times, so channels are indexed on axis 1)
for i in range(len(power_epochs.info['ch_names'])):
    channel = power_epochs.info['ch_names'][i]
    data = power_epochs.data[:, i]
    np.save(f'{results_dir}{channel}.npy', data)

# save the metadata DataFrame to a .csv file
metadata = pd.DataFrame(power_epochs.metadata)
metadata.to_csv(f'{results_dir}sample_behavior.csv', index=False)
```




In the following cells, let's load our sample TFR data from a `.fif` file with MNE and then pull the DataFrame of behavioral predictors out of its metadata.

In [7]:
# read fif file from data directory
sample_ieeg_file = f'{sample_data_dir}synthetic_validation_tfr.fif'
# use mne to load 
import mne

# Load the TFR data
tfr = mne.time_frequency.read_tfrs(sample_ieeg_file)

Reading /Users/christinamaher/Documents/GitHub/NeuroCluster/data/synthetic_validation_tfr_data/synthetic_validation_tfr.fif ...
Adding metadata with 1 columns


  tfr = mne.time_frequency.read_tfrs(sample_ieeg_file)


In [8]:
# print the behavioral metadata there should be an EV column
sample_behav = tfr[0].metadata
sample_behav

Unnamed: 0,expected_value
0,0.000000
1,0.010101
2,0.020202
3,0.030303
4,0.040404
...,...
95,0.959596
96,0.969697
97,0.979798
98,0.989899


In [9]:
# print channel names in tfr data
tfr[0].info['ch_names']

['synthetic_channel']

Let's define the behavioral variables we plan to include as independent variables in our linear regression (`multi_reg_vars`) and the regressor of interest (`target_var`) we will permute to determine whether significant clusters encoding this behavioral variable exist in our time x frequency data. All continuous predictors should be normalized. 

In [10]:
# set main predictor of interest for permutations *target_var must be a continuous numeric variable*
target_var = 'expected_value'

# define subset of predictor variables from sample_behav to include in regression (should include target_var)
multi_reg_vars = ['expected_value']

# subset input dataframe to include only multi_reg_vars
predictor_data = sample_behav.copy()[multi_reg_vars]

# let's print the first few rows of the predictor data to make sure it looks right
predictor_data.head()


Unnamed: 0,expected_value
0,0.0
1,0.010101
2,0.020202
3,0.030303
4,0.040404


In [11]:
tfr_data = tfr[0].data

In [12]:
tfr_data.shape

# get rid of dimension = 1 in tfr_data
tfr_data = tfr_data.squeeze()
tfr_data.shape

(100, 30, 500)

# Step 2: Perform within-electrode cluster test.

First, let's create a variable called `tfr_data` which is a `np.array` (dimensions should correspond with number of trials x number of frequencies x number of timepoints)

In [13]:
# Name of the channel we are testing; tfr_data (num epochs x num frequencies x num times)
# was already extracted from the loaded TFR object in the cells above.
demo_channel = 'synthetic_channel'

# Frequency axis passed to the plotting functions in Steps 2-4: 30 log-spaced
# values between 2 and 200 (same definition as in the full-pipeline cell at the
# end of the notebook). It is defined here so the notebook runs top-to-bottom
# without a NameError on `freqs`.
freqs = np.logspace(*np.log10([2, 200]), num=30)

# check tfr_data dimensions - must be num trials, num frequencies, num timepoints
print(f'Number of trials for {demo_channel}: {tfr_data.shape[0]}')
print(f'Number of frequencies for {demo_channel}: {tfr_data.shape[1]}')
print(f'Number of timepoints for {demo_channel}: {tfr_data.shape[2]}')

Number of trials for synthetic_channel: 100
Number of frequencies for synthetic_channel: 30
Number of timepoints for synthetic_channel: 500


Next, let's create an instance of `TFR_Cluster_Test`. This will be used to run the cluster test. It requires the tfr_data, predictor_data, target_var, and demo_channel as inputs.

In [None]:
help(NeuroCluster.TFR_Cluster_Test)

In [None]:
cluster_test  = NeuroCluster.TFR_Cluster_Test(tfr_data,predictor_data,target_var,demo_channel)
cluster_test

To calculate t-critical for a two-sided hypothesis test, we compute a T-distribution with N-K-1 degrees of freedom (N=number of samples, K = number of predictors in regression model) and find the t-values where the area of the t-distribution = 0.025 and 0.975 (1-alpha/ntails,alpha=0.05,ntails=2). 

In [None]:
plotting_utils.plot_tcritical(cluster_test)

Now, we are ready to run our linear regression based on the dependent neural variable (`tfr_data`) and independent behavioral variables (`predictor_data`, `target_var`) we passed as inputs to our `TFR_Cluster_Test` object. This will return pixel-level **β coefficients** and corresponding **t-statistics** for our TFR data in one electrode.

In [None]:
betas, tstats = cluster_test.tfr_regression()

Let's plot the **β coefficients** to give us an idea of the neural encoding pattern for our continuous predictor of interest.

In [None]:
NeuroCluster.plot_beta_coef(betas,cluster_test,freqs)

We can also plot the **t-statistics** that correspond with the **β coefficient** for each time-frequency point.

In [None]:
NeuroCluster.plot_tstats(tstats,cluster_test,freqs)

Let's highlight **clusters** (defined as consecutive time x frequency points) with significant t-statistics. We can do this separately for both positive and negative clusters. 

In [None]:
NeuroCluster.plot_clusters(tstats,cluster_test,freqs)

# Step 3: Calculate True Cluster Statistic(s) 

Now, we will identify the largest cluster (either/both positive and negative) and save the **cluster statistic** which will be our test statistic against our non-parametric null distribution. `max_tfr_cluster()` returns a dictionary containing the **cluster statistic**:`cluster_stat` and its associated **freq_idx**: `freq_idx` and **time_idx**:`time_idx`.

In [None]:
# Step 3: Find largest cluster(s) and return the max cluster statistic(s) and cluster's  frequencies x times indices
max_cluster_data  = cluster_test.max_tfr_cluster(tstats,max_cluster_output='all')
print(f'Max positive cluster dictionary: {max_cluster_data[0]}')
print(f'Max negative cluster dictionary: {max_cluster_data[1]}')

Let's plot our largest cluster and its associated **cluster statistic**

In [None]:
NeuroCluster.plot_max_clusters(cluster_test,tstats,freqs)
# TFR-Level Test Statistic: Largest Cluster

# Step 4: Compute cluster p-value(s) from null distribution of cluster statistics. 
To generate the null distribution, perform non-parametric cluster-based permutation testing by randomly permuting predictor of interest (target_var). 

At this point, we have computed the true cluster statistics from our neural (`tfr_data`) and behavioral data (`predictor_data`, `target_var`). Next, we will permute our input data and re-run the cluster identification procedure on each permuted dataset. This will allow us to generate a null distribution of cluster statistics, which we can use to evaluate the statistical significance of the cluster statistics observed in our true data.

`compute_null_cluster_stats()` takes `num_permutations` as an input, which specifies the desired number of permutations. The function will permute the regressor of interest according to this number. It returns a list of null cluster statistics, with the length of the list depending on the tails of the test. Here we generated 100 null cluster statistics, `num_permutations=100`, but we recommend running at least 200 permutations (500 to 1000 is best practice). 

In [34]:
null_cluster_distribution = cluster_test.compute_null_cluster_stats(num_permutations=100)

In [None]:
null_cluster_distribution

We will compute the ***p*-value** associated with our true cluster statistics based on the null distributions we create using `cluster_significance_test()`.

In [None]:
cluster_pvalue = cluster_test.cluster_significance_test(max_cluster_data,null_cluster_distribution) 
print(f'Positive cluster p-value: {cluster_pvalue[0]}')
print(f'Negative cluster p-value: {cluster_pvalue[1]}')

Let's create a plot showing the **null distribution(s)** we generated, with our true cluster statistic overlaid on top.

In [None]:
NeuroCluster.plot_null_distribution(null_cluster_distribution, max_cluster_data,cluster_pvalue,dpi=125)

In [39]:
# Save all plots
# First threshold the t-statistics, then build every figure of the workflow in one call.
tstat_threshold = cluster_test.threshold_tfr_tstat(tstats)
tcrit_plot,beta_plot,tstat_plot,cluster_plot,max_cluster_plot,null_distribution_plot = NeuroCluster.plot_neurocluster_results(betas,cluster_test,
                                                                                                                    max_cluster_data, null_cluster_distribution, tstats, tstat_threshold,cluster_pvalue,freqs)

# Define the directory where you want to save the plots (one folder per channel/predictor pair)
# NOTE(review): `results_dir` is not assigned in any cell shown in this notebook -
# set it to your own output folder before running this cell.
output_directory = f'{results_dir}/{demo_channel}_{target_var}'

# Create the directory if it doesn't exist
NeuroCluster.create_directory(output_directory)

# Save plots to the output directory; filenames are prefixed with the test's
# `alternative` setting so runs with different tails do not overwrite each other
NeuroCluster.save_plot_to_pdf(tcrit_plot, output_directory, f'{cluster_test.alternative}_tcrit_plot.pdf')
NeuroCluster.save_plot_to_pdf(beta_plot, output_directory, f'{cluster_test.alternative}_beta_plot.pdf')
NeuroCluster.save_plot_to_pdf(tstat_plot, output_directory, f'{cluster_test.alternative}_tstat_plot.pdf')
NeuroCluster.save_plot_to_pdf(cluster_plot, output_directory, f'{cluster_test.alternative}_cluster_plot.pdf')
NeuroCluster.save_plot_to_pdf(max_cluster_plot, output_directory, f'{cluster_test.alternative}_max_cluster_plot.pdf')
NeuroCluster.save_plot_to_pdf(null_distribution_plot, output_directory, f'{cluster_test.alternative}_null_distribution_plot.pdf')


# One-Sided Hypothesis Test Example
Rather than testing whether a tfr cluster significantly encodes our `target_var` in general, we can evaluate the directionality of `target_var` encoding in our cluster. Specifically, we can test whether neuronal activity in a cluster increases (or decreases) with increasing (or decreasing) values of the `target_var`.

In [None]:
cluster_test  = NeuroCluster.TFR_Cluster_Test(tfr_data,predictor_data,target_var,demo_channel,alternative='greater')
cluster_test

To calculate t-critical for a one-sided hypothesis test, we compute a T-distribution with N-K-1 degrees of freedom (N=number of samples, K = number of predictors in regression model) and find the t-value where the area of the t-distribution = 0.95 (1-alpha,alpha=0.05). 


In [None]:
NeuroCluster.plotting_utils.plot_tcritical(cluster_test)

Now, we are ready to run our linear regression based on the dependent neural variable (`tfr_data`) and independent behavioral variables (`predictor_data`, `target_var`) we passed as inputs to our `TFR_Cluster_Test` object. This will return pixel-level **β coefficients** and corresponding **t-statistics** for our TFR data in one electrode.

In [None]:
betas, tstats = cluster_test.tfr_regression()

Let's plot the **β coefficients** to give us an idea of the neural encoding pattern for our continuous predictor of interest.

In [None]:
NeuroCluster.plot_beta_coef(betas,cluster_test,freqs)

We can also plot the **t-statistics** that correspond with the **β coefficient** for each time-frequency point.

In [None]:
NeuroCluster.plot_tstats(tstats,cluster_test,freqs)

Let's highlight **clusters** (defined as consecutive time x frequency points) with significant t-statistics.

In [None]:
NeuroCluster.plot_clusters(tstats,cluster_test,freqs,figsize=(6,5))

# Step 3: Calculate True Cluster Statistic(s) 

Now, we will identify the largest cluster (either/both positive and negative) and save the **cluster statistic** which will be our test statistic against our non-parametric null distribution. `max_tfr_cluster()` returns a dictionary containing the **cluster statistic**:`cluster_stat` and its associated **freq_idx**: `freq_idx` and **time_idx**:`time_idx`.

In [None]:
# Step 3: Find largest cluster(s) and return the max cluster statistic(s) and cluster's  frequencies x times indices
max_cluster_data  = cluster_test.max_tfr_cluster(tstats,max_cluster_output='all')
print(f'Max positive cluster dictionary: {max_cluster_data[0]}')

Let's plot our **positive** largest cluster and its associated **cluster statistic**

In [None]:
NeuroCluster.plot_max_clusters(cluster_test,tstats,freqs,figsize=(6,5))

# Step 4: Compute cluster p-value(s) from null distribution of cluster statistics. 
To generate the null distribution, perform non-parametric cluster-based permutation testing by randomly permuting predictor of interest (target_var). 

At this point, we have computed the true cluster statistics from our neural (`tfr_data`) and behavioral data (`predictor_data`, `target_var`). Next, we will permute our input data and re-run the cluster identification procedure on each permuted dataset. This will allow us to generate a null distribution of cluster statistics, which we can use to evaluate the statistical significance of the cluster statistics observed in our true data.

`compute_null_cluster_stats()` takes `num_permutations` as an input, which specifies the desired number of permutations. The function will permute the regressor of interest according to this number. It returns a list of null cluster statistics, with the length of the list depending on the tails of the test. Here we generated 100 null cluster statistics, `num_permutations=100`, but we recommend running at least 200 permutations (500 to 1000 is best practice). 

In [62]:
null_cluster_distribution = cluster_test.compute_null_cluster_stats(num_permutations=100)

We will compute the ***p*-value** associated with our true cluster statistics based on the null distributions we create using `cluster_significance_test()`.

In [None]:
cluster_pvalue = cluster_test.cluster_significance_test(max_cluster_data,null_cluster_distribution) 
print(f'Positive cluster p-value: {cluster_pvalue[0]}')

Let's create a plot showing the **null distribution(s)** we generated, with our true cluster statistic overlaid on top.

In [None]:
NeuroCluster.plot_null_distribution(null_cluster_distribution, max_cluster_data,cluster_pvalue,figsize=(6,5),dpi=125)

# Trying to replicate findings using MNE's built-in t-test functionality

In [None]:
# Cross-check: see whether MNE's built-in cluster permutation test on two groups of trials
# agrees with the NeuroCluster result for our predictor of interest.
import mne

# MNE's cluster test compares discrete groups, so we discretize the continuous predictor with a
# median split. The synthetic metadata only contains the `target_var` column ('expected_value');
# there is no 'error' column, so indexing sample_behav['error'] would raise a KeyError.
# The `*_error` variable names are kept so anything else that refers to them keeps working.
error = sample_behav[target_var].to_numpy()
median_error = np.median(error)
low_error = tfr_data[error < median_error, :, :]    # trials below the median of target_var
high_error = tfr_data[error >= median_error, :, :]  # trials at or above the median of target_var

# Run MNE's cluster permutation test on the two groups of trials
t_obs, clusters, cluster_pv, H0 = mne.stats.permutation_cluster_test([low_error, high_error], n_permutations=1000, tail=0)



In [None]:
# Report every cluster whose permutation p-value passes alpha = 0.05.
# The "none found" message is printed once, and only when no cluster is
# significant (including the case where no clusters were formed at all),
# rather than once per non-significant cluster.
significant_clusters = [(i_c, p_val) for i_c, p_val in enumerate(cluster_pv) if p_val <= 0.05]

for i_c, p_val in significant_clusters:
    print(f"Cluster {i_c} p-value: {p_val}")

if not significant_clusters:
    print("No significant clusters found")

# Let's run the entire pipeline at once and save plots. 

In [14]:
### NeuroCluster single electrode workflow: 

# Step 1: Create TFR_Cluster_Test Object (two-sided test on target_var for demo_channel)
cluster_test  = NeuroCluster.TFR_Cluster_Test(tfr_data,predictor_data,target_var,demo_channel,alternative='two-sided')

# Step 2: Run TFR regression to extract beta coefficients for the predictor of interest (target_var) & tstats for each pixel in TFR.
# Then determine which t-statistics are significant based on the critical t-value and save a thresholded t-statistic matrix.
betas, tstats = cluster_test.tfr_regression()
tstat_threshold = cluster_test.threshold_tfr_tstat(tstats)

# Step 3: Find largest cluster(s) and return the max cluster statistic(s) and cluster's  frequencies x times indices
max_cluster_data  = cluster_test.max_tfr_cluster(tstats,max_cluster_output='all')

# Step 4: Create null distribution of maximum cluster statistics from permuted data
# (100 permutations keeps the demo fast; the text above recommends at least 200)
null_cluster_distribution = cluster_test.compute_null_cluster_stats(num_permutations=100)

# Step 5: Use null cluster statistic distribution from permutations to compute non-parametric p value 
cluster_pvalue = cluster_test.cluster_significance_test(max_cluster_data,null_cluster_distribution) 


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    3.1s
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    3.3s
[Parallel(n_jobs=-1)]: Done 1688 tasks      | elapsed:    3.8s
[Parallel(n_jobs=-1)]: Done 15000 out of 15000 | elapsed:    4.7s finished


In [15]:
import plotting_utils as plotting_utils

In [16]:
# Let's plot all the steps together and save the figures next to the sample data.
# Frequency axis for the plots: 30 log-spaced values between 2 and 200.
freqs = np.logspace(*np.log10([2, 200]), num=30)
tcrit_plot,beta_plot,tstat_plot,cluster_plot,max_cluster_plot,null_distribution_plot = plotting_utils.plot_neurocluster_results(betas,cluster_test, max_cluster_data, null_cluster_distribution, tstats, tstat_threshold,cluster_pvalue, freqs)

# Define the directory where you want to save the plots.
# sample_data_dir (built from user_base_dir at the top of the notebook) resolves to the same
# folder that used to be hard-coded here, so other users only need to change user_base_dir.
output_directory = sample_data_dir

import os 
import matplotlib.pyplot as plt

def save_plot_to_pdf(fig, directory, filename):
    """Save a figure into ``directory`` under ``filename``, then close it.

    Despite the function name, PDF output is not forced: the path is passed to
    ``fig.savefig`` unchanged, so the extension of ``filename`` (e.g. ``.png``
    or ``.pdf``) decides the file format.

    Parameters
    ----------
    fig : object with a ``savefig`` method
        The figure to write (e.g. one of the plots returned by the plotting helpers).
    directory : str
        Folder to write into. It is created if it does not exist yet.
    filename : str
        File name, including extension, joined onto ``directory``.

    Returns
    -------
    None
    """
    # Create the target folder first so saving into a new results folder does
    # not fail; an empty directory string means "current folder" and is left alone.
    if directory:
        os.makedirs(directory, exist_ok=True)
    filepath = os.path.join(directory, filename)
    fig.savefig(filepath, dpi=300, bbox_inches='tight')
    plt.close(fig)  # Close the figure to avoid display and memory issues

# Save plots to the output directory.
# The filenames end in .png, so PNG images are written even though the helper
# is named save_plot_to_pdf.
# NOTE(review): tcrit_plot is created above but not saved here - confirm whether that is intentional.
save_plot_to_pdf(beta_plot, output_directory, 'beta_plot.png')
save_plot_to_pdf(tstat_plot, output_directory, 'tstat_plot.png')
save_plot_to_pdf(cluster_plot, output_directory, 'cluster_plot.png')
save_plot_to_pdf(max_cluster_plot, output_directory, 'max_cluster_plot.png')
save_plot_to_pdf(null_distribution_plot, output_directory, 'null_distribution_plot.png')

In [None]:
# starting at 0 get every 6th freq
freqs[::6]

In [None]:
yticks

In [None]:
selected_freqs = np.linspace(freqs[0], freqs[-1], num_yticks) 
selected_freqs