# NeuroCluster:
<font size=4>Non-parametric cluster-based permutation testing to identify neurophysiological encoding of continuous variables with time-frequency resolution</font>

Authors: Christina Maher & Alexandra Fink-Skular \
Updated: 06/19/2024 by CMM

In [None]:
import numpy as np
import pandas as pd
import mne
from glob import glob
from scipy.stats import zscore, t, linregress, ttest_ind, ttest_rel, ttest_1samp 
import os 
import re
import h5io
import pickle 
import time 
import datetime 
from joblib import Parallel, delayed
import statsmodels.api as sm 
from scipy.ndimage import label 
import statsmodels.formula.api as smf
import tqdm
import operator
import matplotlib.pyplot as plt


import warnings
# NOTE(review): this suppresses every warning for the rest of the session,
# including any raised by pandas/MNE in the cells below.
warnings.filterwarnings('ignore')

# keep this so we can use our respective paths for testing
# (selects which per-user branch runs in the path / loading cells below)
current_user = 'christina'

In [None]:
# IPython magics: load the autoreload extension and switch it to mode 2
# (presumably so edits to the project modules are picked up without restarting the kernel)
%load_ext autoreload
%autoreload 2

In [None]:
# Resolve the per-user project directories.
#   base_dir : appended to sys.path in the next cell so the project modules can be imported
#   data_dir : root used to build the other data directories
#   tfr_dir  : location of the saved time-frequency (.h5) files
#   anat_dir : location of the anatomy label files
#   beh_dir  : behavioural data directory -- only defined for 'alie'
if current_user == 'christina':
    base_dir = '/Users/christinamaher/Documents/GitHub/NeuroCluster/scripts/'
    data_dir = '/Users/christinamaher/Documents/GitHub/NeuroCluster/'
    tfr_dir  = f'{data_dir}tfr/'
    anat_dir = f'{data_dir}anat/'
elif current_user == 'alie':
    base_dir = '/Users/alexandrafink/Documents/GraduateSchool/SaezLab/NeuroCluster/'
    data_dir = '/Users/alexandrafink/Documents/GraduateSchool/SaezLab/SWB/'
    tfr_dir  = f'{data_dir}ephys_analysis/data/'
    beh_dir  = f'{data_dir}behavior_analysis/behavior_preprocessed/'
    anat_dir = f'{data_dir}anat_recons/'
else:
    # Fail here with a clear message instead of a NameError on base_dir/tfr_dir in a later cell.
    raise ValueError(
        f"Unknown current_user {current_user!r}; expected 'christina' or 'alie'"
    )

In [None]:
# load functions
# base_dir is put on sys.path so the project modules below can be imported by name
import sys
sys.path.append(base_dir)
from tfr_cluster_test import *  # presumably provides TFR_Cluster_Test used below -- confirm
from helper_utils import *  # presumably provides the prepare_* / permutation helpers used below -- confirm
#from plotting_utils import * 

In [None]:
# Today's date as an MMDDYYYY string, echoed for reference
date = f'{datetime.date.today():%m%d%Y}'
print(date)

# Step 1: Format Input Data (Currently within-subject)
- neural input: np.array (n_channels x n_epochs x n_freqs x n_times)
- regressor data: pd.DataFrame (n_epochs x n_features), one column per regressor

In [None]:
# Load the saved time-frequency data for one subject.
# Each user reads a different subject from their own tfr_dir layout.
# NOTE(review): the trailing [0] assumes read_tfrs returns a list -- confirm for the installed MNE version.
if current_user == 'christina':
    subj_id = 'MS009'
    # tfr_dir already ends in '/', so this path contains '//'
    power_epochs = mne.time_frequency.read_tfrs(
        fname=f'{tfr_dir}/{subj_id}_tfr.h5'
    )[0]
elif current_user == 'alie':
    subj_id = 'MS002'
    power_epochs = mne.time_frequency.read_tfrs(
        fname=f'{tfr_dir}{subj_id}/{subj_id}_CpeOnset-tfr.h5'
    )[0]

In [None]:
# Set ROI for single-ROI analysis and build the stacked behavioural regressor
# table (one row per trial, all ROI subjects concatenated). 'alie' only.
if current_user == 'alie':
    roi = 'ains'
    # all variables included in the multiple regression
    multi_reg_vars = ['GambleChoice','TotalProfit','RPE','decisionCPE']
    # main variable of interest for permutations
    permute_var = 'decisionCPE'

    roi_subj_beh_df = []

    # NOTE(review): roi_subj_ids is not defined in any visible cell -- it must
    # already exist in the session before this cell runs.
    for subj_id in roi_subj_ids:
        # load subj behavior data
        beh_df = pd.read_csv(f'{beh_dir}{subj_id}_task_data')

        # Blank out any non-missing choice label that is neither 'gamble' nor 'safe'.
        # A single .loc assignment replaces the original chained
        # beh_df['GambleChoice'][mask] = ..., which pandas does not guarantee
        # to write through to beh_df.
        invalid_choice = (
            (beh_df.GambleChoice != 'gamble')
            & (beh_df.GambleChoice != 'safe')
            & (~beh_df.GambleChoice.isnull())
        )
        beh_df.loc[invalid_choice, 'GambleChoice'] = np.nan

        # set datatypes for categorical vars
        beh_df['GambleChoice'] = beh_df['GambleChoice'].astype('category')
        beh_df['subj_id'] = subj_id

        # z-score every regressor except the categorical GambleChoice (first entry)
        # NOTE(review): scipy's zscore propagates NaN by default, so one missing
        # value turns the whole column into NaN -- confirm inputs are complete.
        for var in multi_reg_vars[1:]:
            beh_df[var] = zscore(beh_df[var])

        # keep only the identifier columns and the regressors
        beh_df = beh_df[['subj_id','Round'] + multi_reg_vars]
        roi_subj_beh_df.append(beh_df)

    roi_subj_beh_df = pd.concat(roi_subj_beh_df).reset_index(drop=True)
    # re-apply the categorical dtype on the concatenated frame
    roi_subj_beh_df['GambleChoice'] = roi_subj_beh_df['GambleChoice'].astype('category')


In [None]:
# Build the regressor table from the loaded TFR object with the project helper
# prepare_regressor_df (defined outside this notebook).
# NOTE(review): this rebinds beh_df, which the 'alie' ROI cell also uses as its per-subject loop variable.
beh_df = prepare_regressor_df(power_epochs)

In [None]:
## new function for getting elecs in ROI
# ROI labels to look up.
# NOTE(review): rebinds `roi`, which the 'alie' ROI cell sets to the string 'ains'.
roi = ['lpfc','ofc']
# prepare_anat_dic is a project helper (not visible here); it is given the ROI list
# and the path to the master anatomy label CSV.
roi_subj_elecs = prepare_anat_dic(roi, f'{anat_dir}master_labels.csv')
# display the result
roi_subj_elecs

## Run Univariate Regression 
- Use the TFR_Cluster_Test class to run the univariate regression
- Allows for multiple regression and pixel-wise parallelization, so with further speed improvements it will ultimately be worth it. (ONGOING DEV)

In [None]:
#### class TFR_Cluster_Test dev + debugging

# Pull out a single electrode's TFR data (plus, for 'alie', the matching predictors).
if current_user == 'christina':

    # first channel (index 0): its name and its TFR data
    ch_name = power_epochs.info['ch_names'][0]
    tfr_data = np.squeeze(power_epochs._data[:, 0, :, :].copy())

elif current_user == 'alie':

    # channel index 16
    tfr_data = np.squeeze(power_epochs._data[:, 16, :, :].copy())
    # rows for subject MS002 only, without the identifier columns
    predictor_data = roi_subj_beh_df.loc[
        roi_subj_beh_df['subj_id'] == 'MS002'
    ].drop(columns=['subj_id', 'Round'])

    # the permuted variable on its own, for the univariate test
    test_univar = predictor_data[permute_var].copy()

In [None]:
# display the regressor table
beh_df

In [None]:
# display the variable of interest
# NOTE(review): for 'christina', permute_var is only assigned in the next cell, so this fails on a fresh top-to-bottom run.
permute_var

In [None]:
# Fit the pixel-wise regression for one channel and extract its clusters.
# NOTE(review): permute_var and ch_name are hard-coded here (channel index 0), overriding
# whatever the per-user cells set; for 'alie' tfr_data was taken from channel index 16,
# so ch_name would not match the data -- confirm.
permute_var = 'ev_zscore'
ch_name = power_epochs.info['ch_names'][0]
cluster_test = TFR_Cluster_Test(tfr_data,beh_df,permute_var,ch_name)
# betas / tstats: the two outputs of the project's tfr_multireg()
betas, tstats = cluster_test.tfr_multireg()
# cluster_data is later iterated by plot_clusterstats as dicts with 'freq_idx' / 'time_idx' keys
cluster_data = cluster_test.max_tfr_cluster(tstats)

cluster_data

In [None]:
def plot_beta_coef(betas, cluster_test):
    """Show the beta-coefficient map for one channel as a heat map.

    Parameters
    ----------
    betas : array-like
        Image passed to ``plt.imshow``; presumably (n_freqs, n_times), since
        the y axis is labelled 'Freq' and the x axis 'Time'.
    cluster_test : object
        Must expose ``predictor_data`` (with ``.columns``), ``permute_var``
        and ``ch_name``; these are used to build the title.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    plt.imshow(betas, interpolation = 'Bicubic',cmap='Spectral_r', aspect='auto',origin='lower',vmin=-.5,vmax=.5)
    plt.colorbar()
    plt.ylabel('Freq')
    plt.xlabel('Time')

    title = f'Beta coefficients from {cluster_test.ch_name} encoding {cluster_test.permute_var}'

    # Every predictor other than the variable of interest is a control variable.
    # (list.remove() mutates in place and returns None, which is why the old
    # title always ended in "controlling for None".)
    control_variables = [
        var for var in cluster_test.predictor_data.columns.tolist()
        if var != cluster_test.permute_var
    ]
    # make title dynamic depending on whether or not you are controlling for other variables
    if control_variables:
        controls = ', '.join(str(var) for var in control_variables)
        title = f'{title} controlling for {controls}'

    plt.title(title)
    plt.show()


def plot_tstats(tstats, cluster_test):
    """Show the t-statistic map for one channel as a heat map.

    Parameters
    ----------
    tstats : array-like
        Image passed to ``plt.imshow``; presumably (n_freqs, n_times), since
        the y axis is labelled 'Freq' and the x axis 'Time'.
    cluster_test : object
        Must expose ``predictor_data`` (with ``.columns``), ``permute_var``
        and ``ch_name``; these are used to build the title.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    plt.imshow(tstats, interpolation = 'Bicubic',cmap='Spectral_r', aspect='auto',origin='lower',vmin=-3,vmax=3)
    plt.colorbar()
    plt.ylabel('Freq')
    plt.xlabel('Time')

    title = f'T-statistics for beta coefficients from {cluster_test.ch_name} encoding {cluster_test.permute_var}'

    # Every predictor other than the variable of interest is a control variable.
    # (list.remove() mutates in place and returns None, which is why the old
    # title always ended in "controlling for None".)
    control_variables = [
        var for var in cluster_test.predictor_data.columns.tolist()
        if var != cluster_test.permute_var
    ]
    # make title dynamic depending on whether or not you are controlling for other variables
    if control_variables:
        controls = ', '.join(str(var) for var in control_variables)
        title = f'{title} controlling for {controls}'

    plt.title(title)
    plt.show()

def plot_clusterstats(cluster_data, tstats, cluster_test):
    """Draw one figure per cluster, marking the cluster's rectangular extent.

    Parameters
    ----------
    cluster_data : iterable of dict
        Each entry provides 'freq_idx' and 'time_idx', each an inclusive
        (start, end) index pair.
    tstats : array-like
        Only used as the shape/dtype template for the mask image.
    cluster_test : object
        Supplies ``ch_name`` and ``permute_var`` for the title.

    Returns
    -------
    None
        Each figure is displayed with ``plt.show()``.
    """
    for entry in cluster_data:
        (f_lo, f_hi), (t_lo, t_hi) = entry['freq_idx'], entry['time_idx']

        # Blank image shaped like tstats; the cluster rectangle (end indices
        # inclusive) is set to 1.
        # NOTE(review): the rectangle holds the constant 1, not the t-values,
        # and is drawn on a -3..3 colour scale -- confirm this is intended.
        cluster_mask = np.zeros_like(tstats)
        freq_span = slice(f_lo, f_hi + 1)
        time_span = slice(t_lo, t_hi + 1)
        cluster_mask[freq_span, time_span] = 1

        plt.imshow(
            cluster_mask,
            interpolation='bicubic',
            cmap='Spectral_r',
            aspect='auto',
            origin='lower',
            vmin=-3,
            vmax=3,
        )
        plt.xlabel('Time')
        plt.ylabel('Freq')
        plt.title(f'Significant cluster from {cluster_test.ch_name} encoding {cluster_test.permute_var}')
        plt.show()


# plot the betas, the t-statistics and each cluster for the fitted channel
plot_beta_coef(betas, cluster_test)
plot_tstats(tstats, cluster_test)
plot_clusterstats(cluster_data, tstats, cluster_test)


## Run Multiple Regression 
- Using TFRClusterTest class
- Need to implement the permutation version of this code (ONGOING DEV)
- Need to add more functionality + plotting utils (ONGOING DEV)

In [None]:
# drop certain columns from predictor data
# NOTE(review): predictor_data is only created in the 'alie' branch above, and there it holds
# GambleChoice/TotalProfit/RPE/decisionCPE -- none of the columns dropped here. This cell
# presumably expects predictor_data to come from beh_df for 'christina'; confirm.
predictor_data = predictor_data.drop(columns=['condition','chosen_shape_current_trial','chosen_color_current_trial','chosen_shape_previous_trial','chosen_color_previous_trial','ev'])
predictor_data

In [None]:
# Multiple-regression version: same pipeline, with every column of predictor_data as a regressor.
# NOTE(review): no channel name is passed here, unlike the 4-argument call in the univariate cell.
elec_test = TFR_Cluster_Test(tfr_data,predictor_data,permute_var)
elec_betas, elec_tstats = elec_test.tfr_multireg()
elec_cluster_data = elec_test.max_tfr_cluster(elec_tstats)

elec_cluster_data

In [None]:
# heat map of the multiple-regression betas
plt.imshow(elec_betas, interpolation = 'Bicubic',cmap='Spectral_r', aspect='auto',origin='lower',vmin=-.5,vmax=.5) 
plt.colorbar()
plt.xlabel('Time')
plt.ylabel('Freq')
# NOTE(review): the title always names channel index 16; tfr_data is taken from index 16
# only in the 'alie' branch (index 0 for 'christina'), so the label can be wrong.
plt.title(f'{power_epochs.ch_names[16]} - beta coefficients')
plt.show()

# Step 3: Extract Surrogate Clusters from Pixel-wise Permutation
- Loop over each electrode
- Run the permutations (1000x) in parallel within the electrode loop
- Calculate the max cluster p-value for each +/- cluster for each electrode
- Save the permuted cluster statistics for each electrode

DEPENDENCIES: permuted_tfr_cluster_test, tfr_cluster_test

In [None]:
# HERE IS THE NEW PERMUTATION IMPLEMENTATION 
# rebuilds the test object (rebinding cluster_test) and passes it to the project's
# run_permutation_test with 1000 permutations
cluster_test = TFR_Cluster_Test(tfr_data,predictor_data,permute_var)
perm_cluster_results = run_permutation_test(cluster_test, num_permutations=1000)

In [None]:
# initialize list to store cluster data
cluster_list = []

# Serial version: 1000 iterations, one cluster statistic stored per iteration.
# NOTE(review): test_univar is only defined in the 'alie' branch above.
# NOTE(review): no shuffling of the regressor is visible in this cell; it is presumably done
# inside TFR_Cluster_Test / max_tfr_cluster -- confirm.
# NOTE(review): this rebinds cluster_data, the observed result that plot_clusterstats uses.
for p in range(1000):
    # the 4th positional argument is 1000 here, whereas the univariate cell passes ch_name there -- confirm the signature
    uni_test = TFR_Cluster_Test(tfr_data,pd.DataFrame(test_univar),permute_var,1000)
    _, uni_tstats = uni_test.tfr_multireg()
    cluster_data = uni_test.max_tfr_cluster(uni_tstats,output='cluster_stat') 
    # add permutation number to cluster data
    cluster_data['perm_num'] = p
    del uni_test, uni_tstats # clear memory
    cluster_list.append(cluster_data) 



In [None]:
### TEST PERMUTATIONS
# For every channel: run num_permutations calls of permuted_tfr_cluster_test in
# parallel, keep the results in all_ch_perm and pickle them per channel.
# NOTE(review): num_channels, reg_data, tcritical, ch_names and results_dir are
# not defined in any visible cell -- they must already exist in the session.
num_permutations = 1000
start = time.time() # start timer

all_ch_perm = {}

for c in range(num_channels):
    ch_start = time.time() # start timer

    # Single-channel TFR data, extracted once per channel instead of once per permutation.
    ch_tfr_data = np.squeeze(power_epochs._data[:,c,:,:])

    # Prepare arguments for the permutation function (same arguments on every call)
    permutation_args = [
        (ch_tfr_data, reg_data, tcritical)
        for _ in range(num_permutations)
    ]

    # Perform permutations in parallel
    elec_permuted_data = Parallel(n_jobs=-1, verbose=12)(
        delayed(permuted_tfr_cluster_test)(*args)
        for args in permutation_args
    )

    # save in all elec dict
    all_ch_perm[ch_names[c]] = elec_permuted_data
    # the context manager closes the file even if pickling fails
    with open(f'{results_dir}{subj_id}_{ch_names[c]}_perm_clusters.pkl', "wb") as pkl_file:
        pickle.dump(elec_permuted_data, pkl_file)

    ch_end = time.time()
    print(f'{ch_names[c]} permute time: ', '{:.2f}'.format(ch_end-ch_start))

end = time.time()
print('{:.2f} s'.format(end-start)) # print time elapsed for computation (approx 4 seconds per permutation)


In [None]:
# Single-channel rerun of the permutation test, saved under the
# "reduced_output" file name.
# NOTE(review): `c` is not set in this cell -- it is whatever index the previous
# channel loop left behind (its last channel). reg_data, tcritical, ch_names and
# results_dir are likewise not defined in any visible cell.
num_permutations = 1000
ch_start = time.time() # start timer

# Single-channel TFR data, extracted once instead of once per permutation.
ch_tfr_data = np.squeeze(power_epochs._data[:,c,:,:])

# Prepare arguments for the permutation function (same arguments on every call)
permutation_args = [
    (ch_tfr_data, reg_data, tcritical)
    for _ in range(num_permutations)
]

# Perform permutations in parallel
elec_permuted_data_reduc = Parallel(n_jobs=-1, verbose=12)(
    delayed(permuted_tfr_cluster_test)(*args)
    for args in permutation_args
)

# save to disk; the context manager closes the file even if pickling fails
with open(f'{results_dir}{subj_id}_{ch_names[c]}_reduced_output_perm_clusters.pkl', "wb") as pkl_file:
    pickle.dump(elec_permuted_data_reduc, pkl_file)

ch_end = time.time()
print(f'{ch_names[c]} permute time: ', '{:.2f}'.format(ch_end-ch_start))

In [None]:
# display the permutation results left by the per-channel loop (its last channel)
elec_permuted_data

In [None]:
# display the permutation results from the single-channel rerun
elec_permuted_data_reduc