# Preprocess Continuous Data - Resting State
## MS007
10/01/2023 \
Updated: 03/18/2024 - resaved bp reref lfp data without any data clipping.


In [1]:
import numpy as np
import mne
from glob import glob
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
import seaborn as sns
from scipy.stats import zscore, linregress, ttest_ind, ttest_rel, ttest_1samp
import pandas as pd
from mne.preprocessing.bads import _find_outliers
import os 
import joblib
import emd
import re

import warnings
warnings.filterwarnings('ignore')

OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.


In [2]:
import sys
sys.path.append('/Users/alexandrafink/Documents/GitHub/LFPAnalysis')

In [3]:
from LFPAnalysis import lfp_preprocess_utils, sync_utils, analysis_utils, nlx_utils

In [4]:
# Subject processed by this notebook
subj_id = 'MS007'

# Project root; the anatomy sheets, raw recordings and cleaned outputs all live beneath it
base_dir = '/Users/alexandrafink/Documents/GraduateSchool/SaezLab/resting_state_proj/resting_state_ieeg/'
preprocess_dir = f'{base_dir}preprocess'
anat_dir = f'{base_dir}anat'
neural_dir = f'{preprocess_dir}/raw_data/{subj_id}'
save_dir = f'{preprocess_dir}/clean_data/{subj_id}'

# Make sure the output folder exists before any save cell runs
os.makedirs(save_dir, exist_ok=True)

#### Import Data

In [14]:
# Locate the subject's EDF recording. glob() returns matches in arbitrary order, so sort
# to make the choice of file deterministic when more than one is present.
edf_files = sorted(glob(f'{neural_dir}/*.edf'))
if not edf_files:
    raise FileNotFoundError(f'No .edf files found in {neural_dir}')
if len(edf_files) > 1:
    print(f'Found {len(edf_files)} EDF files; loading the first: {edf_files[0]}')

# preload=True loads the signal into memory so it can be filtered and resampled below
mne_data = mne.io.read_raw_edf(edf_files[0], preload=True)
mne_data

0,1
Measurement date,"January 01, 2001 13:33:36 GMT"
Experimenter,Unknown
Digitized points,Not available
Good channels,276 EEG
Bad channels,
EOG channels,Not available
ECG channels,Not available
Sampling frequency,1024.00 Hz
Highpass,0.00 Hz
Lowpass,512.00 Hz


In [15]:
# Display the channel labels as read from the EDF file, before any renaming
mne_data.ch_names

['LmOIF1',
 'LmOIF2',
 'LmOIF3',
 'LmOIF4',
 'LmOIF5',
 'LmOIF6',
 'LmOIF7',
 'LmOIF8',
 'LmOIF9',
 'RmOlF1',
 'RmOlF2',
 'RmOlF3',
 'RmOlF4',
 'RmOlF5',
 'RmOlF6',
 'RmOlF7',
 'RmOlF8',
 'RmOlF9',
 'L almM1',
 'L almM2',
 'L almM3',
 'L almM4',
 'L almM5',
 'L almM6',
 'L almM7',
 'L almM8',
 'L almM9',
 'L almM10',
 'L almM11',
 'L almM12',
 'L almM13',
 'L almM14',
 'R almM1',
 'R almM2',
 'R almM3',
 'R almM4',
 'R almM5',
 'R almM6',
 'R almM7',
 'R almM8',
 'R almM9',
 'R almM10',
 'R almM11',
 'R almM12',
 'L CMfO1',
 'L CMfO2',
 'L CMfO3',
 'L CMfO4',
 'L CMfO5',
 'L CMfO6',
 'L CMfO7',
 'L CMfO8',
 'L CMfO9',
 'L CMfO10',
 'L CMfO11',
 'L CMfO12',
 'L CMfO13',
 'L CMfO14',
 'C59',
 'C60',
 'C61',
 'C62',
 'C63',
 'C64',
 'RCMfO1',
 'RCMfO2',
 'RCMfO3',
 'RCMfO4',
 'RCMfO5',
 'RCMfO6',
 'RCMfO7',
 'RCMfO8',
 'RCMfO9',
 'RCMfO10',
 'RCMfO11',
 'RCMfO12',
 'RCMfO13',
 'RCMfO14',
 'LaCaS1',
 'LaCaS2',
 'LaCaS3',
 'LaCaS4',
 'LaCaS5',
 'LaCaS6',
 'LaCaS7',
 'LaCaS8',
 'LaCaS9',
 'L

# Import Anat Recon Info - check all elecs are present in data + recon sheet


In [16]:
# Read this subject's electrode localization sheet
anat_file = glob(f'{anat_dir}/{subj_id}_labels.csv')[0]
elec_locs = pd.read_csv(anat_file)
# The sheet sometimes carries extra columns with no entries (read in as 'Unnamed...'): drop them
unnamed_cols = list(elec_locs.filter(regex='Unnamed'))
elec_locs = elec_locs.drop(columns=unnamed_cols)
# Likewise drop rows that are completely empty
elec_locs = elec_locs.dropna(axis=0, how='all')
elec_locs

Unnamed: 0,label,BN246label,x,y,z,mni_x,mni_y,mni_z,gm,NMM,Anat,AnatMacro,BN246,YBA_1,Manual Examination,Notes
0,LaCaS1,A13_L,-6.549650,41.776780,-7.03872,-6.149690,34.97941,-12.45010,Gray,Left ACgG anterior cingulate gyrus,Area s32,L Mid Orbital Gyrus,L OrG,Left frontal pole 1 C,,
1,LaCaS10,A9l_L,-9.746730,49.772330,37.28761,-11.158100,45.87980,38.08775,Gray,Left Cerebral White Matter,Unknown,L Superior Frontal Gyrus,L SFG,Unknown,Left superior frontal gyrus 2 C,
2,LaCaS11,A9l_L,-9.746730,50.971670,42.07965,-11.033800,47.45796,43.55471,Gray,Left SFG superior frontal gyrus,Unknown,L Superior Frontal Gyrus,L SFG,Left superior frontal gyrus 2 C,WM,
3,LaCaS12,A9l_L,-10.146400,50.971670,46.87168,-11.521400,47.74693,49.07517,Gray,Left SFG superior frontal gyrus,Unknown,L Superior Frontal Gyrus,L SFG,Left superior frontal gyrus 3 C,,
4,LaCaS2,A32sg_L,-6.949280,42.576330,-2.24668,-6.826220,36.24643,-7.12713,Gray,Left ACgG anterior cingulate gyrus,Area s32,L ACC,L CG,Left cingulate gyrus D,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
194,RpCiP8,A39rd_R,32.614580,-40.977200,38.48562,34.704870,-57.38460,39.36843,White,Right Cerebral White Matter,Unknown,R Angular Gyrus,R IPL,Right supramarginal gyrus 4 A,WM,
195,RpCiP9,A39rd_R,37.010560,-42.976100,40.48230,39.443400,-59.42050,41.61339,Gray,Right AnG angular gyrus,Unknown,R Angular Gyrus,R IPL,Right supramarginal gyrus 5 B,,
196,uLmOlF,A11l_L,-14.142700,46.174330,-15.82410,-14.133400,39.07736,-22.87590,White,Left MOrG medial orbital gyrus,Area Fo1,L Superior Orbital Gyrus,L OrG,Left frontal orbital 3 B,WM,
197,uRHplT,rHipp_R,10.634660,4.597446,-17.82080,13.547580,-8.16845,-23.89940,Gray,Right PHG parahippocampal gyrus,Subiculum,Unknown,R Hipp,Right parahippocampal gyrus C,,


In [17]:
# Display the electrode labels from the localization sheet for comparison with the EDF channel names
list(elec_locs.label)

['LaCaS1',
 'LaCaS10',
 'LaCaS11',
 'LaCaS12',
 'LaCaS2',
 'LaCaS3',
 'LaCaS4',
 'LaCaS5',
 'LaCaS6',
 'LaCaS7',
 'LaCaS8',
 'LaCaS9',
 'LAglT1',
 'LAglT10',
 'LAglT2',
 'LAglT3',
 'LAglT4',
 'LAglT5',
 'LAglT6',
 'LAglT7',
 'LAglT8',
 'LAglT9',
 'LaImM1',
 'LaImM10',
 'LaImM11',
 'LaImM12',
 'LaImM13',
 'LaImM14',
 'LaImM2',
 'LaImM3',
 'LaImM4',
 'LaImM5',
 'LaImM6',
 'LaImM7',
 'LaImM8',
 'LalmM9',
 'LCmfO1',
 'LCmfO10',
 'LCmfO11',
 'LCmfO12',
 'LCmfO13',
 'LCmfO14',
 'LCmfO2',
 'LCmfO3',
 'LCmfO4',
 'LCmfO5',
 'LCmfO6',
 'LCmfO7',
 'LCmfO8',
 'LCmfO9',
 'LHplT1',
 'LHplT10',
 'LHplT2',
 'LHplT3',
 'LHplT4',
 'LHplT5',
 'LHplT6',
 'LHplT7',
 'LHplT8',
 'LHplT9',
 'LmCmS1',
 'LmCmS10',
 'LmCmS2',
 'LmCmS3',
 'LmCmS4',
 'LmCmS5',
 'LmCmS6',
 'LmCmS7',
 'LmCmS8',
 'LmCmS9',
 'LmOlF1',
 'LmOlF2',
 'LmOlF3',
 'LmOlF4',
 'LmOlF5',
 'LmOlF6',
 'LmOlF7',
 'LmOlF8',
 'LmOlF9',
 'LmTpT1',
 'LmTpT2',
 'LmTpT3',
 'LmTpT4',
 'LmTpT5',
 'LmTpT6',
 'LmTpT7',
 'LmTpT8',
 'LpCiP1',
 'LpCiP10',
 'Lp

### Fix edf channel names

In [19]:
# Match each EDF channel name to a label in the localization sheet using the LFPAnalysis helper.
# As the output below shows, the helper asks for a manual choice when several candidates tie.
# The three return values are inspected in the following cells.
new_mne_names, unmatched_names, unmatched_seeg = lfp_preprocess_utils.match_elec_names(mne_data.ch_names, elec_locs.label)


Number of electrodes in the mne file is greater than the number of electrodes in the localization file
       name  lev_score
274  lmoif5   0.833333
275  rmolf5   0.833333
We have too many possible matches for lmolf5! Select one manually from these candidates:lmoif5
       name  lev_score
274  lmoif1   0.833333
275  rmolf1   0.833333
We have too many possible matches for lmolf1! Select one manually from these candidates:lmoif1
       name  lev_score
274  lmoif3   0.833333
275  rmolf3   0.833333
We have too many possible matches for lmolf3! Select one manually from these candidates:lmoif3
       name  lev_score
274  lmoif9   0.833333
275  rmolf9   0.833333
We have too many possible matches for lmolf9! Select one manually from these candidates:lmoif9
       name  lev_score
274  lmoif4   0.833333
275  rmolf4   0.833333
We have too many possible matches for lmolf4! Select one manually from these candidates:lmoif4
       name  lev_score
274  lmoif6   0.833333
275  rmolf6   0.833333
We have 

In [20]:
# Second value returned by match_elec_names: names it could not match
# NOTE(review): these look like localization labels with no EDF counterpart -- confirm against the helper
unmatched_names

['urhplt', 'urmolf', 'ulmolf']

In [21]:
# Third value returned by match_elec_names; an empty list here
# NOTE(review): presumably sEEG channels in the recording left without a match -- confirm against the helper
unmatched_seeg

[]

In [22]:
# Pair every original EDF channel name with the name returned for it by match_elec_names
new_name_dict = dict(zip(mne_data.ch_names, new_mne_names))
new_name_dict

{'LmOIF1': 'lmolf1',
 'LmOIF2': 'lmolf2',
 'LmOIF3': 'lmolf3',
 'LmOIF4': 'lmolf4',
 'LmOIF5': 'lmolf5',
 'LmOIF6': 'lmolf6',
 'LmOIF7': 'lmolf7',
 'LmOIF8': 'lmolf8',
 'LmOIF9': 'lmolf9',
 'RmOlF1': 'rmolf1',
 'RmOlF2': 'rmolf2',
 'RmOlF3': 'rmolf3',
 'RmOlF4': 'rmolf4',
 'RmOlF5': 'rmolf5',
 'RmOlF6': 'rmolf6',
 'RmOlF7': 'rmolf7',
 'RmOlF8': 'rmolf8',
 'RmOlF9': 'rmolf9',
 'L almM1': 'laimm1',
 'L almM2': 'laimm2',
 'L almM3': 'laimm3',
 'L almM4': 'laimm4',
 'L almM5': 'laimm5',
 'L almM6': 'laimm6',
 'L almM7': 'laimm7',
 'L almM8': 'laimm8',
 'L almM9': 'lalmm9',
 'L almM10': 'laimm10',
 'L almM11': 'laimm11',
 'L almM12': 'laimm12',
 'L almM13': 'laimm13',
 'L almM14': 'laimm14',
 'R almM1': 'raimm1',
 'R almM2': 'raimm2',
 'R almM3': 'raimm3',
 'R almM4': 'raimm4',
 'R almM5': 'raimm5',
 'R almM6': 'raimm6',
 'R almM7': 'raimm7',
 'R almM8': 'raimm8',
 'R almM9': 'raimm9',
 'R almM10': 'raimm10',
 'R almM11': 'raimm11',
 'R almM12': 'raimm12',
 'L CMfO1': 'lcmfo1',
 'L CMfO2': 

In [23]:
# Rename the mne data according to the localization data
# (mne_data itself is updated: later cells see the new lower-case names)
mne_data.rename_channels(new_name_dict)

0,1
Measurement date,"January 01, 2001 13:33:36 GMT"
Experimenter,Unknown
Digitized points,Not available
Good channels,276 EEG
Bad channels,
EOG channels,Not available
ECG channels,Not available
Sampling frequency,1024.00 Hz
Highpass,0.00 Hz
Lowpass,512.00 Hz


In [25]:
# Compare the lower-cased localization labels with the renamed recording channels
anat_names = list(elec_locs.label.str.lower())

# Localized contacts that are absent from the recording. This count used to be a bare
# `sum(...)` in the middle of the cell, whose value is never displayed; report it explicitly.
missing_from_recording = [ch for ch in anat_names if ch not in mne_data.ch_names]
print(f'{len(missing_from_recording)} localized channels missing from the recording: {missing_from_recording}')

# Extra channels present only in the recording -- make sure none of them are neural channels
print([ch for ch in mne_data.ch_names if ch not in anat_names])


['c59', 'c60', 'c61', 'c62', 'c63', 'c64', 'c125', 'c126', 'c127', 'c128', 'c191', 'c192', 'fp1', 'f7', 't3', 't5', 'o1', 'f3', 'c3', 'p3', 'fp2', 'f8', 't4', 't6', 'o2', 'f4', 'c4', 'p4', 'fz', 'cz', 'pz', 'ekg1', 'ekg2', 'c230', 'c231', 'c232', 'c233', 'c234', 'c235', 'c236', 'c237', 'c238', 'c239', 'c240', 'c241', 'c242', 'c243', 'c244', 'c245', 'c246', 'c247', 'c248', 'c249', 'c250', 'c251', 'c252', 'c253', 'c254', 'c255', 'c256', 'dc1', 'dc2', 'dc3', 'dc4', 'dc5', 'dc6', 'dc7', 'dc8', 'dc9', 'dc10', 'dc11', 'dc12', 'dc13', 'dc14', 'dc15', 'dc16', 'trig', 'osat', 'pr', 'pleth']


In [26]:
# The recording also contains surface EEG, which must be kept apart from the sEEG.
# sEEG contacts are picked by the first letter of their (lower-cased) name: 'l' or 'r'.
left_seeg_names = [name for name in mne_data.ch_names if name[:1] == 'l']
right_seeg_names = [name for name in mne_data.ch_names if name[:1] == 'r']
n_left, n_right = len(left_seeg_names), len(right_seeg_names)
print('We have a total of', n_left, 'left &', n_right, 'right sEEG electrodes')
print(f'We have a total of {n_left + n_right} sEEG electrodes')


We have a total of 99 left & 97 right sEEG electrodes
We have a total of 196 sEEG electrodes


In [27]:
# Every channel that was not selected as sEEG above is removed from the recording
keep_chans = set(left_seeg_names + right_seeg_names)
drop_chans = [ch for ch in mne_data.ch_names if ch not in keep_chans]
mne_data.drop_channels(drop_chans) #number of chans should = number of seegs

0,1
Measurement date,"January 01, 2001 13:33:36 GMT"
Experimenter,Unknown
Digitized points,Not available
Good channels,196 EEG
Bad channels,
EOG channels,Not available
ECG channels,Not available
Sampling frequency,1024.00 Hz
Highpass,0.00 Hz
Lowpass,512.00 Hz


In [28]:
# Set channel types: every selected left/right contact is declared as 'seeg'
seeg_chans = left_seeg_names + right_seeg_names
sEEG_mapping_dict = dict.fromkeys(seeg_chans, 'seeg')
mne_data.set_channel_types(sEEG_mapping_dict)


0,1
Measurement date,"January 01, 2001 13:33:36 GMT"
Experimenter,Unknown
Digitized points,Not available
Good channels,196 sEEG
Bad channels,
EOG channels,Not available
ECG channels,Not available
Sampling frequency,1024.00 Hz
Highpass,0.00 Hz
Lowpass,512.00 Hz


In [29]:
# Build the montage from the MNI coordinates in the localization sheet (convert mm to m)
mni_coords_m = elec_locs[['mni_x', 'mni_y', 'mni_z']].to_numpy(dtype=float) / 1000
ch_pos = dict(zip(elec_locs.label, mni_coords_m))
montage = mne.channels.make_dig_montage(ch_pos=ch_pos, coord_frame='mni_tal')

# Sheet labels are mixed-case while the channel names are lower-case, hence match_case=False
mne_data.set_montage(montage, match_case=False, on_missing='warn')

0,1
Measurement date,"January 01, 2001 13:33:36 GMT"
Experimenter,Unknown
Digitized points,196 points
Good channels,196 sEEG
Bad channels,
EOG channels,Not available
ECG channels,Not available
Sampling frequency,1024.00 Hz
Highpass,0.00 Hz
Lowpass,512.00 Hz


### Notch filter line noise

In [30]:
# Record the line-noise frequency in the recording's info
line_freq = 60
mne_data.info['line_freq'] = line_freq

# Notch out 60 Hz noise and its harmonics
notch_freqs = tuple(line_freq * harmonic for harmonic in range(1, 5))  # (60, 120, 180, 240)
mne_data.notch_filter(freqs=notch_freqs)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 196 out of 196 | elapsed:    3.2s finished


0,1
Measurement date,"January 01, 2001 13:33:36 GMT"
Experimenter,Unknown
Digitized points,196 points
Good channels,196 sEEG
Bad channels,
EOG channels,Not available
ECG channels,Not available
Sampling frequency,1024.00 Hz
Highpass,0.00 Hz
Lowpass,512.00 Hz


### Resampling data 

In [31]:
# Resample the recording to resample_sr (500 Hz); the summary above shows it was acquired at 1024 Hz.
# NOTE(review): the previous comment here referred to 512 Hz -- confirm 500 Hz is the intended target.
resample_sr = 500
mne_data.resample(sfreq=resample_sr, npad='auto', n_jobs=-1)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    1.7s
[Parallel(n_jobs=-1)]: Done  80 tasks      | elapsed:    3.1s
[Parallel(n_jobs=-1)]: Done 196 out of 196 | elapsed:    4.6s finished


0,1
Measurement date,"January 01, 2001 13:33:36 GMT"
Experimenter,Unknown
Digitized points,196 points
Good channels,196 sEEG
Bad channels,
EOG channels,Not available
ECG channels,Not available
Sampling frequency,500.00 Hz
Highpass,0.00 Hz
Lowpass,250.00 Hz


### Bad Channel Removal (manual)

Let's pick out any bad channels missed by automatic screening (visual inspection as a reference), or restore channels that were erroneously deemed bad. You have to press the "power" button twice (once for the plot and once for the panel beneath it) when you're done so that your manual changes are saved.

In [5]:
# Checkpoint: resume from the notch-filtered, resampled recording written by the
# "Save raw LFP data" cell, so the steps above need not be repeated in a new session.
# On a first pass that file does not exist yet; keep the in-memory `mne_data` in that
# case instead of failing on a missing file.
raw_fif_path = f'{save_dir}/{subj_id}_raw_ieeg.fif'
if os.path.exists(raw_fif_path):
    mne_data = mne.io.read_raw_fif(raw_fif_path, preload=True)
else:
    print(f'{raw_fif_path} not found; continuing with the in-memory recording')

In [6]:
%matplotlib notebook

fig = mne_data.plot(start=0, duration=120, n_channels=50, scalings=mne_data._data.max()/20)
fig.fake_keypress('a') #lots of bad times 

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [7]:
# Channels currently flagged as bad on the recording
mne_data.info['bads'] #sanity check that bads info saved


['lmolf4', 'lmolf9', 'lhplt3']

## Save raw LFP data
Notch filtered and resampled with bad elecs indicated

In [8]:
# Write the notch-filtered, resampled recording (bad channels flagged) to the subject's
# clean_data folder, replacing any earlier copy of the file
mne_data.save(f'{save_dir}/{subj_id}_raw_ieeg.fif',overwrite=True)

# Rereference data 

In [8]:
# Re-reference neural data
# Path to the subject's localization sheet, passed to the helper as elec_path
anat_file = glob(f'{anat_dir}/{subj_id}_labels.csv')[0]

# Bipolar re-referencing through the LFPAnalysis helper; the result is a new object,
# `mne_data_bp_reref`, which the following cells plot, annotate and save
mne_data_bp_reref = lfp_preprocess_utils.ref_mne(mne_data=mne_data, 
                                              elec_path=anat_file, 
                                              method='bipolar', 
                                              site='MSSM')

Number of electrodes in the mne file is less than the number of electrodes in the localization file


To start annotating, press 'Add new label' in the bottom panel. Then left click and drag around window of interest. 

In [9]:
%matplotlib notebook

# use the epoch code to select only the WM referenced pairs
fig = mne_data_bp_reref.plot(start=2, duration=100, n_channels=50,scalings=mne_data._data.max()/20 ) # plot all channels at once
fig.fake_keypress("a")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [11]:
# Plot the power spectral density of the re-referenced data
mne_data_bp_reref.compute_psd().plot()

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [16]:
# Onsets of the annotations currently stored on the re-referenced data
# (sanity check that the manually marked spans were kept)
mne_data_bp_reref.annotations.onset

array([ 38.27626059, 120.60144966, 145.66031655, 232.98009872,
       301.7108754 , 310.73829328, 389.98332519, 408.81638664,
       426.87122241, 438.85589788, 498.67425745, 547.11841739,
       559.88080803, 618.24539939])

In [17]:
def join_good_segs(mne_data):
    """Remove every annotated time span from a recording and concatenate what is left.

    All annotations on `mne_data` are treated as bad spans, except the
    'BAD boundary' / 'EDGE boundary' markers that `mne.concatenate_raws` itself
    inserts at joins, so data that was already joined is not cut again at its seams.

    Parameters
    ----------
    mne_data : mne.io.Raw
        Recording whose annotations mark the spans to discard. It is not modified;
        each kept segment is cropped from a copy.

    Returns
    -------
    mne.io.Raw
        The kept segments, in chronological order, joined by `mne.concatenate_raws`.

    Raises
    ------
    ValueError
        If the annotations cover the whole recording, leaving nothing to keep.

    Notes
    -----
    Derived from:
    https://mne.discourse.group/t/removing-time-segments-from-raw-object-without-epoching/4169/2
    https://github.com/mne-tools/mne-python/blob/maint/1.5/mne/io/base.py#L681-L742
    """
    times = mne_data.times
    last_idx = len(times) - 1  # zero-based index of the final sample

    # Annotation onsets stored on a Raw include `first_time`, whereas `times` and
    # `crop` are zero-based, so shift the onsets before converting them to indices.
    time_offset = mne_data.first_time
    boundary_labels = ('bad boundary', 'edge boundary')

    ### bad spans as inclusive sample-index pairs, clipped to the recording
    bad_spans = []
    for annot in mne_data.annotations:
        if str(annot['description']).lower() in boundary_labels:
            continue
        onset = annot['onset'] - time_offset
        first_bad = int(mne_data.time_as_index(onset)[0])
        last_bad = int(mne_data.time_as_index(onset + annot['duration'])[0])
        first_bad = max(first_bad, 0)
        last_bad = min(last_bad, last_idx)
        if first_bad <= last_bad:  # otherwise the span lies outside the data
            bad_spans.append((first_bad, last_bad))

    ### good spans = complement of the bad spans
    # Walking the sorted spans with a running cursor merges overlapping or adjacent
    # annotations, and never indexes before sample 0 or past the last sample.
    good_spans = []
    next_good = 0
    for first_bad, last_bad in sorted(bad_spans):
        if first_bad > next_good:
            good_spans.append((next_good, first_bad - 1))
        next_good = max(next_good, last_bad + 1)
    if next_good <= last_idx:
        good_spans.append((next_good, last_idx))

    if not good_spans:
        raise ValueError('No data left to keep: the annotations cover the whole recording')

    ### crop each good span from a copy (crop works in place) and concatenate
    good_segs = [
        mne_data.copy().crop(tmin=float(times[start]), tmax=float(times[stop]),
                             include_tmax=True)
        for start, stop in good_spans
    ]

    return mne.concatenate_raws(good_segs)
    
    

In [18]:
# Replace the re-referenced recording with the version that has the annotated spans cut out,
# then display its summary. The annotated input is overwritten here, so the cells above
# must be re-run to get it back.
mne_data_bp_reref = join_good_segs(mne_data_bp_reref)
mne_data_bp_reref

0,1
Measurement date,"January 01, 2001 13:33:36 GMT"
Experimenter,Unknown
Digitized points,196 points
Good channels,117 sEEG
Bad channels,
EOG channels,Not available
ECG channels,Not available
Sampling frequency,500.00 Hz
Highpass,0.00 Hz
Lowpass,250.00 Hz


### Save reref data

In [10]:
# Write the bipolar re-referenced data to the subject's clean_data folder, replacing any earlier copy
mne_data_bp_reref.save(f'{save_dir}/{subj_id}_bp_ref_ieeg.fif',overwrite=True) #resaved 03/18/2024 with no clipping