# SWOP script 1
## Single-Subject Preprocessing

Analysis script associated with the manuscript: ***Native word order processing is not uniform: An ERP-study of verb-second word order***, by Susan Sayehli, Marianne Gullberg, Aaron Newman, and Annika Andersson. (2022). *Frontiers in Psychology - Language Sciences*. DOI:[10.3389/fpsyg.2022.668276](https://www.frontiersin.org/articles/10.3389/fpsyg.2022.668276).

This script reads in raw, continuous EEG data from each subject and applies the following preprocessing steps:
1. Bandpass filter raw continuous data using a highpass cutoff of 1 Hz and a lowpass cutoff of 30 Hz (this is used for ICA only)
1. Apply independent components analysis (fastICA) to the filtered data, generating as many components as required to explain 99% of the variance in the data
1. Identify independent components associated with ocular artifacts using MNE's automated correlation approach, with a threshold of *z* > 3
1. Bandpass filter the raw, continuous data with a finite impulse response zero-phase hamming-windowed filter, using a highpass cutoff of 0.1 Hz and a lowpass cutoff of 30 Hz
1. Apply ICA correction to the 0.1-30 Hz filtered data to remove ocular artifacts
1. Segment the continuous data into epochs time-locked to the onset of critical words, including a 1 s pre-stimulus baseline period and a 1 s post-stimulus period. Epochs are mean-centered, i.e., the mean amplitude over the entire 2 s window is subtracted from each epoch, separately at each channel
1. Import log files from each participant's EEG session and add as metadata to the segmented ERP data
1. Export preprocessed epochs to .fif format

---
Copyright 2016-21  [Aaron J Newman](https://github.com/aaronjnewman), [NeuroCognitive Imaging Lab](http://ncil.science), [Dalhousie University](https://dal.ca)

Released under the [The 3-Clause BSD License](https://opensource.org/licenses/BSD-3-Clause)

---

In [None]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import os

import mne
mne.set_log_level('error')

In [None]:
## Set Parameters

# Directory the per-subject .cnt recordings and the log CSV are read from,
# and the epoched .fif files are written to
data_path = '../data/'

# Participant IDs; each one names a '<id>.cnt' file inside data_path
subjects = [
    's_04nm', 's_07ba', 's_09lo', 's_12wg', 's_13ff',
    's_14mc', 's_15rj', 's_17oh', 's_18ak', 's_19am',
    's_21ma', 's_23nj', 's_24zk', 's_25ks', 's_26nm',
    's_27lm', 's_28js', 's_29ld', 's_30la', 's_31bf',
]

# Name of the standard montage used to look up channel locations
montage_fname = 'standard_1005'

# Map condition labels ('<word order>/<adverb>/<subject type>') to event codes.
# Every code is a three-digit number:
#   hundreds -> word order    (V2 = 2, V3 = 1)
#   tens     -> adverb        (kanske = 2, hemma = 3, idag = 4)
#   ones     -> subject type  (pronoun = 2, noun = 4)
# Pooling the two codes that differ only in the ones digit collapses a
# condition across subject type (e.g. 'V2/kanske' -> [222, 224]).
event_id = {
    f'{order}/{adverb}/{subj}': 100 * hundreds + 10 * tens + ones
    for adverb, tens in (('kanske', 2), ('hemma', 3), ('idag', 4))
    for subj, ones in (('pronoun', 2), ('noun', 4))
    for order, hundreds in (('V2', 2), ('V3', 1))
}

# Epoching parameters
tmin, tmax = -1.0, 1.0   # epoch start / end relative to word onset (in sec)
baseline = None          # no baseline interval is subtracted
reject = None            # no peak-to-peak amplitude rejection
flat = {'eeg': 5e-6}     # flat-signal criterion for EEG channels
detrend = 0              # 0 = subtract each epoch's mean (DC offset) per channel
threshold = 75e-6        # amplitude threshold
# NOTE(review): `flat` and `threshold` are defined here but are not passed to
# any call in the preprocessing loop visible in this script -- confirm
# whether they are still needed.

# Filter cutoffs (in Hz)
l_freq = 0.1       # highpass for the data that are epoched and saved
l_freq_ica = 1.0   # highpass for the copy used only to fit ICA
h_freq = 30.0      # lowpass shared by both filters

# ICA settings
ica_random_state = 42   # fixed seed so ICA is reproducible each time it's run
n_components = 0.99     # a fraction selects components by explained variance

## Loop over subjects and do preprocessing

In [None]:
# The behavioural log covers every participant and does not change between
# subjects, so read it once instead of on every loop iteration.
# (Used later as epoch metadata for mixed effects analyses.)
df = pd.read_csv(os.path.join(data_path, 'SWOP_log_files.csv'), encoding='utf8')
# Rows of the log whose condition code is one of the critical-word event codes
critical_rows = df['Condition'].isin(list(event_id.values()))

# Channel names that must be re-capitalised to match the 10-20 montage labels
ch_rename = {'FZ':'Fz', 'CZ':'Cz', 'PZ':'Pz',
             'FP1':'Fp1', 'FP2':'Fp2',
            }

# Number of parallel jobs used by both filtering passes
n_filter_jobs = 12

for subject in subjects:
    print(subject + '...')

    ## Read raw data
    raw = mne.io.read_raw_cnt(os.path.join(data_path, subject + '.cnt'), eog='auto')
    mne.rename_channels(raw.info, ch_rename)
    raw.set_montage(montage_fname)

    # Filtering needs the samples in memory; load once, then filter copies so
    # that `raw` itself stays unfiltered.
    raw.load_data()
    picks = mne.pick_types(raw.info, eeg=True, eog=True)

    ## Filter (0.1-30 Hz): these are the data that get epoched and saved
    raw_filt = raw.copy().filter(l_freq, h_freq, picks=picks, n_jobs=n_filter_jobs)

    ## Artifact Detection & Correction with ICA

    ### Filter data for ICA
    # ICA performs better when low-frequency drift is removed with a 1 Hz highpass filter.
    # After identifying artifacts, we will apply corrections to the 0.1 Hz filtered data.
    raw_ica = raw.copy().filter(l_freq_ica, h_freq, picks=picks, n_jobs=n_filter_jobs)

    ### Find ICA decomposition of data
    ica = mne.preprocessing.ICA(n_components=n_components,
                                random_state=ica_random_state,
                                max_iter='auto')
    ica.fit(raw_ica)

    ### Identify ICA components associated with ocular artifacts
    # find which ICs match the EOG pattern and mark them for removal
    eog_indices, eog_scores = ica.find_bads_eog(raw_ica, threshold=3.)
    ica.exclude = eog_indices

    ### Apply ICA corrections to data
    # ica.apply works in place, so hand it a copy to keep raw_filt intact
    raw_postica = ica.apply(raw_filt.copy())

    ## Epoching
    ### Extract event codes to events array
    # Annotation descriptions are the numeric trigger codes stored as text;
    # map each description to its integer value.
    all_codes = raw.annotations.to_data_frame()['description'].unique()
    event_map = dict(zip(all_codes, all_codes.astype('int')))
    events, event_id_new = mne.events_from_annotations(raw, event_id=event_map)

    ### Segment data into Epochs
    # on_missing='ignore': do not fail if a subject lacks one of the codes
    epochs = mne.Epochs(raw_postica,
                        events, event_id=event_id,
                        tmin=tmin, tmax=tmax,
                        baseline=baseline, detrend=detrend,
                        on_missing='ignore',
                        preload=True
                        )

    # One subject had 10 extra trials at the start we need to remove
    # (false start?)
    if subject == 's_07ba':
        epochs = epochs[10:]

    ### Select this subject's log rows and add as metadata
    subject_log = df[(df['Subject'] == subject) & critical_rows]
    if subject == 's_12wg':
        # s_12 is missing the first epoch in the EEG data
        subject_log = subject_log.iloc[1:]
    elif subject == 's_28js':
        # subj 28 is missing one event randomly relative to metadata.
        # Hard-coded (row label in the log CSV) based on looking at the data
        subject_log = subject_log.drop(index=8061)
    epochs.metadata = subject_log
    epochs.drop_bad()

    ## Export preprocessed epochs to file
    # output file names - set to follow MNE conventions
    epochs_fname = os.path.join(data_path, subject + '-epo.fif')
    epochs.save(epochs_fname, overwrite=True)