## Combine columns of Attention Shift BIDS and EEG events for processing

This script starts with the data that was uploaded to OpenNeuro as ds002893.
A copy of the EEG.event structure is dumped to the dataset as `_eventstemp.tsv` files.
The `attention_shift_01_initial_summary.ipynb` has already been run and
indicates that the corresponding versions of the event files have
the same number of events.

This notebook creates a `_eventstemp1.tsv` file for each pair of event files as follows:
1. Check that the respective event files have the expected columns.
2. Combine the dataframes representing the two versions of the file along the columns.
3. Make sure that the `sample` column has the same values as the corresponding values in the
`sample_offset` and `latency` columns.
4. Remove the columns `trial_type`, `value`, `response_time`, `stim_file`, `HED`,
`sample_offset`, `latency`, `urevent`, `type`, and `usertags` from the combined data frame.
5. Save as `_eventstemp1.tsv`.

In [1]:
import os
import datetime
import pandas as pd
from hed.tools import BidsTabularDictionary, get_file_list, get_new_dataframe, HedLogger

# Dataset-specific settings for the Attention Shift dataset.
bids_root_path = '/XXX/AttentionShiftWorking'
exclude_dirs = ['sourcedata', 'stimuli', 'code']
entities = ('sub', 'run')

# Column names for the BIDS `_events.tsv` files: the full expected set and
# the subset that is discarded from the combined dataframe.
bids_cols_expected = [
    'onset', 'duration', 'sample', 'trial_type',
    'response_time', 'stim_file', 'value', 'HED',
]
bids_cols_remove = [
    'trial_type', 'value', 'response_time', 'stim_file', 'HED',
]

# Column names for the `_eventstemp.tsv` files (dump of EEG.event): the full
# expected set and the subset that is discarded from the combined dataframe.
eeg_cols_expected = [
    'sample_offset', 'event_code', 'cond_code', 'type',
    'latency', 'urevent', 'usertags',
]
eeg_cols_remove = [
    'sample_offset', 'latency', 'urevent', 'usertags', 'type',
]

# Columns (and their order) of the `_eventstemp1.tsv` output files.
final_cols = ['onset', 'duration', 'sample', 'event_code', 'cond_code']

# The log is kept in a HedLogger and later written to a path relative to the dataset root.
log_name = 'attention_shift_02_initial_combination_log'
log_file_name = "code/curation_logs/" + log_name + ".txt"
logger = HedLogger(name=log_name)

# Locate both versions of the event files and index each set by the (sub, run) entities.
bids_files = get_file_list(
    bids_root_path,
    name_suffix='_events',
    extensions=[".tsv"],
    exclude_dirs=exclude_dirs,
)
bids_dict = BidsTabularDictionary("Bids event files", bids_files, entities=entities)

eeg_files = get_file_list(
    bids_root_path,
    name_suffix="_eventstemp",
    extensions=[".tsv"],
    exclude_dirs=exclude_dirs,
)
eeg_dict = BidsTabularDictionary("EEG event files", eeg_files, entities=entities)

# Perform the operations to combine the two versions of the event files.
# For every BIDS events file: load it and its EEG counterpart, verify that they
# line up row for row, concatenate them column-wise, verify the sample columns
# agree, reduce to final_cols, and save the result as `_eventstemp1.tsv`.
# Any file pair that fails a check is logged at ERROR level and skipped.
# NOTE(review): bids_cols_expected and eeg_cols_expected are not consulted here;
# a missing column surfaces as a KeyError below -- confirm whether an explicit
# expected-column check should be added.
for key, file, rowcount, column_count in bids_dict.iter_tsv_info():
    df_bids = get_new_dataframe(file.file_path)
    logger.add(key, f"Created a dataframe for {os.path.basename(file.file_path)}")

    df_eeg = get_new_dataframe(eeg_dict.get_file_path(key))
    logger.add(key, f"Created a dataframe for {os.path.basename(eeg_dict.get_file_path(key))}")

    # Combine the two versions of the events file after verifying they have same number of rows
    if key == 'sub-007_run-01':  # Extra EEG events at beginning
        drop_indices = [0, 1, 2, 3]
        df_eeg.drop(index=drop_indices, inplace=True)
        # drop=True discards the old index instead of inserting it as a
        # spurious 'index' column, so the rows align with df_bids from 0.
        df_eeg.reset_index(drop=True, inplace=True)
        logger.add(key, f"Dropping rows {str(drop_indices)} from EEG events", also_print=True)
        # The files are only expected to match once the extra rows are gone.
        if len(df_bids) != len(df_eeg):
            logger.add(key, f"The BIDS file has {len(df_bids)} rows and the EEG file has "
                            f"{len(df_eeg)} rows after dropping rows {str(drop_indices)}",
                       level="ERROR", also_print=True)
            continue
    elif rowcount != eeg_dict.rowcount_dict[key]:
        logger.add(key, f"The BIDS file has {rowcount} rows and the EEG file has "
                        f"{eeg_dict.rowcount_dict[key]} rows", level="ERROR", also_print=True)
        continue
    df_out = pd.concat([df_bids, df_eeg], axis=1)
    logger.add(key, f"Concatenated the BIDS and EEG event files for processing")

    # Make sure that the sample, sample_offset, and latency columns are equal
    # (values are compared as strings).
    offset_diff = sum(df_out['sample'].map(str) != df_out['sample_offset'].map(str))
    latency_diff = sum(df_out['sample'].map(str) != df_out['latency'].map(str))
    if offset_diff + latency_diff:
        logger.add(key, f"Sample col has {offset_diff} differences with sample_offset and "
                        f"{latency_diff} differences with latency", level="ERROR", also_print=True)
        continue
    logger.add(key, f"Verified sample column, sample_offset, and latency columns have same values")

    # Drop the extra columns
    drop_columns = bids_cols_remove + eeg_cols_remove
    df_out.drop(columns=drop_columns, inplace=True)
    logger.add(key, f"Dropped {str(drop_columns)} drop_columns")

    # Make sure the dataframe has the correct final columns.
    # Lists (not sets) keep the log deterministic and avoid set-based indexing,
    # which recent pandas versions reject.
    extra_cols = [col for col in df_out.columns if col not in final_cols]
    if extra_cols:
        df_out.drop(columns=extra_cols, inplace=True)
        logger.add(key, f"Dropped extra columns {str(extra_cols)}")
    missing_cols = [col for col in final_cols if col not in df_out.columns]
    if missing_cols:
        for col in missing_cols:
            df_out[col] = 'n/a'
        logger.add(key, f"Added missing columns {str(missing_cols)}")

    # Do a final reordering for uniformity
    df_out = df_out[final_cols]
    logger.add(key, f"Reordered the final columns as {str(final_cols)}")

    # Replace the trailing ".tsv" so that "..._events.tsv" becomes "..._eventstemp1.tsv"
    filename = file.file_path[:-4] + "temp1.tsv"
    df_out.to_csv(filename, sep='\t', index=False)
    logger.add(key, f"Saved as {filename}")

# Output and save the log: print the full log followed by the ERROR-level
# entries, then write both to log_file_name under the dataset root.
log_string = "\n\nLog output:\n" + logger.get_log_string()
error_string = "\n\nERROR Summary:\n" + logger.get_log_string(level="ERROR")
print(log_string)
print(error_string)

save_path = os.path.join(bids_root_path, log_file_name)
# Create the log directory if needed so the log is not lost after the run.
os.makedirs(os.path.dirname(save_path), exist_ok=True)
with open(save_path, "w", encoding="utf-8") as fp:
    fp.write(f"{log_file_name} {datetime.datetime.now()}\n")
    fp.write(log_string)
    fp.write(error_string)

sub-007_run-01 []: Dropping rows [0, 1, 2, 3] from EEG events


Log output:
attention_shift_02_initial_combination_log: Level None
sub-001_run-01:
	[ Created a dataframe for sub-001_task-AuditoryVisualShift_run-01_events.tsv]
	[ Created a dataframe for sub-001_task-AuditoryVisualShift_run-01_eventstemp.tsv]
	[ Concatenated the BIDS and EEG event files for processing]
	[ Verified sample column, sample_offset, and latency columns have same values]
	[ Dropped ['trial_type', 'value', 'response_time', 'stim_file', 'HED', 'sample_offset', 'latency', 'urevent', 'usertags', 'type'] drop_columns]
	[ Reordered the final columns as ['onset', 'duration', 'sample', 'event_code', 'cond_code']]
	[ Saved as G:\AttentionShift\AttentionShiftWorking\sub-001\eeg\sub-001_task-AuditoryVisualShift_run-01_eventstemp1.tsv]
sub-002_run-01:
	[ Created a dataframe for sub-002_task-AuditoryVisualShift_run-01_events.tsv]
	[ Created a dataframe for sub-002_task-AuditoryVisualShift_run-01_eventstemp.tsv]
	[ Concatena