## Combine columns of Sternberg bids and EEG events for processing

This script starts with the Sternberg data.
A copy of the EEG.event structure is dumped to the dataset as `_events_temp.tsv` files.
The `sternberg_01_initial_summary.ipynb` has already been run and
indicates that the corresponding versions of the event files have
the same number of events.

This notebook creates a `_eventstemp1.tsv` for each:
1. Check that the respective event files have the same number of events.
2. Remove the `trial_type`, `response_time`, `stim_file` columns from the bids events file.
3. Save as `_eventstemp1.tsv`.

In [1]:
import os
import datetime
from hed.tools import BidsTabularDictionary, get_file_list, get_new_dataframe, HedLogger

# Set the specific variables for the Attention Shift dataset.
bids_root_path = '/XXX/SternbergWorking'
exclude_dirs = ['sourcedata', 'stimuli', 'code']
entities = ('sub', 'ses', 'run')
bids_cols_remove = ['trial_type', 'response_time', 'stim_file']
final_cols = ['onset', 'duration', 'sample', 'value']
log_name = 'sternberg_02_initial_combination_log'

# Set up the logger
log_file_name = f"code/curation_logs/{log_name}.txt"
logger = HedLogger(name=log_name)

#  Create the file lists and dictionaries
bids_files = get_file_list(bids_root_path, extensions=[".tsv"], exclude_dirs = ['sourcedata'], name_suffix='_events')
bids_dict = BidsTabularDictionary("Bids event files", bids_files, entities=entities)
eeg_files = get_file_list(bids_root_path, extensions=[".tsv"], exclude_dirs = ['sourcedata'],
                          name_suffix="_eventstemp")
eeg_dict = BidsTabularDictionary("EEG event files", eeg_files, entities=entities)

# Perform the operations to combine the two versions of the event files
for key, file, rowcount, column_count in bids_dict.iter_tsv_info():
    df_bids = get_new_dataframe(file.file_path)
    logger.add(key, f"Created a dataframe for {os.path.basename(file.file_path)}")

    df_eeg = get_new_dataframe(eeg_dict.get_file_path(key))
    logger.add(key, f"Created a dataframe for {os.path.basename(eeg_dict.get_file_path(key))}")

    # Combine the two versions of the events file after verifying they have same number of rows
    if rowcount != eeg_dict.rowcount_dict[key]:
        logger.add(key, f"The BIDs file has {rowcount} has {rowcount} row and the EEG file has" +
                   f"{eeg_dict.rowcount_dict[key]} rows", level="ERROR", also_print=True)
        continue
    logger.add(key, f"Bids event file and EEG.set event structure have the same number of events")

    # Drop the extra columns
    drop_columns = bids_cols_remove
    df_bids.drop(columns=drop_columns, inplace=True)
    logger.add(key, f"Dropped {str(drop_columns)} columns")

    # Make sure the dataframe has the correct final columns
    extra_cols = set(df_bids.columns).symmetric_difference(set(final_cols))
    if extra_cols:
        logger.add(key, f"Columns {str(extra_cols)} do not match expected.", level="ERROR", also_print=True)
        continue

    # Do a final reordering for uniformity
    df_bids = df_bids[final_cols]
    logger.add(key, f"Reordered the final columns as {str(final_cols)}")

    filename = file.file_path[:-4] + "temp1.tsv"
    df_bids.to_csv(filename, sep='\t', index=False)
    logger.add(key, f"Saved as {os.path.basename(filename)}")

In [2]:
key_dict = {
    'sub-001_ses-01_run-2': [0],
    'sub-001_ses-01_run-3': [0],
    'sub-001_ses-01_run-4': [0],
    'sub-002_ses-01_run-2': [1, 0],
    'sub-002_ses-01_run-3': [0],
    'sub-002_ses-01_run-4': [0],
    'sub-003_ses-01_run-2': [0],
    'sub-003_ses-01_run-3': [0],
    'sub-003_ses-01_run-4': [0],
    'sub-004_ses-01_run-2': [0],
    'sub-004_ses-01_run-3': [0],
    'sub-004_ses-01_run-4': [0],
    'sub-005_ses-01_run-2': [0],
    'sub-005_ses-01_run-3': [0],
    'sub-005_ses-01_run-4': [0],
    'sub-006_ses-01_run-2': [0],
    'sub-006_ses-01_run-3': [0],
    'sub-006_ses-01_run-4': [0],
    'sub-007_ses-01_run-2': [0],
    'sub-007_ses-01_run-3': [0],
    'sub-007_ses-01_run-4': [0],
    'sub-008_ses-01_run-2': [0],
    'sub-008_ses-01_run-3': [0],
    'sub-008_ses-01_run-4': [0],
    'sub-009_ses-01_run-2': [0],
    'sub-009_ses-01_run-3': [0],
    'sub-009_ses-01_run-4': [0],
    'sub-010_ses-01_run-2': [1, 0],
    'sub-010_ses-01_run-3': [0],
    'sub-010_ses-01_run-4': [0],
    'sub-011_ses-01_run-2': [1, 0],
    'sub-011_ses-01_run-3': [0],
    'sub-011_ses-01_run-4': [0],
    'sub-012_ses-01_run-2': [0],
    'sub-012_ses-01_run-3': [0],
    'sub-014_ses-01_run-2': [0],
    'sub-014_ses-01_run-3': [0],
    'sub-015_ses-01_run-2': [0],
    'sub-015_ses-01_run-3': [0],
    'sub-016_ses-01_run-2': [0],
    'sub-016_ses-01_run-3': [0],
    'sub-017_ses-01_run-2': [0],
    'sub-017_ses-01_run-3': [0],
    'sub-018_ses-01_run-2': [0],
    'sub-018_ses-01_run-3': [0],
    'sub-019_ses-01_run-2': [0],
    'sub-019_ses-01_run-3': [0],
    'sub-020_ses-01_run-2': [0],
    'sub-020_ses-01_run-3': [0],
    'sub-021_ses-01_run-2': [0],
    'sub-021_ses-01_run-3': [0],
    'sub-022_ses-01_run-1': [307],
    'sub-022_ses-01_run-2': [196, 0],
    'sub-022_ses-01_run-3': [1, 0],
    'sub-022_ses-01_run-4': [0],
    'sub-022_ses-01_run-5': [1, 0],
    'sub-022_ses-01_run-6': [1, 0],
    'sub-023_ses-01_run-2': [0],
    'sub-023_ses-01_run-3': [0],
    'sub-023_ses-01_run-4': [1, 0],
    'sub-023_ses-01_run-5': [1, 0],
    'sub-024_ses-01_run-2': [0],
    'sub-024_ses-01_run-3': [0]
}

# Fix boundary and empty events for several runs
for key, values in key_dict.items():
    file = bids_dict.get_file_path(key)
    filename = file[:-4] + "temp1.tsv"
    df = get_new_dataframe(filename)
    logger.add(key, f"Number of rows {len(df.index)}", also_print=True)
    for val in values:
        logger.add(key, f"Dropping row {str(val)}:\n{df.loc[[val]]}", also_print=True)
        df.drop(axis=0, index=val, inplace=True)
        df.reset_index(inplace=True)
    logger.add(key, f"Saving {os.path.basename(filename)}")
    df.to_csv(filename, sep='\t', index=False)

# Print the log
log_string = "\n\nLog output:\n" + logger.get_log_string()
error_string = "\n\nERROR Summary:\n" + logger.get_log_string(level="ERROR")
print(log_string)
print(error_string)
save_path = os.path.join(bids_root_path, log_file_name)
with open(save_path, "w") as fp:
    fp.write(f"{log_file_name} {datetime.datetime.now()}\n")
    fp.write(log_string)
    fp.write(error_string)

sub-001_ses-01_run-2 []: Number of rows 351
sub-001_ses-01_run-2 []: Dropping row 0:
   onset duration  sample     value
0 -0.002      n/a    -0.5  boundary
sub-001_ses-01_run-3 []: Number of rows 351
sub-001_ses-01_run-3 []: Dropping row 0:
   onset duration  sample     value
0 -0.002      n/a    -0.5  boundary
sub-001_ses-01_run-4 []: Number of rows 351
sub-001_ses-01_run-4 []: Dropping row 0:
   onset duration  sample     value
0 -0.002      n/a    -0.5  boundary
sub-002_ses-01_run-2 []: Number of rows 352
sub-002_ses-01_run-2 []: Dropping row 1:
   onset duration  sample     value
1  0.002      n/a     0.5  boundary
sub-002_ses-01_run-2 []: Dropping row 0:
   index  onset           duration  sample     value
0      0 -0.002  181119.0000000000    -0.5  boundary
sub-002_ses-01_run-3 []: Number of rows 351
sub-002_ses-01_run-3 []: Dropping row 0:
   onset duration  sample     value
0 -0.002      n/a    -0.5  boundary
sub-002_ses-01_run-4 []: Number of rows 351
sub-002_ses-01_run-4 []: