# Preprocessing 03 - Data Format - Transforming the Raw Data into Split Data
## Partially from replication package (Peitek et al.)

## Importing the libraries

In [None]:
import os
import re
import pandas as pd
from tqdm.notebook import tqdm
import mne
import dask.dataframe as dd
import numpy as np
import sys

# Number of digits participant numbers are zero-padded to when folder and file names are built
ZFILL = 1

## Get all the Participants based on the folders in data/rawData

In [None]:
# Only the folders directly below data/rawData name participants, so take just
# the first os.walk entry (an empty listing is used when the folder is missing).
_dir, sub_dirs, _files = next(os.walk("./data/rawData"), (None, [], []))

participants = []
for sub_dir in sub_dirs:
    numbers = re.findall(r'\d+', sub_dir)
    # A folder without any number in its name cannot be mapped to a participant
    if numbers:
        participants.append(int(numbers[0]))

# os.walk lists folders in arbitrary order; sort for a reproducible processing order
participants.sort()

## Move ICA eeg files back to the raw folder and delete everything in the eeg_tmp folder

In [None]:
ica_folder = "./data/eeg_tmp/ica"
raw_folder = "./data/eeg_tmp/raw"

# Move ICA eeg files back to the raw folder
for participant in tqdm(participants):
    participant_id = str(participant).zfill(ZFILL)
    participant_folder = "./data/rawData/Participant" + participant_id
    nothing_to_move = False
    # Handle the two files independently so one missing file does not block the other
    for extension in (".fdt", ".set"):
        file_name = "/eeg_raw_" + participant_id + extension
        source = ica_folder + file_name
        if not os.path.exists(source):
            nothing_to_move = True
            continue
        try:
            os.rename(source, participant_folder + file_name)
        except OSError as error:
            # Keep going with the other participants, but report the real reason
            print("Participant " + str(participant) + ": could not move " + source + " (" + str(error) + ")")
    if nothing_to_move:
        print("Participant " + str(participant) + " already has the files")

# delete every file that lies directly in the raw_folder (sub-folders are left untouched)
_dir, _sub_dirs, _files = next(os.walk(raw_folder), (raw_folder, [], []))
for tmp_file in _files:
    os.remove(os.path.join(_dir, tmp_file))

## Helper functions to transform data

In [None]:
# Set the log level of MNE to "WARNING" for the rest of the notebook
mne.set_log_level("WARNING")

# extracts the file name from a path like "/test/path/file.txt" and returns "file"
def to_file_name(path):
    """Return the last component of ``path`` without its extension.

    Both ``/`` and ``\\`` are treated as separators, independent of the
    operating system the notebook runs on.

    Args:
        path: File path as a string.

    Returns:
        The file name without directories and without the final extension.
    """
    # Cut off the directories first so a dot inside a folder name is never
    # mistaken for the start of the extension.
    base_name = re.split(r"[\\/]", path)[-1]
    return os.path.splitext(base_name)[0]

# pattern for a "(x, y)" pair; compiled once because two_extractor is called per row
_TWO_EXTRACTOR_PATTERN = re.compile(r"\((.*), (.*)\)")


# function to extract numbers from a (x, x) format
def two_extractor(value):
    """Parse a string of the form ``"(x, y)"`` into two floats.

    Args:
        value: Text that starts with a parenthesised, comma-space separated pair.

    Returns:
        Tuple ``(x, y)`` with both parts converted by ``float``.

    Raises:
        ValueError: If ``value`` does not start with such a pair or one of the
            parts is not a valid float literal.
    """
    pattern = _TWO_EXTRACTOR_PATTERN.match(value)
    if pattern is None:
        raise ValueError("expected a value like '(x, y)', got " + repr(value))
    return float(pattern.group(1)), float(pattern.group(2))

# Helper to read events from the info field directly; specific to some of our recordings
def get_events_from_info(inst):
    """Collect the events stored in ``inst.info["events"]`` as an array.

    Every entry that has a ``"list"`` value contributes one row built from the
    first three elements of that value in reversed order
    (``[list[2], list[1], list[0]]``). Entries that have neither ``"list"`` nor
    ``"channels"`` are reported on stdout and skipped.

    Args:
        inst: Object whose ``info["events"]`` is a sequence of dict-like entries.

    Returns:
        numpy array with one row of three values per usable entry; an empty
        array of shape ``(0, 3)`` when there is none, so that column slicing
        keeps working.

    Raises:
        NotImplementedError: If an entry only provides ``"channels"``; this
            kind of entry is not supported. (Subclass of ``RuntimeError``,
            which is what the former bare ``raise`` produced.)
    """
    events_mne = []
    for entry_number, entry in enumerate(inst.info["events"], start=1):
        content = entry.get("list")
        if content is not None:
            content_list = content.tolist()
            events_mne.append([content_list[2], content_list[1], content_list[0]])
        elif entry.get("channels") is not None:
            raise NotImplementedError(
                "fiftools: entry #" + str(entry_number) + " stores its events in 'channels', which is not supported."
            )
        else:
            print("fiftools: Type of entry #" + str(entry_number) + " unknown.")

    if not events_mne:
        return np.empty((0, 3), dtype=int)
    return np.array(events_mne)


# Column layout of the per-trial timing table assembled by load_raw
_TIME_COLUMNS = [
    "Baseline",
    "BaselineTask",
    "BaselineHash",
    "BaselineStart",
    "BaselineStop",
    "BaselineCorrectAnswer",
    "BaselineSkipped",
    "Snippet",
    "SnippetHash",
    "SnippetStart",
    "SnippetStop",
    "SnippetCorrectAnswer",
    "SnippetSkipped",
]

# Number of consecutive events that are consumed for one row of the timing table
_EVENTS_PER_TRIAL = 9


def _plain_column(ddf, column, name):
    """Return column ``column`` of the dask dataframe as a one-column pandas dataframe."""
    return pd.DataFrame(ddf[column].compute().transpose().tolist(), columns=[name])


def _pair_column(ddf, column, names, meta_type):
    """Parse the "(x, y)" strings of ``column`` in parallel into a two-column pandas dataframe."""
    parsed = ddf.apply(lambda x: two_extractor(x[column]), meta=meta_type, axis=1)
    return pd.DataFrame(parsed.compute().transpose().tolist(), columns=names)


def _read_eye_tracking(eye_tracking_path, cores, log):
    """Read the eye-tracking csv, name its columns and normalize the time column (steps 02-04)."""
    log("(02/09) Read Eye Tracker Data")
    df_raw = pd.read_csv(eye_tracking_path, header=None, sep=";")

    # get type for parallel processing
    meta_type = dd.utils.make_meta(0.0)

    # partition dataframe for parallel work
    ddf = dd.from_pandas(df_raw, npartitions=cores)

    log("(03/09) Transform Eye Tracker Data")
    # extract the data from the eye-tracking csv to numbers and rename the columns
    frames = [
        _plain_column(ddf, 0, "l_valid"),
        _pair_column(ddf, 1, ["l_display_x", "l_display_y"], meta_type),
        _plain_column(ddf, 2, "l_pupil_diameter"),
        _plain_column(ddf, 3, "r_valid"),
        _pair_column(ddf, 4, ["r_display_x", "r_display_y"], meta_type),
        _plain_column(ddf, 5, "r_pupil_diameter"),
        _plain_column(ddf, 6, "time"),
    ]

    # drop the unparsed inputs before concatenating to save a bit of ram
    del ddf, df_raw
    df_eye_tracking = pd.concat(frames, axis=1)

    log("(04/09) Normalize Eye Tracker Time")
    # make the time relative to the first sample and divide by 1,000,000
    # (yields seconds, assuming the tracker reports microseconds - TODO confirm)
    t_0 = df_eye_tracking["time"][0]
    df_eye_tracking["time"] = (df_eye_tracking["time"].astype(float) - t_0) / 1000000.0
    return df_eye_tracking


def _build_time_table(events, sampling_rate):
    """Turn the event array into one timing row per group of ``_EVENTS_PER_TRIAL`` events."""
    # Column 0 of the event array is stored as the hash of a task,
    # column 2 divided by the sampling rate is used as the event time.
    event_codes = events[:, 0]
    t_events = events[:, 2] / sampling_rate

    # Collect all rows first and build the dataframe once
    # (DataFrame.append no longer exists in pandas >= 2.0).
    rows = []
    for i in range(0, len(t_events), _EVENTS_PER_TRIAL):
        rows.append([
            None, None, event_codes[i + 2], t_events[i + 2], t_events[i + 3], None, None,
            None, event_codes[i + 6], t_events[i + 6], t_events[i + 7], None, None,
        ])
    return pd.DataFrame(rows, columns=_TIME_COLUMNS)


def _task_for_hash(df_hash, hash_value):
    """Return the first "task" entry of ``df_hash`` whose "hash" equals ``hash_value``."""
    return df_hash.loc[df_hash["hash"] == hash_value, "task"].iloc[0]


def _label_tasks(df_time, df_hash):
    """Fill the Baseline, BaselineTask and Snippet columns of ``df_time`` in place."""
    for index, row in df_time.iterrows():
        baseline_path = str(_task_for_hash(df_hash, row["BaselineHash"]))
        # The first matching keyword wins; anything else counts as a rest baseline
        baseline_type = "Rest"
        for keyword in ("Text", "Math", "Matrix"):
            if keyword in baseline_path:
                baseline_type = keyword
                break
        df_time.at[index, "BaselineTask"] = to_file_name(baseline_path)
        df_time.at[index, "Baseline"] = baseline_type
        df_time.at[index, "Snippet"] = to_file_name(_task_for_hash(df_hash, row["SnippetHash"]))


def _grade_answers(df_time, df_questions, df_task_answer):
    """Fill the CorrectAnswer and Skipped columns of ``df_time`` in place."""
    for index, row in df_time.iterrows():
        if row["Baseline"] == "Rest":
            # a rest baseline has no question that could be answered or skipped
            baseline_correct, baseline_skipped = True, False
        else:
            # task names are matched literally; rows without an image never match
            is_baseline = df_questions["QuestionImage"].str.contains(row["BaselineTask"], regex=False, na=False)
            question_baseline_row = df_questions[is_baseline].iloc[0]
            baseline_answer = df_task_answer[df_task_answer["task"] == question_baseline_row["Baseline_name"]].iloc[0]["answer"]
            baseline_correct = question_baseline_row["correct_answer"] == baseline_answer
            baseline_skipped = bool(baseline_answer == 'k')

        is_snippet = df_questions["ProgrammingQuestion"].str.contains(row["Snippet"], regex=False, na=False)
        question_snippet_row = df_questions[is_snippet].iloc[0]
        snippet_answer = df_task_answer[df_task_answer["task"] == question_snippet_row["Pr_task_name"]].iloc[0]["answer"]

        # Write through the dataframe: the row handed out by iterrows() may be a
        # copy, so assigning to it is not guaranteed to change df_time.
        df_time.at[index, "BaselineCorrectAnswer"] = baseline_correct
        df_time.at[index, "BaselineSkipped"] = baseline_skipped
        df_time.at[index, "SnippetCorrectAnswer"] = question_snippet_row["Pr_correct_answer"] == snippet_answer
        df_time.at[index, "SnippetSkipped"] = bool(snippet_answer == 'k')


def _split_by_snippet(df_time, df_eye_tracking, raw_set):
    """Cut eye tracking and EEG into the code and baseline window of every snippet."""
    eye_time = df_eye_tracking["time"]
    result = {}
    for _index, row in df_time.iterrows():
        code_start, code_stop = row["SnippetStart"], row["SnippetStop"]
        baseline_start, baseline_stop = row["BaselineStart"], row["BaselineStop"]
        result[row["Snippet"]] = {
            "Code": {
                "EyeTracking": df_eye_tracking[(eye_time >= code_start) & (eye_time < code_stop)],
                "EEG": raw_set.copy().crop(code_start, code_stop),
                "Time": {"Start": code_start, "Stop": code_stop},
                "Answer": row["SnippetCorrectAnswer"],
                "Skipped": row["SnippetSkipped"],
            },
            "Baseline": {
                "Type": row["Baseline"],
                "Task": row["BaselineTask"],
                "EyeTracking": df_eye_tracking[(eye_time >= baseline_start) & (eye_time < baseline_stop)],
                "EEG": raw_set.copy().crop(baseline_start, baseline_stop),
                "Time": {"Start": baseline_start, "Stop": baseline_stop},
                "Answer": row["BaselineCorrectAnswer"],
                "Skipped": row["BaselineSkipped"],
            },
        }
    return result


def _load_raw_steps(participant_number, cores, digits, raw_path, output):
    """Run the nine loading steps and write the progress messages to ``output``."""
    def log(message):
        print(message, file=output, flush=True)

    log("(01/09) Construct Paths")
    # setup paths for loading
    participant_id = str(participant_number).zfill(digits)
    participant_folder = raw_path + "Participant" + participant_id + "/"
    eye_tracking_path = participant_folder + "EyeTracking.csv"
    eeg_path = participant_folder + "eeg_raw.fif"
    eeg_set_path = participant_folder + "eeg_raw_" + participant_id + ".set"
    task_answers_path = participant_folder + "Task_Answers.csv"
    question_path = "./data/Questions/Questions" + participant_id + ".xlsx"
    hashs_path = participant_folder + "Hashs.csv"

    df_eye_tracking = _read_eye_tracking(eye_tracking_path, cores, log)

    log("(05/09) Read EEG Data")
    # the .fif recording only provides events and sampling rate,
    # the .set recording is the signal that gets cut into windows
    raw = mne.io.read_raw_fif(fname=eeg_path, preload=True)
    raw_set = mne.io.read_raw_eeglab(eeg_set_path, preload=True)

    log("(06/09) Construct Events from EEG Data")
    # get the time of the events and save them in a dataframe for better handling
    df_time = _build_time_table(get_events_from_info(raw), raw.info["sfreq"])

    log("(07/09) Get Snippet and Baseline for Hash")
    # read the data from the psychopy csv file
    _label_tasks(df_time, pd.read_csv(hashs_path))

    log("(08/09) Transform PsychoPy Data")
    df_questions = pd.read_excel(question_path)
    df_task_answer = pd.read_csv(task_answers_path)
    _grade_answers(df_time, df_questions, df_task_answer)

    log("(09/09) Transform All Data to Dictionary")
    # store all the data in a dictionary for better handling. split everything up by snippet
    return _split_by_snippet(df_time, df_eye_tracking, raw_set)


def load_raw(participant_number, cores=12, digits=1, logging=True,
             raw_path="./data/rawData/"):
    """Load all recordings of one participant and split them up by snippet.

    Reads ``EyeTracking.csv``, ``eeg_raw.fif``, ``eeg_raw_<id>.set``,
    ``Task_Answers.csv`` and ``Hashs.csv`` from the participant folder below
    ``raw_path`` as well as ``./data/Questions/Questions<id>.xlsx``.

    Args:
        participant_number: Number of the participant.
        cores: Number of partitions used for the parallel eye-tracking parsing.
        digits: Number of digits the participant number is zero-padded to in paths.
        logging: Print the progress messages to stdout when True, discard them otherwise.
        raw_path: Folder (with trailing slash) containing the participant folders.

    Returns:
        Dictionary keyed by snippet name. Every value has the keys ``"Code"``
        and ``"Baseline"``, each holding ``"EyeTracking"`` (dataframe),
        ``"EEG"`` (cropped copy of the .set recording), ``"Time"`` (dict with
        ``"Start"`` and ``"Stop"``), ``"Answer"`` and ``"Skipped"``;
        ``"Baseline"`` additionally holds ``"Type"`` and ``"Task"``.
    """
    if logging:
        return _load_raw_steps(participant_number, cores, digits, raw_path, sys.stdout)
    # close the devnull handle again once loading has finished
    with open(os.devnull, 'w') as output:
        return _load_raw_steps(participant_number, cores, digits, raw_path, output)

## Transform all the Data into Split Data

In [None]:
# Base columns of the per-participant summary table (the ...Correct / ...Skipped result columns are added on top of these further below)
columns = ["Participant","Baseline","BaselineTask","BaselineStartTime","BaselineEndTime","BaselineEyetracking","BaselineEEG","Algorithm","ProgramStartTime","ProgramEndTime","ProgramEyetracking","ProgramEEG"]

def rescale(data, scaling_factor=1e-8):
    """Multiply ``data`` by a scaling factor.

    Args:
        data: Number or array of recorded values.
        scaling_factor: Factor to obtain values in [V]; depends on device and
            settings etc. Defaults to the value used for these recordings.

    Returns:
        ``scaling_factor * data``.
    """
    return scaling_factor * data

# Column order of the summary table: the base columns followed by the result columns
output_columns = list(columns) + ["BaselineCorrect", "BaselineSkipped", "ProgramCorrect", "ProgramSkipped"]

# Iterate over all participants
for participant in tqdm(participants):
    print("-----------------------------------------------------------------------------------------")
    print("Participant " + str(participant))

    folder_prev = "./data/filteredData/Participant" + str(participant).zfill(ZFILL) + "/"
    summary_path = folder_prev + "filtered_data.csv"

    # Make sure the output folder exists
    os.makedirs(folder_prev, exist_ok=True)

    # The summary file is written last, so its presence marks a finished participant
    if os.path.exists(summary_path):
        print("Skipped: Already processed")
        continue

    # Load in Raw Data from Input folder
    data = load_raw(participant, cores=24, logging=True)

    # save the raw data into split data for task/input/output
    summary_rows = []
    for algorithm, trial in data.items():
        baseline = trial["Baseline"]
        code = trial["Code"]
        file_prefix = folder_prev + algorithm

        # Save baseline eeg to file
        baseline_eeg = file_prefix + "baseline_eeg_raw.fif"
        baseline["EEG"].apply_function(rescale, picks=['eeg'])
        baseline["EEG"].save(baseline_eeg, overwrite=True)

        # Save baseline eyetracking to file
        baseline_eyetracking = file_prefix + "baseline_eyetracking_raw.csv"
        baseline["EyeTracking"].to_csv(baseline_eyetracking, index=False)

        # Save code eeg to file
        code_eeg = file_prefix + "code_eeg_raw.fif"
        code["EEG"].apply_function(rescale, picks=['eeg'])
        code["EEG"].save(code_eeg, overwrite=True)

        # Save code eyetracking to file
        code_eyetracking = file_prefix + "code_eyetracking_raw.csv"
        code["EyeTracking"].to_csv(code_eyetracking, index=False)

        # The summary only references the written files by path
        summary_rows.append({
            "Participant": participant,
            "Baseline": baseline["Type"],
            "BaselineTask": baseline["Task"],
            "BaselineStartTime": baseline["Time"]["Start"],
            "BaselineEndTime": baseline["Time"]["Stop"],
            "BaselineEyetracking": baseline_eyetracking,
            "BaselineEEG": baseline_eeg,
            "BaselineCorrect": baseline["Answer"],
            "BaselineSkipped": baseline["Skipped"],
            "Algorithm": algorithm,
            "ProgramStartTime": code["Time"]["Start"],
            "ProgramEndTime": code["Time"]["Stop"],
            "ProgramEyetracking": code_eyetracking,
            "ProgramEEG": code_eeg,
            "ProgramCorrect": code["Answer"],
            "ProgramSkipped": code["Skipped"],
        })

    # Build the table once from all rows (DataFrame.append no longer exists in pandas >= 2.0)
    df_filtered_part = pd.DataFrame(summary_rows, columns=output_columns)
    df_filtered_part.to_csv(summary_path, index=False)



In [None]:
columns = ["Participant","Baseline","BaselineTask","BaselineStartTime","BaselineEndTime","BaselineEyetracking","BaselineEEG","Algorithm","ProgramStartTime","ProgramEndTime","ProgramEyetracking","ProgramEEG"]

# Read every per-participant summary first and concatenate once,
# instead of re-copying the growing table for each participant
participant_frames = [
    pd.read_csv("./data/filteredData/Participant" + str(participant).zfill(ZFILL) + "/filtered_data.csv")
    for participant in tqdm(participants)
]

if participant_frames:
    df_filtered = pd.concat(participant_frames, ignore_index=True)
    # Keep the base columns in front, followed by any additional columns of the files
    extra_columns = [column for column in df_filtered.columns if column not in columns]
    df_filtered = df_filtered.reindex(columns=columns + extra_columns)
else:
    df_filtered = pd.DataFrame(columns=columns)

df_filtered.to_csv("./data/filteredData/filtered_data.csv", index=False)
df_filtered