In [129]:
from typing import List, Set

import numpy as np
import pandas as pd
import os

In [130]:
RAW_DATA_PATH = "CogLoad-1Hz"
CLEAN_DATA_PATH = "cleaned_data"

NOT_EXPERIMENT = {"quest", "post"}

WINDOW_SIZE = 30  # seconds

In [131]:
def get_tasks(files_path: str) -> Set[str]:
    tasks = set()

    for file in os.listdir(files_path):
        if "sensors" not in file:
            continue

        df = pd.read_csv(f"{files_path}/{file}")
        tasks |= set(df["task"].unique().flatten())

    return tasks - NOT_EXPERIMENT

Get tasks

In [132]:
def get_segments(files_path: str) -> [np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
    tasks: Set[str] = get_tasks(files_path=files_path)

    _label_segments = []
    _temp_segments = []
    _hr_segments = []
    _gsr_segments = []
    _rr_segments = []
    segment_sensor_data: pd.DataFrame = pd.DataFrame()
    for file in os.listdir(files_path):
        if "sensor" not in file:
            continue

        df = pd.read_csv(f"{files_path}/{file}")
        df = df.replace([np.inf, -np.inf], np.nan)
        df = df.dropna()

        for task in tasks:
            for level in df[df.task == task].level.unique():
                task_df = df[(df.task == task) & (df.level == level)]
                if len(task_df) == 0:
                    continue

                if "rest" in task and task not in ("n2_rest", "n3_rest"):
                    timestamp_end_arr = []
                    timestamp_end_arr.extend(
                        task_df[task_df.timestamp.diff() > 10000].timestamp.values
                    )
                    timestamp_end_arr.append(task_df.timestamp.iloc[-1])

                    for segment_end in timestamp_end_arr:
                        segment_sensor_data = task_df[task_df.timestamp < segment_end]
                        segment_sensor_data = segment_sensor_data.iloc[-WINDOW_SIZE:]

                        if segment_sensor_data.shape[0] != WINDOW_SIZE:
                            continue

                        _label_segments.append(segment_sensor_data.iloc[-1].values)

                        _temp_segments.append(
                            segment_sensor_data.temperature.values[:WINDOW_SIZE]
                        )
                        _hr_segments.append(segment_sensor_data.hr.values[:WINDOW_SIZE])
                        _gsr_segments.append(
                            segment_sensor_data.gsr.values[:WINDOW_SIZE]
                        )
                        _rr_segments.append(segment_sensor_data.rr.values[:WINDOW_SIZE])
                else:
                    segment_end = task_df.timestamp.iloc[-1]
                    segment_start = segment_end - WINDOW_SIZE * 1000
                    segment_sensor_data = task_df[
                        (task_df.timestamp <= segment_end)
                        & (task_df.timestamp >= segment_start)
                    ]
                    if 25 < segment_sensor_data.shape[0] < WINDOW_SIZE:
                        segment_sensor_data = df[
                            (df.timestamp <= segment_end)
                            & (df.timestamp >= segment_start)
                        ]

                    if len(segment_sensor_data) < WINDOW_SIZE:
                        continue

                    _label_segments.append(segment_sensor_data.iloc[-1].values)

                    _temp_segments.append(
                        segment_sensor_data.temperature.values[:WINDOW_SIZE]
                    )
                    _hr_segments.append(segment_sensor_data.hr.values[:WINDOW_SIZE])
                    _gsr_segments.append(segment_sensor_data.gsr.values[:WINDOW_SIZE])
                    _rr_segments.append(segment_sensor_data.rr.values[:WINDOW_SIZE])

    _label_segments = np.stack(_label_segments)
    _temp_segments = np.stack(_temp_segments)
    _hr_segments = np.stack(_hr_segments)
    _gsr_segments = np.stack(_gsr_segments)
    _rr_segments = np.stack(_rr_segments)
    _column_names = segment_sensor_data.columns.values

    print(
        type(_label_segments),
        type(_temp_segments),
        type(_hr_segments),
        type(_gsr_segments),
        type(_rr_segments),
        type(_column_names),
    )
    return (
        _label_segments,
        _temp_segments,
        _hr_segments,
        _gsr_segments,
        _rr_segments,
        _column_names,
    )

In [133]:
(
    label_segments,
    temp_segments,
    hr_segments,
    gsr_segments,
    rr_segments,
    column_names,
) = get_segments(files_path=RAW_DATA_PATH)

label_df = pd.DataFrame(label_segments, columns=column_names)

temp_df = pd.DataFrame(temp_segments)
hr_df = pd.DataFrame(hr_segments)
gsr_df = pd.DataFrame(gsr_segments)
rr_df = pd.DataFrame(rr_segments)

label_df.to_csv(f"{CLEAN_DATA_PATH}/segment_labels.csv")
temp_df.to_csv(f"{CLEAN_DATA_PATH}/segment_temperature.csv")
hr_df.to_csv(f"{CLEAN_DATA_PATH}/segment_heartrate.csv")
gsr_df.to_csv(f"{CLEAN_DATA_PATH}/segment_gsr.csv")
rr_df.to_csv(f"{CLEAN_DATA_PATH}/segment_rr.csv")

<class 'numpy.ndarray'> <class 'numpy.ndarray'> <class 'numpy.ndarray'> <class 'numpy.ndarray'> <class 'numpy.ndarray'> <class 'numpy.ndarray'>


In [134]:
def csv_equal(comparee_path: str, standard_path: str) -> bool:
    if standard_path == comparee_path:
        return True

    comparee_df: pd.DataFrame = pd.read_csv(comparee_path)
    standard_df: pd.DataFrame = pd.read_csv(standard_path)

    if len(comparee_df) != len(standard_df):
        return False

    merged_df: pd.DataFrame = pd.merge(
        comparee_df.drop("Unnamed: 0", axis=1),
        standard_df.drop("Unnamed: 0", axis=1),
        how="outer",
        left_index=False,
        right_index=False,
        indicator=True,
    )
    sources: pd.Index = (
        merged_df["_merge"].unique().remove_unused_categories().categories
    )
    if len(sources) > 1 or "both" not in sources:
        return False

    return True

In [135]:
def csvs_equal(comparee_csv_dir: str, standard_csv_dir: str) -> bool:
    comparee_files: List[str] = os.listdir(comparee_csv_dir)
    standard_files: List[str] = os.listdir(standard_csv_dir)

    comparee_files.sort()
    standard_files.sort()

    if comparee_files != standard_files:
        return False

    for comparee_file, standard_file in zip(comparee_files, standard_files):
        files_equal: bool = csv_equal(
            comparee_path=f"{comparee_csv_dir}/{comparee_file}",
            standard_path=f"{standard_csv_dir}/{standard_file}",
        )
        if not files_equal:
            return False

    return True

In [136]:
csvs_equal("cleaned_data", "initial_data")

True