In [1]:
import os
from typing import Set, Tuple

import numpy as np
import pandas as pd

In [2]:
# Directory of raw input CSV files; passed to clean_data as raw_data_dir.
RAW_DATA_DIR = "CogLoad-1Hz"
# Directory that receives the per-user feature/label CSV files written by clean_data.
CLEAN_DATA_DIR = "clean_data"

# Task names that get_tasks removes from the set of tasks it returns.
NOT_EXPERIMENT = {"quest", "post"}

# Used by segment both as a row count (samples per window) and, multiplied by
# 1000, as a timestamp span. The two agree only for 1 Hz data with millisecond
# timestamps -- presumably the case given the directory name; TODO confirm.
WINDOW_SIZE = 30  # seconds

Get tasks

In [3]:
def get_tasks(files_path: str) -> Set[str]:
    """Collect the distinct task names found across all sensor files.

    Args:
        files_path: Directory to scan. Only entries whose name contains
            "sensors" are read; each must be a CSV with a "task" column.

    Returns:
        Every value seen in a "task" column, minus the names listed in
        NOT_EXPERIMENT.
    """
    sensor_files = [name for name in os.listdir(files_path) if "sensors" in name]

    found: Set[str] = set()
    for name in sensor_files:
        frame = pd.read_csv(f"{files_path}/{name}")
        found.update(frame["task"].unique().flatten())

    return found - NOT_EXPERIMENT

Segment observations

In [4]:
def segment(path: str, tasks: Set[str]) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Cut one sensor recording into fixed-length windows, one per task trial.

    The CSV is loaded, rows containing NaN or +/-inf are dropped, and for each
    (task, level) pair the samples at the end of the trial are extracted:

    * "rest" tasks (except "n2_rest" and "n3_rest") may occur several times.
      Occurrences are separated wherever consecutive timestamps differ by more
      than 10000; for each occurrence the last WINDOW_SIZE rows strictly
      before its end timestamp are taken, and the window is kept only if it
      has exactly WINDOW_SIZE rows.
    * Every other task yields a single window covering the last
      WINDOW_SIZE * 1000 timestamp units of the trial. If that window is
      slightly short (more than 25 but fewer than WINDOW_SIZE rows) it is
      re-cut from the whole recording over the same time span. Windows that
      are still shorter than WINDOW_SIZE rows are skipped.

    Args:
        path: Path to a CSV file; must contain the substring "sensors". The
            file needs the columns task, level, timestamp, temperature, hr,
            gsr and rr.
        tasks: Task names to extract. They are processed in sorted order so
            that the row order of the result is reproducible between runs
            (set iteration order of strings is not).

    Returns:
        A ``(features, labels)`` pair with one row per kept window.
        ``features`` holds WINDOW_SIZE temperature samples followed by
        WINDOW_SIZE hr, gsr and rr samples. ``labels`` holds the last row of
        each window with the original CSV column names.

    Raises:
        ValueError: If ``path`` does not contain "sensors", or if no window
            of the required length could be extracted from the file.
    """
    if "sensors" not in path:
        raise ValueError("File doesn’t contain sensor data")

    df: pd.DataFrame = (
        pd.read_csv(path).replace([np.inf, -np.inf], np.nan).dropna()
    )

    # Timestamp jump that separates two occurrences of the same rest task.
    # NOTE(review): presumably milliseconds (10 s) -- confirm against the data.
    rest_gap = 10000
    # A window with more rows than this but fewer than WINDOW_SIZE is
    # considered "almost complete" and is re-cut from the full recording.
    near_complete_floor = 25
    # Converts WINDOW_SIZE into timestamp units (presumably ms per second).
    ticks_per_second = 1000

    label_rows: list = []
    signal_rows: dict = {column: [] for column in _SIGNAL_COLUMNS}

    # key=str keeps sorting well defined even if a non-string value (e.g. a
    # NaN read from the CSV) ended up in the task set.
    for task in sorted(tasks, key=str):
        task_rows = df[df.task == task]
        is_repeated_rest = "rest" in str(task) and task not in ("n2_rest", "n3_rest")

        for level in task_rows.level.unique():
            task_df = task_rows[task_rows.level == level]
            if task_df.empty:
                continue

            if is_repeated_rest:
                # The first row after each gap marks the (exclusive) end of
                # the previous occurrence; the very last timestamp closes the
                # final occurrence.
                segment_ends = list(
                    task_df[task_df.timestamp.diff() > rest_gap].timestamp.values
                )
                segment_ends.append(task_df.timestamp.iloc[-1])

                for segment_end in segment_ends:
                    window = task_df[task_df.timestamp < segment_end].iloc[
                        -WINDOW_SIZE:
                    ]
                    if window.shape[0] != WINDOW_SIZE:
                        continue
                    _append_window(window, label_rows, signal_rows)
            else:
                segment_end = task_df.timestamp.iloc[-1]
                segment_start = segment_end - WINDOW_SIZE * ticks_per_second
                window = task_df[
                    (task_df.timestamp <= segment_end)
                    & (task_df.timestamp >= segment_start)
                ]
                if near_complete_floor < window.shape[0] < WINDOW_SIZE:
                    # Fill the few missing samples from neighbouring rows of
                    # the whole recording within the same time span.
                    window = df[
                        (df.timestamp <= segment_end)
                        & (df.timestamp >= segment_start)
                    ]

                if len(window) < WINDOW_SIZE:
                    continue
                _append_window(window, label_rows, signal_rows)

    if not label_rows:
        # np.stack would fail with an unhelpful "need at least one array to
        # stack"; raise the same exception type with an actionable message.
        raise ValueError(
            f"No complete {WINDOW_SIZE}-sample segment found in {path}"
        )

    _features: pd.DataFrame = pd.DataFrame(
        np.concatenate(
            [np.stack(signal_rows[column]) for column in _SIGNAL_COLUMNS], axis=1
        )
    )
    # Every window is a slice of df, so df's columns are the label columns.
    _labels: pd.DataFrame = pd.DataFrame(
        np.stack(label_rows), columns=df.columns.values
    )

    return _features, _labels


# Signal columns copied into the feature matrix, in output order.
_SIGNAL_COLUMNS = ("temperature", "hr", "gsr", "rr")


def _append_window(window: pd.DataFrame, label_rows: list, signal_rows: dict) -> None:
    """Store one accepted window: last row as label, first WINDOW_SIZE samples per signal."""
    label_rows.append(window.iloc[-1].values)
    for column in _SIGNAL_COLUMNS:
        signal_rows[column].append(window[column].values[:WINDOW_SIZE])

In [5]:
def clean_data(raw_data_dir: str, save_dir: str) -> None:
    """Segment every sensor file in a directory and save the results per user.

    For each file in ``raw_data_dir`` whose name contains "sensors", the
    recording is segmented and two CSV files are written to ``save_dir``:
    ``<user_id>_features.csv`` and ``<user_id>_labels.csv``, where
    ``user_id`` is the part of the file name before the first underscore.

    Args:
        raw_data_dir: Directory containing the raw CSV files. It is used both
            to discover the task names and to locate the files to segment.
        save_dir: Output directory; created if it does not exist yet.
    """
    # Use the directory that was passed in, not the module-level default, so
    # the function works for any input location.
    tasks: Set[str] = get_tasks(files_path=raw_data_dir)

    os.makedirs(save_dir, exist_ok=True)

    for file_name in sorted(os.listdir(raw_data_dir)):
        # Same filter as get_tasks: anything that is not a sensor recording
        # is ignored instead of aborting the whole run.
        if "sensors" not in file_name:
            continue

        features, labels = segment(path=f"{raw_data_dir}/{file_name}", tasks=tasks)

        user_id: str = file_name.split("_")[0]
        features.to_csv(f"{save_dir}/{user_id}_features.csv")
        labels.to_csv(f"{save_dir}/{user_id}_labels.csv")

In [6]:
def data_equal(comparee_dir: str, standard_dir: str) -> bool:
    """Tell whether two data directories hold the same segmented rows.

    ``comparee_dir`` is expected to contain per-user ``*features*`` and
    ``*labels*`` CSV files; ``standard_dir`` is expected to contain
    ``segment_temperature.csv``, ``segment_heartrate.csv``,
    ``segment_gsr.csv``, ``segment_rr.csv`` and ``segment_labels.csv``.
    Every file must carry the saved index as an "Unnamed: 0" column, which is
    discarded before comparing.

    The comparison ignores row order: the frames are outer-merged on all
    shared columns and must yield only rows present on both sides.

    Args:
        comparee_dir: Directory with the data under test.
        standard_dir: Directory with the reference data.

    Returns:
        True if both directories are the same path, or if features and labels
        have equal shapes and no row is exclusive to one side; False otherwise.
    """
    # A directory trivially equals itself; nothing is read in that case.
    if comparee_dir == standard_dir:
        return True

    own_features = _load_matching(comparee_dir, "features")
    reference_features = pd.concat(
        [
            pd.read_csv(f"{standard_dir}/segment_{data_type}.csv")
            for data_type in ["temperature", "heartrate", "gsr", "rr"]
        ],
        axis=1,
    ).drop("Unnamed: 0", axis=1)

    own_labels = _load_matching(comparee_dir, "labels")
    reference_labels = pd.read_csv(f"{standard_dir}/segment_labels.csv").drop(
        "Unnamed: 0", axis=1
    )

    # Cheap rejection first: differing shapes can never hold the same rows.
    shapes_agree = (
        own_features.shape == reference_features.shape
        and own_labels.shape == reference_labels.shape
    )
    if not shapes_agree:
        return False

    # Feature columns are positional, so give both sides identical names
    # before merging on them.
    own_features.columns = range(own_features.shape[1])
    reference_features.columns = range(reference_features.shape[1])

    features_match = _only_shared_rows(own_features, reference_features)
    labels_match = _only_shared_rows(own_labels, reference_labels)

    return features_match and labels_match


def _load_matching(directory: str, marker: str) -> pd.DataFrame:
    """Stack every CSV in ``directory`` whose name contains ``marker``, without the index column."""
    frames = [
        pd.read_csv(f"{directory}/{file_name}")
        for file_name in os.listdir(directory)
        if marker in file_name
    ]
    return pd.concat(frames).drop("Unnamed: 0", axis=1)


def _only_shared_rows(left: pd.DataFrame, right: pd.DataFrame) -> bool:
    """Outer-merge on the common columns and report whether every row came from both frames."""
    merged = pd.merge(left, right, how="outer", indicator=True)
    origins = set(merged["_merge"].unique())
    return origins == {"both"}

In [7]:
# Write the per-user feature/label CSVs, then compare them with the data in
# the directory passed as standard_dir. The comparison is the last expression
# of the cell, so its boolean result is what the cell displays.
clean_data(raw_data_dir=RAW_DATA_DIR, save_dir=CLEAN_DATA_DIR)
data_equal(comparee_dir=CLEAN_DATA_DIR, standard_dir="initial_data")

True