In [None]:
# Copyright 2021 Carlos Gil, Daniel Moreno.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

## Steps

- Download and unzip the dataset into its corresponding directory.
- Load the `.txt` files.
- Merge the files from each user into one dataframe.
- Export the resulting dataframe to `.csv`.
- Repeat the process for all the users.

## Dataset download
Downloading the dataset from `https://physionet.org` and storing it locally in a folder called `download/`

**Note:** If the dataset has been downloaded, do not run the following cell.

You can download the dataset manually by going to the [website](https://www.physionet.org/content/sleep-accel/1.0.0/).

In [None]:
!sudo apt-get install wget

!wget -r -nv -N -c -np https://physionet.org/files/sleep-accel/1.0.0/

!mkdir ./download

!mv ./physionet.org/files/sleep-accel/1.0.0/* download

!rm -r ./physionet.org/

!find ./download -name "*.html" -type f -delete

!mkdir ./output

In [2]:
import pandas as pd
import numpy as np
import os
import re
import time
import datetime
from enum import Enum

In [3]:
class Error(Enum):
    '''
    Message prefixes for the failures reported by this notebook.
    Each member's value is the text placed at the start of the message.
    '''

    dir_not_empty = "[Error]: the directory is not empty "
    match_number_users = "[Error]: number of users in list does not match "
    match_user_id = "[Error]: user id does not match between lists "
    match_length_arrays = "[Error]: the length of the lists does not match "
    match_index = "[Error]: indexes are mismatched "
    duplicated = "[Error]: found duplicated indexes "
    generic_error = "[Error] "

    @staticmethod
    def raise_error(type_error, value):
        '''
        Build the full message for `type_error` followed by `value`.

        Despite its name, this helper raises nothing: it returns the message string
        so that the caller can wrap it in the exception of its choice.
        '''
        prefix = type_error.value
        detail = str(value)
        return ": ".join((prefix, detail))


## Layout and paths preparation

The directory tree should resemble the following layout structure:

```bash
dataset/
├── download/
│   ├── heart_rate/
│   ├── labels/
│   ├── motion/
│   └── steps/
└── output/
    ├── dataset_<user_id>.csv
    ├── dataset_<user_id>.csv
    ⁞
    └── dataset_<user_id>.csv
```

In [4]:
# Output path.
output_dir = 'output'

output_path = os.path.join(os.getcwd(), output_dir)

# Download path.
data_path = os.path.join(os.getcwd(), 'download')

motion_path = os.path.join(data_path, 'motion')
heart_rate_path = os.path.join(data_path, 'heart_rate')
labels_path = os.path.join(data_path, 'labels')

# Listing the files of every user in ascending order, one list per data source.
motion_list = sorted(os.listdir(motion_path))
heart_rate_list = sorted(os.listdir(heart_rate_path))
labels_list = sorted(os.listdir(labels_path))

# Checking that the downloaded data contains the 31 users in all the lists created
assert len(motion_list) == 31, Error.match_number_users.value
assert len(heart_rate_list) == 31, Error.match_number_users.value
assert len(labels_list) == 31, Error.match_number_users.value


def _leading_user_id(filename):
    '''
    Return the run of digits at the very start of `filename`.

    A raw pattern with `+` is used on purpose: a pattern such as "\\d*" also matches the
    empty string, so files without any leading id would all compare equal to "".
    '''
    match = re.match(r"\d+", filename)
    if match is None:
        raise ValueError(Error.raise_error(Error.match_user_id, filename))
    return match.group(0)


user_ids = []

# Checking that the user ids match, position by position, across the three lists.
for motion_file, heart_rate_file, labels_file in zip(motion_list, heart_rate_list, labels_list):
    user_id = _leading_user_id(motion_file)

    assert user_id == _leading_user_id(heart_rate_file), Error.match_user_id.value
    assert user_id == _leading_user_id(labels_file), Error.match_user_id.value

    user_ids.append(user_id)

## Creation of generate datasets function

In [20]:
def generate_all_datasets(labels_df=None, interval=0, verbose=False):
    '''
    Generate one dataset per user and write each of them as a `.csv` file in the output folder.

    Parameters:
        - labels_df: a list with the name of the columns of the generated datasets (6 labels).
          If not specified or less/more than 6 labels given, it takes default values.
        - interval: value forwarded unchanged to generate_dataset(). When it is 0, the first
          column of every generated dataset is also required to be free of duplicates.
        - verbose: if set to true, display some feedback of the process (the whole process
          might take several minutes).

    Raises:
        Exception: when the output folder is not empty and the overwrite is declined, or when
        duplicated values are found in the first column while `interval` is 0.
    '''

    # The folder is created on demand so that a missing folder is not a fatal error.
    os.makedirs(output_path, exist_ok=True)

    # Checking that the output directory is empty. The absolute path is used both here and
    # when writing, so that both steps always refer to the same folder.
    existing_files = os.listdir(output_path)

    if existing_files:
        print(f"The folder {output_dir}/ is not empty")
        input_val = input("Do you want to overwrite it? [y/n] \n")

        if input_val != "y":
            raise Exception(Error.raise_error(Error.dir_not_empty, output_dir))

        for file in existing_files:
            os.remove(os.path.join(output_path, file))

    if labels_df is None or len(labels_df) != 6:
        labels_df = ['Time', 'X', 'Y', 'Z', 'Heart Rate', 'Labels']

    # The time stamps live in the first column, whatever name the caller gave to it.
    time_column = labels_df[0]

    time_acc = []

    for user_id, motion_file, heart_rate_file, labels_file in zip(
            user_ids, motion_list, heart_rate_list, labels_list):

        start = time.time()

        motion_user = os.path.join(motion_path, motion_file)
        heart_rate_user = os.path.join(heart_rate_path, heart_rate_file)
        labels_user = os.path.join(labels_path, labels_file)

        user_df = generate_dataset(motion_user, heart_rate_user, labels_user, labels_df, interval)

        if interval == 0 and user_df[time_column].duplicated().any():
            # Report which user failed instead of dumping the whole dataframe in the message.
            raise Exception(Error.raise_error(Error.duplicated, 'user id ' + user_id))

        # Saving the generated dataset
        fname = os.path.join(output_path, 'dataset_' + user_id + ".csv")

        user_df.to_csv(fname, index=False, header=True)

        elapsed = time.time() - start
        time_acc.append(elapsed)

        if verbose:
            print('Dataset user id <{}> generated succesfully. Time: {:.0f} s'.format(user_id, elapsed))


    print(f'\nProcess completed! Total time execution: {datetime.timedelta(seconds=sum(time_acc))}\n')

In [19]:
def generate_dataset(motion_user, heart_rate_user, labels_user, labels_columns, interval):
    '''
    Build the dataframe of a single user.

    The three paths point to the motion, heart rate and labels text files of that user,
    `labels_columns` names the columns of the resulting dataframe and `interval` is handed
    over to get_peak_values().
    '''

    # Read the text files (the heart rate file is the only comma separated one).
    raw_motion = np.loadtxt(motion_user)
    raw_heart_rate = np.loadtxt(heart_rate_user, delimiter=',')
    label_samples = np.loadtxt(labels_user)

    # Keep only the part of each recording that crop_to_offset() selects for the labels.
    night_motion = crop_to_offset(raw_motion, label_samples)
    night_heart_rate = crop_to_offset(raw_heart_rate, label_samples)

    # Reduce the motion samples to their peak values.
    motion_peaks = get_peak_values(night_motion, interval)

    # Put the three sources together and wrap the result in a dataframe.
    return pd.DataFrame(
        merge_arrays(motion_peaks, night_heart_rate, label_samples),
        columns=labels_columns,
    )

The raw data recorded with the Apple Watch (motion and heart rate) contains continuous and uninterrupted measurements spanning one or more days, including the last night.

Since the data corresponding to the last night underwent a proper labelling from the PSG results, it is necessary to crop the raw data only to that night (i.e. the list with labels). Anything else, will not be part of the generated dataset and will therefore be disregarded.

This is handled by the function `crop_to_offset()`. This function carries out two tasks:

1. It finds the last night measured within the array passed.
2. For the last night, it finds the boundaries corresponding to the start and end of the labelled list.

Then, the function returns the array sliced at those boundaries.

In [6]:
def crop_to_offset(array_to_crop, array_ref):
    '''
    Slice `array_to_crop` using the first and last time stamps (column 0) of `array_ref`.

    The slice starts at the last row whose time stamp is strictly below the first reference
    time stamp (row 0 when there is none) and stops right after the last row, from there on,
    whose time stamp is strictly below the last reference time stamp. When no row lies below
    the last reference time stamp, the result is empty.
    '''

    row_count = np.size(array_to_crop, 0)

    # Time stamps delimiting the reference list.
    ref_begin = array_ref[0][0]
    ref_end = array_ref[-1][0]

    if row_count == 0:
        return array_to_crop[0:0]

    stamps = np.asarray(array_to_crop)[:, 0]

    # Lower boundary: last row recorded before the reference begins.
    rows_before_begin = np.flatnonzero(stamps < ref_begin)
    lower = rows_before_begin[-1] if rows_before_begin.size else 0

    # Upper boundary: one past the last row recorded before the reference ends. Only the
    # rows from the lower boundary onwards are inspected.
    rows_before_end = np.flatnonzero(stamps[lower:] < ref_end)
    upper = lower + rows_before_end[-1] + 1 if rows_before_end.size else 0

    return array_to_crop[lower:upper]

In order to compress the vast amount of data gathered from the IMU sensor, some pre-processing is required. Typically, these types of sensors record at high frequencies, resulting in hundreds of measurements every single second. Research suggests that changes among sleep cycles tend to occur gradually within a few seconds or even minutes. This also applies to the shift between NREM and REM, which is the variable of interest that is the focus of this application.

Therefore, the time resolution of the raw dataset is finer than needed, and most of the accelerometer data is redundant or irrelevant; the function `get_peak_values()` will help us separate the wheat from the chaff. All this function does is find the maximum values within a time interval (here 1 second). The number of maximum values is left to the user, with one maximum value per second as the default.

We consider these peak values to be the most significant part of the accelerometer data for extracting meaningful features to train a model.

In [17]:
def get_peak_values(array, interval=0):
    '''
    Reduce the motion samples to one row of peak values per time window.

    Parameters:
        - array: 2-D array whose column 0 is the time stamp and whose columns 1 to 3 are the
          x, y and z values. Rows are expected in recording order; rows of the same window
          that are not contiguous produce one output row per contiguous run.
        - interval: length of the window, in the unit of the time stamps. A value of 0 (or
          any non-positive value) selects the default window of 1.

    Returns:
        An array with the columns (window start, max |x|, max |y|, max |z|). The window start
        is a multiple of the window length, so time stamps do not repeat for ordered input.
        Windows without samples are not present in the result.
    '''

    samples = np.asarray(array, dtype=float)

    if samples.size == 0:
        return np.empty((0, 4))

    # A single sample may arrive as a 1-D array.
    samples = np.atleast_2d(samples)

    # The interval requested by the caller is honoured; only non-positive values fall back
    # to the default window.
    window = float(interval) if interval > 0 else 1.0

    # Number of the window every sample belongs to.
    window_ids = np.floor(samples[:, 0] / window)

    # Position of the first sample of each run of equal window numbers.
    run_starts = np.flatnonzero(np.r_[True, window_ids[1:] != window_ids[:-1]])

    # Largest absolute value of every axis within each run. Every sample takes part in
    # exactly one run, including the samples of the last window.
    peak_values = np.maximum.reduceat(np.abs(samples[:, 1:4]), run_starts, axis=0)

    window_times = window_ids[run_starts] * window

    return np.column_stack((window_times, peak_values))

Last step is merging the already pre-processed motion dataset with the heart rate and labels datasets corresponding to each user. 

For the heart rate dataset the following approach is taken. For each window from the motion time interval, gather all the heart rate values that were measured within that interval and get the mean value of all of them. If no heart rate value was measured within a window interval, the last known value will be taken. If there is a time gap bigger than the actual window's size, then discard that data up to the next available one.

For the labels, a simpler approach is taken. Since the time windows are always meant to be smaller than or equal to the time interval between recorded labels, the label value will be unique. It is either the one that corresponds to that time window or, if none falls within that interval, the last known one.

All the above-mentioned operations are carried out in the `merge_arrays()` method.

In [8]:
def _last_known_values(sample_times, series):
    '''
    For every time in `sample_times`, return the value (column 1) of the latest row of
    `series` whose time stamp (column 0) is not later than it. Times that precede the whole
    series take the value of its first row.
    '''
    series = np.asarray(series, dtype=float)

    if series.size == 0:
        raise ValueError("cannot merge with an empty series")

    # A single row may arrive as a 1-D array.
    series = np.atleast_2d(series)

    # Number of rows at or before each sample time, minus one, is the row to use.
    positions = np.searchsorted(series[:, 0], sample_times, side='right') - 1

    # Clamp to the first row: a position of -1 would silently pick the LAST row instead.
    return series[np.maximum(positions, 0), 1]


def merge_arrays(motion, heart_rate, labels):
    '''
    Append a heart rate column and a label column to the motion array.

    Parameters:
        - motion: 2-D array whose column 0 is the time stamp.
        - heart_rate: rows of (time stamp, heart rate).
        - labels: rows of (time stamp, label).

    Returns:
        `motion` with two extra columns: for every motion row, the last known heart rate and
        the last known label at that time stamp.

    Raises:
        ValueError: when `motion` is not 2-D or when `heart_rate` or `labels` is empty.

    The time stamps of `heart_rate` and `labels` are assumed to be in ascending order.
    '''

    motion = np.asarray(motion, dtype=float)

    if motion.ndim != 2:
        raise ValueError("motion must be a 2-D array with the time stamp in column 0")

    motion_times = motion[:, 0]

    new_heart_rate = _last_known_values(motion_times, heart_rate)
    new_labels = _last_known_values(motion_times, labels)

    return np.column_stack((motion, new_heart_rate, new_labels))

In [None]:
# user = 1

# motion_user = os.path.join(motion_path, motion_list[user])
# heart_rate_user = os.path.join(heart_rate_path, heart_rate_list[user])
# labels_user = os.path.join(labels_path, labels_list[user])
# labels_df = ['Time', 'X', 'Y', 'Z', 'Heart Rate', 'Labels']

# user_df = generate_dataset(motion_user, heart_rate_user, labels_user, labels_df)

# user_df

In [None]:
# if user_df["Time"].duplicated().any():
#     print("match")

## Running the application

In [21]:
# Generate the datasets of every user in `user_ids`. interval=0.25 is passed on to
# generate_dataset() and verbose=True prints one progress line per user.
generate_all_datasets(interval=0.25, verbose=True)

Dataset user id <1066528> generated succesfully. Time: 24 s
Dataset user id <1360686> generated succesfully. Time: 29 s
Dataset user id <1449548> generated succesfully. Time: 33 s
Dataset user id <1455390> generated succesfully. Time: 30 s
Dataset user id <1818471> generated succesfully. Time: 30 s
Dataset user id <2598705> generated succesfully. Time: 29 s
Dataset user id <2638030> generated succesfully. Time: 37 s
Dataset user id <3509524> generated succesfully. Time: 19 s
Dataset user id <3997827> generated succesfully. Time: 34 s
Dataset user id <4018081> generated succesfully. Time: 20 s
Dataset user id <4314139> generated succesfully. Time: 29 s
Dataset user id <4426783> generated succesfully. Time: 34 s
Dataset user id <46343> generated succesfully. Time: 18 s
Dataset user id <5132496> generated succesfully. Time: 21 s
Dataset user id <5383425> generated succesfully. Time: 6 s
Dataset user id <5498603> generated succesfully. Time: 24 s
Dataset user id <5797046> generated succesf

## Downloading the datasets

In [None]:
# Pack the /content/output folder recursively into the archive /content/datasets.zip.
!zip -r /content/datasets.zip /content/output

In [23]:
# Hand the archive over as a file download; this needs the google.colab module.
from google.colab import files
files.download("/content/datasets.zip")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>