In [1]:
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

In [2]:
# from google.colab import drive
# drive.mount('/content/drive')

## Steps

- Download and unzip the dataset into its corresponding directory.
- Load the `.txt` files.
- Pre-process the loaded files in accordance with some pre-defined requirements.
- Merge processed files from each user into one dataframe.
- Export the resulting dataframe to `.csv`.
- Repeat the process for all the users.

## Dataset download
Downloading the dataset from `https://physionet.org` and storing it locally in a folder called `download/`

**Note:** If the dataset has been downloaded, do not run the following cell.

You can download the dataset manually by going to the [website](https://www.physionet.org/content/sleep-accel/1.0.0/).

In [3]:
# Make sure wget is available (-y: never block on a confirmation prompt).
!sudo apt-get install -y wget

# Mirror the dataset folder recursively (-np: never ascend to the parent directory).
!wget -r -nv -N -c -np https://physionet.org/files/sleep-accel/1.0.0/

# -p: do not fail if the folder is already there (e.g. the cell is run twice).
!mkdir -p ./download

!mv ./physionet.org/files/sleep-accel/1.0.0/* download

!rm -r ./physionet.org/

# Drop the index.html pages that wget stores for every directory listing.
!find ./download -name "*.html" -type f -delete

!mkdir -p ./output

Reading package lists... Done
Building dependency tree       
Reading state information... Done
wget is already the newest version (1.19.4-1ubuntu2.2).
0 upgraded, 0 newly installed, 0 to remove and 39 not upgraded.
Last-modified header missing -- time-stamps turned off.
2021-06-26 14:17:44 URL:https://physionet.org/files/sleep-accel/1.0.0/ [925] -> "physionet.org/files/sleep-accel/1.0.0/index.html" [1]
2021-06-26 14:17:44 URL:https://physionet.org/robots.txt [22/22] -> "physionet.org/robots.txt" [1]
Last-modified header missing -- time-stamps turned off.
2021-06-26 14:17:44 URL:https://physionet.org/files/sleep-accel/1.0.0/heart_rate/ [4162] -> "physionet.org/files/sleep-accel/1.0.0/heart_rate/index.html" [1]
Last-modified header missing -- time-stamps turned off.
2021-06-26 14:17:44 URL:https://physionet.org/files/sleep-accel/1.0.0/labels/ [4278] -> "physionet.org/files/sleep-accel/1.0.0/labels/index.html" [1]
Last-modified header missing -- time-stamps turned off.
2021-06-26 14:17:4

In [4]:
import pandas as pd
import numpy as np
import os
import re
import time
import datetime
from enum import Enum

In [5]:
class Error(Enum):
    '''
    Catalogue of the error messages used by the notebook.

    Every member holds the text prefix of one kind of failure. The prefixes
    are used directly as assertion messages (through `.value`) or combined
    with a detail by `raise_error()`.
    '''

    # Output directory already contains files.
    dir_not_empty = "[Error]: the directory is not empty "
    # A list of user files does not have the expected number of entries.
    match_number_users = "[Error]: number of users in list does not match "
    # The same position in two lists refers to different users.
    match_user_id = "[Error]: user id does not match between lists "
    # Two sequences that must be aligned have different lengths.
    match_length_arrays = "[Error]: the length of the lists does not match "
    # Accumulated time and array time have drifted apart.
    match_index = "[Error]: indexes are mismatched "
    # Fallback prefix.
    generic_error = "[Error] "

    @staticmethod
    def raise_error(type_error, value):
        '''
        Build the full message for `type_error` followed by `value`.

        Despite its name this helper does not raise anything: it only returns
        the formatted text, which the caller passes to an exception or assert.
        '''
        message_parts = (type_error.value, ": ", str(value))
        return "".join(message_parts)


## Layout and paths preparation

The directory tree should resemble the following layout structure:

```bash
dataset/
├── download/
│   ├── heart_rate/
│   ├── labels/
│   ├── motion/
│   └── steps/
└── output/
    ├── dataset_<user_id_1>.csv
    ├── dataset_<user_id_2>.csv
    ⁞
    └── dataset_<user_id_31>.csv
```

In [6]:
# Output path.
output_dir = 'output'

output_path = os.path.join(os.getcwd(), output_dir)

# Download path.
data_path = os.path.join(os.getcwd(), 'download')

motion_path = os.path.join(data_path, 'motion')
heart_rate_path = os.path.join(data_path, 'heart_rate')
labels_path = os.path.join(data_path, 'labels')

# Obtaining lists with all users in ascending order.
# Note: the file names are sorted as text, not by numeric user id.
motion_list = sorted(os.listdir(motion_path))
heart_rate_list = sorted(os.listdir(heart_rate_path))
labels_list = sorted(os.listdir(labels_path))

# Checking that the downloaded data contains the 31 users in all the lists created
assert len(motion_list) == 31, Error.match_number_users.value
assert len(heart_rate_list) == 31, Error.match_number_users.value
assert len(labels_list) == 31, Error.match_number_users.value

user_ids = []

# Checking that the user ids match in order across the three lists.
# The id is the run of digits at the very start of each file name. The pattern
# is a raw string because "\d" in a plain string literal is an invalid escape
# sequence that Python warns about.
for motion_file, heart_rate_file, labels_file in zip(motion_list, heart_rate_list, labels_list):
    motion_id = re.search(r"\d*", motion_file).group(0)
    heart_rate_id = re.search(r"\d*", heart_rate_file).group(0)
    labels_id = re.search(r"\d*", labels_file).group(0)

    assert motion_id == heart_rate_id, Error.match_user_id.value
    assert motion_id == labels_id, Error.match_user_id.value

    user_ids.append(motion_id)

## Creation of generate datasets function

In [7]:
def generate_all_datasets(motion_interval=1, epoch_interval=15, labels_df=None, verbose=False):
    '''
    Generate the dataset of every user and export each one as a .csv file in the output directory.

    This function takes a total of four parameters:
        - motion_interval: the interval of time (s) from which retrieve a peak value
        - epoch_interval: the interval or window of time (s) that the generated dataset will have between instances.
        - labels_df: a list with the name of the columns of the generated datasets (6 labels). If not specified or less/more than 6 labels given, it takes default values.
        - verbose: if set to true, display some feedback of the process (the whole process might take several minutes).

    If the output directory already contains files, the user is asked whether to
    overwrite them. Any answer other than "y" raises an Exception.
    '''

    # Create the output directory if it is missing, so a fresh checkout does not fail on listdir().
    os.makedirs(output_path, exist_ok=True)

    # Check and clean the very same directory the files are written to
    # (output_path), not a path relative to the current working directory.
    existing_files = os.listdir(output_path)

    if existing_files:
        print(f"The folder {output_dir}/ is not empty")
        input_val = input("Do you want to overwrite it? [y/n] \n")

        if input_val == "y":
            for file in existing_files:
                os.remove(os.path.join(output_path, file))
        else:
            raise Exception(Error.raise_error(Error.dir_not_empty,  output_dir))

    # Fall back to the default column names when none, or the wrong number, are given.
    if labels_df is None or len(labels_df) != 6:
        labels_df = ['Time', 'X', 'Y', 'Z', 'Heart Rate', 'Labels']

    time_acc = []

    # The three file lists and user_ids are index-aligned (checked when they were built).
    for user_id, motion_file, heart_rate_file, labels_file in zip(user_ids, motion_list, heart_rate_list, labels_list):

        start = time.time()

        user_df = generate_dataset(os.path.join(motion_path, motion_file),
                    os.path.join(heart_rate_path, heart_rate_file),
                    os.path.join(labels_path, labels_file),
                    motion_interval, epoch_interval, labels_df)

        # Saving the generated dataset
        fname = os.path.join(output_path, 'dataset_' + user_id + ".csv")

        user_df.to_csv(fname, index = False, header=True)

        elapsed = time.time() - start
        time_acc.append(elapsed)

        if verbose:
            print('Dataset user id <{}> generated succesfully. Time: {:.0f} s'.format(user_id, elapsed))


    print(f'\nProcess complete! Total time execution: {datetime.timedelta(seconds=sum(time_acc))}\n')

In [8]:
def generate_dataset(motion_user, heart_rate_user, labels_user, interval_peak, epoch_interval, labels_columns):
    '''
    Build the dataset of a single user.

    Parameters:
        - motion_user: path of the motion .txt file (whitespace separated).
        - heart_rate_user: path of the heart rate .txt file (comma separated).
        - labels_user: path of the labels .txt file (whitespace separated).
        - interval_peak: interval of time (s) passed to get_summary_count() as peak interval.
        - epoch_interval: window of time (s) passed to get_summary_count() and merge_arrays().
        - labels_columns: column names of the resulting dataframe.

    Returns a pandas DataFrame with the merged motion, heart rate and labels data.
    '''

    # 1. Read the three raw files of the user.
    motion_raw = np.loadtxt(motion_user)
    heart_rate_raw = np.loadtxt(heart_rate_user, delimiter=',')
    labels_raw = np.loadtxt(labels_user)

    # 2. Keep only the part of the recordings delimited by the labels.
    motion_cropped = crop_to_offset(motion_raw, labels_raw)
    heart_rate_cropped = crop_to_offset(heart_rate_raw, labels_raw)

    # 3. Summarise the motion data, then merge it with heart rate and labels.
    motion_summary = get_summary_count(motion_cropped, interval_peak, epoch_interval)
    merged = merge_arrays(motion_summary, heart_rate_cropped, labels_raw, epoch_interval)

    return pd.DataFrame(merged, columns=labels_columns)

The raw data recorded with the Apple Watch (motion and heart rate) contains continuous and uninterrupted measurements of one or more days, including the last night.

Since only the data corresponding to the last night was properly labelled from the PSG results, it is necessary to crop the raw data to that night (i.e. to the time span covered by the list of labels). Anything else will not be part of the generated dataset and is therefore disregarded.

This is handled by the function `crop_to_offset()`. This function carries out two tasks:

1. It finds the last night measured within the array passed.
2. For the last night, it finds the boundaries corresponding to the start and end of the labelled list.

Then, the function slices the array at those indexes and returns the cropped array.

In [9]:
def crop_to_offset(array_to_crop, array_ref):
    '''
    Crop `array_to_crop` to the time span covered by `array_ref`.

    Both arguments are sequences of rows whose first element is a time value.
    The rows are scanned from the end towards the beginning:
        - the slice ends right after the last row whose time is strictly below the last time of the reference;
        - the slice starts at the first row found (scanning backwards) whose time is strictly below the
          first time of the reference, or at row 0 if there is none.

    It returns the resulting slice of `array_to_crop`.
    '''

    n_rows = np.size(array_to_crop, 0)

    # Time boundaries given by the reference (labelled) list.
    ref_begin = array_ref[0][0]
    ref_finish = array_ref[-1][0]

    lower, upper = 0, 0
    upper_pending = True

    for idx in reversed(range(n_rows)):
        stamp = array_to_crop[idx][0]

        # Upper boundary: only the first hit (closest to the end) counts.
        if upper_pending and stamp < ref_finish:
            upper = idx + 1
            upper_pending = False

        # Lower boundary: once found, nothing before it is of interest.
        if stamp < ref_begin:
            lower = idx
            break

    return array_to_crop[lower:upper]

In order to compress the vast amount of data gathered from the IMU sensor, some pre-processing is required. Typically, these types of sensors record at high frequencies, resulting in hundreds of measurements every single second. Research suggests that changes among sleep cycles tend to occur gradually within a few minutes. This also applies to the shift between NREM and REM, which is the variable of interest in this application.

Therefore, the time resolution of the raw dataset is finer than needed, and the function `get_summary_count()` handles this. The function first finds the peak values within a user-defined time interval (we are most interested in the peak values from the accelerometer, as they carry the most valuable information) and then sums all the peaks that take place within a window of time (epoch), also defined by the user.

For example, the function finds the peak value that occurs in every second and then sums all the peaks found within a window of 15 seconds. This process is then repeated until the end of the data.

In [10]:
def get_summary_count(array, peak_interval, sum_window):
    '''
    Summarise the motion rows (time, x, y, z) into windows.

    For every axis, the largest absolute value seen during each peak_interval (s) is kept.
    Once sum_window peak intervals have been closed, the peaks of each axis are summed and
    stored together with the accumulated time of the window.
    A jump in time larger than 1.5 * peak_interval discards the window in progress and
    restarts the accumulated time at the (rounded) time of the row where the jump was found.

    It returns an array with one row per completed window: (time, sum x, sum y, sum z).
    '''

    THRESHOLD = 2
    N_AXES = 3  # x, y, z are stored in columns 1, 2, 3 of every row

    n_rows = np.size(array, 0)

    # Per-axis state, indexed 0/1/2 for x/y/z.
    running_peak = [0, 0, 0]          # peak of the interval in progress
    window_peaks = [[], [], []]       # peaks of the intervals closed in the current window
    window_sums = [[], [], []]        # one sum per completed window

    epoch_times = []
    clock = 0            # accumulated time of the windows
    interval_start = 0   # time at which the interval in progress started
    n_closed = 0         # intervals closed in the current window

    for row in range(n_rows):
        stamp = array[row][0]
        elapsed = stamp - interval_start

        if elapsed < peak_interval:
            # Still inside the interval: update the peak of every axis.
            for axis in range(N_AXES):
                magnitude = abs(array[row][axis + 1])
                if magnitude > abs(running_peak[axis]):
                    running_peak[axis] = magnitude

        elif elapsed > (peak_interval * 1.5):
            # Gap found: the unfinished window is no longer valid, start over from here.
            clock = np.around(stamp, decimals=0)
            window_peaks = [[], [], []]
            running_peak = [0, 0, 0]
            n_closed = 0
            interval_start = np.floor(stamp)

        else:
            # End of the peak interval: store the peaks and open a new interval.
            for axis in range(N_AXES):
                window_peaks[axis].append(running_peak[axis])

            interval_start = np.floor(stamp)
            running_peak = [0, 0, 0]
            n_closed += 1

        if n_closed == sum_window:
            # Window complete: store the sum of its peaks and its time.
            for axis in range(N_AXES):
                window_sums[axis].append(np.sum(window_peaks[axis]))

            clock = np.around(clock + sum_window, decimals=0)

            # The accumulated time must stay close to the time found in the array.
            drift = abs(clock - np.around(stamp))
            assert drift < THRESHOLD, Error.raise_error(Error.match_index, drift)

            epoch_times.append(clock)

            # Get ready for the next window.
            window_peaks = [[], [], []]
            n_closed = 0

    for axis_sums in window_sums:
        assert len(epoch_times) == len(axis_sums), Error.match_length_arrays.value

    return np.column_stack((epoch_times, window_sums[0], window_sums[1], window_sums[2]))

The last step is merging the already pre-processed motion dataset with the heart rate and labels datasets corresponding to each user.

For the heart rate dataset the following approach is taken. For each window from the motion time interval, gather all the heart rate values that were measured within that interval and get the mean value of all of them. If no heart rate value was measured within a window interval, the last known value will be taken. If there is a time gap bigger than the actual window's size, then discard that data up to the next available one.

For the labels, a simpler approach is taken. Since the time windows are meant to always be smaller than or equal to the time interval between recorded labels, the label value will be unique. It is either the one that corresponds to that time window or, if none falls within that interval, the last known one.

All the above-mentioned operations are carried out in the `merge_arrays()` method.

In [11]:
def merge_arrays(motion, heart_rate, labels, interval_epoch):
    '''
    Merge the windowed motion data with one heart rate value and one label per window.

    Parameters:
        - motion: rows whose first element is the time of the window (as produced by get_summary_count()).
        - heart_rate: rows of (time, heart rate value).
        - labels: rows of (time, label).
        - interval_epoch: size (s) of the window; a bigger distance between two consecutive
          motion rows is treated as a gap.

    It returns an array with the motion rows (truncated to the windows for which a heart rate
    value could be computed) followed by two extra columns: heart rate and label.
    '''

    # --- Heart Rate pre-processing

    heart_rate_size = np.size(heart_rate, 0)
    motion_size = np.size(motion, 0)

    new_heart_rate = []
    heart_rate_acc = []

    inc = 0

    # NOTE(review): every heart rate sample closes at most ONE motion window. If the heart rate
    # samples are sparser than the windows, the values look like they drift away from the window
    # they belong to - confirm against the data before relying on small epoch intervals.
    for i in range(heart_rate_size):

        # If the motion dataset finishes (or is empty) but there is still heart rate dataset,
        # dismiss the rest of the latter.
        if inc == motion_size:
            break

        if heart_rate[i][0] < motion[inc][0]:
            heart_rate_acc.append(heart_rate[i][1])
        else:
            # If time interval is bigger than the window time, take the latest value of HR.
            # Only checked from the second window on: window 0 has no previous window, and
            # motion[inc - 1] would silently wrap around to the last row.
            gap_found = inc > 0 and motion[inc][0] - motion[inc - 1][0] > interval_epoch

            # No sample was measured before the first window closed: use the current
            # one instead of dividing by zero.
            if gap_found or not heart_rate_acc:
                heart_rate_acc = [heart_rate[i][1]]

            new_heart_rate.append(sum(heart_rate_acc)/len(heart_rate_acc))

            heart_rate_acc = [heart_rate[i][1]] # include first item that gave the condition too
            inc += 1

    # Removing exceeding data of motion that is not available in heart rate.
    if len(new_heart_rate) < motion_size:
        motion = motion[:len(new_heart_rate)]


    # --- Labels pre-processing

    motion_size = np.size(motion, 0)  # motion_size might be new if deletion was needed before
    last_label = np.size(labels, 0) - 1
    new_labels = []

    inc = 0

    for j in range(motion_size):
        # Move on to the next label, but never past the last one: windows beyond
        # the last label time keep the last known label.
        if inc < last_label and motion[j][0] > labels[inc][0]:
            inc += 1

        new_labels.append(labels[inc][1])


    assert np.size(motion, 0) == len(new_heart_rate), Error.match_length_arrays.value
    assert np.size(motion, 0) == len(new_labels), Error.match_length_arrays.value

    return np.column_stack((motion, new_heart_rate, new_labels))

## Running the application

In [12]:
# Generate and export the dataset of every user: peaks taken every 1 s and summed over
# windows of 1 s. If the output folder is not empty, the function asks before overwriting.
generate_all_datasets(motion_interval=1, epoch_interval=1, verbose=True)

Dataset user id <1066528> generated succesfully. Time: 15 s
Dataset user id <1360686> generated succesfully. Time: 17 s
Dataset user id <1449548> generated succesfully. Time: 20 s
Dataset user id <1455390> generated succesfully. Time: 18 s
Dataset user id <1818471> generated succesfully. Time: 18 s
Dataset user id <2598705> generated succesfully. Time: 18 s
Dataset user id <2638030> generated succesfully. Time: 22 s
Dataset user id <3509524> generated succesfully. Time: 12 s
Dataset user id <3997827> generated succesfully. Time: 20 s
Dataset user id <4018081> generated succesfully. Time: 12 s
Dataset user id <4314139> generated succesfully. Time: 17 s
Dataset user id <4426783> generated succesfully. Time: 20 s
Dataset user id <46343> generated succesfully. Time: 11 s
Dataset user id <5132496> generated succesfully. Time: 13 s
Dataset user id <5383425> generated succesfully. Time: 4 s
Dataset user id <5498603> generated succesfully. Time: 15 s
Dataset user id <5797046> generated succesf

## Downloading the datasets

In [13]:
# Pack the generated .csv files into a single archive. The absolute /content paths are
# hard-coded; the download cell below expects this exact archive name.
!zip -r /content/datsets.zip /content/output

  adding: content/output/ (stored 0%)
  adding: content/output/dataset_4018081.csv (deflated 74%)
  adding: content/output/dataset_46343.csv (deflated 75%)
  adding: content/output/dataset_2638030.csv (deflated 72%)
  adding: content/output/dataset_781756.csv (deflated 75%)
  adding: content/output/dataset_1360686.csv (deflated 79%)
  adding: content/output/dataset_5797046.csv (deflated 76%)
  adding: content/output/dataset_7749105.csv (deflated 71%)
  adding: content/output/dataset_5383425.csv (deflated 77%)
  adding: content/output/dataset_8692923.csv (deflated 77%)
  adding: content/output/dataset_1455390.csv (deflated 75%)
  adding: content/output/dataset_8258170.csv (deflated 74%)
  adding: content/output/dataset_1818471.csv (deflated 76%)
  adding: content/output/dataset_5498603.csv (deflated 79%)
  adding: content/output/dataset_1449548.csv (deflated 71%)
  adding: content/output/dataset_3509524.csv (deflated 76%)
  adding: content/output/dataset_844359.csv (deflated 77%)
  addi

In [14]:
# Colab-only helper: send the archive created by the previous cell to the browser.
from google.colab import files
files.download("/content/datsets.zip")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>