In [None]:
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

In [None]:
!sudo apt-get install wget

!wget -r -nv -N -c -np https://physionet.org/files/sleep-accel/1.0.0/

!mkdir ./dataset

!mv ./physionet.org/files/sleep-accel/1.0.0/* dataset

!rm -r ./physionet.org/

!find ./dataset -name "*.html" -type f -delete

Steps:

- Download and unzip the dataset
- Load the files
- Pre-process the loaded files (crop to keep the part of interest)
- Merge files from each user selecting a specific window time frame.
- Export the resulting file to `.csv`.
- Repeat the process for all the users.

In [None]:
import pandas as pd
import numpy as np
import os
import re

In [None]:
from enum import Enum

class Error(Enum):
    """Messages attached to the sanity-check assertions used throughout this notebook."""

    # One of the per-sensor file lists does not hold the expected number of users.
    match_number_users = "[Error]: number of users in list does not match"
    # Files at the same position of two lists belong to different users.
    match_user_id = "[Error]: user id does not match between lists"
    # Columns that are about to be stacked together have different lengths.
    match_length_arrays = "[Error]: the length of the lists does not match"
    # The running time counter drifted away from the timestamps of the data.
    match_index = "[Error]: indexes are mismatched"
    # Fallback message; not referenced by the code visible in this notebook.
    generic_error = "[Error]"

In [None]:
# Preparing paths
# Raw string: the backslashes are literal Windows path separators. The value is
# identical to the previous non-raw literal, without the invalid-escape warning.
# NOTE(review): this is a machine-specific absolute path; switch to the line
# below to use the ./dataset folder created by the download cell.
data_path = r"C:\dev\DATA\MRH"
# data_path = os.path.join(os.getcwd(), "dataset/")

motion_path = os.path.join(data_path, "motion")

heart_rate_path = os.path.join(data_path, "heart_rate")

labels_path = os.path.join(data_path, "labels")

# Obtaining ordered lists with all users
motion_list = sorted(os.listdir(motion_path))
heart_rate_list = sorted(os.listdir(heart_rate_path))
labels_list = sorted(os.listdir(labels_path))

# Checking that we have data of the 31 users in all the lists created
expected_users = 31
assert len(motion_list) == expected_users, Error.match_number_users.value
assert len(heart_rate_list) == expected_users, Error.match_number_users.value
assert len(labels_list) == expected_users, Error.match_number_users.value

# Checking that the user ids match in order across the three lists.
# r"\d+" anchored at the start of the name: the former "\d*" pattern also
# matched the empty string, so files without a numeric prefix compared equal.
for item in range(len(motion_list)):
    user_motion_id = re.match(r"\d+", motion_list[item])
    user_heart_rate_id = re.match(r"\d+", heart_rate_list[item])
    user_labels_id = re.match(r"\d+", labels_list[item])

    # Every file name must start with a numeric user id
    assert user_motion_id and user_heart_rate_id and user_labels_id, Error.match_user_id.value

    assert user_motion_id.group(0) == user_heart_rate_id.group(0), Error.match_user_id.value
    assert user_motion_id.group(0) == user_labels_id.group(0), Error.match_user_id.value

In [None]:
# Peek at the first entry of each sorted list; the three names should share the same user id.
motion_list[0], heart_rate_list[0], labels_list[0]

('1066528_acceleration.txt',
 '1066528_heartrate.txt',
 '1066528_labeled_sleep.txt')

In [None]:
# Load the first user's raw motion file (whitespace-separated, np.loadtxt's default).
# The rest of the notebook reads column 0 as the timestamp and columns 1-3 as X, Y, Z.
user_1_motion = np.loadtxt(os.path.join(motion_path, motion_list[0]))

user_1_motion

array([[-2.16848465e+04,  7.08010000e-03,  6.40900000e-04,
        -9.87594600e-01],
       [-2.16848171e+04,  4.15040000e-03,  6.25600000e-04,
        -9.90554800e-01],
       [-2.16848079e+04,  4.15040000e-03,  1.11390000e-03,
        -9.90081800e-01],
       ...,
       [ 2.86265419e+04, -5.52734400e-01, -2.99988000e-02,
        -8.10440100e-01],
       [ 2.86265428e+04, -5.53710900e-01, -3.05023000e-02,
        -8.11431900e-01],
       [ 2.86265436e+04, -5.54718000e-01, -2.99988000e-02,
        -8.09021000e-01]])

In [None]:
# Load the first user's raw heart-rate file; unlike the other two files it is comma-separated.
# Column 0 is read as the timestamp and column 1 as the heart-rate value.
user_1_heart_rate = np.loadtxt(os.path.join(heart_rate_path, heart_rate_list[0]), delimiter=',')

user_1_heart_rate

array([[-3.55241740e+05,  8.60000000e+01],
       [-3.51407999e+05,  6.70000000e+01],
       [-3.51277368e+05,  1.41000000e+02],
       ...,
       [ 2.91101643e+04,  7.50000000e+01],
       [ 3.43346538e+04,  8.10000000e+01],
       [ 3.44911535e+04,  6.50000000e+01]])

In [None]:
# Load the first user's labels file (whitespace-separated).
# Column 0 is read as the timestamp and column 1 as the label.
user_1_labels = np.loadtxt(os.path.join(labels_path, labels_list[0]))

user_1_labels

array([[    0.,     0.],
       [   30.,     0.],
       [   60.,     0.],
       ...,
       [28470.,     0.],
       [28500.,     0.],
       [28530.,     0.]])

In [None]:
def generate_dataset(motion_user, heart_rate_user, labels_user, interval_peak=1, interval_epoch=30):
    '''
    Build the merged data frame of a single user from its three raw text files.

    motion_user, heart_rate_user, labels_user: paths of the motion, heart rate
    and labels files of the same user.
    interval_peak: forwarded to get_summary_count() as its peak interval.
    interval_epoch: forwarded to get_summary_count() as its sum window and to
    merge_arrays() as the epoch length.

    Returns a pandas DataFrame with the columns
    'Time', 'X', 'Y', 'Z', 'Heart Rate' and 'Labels'.
    '''

    # --- Read the raw recordings (only the heart rate file is comma-separated)
    raw_motion = np.loadtxt(motion_user)
    raw_heart_rate = np.loadtxt(heart_rate_user, delimiter=',')
    labels = np.loadtxt(labels_user)

    # --- Keep only the part of each recording that the labels cover
    night_motion = crop_to_offset(raw_motion, labels)
    night_heart_rate = crop_to_offset(raw_heart_rate, labels)

    # --- Condense the motion samples into one row per epoch
    motion_epochs = get_summary_count(night_motion, interval_peak, interval_epoch)

    # --- Attach heart rate and label to every motion epoch
    column_names = ['Time', 'X', 'Y', 'Z', 'Heart Rate', 'Labels']
    return pd.DataFrame(
        merge_arrays(motion_epochs, night_heart_rate, labels, interval_epoch),
        columns=column_names,
    )

The raw data recorded with the Apple Watch (motion and heart rate) contains continuous and uninterrupted measurements of one or more days, including the last night.

Since only the data corresponding to the last night was properly labelled from the PSG results, it is necessary to crop the raw data to that night (i.e. to the time span covered by the list of labels). Anything else will not be part of the generated dataset and will therefore be disregarded.

This is handled by the function `crop_to_offset()`. This function carries out two tasks:

1. It finds the last night measured within the array passed.
2. For the last night, it finds the boundaries corresponding to the start and end of the labelled list.

Then, the function slices the array at the indexes found and returns the cropped array.

In [None]:
def crop_to_offset(array_to_crop, array_ref):
    '''
    Crop array_to_crop to the time span covered by array_ref.

    Both arguments are indexed as 2-D arrays whose first column is a timestamp.
    The boundaries are the first and the last timestamp of array_ref.

    The returned slice
      * starts at the last row whose timestamp is lower than the first reference
        timestamp (so one row from before the reference start is kept), or at
        row 0 when there is no such row;
      * ends right after the last row whose timestamp is lower than the last
        reference timestamp; it is empty when there is no such row.

    Raises IndexError when array_ref is empty.
    '''

    # --- Find the boundaries corresponding to the labelled list
    first_item = array_ref[0][0]
    last_item = array_ref[-1][0]

    # Nothing to search in: hand back an empty slice of the input
    if np.size(array_to_crop, 0) == 0:
        return array_to_crop[0:0]

    # Vectorised search instead of a Python loop over every row; no assumption
    # is made about the timestamps being sorted.
    timestamps = np.asarray(array_to_crop)[:, 0]

    # find end index (exclusive)
    rows_before_end = np.flatnonzero(timestamps < last_item)
    end_index = rows_before_end[-1] + 1 if rows_before_end.size else 0

    # find start index (inclusive)
    rows_before_start = np.flatnonzero(timestamps < first_item)
    start_index = rows_before_start[-1] if rows_before_start.size else 0

    return array_to_crop[start_index:end_index]

In order to compress the vast amount of data gathered from the IMU sensor, some pre-processing is required. Typically, these types of sensors record at high frequencies, resulting in hundreds of measurements every single second. Research suggests that changes among sleep cycles tend to occur gradually within a few minutes. This also applies to the shift between NREM and REM, which is the variable of interest and the focus of this application.

Therefore, the time resolution of the raw dataset is finer than needed, and the function `get_summary_count()` handles this. The function first finds the peak values within a user-defined time interval (we are most interested in the peak values of the accelerometer, which contribute the most valuable information) and then sums all the peaks that take place within a window of time (epoch), also defined by the user.

For example, the function might find the peak that occurs in every second and then sum all the peaks found within a window of 15 seconds. This process is then repeated until the end of the recording.

In [None]:
def get_summary_count(array, peak_interval, sum_window):
    '''
    Condense a motion recording into one row per window.

    array: 2-D array with the columns [time, x, y, z].
    peak_interval: length (s) of the interval over which the largest absolute
    value of each axis is kept.
    sum_window: number of consecutive peak values that are summed into one row.

    The function first finds the peak value for every peak_interval (s) of the
    passed array. It then sums sum_window consecutive peak values and adds the
    sum to a new array. When two samples are more than 1.5 * peak_interval
    apart, the unfinished window is discarded and counting restarts after the gap.

    Returns an array with the columns [time, sum x, sum y, sum z], where time
    is the running time counter at the end of each window.

    Raises AssertionError when the running time counter drifts 3 s or more
    away from the timestamps of the data.

    NOTE(review): the time counter advances by sum_window per row, which equals
    seconds only when peak_interval is 1. The sample that closes a peak
    interval is not folded into any peak. Both behaviours are kept as they were.
    '''

    array_size = np.size(array, 0)
    peak_values_x, sum_peak_values_x = [], []
    peak_values_y, sum_peak_values_y = [], []
    peak_values_z, sum_peak_values_z = [], []

    # Running peaks of the current interval; never negative
    max_value_x, max_value_y, max_value_z = 0, 0, 0
    time_accumulate = []
    accumulate = 0       # running time counter
    last_interval = 0    # start time of the current peak interval
    count = 0            # peaks gathered for the current window

    for item in range(array_size):
        timestamp = array[item][0]
        elapsed = timestamp - last_interval

        if elapsed < peak_interval:
            # Still inside the current peak interval: track the largest magnitudes
            if abs(array[item][1]) > max_value_x:  # New peak value found at x
                max_value_x = abs(array[item][1])

            if abs(array[item][2]) > max_value_y:  # New peak value found at y
                max_value_y = abs(array[item][2])

            if abs(array[item][3]) > max_value_z:  # New peak value found at z
                max_value_z = abs(array[item][3])

        # Gap found, do not continue with current window
        elif elapsed > (peak_interval * 1.5):
            # Time of the sample right before the gap. For the very first row
            # there is no previous sample (item - 1 would wrap around to the
            # last row), so the gap is measured from the initial interval start.
            previous_time = array[item - 1][0] if item > 0 else last_interval

            # Move the time counter over the unfinished window and the gap
            accumulate = np.around(
                accumulate + (np.around(timestamp, decimals=0) - previous_time) + count,
                decimals=0)

            # reset for a new window time. Current unfinished window invalid.
            peak_values_x = []
            peak_values_y = []
            peak_values_z = []
            count = 0

            last_interval = np.floor(timestamp)
            max_value_x = 0
            max_value_y = 0
            max_value_z = 0

        else:
            # end of peak interval
            peak_values_x.append(max_value_x)
            peak_values_y.append(max_value_y)
            peak_values_z.append(max_value_z)

            # reset interval values and increment count
            last_interval = np.floor(timestamp)
            max_value_x = 0
            max_value_y = 0
            max_value_z = 0
            count += 1

        if count == sum_window:
            sum_peak_values_x.append(np.sum(peak_values_x))
            sum_peak_values_y.append(np.sum(peak_values_y))
            sum_peak_values_z.append(np.sum(peak_values_z))

            accumulate = np.around(accumulate + sum_window, decimals=0)

            # Ensure that the current accumulated time matches the arrays' time
            assert abs(accumulate - np.around(timestamp)) < 3, Error.match_index.value

            time_accumulate.append(accumulate)

            # reset for a new window time
            peak_values_x = []
            peak_values_y = []
            peak_values_z = []
            count = 0

    assert len(time_accumulate) == len(sum_peak_values_x), Error.match_length_arrays.value
    assert len(time_accumulate) == len(sum_peak_values_y), Error.match_length_arrays.value
    assert len(time_accumulate) == len(sum_peak_values_z), Error.match_length_arrays.value

    return np.column_stack((time_accumulate, sum_peak_values_x, sum_peak_values_y, sum_peak_values_z))

In [None]:
# user_1_motion = crop_to_offset(user_1_motion, user_1_labels)
# user_1_heart_rate = crop_to_offset(user_1_heart_rate, user_1_labels)

# fname = "cropped_" + motion_list[0]
# np.savetxt(fname, user_1_motion, fmt='%.18e', delimiter=' ', newline='\n', header='', footer='', comments='# ', encoding=None)

# from google.colab import files
# files.download('cropped_1066528_acceleration.txt') 

In [None]:
# Summarise user 1's motion: peak value per 1 s interval, 15 peaks summed per row.
# Result columns: [time, sum x, sum y, sum z].
# Note: user_1_motion is the full recording here, since the cropping cell above is commented out.
summary = get_summary_count(user_1_motion, 1, 15)

summary

array([[1.50000000e+01, 9.80928050e+00, 1.11329802e+01, 1.47610627e+01],
       [3.00000000e+01, 6.11120610e+00, 6.92994680e+00, 1.19629060e+01],
       [4.50000000e+01, 6.09844980e+00, 6.90870630e+00, 1.19256594e+01],
       ...,
       [2.77820000e+04, 4.15765380e+00, 1.43766631e+01, 7.25236530e+00],
       [2.83730000e+04, 3.75987240e+00, 1.45188444e+01, 4.36595140e+00],
       [2.83880000e+04, 8.03883330e+00, 1.27173921e+01, 7.98275750e+00]])

In [None]:
# List every pair of consecutive summary rows whose times are more than 15 s apart.
SIZE = np.size(summary, 0)
for i in range(1, SIZE):
    window_before, window_after = summary[i-1][0], summary[i][0]
    if window_after - window_before > 15:
        print(f"INTERVAL at [{window_before}, {window_after}], diff: {window_after - window_before}")

INTERVAL at [11265.0, 11309.0], diff: 44.0
INTERVAL at [11369.0, 11629.0], diff: 260.0
INTERVAL at [11704.0, 11770.0], diff: 66.0
INTERVAL at [11770.0, 11810.0], diff: 40.0
INTERVAL at [11870.0, 12190.0], diff: 320.0
INTERVAL at [12190.0, 12230.0], diff: 40.0
INTERVAL at [12290.0, 12330.0], diff: 40.0
INTERVAL at [12390.0, 12450.0], diff: 60.0
INTERVAL at [12450.0, 12490.0], diff: 40.0
INTERVAL at [12505.0, 12631.0], diff: 126.0
INTERVAL at [12631.0, 12752.0], diff: 121.0
INTERVAL at [12752.0, 12812.0], diff: 60.0
INTERVAL at [12812.0, 12872.0], diff: 60.0
INTERVAL at [12872.0, 12932.0], diff: 60.0
INTERVAL at [12932.0, 13052.0], diff: 120.0
INTERVAL at [13052.0, 13152.0], diff: 100.0
INTERVAL at [13152.0, 13432.0], diff: 280.0
INTERVAL at [13432.0, 13492.0], diff: 60.0
INTERVAL at [13507.0, 13552.0], diff: 45.0
INTERVAL at [13567.0, 13612.0], diff: 45.0
INTERVAL at [13627.0, 13812.0], diff: 185.0
INTERVAL at [13812.0, 13872.0], diff: 60.0
INTERVAL at [13887.0, 13932.0], diff: 45.0
INT

Last step is merging the already pre-processed motion dataset with the heart rate and labels datasets corresponding to each user. 

For the heart rate dataset the following approach is taken. For each window from the motion time interval, gather all the heart rate values that were measured within that interval and get the mean value of all of them. If no heart rate value was measured within a window interval, the last known value will be taken. If there is a time gap bigger than the actual window's size, then discard that data up to the next available one.

For the labels, a simpler approach is taken. Since the time windows are meant to be always smaller than or equal to the time interval between recorded labels, each window gets a single label value. It is either the label that corresponds to that time window or, if none falls in that interval, the last known one.

All the above-mentioned operations are carried out in the `merge_arrays()` method.

In [None]:
def merge_arrays(motion, heart_rate, labels, interval_epoch):
    '''
    Attach one heart rate value and one label to every row of motion.

    motion: 2-D array whose first column is the end time of each window.
    heart_rate: 2-D array with the columns [time, heart rate].
    labels: 2-D array with the columns [time, label].
    interval_epoch: window length (s); a larger step between two consecutive
    motion rows is treated as a gap.

    Heart rate: the samples recorded before the end of a window are averaged.
    After a gap in motion, or when no sample was gathered yet, the window gets
    the value of the sample that closes it. Motion rows at the end for which
    no heart rate value could be produced are dropped.

    Labels: every row gets the first label whose time is not lower than the
    row's time, or the last label once the labels are exhausted.

    Returns the motion columns followed by the heart rate and label columns.
    '''

    # --- Heart Rate pre-processing

    motion_size = np.size(motion, 0)
    heart_rate_size = np.size(heart_rate, 0)
    new_heart_rate = []
    heart_rate_acc = []

    inc = 0

    for i in range(heart_rate_size):
        # Every window already has a value: the remaining samples lie beyond motion
        if inc >= motion_size:
            break

        if heart_rate[i][0] < motion[inc][0]:
            heart_rate_acc.append(heart_rate[i][1])
        else:
            # If time interval is bigger than the window time, take the latest value of HR
            gap_before_window = inc > 0 and (motion[inc][0] - motion[inc - 1][0] > interval_epoch)

            # Same fallback when nothing was gathered, which would otherwise divide by zero
            if gap_before_window or not heart_rate_acc:
                heart_rate_acc = [heart_rate[i][1]]

            # Append the mean of all values recorded
            new_heart_rate.append(sum(heart_rate_acc) / len(heart_rate_acc))

            heart_rate_acc = [heart_rate[i][1]]  # include first item that gave the condition too
            inc += 1

    # Removing exceeding data of motion that is not available in heart rate.
    if len(new_heart_rate) < motion_size:
        motion = motion[:len(new_heart_rate)]

    # --- Labels pre-processing

    motion_size = np.size(motion, 0)
    labels_size = np.size(labels, 0)
    new_labels = []

    inc = 0

    for j in range(motion_size):
        # Advance as far as needed, not just one step: after a gap in motion a
        # single step per row leaves the labels lagging behind the data.
        # The bound keeps the last label once the list is exhausted.
        while inc < labels_size - 1 and motion[j][0] > labels[inc][0]:
            inc += 1

        new_labels.append(labels[inc][1])

    assert np.size(motion, 0) == len(new_heart_rate), Error.match_length_arrays.value
    assert np.size(motion, 0) == len(new_labels), Error.match_length_arrays.value

    return np.column_stack((motion, new_heart_rate, new_labels))

In [None]:
# Merge user 1's motion summary with the heart rate and labels, using 15 as the epoch length,
# and wrap the result in a data frame for inspection.
arr = merge_arrays(summary, user_1_heart_rate, user_1_labels, 15)

df = pd.DataFrame(arr, columns=['Time', 'X', 'Y', 'Z', 'Heart Rate', 'Labels'])

df

Unnamed: 0,Time,X,Y,Z,Heart Rate,Labels
0,15.0,6.129440,6.767105,12.008164,51.250000,0.0
1,30.0,6.111206,6.929947,11.962906,52.666667,0.0
2,45.0,6.098450,6.908706,11.925659,52.333333,0.0
3,60.0,6.101441,6.911728,11.935013,49.333333,0.0
4,75.0,6.093109,6.915634,11.933121,51.000000,0.0
...,...,...,...,...,...,...
1554,26057.0,0.599274,1.208099,14.854935,68.000000,0.0
1555,26072.0,4.737259,8.527146,12.957062,68.000000,0.0
1556,27724.0,5.197281,13.699905,5.316589,60.000000,0.0
1557,27739.0,5.081131,13.254791,5.919617,60.000000,0.0


In [None]:
import time

# perf_counter() is a monotonic clock meant for measuring elapsed time;
# time.time() can jump when the system clock is adjusted.
start = time.perf_counter()

# Full pipeline for the first user: 1 s peak interval, 15 peaks per epoch.
user_1 = generate_dataset(os.path.join(motion_path, motion_list[0]),
                          os.path.join(heart_rate_path, heart_rate_list[0]),
                          os.path.join(labels_path, labels_list[0]),
                          interval_peak=1, interval_epoch=15)

print('Time to generate dataset: {:.0f} s'.format(time.perf_counter() - start))

user_1

Time to generate dataset: 36 s


Unnamed: 0,Time,X,Y,Z,Heart Rate,Labels
0,15.0,6.129440,6.767105,12.008164,51.250000,0.0
1,30.0,6.111206,6.929947,11.962906,52.666667,0.0
2,45.0,6.098450,6.908706,11.925659,52.333333,0.0
3,60.0,6.101441,6.911728,11.935013,49.333333,0.0
4,75.0,6.093109,6.915634,11.933121,51.000000,0.0
...,...,...,...,...,...,...
1554,26057.0,0.599274,1.208099,14.854935,68.000000,0.0
1555,26072.0,4.737259,8.527146,12.957062,68.000000,0.0
1556,27724.0,5.197281,13.699905,5.316589,60.000000,0.0
1557,27739.0,5.081131,13.254791,5.919617,60.000000,0.0


###### Saving sample

In [None]:
# Leading digits of the file name are the user id.
# Raw string so that \d reaches the regex engine as written, without an invalid-escape warning.
user_id = re.search(r"\d*", motion_list[0])

fname = 'dataset_' + user_id.group(0) + ".csv"

# Written to the current working directory, without the index column.
user_1.to_csv(fname, index = False, header=True)

###### Testing DataFrame

In [None]:
# Keep only the rows whose label is greater than 4.
# NOTE(review): presumably this selects the REM epochs (label 5) — confirm against the dataset's label encoding.
user_1[user_1["Labels"] > 4]

Unnamed: 0,Time,X,Y,Z,Heart Rate,Labels
458,6885.0,1.330994,2.418594,14.649231,58.333333,5.0
459,6900.0,1.325516,2.420059,14.650604,56.333333,5.0
460,6915.0,1.330429,2.420959,14.650238,57.000000,5.0
461,6930.0,1.328049,2.418518,14.646271,63.333333,5.0
462,6945.0,1.328033,2.411621,14.656220,65.666667,5.0
...,...,...,...,...,...,...
1480,24947.0,6.986115,1.834229,13.185440,65.000000,5.0
1481,24962.0,6.983322,1.860351,13.181610,61.000000,5.0
1482,24977.0,6.977661,1.855469,13.182022,63.000000,5.0
1483,24992.0,6.976059,1.863800,13.195205,67.500000,5.0
