In [None]:
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

In [None]:
!sudo apt-get install wget

!wget -r -nv -N -c -np https://physionet.org/files/sleep-accel/1.0.0/

!mkdir ./dataset

!mv ./physionet.org/files/sleep-accel/1.0.0/* dataset

!rm -r ./physionet.org/

!find ./dataset -name "*.html" -type f -delete

In [40]:
import pandas as pd
import numpy as np
import os

Steps:

- Download and unzip the dataset
- Load the files
- Pre-process the loaded files (crop to keep the part of interest)
- Merge files from each user selecting a specific window time frame.
- Export the resulting file to `.csv`.
- Repeat the process for all the users.

In [3]:
# Preparing paths
# Raw string: the backslashes of the Windows path are literal characters,
# not (invalid) escape sequences such as "\d" or "\M".
data_path = r"C:\dev\DATA\MRH"
# data_path = os.path.join(os.getcwd(), "dataset/")

motion_path = os.path.join(data_path, "motion")

heart_rate_path = os.path.join(data_path, "heart_rate")

labels_path = os.path.join(data_path, "labels")

# Obtaining a list with all users.
# os.listdir() returns entries in arbitrary order, so every list is sorted:
# the cells below pair the files of one user by using the same index in all
# three lists, which only works when the three orders agree.
motion_list = sorted(os.listdir(motion_path))
heart_rate_list = sorted(os.listdir(heart_rate_path))
labels_list = sorted(os.listdir(labels_path))

# Checking that we have data of the 31 users in all the lists created
assert len(motion_list) == 31, "Error, missing users in motion list"
assert len(heart_rate_list) == 31, "Error, missing users in heart rate list"
assert len(labels_list) == 31, "Error, missing users in labels list"

In [4]:
# Show the first file name of each list to check that they belong to the same user.
motion_list[0], heart_rate_list[0], labels_list[0]

('1066528_acceleration.txt',
 '1066528_heartrate.txt',
 '1066528_labeled_sleep.txt')

In [48]:
# Load the first user's motion file (whitespace-separated, np.loadtxt default).
# Its four columns are named Time, X, Y, Z when the DataFrame is built later on.
user_1_motion = np.loadtxt(os.path.join(motion_path, motion_list[0]))

user_1_motion

array([[-2.16848465e+04,  7.08010000e-03,  6.40900000e-04,
        -9.87594600e-01],
       [-2.16848171e+04,  4.15040000e-03,  6.25600000e-04,
        -9.90554800e-01],
       [-2.16848079e+04,  4.15040000e-03,  1.11390000e-03,
        -9.90081800e-01],
       ...,
       [ 2.86265419e+04, -5.52734400e-01, -2.99988000e-02,
        -8.10440100e-01],
       [ 2.86265428e+04, -5.53710900e-01, -3.05023000e-02,
        -8.11431900e-01],
       [ 2.86265436e+04, -5.54718000e-01, -2.99988000e-02,
        -8.09021000e-01]])

In [7]:
# Load the first user's heart-rate file; unlike the other two files it is comma-separated.
user_1_heart_rate = np.loadtxt(os.path.join(heart_rate_path, heart_rate_list[0]), delimiter=',')

user_1_heart_rate

array([[-3.55241740e+05,  8.60000000e+01],
       [-3.51407999e+05,  6.70000000e+01],
       [-3.51277368e+05,  1.41000000e+02],
       ...,
       [ 2.91101643e+04,  7.50000000e+01],
       [ 3.43346538e+04,  8.10000000e+01],
       [ 3.44911535e+04,  6.50000000e+01]])

In [50]:
# Load the first user's labels file (whitespace-separated, np.loadtxt default).
user_1_labels = np.loadtxt(os.path.join(labels_path, labels_list[0]))

user_1_labels

array([[    0.,     0.],
       [   30.,     0.],
       [   60.,     0.],
       ...,
       [28470.,     0.],
       [28500.,     0.],
       [28530.,     0.]])

In [100]:
def generate_dataset(motion_user, heart_rate_user, labels_user, interval=5):
    '''
    Build one merged data frame for a single user from its three raw files.

    Parameters:
        motion_user: path of the motion file (whitespace-separated).
        heart_rate_user: path of the heart-rate file (comma-separated).
        labels_user: path of the labels file (whitespace-separated).
        interval: time in seconds of windowing. It is accepted for the planned
            windowing step but is not used by the current implementation.

    Returns:
        A pandas DataFrame with the columns 'Time', 'X', 'Y', 'Z' taken from the
        cropped motion array, plus 'Heart Rate' and 'Labels' as produced by
        extend_array() (rows without a matching value hold -999).

    The sizes of the cropped motion, cropped heart-rate and labels arrays are
    printed as a quick sanity check.
    '''
    motion = np.loadtxt(motion_user)
    heart_rate = np.loadtxt(heart_rate_user, delimiter=',')
    labels = np.loadtxt(labels_user)

    # Cropping motion and heart rate to match the labelled list.
    # "start - 1" keeps the sample right before the returned start index.
    # max() prevents a start of 0 from turning into index -1, which would make
    # the slice begin at the last element and drop almost all the data.
    (start, end) = crop_to_offset(motion, labels)
    motion = motion[max(start - 1, 0):end]

    (start, end) = crop_to_offset(heart_rate, labels)
    heart_rate = heart_rate[max(start - 1, 0):end]

    print(np.size(motion, 0))
    print(np.size(heart_rate, 0))
    print(np.size(labels, 0))

    # Extending smaller arrays to have the same size as the biggest array so as to be merged
    # It returns one dimensional array (time column skipped since it has been matched in the extending process)
    heart_rate = extend_array(heart_rate, motion)
    labels = extend_array(labels, motion)

    # Merging three arrays into one data frame
    data_frame = pd.DataFrame(motion, columns=['Time', 'X', 'Y', 'Z'])

    # Wrapping in a Series aligns the values on the row index, so a column that
    # is shorter than the frame is padded with NaN instead of raising an error.
    data_frame["Heart Rate"] = pd.Series(heart_rate)
    data_frame["Labels"] = pd.Series(labels)

    return data_frame

The raw data recorded with the Apple Watch (motion and heart rate) contains continuous and uninterrupted measurements of one or more days, including the last night.

Since only the data corresponding to the last night was properly labelled from the PSG results, the raw data must be cropped to that night (i.e. to the time span covered by the list of labels). Anything outside that span will not be part of the generated dataset and is therefore disregarded.

This is handled by the function `crop_to_offset()`. This function carries out two tasks:

1. It finds the last night measured within the array passed.
2. For the last night, it finds the boundaries corresponding to the start and end of the labelled list.

Then, the function returns the indexes where the array needs to be sliced.

In [103]:
def crop_to_offset(array_to_crop, array_ref):
    '''
    Locate where array_to_crop has to be sliced to match the time span of array_ref.

    Both arrays hold the time in their first column. array_to_crop is scanned
    from its last row backwards:
      - the end index is one past the first row met whose time is lower than
        the last time of array_ref;
      - the start index is one past the first row met whose time is lower than
        the first time of array_ref; the scan stops there.

    It returns a two-element tuple (start_index, end_index). An index stays 0
    when no row satisfies its condition.
    '''
    last_position = np.size(array_to_crop, 0) - 1

    # Time boundaries given by the reference (labelled) array.
    ref_begin = array_ref[0][0]
    ref_finish = array_ref[-1][0]

    lower = 0
    upper = 0
    upper_pending = True

    position = last_position
    while position >= 0:
        timestamp = array_to_crop[position][0]

        # The end boundary is fixed only once, at the first match from the back.
        if upper_pending and timestamp < ref_finish:
            upper = position + 1
            upper_pending = False

        # Reaching the start boundary finishes the search.
        if timestamp < ref_begin:
            lower = position + 1
            break

        position -= 1

    return (lower, upper)

In [102]:
def extend_array(array_to_extend, array_ref):
    '''
    Spread the values of array_to_extend over the rows of array_ref.

    Both arrays hold the time in their first column; array_to_extend holds the
    value of interest in its second column. The rows of array_ref are walked in
    order: whenever the time of a reference row is greater than the time of the
    next pending row of array_to_extend, that pending value is emitted for the
    reference row; otherwise -999 is emitted as the "missing value" marker.
    At most one value is consumed per reference row.

    It returns a new 1D array with exactly one entry per row of array_ref. The
    time column is skipped since it has been taken into account in the process.
    '''
    source_size = np.size(array_to_extend, 0)
    new_array = []
    count = 0

    # Every reference row is visited (including the last one) so that the
    # result has the same length as array_ref.
    for ref_row in array_ref:

        # "count < source_size" stops the lookup once all the values have been
        # placed; the remaining reference rows are then marked as missing.
        if count < source_size and ref_row[0] > array_to_extend[count][0]:
            new_array.append(array_to_extend[count][1])
            count += 1
        else:
            new_array.append(-999)  # For the "missing values" it appends -999.

    return np.array(new_array)

In [104]:
# Build the merged data frame for the first user of the lists.
df = generate_dataset(
    os.path.join(motion_path, motion_list[0]),
    os.path.join(heart_rate_path, heart_rate_list[0]),
    os.path.join(labels_path, labels_list[0]),
)

1187904
4965
952


In [121]:
# check Time: 25890.016651	27356.335354
# Last 50 rows whose label is greater than 0 (rows labelled 0 or -999 are left out).
df.loc[df["Labels"] > 0].tail(50)

Unnamed: 0,Time,X,Y,Z,Heart Rate,Labels
1150608,25530.003946,0.133011,-0.976974,0.162323,-999.0,2.0
1152107,25560.011476,0.132507,-0.975006,0.163269,-999.0,2.0
1153606,25590.017736,0.130524,-0.974991,0.165192,-999.0,2.0
1155104,25620.008041,0.133987,-0.975983,0.163788,-999.0,2.0
1156603,25650.013869,0.131546,-0.976486,0.161316,-999.0,2.0
1158101,25680.002878,0.133026,-0.976486,0.160858,-999.0,2.0
1159600,25710.008974,0.131531,-0.974518,0.163254,-999.0,2.0
1161099,25740.017955,0.132019,-0.974518,0.163254,-999.0,2.0
1162597,25770.002792,0.13298,-0.975479,0.165222,-999.0,2.0
1164096,25800.010575,0.130081,-0.977966,0.160812,-999.0,2.0


In [122]:
''' 
To do next:
    decision making: what to do with the missing inervals in the motion dataset
    divide within interval of time.
    
'''

' \nTo do next:\n    decision making: what to do with the missing inervals in the motion dataset\n    divide within interval of time.\n    \n'

###### Testing DataFrame

In [52]:
# Cropping
# "start - 1" keeps the sample right before the returned start index; max()
# prevents a start of 0 from turning into index -1, which would make the slice
# begin at the last element.

(start, end) = crop_to_offset(user_1_heart_rate, user_1_labels)
user_1_heart_rate = user_1_heart_rate[max(start - 1, 0):end]

(start, end) = crop_to_offset(user_1_motion, user_1_labels)
user_1_motion = user_1_motion[max(start - 1, 0):end]

In [78]:
# extending
# extend_array() already implements this matching loop, so it is reused here
# for both signals rather than keeping a second inline copy of the same logic.

expanded_heart_rate = extend_array(user_1_heart_rate, user_1_motion)
expanded_labels = extend_array(user_1_labels, user_1_motion)

Unnamed: 0,Time,X,Y,Z,heart rate,labels
0,-0.004037,0.404434,0.446549,-0.796829,50.0,-1.0
1,0.015948,0.403931,0.449005,-0.796860,-1.0,0.0
2,0.036006,0.403915,0.448029,-0.795395,-1.0,-1.0
3,0.055885,0.404907,0.446549,-0.795853,-1.0,-1.0
4,0.075883,0.408356,0.447525,-0.796768,-1.0,-1.0
...,...,...,...,...,...,...
1187899,28394.149736,-0.601166,-0.075180,-0.774841,-1.0,-1.0
1187900,28394.169696,-0.600189,-0.071228,-0.774857,-1.0,-1.0
1187901,28394.189770,-0.599213,-0.069260,-0.773880,-1.0,-1.0
1187902,28394.209753,-0.597260,-0.072205,-0.771393,-1.0,-1.0


In [None]:
# creating dataframe and adding hr and lb to dataframe as columns

d = pd.DataFrame(user_1_motion, columns=['Time', 'X', 'Y', 'Z'])

# The extending cell stores its results in expanded_heart_rate and
# expanded_labels; new_heart_rate / new_labels are not defined in the cells
# above, so using them fails with a NameError on a fresh kernel.
hr = pd.Series(expanded_heart_rate)
d["Heart Rate"] = hr
lb = pd.Series(expanded_labels)
d["Labels"] = lb

d

In [69]:
# Number of rows of the heart-rate and labels arrays.
user_1_heart_rate.shape[0], user_1_labels.shape[0]

4965

In [87]:
# Rows of the time window (25559, 25591) that carry a real label.
a = d[(d["Time"] > 25559) & (d["Time"] < 25591)]
# The column is created as "Labels" (capitalised); indexing with "labels" raises a KeyError.
a[a["Labels"] > -1]

Unnamed: 0,Time,X,Y,Z,heart rate,labels
1152107,25560.011476,0.132507,-0.975006,0.163269,-1.0,2.0
1153606,25590.017736,0.130524,-0.974991,0.165192,-1.0,2.0


## To do

- Get peak values for motion
- Get average 
- Parameter variables: peak interval, epoch interval, choice of median type