In [1]:
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

In [2]:
# from google.colab import drive
# drive.mount('/content/drive')

In [None]:
# Download the sleep-accel dataset from PhysioNet into ./dataset.
# -y keeps apt-get from waiting on a confirmation prompt the notebook cannot answer.
!sudo apt-get install -y wget

# -r recursive, -N only fetch files newer than local copies, -c resume partial
# downloads, -np never ascend to the parent directory.
!wget -r -N -c -np https://physionet.org/files/sleep-accel/1.0.0/

# -p makes the cell re-runnable: no error if ./dataset already exists.
!mkdir -p ./dataset

!mv ./physionet.org/files/sleep-accel/1.0.0/* dataset

!rm -r ./physionet.org/

# Drop the index.html pages that the recursive download leaves behind.
!find ./dataset -name "*.html" -type f -delete

Reading package lists... Done
Building dependency tree       
Reading state information... Done
wget is already the newest version (1.19.4-1ubuntu2.2).
The following package was automatically installed and is no longer required:
  libnvidia-common-460
Use 'sudo apt autoremove' to remove it.
0 upgraded, 0 newly installed, 0 to remove and 34 not upgraded.
--2021-05-25 07:53:46--  https://physionet.org/files/sleep-accel/1.0.0/
Resolving physionet.org (physionet.org)... 18.18.42.54
Connecting to physionet.org (physionet.org)|18.18.42.54|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified [text/html]
Saving to: ‘physionet.org/files/sleep-accel/1.0.0/index.html’

physionet.org/files     [ <=>                ]     925  --.-KB/s    in 0s      

Last-modified header missing -- time-stamps turned off.
2021-05-25 07:53:47 (132 MB/s) - ‘physionet.org/files/sleep-accel/1.0.0/index.html’ saved [925]

Loading robots.txt; please ignore errors.
--2021-05-25 07:53:47-- 

In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import os

Steps:

- Download and unzip the dataset
- Load the files
- Pre-process the loaded files (crop to keep the part of interest)
- Merge files from each user selecting a specific window time frame.
- Export the resulting file to `.csv`.
- Repeat the process for all the users.

In [4]:
# Preparing paths
# Raw string: the backslashes of the Windows path must not be read as escape
# sequences ("\d", "\D" and "\M" are invalid escapes in a normal string literal).
data_path = r"C:\dev\DATA\MRH"
# data_path = os.path.join(os.getcwd(), "physionet.org/files/sleep-accel/1.0.0/")
# data_path = os.path.join(os.getcwd(), "dataset/")

motion_path = os.path.join(data_path, "motion")

heart_rate_path = os.path.join(data_path, "heart_rate")

labels_path = os.path.join(data_path, "labels")

# Obtaining a list with all users.
# os.listdir() returns entries in arbitrary order, so each listing is sorted to
# make index i refer to the same user in the three lists.
motion_list = sorted(os.listdir(motion_path))
heart_rate_list = sorted(os.listdir(heart_rate_path))
labels_list = sorted(os.listdir(labels_path))

# Checking that we have data of the 31 users in all the lists created
assert len(motion_list) == 31, "Error, missing users in motion list"
assert len(heart_rate_list) == 31, "Error, missing users in heart rate list"
assert len(labels_list) == 31, "Error, missing users in labels list"

# Checking that files at the same position share the same user id
# (the part of the file name before the first underscore).
assert all(
    motion_file.split("_")[0] == heart_rate_file.split("_")[0] == labels_file.split("_")[0]
    for motion_file, heart_rate_file, labels_file in zip(motion_list, heart_rate_list, labels_list)
), "Error, the three lists are not aligned by user"

In [5]:
# Show the first file name of each listing side by side.
tuple(file_names[0] for file_names in (motion_list, heart_rate_list, labels_list))

('1066528_acceleration.txt',
 '1066528_heartrate.txt',
 '1066528_labeled_sleep.txt')

In [6]:
# Load the first user's acceleration file (whitespace-delimited) into a NumPy array.
user_1_motion = np.loadtxt(fname=os.path.join(motion_path, motion_list[0]))

user_1_motion

array([[-2.16848465e+04,  7.08010000e-03,  6.40900000e-04,
        -9.87594600e-01],
       [-2.16848171e+04,  4.15040000e-03,  6.25600000e-04,
        -9.90554800e-01],
       [-2.16848079e+04,  4.15040000e-03,  1.11390000e-03,
        -9.90081800e-01],
       ...,
       [ 2.86265419e+04, -5.52734400e-01, -2.99988000e-02,
        -8.10440100e-01],
       [ 2.86265428e+04, -5.53710900e-01, -3.05023000e-02,
        -8.11431900e-01],
       [ 2.86265436e+04, -5.54718000e-01, -2.99988000e-02,
        -8.09021000e-01]])

In [27]:
# Load the first user's heart-rate file; unlike the other two files it is comma-delimited.
user_1_heart_rate = np.loadtxt(
    fname=os.path.join(heart_rate_path, heart_rate_list[0]),
    delimiter=',',
)

user_1_heart_rate

array([[-3.55241740e+05,  8.60000000e+01],
       [-3.51407999e+05,  6.70000000e+01],
       [-3.51277368e+05,  1.41000000e+02],
       ...,
       [ 2.91101643e+04,  7.50000000e+01],
       [ 3.43346538e+04,  8.10000000e+01],
       [ 3.44911535e+04,  6.50000000e+01]])

In [28]:
# Load the first user's labelled-sleep file (whitespace-delimited) into a NumPy array.
user_1_labels = np.loadtxt(fname=os.path.join(labels_path, labels_list[0]))

user_1_labels

array([[    0.,     0.],
       [   30.,     0.],
       [   60.,     0.],
       ...,
       [28470.,     0.],
       [28500.,     0.],
       [28530.,     0.]])

In [66]:
def generate_dataset(motion_user, heart_rate_user, labels_user, interval=5):
    '''
    Load the three recordings of one user and crop the raw ones to the labelled span.

    Parameters:
     - motion_user: path of the user's acceleration file (whitespace-delimited).
     - heart_rate_user: path of the user's heart rate file (comma-delimited).
     - labels_user: path of the user's labelled sleep file (whitespace-delimited).
     - interval: time in seconds of windowing. Not used yet, since the merge
       step is still to be implemented.

    It currently prints the number of rows of the cropped motion array, the
    cropped heart rate array and the labels array, and returns None.
    '''
    motion = np.loadtxt(motion_user)
    heart_rate = np.loadtxt(heart_rate_user, delimiter=',')
    labels = np.loadtxt(labels_user)

    # Cropping the raw arrays to match the labelled list.
    # crop_to_offset() returns as start the index right after the last row with a
    # negative time, so start - 1 also keeps that last row. When no row has a
    # negative time start is 0, and start - 1 would be -1 (i.e. the slice would
    # begin at the last row of the array); the lower bound is therefore clamped at 0.
    (start, end) = crop_to_offset(motion, labels)
    motion = motion[max(start - 1, 0):end]

    (start, end) = crop_to_offset(heart_rate, labels)
    heart_rate = heart_rate[max(start - 1, 0):end]

    print(np.size(motion, 0))
    print(np.size(heart_rate, 0))
    print(np.size(labels, 0))

    # Merging three lists into one dataset (to be implemented)

The raw data recorded with the Apple Watch (motion and heart rate) contains continuous and uninterrupted measurements spanning one or more days, including the last night.

Since only the data corresponding to the last night was labelled from the PSG results, the raw data must be cropped to that night (i.e. to the time span covered by the list of labels). Anything outside that span is not part of the generated dataset and is therefore disregarded.

This is handled by the function `crop_to_offset()`. This function carries out two tasks:

1. It finds the last night measured within the array passed.
2. For the last night, it finds the boundaries corresponding to the start and end of the labelled list.

Then, the function returns the indexes where the array needs to be sliced.

In [67]:
def crop_to_offset(array_to_crop, reference):
    '''
    This function finds where a raw recording has to be sliced to match the labelled list.

    Column 0 of both arrays is treated as time. The recording is examined from its
    end backwards, down to (and including) the last row whose time is negative:
     - start_index is the index right after the last row with a negative time
       (0 when no row has a negative time).
     - end_index is the index right after the last examined row whose time is
       strictly lower than the time of the last row of reference (0 when there
       is no such row).

    Returns the tuple (start_index, end_index), both as Python ints.
    '''

    array_size = np.size(array_to_crop, 0)
    last_item = reference[-1]

    # Nothing to scan: same result as a loop that never runs.
    if array_size == 0:
        return (0, 0)

    times = np.asarray(array_to_crop)[:, 0]
    last_labelled_time = last_item[0]

    # --- Find the last row recorded before time 0
    negative_rows = np.flatnonzero(times < 0)
    if negative_rows.size:
        last_negative = int(negative_rows[-1])
        start_index = last_negative + 1
    else:
        last_negative = 0
        start_index = 0

    # --- Find the boundary corresponding to the end of the labelled list.
    # Only rows from the last negative one onwards are examined, so that earlier
    # parts of the recording cannot provide the end boundary.
    rows_before_last_label = np.flatnonzero(times[last_negative:] < last_labelled_time)
    if rows_before_last_label.size:
        end_index = last_negative + int(rows_before_last_label[-1]) + 1
    else:
        end_index = 0

    return (start_index, end_index)

In [68]:
# Run the pipeline for the first user. generate_dataset() expects the paths of
# the three files, not pre-loaded arrays or bare file names.
generate_dataset(
    motion_user=os.path.join(motion_path, motion_list[0]),
    heart_rate_user=os.path.join(heart_rate_path, heart_rate_list[0]),
    labels_user=os.path.join(labels_path, labels_list[0]),
)

1187904
4965
952


In [None]:
# To do next:
#     finish merge_lists()
#     relate length of the smallest list to the other lists
#     divide within interval of time.

' \nTo do next: \n    change crop_to_offset to return the index values instead of an array.\n    Crop to when the user finished sleeping??\n    finish merge_lists()\n    relate length of the smallest list to the other lists\n    divide within interval of time. \n    \n'