In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import os

Steps:

- Download and unzip the dataset
- Load the files
- Pre-process the loaded files (crop to keep the part of interest)
- Merge the files of each user using a specific time window.
- Export the resulting file to `.csv`.
- Repeat the process for all the users.

In [2]:
def load_files(path=None, expected_users=31):
    '''
    List the per-user recording files found under a dataset directory.

    The dataset root must contain three sub-folders named "motion",
    "heart_rate" and "labels". The content of each one is listed and sorted
    by file name, so that the same index points at the same user in the three
    lists (this assumes every file name starts with the user id, as in
    '1066528_acceleration.txt').

    Parameters
    ----------
    path : str, optional
        Root directory of the dataset. When it is None or empty, the location
        that used to be hard-coded in this function is used instead.
    expected_users : int or None, optional
        Number of entries each sub-folder must contain (31 by default).
        Pass None to skip the check.

    Returns
    -------
    dict
        Keys "motion_path", "heart_rate_path" and "labels_path" hold the
        sub-folder paths; keys "motion_list", "heart_rate_list" and
        "labels_list" hold the sorted file names found in each of them.

    Raises
    ------
    AssertionError
        If a sub-folder does not contain `expected_users` entries.
    FileNotFoundError
        If one of the sub-folders does not exist (raised by os.listdir).
    '''
    # Preparing paths. The fallback is a raw string: the backslashes are
    # literal characters, not escape sequences.
    data_path = path if path else r"C:\dev\DATA\MRH"

    motion_path = os.path.join(data_path, "motion")
    heart_rate_path = os.path.join(data_path, "heart_rate")
    labels_path = os.path.join(data_path, "labels")

    # Obtaining a list with all users. os.listdir() returns entries in
    # arbitrary order, so each listing is sorted to keep the lists aligned.
    motion_list = sorted(os.listdir(motion_path))
    heart_rate_list = sorted(os.listdir(heart_rate_path))
    labels_list = sorted(os.listdir(labels_path))

    # Checking that we have data of all the users in all the lists created
    if expected_users is not None:
        assert len(motion_list) == expected_users, "Error, missing users in motion list"
        assert len(heart_rate_list) == expected_users, "Error, missing users in heart rate list"
        assert len(labels_list) == expected_users, "Error, missing users in labels list"

    return {
        "motion_path": motion_path,
        "heart_rate_path": heart_rate_path,
        "labels_path": labels_path,
        "motion_list": motion_list,
        "heart_rate_list": heart_rate_list,
        "labels_list": labels_list,
    }

In [3]:
# Show the first file name of each listing; the output below has the same
# user id in all three.
# NOTE(review): in the visible code these lists are locals of load_files(),
# which returns nothing -- confirm how they reach the notebook namespace.
motion_list[0], heart_rate_list[0], labels_list[0]

('1066528_acceleration.txt',
 '1066528_heartrate.txt',
 '1066528_labeled_sleep.txt')

In [98]:
# Load the first motion file into a NumPy array and display it.
# The output below shows four columns per row; column 0 is the time used by
# crop_to_offset() (the other three are presumably x/y/z acceleration --
# TODO confirm).
# NOTE(review): motion_path and motion_list are not defined at notebook level
# in the visible code.
user_1_motion = np.loadtxt(os.path.join(motion_path, motion_list[0]))

user_1_motion

array([[-2.16848465e+04,  7.08010000e-03,  6.40900000e-04,
        -9.87594600e-01],
       [-2.16848171e+04,  4.15040000e-03,  6.25600000e-04,
        -9.90554800e-01],
       [-2.16848079e+04,  4.15040000e-03,  1.11390000e-03,
        -9.90081800e-01],
       ...,
       [ 2.86265419e+04, -5.52734400e-01, -2.99988000e-02,
        -8.10440100e-01],
       [ 2.86265428e+04, -5.53710900e-01, -3.05023000e-02,
        -8.11431900e-01],
       [ 2.86265436e+04, -5.54718000e-01, -2.99988000e-02,
        -8.09021000e-01]])

In [99]:
# Load the first heart-rate file (comma-separated) into a NumPy array and
# display it. The output below shows two columns per row; column 0 is the
# time used by crop_to_offset() (column 1 is presumably the heart rate --
# TODO confirm units).
# NOTE(review): heart_rate_path and heart_rate_list are not defined at
# notebook level in the visible code.
user_1_heart_rate = np.loadtxt(os.path.join(heart_rate_path, heart_rate_list[0]), delimiter=',')

user_1_heart_rate

array([[-3.55241740e+05,  8.60000000e+01],
       [-3.51407999e+05,  6.70000000e+01],
       [-3.51277368e+05,  1.41000000e+02],
       ...,
       [ 2.91101643e+04,  7.50000000e+01],
       [ 3.43346538e+04,  8.10000000e+01],
       [ 3.44911535e+04,  6.50000000e+01]])

In [121]:
# Load the first labels file into a NumPy array and display it.
# The output below shows two columns per row, with column 0 advancing in
# steps of 30 (presumably seconds, with the sleep label in column 1 --
# TODO confirm).
# NOTE(review): labels_path and labels_list are not defined at notebook level
# in the visible code.
user_1_labels = np.loadtxt(os.path.join(labels_path, labels_list[0]))

user_1_labels

array([[    0.,     0.],
       [   30.,     0.],
       [   60.,     0.],
       ...,
       [28470.,     0.],
       [28500.,     0.],
       [28530.,     0.]])

In [124]:
def generate_dataset(motion, heart_rate, labels, interval=5):
    '''
    Crop the motion, heart-rate and label arrays to their initial offset and
    print how many rows each of them keeps.

    Every array goes through crop_to_offset(); the indexes it returns are
    removed along axis 0 with np.delete(). The cropped arrays only live
    inside this function and nothing is returned yet.

    Interval stands for the time in seconds of windowing. It is accepted for
    the merge step but is not used by the current implementation.
    '''

    cropped = []
    for stream in (motion, heart_rate, labels):
        # labels do not need to be cropped, but they go through the same
        # step just in case.
        rows_to_drop = crop_to_offset(stream)
        cropped.append(np.delete(stream, rows_to_drop, axis=0))

    # Row counts, printed in the order: motion, heart rate, labels.
    for stream in cropped:
        print(stream.shape[0])

The raw data recorded from the Apple Watch contains continuous measurements spanning one or more days, including the last night.

Only the data corresponding to the last night was properly labelled from the PSG results, so the raw data has to be cropped to that night (this applies to the motion data and the heart rate data).

The function `crop_to_offset()` handles this. It finds where the last night starts within the array passed, i.e. the last row whose time (column 0) is negative, and returns another array holding the indexes of every row up to and including that one, which are the rows that have to be removed from the original array. Next, the `np.delete()` function takes this returned array to perform the actual cropping.

In [118]:
def crop_to_offset(array):
    '''
    Return the row indexes that precede the zero offset of a recording.

    The function accepts a 2-D array-like whose column 0 holds the time and
    looks for the last row with a negative time. Every row from the start of
    the array up to and including that row is selected for removal.

    Parameters
    ----------
    array : array-like, shape (n_rows, n_columns)
        Recording to inspect. It is not modified.

    Returns
    -------
    numpy.ndarray
        np.arange(0, k), where k - 1 is the index of the last row whose time
        is negative. The result is empty when no time is negative or when
        the input has no elements. It can be passed directly to
        np.delete(array, ..., axis=0).
    '''
    rows = np.asarray(array)
    if rows.size == 0:
        # Nothing to inspect, hence nothing to crop.
        return np.arange(0, 0)

    # Vectorised replacement for walking the reversed array row by row:
    # positions of all negative times, in ascending order.
    negative_rows = np.flatnonzero(rows[:, 0] < 0)

    # The last negative time marks the end of the part to discard; the row
    # itself is discarded too, hence the + 1. int() keeps the dtype of the
    # returned range independent of the index dtype.
    index_to_crop = int(negative_rows[-1]) + 1 if negative_rows.size else 0

    return np.arange(0, index_to_crop)

In [120]:
# Crop the three arrays of user 1. The numbers printed below are the row
# counts left after cropping, in the order: motion, heart rate, labels.
generate_dataset(user_1_motion, user_1_heart_rate, user_1_labels)

1188331
4968
952


In [4]:
''' 
To do next: 
    change crop_to_offset to return the index values instead of an array.
    Crop to when the user finished sleeping??
    finish merge_lists()
    relate length of the smallest list to the other lists
    divide within interval of time. 
    
'''

' \nTo do next: \n    change crop_to_offset to return the index values instead of an array.\n    Crop to when the user finished sleeping??\n    finish merge_lists()\n    relate length of the smallest list to the other lists\n    divide within interval of time. \n    \n'