In [None]:
import pandas as pd
import scipy
import numpy as np
from warnings import warn

Multi file is best for the "when are you exercising" problem (segmentation)

In [None]:
# Location of the raw .mat file, relative to this notebook.
matfile = "../../data/raw/exercise_data.50.0000_multionly.mat"

# squeeze_me drops singleton dimensions; struct_as_record=False returns MATLAB
# structs as attribute-accessible objects rather than numpy record arrays.
mat_contents = scipy.io.loadmat(
    matfile,
    struct_as_record=False,
    squeeze_me=True,
)

All the data exists in `subject_data`<br>
Exercise labels and label groups exist in `exerciseConstants`<br>
Sample rate for all data appears to be `Fs=50` Hz

In [None]:
# list the top-level entries of the dict returned by loadmat
mat_contents.keys()

Within `exerciseConstants`, there are 2 fields<br>
`activities` is the raw list of all possible labels<br>
`usefulActivityGroupings` is a 13x2 ndarray in the form of `[group_name, [array of labels for this group]]`

In [None]:
# show the first 10 raw activity labels and the first row of the label groupings
exercise_constants = mat_contents["exerciseConstants"]
exercise_constants.activities[:10], exercise_constants.usefulActivityGroupings[0,:]

There are 94 subjects' worth of data

In [None]:
# how many subjects are there? (the shape of the top-level `subject_data` array)
mat_contents['subject_data'].shape

Within each subject, there is either a single `scipy.io.matlab._mio5_params.mat_struct` or an array of them

In [None]:
# Peek at the data structure of the first seven subjects only.
for subj_data in mat_contents['subject_data'][:7]:
    print(subj_data)

Looks like all `scipy.io.matlab._mio5_params.mat_struct` data has identical fieldnames, so we only need to gather the field names once.

In [None]:
# are field names identical across subjects and arrays?
def check_for_identical_fieldnames(first_fieldnames, next_fieldnames, level="assert", msg=1, ix=None, jx=0):
    """Verify that two sequences hold equal values in the same order.

    Parameters
    ----------
    first_fieldnames : sequence
        Reference values.
    next_fieldnames : sequence
        Values compared element-by-element against the reference.
    level : str
        ``"assert"`` raises an ``AssertionError`` on mismatch; any other value
        emits a warning instead.
    msg : int or str
        1-5 selects one of the predefined messages below; any other value is
        used verbatim as the message.
    ix, jx :
        Subject index and array index; only interpolated into the predefined
        messages.

    Raises
    ------
    AssertionError
        If ``level == "assert"`` and the sequences differ in length or content.
    """
    predefined = {
        1: (
            f"Not all fieldnames match between subject_index {ix} and first subject. "
            f"First inconsistent index is {jx}"
        ),
        2: (
            f"Not all activities data match for subject_index {ix}. First inconsistent"
            f" index is {jx}"
        ),
        3: (
            f"MasterFileToken does not match MasterToken for subject_index {ix}. First "
            f"inconsistent index is {jx}"
        ),
        4: (
            f"MasterFileTokens & MasterTokens for subject_index {ix} do not match across "
            f"arrays. First inconsistent index is {jx}"
        ),
        5: (
            f"Not all instanceIndex's match for subject_index {ix}. First inconsistent "
            f"index is {jx}"
        ),
    }
    # Equality (not a dict lookup) so that custom, possibly unhashable, messages
    # simply fall through unchanged.
    for code, text in predefined.items():
        if msg == code:
            msg = text
            break

    reference = list(first_fieldnames)
    candidate = list(next_fieldnames)
    # zip() stops at the shorter input, so the lengths must be compared
    # explicitly; otherwise a missing or extra trailing item goes unnoticed.
    check = len(reference) == len(candidate) and all(
        f == n for f, n in zip(reference, candidate)
    )
    if check:
        return

    if level == "assert":
        # Raised explicitly so the check is not stripped when running with -O.
        raise AssertionError(msg)
    warn(msg)
    return

# Reference field names come from the first struct of the first subject; a subject
# holds either a single mat_struct or an ndarray of them, so handle both shapes.
first_subject = mat_contents['subject_data'][0]
if isinstance(first_subject, np.ndarray):
    first_fieldnames = first_subject[0]._fieldnames
else:
    first_fieldnames = first_subject._fieldnames

for ix, subj_data in enumerate(mat_contents['subject_data']):
    if isinstance(subj_data, np.ndarray):
        for jx, data in enumerate(subj_data):
            next_fieldnames = data._fieldnames
            check_for_identical_fieldnames(first_fieldnames, next_fieldnames, ix=ix, jx=jx)
    else:
        next_fieldnames = subj_data._fieldnames
        # pass ix so that a failure names the offending subject (jx defaults to 0)
        check_for_identical_fieldnames(first_fieldnames, next_fieldnames, ix=ix)

print("Field names for all mat_structs are the same")

Each array reflects a separate workout instance, with a varying number of activities

In [None]:
# Subject 0 holds an array of mat_structs rather than a single one. Print a
# one-paragraph summary of every struct in that array to see how they differ.
for ix, struct in enumerate(mat_contents["subject_data"][0]):
    accel = struct.data.accelDataMatrix
    summary = (
        f"File {struct.fileIndex}; Subject {struct.subjectID}; "
        f"Activity {struct.activityIndex}: {struct.activityName}; Instance "
        f"{struct.instanceIndex}; Master File Token: {struct.masterFileToken}, "
        f"Master Token {struct.masterToken}; Data shape: {accel.shape}; "
        f"Time_Start: {accel[0,0]}, Time_End: {accel[-1,0]}"
    )
    # one print call, one line per item, followed by a blank separator line
    print(
        f"Array {ix}",
        summary,
        f"N_Activities={struct.activityStartMatrix.shape[0]}",
        "",
        sep="\n",
    )

For subjects with more than one data array, it appears that `activityIndex` and `activityName` are all identical, so we only need to look at these fields once.

In [None]:
# Within each subject that has an array of structs, compare every struct's
# (activityIndex, activityName) pair against the pair of the first struct.
for ix, subj_data in enumerate(mat_contents['subject_data']):
    if not isinstance(subj_data, np.ndarray):
        # a lone struct has nothing to be compared against
        continue
    firsts = (subj_data[0].activityIndex, subj_data[0].activityName)
    for jx, data in enumerate(subj_data):
        check_for_identical_fieldnames(
            firsts,
            (data.activityIndex, data.activityName),
            msg=2,
            ix=ix,
            jx=jx,
        )
print("Activities data for all subjects with arrays of data are the same")

Moreover, all `activityIndex`s and `activityName`s are identical across all subjects and arrays. Considering how generic these are, they can be ignored entirely.

In [None]:
# what are all of the possible `activityIndex`s and `activityName`s?
# Only the first struct of each subject is inspected here.
all_activity_ixs = []
all_activity_names = []
for ix, subj_data in enumerate(mat_contents['subject_data']):
    # Read from the current subject's own data; a subject is either an ndarray
    # of structs or a single struct.
    first_struct = subj_data[0] if isinstance(subj_data, np.ndarray) else subj_data
    all_activity_ixs.append(first_struct.activityIndex)
    all_activity_names.append(first_struct.activityName)

print(set(all_activity_ixs))
print(set(all_activity_names))

It appears that `masterFileToken` and `masterToken` reflect the same values within a subject, regardless of how many data arrays (i.e., number of separate workout instances) a subject completed. So for each subject, we only need to look at the first `masterToken`.

In [None]:
# 1) within every struct, masterFileToken must equal masterToken (assert)
# 2) across the structs of one subject, the token pair should match the first
#    struct's pair (only warn, so all differences get reported)
for ix, subj_data in enumerate(mat_contents['subject_data']):
    if isinstance(subj_data, np.ndarray):
        firsts = subj_data[0].masterFileToken, subj_data[0].masterToken
        for jx, data in enumerate(subj_data):
            nexts = data.masterFileToken, data.masterToken
            file_tok, master_tok = nexts
            check_for_identical_fieldnames([file_tok], [master_tok], msg=3, ix=ix, jx=jx)
            check_for_identical_fieldnames(firsts, nexts, level="warn", msg=4, ix=ix, jx=jx)
    else:
        nexts = subj_data.masterFileToken, subj_data.masterToken
        file_tok, master_tok = nexts
        # a lone struct is reported as array index 0
        check_for_identical_fieldnames([file_tok], [master_tok], msg=3, ix=ix, jx=0)
print("MasterFileTokens and masterTokens match for all arrays within a subject")

Moreover, it appears that all data comes from subjects' right arms, so we can ignore this field entirely as well.

In [None]:
# Gather one masterToken per subject (taken from the first struct when the
# subject holds an array of structs) and show the distinct values.
all_master_tokens = [
    (subj_data[0] if isinstance(subj_data, np.ndarray) else subj_data).masterToken
    for subj_data in mat_contents['subject_data']
]

print(set(all_master_tokens))


Finally, for subjects with an array of data, all `instanceIndex`s are identical, so we can ignore these as well.

In [None]:
# Within each subject that has an array of structs, compare every struct's
# instanceIndex against the instanceIndex of the first struct.
for ix, subj_data in enumerate(mat_contents['subject_data']):
    if not isinstance(subj_data, np.ndarray):
        # a lone struct has nothing to be compared against
        continue
    first_ix = [subj_data[0].instanceIndex]
    for jx, data in enumerate(subj_data):
        check_for_identical_fieldnames(
            first_ix, [data.instanceIndex], msg=5, ix=ix, jx=jx
        )
print("Instance index data for all subjects with arrays of data are the same")

This code snippet below runs through one array from one subject's data and aligns the activity labels with the time axis of the accelerometer data

In [None]:
# Prototype of the labelling step, using the first array of the first subject:
# every accelerometer time stamp gets the name of the activity whose
# [start, end] interval contains it (samples outside all intervals stay None).
subj0_data = mat_contents['subject_data'][0]
first_run = subj0_data[0]
time = first_run.data.accelDataMatrix[:,0]
max_t = time[-1]
activity_array = np.empty_like(time, dtype="object")
for row in first_run.activityStartMatrix:
    activity_name, t_s, t_e = row[0], row[1], row[2]

    # clamp the interval to the range [0, last time stamp]
    t_s = max(t_s, 0)
    t_e = min(t_e, max_t)
    activity_array[np.logical_and(time >= t_s, time <= t_e)] = activity_name


Let's just check that the time axis for the accelerometer is the same as that for the gyroscope

In [None]:
time_g = subj0_data[0].data.gyroDataMatrix[:,0]
# np.array_equal also fails when the two axes differ in length; a zip-based
# element comparison would silently ignore the extra samples.
assert np.array_equal(time, time_g), "Time axes don't match"
print("Time axes match")

Finally, we want to put subject data into a dataframe format and then save it as a parquet or csv file for further exploration. Files will reflect one data array for one subject, and file names will be of the form: `fileID_subjID_dataID` where `fileID` is the value in the `fileIndex` field, `subjID` is the value in the `subjectID` field, and `dataID` is the data array index representing the exercise run (0 by default for subjects with only one data structure).

In [None]:
# Put one array of subject data into a dataframe. Columns, in order:
#   time, file_id, subject_id, data_id  -> identifiers (data_id is the index of
#                                          the struct within the subject's data)
#   accel_x, accel_y, accel_z           -> columns 1-3 of accelDataMatrix
#   gyro_x, gyro_y, gyro_z              -> columns 1-3 of gyroDataMatrix
#   label                               -> activity name per sample
data_ix = 0
run = subj0_data[data_ix]
accel, gyro = run.data.accelDataMatrix, run.data.gyroDataMatrix
time = accel[:,0]
f_ix, s_ix = run.fileIndex, run.subjectID

df_s0_d0 = pd.DataFrame()
for column, value in (
    ("time", time),
    ("file_id", f_ix),
    ("subject_id", s_ix),
    ("data_id", data_ix),
):
    df_s0_d0[column] = value

# column 0 of each sensor matrix is time; columns 1-3 are the x/y/z axes
for col_ix, axis in enumerate("xyz", start=1):
    df_s0_d0[f"accel_{axis}"] = accel[:,col_ix]
for col_ix, axis in enumerate("xyz", start=1):
    df_s0_d0[f"gyro_{axis}"] = gyro[:,col_ix]

max_t = time[-1]
activity_array = np.empty_like(time, dtype="object")
for row in run.activityStartMatrix:
    activity_name, t_s, t_e = row[0], row[1], row[2]

    # clamp the interval to the range [0, last time stamp]
    t_s = max(t_s, 0)
    t_e = min(t_e, max_t)
    activity_array[np.logical_and(time >= t_s, time <= t_e)] = activity_name

df_s0_d0["label"] = activity_array

In [None]:
# Write the frame to disk; the file name encodes file, subject and data array ids.
output_file = (
    "../../data/interim/raw/"
    f"fileID{f_ix}_subjID{s_ix}_dataID{data_ix}.parquet"
)
df_s0_d0.to_parquet(output_file, engine='fastparquet')

In [None]:
def _label_samples(time, activity_start_matrix):
    """Return an object array with one activity name (or None) per time stamp."""
    labels = np.empty_like(time, dtype="object")

    # The .mat file is loaded with squeeze_me=True, so a matrix holding a single
    # activity row arrives as a 1-D array; restore the (n_rows, n_cols) layout
    # so the loop below always iterates over rows.
    rows = np.atleast_2d(activity_start_matrix)
    if len(time) == 0 or rows.size == 0:
        return labels

    max_t = time[-1]
    for row in rows:
        activity_name = row[0]
        t_s, t_e = row[1:3]

        # enforce that times fit between start and end times in data matrix
        t_s = 0 if t_s < 0 else t_s
        t_e = max_t if t_e > max_t else t_e

        labels[(time >= t_s) & (time <= t_e)] = activity_name
    return labels


def write_single_parquet_file(subj_data, data_ix, output_dir="../../data/interim/raw"):
    """Flatten one data struct into a dataframe and save it as a parquet file.

    Parameters
    ----------
    subj_data : object
        A single data struct exposing ``fileIndex``, ``subjectID``,
        ``data.accelDataMatrix``, ``data.gyroDataMatrix`` and
        ``activityStartMatrix``.
    data_ix : int
        Index of this struct within the subject's data (0 for a lone struct).
    output_dir : str
        Directory the file is written to. The file is named
        ``fileID{fileIndex}_subjID{subjectID}_dataID{data_ix}.parquet``.

    Returns
    -------
    None
        The dataframe is written to disk with the fastparquet engine.
    """
    accel = subj_data.data.accelDataMatrix
    gyro = subj_data.data.gyroDataMatrix
    time = accel[:,0]
    file_ix = subj_data.fileIndex
    subj_ix = subj_data.subjectID

    df = pd.DataFrame()
    df["time"] = time
    df["file_id"] = file_ix
    df["subject_id"] = subj_ix
    df["data_id"] = data_ix

    # column 0 of each sensor matrix is time; columns 1-3 are the x/y/z axes
    for col_ix, axis in enumerate("xyz", start=1):
        df[f"accel_{axis}"] = accel[:,col_ix]
    for col_ix, axis in enumerate("xyz", start=1):
        df[f"gyro_{axis}"] = gyro[:,col_ix]

    df["label"] = _label_samples(time, subj_data.activityStartMatrix)

    # save the data
    output_file = (
        f"{output_dir}/fileID{file_ix}_subjID{subj_ix}_dataID{data_ix}.parquet"
    )
    df.to_parquet(output_file, engine='fastparquet')
    return

In [None]:
# Export every struct of every subject to its own parquet file. A subject holding
# a single struct is treated as a one-element collection, so it gets data id 0.
for ix, subj_data in enumerate(mat_contents["subject_data"]):
    print(ix, end="\r")
    structs = subj_data if isinstance(subj_data, np.ndarray) else [subj_data]
    for d_ix, subj_data_x in enumerate(structs):
        write_single_parquet_file(subj_data_x, d_ix)

Pyarrow breaks the kernel and I don't know why, but fastparquet works...

In [None]:
# Deliberate stop: always fails, so a "Run All" halts here before reaching the
# pyarrow cell below (presumably to avoid the kernel crash noted above).
assert False

In [None]:
# Same frame as the fastparquet export, written with the pyarrow engine instead
# (note the "-pa" suffix in the file name).
output_file = (
    "../../data/interim/raw/"
    f"fileID{f_ix}_subjID{s_ix}_dataID{data_ix}-pa.parquet"
)
df_s0_d0.to_parquet(output_file, engine='pyarrow')