# Split & prepare train & test data

In [1]:
%run preprocess.ipynb
%run sys_configs.ipynb

In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
np.random.seed(123)

This notebook contains functions to split the MyoGym dataset into train and test sets and prepare it for time series classification tasks.

## Train-test split

The train-test split could be applied to each time series, using the minor portion at the end as test data and the major portion that starts at the beginning as train data. However, the aim of this project is to demonstrate real time prediction of labels throughout a workout and to be able to generalise this to unseen trainers. Therefore the train-test split is by trainer.

In [16]:
def train_test_split(data: pd.DataFrame, test = 0.2):
    index = data.index.get_level_values("trainer").unique().tolist()
    n = len(index)
    
    # Shuffle the samples
    np.random.shuffle(index)
    
    # Get the trainers who will be in the train set and the test set respectively
    train_trainers, test_trainers = index[:int((1 - test)*n)], index[int((1 - test)*n):]
    
    # Split to train and test sets
    train = data.loc[data.index.get_level_values('trainer').isin(train_trainers)]
    test = data.loc[data.index.get_level_values('trainer').isin(test_trainers)]
    
    # Finally, split to x and y
    col = "activity"
    x_train, y_train = train.drop(col, axis = 1), train[[col]]
    x_test, y_test = test.drop(col, axis = 1), test[[col]]
    
    return x_train, x_test, y_train, y_test

In [17]:
x_train, x_test, y_train, y_test = train_test_split(data)

## Data normalisation

Data normalisation is essential to the process of adjusting all channels to an identical range. Many time series classification techniques that we will study, for example Dynamic Time Warping, use distance metrics that depend on each dimension having the same scale. We will use sklearn's StandardScalar.

The use of a standard scaling allows us to more easily identify outliers and to more rigorously quantify the extent of these outliers using hypothesis tests of a normal distribution.

In [5]:
scaler = StandardScaler()
x_train = pd.DataFrame(scaler.fit_transform(x_train), columns = x_train.columns, index = x_train.index)
x_test = pd.DataFrame(scaler.transform(x_test), columns = x_test.columns, index = x_test.index)

## Apply windows

There are 2 windowing techniques we will deploy for GAR tasks; 

- Window technique 1: incorporates only a fixed number of the most recent observations, if there is no activity changepoint in the window.
- Window technique 2: incorporates all time steps from the beginning of the time series for an activity up to the current time step, until the minimum of:
    - the end of the time series for that activity or;
    - a maximum time series length

Window technique 1 is used for the benchmark methods, while window technique 2 is used for methods in this research which incorporate historical information. Window technique 1 is implemented as a precomputed Numpy matrix, while window technique 2 is implemented as a generator owing to its size. Pictorial representations of each windowing technique are shown below.

**Note that both window techniques break on reaching an activity change point. Every window should have a single class during the train stage.**

In [None]:
x_train, y_train

In [23]:
label_mask = y_train['activity'].ne(y_train['activity'].shift())
labels = y_train.join(label_mask, rsuffix = "_mask")

In [26]:
labels.to_csv("hehe.csv")

In [6]:
def window_method_1(data, labels, sz = 250, step = 50):
    """
    Computes windows for each time series stream from the data and labels. 
    If there is a label change in the window then the entire window is discarded.
    We seek windows which have unambiguous non-conflicting class labels.
    """
    # Create a mask for where the activity changes. Join to the activity.
    label_mask = labels['activity'].ne(labels['activity'].shift())
    labels = labels.join(label_mask, rsuffix = "_mask")
    
    # Combine the data and labels
    comb = data.join(labels)
    
    data_list = []
    label_list = []
    
    for t in comb.index.get_level_values('trainer').unique():
        # Get all the data for this trainer
        trainer_data = comb.loc[comb.index.get_level_values('trainer') == t]
        
        # Obtain a list of all windows
        for start in range(0, len(trainer_data) - sz + 1, step):
            # Filter for the current window
            window = trainer_data.iloc[start:start + sz]
            
            if window.isna().values.any():
                # Skip windows with NaN values (the first and last few windows)
                continue
            
            if window.loc[:, "activity_mask"].values.any():
                # Skip windows where the activity changes during that window
                continue
            
            window_data = window.loc[:, ["acc_x", "acc_y", "acc_z", "gyr_x", "gyr_y", "gyr_z"]]
            window_label = window.loc[:, "activity"].unique()[0]
            
            data_list.append(window_data)
            label_list.append(window_label)
            
    # Conver to Numpy arrays
    data_np = np.array(data_list)
    label_np = np.array(label_list)
    
    return data_np, label_np            

In [7]:
x1_train, y1_train = window_method_1(x_train, y_train, sz = 500, step = 50)
x1_test, y1_test = window_method_1(x_test, y_test, sz = 500, step = 50)

In [8]:
with open('data/1_train.npy', 'wb') as f:
    np.save(f, x1_train)
    np.save(f, y1_train)
    
with open('data/1_test.npy', 'wb') as f:
    np.save(f, x1_test)
    np.save(f, y1_test)

In [9]:
print("Window Method 1: There are %s samples of window length %s and dimensionality %s." % (x1_train.shape))

Window Method 1: There are 21926 samples of window length 500 and dimensionality 6.


In [10]:
## References#
# Placeholder for windowing method 2
#

## Sample background activity class

According to the paper [1] which introduced the MyoGym dataset, the background activity class, which it describes as the null class, accounts for 77% of the dataset, a number which dwarves the remaining 30 classes. Most of the techniques we explore are sensitive to class imbalanaces or to dataset sizes. Therefore, we sample some windows from this background activity class.

In [11]:
train_labels, train_counts = np.unique(y1_train, return_counts = True)
test_labels, test_counts = np.unique(y1_test, return_counts = True)

train_label_counts = pd.DataFrame(np.hstack([train_labels[:, np.newaxis], train_counts[:, np.newaxis]]), columns = ["Label", "Train Count"])
test_label_counts = pd.DataFrame(np.hstack([test_labels[:, np.newaxis], test_counts[:, np.newaxis]]), columns = ["Label", "Test Count"])

In [12]:
label_counts = train_label_counts.merge(test_label_counts, on = "Label")
label_counts["Label"] = label_counts["Label"].map(ACTIVITY_MAPPING)
label_counts = label_counts.set_index("Label")
label_counts.sort_values(["Train Count"], ascending=[False])

Unnamed: 0_level_0,Train Count,Test Count
Label,Unnamed: 1_level_1,Unnamed: 2_level_1
No activity identified,18093,3566
Dumbbell Alternate Bicep Curl,244,31
Front Dumbbell Raise,223,27
Dumbbell Flyes,205,14
Incline Dumbbell Flyes,190,23
Hammer Curl,184,36
Incline Dumbbell Press,154,19
Spider Curl,154,24
Incline Hammer Curl,145,24
Bar Skullcrusher,141,30


In [13]:
def sample_noise_class(data: np.array, labels: np.array, sz: int):
    """
    Removes most samples from the dominant noise class, down to a sample size (sz) specified in this function.  
    """
    # Identify indices of the noise class and signal class
    noise_idx = np.where(labels == 99)[0]
    signal_idx = np.where(labels != 99)[0]

    # Choose a sample from the noise class
    sample_idx = np.random.choice(noise_idx, size = sz, replace=False)

    # Combine the sampled indices with the other class indices
    combined_idx = np.concatenate([signal_idx, sample_idx])

    # Apply the indexes to the data and labels
    data_sample = x1_train[combined_idx, :, :]
    labels_sample = labels[combined_idx]
    
    return data_sample, labels_sample

In [14]:
x1s_train, y1s_train = sample_noise_class(data = x1_train, labels = y1_train, sz = 250)
x1s_test, y1s_test = sample_noise_class(data = x1_test, labels = y1_test, sz = 50)

In [15]:
with open('data/1s_train.npy', 'wb') as f:
    np.save(f, x1s_train)
    np.save(f, y1s_train)
    
with open('data/1s_test.npy', 'wb') as f:
    np.save(f, x1s_test)
    np.save(f, y1s_test)

## References

[1] Koskimäki, Heli, Pekka Siirtola and Juha Röning. “MyoGym: introducing an open gym data set for activity recognition collected using myo armband.” Proceedings of the 2017 ACM International Joint Conference on Pervasive and Ubiquitous Computing and Proceedings of the 2017 ACM International Symposium on Wearable Computers (2017): n. pag.