# Human Activity Recognition - Baselines

Last Update : 21 July 2019

In [1]:
# Number of CPU threads handed to `torch.set_num_threads` in the environment cell.
N_THREADS = 8
# Nota Bene : notebooks don't deallocate GPU memory, hence this switch: when True the
# environment cell blanks CUDA_VISIBLE_DEVICES so this kernel sees no GPU at all.
IS_FORCE_CPU = True # can also be set in the trainer

## Environment

In [2]:
cd ..

/conv


In [3]:
%autosave 600
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

# CENTER PLOTS
# Inject a small stylesheet into the notebook page:
#   * `.output_png` centres rendered PNG outputs;
#   * `.prompt` hides elements carrying the `prompt` class
#     (presumably the In[ ]/Out[ ] gutters of the classic notebook UI - confirm).
# Each rule needs its own `{ ... }` block, otherwise the browser discards it.
from IPython.display import HTML, display
display(HTML(""" <style> .output_png {display: table-cell; text-align: center; margin:auto; }
.prompt {display:none;}  </style>"""))

import os
# Must run before `import torch` further down: an empty CUDA_VISIBLE_DEVICES
# exposes no CUDA device to this process, which forces CPU execution.
if IS_FORCE_CPU:
    os.environ['CUDA_VISIBLE_DEVICES'] = ""

import sys
# Adds the `notebooks` directory (relative to the working directory set by `cd ..`)
# to the import path; presumably where `ntbks_helpers` lives - confirm.
sys.path.append("notebooks")

import numpy as np
import matplotlib.pyplot as plt
from functools import partial
import pandas as pd
import h5py


import torch
# Cap the number of CPU threads torch may use.
torch.set_num_threads(N_THREADS)

Autosaving every 600 seconds


# Dataset

In [4]:
from utils.data.tsdata import get_timeseries_dataset

# Load both HAR splits with identical options (`is_fill_mean=True`, `data_perc=1`).
# NOTE(review): `data_perc=1` presumably means "keep 100% of the samples" - confirm.
HarDataset = get_timeseries_dataset("har")
_full_har_options = dict(is_fill_mean=True, data_perc=1)

data_train = HarDataset(split="train", **_full_har_options)
data_test = HarDataset(split="test", **_full_har_options)

In [5]:
# Problem sizes, read off the training split.
X_DIM = 1  # 1D spatial input (although actually 2 but the first is for sparse channels)
Y_DIM = data_train.data.shape[-1]  # multiple channels (size of the last data axis)
N_TARGETS = np.unique(data_train.targets).size  # number of distinct target values

# Values later passed as `data_perc` when building the datasets.
sampling_percentages = [0.05, 0.1, 0.3, 0.5, 0.7, 1]
# Values later passed as `n_labels`.
# NOTE(review): mixes integers and floats; presumably integers are absolute label
# counts (one and two per class) and floats are fractions - confirm against
# `get_train_dev_test_ssl`.
label_percentages = [N_TARGETS, 2 * N_TARGETS] + [0.01, 0.05, 0.1, 0.3, 0.5]

In [10]:
data_train.data.shape

(7352, 128, 9)

In [8]:
data_train[0]

({'X': tensor([[-1.0000],
          [-0.9843],
          [-0.9685],
          [-0.9528],
          [-0.9370],
          [-0.9213],
          [-0.9055],
          [-0.8898],
          [-0.8740],
          [-0.8583],
          [-0.8425],
          [-0.8268],
          [-0.8110],
          [-0.7953],
          [-0.7795],
          [-0.7638],
          [-0.7480],
          [-0.7323],
          [-0.7165],
          [-0.7008],
          [-0.6850],
          [-0.6693],
          [-0.6535],
          [-0.6378],
          [-0.6220],
          [-0.6063],
          [-0.5906],
          [-0.5748],
          [-0.5591],
          [-0.5433],
          [-0.5276],
          [-0.5118],
          [-0.4961],
          [-0.4803],
          [-0.4646],
          [-0.4488],
          [-0.4331],
          [-0.4173],
          [-0.4016],
          [-0.3858],
          [-0.3701],
          [-0.3543],
          [-0.3386],
          [-0.3228],
          [-0.3071],
          [-0.2913],
          [-0.2756],
        

In [7]:
len(data_test)

2947

## Model

In [12]:
import torch.nn as nn
from skssl.predefined import RNN, MLP
from utils.helpers import count_parameters

In [7]:
class FeatureMLP(nn.Module):
    """MLP applied to summary statistics of a sequence.

    The input is reduced along its second-to-last axis into four statistics
    (mean, max, min, variance). These are concatenated along the last axis,
    in that order, and fed to an `MLP`.

    Parameters
    ----------
    input_size : int
        Size of the last axis of the input. The wrapped MLP is built with
        `4 * input_size` inputs, one group per statistic.
    output_size : int
        Output size given to the wrapped MLP.
    **kwargs
        Forwarded unchanged to `MLP`.
    """

    def __init__(self, input_size, output_size, **kwargs):
        super().__init__()
        n_statistics = 4  # mean, max, min, var
        self.out = MLP(input_size * n_statistics, output_size, **kwargs)

    def forward(self, X, y=None):
        """Summarise the sequence and run the MLP on the summary.

        If `y` is given it is the actual input and `X` (then the time axis) is ignored.
        """
        signal = X if y is None else y

        statistics = [
            signal.mean(-2),
            signal.max(-2)[0],  # max/min along a dim return (values, indices)
            signal.min(-2)[0],
            signal.var(-2),
        ]
        return self.out(torch.cat(statistics, dim=-1))

In [13]:
RNN??

In [8]:
# Baseline model factories: each value is called with no arguments to build a model
# (see the parameter-count loop that calls `v()`).
models = {
    "bi_rnn_delta": partial(
        RNN,
        input_size=Y_DIM,
        hidden_size=64,
        output_size=N_TARGETS,
        bidirectional=True,
        n_layers=1,
        dropout=0.5,
        is_add_delta=True,
    ),
    "feature_mlp": partial(
        FeatureMLP,
        input_size=Y_DIM,
        output_size=N_TARGETS,
        hidden_size=128,
        dropout=0.5,
        n_hidden_layers=3,
        is_res=True,
    ),
}

In [9]:
from utils.helpers import count_parameters
# Build one throwaway instance per factory and print its parameter count.
for model_name, make_model in models.items():
    n_param = count_parameters(make_model())
    print(model_name, "- N Param:", n_param)

bi_rnn_delta - N Param: 29574
feature_mlp - N Param: 38534


  "num_layers={}".format(dropout, num_layers))


- selected best rnn from [GRU, LSTM], hidden [32,64,128], [bidirect,None], dropout [0,0.5], lr [1e-4,1e-3], layer [1,2]
- selected best feature MLP from hidden [32,64,128], n_hidden_layers [1,3], dropout [0,0.5], [is_res, None], lr [1e-4,1e-3]

All were selected with early stopping (patience 10), cross-entropy loss, standardized data, 100 epochs, and batch size 64.

# Supervised Training with Missing Features

In [10]:
from ntbks_helpers import train_models_

In [11]:
# Shared training settings for the cells that call `train_models_`.
# Passed as `max_epochs`.
N_EPOCHS = 100 
# Passed as `batch_size`.
BATCH_SIZE = 64
IS_RETRAIN = False # if false load precomputed
# Checkpoint root, relative to the current working directory.
chckpnt_dirname="results/challenge/har/"

In [16]:
# Train (or load, depending on IS_RETRAIN) every model in `models` once per sampling
# percentage, with `is_fill_mean=False`. Trainers are collected in `data_trainers`;
# the printed run names look like "<perc>%har/<model name>".
data_trainers = {}
HarDataset = get_timeseries_dataset("har")

for perc in sampling_percentages:
    data_train = HarDataset(split="train", data_perc=perc, is_fill_mean=False)
    data_test = HarDataset(split="test", data_perc=perc, is_fill_mean=False)

    run_name = "{}%har".format(int(perc*100))
    trainers = train_models_(
        {run_name: (data_train, data_test)},
        models,
        chckpnt_dirname=chckpnt_dirname,
        max_epochs=N_EPOCHS,
        batch_size=BATCH_SIZE,
        is_retrain=IS_RETRAIN,
    )
    data_trainers.update(trainers)


--- Loading 5%har/bi_rnn_delta ---

5%har/bi_rnn_delta best epoch: 9 val_loss: 0.5359820848704395

--- Loading 5%har/feature_mlp ---

5%har/feature_mlp best epoch: 18 val_loss: 0.5488887399708088

--- Loading 10%har/bi_rnn_delta ---

10%har/bi_rnn_delta best epoch: 9 val_loss: 0.3888445916646617

--- Loading 10%har/feature_mlp ---

10%har/feature_mlp best epoch: 16 val_loss: 0.4893241169030358

--- Loading 30%har/bi_rnn_delta ---

30%har/bi_rnn_delta best epoch: 18 val_loss: 0.25232738165437596

--- Loading 30%har/feature_mlp ---

30%har/feature_mlp best epoch: 15 val_loss: 0.41251740123524194

--- Loading 50%har/bi_rnn_delta ---

50%har/bi_rnn_delta best epoch: 19 val_loss: 0.21550011958437157

--- Loading 50%har/feature_mlp ---

50%har/feature_mlp best epoch: 15 val_loss: 0.38914211493328055

--- Loading 70%har/bi_rnn_delta ---

70%har/bi_rnn_delta best epoch: 9 val_loss: 0.23589642063253485

--- Loading 70%har/feature_mlp ---

70%har/feature_mlp best epoch: 16 val_loss: 0.384192570

In [17]:
# Mean-fill RNN baseline (no delta features).
# Registered as a factory via `partial`, exactly like the entries of `models`, so that
# every (dataset, model) run can build its own fresh network. Registering an already
# constructed `RNN(...)` instance would hand the same module object to every sampling
# percentage.
# NOTE(review): relies on `train_models_` accepting factories, as it already does for
# `models`; with skorch-style trainers a pre-built instance is presumably reused as-is,
# which would warm-start each run from the previous one - confirm.
models2 = {}

models2["bi_rnn_meanfill"] = partial(RNN, Y_DIM, 64, N_TARGETS, bidirectional=True, n_layers=1, is_add_delta=False)

In [19]:
from utils.helpers import count_parameters
# Build one throwaway instance per factory and print its parameter count.
# NOTE(review): this walks `models`, not the `models2` dict defined in the previous
# cell - confirm which one was intended.
for model_name, make_model in models.items():
    n_param = count_parameters(make_model())
    print(model_name, "- N Param:", n_param)

bi_rnn_delta - N Param: 29574
feature_mlp - N Param: 38534


In [20]:
# Same sweep over sampling percentages as for `models`, but with `is_fill_mean=True`
# and the `models2` baselines. Results are merged into the existing `data_trainers`.
HarDataset = get_timeseries_dataset("har")

for perc in sampling_percentages:
    data_train = HarDataset(split="train", data_perc=perc, is_fill_mean=True)
    data_test = HarDataset(split="test", data_perc=perc, is_fill_mean=True)

    run_name = "{}%har".format(int(perc*100))
    trainers = train_models_(
        {run_name: (data_train, data_test)},
        models2,
        chckpnt_dirname=chckpnt_dirname,
        max_epochs=N_EPOCHS,
        batch_size=BATCH_SIZE,
        is_retrain=IS_RETRAIN,
    )
    data_trainers.update(trainers)


--- Loading 5%har/bi_rnn_meanfill ---

5%har/bi_rnn_meanfill best epoch: 12 val_loss: 0.56304434497032

--- Loading 10%har/bi_rnn_meanfill ---

10%har/bi_rnn_meanfill best epoch: 1 val_loss: 0.46905726127890274

--- Loading 30%har/bi_rnn_meanfill ---

30%har/bi_rnn_meanfill best epoch: 10 val_loss: 0.3455855598035093

--- Loading 50%har/bi_rnn_meanfill ---

50%har/bi_rnn_meanfill best epoch: 1 val_loss: 0.323673616881123

--- Loading 70%har/bi_rnn_meanfill ---

70%har/bi_rnn_meanfill best epoch: 8 val_loss: 0.3046555771518627

--- Loading 100%har/bi_rnn_meanfill ---

100%har/bi_rnn_meanfill best epoch: 6 val_loss: 0.29275460707603823


In [21]:
# For every trainer, print the latest epoch whose history record is flagged
# `valid_loss_best`, with its validation loss and accuracy.
for run_name, trainer in data_trainers.items():
    n_epochs = len(trainer.history)
    # Walk the history backwards: the first flagged record is the most recent one.
    # `steps_back` counts from the last epoch, so the 1-based epoch is n_epochs - steps_back.
    flagged = (
        (n_epochs - steps_back, record)
        for steps_back, record in enumerate(trainer.history[::-1])
        if record["valid_loss_best"]
    )
    best = next(flagged, None)
    if best is None:
        continue  # nothing flagged: print nothing for this trainer
    epoch, record = best
    print(run_name, "epoch:", epoch,
          "val_loss:", record["valid_loss"],
          "val_acc:", record["valid_acc"])

5%har/bi_rnn_delta epoch: 9 val_loss: 0.5359820848704395 val_acc: 0.8082796063793688
5%har/feature_mlp epoch: 18 val_loss: 0.5488887399708088 val_acc: 0.7712928401764506
10%har/bi_rnn_delta epoch: 9 val_loss: 0.3888445916646617 val_acc: 0.8625721072276892
10%har/feature_mlp epoch: 16 val_loss: 0.4893241169030358 val_acc: 0.8004750593824228
30%har/bi_rnn_delta epoch: 18 val_loss: 0.25232738165437596 val_acc: 0.9049881235154394
30%har/feature_mlp epoch: 15 val_loss: 0.41251740123524194 val_acc: 0.8391584662368511
50%har/bi_rnn_delta epoch: 19 val_loss: 0.21550011958437157 val_acc: 0.9229725144214456
50%har/feature_mlp epoch: 15 val_loss: 0.38914211493328055 val_acc: 0.8568035290125552
70%har/bi_rnn_delta epoch: 9 val_loss: 0.23589642063253485 val_acc: 0.9131319986426875
70%har/feature_mlp epoch: 16 val_loss: 0.38419257051304795 val_acc: 0.8574821852731591
100%har/bi_rnn_delta epoch: 16 val_loss: 0.204064313460102 val_acc: 0.9229725144214456
100%har/feature_mlp epoch: 24 val_loss: 0.37777

# Sub Labels Training

In [22]:
from utils.data.ssldata import get_train_dev_test_ssl
from skssl.training.loaders import get_supervised_iterator

In [23]:
# Train (or load) every model in `models` on half of the data (`data_perc=0.5`),
# once per entry of `label_percentages` passed as `n_labels`.
# Starts from an empty `data_trainers`, so only these runs are reported afterwards.
data_trainers = {}
samplig_perc = 0.5

for label_perc in label_percentages:
    data_train, _, data_test = get_train_dev_test_ssl("har", n_labels=label_perc, data_perc=samplig_perc, dev_size=0)

    # The run name doubles as the checkpoint sub-directory, so its format must stay
    # as is to keep loading the precomputed results.
    # NOTE(review): integer entries of `label_percentages` come out as e.g. "600%lab";
    # presumably those are absolute label counts rather than percentages - confirm.
    run_name = "{}%har_{}%lab".format(int(samplig_perc*100), int(label_perc*100))

    data_trainers.update(train_models_({run_name: (data_train, data_test)},
                                       models,
                                       chckpnt_dirname=chckpnt_dirname,
                                       max_epochs=N_EPOCHS,
                                       batch_size=BATCH_SIZE,  # shared constant instead of a literal 64
                                       is_retrain=IS_RETRAIN,
                                       iterator_train=get_supervised_iterator))


--- Loading 50%har_600%lab/bi_rnn_delta ---

50%har_600%lab/bi_rnn_delta best epoch: 25 val_loss: 1.7031053005738868

--- Loading 50%har_600%lab/feature_mlp ---

50%har_600%lab/feature_mlp best epoch: 100 val_loss: 1.2070192208644452


  "num_layers={}".format(dropout, num_layers))



--- Loading 50%har_1200%lab/bi_rnn_delta ---

50%har_1200%lab/bi_rnn_delta best epoch: 34 val_loss: 1.468012556610651

--- Loading 50%har_1200%lab/feature_mlp ---

50%har_1200%lab/feature_mlp best epoch: 100 val_loss: 0.9724344174094147


  "num_layers={}".format(dropout, num_layers))



--- Loading 50%har_1%lab/bi_rnn_delta ---

50%har_1%lab/bi_rnn_delta best epoch: 68 val_loss: 1.1848216887688612

--- Loading 50%har_1%lab/feature_mlp ---

50%har_1%lab/feature_mlp best epoch: 90 val_loss: 0.6480910628376713


  "num_layers={}".format(dropout, num_layers))



--- Loading 50%har_5%lab/bi_rnn_delta ---

50%har_5%lab/bi_rnn_delta best epoch: 37 val_loss: 0.5261391364480424

--- Loading 50%har_5%lab/feature_mlp ---

50%har_5%lab/feature_mlp best epoch: 34 val_loss: 0.6083724190511661


  "num_layers={}".format(dropout, num_layers))



--- Loading 50%har_10%lab/bi_rnn_delta ---

50%har_10%lab/bi_rnn_delta best epoch: 52 val_loss: 0.29275474525687245

--- Loading 50%har_10%lab/feature_mlp ---

50%har_10%lab/feature_mlp best epoch: 43 val_loss: 0.4748540673775312


  "num_layers={}".format(dropout, num_layers))



--- Loading 50%har_30%lab/bi_rnn_delta ---

50%har_30%lab/bi_rnn_delta best epoch: 16 val_loss: 0.242327061919467

--- Loading 50%har_30%lab/feature_mlp ---

50%har_30%lab/feature_mlp best epoch: 43 val_loss: 0.38917470055189546


  "num_layers={}".format(dropout, num_layers))



--- Loading 50%har_50%lab/bi_rnn_delta ---

50%har_50%lab/bi_rnn_delta best epoch: 30 val_loss: 0.22852476952963455

--- Loading 50%har_50%lab/feature_mlp ---

50%har_50%lab/feature_mlp best epoch: 39 val_loss: 0.3619322175691603


  "num_layers={}".format(dropout, num_layers))


In [24]:
# Same report as for the full sweep, restricted to runs whose name contains
# "bi_rnn_delta"; a blank line is printed before each of them.
rnn_runs = [(name, trainer) for name, trainer in data_trainers.items()
            if "bi_rnn_delta" in name]

for run_name, trainer in rnn_runs:
    print()

    n_epochs = len(trainer.history)
    # Walk the history backwards: the first flagged record is the most recent one.
    flagged = (
        (n_epochs - steps_back, record)
        for steps_back, record in enumerate(trainer.history[::-1])
        if record["valid_loss_best"]
    )
    best = next(flagged, None)
    if best is not None:
        epoch, record = best
        print(run_name, "epoch:", epoch,
              "val_loss:", record["valid_loss"],
              "val_acc:", record["valid_acc"])


50%har_600%lab/bi_rnn_delta epoch: 25 val_loss: 1.7031053005738868 val_acc: 0.28944689514760774

50%har_1200%lab/bi_rnn_delta epoch: 34 val_loss: 1.468012556610651 val_acc: 0.4896504920257889

50%har_1%lab/bi_rnn_delta epoch: 68 val_loss: 1.1848216887688612 val_acc: 0.6335256192738378

50%har_5%lab/bi_rnn_delta epoch: 37 val_loss: 0.5261391364480424 val_acc: 0.839837122497455

50%har_10%lab/bi_rnn_delta epoch: 52 val_loss: 0.29275474525687245 val_acc: 0.8985408890397014

50%har_30%lab/bi_rnn_delta epoch: 16 val_loss: 0.242327061919467 val_acc: 0.9070240922972514

50%har_50%lab/bi_rnn_delta epoch: 30 val_loss: 0.22852476952963455 val_acc: 0.9124533423820834


# SSL Evaluation

In [25]:
from skssl.classifiers import LabelSpreading
from joblib import dump, load



In [65]:
def _summary_features(data):
    """Concatenate mean, min, max and std taken over axis 1 of `data`, along the last axis."""
    return np.concatenate([data.mean(1), data.min(1), data.max(1), data.std(1)], axis=-1)


# Label spreading on summary features, once per entry of `label_percentages`.
# The classifier is fitted on train + test features, with every test target replaced
# by -1, and then scored on the true test targets.
data_trainers = {}
samplig_perc = 0.5

for label_perc in label_percentages:
    data_train, _, data_test = get_train_dev_test_ssl("har", n_labels=label_perc, data_perc=samplig_perc, dev_size=0)
    file = chckpnt_dirname+"{}%har_{}%lab/feature_labelspread.joblib".format(int(samplig_perc*100), int(label_perc*100))

    X_test = _summary_features(data_test.data)
    Y_test = data_test.targets.flatten()

    # Honour the notebook-wide IS_RETRAIN switch; also fit when no saved model exists yet.
    if IS_RETRAIN or not os.path.exists(file):
        X_train = _summary_features(data_train.data)
        Y_train = data_train.targets.flatten()
        Y_test_unsup = -1*np.ones_like(Y_test)

        clf = LabelSpreading(kernel="rbf", gamma=0.7139, n_jobs=-1, max_iter=50, alpha=0.35)
        clf.fit(np.concatenate([X_train, X_test], axis=0),
                np.concatenate([Y_train, Y_test_unsup], axis=0))

        os.makedirs(os.path.dirname(file), exist_ok=True)  # `dump` does not create directories
        dump(clf, file)

    else:
        # joblib.load unpickles the file: only load files written by this notebook.
        clf = load(file)

    # NOTE(review): for integer entries of `label_percentages` this prints e.g. "600%";
    # presumably those are absolute label counts - confirm before rewording the message.
    print("Label Spreading on {}% of data".format(label_perc*100),
          clf.score(X_test, Y_test))

Label Spreading on 600% of data 0.6104513064133017
Label Spreading on 1200% of data 0.5778758059043094
Label Spreading on 1.0% of data 0.7020699015948422
Label Spreading on 5.0% of data 0.7387173396674585
Label Spreading on 10.0% of data 0.7645062775704106
Label Spreading on 30.0% of data 0.7852052935188327
Label Spreading on 50.0% of data 0.7882592466915507


In [61]:
from skopt import BayesSearchCV

# Bayesian hyper-parameter search for label spreading on summary features.
data_trainers = {}
samplig_perc = 0.5

for label_perc in [1]:
    data_train, _, data_test = get_train_dev_test_ssl("har", n_labels=label_perc, data_perc=samplig_perc, dev_size=0)

    # Summary features: mean, min, max and std over axis 1, concatenated on the last axis.
    X_train = np.concatenate([data_train.data.mean(1), data_train.data.min(1), data_train.data.max(1),
                              data_train.data.std(1)], axis=-1)
    # Built from this cell's own `data_test`, with the same statistics in the same order
    # as `X_train`, so the score does not depend on an `X_test` left behind by another cell.
    X_test = np.concatenate([data_test.data.mean(1), data_test.data.min(1), data_test.data.max(1),
                             data_test.data.std(1)], axis=-1)

    clf = LabelSpreading(kernel="rbf", gamma=0.7139957563303907, n_jobs=-1, max_iter=50, alpha=0.35)

    # The search space is supplied by the estimator itself.
    bcv = BayesSearchCV(clf, clf.get_hypopt_search_space())

    bcv.fit(X_train, data_train.targets.flatten())

    print("Label Spreading on {}% of data".format(label_perc*100), bcv.score(X_test, data_test.targets.flatten()))





  probabilities /= normalizer
  probabilities /= normalizer
  probabilities /= normalizer
  probabilities /= normalizer
  probabilities /= normalizer
  probabilities /= normalizer
  probabilities /= normalizer


  probabilities /= normalizer
  probabilities /= normalizer
  probabilities /= normalizer
  probabilities /= normalizer
  probabilities /= normalizer
  probabilities /= normalizer
  probabilities /= normalizer
  probabilities /= normalizer




Label Spreading on 100% of data 0.7716321683067526


In [62]:
bcv.best_params_

{'alpha': 0.36685844814212404, 'gamma': 0.7139957563303907, 'kernel': 'rbf'}