In [1]:
import numpy as np
import json
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
import warnings
from tqdm import tqdm
from evaluator import *

In [2]:
VARIABLES = ['ActivityCounts', 'Barometer', 'BloodPerfusion',
             'BloodPulseWave', 'EnergyExpenditure', 'GalvanicSkinResponse', 'HR',
             'HRV', 'RESP', 'Steps', 'SkinTemperature', 'ActivityClass']

# Import data

In [3]:
# file path to data folder
path = './Output'

Metadata (subjectID, etc.)

In [4]:
with open(path + '/metadata_stat.txt') as f:
    metadata = f.read()

metadata = json.loads(metadata.replace('\'', '\"').replace('False', 'false').replace('True', 'true')) # doesn't accept other chars

In [5]:
subjects = [meta['subjectID'] for meta in metadata]

# Random Forest

In [6]:
class RandomForest:

    def __init__(self, path, variable):
        self.SEED = 42
        self.model = None
        self.path = path
        assert variable in (0, 1)
        self.variable = variable
        self.normalizer = StandardScaler()

        # CV ranges
        self.folds = 5
        self.n_trees = [3, 10, 50, 100, 300, 1000]
        self.max_features = ['auto', 'sqrt', 'log2']
        self.max_depths = [10, 30, 50, 100]
        self.criterions = ['gini', 'entropy']
        self.min_samples_splits = [2, 5, 10]

    def load_data(self, indices):
        # load shape
        N = len(indices)
        N_FEATURES = np.load(self.path + '/feature_vector_stat0.npy').shape[0]

        # init
        X = np.empty((N, N_FEATURES))
        y = np.empty(N)

        # load individual datapoints
        for i, index in enumerate(indices):
            X[i, ] = np.load(path + f'/feature_vector_stat{index}.npy', allow_pickle=True)
            y[i, ] = np.load(path + f'/labels_stat{index}.npy', allow_pickle=True)[self.variable]

        return X, y

    def fit(self, train_indices):
        # load data
        X_train, y_train = self.load_data(train_indices)

        # normalize training set
        self.normalizer.fit(X_train) # fit accord. to training set
        X_train = self.normalizer.transform(X_train, copy=True)

        # inner CV (hyperparameter tuning)
        inner_cv = StratifiedKFold(n_splits=self.folds, shuffle=True, random_state=self.SEED)
        combinations = {}
        for n_tree in tqdm(self.n_trees):
            for max_feature in self.max_features:
                for max_depth in self.max_depths:
                    for criterion in self.criterions:
                        for min_sample_split in self.min_samples_splits:
                            # model
                            rf = RandomForestClassifier(n_estimators=n_tree,
                                                        criterion=criterion,
                                                        max_depth=max_depth,
                                                        min_samples_split=min_sample_split,
                                                        max_features=max_feature)

                            # CV
                            scores = cross_val_score(rf, X_train, y_train, cv=inner_cv, scoring='f1_weighted')

                            # store score
                            combination = (n_tree, max_feature, max_depth, criterion, min_sample_split)
                            combinations[combination] = np.mean(scores)

        # best hyperparams
        best_combination, best_score = sorted(list(combinations.items()), key=lambda item: item[1])[-1]

        # use model with best hyperparams
        self.model = RandomForestClassifier(n_estimators=best_combination[0],
                                            criterion=best_combination[3],
                                            max_depth=best_combination[2],
                                            min_samples_split=best_combination[4],
                                            max_features=best_combination[1])

        self.model.fit(X_train, y_train)

    def predict(self, test_indices):
        # load data
        X_test, _ = self.load_data(test_indices)

        # normalize test set
        X_test = self.normalizer.transform(X_test, copy=True)

        return self.model.predict(X_test)

# CV

In [7]:
scores_strat_group_k_fold = [None]*2
scores_strat_k_fold = [None]*2
scores_loso = [None]*2

with warnings.catch_warnings():
    warnings.filterwarnings('ignore')

    for variable in (0, 1): # phF, MF
        model = RandomForest(path, variable=variable)

        scores_strat_group_k_fold[variable] = stratified_group_k_fold(path=path,
                                                            groups=subjects,
                                                            model=model,
                                                            folds=5,
                                                            images=False,
                                                            verbose=True,
                                                            variable=variable)

        scores_strat_k_fold[variable] = stratified_k_fold(path=path,
                                                model=model,
                                                folds=5,
                                                images=False,
                                                verbose=True,
                                                variable=variable)

        scores_loso[variable] = leave_one_subject_out(path=path,
                                            groups=subjects,
                                            model=model,
                                            images=False,
                                            verbose=True,
                                            variable=variable)

Starting stratified group 5-fold for physical fatigue


  0%|          | 0/5 [00:00<?, ?it/s]
  0%|          | 0/6 [00:00<?, ?it/s][A
 17%|█▋        | 1/6 [00:02<00:13,  2.71s/it][A
 33%|███▎      | 2/6 [00:11<00:24,  6.20s/it][A
 50%|█████     | 3/6 [00:45<00:57, 19.16s/it][A
 67%|██████▋   | 4/6 [01:56<01:18, 39.47s/it][A
 83%|████████▎ | 5/6 [05:30<01:42, 102.57s/it][A
100%|██████████| 6/6 [17:18<00:00, 173.03s/it][A
 Fold 1 F1: 0.36651583710407243:  20%|██        | 1/5 [17:18<1:09:14, 1038.57s/it]
  0%|          | 0/6 [00:00<?, ?it/s][A
 17%|█▋        | 1/6 [00:03<00:15,  3.12s/it][A
 33%|███▎      | 2/6 [00:10<00:22,  5.64s/it][A
 50%|█████     | 3/6 [00:42<00:53, 17.71s/it][A
 67%|██████▋   | 4/6 [01:46<01:11, 35.85s/it][A
 83%|████████▎ | 5/6 [05:03<01:34, 94.04s/it][A
100%|██████████| 6/6 [15:48<00:00, 158.04s/it][A
 Fold 2 F1: 0.5978589322153203:  40%|████      | 2/5 [33:08<49:19, 986.66s/it]    
  0%|          | 0/6 [00:00<?, ?it/s][A
 17%|█▋        | 1/6 [00:03<00:15,  3.08s/it][A
 33%|███▎      | 2/6 [00:10<00:23

Performance model:
 accuracy: 0.65 +- 0.152 

 balanced_accuracy: 0.596 +- 0.174 

 f1: 0.62 +- 0.199 

 recall: 0.65 +- 0.152 

 precision: 0.629 +- 0.237 

Starting stratified 5-fold for physical fatigue


  0%|          | 0/5 [00:00<?, ?it/s]
  0%|          | 0/6 [00:00<?, ?it/s][A
 17%|█▋        | 1/6 [00:03<00:16,  3.34s/it][A
 33%|███▎      | 2/6 [00:10<00:23,  5.84s/it][A
 50%|█████     | 3/6 [00:41<00:51, 17.08s/it][A
 67%|██████▋   | 4/6 [01:41<01:08, 34.17s/it][A
 83%|████████▎ | 5/6 [05:20<01:40, 100.76s/it][A
100%|██████████| 6/6 [17:52<00:00, 178.68s/it][A
 Fold 1 F1: 0.6784601691143748:  20%|██        | 1/5 [17:52<1:11:29, 1072.49s/it]
  0%|          | 0/6 [00:00<?, ?it/s][A
 17%|█▋        | 1/6 [00:03<00:16,  3.34s/it][A
 33%|███▎      | 2/6 [00:11<00:25,  6.37s/it][A
 50%|█████     | 3/6 [00:50<01:02, 20.98s/it][A
 67%|██████▋   | 4/6 [02:05<01:24, 42.37s/it][A
 83%|████████▎ | 5/6 [05:49<01:48, 108.03s/it][A
100%|██████████| 6/6 [17:48<00:00, 178.01s/it][A
 Fold 2 F1: 0.7532051282051282:  40%|████      | 2/5 [35:41<53:30, 1070.28s/it]  
  0%|          | 0/6 [00:00<?, ?it/s][A
 17%|█▋        | 1/6 [00:03<00:15,  3.13s/it][A
 33%|███▎      | 2/6 [00:11<00:24,

Performance model:
 accuracy: 0.757 +- 0.026 

 balanced_accuracy: 0.616 +- 0.085 

 f1: 0.733 +- 0.042 

 recall: 0.757 +- 0.026 

 precision: 0.733 +- 0.053 

Starting leave-one-subject-out for physical fatigue


  0%|          | 0/21 [00:00<?, ?it/s]
  0%|          | 0/6 [00:00<?, ?it/s][A
 17%|█▋        | 1/6 [00:03<00:18,  3.66s/it][A
 33%|███▎      | 2/6 [00:13<00:28,  7.05s/it][A
 50%|█████     | 3/6 [00:54<01:07, 22.63s/it][A
 67%|██████▋   | 4/6 [02:16<01:32, 46.09s/it][A
 83%|████████▎ | 5/6 [05:59<01:49, 109.91s/it][A
100%|██████████| 6/6 [18:50<00:00, 188.47s/it][A
 Fold 1 F1: 0.40476190476190477:   5%|▍         | 1/21 [18:51<6:17:00, 1131.03s/it]
  0%|          | 0/6 [00:00<?, ?it/s][A
 17%|█▋        | 1/6 [00:03<00:17,  3.52s/it][A
 33%|███▎      | 2/6 [00:12<00:27,  6.85s/it][A
 50%|█████     | 3/6 [00:53<01:06, 22.21s/it][A
 67%|██████▋   | 4/6 [02:12<01:29, 44.86s/it][A
 83%|████████▎ | 5/6 [06:08<01:53, 113.56s/it][A
100%|██████████| 6/6 [18:14<00:00, 182.41s/it][A
 Fold 2 F1: 0.16666666666666666:  10%|▉         | 2/21 [37:05<5:51:23, 1109.68s/it]
  0%|          | 0/6 [00:00<?, ?it/s][A
 17%|█▋        | 1/6 [00:03<00:15,  3.10s/it][A
 33%|███▎      | 2/6 [00:11<0

Performance model:
 accuracy: 0.575 +- 0.274 

 balanced_accuracy: 0.562 +- 0.282 

 f1: 0.574 +- 0.288 

 recall: 0.575 +- 0.274 

 precision: 0.642 +- 0.324 

Starting stratified group 5-fold for mental fatigue


  0%|          | 0/5 [00:00<?, ?it/s]
  0%|          | 0/6 [00:00<?, ?it/s][A
 17%|█▋        | 1/6 [00:03<00:16,  3.26s/it][A
 33%|███▎      | 2/6 [00:11<00:24,  6.08s/it][A
 50%|█████     | 3/6 [00:47<00:59, 19.67s/it][A
 67%|██████▋   | 4/6 [01:57<01:19, 39.79s/it][A
 83%|████████▎ | 5/6 [05:28<01:41, 101.52s/it][A
100%|██████████| 6/6 [16:58<00:00, 169.74s/it][A
 Fold 1 F1: 0.3388235294117647:  20%|██        | 1/5 [16:58<1:07:54, 1018.69s/it]
  0%|          | 0/6 [00:00<?, ?it/s][A
 17%|█▋        | 1/6 [00:02<00:13,  2.65s/it][A
 33%|███▎      | 2/6 [00:09<00:21,  5.29s/it][A
 50%|█████     | 3/6 [00:40<00:51, 17.07s/it][A
 67%|██████▋   | 4/6 [01:42<01:09, 34.58s/it][A
 83%|████████▎ | 5/6 [04:43<01:27, 87.56s/it][A
100%|██████████| 6/6 [14:43<00:00, 147.19s/it][A
 Fold 2 F1: 0.4557755131182846:  40%|████      | 2/5 [31:42<46:57, 939.10s/it]   
  0%|          | 0/6 [00:00<?, ?it/s][A
 17%|█▋        | 1/6 [00:03<00:15,  3.06s/it][A
 33%|███▎      | 2/6 [00:10<00:23, 

Performance model:
 accuracy: 0.537 +- 0.083 

 balanced_accuracy: 0.471 +- 0.033 

 f1: 0.495 +- 0.146 

 recall: 0.537 +- 0.083 

 precision: 0.564 +- 0.19 

Starting stratified 5-fold for mental fatigue


  0%|          | 0/5 [00:00<?, ?it/s]
  0%|          | 0/6 [00:00<?, ?it/s][A
 17%|█▋        | 1/6 [00:02<00:14,  2.91s/it][A
 33%|███▎      | 2/6 [00:10<00:22,  5.60s/it][A
 50%|█████     | 3/6 [00:43<00:55, 18.33s/it][A
 67%|██████▋   | 4/6 [01:50<01:14, 37.23s/it][A
 83%|████████▎ | 5/6 [05:03<01:33, 93.64s/it][A
100%|██████████| 6/6 [15:51<00:00, 158.66s/it][A
 Fold 1 F1: 0.5872926093514328:  20%|██        | 1/5 [15:52<1:03:28, 952.22s/it]
  0%|          | 0/6 [00:00<?, ?it/s][A
 17%|█▋        | 1/6 [00:03<00:15,  3.09s/it][A
 33%|███▎      | 2/6 [00:10<00:22,  5.62s/it][A
 50%|█████     | 3/6 [00:44<00:55, 18.48s/it][A
 67%|██████▋   | 4/6 [01:50<01:15, 37.53s/it][A
 83%|████████▎ | 5/6 [05:07<01:34, 94.93s/it][A
100%|██████████| 6/6 [16:01<00:00, 160.17s/it][A
 Fold 2 F1: 0.7159888357256778:  40%|████      | 2/5 [31:53<47:52, 957.55s/it]  
  0%|          | 0/6 [00:00<?, ?it/s][A
 17%|█▋        | 1/6 [00:02<00:14,  2.99s/it][A
 33%|███▎      | 2/6 [00:10<00:22,  5.

Performance model:
 accuracy: 0.678 +- 0.049 

 balanced_accuracy: 0.558 +- 0.071 

 f1: 0.642 +- 0.062 

 recall: 0.678 +- 0.049 

 precision: 0.635 +- 0.069 

Starting leave-one-subject-out for mental fatigue


  0%|          | 0/21 [00:00<?, ?it/s]
  0%|          | 0/6 [00:00<?, ?it/s][A
 17%|█▋        | 1/6 [00:02<00:13,  2.69s/it][A
 33%|███▎      | 2/6 [00:09<00:21,  5.29s/it][A
 50%|█████     | 3/6 [00:42<00:52, 17.61s/it][A
 67%|██████▋   | 4/6 [01:45<01:11, 35.75s/it][A
 83%|████████▎ | 5/6 [04:54<01:30, 90.88s/it][A
100%|██████████| 6/6 [15:21<00:00, 153.62s/it][A
 Fold 1 F1: 0.6309523809523809:   5%|▍         | 1/21 [15:21<5:07:18, 921.94s/it]
  0%|          | 0/6 [00:00<?, ?it/s][A
 17%|█▋        | 1/6 [00:02<00:13,  2.68s/it][A
 33%|███▎      | 2/6 [00:09<00:21,  5.27s/it][A
 50%|█████     | 3/6 [00:41<00:52, 17.53s/it][A
 67%|██████▋   | 4/6 [01:45<01:11, 35.64s/it][A
 83%|████████▎ | 5/6 [04:53<01:30, 90.60s/it][A
100%|██████████| 6/6 [15:19<00:00, 153.29s/it][A
 Fold 2 F1: 0.3333333333333333:  10%|▉         | 2/21 [30:41<4:51:34, 920.78s/it]
  0%|          | 0/6 [00:00<?, ?it/s][A
 17%|█▋        | 1/6 [00:02<00:13,  2.63s/it][A
 33%|███▎      | 2/6 [00:09<00:21, 

KeyboardInterrupt: 

# Save scores

In [18]:
path_scores = './Scores'
model_name = 'random_forest'

In [19]:
# stratified 5-fold
with open(f'{path_scores}/strat_5_fold/{model_name}.txt', 'w') as dat:
    dat.write(str(scores_strat_group_k_fold))

In [20]:
# stratified group 5-fold
with open(f'{path_scores}/strat_group_5_fold/{model_name}.txt', 'w') as dat:
    dat.write(str(scores_strat_k_fold))

In [21]:
# LOSO
with open(f'{path_scores}/loso/{model_name}.txt', 'w') as dat:
    dat.write(str(scores_loso))