In [1]:
import ast
import itertools
import json
import warnings

import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.preprocessing import StandardScaler

from evaluator import *

In [2]:
# Variable names, one per line (presumably the recorded sensor channels;
# the list is not referenced by the cells visible here).
VARIABLES = [
    'ActivityCounts',
    'Barometer',
    'BloodPerfusion',
    'BloodPulseWave',
    'EnergyExpenditure',
    'GalvanicSkinResponse',
    'HR',
    'HRV',
    'RESP',
    'Steps',
    'SkinTemperature',
    'ActivityClass',
]

# Import data

In [3]:
# Folder that holds the input files: metadata_stat.txt is read from here, and
# the same value is handed to RandomForest and to the evaluation functions.
path = './Output'

Metadata (subjectID, etc.)

In [4]:
# Read the raw metadata text; the context manager closes the file right away.
with open(path + '/metadata_stat.txt') as f:
    metadata_text = f.read()

try:
    # The file presumably holds a Python literal (single-quoted strings,
    # True/False). Parsing it as such keeps apostrophes inside strings and
    # values that merely contain "True"/"False" intact, and accepts None.
    metadata = ast.literal_eval(metadata_text)
except (ValueError, TypeError, SyntaxError):
    # Fall back to the previous text-rewriting approach for files that are
    # not valid Python literals (e.g. ones that already use JSON tokens).
    metadata = json.loads(metadata_text.replace('\'', '\"').replace('False', 'false').replace('True', 'true'))

In [5]:
# subject ID of every metadata record, in the order the records appear
subjects = list(map(lambda record: record['subjectID'], metadata))

# Random Forest

In [6]:
class RandomForest:
    """Random-forest classifier with feature scaling and grid-searched hyperparameters.

    Samples are read from ``<path>/feature_vector_stat<i>.npy`` and
    ``<path>/labels_stat<i>.npy``. ``variable`` selects which entry of each
    label array is used as the classification target.
    """

    def __init__(self, path, variable):
        """Store the data location, the target selector and the search grid.

        Args:
            path: folder containing the per-sample ``.npy`` files.
            variable: index (0 or 1) into each label array.

        Raises:
            AssertionError: if ``variable`` is neither 0 nor 1.
        """
        self.SEED = 42
        self.model = None
        self.path = path
        assert variable in (0, 1)
        self.variable = variable
        self.normalizer = StandardScaler()

        # filled in by fit(): winning hyperparameters and their mean CV score
        self.best_params = None
        self.best_score = None

        # CV ranges
        self.folds = 5
        self.n_trees = [3, 10, 50, 100, 300, 1000]
        # 'auto' is left out on purpose: for classifiers it was only an alias
        # of 'sqrt' (so it tripled no information into the grid), it was
        # deprecated in scikit-learn 1.1 and is rejected from 1.3 onwards.
        self.max_features = ['sqrt', 'log2']
        self.max_depths = [10, 30, 50, 100]
        self.criterions = ['gini', 'entropy']
        self.min_samples_splits = [2, 5, 10]

    def _build_forest(self, n_tree, max_feature, max_depth, criterion, min_sample_split):
        """Return an unfitted forest for one hyperparameter combination.

        The forest is seeded with ``self.SEED`` so that repeated runs of the
        search compare and return the same models.
        """
        return RandomForestClassifier(n_estimators=n_tree,
                                      criterion=criterion,
                                      max_depth=max_depth,
                                      min_samples_split=min_sample_split,
                                      max_features=max_feature,
                                      random_state=self.SEED)

    def load_data(self, indices):
        """Load the feature vectors and targets of the given samples.

        Args:
            indices: sequence of sample indices; each one is substituted into
                the ``feature_vector_stat<i>.npy`` / ``labels_stat<i>.npy``
                file names under ``self.path``.

        Returns:
            Tuple ``(X, y)`` where ``X`` has shape ``(len(indices), n_features)``
            and ``y`` has shape ``(len(indices),)``; both are float arrays and
            rows follow the order of ``indices``.
        """
        n_samples = len(indices)
        # sample 0 serves as the reference for the feature-vector length
        n_features = np.load(self.path + '/feature_vector_stat0.npy').shape[0]

        X = np.empty((n_samples, n_features))
        y = np.empty(n_samples)

        # Read from self.path (not a notebook-level global) so the instance
        # works with whatever folder it was constructed with.
        # NOTE(review): allow_pickle=True executes pickled payloads on load;
        # only point this at trusted files.
        for row, index in enumerate(indices):
            X[row, ] = np.load(self.path + f'/feature_vector_stat{index}.npy', allow_pickle=True)
            y[row, ] = np.load(self.path + f'/labels_stat{index}.npy', allow_pickle=True)[self.variable]

        return X, y

    def fit(self, train_indices):
        """Tune hyperparameters by inner cross-validation, then fit the best model.

        The scaler is fitted on the training samples only. Every combination
        of the grid defined in ``__init__`` is scored with stratified
        ``self.folds``-fold CV (weighted F1); the best one is refitted on the
        whole training set and stored in ``self.model``.

        Args:
            train_indices: indices of the training samples (see ``load_data``).

        Raises:
            RuntimeError: if no combination produced a finite CV score.
        """
        X_train, y_train = self.load_data(train_indices)

        # normalize training set (statistics come from the training set only)
        self.normalizer.fit(X_train)
        X_train = self.normalizer.transform(X_train, copy=True)

        # inner CV (hyperparameter tuning); the fixed seed gives every
        # combination the same folds
        inner_cv = StratifiedKFold(n_splits=self.folds, shuffle=True, random_state=self.SEED)
        grid = itertools.product(self.n_trees, self.max_features, self.max_depths,
                                 self.criterions, self.min_samples_splits)

        best_combination, best_score = None, -np.inf
        for combination in grid:
            rf = self._build_forest(*combination)
            scores = cross_val_score(rf, X_train, y_train, cv=inner_cv, scoring='f1_weighted')
            mean_score = np.mean(scores)
            # '>=' keeps the most recent combination on ties; NaN never wins
            if mean_score >= best_score:
                best_combination, best_score = combination, mean_score

        if best_combination is None:
            raise RuntimeError('hyperparameter search produced no finite CV score')

        self.best_params = best_combination
        self.best_score = best_score

        # use model with best hyperparams, trained on the full training set
        self.model = self._build_forest(*best_combination)
        self.model.fit(X_train, y_train)

    def predict(self, test_indices):
        """Predict the target for the given samples with the fitted model.

        Args:
            test_indices: indices of the samples to predict (see ``load_data``).

        Returns:
            The array returned by the fitted model's ``predict``.
        """
        X_test, _ = self.load_data(test_indices)

        # scale with the statistics learned in fit()
        X_test = self.normalizer.transform(X_test, copy=True)

        return self.model.predict(X_test)

# Cross-validation (CV)

In [7]:
# Results of every evaluation scheme, keyed by target variable, so the second
# pass of the loop does not discard the scores of the first.
cv_scores = {}

with warnings.catch_warnings():
    # NOTE(review): this silences every warning raised inside the block.
    warnings.filterwarnings('ignore')

    for variable in (0, 1): # phF, MF
        # build the model for the target that is actually being evaluated
        model = RandomForest(path, variable=variable)

        scores_strat_group_k_fold = stratified_group_k_fold(path=path,
                                                            groups=subjects,
                                                            model=model,
                                                            folds=5,
                                                            images=False,
                                                            verbose=True,
                                                            variable=variable)

        scores_strat_k_fold = stratified_k_fold(path=path,
                                                groups=subjects,
                                                model=model,
                                                folds=5,
                                                images=False,
                                                verbose=True,
                                                variable=variable)

        scores_loso = leave_one_subject_out(path=path,
                                            groups=subjects,
                                            model=model,
                                            images=False,
                                            verbose=True,
                                            variable=variable)

        cv_scores[variable] = {
            'stratified_group_k_fold': scores_strat_group_k_fold,
            'stratified_k_fold': scores_strat_k_fold,
            'leave_one_subject_out': scores_loso,
        }

Starting stratified group 5-fold for physical fatigue


  0%|          | 0/5 [00:14<?, ?it/s]

KeyboardInterrupt

