# Notebook to solve the kaggle competition "Child Mind Institute - Detect Sleep States"
Link to the competition: https://www.kaggle.com/competitions/child-mind-institute-detect-sleep-states


### Install packages

In [1]:
%pip install matplotlib plotly pandas numpy tqdm scikit-learn pyarrow

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.1.2 -> 23.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


### Imports

In [1]:
from itertools import product
from tqdm import tqdm
from score import score
#from sklearn.svm import SVC
#from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import recall_score, precision_score, average_precision_score
import wandb
from abc import ABC, abstractmethod #, classmethod
from math import ceil, floor
import os
from pathlib import Path
from pickle import dump, load
from custom_enums import ModelTrainingType
import pandas as pd
import pyarrow.parquet as pq
import pyarrow.dataset as ds
import gc

In [3]:

# wandb.login() requires you to get your API key from your account settings
# open the weights and biases website https://wandb.ai/login and login to your account
# then go to your account settings and copy the API key
# paste it in the input box and hit enter

#wandb.login() #TODO: uncomment this line to login to your account

In [4]:
# # start a new wandb run to track this script
# wandb.init(
#     # set the wandb project where this run will be logged
#     project="my-awesome-project",
    
#     # track hyperparameters and run metadata
#     config={
#     "learning_rate": 0.02,
#     "architecture": "CNN",
#     "dataset": "CIFAR-100",
#     "epochs": 10,
#     }
# )

To be able to switch between different models from different libraries at a glance, we implement an interface called `IPipelineRequirements`. This allows us to make the pipeline even more robust and easier to extend.

In [2]:
class IPipelineRequirements(ABC):
    """Contract a model wrapper has to fulfil to be usable in the pipeline.

    All five methods are abstract: a subclass can only be instantiated once
    it overrides every one of them.
    """

    @abstractmethod
    def train(self):
        """Fit the wrapped model."""

    @abstractmethod
    def predict(self):
        """Produce predictions with the wrapped model."""

    @abstractmethod
    def save(self):
        """Persist the wrapped model."""

    @abstractmethod
    def load(self):
        """Restore a previously persisted model."""

    @abstractmethod
    def evaluate(self):
        """Score the wrapped model."""

Our first model is the baseline model, which just takes the mean over all onset and wakeup times and tries to predict `onset` and `wakeup` events with the calculated time.

In [3]:
class BaselineModel(IPipelineRequirements):
    """Placeholder for the baseline model; no pipeline step is implemented yet.

    Every method of the pipeline contract is overridden so that the class can
    be instantiated, but each of them raises ``NotImplementedError``.
    """

    # message shared by all not-yet-implemented pipeline steps
    _TODO_MESSAGE = "Please Implement this method"

    def __init__(self):
        """Nothing to set up yet."""

    def train(self):
        """Not implemented: always raises ``NotImplementedError``."""
        raise NotImplementedError(self._TODO_MESSAGE)

    def predict(self):
        """Not implemented: always raises ``NotImplementedError``."""
        raise NotImplementedError(self._TODO_MESSAGE)

    def save(self):
        """Not implemented: always raises ``NotImplementedError``."""
        raise NotImplementedError(self._TODO_MESSAGE)

    def load(self):
        """Not implemented: always raises ``NotImplementedError``."""
        raise NotImplementedError(self._TODO_MESSAGE)

    def evaluate(self):
        """Not implemented: always raises ``NotImplementedError``."""
        raise NotImplementedError(self._TODO_MESSAGE)

In [7]:
# Smoke check: instantiating BaselineModel only succeeds when it overrides every
# abstract method of IPipelineRequirements. `g` is not used anywhere else in the
# visible notebook.
g = BaselineModel()

Numerous studies have concentrated on applications using `RandomForest`. The primary motivation for this preference is the model's inherent transparency in decision-making processes, which are readily identifiable in such models. Subsequent to the BaselineModel, efforts have been made to abstract models from the Scikit-learn library. Fortunately, the majority of models within their API exhibit consistent implementation patterns, facilitating their integration into the processing pipeline.

In [4]:
class SkLearnModel(IPipelineRequirements):
    """Pipeline wrapper around a scikit-learn style estimator plus an optional scaler.

    Instances are created either from an estimator object (constructor) or from
    a pickled model file (``SkLearnModel.from_file``). Models are stored as
    ``models/model_<identifier>.pkl`` with the matching scaler next to them as
    ``models/scaler_<identifier>.pkl``.
    """

    def __init__(self, model, identifier, scaler=StandardScaler, sk_model_params=None):
        """Wrap ``model``.

        Args:
            model: estimator instance offering ``fit`` and ``predict``.
            identifier: name used in the file names written by ``save``.
            scaler: scaler instance, or a class/factory that is called without
                arguments to create one. ``None`` disables scaling support.
            sk_model_params: hyperparameters the estimator was created with
                (kept for bookkeeping only).
        """
        self._model = model
        # a class (or any other factory) is instantiated, an instance is used as is
        self._scaler = scaler() if callable(scaler) else scaler
        self.identifier = identifier
        self._model_params = sk_model_params

    @classmethod
    def from_file(cls, model_path):
        """Create a wrapper from a pickled model file (see ``load``).

        Python has no constructor overloading: a second ``__init__`` silently
        replaces the first one, so loading from disk is offered as an explicit
        alternative constructor instead.

        Raises:
            ValueError: if the model cannot be loaded from ``model_path``.
        """
        instance = cls.__new__(cls)
        instance._model = None
        instance._scaler = None
        instance.identifier = None
        instance._model_params = None
        instance.load(model_path)
        return instance

    def train(self, X, y, not_scaled=False, **kwargs):
        """Fit the wrapped estimator on ``X`` / ``y``.

        Args:
            X: feature matrix.
            y: targets.
            not_scaled: when True, ``X`` is passed through the scaler first.
            **kwargs: ``add_estimators`` (default 50) is the number of trees
                added before fitting when the estimator is a warm-started
                ``RandomForestClassifier``; this enables training in batches.

        Raises:
            ValueError: if scaling is requested but no scaler is available.
        """
        if not_scaled:
            if self._scaler is None:
                raise ValueError('Please load or fit a scaler first.')
            # NOTE(review): the scaler is only applied here, never fitted by this
            # class - it has to be fitted (or loaded) beforehand.
            X = self._scaler.transform(X)

        # due to the big dataset we need to check which model was instantiated and do
        # some model specific stuff to enable us to train the model in batches
        if isinstance(self._model, RandomForestClassifier):
            new_estimators = kwargs.get('add_estimators', 50)
            if self._model.warm_start:
                self._model.n_estimators += new_estimators

        self._model.fit(X, y)

    def predict(self, X, not_scaled=False):
        """Return the estimator's predictions for ``X``.

        Args:
            X: feature matrix.
            not_scaled: when True, ``X`` is passed through the scaler first.

        Raises:
            ValueError: if no model (or, when scaling is requested, no scaler)
                is available.
        """
        # `is None` instead of truthiness: ensemble estimators define __len__ on
        # their fitted trees, so `not model` fails on an unfitted forest
        if self._model is None:
            raise ValueError('Please load or train a model first.')
        if not_scaled:
            if self._scaler is None:
                raise ValueError('Please load or fit a scaler first.')
            X = self._scaler.transform(X)
        return self._model.predict(X)

    def save(self):
        """Pickle model and scaler into the ``models`` directory.

        Raises:
            ValueError: if the directory or one of the files cannot be written;
                the original error is attached as ``__cause__``.
        """
        try:
            # the directory normally exists from an earlier save - that is not an error
            Path('models').mkdir(parents=True, exist_ok=True)
            with open(f'models/model_{self.identifier}.pkl', 'wb') as f:
                dump(self._model, f)
            with open(f'models/scaler_{self.identifier}.pkl', 'wb') as f:
                dump(self._scaler, f)
        except Exception as e:
            raise ValueError('Unable to save model and scaler.') from e

    def load(self, filepath):
        """Load a pickled model and, if present, its scaler.

        The file name has to follow ``<model|scaler>_<identifier>.pkl``; the
        scaler is looked up in the same directory as ``scaler_<identifier>.pkl``.
        When no scaler file exists the scaler is set to ``None``.

        SECURITY: unpickling executes arbitrary code - only load files you trust.

        Raises:
            ValueError: if the file is missing, its name does not follow the
                convention, or unpickling fails.
        """
        try:
            if not os.path.isfile(filepath):
                raise FileNotFoundError(filepath)

            # extract the identifier from the file name only; it may itself
            # contain underscores (e.g. "n_estimators__1000"), so split once
            directory, filename = os.path.split(filepath)
            stem = os.path.splitext(filename)[0]
            _, separator, identifier = stem.partition('_')
            if not separator or not identifier:
                raise IndexError(filename)

            print(f'Loading model from {filepath}')
            with open(filepath, 'rb') as f:
                self._model = load(f)

            scaler_path = os.path.join(directory, f'scaler_{identifier}.pkl')
            if os.path.isfile(scaler_path):
                print(f'Loading scaler from {scaler_path}')
                with open(scaler_path, 'rb') as f:
                    self._scaler = load(f)
            else:
                self._scaler = None

            self.identifier = identifier
            self._identifier = identifier  # kept for code that read the old attribute name
        except FileNotFoundError as e:
            print(f'File {e} not found')
            raise ValueError('Filepath is not valid. Unable to load model and scaler.') from e
        except IndexError as e:
            print(f'The name of the file does not implement the convention "<model|scaler>_<some identifier>".')
            raise ValueError('Filepath is not valid. Unable to load model and scaler.') from e
        except Exception as e:
            print(f'Something went wrong. {e}')
            raise ValueError('Unknown Error.') from e

    def evaluate(self, X, y, scoreFx=None, not_scaled=False, prepredicted=None):
        """Score predictions against ``y`` with ``scoreFx(y, predictions)``.

        Args:
            X: feature matrix; ignored when ``prepredicted`` is given.
            y: true targets.
            scoreFx: callable taking ``(y_true, y_pred)``.
            not_scaled: when True and predictions have to be computed, ``X`` is
                passed through the scaler first.
            prepredicted: already computed predictions; when ``None`` the
                wrapped model predicts on ``X``.

        Raises:
            ValueError: if the score function, the model or a required scaler
                is missing.
        """
        if scoreFx is None:
            raise ValueError('Please provide a score function.')
        if self._model is None:
            raise ValueError('Please load or train a model first.')

        # `is None` instead of `== None`: comparing an array with `==` is
        # element-wise and has no single truth value
        if prepredicted is None:
            if not_scaled:
                if self._scaler is None:
                    raise ValueError('Please load or fit a scaler first.')
                X = self._scaler.transform(X)
            prepredicted = self._model.predict(X)

        #TODO: Does score FX require an array of predictions or a single prediction?
        #TODO: TEST PARAMS OF SCORE FX!!!!
        return scoreFx(y, prepredicted)

### Heuristics method to get the right event under the constraints given by Child Mind Institute

* A single sleep period must be at least 30 minutes in length
* A single sleep period can be interrupted by bouts of activity that do not exceed 30 consecutive minutes
* No sleep windows can be detected unless the watch is deemed to be worn for the duration (elaborated upon, below)
* The longest sleep window during the night is the only one which is recorded
* If no valid sleep window is identifiable, neither an onset nor a wakeup event is recorded for that night.
* Sleep events do not need to straddle the day-line, and therefore there is no hard rule defining how many may occur within a given period. However, no more than one window should be assigned per night. For example, it is valid for an individual to have a sleep window from 01h00–06h00 and 19h00–23h30 in the same calendar day, though assigned to consecutive nights
* There are roughly as many nights recorded for a series as there are 24-hour periods in that series.

In [9]:
def heuristic_filter(data):
    """Placeholder for the rule-based event filter; not implemented yet.

    Args:
        data: currently ignored.

    Returns:
        None, whatever is passed in.
    """
    return None

### Import the training and the validation data

In [10]:
# NOTE(review): this cell raises unconditionally, so "Restart & Run All" stops
# here and every cell below only runs when executed by hand. Presumably a
# leftover debugging guard - confirm and remove it once it is no longer needed.
raise(ValueError('test'))

ValueError: test

In [5]:
# Locations of the parquet data. The NEW_* directories are read through
# pyarrow.dataset, the remaining two paths through pandas / pyarrow.parquet.
NEW_TRAIN_DATA = '../data/train/'
NEW_VALIDATION_DATA = '../data/val/'
TRAIN_DATA = '../data/train_20231021'
#TRAIN_DATA = '../data/train_20231021_20M'
VALIDATION_DATA = '../data/validation_20231021'

# Build the series_id -> integer code lookup for both splits. Only the
# 'series_id' column is read, and the work is skipped when the mapping already
# exists in the namespace (e.g. when this cell is re-run).
if 'series_id_mapping' not in vars():
    series_id_mapping = {'train': dict(), 'validation': dict()}
    for split_name, split_dir in (('train', NEW_TRAIN_DATA), ('validation', NEW_VALIDATION_DATA)):
        id_table = ds.dataset(split_dir).to_table(columns=['series_id'])
        # codes follow the order of first appearance in the dataset
        for code, series_id in enumerate(id_table.to_pandas()['series_id'].unique()):
            series_id_mapping[split_name][series_id] = code
        del id_table
    gc.collect()

# total number of rows per split
train_dataset_length = ds.dataset(NEW_TRAIN_DATA).count_rows()
validation_dataset_length = ds.dataset(NEW_VALIDATION_DATA).count_rows()

def dataloader_full_dataset(validation=False):
    """Read a complete split into one DataFrame.

    Args:
        validation: read VALIDATION_DATA when True, TRAIN_DATA otherwise.

    Returns:
        The DataFrame produced by ``pd.read_parquet`` for the selected path.
    """
    if validation:
        source = VALIDATION_DATA
    else:
        source = TRAIN_DATA
    return pd.read_parquet(source)

def dataloader(validation=False, batch_size=5_000_000):
    """Stream a split as DataFrames of up to ``batch_size`` rows each.

    Args:
        validation: read VALIDATION_DATA when True, TRAIN_DATA otherwise.
        batch_size: row limit handed to ``ParquetFile.iter_batches``.

    Yields:
        One DataFrame per record batch.
    """
    source = VALIDATION_DATA if validation else TRAIN_DATA
    record_batches = pq.ParquetFile(source).iter_batches(batch_size=batch_size)
    for record_batch in record_batches:
        yield record_batch.to_pandas()

def batched_dataloader(validation=False, batch_size=100_000):
    """Stream a split fragment by fragment as DataFrames.

    Args:
        validation: read NEW_VALIDATION_DATA when True, NEW_TRAIN_DATA otherwise.
        batch_size: value handed to ``Fragment.to_batches``.

    Yields:
        One DataFrame per record batch of every fragment of the dataset.
    """
    source_dir = NEW_VALIDATION_DATA if validation else NEW_TRAIN_DATA
    for frag in ds.dataset(source_dir).get_fragments():
        # https://arrow.apache.org/docs/python/generated/pyarrow.dataset.Fragment.html#pyarrow.dataset.Fragment.to_batches
        for record_batch in frag.to_batches(batch_size=batch_size):
            yield record_batch.to_pandas()
            # drop the reference once the consumer asks for the next batch
            del record_batch
        del frag


### Run the pipeline

In [13]:
# define all the combinations of models and features
models_and_hyperparams = {
    RandomForestClassifier: {
        'params': {
            'n_estimators': [1000, 1500, 2000],
            'max_depth': [20, 40, 60, None],
            'random_state': [0],
            #'criterion': ['gini', 'entropy', 'log_loss'],
            'warm_start': [True]
        },
        'modeltype': SkLearnModel,
        'scaler': StandardScaler,
        'training_type': ModelTrainingType.BATCH,
        'batch_size': 1_000_000
    },
    # SVC: {
    #     'params': {
    #         'kernel': ['rbf', 'poly', 'poly', 'poly', 'sigmoid'],
    #         'degree': [3, 4, 5],
    #         'C': [1, 10, 100, 1000],
    #         'gamma': ['scale', 'auto']
    #     },
    #     'modeltype': SkLearnModel,
    #     'scaler': StandardScaler
    # },
    # KNeighborsClassifier: {
    #     'params': {
    #         'n_neighbors': [5, 10, 15, 20],
    #         'weights': ['uniform', 'distance'],
    #         'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
    #         'leaf_size': [10, 20, 30, 40, 50],
    #         'p': [1, 2]
    #     },
    #     'modeltype': SkLearnModel,
    #     'scaler': StandardScaler
    # }
}

# define scaler
scaler_type = StandardScaler
pretrained_scalers = dict() # used to store trained scalers for later use

# loop over all combinations and append it to the configurations list
# if there are no hyperparams, just instantiate the model without params
for model_type, hyperparams in models_and_hyperparams.items():
    if len(hyperparams['params']) > 0: # if there are hyperparams, build a dict and pass it to the model as parameters
        fx_param_names, fx_param_values = zip(*hyperparams['params'].items())
        for cartesian_product_values in product(*fx_param_values):
            hyperparam_dict = dict(zip(fx_param_names, cartesian_product_values))

            # create or reuse the scaler specified in the models_and_hyperparams dictionary
            # if hyperparams['scaler'] in pretrained_scalers:
            #     print(f'Using pretrained scaler {hyperparams["scaler"].__name__}')
            #     scaler = pretrained_scalers[hyperparams['scaler']]
            # else:
            #     scaler = hyperparams['scaler']()
            #     print(f'Start fitting scaler {hyperparams["scaler"].__name__}')
            #     for batch in tqdm(dataloader(batch_size=15_000_000)):
            #         X = batch.drop(['wearable_on', 'awake', 'wakeup', 'onset'], axis=1)
            #         X['series_id'] = X['series_id'].map(series_id_mapping['train'])
            #         scaler.partial_fit(X)
            #         del X
            #         gc.collect()
            #     # save the scaler for later use
            #     pretrained_scalers[hyperparams['scaler']] = scaler

            # create the model from the modeltype and the hyperparam_dict
            m = hyperparams['modeltype'](model_type(**hyperparam_dict), \
                                        f'{model_type.__name__}-wrist-{"-".join([f"{na}__{str(va)}" for na, va in hyperparam_dict.items()])}'.lower(), \
                                        #pretrained_scalers[hyperparams['scaler']], \
                                        StandardScaler(), \
                                        hyperparam_dict)
            m2 = hyperparams['modeltype'](model_type(**hyperparam_dict), \
                                        f'{model_type.__name__}-sleep-{"-".join([f"{na}__{str(va)}" for na, va in hyperparam_dict.items()])}'.lower(), \
                                        #pretrained_scalers[hyperparams['scaler']], \
                                        StandardScaler(), \
                                        hyperparam_dict)
            

#TODO: uncemment, standardscaler , not_scaled=True


            
            # init wandb
            # start a new wandb run to track this script
            #cfg = m._model_params.copy()
            #cfg['model'] = m._model.__name__
            #wandb.init(project="classic_models", config=cfg)

            # train model
            rounds = 1
            if hyperparams['training_type'] == ModelTrainingType.BATCH:
                print(f'Model will be traind in batches of {hyperparams["batch_size"]} samples.')
                print(f'Dataset contains {train_dataset_length} samples. {(rounds := train_dataset_length // hyperparams["batch_size"])} batches will be used when fitting the model.')
                if model_type == RandomForestClassifier:
                    print(f'At the end {hyperparam_dict["n_estimators"]} estimators will be fitted. That means, {hyperparam_dict["n_estimators"]//rounds} estimators per batch.')
                    m._model.n_estimators = 0 #set to 0. model will increase the number of estimators in each round
                    m2._model.n_estimators = 0 #set to 0. model will increase the number of estimators in each round
                    # tune hyperparameter 'n_estimators' based on the number of batches
                    attune_estimators_div = hyperparam_dict["n_estimators"] / rounds
                    attuned_estimators = (floor(attune_estimators_div) if attune_estimators_div -0.3 < floor(attune_estimators_div) else ceil(attune_estimators_div)) * rounds
                    print(f'Hyperparameter n_estimators will be adjusted to {attuned_estimators} to fit {attuned_estimators / rounds} estimators per round for each {rounds} batches.')
                    hyperparam_dict["n_estimators"] = attuned_estimators

            print(f'Start training model {m.identifier}')
            for batch in tqdm(batched_dataloader(batch_size=hyperparams['batch_size']) if hyperparams['training_type'] == ModelTrainingType.BATCH else dataloader_full_dataset(), total=rounds+1):
                #X = batch.drop(['wearable_on', 'awake', 'wakeup', 'onset'], axis=1)
                X = batch.drop(['wearable_on', 'awake'], axis=1)
                X['series_id'] = X['series_id'].map(series_id_mapping['train'])
                y = batch['wearable_on']
                args = {}
                if model_type == RandomForestClassifier and hyperparams['training_type'] == ModelTrainingType.BATCH: args['add_estimators'] = hyperparam_dict['n_estimators']//rounds
                m.train(X, y, not_scaled=False, **args)
                del X
                del y
                gc.collect()
            #y_hat = configured_model.predict(X_test)

            #  do the heuristic part...

            #  evaluate model
            print(f'Start evaluating model {m.identifier}')
            score_value_average_precision_score, score_value_recall, score_value_precision = [], [], []
            for validation in tqdm(batched_dataloader(batch_size=hyperparams['batch_size'], validation=True) if hyperparams['training_type'] == ModelTrainingType.BATCH else dataloader_full_dataset(validation=True), total = validation_dataset_length // hyperparams["batch_size"]):
            #validation = batched_dataloader(validation=True)
                validation['series_id'] = validation['series_id'].map(series_id_mapping['validation'])
                validation_y = validation['wearable_on']
                validation.drop(['wearable_on', 'awake'], axis=1, inplace=True)
                validation_y_hat = m.predict(validation, not_scaled=False)
                # this does the same as scoreFX(y, y_hat). It is wrapped in the models evaluate function.
                # the model itself could also do the prediction if prepredicted is None
                # then, the first argument of the evaluate function would be the X_validation data
                score_value_average_precision_score.append(m.evaluate(None, validation_y, scoreFx=average_precision_score, not_scaled=False, prepredicted=validation_y_hat))
                score_value_recall.append(m.evaluate(None, validation_y, scoreFx=recall_score, not_scaled=False, prepredicted=validation_y_hat))
                score_value_precision.append(m.evaluate(None, validation_y, scoreFx=precision_score, not_scaled=False, prepredicted=validation_y_hat))
            #print(score_value_average_precision_score)
            #print(score_value_recall)
            #print(score_value_precision)
                del validation, validation_y

            print(sum(score_value_average_precision_score) / len(score_value_average_precision_score))
            print(sum(score_value_recall) / len(score_value_recall))
            print(sum(score_value_precision) / len(score_value_precision))
            #  save model
            #m.save()

            print(f'Start training model {m2.identifier}')
            for batch in tqdm(batched_dataloader(batch_size=hyperparams['batch_size']) if hyperparams['training_type'] == ModelTrainingType.BATCH else dataloader_full_dataset(), total=rounds+1):
                #X = batch.drop(['wearable_on', 'awake', 'wakeup', 'onset'], axis=1)
                X = batch.drop(['wearable_on', 'awake'], axis=1)
                X['series_id'] = X['series_id'].map(series_id_mapping['train'])
                y = batch['awake']
                args = {}
                if model_type == RandomForestClassifier and hyperparams['training_type'] == ModelTrainingType.BATCH: args['add_estimators'] = hyperparam_dict['n_estimators']//rounds
                worn_y_hat = m.predict(X, not_scaled=False)
                # add worn_y_hat as new column to X
                X['pred_worn'] = worn_y_hat
                # filter out all rows where worn_y_hat is 0
                X = X[X['pred_worn'] == 1]
                y = y[y.index.isin(X.index)]
                m2.train(X, y, not_scaled=False, **args)
                del X
                del y
                gc.collect()

            print(f'Start evaluating model {m2.identifier}')
            score_value_average_precision_score_m2, score_value_recall_m2, score_value_precision_m2 = [], [], []
            for validation in tqdm(batched_dataloader(batch_size=hyperparams['batch_size'], validation=True) if hyperparams['training_type'] == ModelTrainingType.BATCH else dataloader_full_dataset(validation=True), total = validation_dataset_length // hyperparams["batch_size"]):
            #validation = batched_dataloader(validation=True)
                validation['series_id'] = validation['series_id'].map(series_id_mapping['validation'])
                validation_y = validation['awake']
                validation.drop(['wearable_on', 'awake'], axis=1, inplace=True)
                worn_y_hat = m.predict(validation, not_scaled=False)
                validation['pred_worn'] = worn_y_hat
                validation = validation[validation['pred_worn'] == 1]
                validation_y = validation_y[validation_y.index.isin(validation.index)]
                validation_y_hat = m2.predict(validation, not_scaled=False)
                # this does the same as scoreFX(y, y_hat). It is wrapped in the models evaluate function.
                # the model itself could also do the prediction if prepredicted is None
                # then, the first argument of the evaluate function would be the X_validation data
                score_value_average_precision_score_m2.append(m2.evaluate(None, validation_y, scoreFx=average_precision_score, not_scaled=False, prepredicted=validation_y_hat))
                score_value_recall_m2.append(m2.evaluate(None, validation_y, scoreFx=recall_score, not_scaled=False, prepredicted=validation_y_hat))
                score_value_precision_m2.append(m2.evaluate(None, validation_y, scoreFx=precision_score, not_scaled=False, prepredicted=validation_y_hat))
            #print(score_value_average_precision_score)
            #print(score_value_recall)
            #print(score_value_precision)
                del validation, validation_y

            print(sum(score_value_average_precision_score_m2) / len(score_value_average_precision_score_m2))
            print(sum(score_value_recall_m2) / len(score_value_recall_m2))
            print(sum(score_value_precision_m2) / len(score_value_precision_m2))
            # delete model to free up memory
            #del m

            break

            #wandb.finish()
    else:
        print("params in the dictionary cannot be empty. Use the standard values in the dictionary for the model", model_type.__name__)

Model will be traind in batches of 1000000 samples.
Dataset contains 102937860 samples. 102 batches will be used when fitting the model.
At the end 1000 estimators will be fitted. That means, 9 estimators per batch.
Hyperparameter n_estimators will be adjusted to 1020 to fit 10.0 estimators per round for each 102 batches.
Start training model randomforestclassifier-wrist-n_estimators__1000-max_depth__20-random_state__0-warm_start__true


  0%|          | 0/103 [00:00<?, ?it/s]

106it [1:12:41, 41.15s/it]                         


Start evaluating model randomforestclassifier-wrist-n_estimators__1000-max_depth__20-random_state__0-warm_start__true


26it [19:12, 44.32s/it]                        


0.8504913085074458
0.9983862489386269
0.8505018940867151
Start training model randomforestclassifier-sleep-n_estimators__1000-max_depth__20-random_state__0-warm_start__true


  0%|          | 0/103 [00:00<?, ?it/s]

X shape: (1000000, 89)


  1%|          | 1/103 [03:11<5:25:37, 191.54s/it]

X shape: (1000000, 89)


  2%|▏         | 2/103 [05:30<4:30:49, 160.89s/it]

X shape: (931099, 89)


  3%|▎         | 3/103 [08:13<4:29:10, 161.50s/it]

X shape: (992692, 89)


  4%|▍         | 4/103 [10:59<4:29:20, 163.24s/it]

X shape: (1000000, 89)


  5%|▍         | 5/103 [14:01<4:38:01, 170.22s/it]

X shape: (997082, 89)


  6%|▌         | 6/103 [16:33<4:24:58, 163.90s/it]

X shape: (980197, 89)


  7%|▋         | 7/103 [19:13<4:20:15, 162.67s/it]

X shape: (1000000, 89)


  8%|▊         | 8/103 [22:08<4:23:32, 166.45s/it]

X shape: (1000000, 89)


  9%|▊         | 9/103 [24:31<4:09:21, 159.17s/it]

X shape: (961716, 89)


 10%|▉         | 10/103 [27:23<4:13:13, 163.37s/it]

X shape: (999999, 89)


 11%|█         | 11/103 [30:14<4:13:57, 165.63s/it]

X shape: (995610, 89)


 12%|█▏        | 12/103 [33:13<4:17:13, 169.60s/it]

X shape: (1000000, 89)


 13%|█▎        | 13/103 [36:25<4:24:32, 176.36s/it]

X shape: (916624, 89)


 14%|█▎        | 14/103 [38:34<4:00:18, 162.00s/it]

X shape: (1000000, 89)


 15%|█▍        | 15/103 [41:21<4:00:09, 163.74s/it]

X shape: (1000000, 89)


 16%|█▌        | 16/103 [44:24<4:05:26, 169.28s/it]

X shape: (954275, 89)


 17%|█▋        | 17/103 [47:09<4:00:47, 168.00s/it]

X shape: (989473, 89)


 17%|█▋        | 18/103 [49:18<3:41:42, 156.50s/it]

X shape: (1000000, 89)


 18%|█▊        | 19/103 [52:24<3:51:23, 165.28s/it]

X shape: (1000000, 89)


 19%|█▉        | 20/103 [55:28<3:56:22, 170.87s/it]

X shape: (999396, 89)


 20%|██        | 21/103 [58:13<3:50:59, 169.01s/it]

X shape: (986785, 89)


 21%|██▏       | 22/103 [1:00:46<3:41:51, 164.34s/it]

X shape: (1000000, 89)


 22%|██▏       | 23/103 [1:03:38<3:42:10, 166.63s/it]

X shape: (997776, 89)


 23%|██▎       | 24/103 [1:06:42<3:46:26, 171.98s/it]

X shape: (927239, 89)


 24%|██▍       | 25/103 [1:08:57<3:28:51, 160.66s/it]

X shape: (182672, 89)


 25%|██▌       | 26/103 [1:09:21<2:33:42, 119.77s/it]

X shape: (970920, 89)


 26%|██▌       | 27/103 [1:12:18<2:53:26, 136.92s/it]

X shape: (851034, 89)


 27%|██▋       | 28/103 [1:14:35<2:51:13, 136.98s/it]

X shape: (1000000, 89)


 28%|██▊       | 29/103 [1:17:12<2:56:24, 143.04s/it]

X shape: (1000000, 89)


 29%|██▉       | 30/103 [1:20:04<3:04:20, 151.52s/it]

X shape: (994838, 89)


 30%|███       | 31/103 [1:22:37<3:02:35, 152.17s/it]

X shape: (1000000, 89)


 31%|███       | 32/103 [1:25:15<3:01:57, 153.76s/it]

X shape: (999932, 89)


 32%|███▏      | 33/103 [1:27:51<3:00:08, 154.41s/it]

X shape: (987930, 89)


 33%|███▎      | 34/103 [1:30:17<2:54:52, 152.07s/it]

X shape: (515458, 89)


 34%|███▍      | 35/103 [1:31:36<2:27:16, 129.95s/it]

X shape: (1000000, 89)


 35%|███▍      | 36/103 [1:34:43<2:44:16, 147.11s/it]

X shape: (1000000, 89)


 36%|███▌      | 37/103 [1:37:32<2:49:11, 153.81s/it]

X shape: (984024, 89)


 37%|███▋      | 38/103 [1:39:52<2:42:08, 149.67s/it]

X shape: (950140, 89)


 38%|███▊      | 39/103 [1:42:40<2:45:26, 155.10s/it]

X shape: (995785, 89)


 39%|███▉      | 40/103 [1:44:49<2:34:46, 147.40s/it]

X shape: (995542, 89)


 40%|███▉      | 41/103 [1:47:06<2:29:05, 144.28s/it]

X shape: (974649, 89)


 41%|████      | 42/103 [1:49:52<2:33:09, 150.64s/it]

X shape: (989541, 89)


 42%|████▏     | 43/103 [1:52:00<2:23:57, 143.95s/it]

X shape: (999017, 89)


 43%|████▎     | 44/103 [1:54:36<2:25:01, 147.48s/it]

X shape: (992761, 89)


 44%|████▎     | 45/103 [1:57:23<2:28:06, 153.21s/it]

X shape: (941752, 89)


 45%|████▍     | 46/103 [1:59:48<2:23:27, 151.00s/it]

X shape: (1000000, 89)


 46%|████▌     | 47/103 [2:02:34<2:25:04, 155.44s/it]

X shape: (1000000, 89)


 47%|████▋     | 48/103 [2:05:31<2:28:21, 161.85s/it]

X shape: (997298, 89)


 48%|████▊     | 49/103 [2:08:10<2:24:48, 160.89s/it]

X shape: (1000000, 89)


 49%|████▊     | 50/103 [2:11:15<2:28:37, 168.26s/it]

X shape: (1000000, 89)


 50%|████▉     | 51/103 [2:14:14<2:28:30, 171.35s/it]

X shape: (296660, 89)


 50%|█████     | 52/103 [2:14:48<1:50:41, 130.23s/it]

X shape: (1000000, 89)


 51%|█████▏    | 53/103 [2:17:03<1:49:44, 131.70s/it]

X shape: (1000000, 89)


 52%|█████▏    | 54/103 [2:19:29<1:51:08, 136.09s/it]

X shape: (1000000, 89)


 53%|█████▎    | 55/103 [2:22:28<1:58:59, 148.73s/it]

X shape: (993107, 89)


 54%|█████▍    | 56/103 [2:25:13<2:00:27, 153.77s/it]

X shape: (1000000, 89)


 55%|█████▌    | 57/103 [2:27:50<1:58:41, 154.81s/it]

X shape: (1000000, 89)


 56%|█████▋    | 58/103 [2:30:30<1:57:10, 156.24s/it]

X shape: (1000000, 89)


 57%|█████▋    | 59/103 [2:33:33<2:00:26, 164.23s/it]

X shape: (1000000, 89)


 58%|█████▊    | 60/103 [2:35:47<1:51:16, 155.28s/it]

X shape: (999785, 89)


 59%|█████▉    | 61/103 [2:37:55<1:42:55, 147.04s/it]

X shape: (999998, 89)


 60%|██████    | 62/103 [2:40:06<1:37:11, 142.24s/it]

X shape: (1000000, 89)


 61%|██████    | 63/103 [2:42:55<1:40:11, 150.29s/it]

X shape: (969698, 89)


 62%|██████▏   | 64/103 [2:45:27<1:38:02, 150.85s/it]

X shape: (1000000, 89)


 63%|██████▎   | 65/103 [2:48:04<1:36:40, 152.63s/it]

X shape: (1000000, 89)


 64%|██████▍   | 66/103 [2:50:09<1:28:59, 144.30s/it]

X shape: (991113, 89)


 65%|██████▌   | 67/103 [2:52:42<1:28:08, 146.90s/it]

X shape: (1000000, 89)


 66%|██████▌   | 68/103 [2:54:23<1:17:44, 133.27s/it]

X shape: (1000000, 89)


 67%|██████▋   | 69/103 [2:57:27<1:23:59, 148.22s/it]

X shape: (1000000, 89)


 68%|██████▊   | 70/103 [2:59:43<1:19:31, 144.58s/it]

X shape: (1000000, 89)


 69%|██████▉   | 71/103 [3:02:34<1:21:21, 152.56s/it]

X shape: (1000000, 89)


 70%|██████▉   | 72/103 [3:05:36<1:23:27, 161.52s/it]

X shape: (1000000, 89)


 71%|███████   | 73/103 [3:08:24<1:21:39, 163.30s/it]

X shape: (1000000, 89)


 72%|███████▏  | 74/103 [3:10:40<1:15:02, 155.27s/it]

X shape: (999995, 89)


 73%|███████▎  | 75/103 [3:12:02<1:02:09, 133.21s/it]

X shape: (999994, 89)


 74%|███████▍  | 76/103 [3:13:58<57:40, 128.18s/it]  

X shape: (998373, 89)


 75%|███████▍  | 77/103 [3:16:44<1:00:26, 139.47s/it]

X shape: (216920, 89)


 76%|███████▌  | 78/103 [3:17:20<45:09, 108.37s/it]  

X shape: (1000000, 89)


 77%|███████▋  | 79/103 [3:20:28<52:52, 132.21s/it]

X shape: (1000000, 89)


 78%|███████▊  | 80/103 [3:23:09<54:03, 141.03s/it]

X shape: (1000000, 89)


 79%|███████▊  | 81/103 [3:26:00<54:55, 149.82s/it]

X shape: (1000000, 89)


 80%|███████▉  | 82/103 [3:28:41<53:36, 153.15s/it]

X shape: (1000000, 89)


 81%|████████  | 83/103 [3:31:05<50:07, 150.38s/it]

X shape: (1000000, 89)


 82%|████████▏ | 84/103 [3:34:02<50:11, 158.48s/it]

X shape: (1000000, 89)


 83%|████████▎ | 85/103 [3:36:50<48:23, 161.29s/it]

X shape: (1000000, 89)


 83%|████████▎ | 86/103 [3:39:38<46:17, 163.41s/it]

X shape: (1000000, 89)


 84%|████████▍ | 87/103 [3:42:23<43:42, 163.91s/it]

X shape: (1000000, 89)


 85%|████████▌ | 88/103 [3:44:50<39:43, 158.90s/it]

X shape: (1000000, 89)


 86%|████████▋ | 89/103 [3:47:47<38:19, 164.27s/it]

X shape: (1000000, 89)


 87%|████████▋ | 90/103 [3:50:28<35:23, 163.31s/it]

X shape: (1000000, 89)


 88%|████████▊ | 91/103 [3:52:40<30:45, 153.79s/it]

X shape: (1000000, 89)


 89%|████████▉ | 92/103 [3:55:33<29:15, 159.58s/it]

X shape: (1000000, 89)


 90%|█████████ | 93/103 [3:58:12<26:32, 159.26s/it]

X shape: (1000000, 89)


 91%|█████████▏| 94/103 [4:01:01<24:21, 162.43s/it]

X shape: (1000000, 89)


 92%|█████████▏| 95/103 [4:03:30<21:05, 158.22s/it]

X shape: (1000000, 89)


 93%|█████████▎| 96/103 [4:06:15<18:42, 160.35s/it]

X shape: (1000000, 89)


 94%|█████████▍| 97/103 [4:08:54<15:59, 159.99s/it]

X shape: (995377, 89)


 95%|█████████▌| 98/103 [4:10:59<12:26, 149.38s/it]

X shape: (1000000, 89)


 96%|█████████▌| 99/103 [4:13:13<09:39, 144.87s/it]

X shape: (999867, 89)


 97%|█████████▋| 100/103 [4:14:44<06:26, 128.75s/it]

X shape: (999998, 89)


 98%|█████████▊| 101/103 [4:17:35<04:42, 141.26s/it]

X shape: (1000000, 89)


 99%|█████████▉| 102/103 [4:20:31<02:31, 151.76s/it]

X shape: (1000000, 89)


100%|██████████| 103/103 [4:22:57<00:00, 149.89s/it]

X shape: (588960, 89)


104it [4:23:26, 113.63s/it]                         

X shape: (1000000, 89)


105it [4:26:32, 135.58s/it]

X shape: (634040, 89)


106it [4:28:15, 151.85s/it]


Start evaluating model randomforestclassifier-sleep-n_estimators__1000-max_depth__20-random_state__0-warm_start__true


  0%|          | 0/25 [00:14<?, ?it/s]


ValueError: The feature names should match those that were passed during fit.
Feature names seen at fit time, yet now missing:
- pred_worn


In [17]:
# Re-run of the sleep-model (m2) evaluation on the validation split; relies on
# `m`, `m2` and `hyperparams` left behind by the pipeline cell.
print(f'Start evaluating model {m2.identifier}')
score_value_average_precision_score_m2 = []
score_value_recall_m2 = []
score_value_precision_m2 = []

if hyperparams['training_type'] == ModelTrainingType.BATCH:
    validation_batches = batched_dataloader(batch_size=hyperparams['batch_size'], validation=True)
else:
    validation_batches = dataloader_full_dataset(validation=True)
expected_batches = validation_dataset_length // hyperparams["batch_size"]

for validation in tqdm(validation_batches, total=expected_batches):
    # target first, then turn the frame into the feature matrix
    validation_y = validation['awake']
    validation['series_id'] = validation['series_id'].map(series_id_mapping['validation'])
    validation.drop(['wearable_on', 'awake'], axis=1, inplace=True)

    # keep only the rows the wear model predicts as worn
    worn_y_hat = m.predict(validation, not_scaled=False)
    validation['pred_worn'] = worn_y_hat
    validation = validation.loc[validation['pred_worn'] == 1]
    validation_y = validation_y.loc[validation_y.index.isin(validation.index)]

    validation_y_hat = m2.predict(validation, not_scaled=False)
    # evaluate(None, y, prepredicted=y_hat) does the same as scoreFx(y, y_hat);
    # with prepredicted=None the model would predict on its first argument instead
    for collected_scores, score_fx in (
        (score_value_average_precision_score_m2, average_precision_score),
        (score_value_recall_m2, recall_score),
        (score_value_precision_m2, precision_score),
    ):
        collected_scores.append(
            m2.evaluate(None, validation_y, scoreFx=score_fx, not_scaled=False, prepredicted=validation_y_hat)
        )

Start evaluating model randomforestclassifier-sleep-n_estimators__1000-max_depth__20-random_state__0-warm_start__true


26it [51:10, 118.11s/it]                        


In [18]:
# Unweighted mean over the per-batch scores, one value per line:
# average precision, recall, precision.
print(
    sum(score_value_average_precision_score_m2) / len(score_value_average_precision_score_m2),
    sum(score_value_recall_m2) / len(score_value_recall_m2),
    sum(score_value_precision_m2) / len(score_value_precision_m2),
    sep='\n',
)

0.9607095567837105
0.9281085846082481
0.9731355200195849


In [14]:
# Pickle the estimator wrapped by `m` to disk.
# open() does not create missing directories, so make sure the target folder exists first.
Path('models').mkdir(parents=True, exist_ok=True)
with open('models/AAAA.pkl', 'wb') as f:
    dump(m._model, f)

In [15]:
# Pickle the estimator wrapped by `m2` to disk.
# open() does not create missing directories, so make sure the target folder exists first.
Path('models').mkdir(parents=True, exist_ok=True)
with open('models/BBBB.pkl', 'wb') as f:
    dump(m2._model, f)

In [None]:
# Display the feature importances of the estimator wrapped by `m`.
m._model.feature_importances_

array([0.26062016, 0.00587829, 0.0036312 , 0.0067284 , 0.00313009,
       0.0023635 , 0.13000739, 0.08961576, 0.03968891, 0.00650785,
       0.004985  , 0.00574397, 0.00601681, 0.00674708, 0.00916001,
       0.00645703, 0.01008137, 0.00771209, 0.00460338, 0.00187635,
       0.00212878, 0.00584396, 0.00304738, 0.00260252, 0.0102062 ,
       0.00257583, 0.01008914, 0.00239926, 0.0018233 , 0.0074098 ,
       0.01305531, 0.0105764 , 0.01563973, 0.01248217, 0.00622891,
       0.01374397, 0.01110173, 0.00869639, 0.00325989, 0.00711063,
       0.00839432, 0.0102059 , 0.00896969, 0.01967453, 0.00406012,
       0.0162956 , 0.00609977, 0.00351079, 0.00349858, 0.00432328,
       0.00430827, 0.00365324, 0.00467349, 0.00531373, 0.00296852,
       0.00231341, 0.00493366, 0.00602081, 0.00053684, 0.00281932,
       0.00216382, 0.00211915, 0.00397052, 0.00743335, 0.00120637,
       0.00171118, 0.00238241, 0.00091997, 0.00588115, 0.0042996 ,
       0.00563069, 0.00923931, 0.00519159, 0.00797593, 0.00543

In [6]:
# Create a validation batch loader (batch_size=400_000) for ad-hoc inspection of the columns.
ff = batched_dataloader(validation=True, batch_size=400_000)

In [7]:
# Pull the next batch from the loader.
fg = next(ff)

In [8]:
# Remove the two label columns in place so only the model inputs remain.
fg.drop(columns=['wearable_on', 'awake'], inplace=True)

In [9]:
# Add a constant placeholder column 'pred_worn'.
# NOTE(review): presumably only needed so the column list matches the features the model was fitted with — confirm.
fg['pred_worn'] = 0

In [10]:
# Show the resulting column names.
fg.columns


Index(['series_id', 'step', 'anglez', 'enmo', 'hour', 'minute', 'seconds',
       'day', 'month', 'year', 'anglez_abs', 'anglez_rolling_mean_12',
       'anglez_rolling_sum_12', 'anglez_rolling_max_12',
       'anglez_rolling_min_12', 'anglez_rolling_std_12',
       'anglez_rolling_median_12', 'anglez_rolling_variance_12',
       'anglez_rolling_25th_percentile_12',
       'anglez_rolling_75th_percentile_12', 'anglez_diff_12',
       'anglez_diff_rolling_mean_12', 'anglez_diff_rolling_sum_12',
       'anglez_diff_rolling_max_12', 'anglez_diff_rolling_min_12',
       'anglez_diff_rolling_std_12', 'anglez_diff_rolling_median_12',
       'anglez_diff_rolling_variance_12',
       'anglez_diff_rolling_25th_percentile_12',
       'anglez_diff_rolling_75th_percentile_12', 'anglez_rolling_mean_60',
       'anglez_rolling_sum_60', 'anglez_rolling_max_60',
       'anglez_rolling_min_60', 'anglez_rolling_std_60',
       'anglez_rolling_median_60', 'anglez_rolling_variance_60',
       'anglez_roll

In [None]:
# Pair each column name of `fg` with the importance at the same position and
# list the pairs from the highest to the lowest importance.
sorted(
    zip(fg.columns, m._model.feature_importances_),
    key=lambda name_and_importance: name_and_importance[1],
    reverse=True,
)

[('step', 0.2606201637780376),
 ('day', 0.13000738739185558),
 ('month', 0.08961576019141494),
 ('year', 0.0396889064974704),
 ('anglez_diff_rolling_std_60', 0.01967452775656659),
 ('anglez_diff_rolling_variance_60', 0.016295595479441097),
 ('anglez_rolling_min_60', 0.015639734409586502),
 ('anglez_rolling_variance_60', 0.013743969730654821),
 ('anglez_rolling_sum_60', 0.01305531394729148),
 ('anglez_rolling_std_60', 0.012482167698995373),
 ('anglez_rolling_25th_percentile_60', 0.011101729729531409),
 ('anglez_rolling_max_60', 0.010576399763930622),
 ('anglez_diff_rolling_std_12', 0.010206195539808283),
 ('anglez_diff_rolling_max_60', 0.010205901807059478),
 ('anglez_diff_rolling_variance_12', 0.01008914374048936),
 ('anglez_rolling_variance_12', 0.010081370058817232),
 ('enmo_rolling_min_60', 0.009239309666437786),
 ('anglez_rolling_std_12', 0.009160005279233812),
 ('anglez_diff_rolling_min_60', 0.008969692245192491),
 ('anglez_rolling_75th_percentile_60', 0.008696389961548387),
 ('an

In [None]:
# NOTE(review): this cell always raises. Presumably it is a deliberate stop marker so that
# "Run All" halts here, before the unfinished cells below — confirm, and remove it once they are ready.
raise ValueError('test')

1. Add the output of the **BEST** model 1 to the train dataset as a new column [pred_wearable_on]. The result must be saved to disk in batches!
2. Feed the new training dataset to the second model:
   1. X of the second model is every column except the dropped ones: `X = batch.drop(['wearable_on', 'awake', 'wakeup', 'onset'], axis=1)`
   2. y of the second model is `y = batch['awake']`

- Do not forget to map the series ID
- All rows where pred_wearable_on == 0 must be dropped from the training set for the second model.
  This can be done by adding a filter to the parquet reader like so:
  ```python
  table = pq.read_table("example.parquet",
                      columns=["col1"],
                      filters=[
                          ("col1", ">", 5),
                          ("col1", "<", 10),
                      ])
  ```


1. The output of the **BEST** second model [pred_awake] must be passed through the heuristics function, along with:
   1. the following data fields: ['series_id', 'ALL TIME FIELDS', 'pred_awake', ??? others ???]


## Duplication of first pipeline

This pipeline's purpose is to train the second model.

In [None]:
# Second pipeline: builds every model/hyperparameter combination from the dictionary below,
# trains it on the `awake` target (optionally in batches) and evaluates it on the validation set.

# define all the combinations of models and features
models_and_hyperparams = {
    RandomForestClassifier: {
        'params': {
            'n_estimators': [200, 300, 400],
            'max_depth': [10, 20, 30, None],
            'random_state': [0],
            'criterion': ['gini', 'entropy', 'log_loss'],
            'warm_start': [True]
        },
        'modeltype': SkLearnModel,
        'scaler': StandardScaler,
        'training_type': ModelTrainingType.BATCH,
        'batch_size': 5_000_000
    }
}

# define scaler
scaler_type = StandardScaler
pretrained_scalers = dict() # used to store trained scalers for later use

# loop over all combinations of hyperparameters for every model type
for model_type, hyperparams in models_and_hyperparams.items():
    if not hyperparams['params']:
        # without hyperparams there is nothing to build the cartesian product from
        print("params in the dictionary cannot be empty. Use the standard values in the dictionary for the model", model_type.__name__)
        continue

    batch_mode = hyperparams['training_type'] == ModelTrainingType.BATCH

    # build one dict per combination and pass it to the model as parameters
    fx_param_names, fx_param_values = zip(*hyperparams['params'].items())
    for cartesian_product_values in product(*fx_param_values):
        hyperparam_dict = dict(zip(fx_param_names, cartesian_product_values))

        # create or reuse the scaler specified in the models_and_hyperparams dictionary
        # if hyperparams['scaler'] in pretrained_scalers:
        #     print(f'Using pretrained scaler {hyperparams["scaler"].__name__}')
        #     scaler = pretrained_scalers[hyperparams['scaler']]
        # else:
        #     scaler = hyperparams['scaler']()
        #     print(f'Start fitting scaler {hyperparams["scaler"].__name__}')
        #     for batch in tqdm(dataloader(batch_size=15_000_000)):
        #         X = batch.drop(['wearable_on', 'awake', 'wakeup', 'onset'], axis=1)
        #         X['series_id'] = X['series_id'].map(series_id_mapping['train'])
        #         scaler.partial_fit(X)
        #         del X
        #         gc.collect()
        #     # save the scaler for later use
        #     pretrained_scalers[hyperparams['scaler']] = scaler

        # create the model from the modeltype and the hyperparam_dict
        m = hyperparams['modeltype'](
            model_type(**hyperparam_dict),
            f'{model_type.__name__}-{"-".join([f"{na}__{str(va)}" for na, va in hyperparam_dict.items()])}'.lower(),
            #pretrained_scalers[hyperparams['scaler']],
            StandardScaler(),
            hyperparam_dict,
        )

        #TODO: uncomment, standardscaler, not_scaled=True

        # init wandb
        # start a new wandb run to track this script
        #cfg = m._model_params.copy()
        #cfg['model'] = m._model.__name__
        #wandb.init(project="classic_models", config=cfg)

        # train model
        # Number of estimators to add per batch; stays None unless a random forest is
        # trained in batches, in which case the forest is grown incrementally.
        estimators_per_batch = None
        if batch_mode:
            # Round up so a trailing partial batch is counted, and never go below 1 so the
            # division below cannot fail when the batch size exceeds the dataset length.
            rounds = max(1, ceil(train_dataset_length / hyperparams["batch_size"]))
            print(f'Model will be traind in batches of {hyperparams["batch_size"]} samples.')
            print(f'Dataset contains {train_dataset_length} samples. {rounds} batches will be used when fitting the model.')
            if model_type == RandomForestClassifier:
                # at least one estimator per batch, otherwise nothing would be added
                estimators_per_batch = max(1, hyperparam_dict["n_estimators"] // rounds)
                print(f'At the end {hyperparam_dict["n_estimators"]} estimators will be fitted. That means, {estimators_per_batch} estimators per batch.')
                m._model.n_estimators = 0 #set to 0. model will increase the number of estimators in each round

        print(f'Start training model {m.identifier}')
        # the full dataset is wrapped in a list so that both modes share the same loop
        training_batches = dataloader(batch_size=hyperparams['batch_size']) if batch_mode else [dataloader_full_dataset()]
        for batch in tqdm(training_batches):
            X = batch.drop(['wearable_on', 'awake', 'wakeup', 'onset'], axis=1)
            X['series_id'] = X['series_id'].map(series_id_mapping['train'])
            y = batch['awake']
            args = {}
            if estimators_per_batch is not None:
                args['add_estimators'] = estimators_per_batch
            m.train(X, y, not_scaled=False, **args)
            # free the batch before the next one is loaded
            del X
            del y
            gc.collect()

        #y_hat = configured_model.predict(X_test)

        #  do the heuristic part...

        #  evaluate model
        print(f'Start evaluating model {m.identifier}')
        validation = dataloader_full_dataset(validation=True)
        validation['series_id'] = validation['series_id'].map(series_id_mapping['validation'])
        validation_y = validation['awake']
        validation.drop(['wearable_on', 'awake', 'wakeup', 'onset'], axis=1, inplace=True)
        validation_y_hat = m.predict(validation, not_scaled=False)
        # this does the same as scoreFX(y, y_hat). It is wrapped in the models evaluate function.
        # the model itself could also do the prediction if prepredicted is None
        # then, the first argument of the evaluate function would be the X_validation data
        # NOTE(review): average_precision_score is meant for continuous scores; if predict()
        # returns hard labels the value is only a coarse approximation — confirm.
        score_value_average_precision_score = m.evaluate(None, validation_y, scoreFx=average_precision_score, not_scaled=False, prepredicted=validation_y_hat)
        score_value_recall = m.evaluate(None, validation_y, scoreFx=recall_score, not_scaled=False, prepredicted=validation_y_hat)
        score_value_precision = m.evaluate(None, validation_y, scoreFx=precision_score, not_scaled=False, prepredicted=validation_y_hat)
        print(score_value_average_precision_score)
        print(score_value_recall)
        print(score_value_precision)
        del validation, validation_y

        #  save model
        #m.save()


        # delete model to free up memory
        #del m

        # NOTE(review): stops after the first hyperparameter combination, so the remaining
        # combinations are never trained — remove to run the full grid.
        break

        #wandb.finish()

In [None]:
#TODO: https://arrow.apache.org/cookbook/py/io.html#id11
# for split files