# Notebook to solve the kaggle competition "Child Mind Institute - Detect Sleep States" with a neural network
Link to the competition: https://www.kaggle.com/competitions/child-mind-institute-detect-sleep-states


### Install packages

In [1]:
!pip install matplotlib plotly pandas numpy tqdm scikit-learn pyarrow




[notice] A new release of pip is available: 23.1.2 -> 23.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import wandb
# wandb.login() requires you to get your API key from your account settings
# open the weights and biases website https://wandb.ai/login and login to your account
# then go to your account settings and copy the API key
# paste it in the input box and hit enter

#wandb.login() #TODO: uncomment this line to login to your account

In [3]:
# # start a new wandb run to track this script
# wandb.init(
#     # set the wandb project where this run will be logged
#     project="my-awesome-project",
    
#     # track hyperparameters and run metadata
#     config={
#     "learning_rate": 0.02,
#     "architecture": "CNN",
#     "dataset": "CIFAR-100",
#     "epochs": 10,
#     }
# )

To be able to switch between different models from different libraries at a glance, we implement an interface called `IPipelineRequirements`. This allows us to make the pipleine even more robust and easier to extend upon. 

In [4]:
from abc import ABC, abstractmethod #, classmethod
import os

class IPipelineRequirements(ABC):
    @abstractmethod
    def train(self):
        pass

    @abstractmethod
    def predict(self):
        pass

    @abstractmethod
    def save(self):
        pass

    @abstractmethod
    def load(self):
        pass

    @abstractmethod
    def evaluate(self):
        pass


Our first model is the baseline model, which just takes the mean over all onset and wakeup times and tries to predict `onset` and `wakeup` events with the calculated time.

In [5]:
class BaselineModel(IPipelineRequirements):
    def __init__(self):
        pass

    def train(self):
        raise NotImplementedError("Please Implement this method")

    def predict(self):
        raise NotImplementedError("Please Implement this method")

    def save(self):
        raise NotImplementedError("Please Implement this method")

    def load(self):
        raise NotImplementedError("Please Implement this method")

    def evaluate(self):
        raise NotImplementedError("Please Implement this method")


In [6]:
g = BaselineModel()

Numerous studies have concentrated on applications using `RandomForest`. The primary motivation for this preference is the model's inherent transparency in decision-making processes, which are readily identifiable in such models. Subsequent to the BaselineModel, efforts have been made to abstract models from the Scikit-learn library. Fortunately, the majority of models within their API exhibit consistent implementation patterns, facilitating their integration into the processing pipeline.

In [24]:
from sklearn.preprocessing import StandardScaler
from pickle import dump, load

class SkLearnModel(IPipelineRequirements):

    def __init__(self, model_path):
        self.load(model_path)

    def __init__(self, model, identifier, scaler=StandardScaler, sk_model_params=None):
        self._model = model
        self._scaler = scaler if not callable(scaler) else scaler() # if scaler is a classpointer, instantiate it
        self.identifier = identifier
        self._model_params = sk_model_params

    def set_pretrained_scaler(self, scaler):
        self._scaler = scaler

    def train(self, X, y, not_scaled=False, sk_fit_params=None):
        if not_scaled:
            X = self._scaler.transform(X)

        # due to the big dataset we need to check wich model was instantiated and do some model
        # specific stuff to enable us to train the model in batches
        if isinstance(self._model, RandomForestClassifier):
            added_estimators = 75
            if self._model.warm_start: self._model.n_estimators += added_estimators
            print(f'Current estimator increased to {self._model.n_estimators}, {added_estimators} added this round.')
        if isinstance(self._model, SVC):
            pass

        if sk_fit_params:
            self._model.fit(X, y, **sk_fit_params)
        else:
            self._model.fit(X, y)

    def predict(self, X, not_scaled=False):
        if not self._model: raise ValueError('Please load or train a model first.')
        if not_scaled:
            if not self._scaler: raise ValueError('Please load or fit a scaler first.')
            X = self._scaler.transform(X)
        return self._model.predict(X)

    def save(self):
        try:
            with open(f'model_{self.identifier}.pkl', 'wb') as f:
                dump(self._model, f)
            with open(f'scaler_{self.identifier}.pkl', 'wb') as f:
                dump(self._scaler, f)
        except:
            raise ValueError('Unable to save model and scaler.')

    def load(self, filepath):
        try:
            # load model and scaler
            if os.path.isfile(filepath):
                print(f'Loading model from {filepath}')
                with open(filepath, 'rb') as f:
                    self._model = load(f)
            scaler_path = f'{os.path.split(filepath)[0]}/scaler_{os.path.split(filepath)[1].split(".")[0].split("_")[1]}.pkl'
            if os.path.isfile(scaler_path):
                print(f'Loading scaler from {scaler_path}')
                with open(filepath, 'rb') as f:
                    self._scaler = load(f)

            # extract identifier from filename
            self._identifier = filepath.split('_')[1].split('.')[0]
        except (FileNotFoundError) as e:
                print(f'File {e} not found')
                raise ValueError('Filepath is not valid. Unable to load model and scaler.')
        except (IndexError) as e:
                print(f'The name of the file does not implement the convention "<model|scaler>_<some identifier>".')
                raise ValueError('Filepath is not valid. Unable to load model and scaler.')
        except (Exception) as e:
            print(f'Something went wrong. {e}')
            raise ValueError('Unknown Error.')

    def evaluate(self, X, y, scoreFx=None, not_scaled=False):
        if not scoreFx:
            raise ValueError('Please provide a score function.')
        if not self._model:
            raise ValueError('Please load or train a model first.')
        if not_scaled:
            if not self._scaler: raise ValueError('Please load or fit a scaler first.')
            X = self._scaler.transform(X)
            
        #TODO: Does score FX require an array of predictions or a single prediction?
        #TODO: TEST PARAMS OF SCORE FX!!!!
        return scoreFx(y, self._model.predict(X))


### Heuristics method to get the right event under the constraints given by Child Mind Institue

* A single sleep period must be at least 30 minutes in length
* A single sleep period can be interrupted by bouts of activity that do not exceed 30 consecutive minutes
* No sleep windows can be detected unless the watch is deemed to be worn for the duration (elaborated upon, below)
* The longest sleep window during the night is the only one which is recorded
* If no valid sleep window is identifiable, neither an onset nor a wakeup event is recorded for that night.
* Sleep events do not need to straddle the day-line, and therefore there is no hard rule defining how many may occur within a given period. However, no more than one window should be assigned per night. For example, it is valid for an individual to have a sleep window from 01h00–06h00 and 19h00–23h30 in the same calendar day, though assigned to consecutive nights
* There are roughly as many nights recorded for a series as there are 24-hour periods in that series.

In [8]:
def heuristic_filter(data):
    pass

### Import the training and the validation data

In [9]:
#import libraries
# import pandas as pd
# import pyarrow.parquet as pq
# import gc

# parquet_file = pq.ParquetFile('../data/train_20231021')
# for i in parquet_file.iter_batches(batch_size=5_000_000):
#     train = i.to_pandas()
#     break


# train = pd.read_parquet('../data/train_20231021')
# validation = pd.read_parquet('../data/validation_20231021')

In [10]:
# series_id_mapping = dict()
# for i, data in enumerate(train.series_id.unique()):
#     series_id_mapping[data] = i
# train['series_id'] = train['series_id'].map(series_id_mapping)

In [11]:
# # fit the scaler
# scr = StandardScaler()
# for serie in tqdm(train.series_id.unique()):
#     X = train[train.series_id == serie].drop(['wearable_on', 'awake', 'wakeup', 'onset'], axis=1)
#     #y = train[train.series_id == serie]['wearable_on']
#     scr.partial_fit(X)
#     del X
#     #del y
#     gc.collect()


In [12]:
# # train the model
# testmd = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=0,warm_start=True)
# for serie in tqdm(train.series_id.unique()):
#     X = train[train.series_id == serie].drop(['wearable_on', 'awake', 'wakeup', 'onset'], axis=1)
#     y = train[train.series_id == serie]['wearable_on']
#     testmd.fit(scr.transform(X), y)
#     testmd.n_estimators += 10
#     del X
#     del y
#     gc.collect()


In [13]:
#with open(f'TEST.pkl', 'wb') as f:
#    dump(testmd, f)
#with open(f'TEST_SCALER.pkl', 'wb') as f:
#    dump(scr, f)

In [14]:
# validation = pd.read_parquet('../data/validation_20231021')
# validation = validation[["step", "anglez", "enmo", "hour", "minute", "seconds", "day", "month", "year", "seconds_from_midnight", "awake", "wearable_on", "series_id", "onset", "wakeup"]]
# series_id_mapping_va = dict()
# for i, data in enumerate(validation.series_id.unique()):
#     series_id_mapping_va[data] = i
# validation['series_id'] = validation['series_id'].map(series_id_mapping_va)

In [15]:
# res_f1 = []
# res_recall = []
# res_precision = []

# for serie in tqdm(validation.series_id.unique()):
#     X = validation[validation.series_id == serie].drop(['wearable_on', 'awake', 'wakeup', 'onset'], axis=1)
#     y = validation[validation.series_id == serie]['wearable_on']
#     y_pred = testmd.predict(scr.transform(X))
#     res_f1.append(f1_score(y, y_pred))
#     res_recall.append(recall_score(y, y_pred))
#     res_precision.append(precision_score(y, y_pred))



In [16]:
#sum(res_f1)/len(res_f1)

In [17]:
#sum(res_precision)/len(res_precision)

In [18]:
#sum(res_recall)/len(res_recall)

In [19]:
#pd.DataFrame(y_pred).value_counts()

In [20]:
raise(ValueError('test'))

ValueError: test

In [21]:
import pandas as pd
import pyarrow.parquet as pq
import gc

TRAIN_DATA = '../data/train_20231021'
TRAIN_DATA = '../data/train_20231021_20M'
VALIDATION_DATA = '../data/validation_20231021'

# figure out series id mapping
if not 'series_id_mapping' in vars():
    series_id_mapping = {'train': dict(), 'validation': dict()}
    t = pq.ParquetDataset(TRAIN_DATA).read(columns=['series_id'])
    for i, data in enumerate(t.to_pandas()['series_id'].unique()):
        series_id_mapping['train'][data] = i
    v = pq.ParquetDataset(VALIDATION_DATA).read(columns=['series_id'])
    for i, data in enumerate(v.to_pandas()['series_id'].unique()):
        series_id_mapping['validation'][data] = i
    del t, v
    gc.collect()

def dataloader_full_dataset(validation=False):
    return pd.read_parquet(VALIDATION_DATA if validation else TRAIN_DATA)

def dataloader(validation=False, batch_size=5_000_000):
    parquet_file = pq.ParquetFile(VALIDATION_DATA if validation else TRAIN_DATA)
    for i in parquet_file.iter_batches(batch_size=batch_size):
        yield i.to_pandas()


### Run the pipeline

In [26]:
# pipeline base idea
# import all models and the score function
from itertools import product
from tqdm import tqdm
from score import score
#from sklearn.linear_model import LinearRegression
#from sklearn.preprocessing import PolynomialFeatures
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
# from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import f1_score, recall_score, precision_score, average_precision_score

# define all the combinations of models and features
models_and_hyperparams = {
    RandomForestClassifier: {
        'params': {
            'n_estimators': [0, 100, 200],
            #'n_estimators': [200, 300, 400],
            'max_depth': [10, 10, 20, None],
            'random_state': [0],
            #'criterion': ['gini', 'entropy', 'log_loss],
            'warm_start': [True]
        },
        'modeltype': SkLearnModel,
        'scaler': StandardScaler
    },
    # SVC: {
    #     'params': {
    #         'kernel': ['rbf', 'poly', 'poly', 'poly', 'sigmoid'],
    #         'degree': [3, 4, 5],
    #         'C': [1, 10, 100, 1000],
    #         'gamma': ['scale', 'auto']
    #     },
    #     'modeltype': SkLearnModel,
    #     'scaler': StandardScaler
    # },
    # KNeighborsClassifier: {
    #     'params': {
    #         'n_neighbors': [5, 10, 15, 20],
    #         'weights': ['uniform', 'distance'],
    #         'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
    #         'leaf_size': [10, 20, 30, 40, 50],
    #         'p': [1, 2]
    #     },
    #     'modeltype': SkLearnModel,
    #     'scaler': StandardScaler
    # }
}

# split data
X_train, X_test, y_train, y_test = None, None, None, None

# define scaler
scaler_type = StandardScaler
pretrained_scalers = dict() # used to store trained scalers for later use

# loop over all combinations and append it to the configurations list
# if there are no hyperparams, just instantiate the model without params
for model_type, hyperparams in models_and_hyperparams.items():
    if len(hyperparams['params']) > 0: # if there are hyperparams, build a dict and pass it to the model as parameters
        fx_param_names, fx_param_values = zip(*hyperparams['params'].items())
        for cartesian_product_values in product(*fx_param_values):
            hyperparam_dict = dict(zip(fx_param_names, cartesian_product_values))

            # create or reuse the scaler specified in the models_and_hyperparams dictionary
            # if hyperparams['scaler'] in pretrained_scalers:
            #     print(f'Using pretrained scaler {hyperparams["scaler"].__name__}')
            #     scaler = pretrained_scalers[hyperparams['scaler']]
            # else:
            #     scaler = hyperparams['scaler']()
            #     print(f'Start fitting scaler {hyperparams["scaler"].__name__}')
            #     for batch in tqdm(dataloader(batch_size=15_000_000)):
            #         X = batch.drop(['wearable_on', 'awake', 'wakeup', 'onset'], axis=1)
            #         X['series_id'] = X['series_id'].map(series_id_mapping['train'])
            #         scaler.partial_fit(X)
            #         del X
            #         gc.collect()
            #     # save the scaler for later use
            #     pretrained_scalers[hyperparams['scaler']] = scaler

            # create the model from the modeltype and the hyperparam_dict
            m = hyperparams['modeltype'](model_type(**hyperparam_dict), \
                                        f'{model_type.__name__}-{"-".join([f"{na}__{str(va)}" for na, va in hyperparam_dict.items()])}'.lower(), \
                                        #pretrained_scalers[hyperparams['scaler']], \
                                        StandardScaler(), \
                                        hyperparam_dict)
            

#TODO: uncemment, standardscaler , not_scaled=True


            
            # init wandb
            # start a new wandb run to track this script
            #cfg = m._model_params.copy()
            #cfg['model'] = m._model.__name__
            #wandb.init(project="classic_models", config=cfg)

            # train model
            print(f'Start training model {m.identifier}')
            for batch in tqdm(dataloader(batch_size=5_000_000)):
                X = batch.drop(['wearable_on', 'awake', 'wakeup', 'onset'], axis=1)
                X['series_id'] = X['series_id'].map(series_id_mapping['train'])
                y = batch['wearable_on']
                m.train(X, y, not_scaled=False)
                del X
                del y
                gc.collect()

            #y_hat = configured_model.predict(X_test)

            #  do the heuristic part...

            #  evaluate model
            print(f'Start evaluating model {m.identifier}')
            validation = dataloader_full_dataset(validation=True)
            validation['series_id'] = validation['series_id'].map(series_id_mapping['validation'])
            validation_y = validation['wearable_on']
            validation.drop(['wearable_on', 'awake', 'wakeup', 'onset'], axis=1, inplace=True)
            score_value_average_precision_score = m.evaluate(validation, validation_y, scoreFx=average_precision_score, not_scaled=False)
            score_value_recall = m.evaluate(validation, validation_y, scoreFx=recall_score, not_scaled=False)
            score_value_precision = m.evaluate(validation, validation_y, scoreFx=precision_score, not_scaled=False)
            print(score_value_average_precision_score)
            print(score_value_recall)
            print(score_value_precision)
            del validation, validation_y

            #  save model
            #m.save()


            # delete model to free up memory
            #del m

            break

            #wandb.finish()
    else:
        print("params in the dictionary cannot be empty. Use the standard values in the dictionary for the model", model_type.__name__)

Start training model randomforestclassifier-n_estimators__0-max_depth__10-random_state__0-warm_start__true


0it [00:00, ?it/s]

Current estimator increased to 75, 75 added this round.


1it [06:17, 377.12s/it]

Current estimator increased to 150, 75 added this round.


2it [13:30, 410.00s/it]

Current estimator increased to 225, 75 added this round.


3it [20:16, 408.53s/it]

Current estimator increased to 300, 75 added this round.


4it [26:55, 403.75s/it]


Start evaluating model randomforestclassifier-n_estimators__0-max_depth__10-random_state__0-warm_start__true
0.6638572042117677
0.9785789916554606
0.6640060035666919


In [None]:
raise ValueError('test')

In [None]:
#m._scaler.transform(X)[0]
#TODO: LOAD alL DATA AND TRY TO SCALE IT? look at the first row if the data are scaeld equally

array([-1.40547154, -2.26902544, -0.2318546 , -0.07499694, -1.70340755,
       -1.59325501, -1.12511605,  0.42450226, -0.0196756 , -0.14689035,
       -1.75911204])

In [None]:
del validation
gc.collect()

NameError: name 'validation' is not defined

In [None]:
StandardScaler in pretrained_scalers

True

In [None]:
score_value_f1 = m.evaluate(validation, validation_y, scoreFx=f1_score, not_scaled=True)

In [None]:
score_value_f1

0.8032802541617328

In [None]:
score_value_recall = m.evaluate(validation, validation_y, scoreFx=recall_score, not_scaled=True)
score_value_precision = m.evaluate(validation, validation_y, scoreFx=precision_score, not_scaled=True)

In [None]:
print(score_value_f1)
print(score_value_recall)
print(score_value_precision)

0.8032802541617328
0.9884572274792981
0.676538032360052


In [None]:
TRAIN_DATA = '../data/train_20231021'
VALIDATION_DATA = '../data/validation_20231021'
def dataloader_full_dataset(validation=False):
    return pd.read_parquet(VALIDATION_DATA if validation else TRAIN_DATA)

In [None]:
# pipeline base idea
# import all models and the score function
from itertools import product
from tqdm import tqdm
from score import score
#from sklearn.linear_model import LinearRegression
#from sklearn.preprocessing import PolynomialFeatures
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
# from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import f1_score, recall_score, precision_score
import pandas as pd
import gc

t1 = RandomForestClassifier(n_estimators=50, max_depth=2, random_state=0)

dd = dataloader_full_dataset(validation=False)
dd.drop(['awake', 'wakeup', 'onset'], axis=1, inplace=True)
df = dd['wearable_on']
dd['series_id'] = dd['series_id'].map(series_id_mapping['train'])
dd.drop(['wearable_on'], axis=1, inplace=True)
gc.collect()


0

In [None]:
t1.fit(dd, df)

In [None]:
from pickle import dump, load
with open(f'model_all_data_no_scaling.pkl', 'wb') as f:
    dump(t1, f)

In [None]:
validation = dataloader_full_dataset(validation=True)
validation['series_id'] = validation['series_id'].map(series_id_mapping['validation'])
validation_y = validation['wearable_on']
validation.drop(['wearable_on', 'awake', 'wakeup', 'onset'], axis=1, inplace=True)


In [None]:
f1 = t1.predict(validation)
score_value_f1 = f1_score(f1, validation_y)
score_value_recall = recall_score(f1, validation_y)
score_value_precision = precision_score(f1, validation_y)
print(score_value_f1)
print(score_value_recall)
print(score_value_precision)

0.8023151368599453
0.6705377647348899
0.9985563264533389
