# Notebook to solve the kaggle competition "Child Mind Institute - Detect Sleep States" with a neural network
Link to the competition: https://www.kaggle.com/competitions/child-mind-institute-detect-sleep-states


### Install packages

In [None]:
!pip install matplotlib plotly pandas numpy tqdm scikit-learn pyarrow

To be able to switch between different models from different libraries at a glance, we implement an interface called `IPipelineRequirements`. This makes the pipeline more robust and easier to extend.

In [None]:
from abc import ABC, abstractmethod #, classmethod
import os

class IPipelineRequirements(ABC):
    """Interface every model wrapper must implement to be used in the pipeline.

    All five methods are abstract, so a subclass can only be instantiated
    once it overrides each of them.
    """

    @abstractmethod
    def train(self):
        """Train the model; to be provided by the concrete subclass."""

    @abstractmethod
    def predict(self):
        """Produce predictions; to be provided by the concrete subclass."""

    @abstractmethod
    def save(self):
        """Persist the model; to be provided by the concrete subclass."""

    @abstractmethod
    def load(self):
        """Restore a persisted model; to be provided by the concrete subclass."""

    @abstractmethod
    def evaluate(self):
        """Score the model; to be provided by the concrete subclass."""


Our first model is the baseline model, which just takes the mean over all onset and wakeup times and tries to predict `onset` and `wakeup` events with the calculated time.

In [None]:
class BaselineModel(IPipelineRequirements):
    """Placeholder for the baseline model.

    Every interface method is overridden, so the class can be instantiated,
    but each one still raises ``NotImplementedError`` when called.
    """

    def __init__(self):
        """Create the (currently stateless) baseline model."""

    def train(self):
        """Not implemented yet; always raises ``NotImplementedError``."""
        raise NotImplementedError("Please Implement this method")

    def predict(self):
        """Not implemented yet; always raises ``NotImplementedError``."""
        raise NotImplementedError("Please Implement this method")

    def save(self):
        """Not implemented yet; always raises ``NotImplementedError``."""
        raise NotImplementedError("Please Implement this method")

    def load(self):
        """Not implemented yet; always raises ``NotImplementedError``."""
        raise NotImplementedError("Please Implement this method")

    def evaluate(self):
        """Not implemented yet; always raises ``NotImplementedError``."""
        raise NotImplementedError("Please Implement this method")


In [None]:
# Instantiation succeeds because BaselineModel overrides every abstract
# method of IPipelineRequirements (the methods themselves still raise).
g = BaselineModel()

Numerous studies have concentrated on applications using `RandomForest`. The primary motivation for this preference is the model's inherent transparency in decision-making processes, which are readily identifiable in such models. Subsequent to the BaselineModel, efforts have been made to abstract models from the Scikit-learn library. Fortunately, the majority of models within their API exhibit consistent implementation patterns, facilitating their integration into the processing pipeline.

In [None]:
from sklearn.preprocessing import StandardScaler
from pickle import dump, load

class SkLearnModel(IPipelineRequirements):
    """Wrap a scikit-learn style estimator (plus optional scaler) for the pipeline.

    The wrapper can be created in two ways:

    * ``SkLearnModel(model, identifier, scaler=...)`` wraps an estimator.
    * ``SkLearnModel(model_path)`` restores a wrapper previously written by
      :meth:`save` (file name convention ``model_<identifier>.pkl`` with an
      optional sibling ``scaler_<identifier>.pkl``).

    Attributes are documented inline in ``__init__``.
    """

    def __init__(self, model, identifier=None, scaler=StandardScaler):
        """Wrap ``model`` or, when only a path is given, load it from disk.

        Python has no constructor overloading: a second ``def __init__`` simply
        replaces the first. Both construction modes are therefore handled here.

        Args:
            model: An estimator exposing ``fit``/``predict``; or, when
                ``identifier`` is omitted, the path of a pickled model file.
            identifier: Name used in the file names written by :meth:`save`.
                Required when ``model`` is an estimator.
            scaler: A scaler instance, a scaler class/factory (it is called
                without arguments), or ``None`` to work without a scaler.

        Raises:
            TypeError: If ``identifier`` is omitted and ``model`` is not a path.
            ValueError: If loading from the given path fails.
        """
        self._model = None        # wrapped estimator
        self._scaler = None       # scaler instance, or None if unavailable
        self.identifier = identifier

        if identifier is None:
            if not isinstance(model, (str, os.PathLike)):
                raise TypeError(
                    'identifier is required unless a model file path is given.'
                )
            self.load(model)
            return

        self._model = model
        # If scaler is a class (or factory), instantiate it; otherwise use as-is.
        self._scaler = scaler() if callable(scaler) else scaler

    def train(self, X, y, not_scaled=False, sk_fit_params=None):
        """Fit the scaler (unless disabled) and then the wrapped model.

        Args:
            X: Training features, passed to the scaler and the model.
            y: Training targets, passed to the model's ``fit``.
            not_scaled: If true, ``X`` is passed to the model without scaling.
            sk_fit_params: Optional dict of extra keyword arguments for ``fit``.

        Raises:
            ValueError: If no model is set, or scaling is requested without a scaler.
        """
        if self._model is None:
            raise ValueError('Please provide or load a model first.')
        if not not_scaled:
            if self._scaler is None:
                raise ValueError('No scaler available. Provide a scaler or pass not_scaled=True.')
            # self._scaler is already an instance; it must not be called again.
            X = self._scaler.fit_transform(X)
        self._model.fit(X, y, **(sk_fit_params or {}))

    def predict(self, X, not_scaled=False):
        """Return the wrapped model's predictions for ``X``.

        Args:
            X: Features to predict on.
            not_scaled: If true, ``X`` is passed to the model without scaling.

        Raises:
            ValueError: If no model is set, or scaling is requested without a scaler.
        """
        # Compare against None explicitly: estimators may define __len__, which
        # makes plain truthiness checks unreliable.
        if self._model is None:
            raise ValueError('Please load or train a model first.')
        if not not_scaled:
            if self._scaler is None:
                raise ValueError('Please load or fit a scaler first.')
            X = self._scaler.transform(X)
        return self._model.predict(X)

    def save(self):
        """Pickle model and scaler to ``model_<identifier>.pkl`` / ``scaler_<identifier>.pkl``.

        Files are written relative to the current working directory.

        Raises:
            ValueError: If writing or pickling fails; the original error is chained.
        """
        try:
            with open(f'model_{self.identifier}.pkl', 'wb') as f:
                dump(self._model, f)
            with open(f'scaler_{self.identifier}.pkl', 'wb') as f:
                dump(self._scaler, f)
        except Exception as err:
            raise ValueError('Unable to save model and scaler.') from err

    def load(self, filepath):
        """Load a pickled model and, if present, its sibling scaler file.

        The identifier is taken from the model file name (everything after the
        first ``_`` and before the extension); the scaler is looked up as
        ``scaler_<identifier>.pkl`` in the same directory. If no scaler file
        exists the scaler is set to ``None``.

        SECURITY: unpickling executes arbitrary code. Only load files that
        come from a trusted source.

        Args:
            filepath: Path of the pickled model, named ``model_<identifier>.pkl``.

        Raises:
            ValueError: If the path is invalid, the file is missing, the name
                does not follow the convention, or unpickling fails.
        """
        try:
            filepath = os.fspath(filepath)
        except TypeError as err:
            raise ValueError('Filepath is not valid. Unable to load model and scaler.') from err

        # Parse only the file name so underscores/dots in directories are harmless.
        directory, filename = os.path.split(filepath)
        _, separator, identifier = os.path.splitext(filename)[0].partition('_')
        if not separator or not identifier:
            print('The name of the file does not implement the convention "<model|scaler>_<some identifier>".')
            raise ValueError('Filepath is not valid. Unable to load model and scaler.')

        if not os.path.isfile(filepath):
            print(f'File {filepath} not found')
            raise ValueError('Filepath is not valid. Unable to load model and scaler.')

        # os.path.join keeps relative paths relative (an empty directory must
        # not turn into the filesystem root).
        scaler_path = os.path.join(directory, f'scaler_{identifier}.pkl')

        try:
            print(f'Loading model from {filepath}')
            with open(filepath, 'rb') as f:
                model = load(f)

            scaler = None
            if os.path.isfile(scaler_path):
                print(f'Loading scaler from {scaler_path}')
                with open(scaler_path, 'rb') as f:
                    scaler = load(f)
        except Exception as err:
            print(f'Something went wrong. {err}')
            raise ValueError('Unknown Error.') from err

        # Assign only after everything loaded, so a failure leaves state untouched.
        self._model = model
        self._scaler = scaler
        self.identifier = identifier

    def evaluate(self, X, y, scoreFx=None):
        """Score the model's predictions on ``X`` against ``y``.

        Args:
            X: Features to predict on (scaled with the stored scaler).
            y: Ground-truth targets.
            scoreFx: Callable invoked as ``scoreFx(y, y_hat)``.

        Returns:
            Whatever ``scoreFx`` returns.

        Raises:
            ValueError: If no score function is given or no model is set.
        """
        if scoreFx is None:
            raise ValueError('Please provide a score function.')
        if self._model is None:
            raise ValueError('Please load or train a model first.')

        y_hat = self.predict(X)
        return scoreFx(y, y_hat)


In [None]:
from score import score
#sk_model.evaluate(X, y, score)

In [None]:
# import linear regression
from sklearn.linear_model import LinearRegression
# import svr
from sklearn.svm import SVR 
# import polynomial features
from sklearn.preprocessing import PolynomialFeatures
# import randomforest classifier
from sklearn.ensemble import RandomForestClassifier



In [None]:
#TEST

#model = RandomForestClassifier(n_estimators=100, max_depth=2, random_state=0)
#sk_model = SkLearnModel(model, 'Hansa')

In [None]:
# pipeline base idea

# 1. load data

# 2. split data



# Candidate estimator classes mapped to the hyperparameter values to try.
# Keys are classes (not instances); each value maps a constructor keyword
# to the list of candidate values for that keyword.
models_and_hyperparams = {
    RandomForestClassifier: {
        'n_estimators': [100, 200, 300],
        'max_depth': [2, 3, 4],
        'random_state': [0],
    },
    # No candidates listed: only the default construction is considered.
    LinearRegression: {},
    SVR: {
        # Each kernel appears once; repeating 'poly' would only produce
        # duplicate configurations. 'degree' is only used by the 'poly' kernel.
        'kernel': ['rbf', 'poly', 'sigmoid'],
        'degree': [3, 4, 5],
        'C': [1, 10, 100, 1000],
    },
    PolynomialFeatures: {
        'degree': [2, 3, 4],
    },
}

# 3. train models
# 4. save model
# 5. evaluate model
# 6. predict on test set
# 7. save predictions


In [None]:
# List the configured model classes. Iterating a dict yields its keys;
# `**` unpacking would fail because the keys are classes, not strings,
# and list() accepts no keyword arguments.
list(models_and_hyperparams)