# **aha-sleep-fe-cnn**
**sleep event** transforms - read event csv with converters
1. remove rows with NANs
2. transform timestamp -> uint32 seconds past 2017-01-01 (`convert_to_seconds`; a minutes variant, `convert_to_minutes`, is also defined)
3. transform event from text to enum (NAN, **ONSET**, SLEEP, **WAKEUP**, WAKE)
4. drop "step"
5. drop "night"
**series parquet** transforms - read series parquet with transforms
1. remove rows with NANs
2. transform timestamp -> seconds past 2017-01-01
3. transform anglez -> int16 anglez (fractional part truncated)
4. transform enmo -> uint16 enmo * 1000
**feature label** generation
<br>generate labels for series - X_train series, Y labels
```
for each series_id 
    set event onset time
    set event wakeup time
    for each series row
        if series time > event onset time AND series time < event wakeup time
            series row label = SLEEP
        else if series time = event wakeup time
            series row label = WAKEUP
        else if series time > event wakeup time AND series time < event onset time
            series row label = WAKE
        else if series time = event onset time
            series row label = ONSET
```

**model** 
1. train CNN using X_train, Y labels
2. optimize learning rate
3. forecast
4. evaluate


 

In [None]:
!pip install icecream

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Echo the full path of every file found under the read-only Kaggle input directory.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time
import seaborn as sns
import pyarrow.parquet as pq
import tensorflow as tf

In [None]:
# ic() is the tracing helper used throughout this notebook
from icecream import ic
# record which TensorFlow version this run is using
ic(tf.__version__)

In [None]:
####################################################################
import wandb
# Weights & Biases switch: flip to True for interactive sessions,
# keep False for submission runs so no login is attempted.
#wandb_enabled = True       # on -> interactive
wandb_enabled = False     # off -> submission

# only log in when the switch above is on
if wandb_enabled:
    wandb.login()

In [None]:
# read raw csv & print all rows
# train_events = pd.read_csv('/kaggle/input/child-mind-institute-detect-sleep-states/train_events.csv')
# pd.set_option('display.max_rows', None)
# train_events

In [None]:
####################################################################
from types import SimpleNamespace
# Shared constants for the notebook: DataFrame column names, the raw event
# strings found in the events CSV, and the integer codes used as labels.
tuner = SimpleNamespace(
    # column labels
    SERIES_ID_COLUMN = 'series_id',
    NIGHT_COLUMN = 'night',
    EVENT_COLUMN = 'event',
    STEP_COLUMN = 'step',
    TIME_COLUMN = 'timestamp',

    # placeholder value for a missing time (not referenced by the visible code)
    NAN_TIME = 0,

    # sensor columns of the series data
    ANGLEZ_COLUMN = 'anglez',
    ENMO_COLUMN = 'enmo',

    # event labels
    ONSET_EVENT_LABEL = 'onset',
    WAKEUP_EVENT_LABEL = 'wakeup',

    # event enumeration
    # ONSET/WAKEUP are produced by convert_event_enumeration (NAN for any other
    # text); SLEEP/WAKE are only assigned while labelling series rows.
    NAN_EVENT = 0,
    ONSET_EVENT = 1,
    SLEEP_EVENT = 2,
    WAKEUP_EVENT = 3,
    WAKE_EVENT = 4
)
# trace the full configuration
ic(tuner)


# converters
Converters
==========
## remove_rows_with_nan
## convert_to_minutes
## convert_event_enumeration

In [None]:
import datetime

def remove_rows_with_nan(row):
  """Pass a row through unless it has a missing value.

  Args:
    row: A Pandas Series object holding one row of the CSV file.

  Returns:
    The row itself when every field is present, otherwise None.
  """
  has_missing_value = row.isna().any()
  return None if has_missing_value else row

def convert_to_seconds(date_string):
    """Converts a date string to seconds past 2017-01-01.

    The UTC offset is parsed and then discarded, so the result counts
    local wall-clock seconds since 2017-01-01T00:00:00 (the same value the
    previous implementation produced for negative offsets).

    Args:
        date_string: A string in the format YYYY-MM-DDTHH:MM:SS±HHMM,
            e.g. '2018-08-14T22:26:00-0400'.

    Returns:
        An np.int32 number of seconds since 2017-01-01, or None when
        date_string is empty.

    Raises:
        ValueError: If a non-empty date_string does not match the format.
    """
    if len(date_string) == 0:
        return None # NAN_TIME

    # 2018-08-14T22:26:00-0400
    # %z parses the UTC offset with either sign. The earlier "-%f" pattern
    # read the offset digits as microseconds and rejected any "+HHMM" offset.
    date_time = datetime.datetime.strptime(date_string, "%Y-%m-%dT%H:%M:%S%z")
    # Drop the offset so the subtraction is between naive wall-clock times.
    local_time = date_time.replace(tzinfo=None)
    time_in_seconds = np.int32((local_time - datetime.datetime(2017, 1, 1)).total_seconds())
    return time_in_seconds

def convert_to_minutes(date_string):
    """Converts a date string to minutes past 2017-01-01.

    The UTC offset is parsed and then discarded, so the result counts whole
    local wall-clock minutes since 2017-01-01T00:00:00 (the same value the
    previous implementation produced for negative offsets).

    Args:
        date_string: A string in the format YYYY-MM-DDTHH:MM:SS±HHMM,
            e.g. '2018-08-14T22:26:00-0400'.

    Returns:
        An np.int32 number of whole minutes since 2017-01-01 (seconds are
        floored away), or None when date_string is empty.

    Raises:
        ValueError: If a non-empty date_string does not match the format.
    """
    if len(date_string) == 0:
        return None # NAN_TIME

    # 2018-08-14T22:26:00-0400
    # %z parses the UTC offset with either sign. The earlier "-%f" pattern
    # read the offset digits as microseconds and rejected any "+HHMM" offset.
    date_time = datetime.datetime.strptime(date_string, "%Y-%m-%dT%H:%M:%S%z")
    # Drop the offset so the subtraction is between naive wall-clock times.
    local_time = date_time.replace(tzinfo=None)
    time_in_minutes = np.int32((local_time - datetime.datetime(2017, 1, 1)).total_seconds() // 60)
    return time_in_minutes


def convert_event_enumeration(event_string):
    """Translate an event label into its integer code from `tuner`.

    Args:
        event_string: Raw event text read from the events file.

    Returns:
        tuner.ONSET_EVENT or tuner.WAKEUP_EVENT for the matching label,
        tuner.NAN_EVENT for any other value.
    """
    label_codes = (
        (tuner.ONSET_EVENT_LABEL, tuner.ONSET_EVENT),
        (tuner.WAKEUP_EVENT_LABEL, tuner.WAKEUP_EVENT),
    )
    for label, code in label_codes:
        if event_string == label:
            return code
    return tuner.NAN_EVENT

def convert_zangle(zangle_string):
    """Parse an anglez reading and truncate it to a 16-bit signed integer.

    Args:
        zangle_string: The reading as text or a number, e.g. '-90.636700'.

    Returns:
        The value as np.int16, with the fractional part dropped.
    """
    return np.int16(np.float32(zangle_string))

def convert_enmo(enmo_string):
    """Scale an enmo reading by 1000 and store it as an unsigned 16-bit integer.

    Args:
        enmo_string: The reading as text or a number, e.g. '0.0216'.

    Returns:
        The scaled value as np.uint16, with the fractional part dropped.
    """
    scaled_enmo = np.float32(enmo_string) * 1000
    return np.uint16(scaled_enmo)


In [None]:
# Smoke-print every converter with a typical value and an edge case.
trial_begin = '2018-08-14T22:26:00-0400'
trial_end = '2018-09-06T04:59:55-0400'

# time converters: a real timestamp, then the empty string
for to_time in (convert_to_seconds, convert_to_minutes):
    for stamp in (trial_begin, ''):
        print(to_time(stamp))

# elapsed seconds between the two sample timestamps
trial_end_seconds = convert_to_seconds(trial_end)
print(trial_end_seconds)
duration = trial_end_seconds - convert_to_seconds(trial_begin)
print(f"begin-end-trial event duration (seconds)->{duration}")

# event text -> enum code, including an unknown label
for event_text in ('onset', 'wakeup', 'dunno'):
    print(convert_event_enumeration(event_text))

# sensor value converters
for zangle_text in ('2.636700', '-90.636700'):
    print(convert_zangle(zangle_text))

print(convert_enmo('0.0216'))

In [None]:
# Per-column converters passed to pd.read_csv for the events file:
# timestamps become seconds past 2017-01-01 and event text becomes an enum code.
event_converters = {tuner.TIME_COLUMN: convert_to_seconds, tuner.EVENT_COLUMN: convert_event_enumeration}
# alternative kept for reference: minute resolution instead of seconds
#event_converters = {tuner.TIME_COLUMN: convert_to_minutes, tuner.EVENT_COLUMN: convert_event_enumeration}
# converters = {'timestamp': convert_to_minutes, 'event': convert_event_enumeration}
#converters = {'remove_rows_with_nan': {STEP_COLUMN: remove_rows_with_nan, TIME_COLUMN: remove_rows_with_nan}, TIME_COLUMN: convert_to_minutes, EVENT_COLUMN: convert_event_enumeration}


In [None]:
# Load the events file, converting timestamp and event columns while parsing.
train_events_csv = '/kaggle/input/child-mind-institute-detect-sleep-states/train_events.csv'
train_event = pd.read_csv(train_events_csv, converters=event_converters)
print(f"raw # event rows - > {len(train_event)}")

# Discard every row that has a missing value.
train_event = train_event.dropna(axis=0)
print(f"drop NAN # event rows - > {len(train_event)}")

# Renumber the rows from 0, remove the night/step columns and store the
# timestamp as an unsigned 32-bit integer.
train_event = (
    train_event
    .reset_index(drop=True)
    .drop(columns=[tuner.NIGHT_COLUMN, tuner.STEP_COLUMN])
    .astype({tuner.TIME_COLUMN: 'uint32'})
)
print(train_event.iloc[0])
print(train_event[tuner.TIME_COLUMN].dtype)

# Cap the notebook display at 64 rows, then show the frame.
pd.set_option('display.max_rows', 64)
train_event

In [None]:
# Every event's series id, the distinct ids, and the id of the first event
# (used as the working series for the rest of the notebook).
series_id_list = train_event['series_id']
series_id_unique_list = series_id_list.unique()
series_id_filter = series_id_list.loc[0]
ic(series_id_filter)
ic(len(series_id_list),len(series_id_unique_list))

In [None]:
# Keep only the events that belong to the selected series.
is_selected_series = train_event['series_id'] == series_id_filter
train_event_filter = train_event[is_selected_series]
ic(len(train_event_filter))
# Lift the display row cap so every event of the series is shown.
pd.set_option('display.max_rows', None)
train_event_filter

# Train Series
* metrics at 5 sec intervals
* ~17,280 rows per day (86,400 s / 5 s; NAN rows will be deleted)


In [None]:
parquet_train_series = '/kaggle/input/child-mind-institute-detect-sleep-states/train_series.parquet'

# Read only the rows of the selected series; the filter is applied by
# pyarrow while reading.
series_filter = [[('series_id', '=', series_id_filter)]]
train_series = pq.read_table(parquet_train_series, filters=series_filter).to_pandas()
ic(series_id_filter)
# Show at most 128 rows of the result.
pd.set_option('display.max_rows', 128)
train_series

In [None]:
# Report the row count before and after discarding rows with missing values.
print(f"raw # train_series rows - > {len(train_series)}")
train_series = train_series.dropna(axis=0)
print(f"drop NAN # train_series rows - > {len(train_series)}")

# Show the available columns, then remove the step column.
print(train_series.columns)
train_series = train_series.drop(columns=[tuner.STEP_COLUMN])
train_series

In [None]:
# train_series_x = train_series
# train_series_x[tuner.TIME_COLUMN] = train_series_x[tuner.TIME_COLUMN].apply(convert_to_seconds)
# train_series_x

In [None]:
# train_series_x is another name for train_series (not a copy), so the
# conversions below rewrite the columns of that one frame in place.
train_series_x = train_series
column_converters = (
    (tuner.TIME_COLUMN, convert_to_seconds),
    (tuner.ANGLEZ_COLUMN, convert_zangle),
    (tuner.ENMO_COLUMN, convert_enmo),
)
for column, converter in column_converters:
    train_series_x[column] = train_series[column].apply(converter)
train_series_x

In [None]:
# reset train series to transform
# train_series = train_series_x

In [None]:
# trace event & series related info
def trace_event_series_snapshot(event_inx, event_onset_time, event_wakeup_time,\
                              series_inx, series_time, series_label):
    """Trace the event cursor plus the labels of the previous and current series rows.

    Args:
        event_inx: Position of the current event.
        event_onset_time: Onset time currently tracked by the caller.
        event_wakeup_time: Wakeup time currently tracked by the caller.
        series_inx: Position of the current series row.
        series_time: Time of the current series row.
        series_label: Sequence of labels, indexed by series row position.

    Raises:
        IndexError: If series_label has no entry at series_inx (or is empty).
    """
    ic(event_inx, event_onset_time, event_wakeup_time)
    # previous row; when series_inx is 0 the index -1 reads the LAST label instead
    ic(series_inx-1, series_time, series_label[series_inx-1])
    # current row; requires that a label for series_inx was already appended
    ic(series_inx, series_time, series_label[series_inx])


In [None]:
def label_series(series_id_list,train_event,train_series_x):
    """Label series rows by walking events and series rows with two cursors.

    For each series id, rows of train_series_x are visited in order while
    event_inx points at the next event of train_event. Each row's time is
    compared with the tracked onset/wakeup times and an enum code from
    `tuner` (WAKE, SLEEP, ONSET or WAKEUP) is appended to the result.

    Args:
        series_id_list: Iterable of series ids, processed in the given order.
        train_event: Events table read with `.at[row, column]`, using the
            series id, event and timestamp columns named in `tuner`.
            Rows are addressed by integer labels starting at 0 -- assumes
            a 0..n-1 index, TODO confirm for filtered frames.
        train_series_x: Series table read with `.at[row, column]`, using the
            series id and timestamp columns; same index assumption.

    Returns:
        A list of integer labels in row order.

    NOTE(review): some paths append no label for a row (no branch matches,
    or the "past both times" branch when the previous label is neither WAKE
    nor SLEEP), so the list can be shorter than the number of rows. The
    `series_label[series_inx-1]` lookups and the trace helper appear to
    rely on exactly one label per row -- confirm.
    """
    event_inx = 0
    series_inx = 0
    event_onset_time = 0
    event_wakeup_time = 0
    # True while the time of the event at event_inx has already been loaded
    event_detected = False
    ic(event_inx, event_onset_time, event_wakeup_time)
    INTERVAL = 5 # 5 second intervals (not used below)
    series_label = []

    # series_id 038441c925bb
    # train_event_filter_limit = 38
    # train_series_x_limit = 389879

    # train_event_filter_limit is only traced; the loop bound is train_series_x_limit
    train_event_filter_limit = len(train_event)
    train_series_x_limit = len(train_series_x)
    ic(train_event_filter_limit, train_series_x_limit)

    #df.at[index, 'col_name'] = x

    ic(series_id_list)
    for series_id in series_id_list:
        ic(series_id)
        # continue while series rows remain and both the current event and
        # the current series row belong to this series id
        #while event_inx < train_event_filter_limit and \
        while series_inx < train_series_x_limit and \
            train_event.at[event_inx, tuner.SERIES_ID_COLUMN] == series_id and \
            train_series_x.at[series_inx, tuner.SERIES_ID_COLUMN] == series_id:

            # load the time of the current event once, into the onset or
            # wakeup slot depending on its type
            if not event_detected:
                if train_event.at[event_inx, tuner.EVENT_COLUMN] == tuner.ONSET_EVENT:
                    event_onset_time = train_event.at[event_inx, tuner.TIME_COLUMN]
                    event_detected = True
                elif train_event.at[event_inx, tuner.EVENT_COLUMN] == tuner.WAKEUP_EVENT:
                    event_wakeup_time = train_event.at[event_inx, tuner.TIME_COLUMN]
                    event_detected = True
                #ic(event_inx, event_onset_time, event_wakeup_time)

            series_time = train_series_x.at[series_inx, tuner.TIME_COLUMN]
            #ic(series_inx, series_time)
            # strictly between the tracked wakeup and the next onset -> awake
            if series_time > event_wakeup_time and series_time < event_onset_time:
                series_label.append(tuner.WAKE_EVENT)

            # strictly between the tracked onset and the next wakeup -> asleep
            elif series_time > event_onset_time and series_time < event_wakeup_time:
                series_label.append(tuner.SLEEP_EVENT)

            # row is later than both tracked times: the current event's exact
            # time was passed without a row matching it
            elif series_time > event_wakeup_time and series_time > event_onset_time:
                # prevent event_inx advancing past EOF
                if event_inx + 1 < len(train_event):
                    event_inx = event_inx + 1
                    # mark this row as the transition implied by the previous label
                    if series_label[series_inx-1] == tuner.WAKE_EVENT:
                        series_label.append(tuner.ONSET_EVENT)
                    elif series_label[series_inx-1] == tuner.SLEEP_EVENT:
                        series_label.append(tuner.WAKEUP_EVENT)
                # last event at EOF
                else:
                    # no further events: continue the state that follows the previous label
                    if series_label[series_inx-1] == tuner.WAKEUP_EVENT or\
                    series_label[series_inx-1] == tuner.WAKE_EVENT: 
                        series_label.append(tuner.WAKE_EVENT)
                    elif series_label[series_inx-1] == tuner.ONSET_EVENT or\
                    series_label[series_inx-1] == tuner.SLEEP_EVENT:
                        series_label.append(tuner.SLEEP_EVENT)

                # force the (possibly new) current event to be loaded on the next pass
                event_detected = False

                trace_event_series_snapshot(event_inx, event_onset_time, event_wakeup_time,\
                                           series_inx, series_time, series_label)

            # row falls exactly on the tracked onset time
            elif series_time == event_onset_time:
                series_label.append(tuner.ONSET_EVENT)
                # prevent event_inx advancing past EOF
                if event_inx + 1 < len(train_event):
                    event_inx = event_inx + 1
                event_detected = False

                trace_event_series_snapshot(event_inx, event_onset_time, event_wakeup_time,\
                                           series_inx, series_time, series_label)

            # row falls exactly on the tracked wakeup time
            elif series_time == event_wakeup_time: 
                series_label.append(tuner.WAKEUP_EVENT)
                # prevent event_inx advancing past EOF
                if event_inx + 1 < len(train_event):
                    event_inx = event_inx + 1
                event_detected = False

                trace_event_series_snapshot(event_inx, event_onset_time, event_wakeup_time,\
                                           series_inx, series_time, series_label)

            # the series cursor always advances, even if no label was appended
            series_inx = series_inx + 1
    return series_label    


In [None]:
# series time = onset or wakeup event time
# dummy_series_list = [['038441c925bb', 51031800, 2, 21],\
#                 ['038441c925bb', 51031805, 2, 21],\
#                 ['038441c925bb', 51056760, 2, 21],\
#                 ['038441c925bb', 51057000, 2, 21],\
#                 ['038441c925bb', 51067800, 2, 21],\
#                 ['038441c925bb', 51086460, 2, 21],\
#                 ['038441c925bb', 51087460, 2, 21],\
#                 ['038441c925bb', 51088460, 2, 21],\
#                 ['038441c925bb', 51133020, 2, 21],\
#                 ['038441c925bb', 51143020, 2, 21],\
#                 ['038441c925bb', 51153020, 2, 21],\
#                ]


In [None]:
# column_headers = list(train_series_x.columns.values)
# ic(column_headers)
# # series time != onset or wakeup event time
# dummy_series_list = [['038441c925bb', 51031800, 2, 21],\
#                 ['038441c925bb', 51031805, 2, 21],\
#                 ['038441c925bb', 51056761, 2, 21],\
#                 ['038441c925bb', 51057000, 2, 21],\
#                 ['038441c925bb', 51067800, 2, 21],\
#                 ['038441c925bb', 51086461, 2, 21],\
#                 ['038441c925bb', 51087460, 2, 21],\
#                 ['038441c925bb', 51088460, 2, 21],\
#                 ['038441c925bb', 51133021, 2, 21],\
#                 ['038441c925bb', 51143020, 2, 21],\
#                 ['038441c925bb', 51153020, 2, 21],\
#                ]
# dummy_series_array = np.array(dummy_series_list)
# dummy_series_df = pd.DataFrame(dummy_series_array, columns=column_headers)
# dummy_series_df[tuner.TIME_COLUMN] = dummy_series_df[tuner.TIME_COLUMN].astype('uint32')
# ic(dummy_series_df)

In [None]:
# label series
# Use the id that was used to filter train_event_filter and train_series
# rather than a hard-coded literal, so the labels always describe the data
# that was actually loaded.
series_id_list = [series_id_filter]
series_label_list = label_series(series_id_list, train_event_filter, train_series_x)
# dummy series
#series_label_list = label_series(series_id_list, train_event, dummy_series_df)
#ic(series_label_list)
# single-column DataFrame holding one label per appended entry
series_label_array = np.array(series_label_list)
series_label = pd.DataFrame(series_label_array)
#series_label


In [None]:

!pwd
# persist the labels (without the index column) in the Kaggle working directory
series_label.to_csv('/kaggle/working/series_label.csv', index=False)
!ls -l
!pwd

In [None]:
import os

def download_local_csv_file(file_name, kaggle_working_dir):
  """Copies a local CSV file into the Kaggle/working directory.

  The copy is best effort: a failure is reported on stderr and never raised.
  Paths are handed to the copy routine as-is (no shell), so names containing
  spaces or shell metacharacters are handled correctly.

  Args:
    file_name: The path of the local CSV file.
    kaggle_working_dir: The path to the Kaggle/working directory.
  """
  import shutil
  import sys

  try:
    shutil.copy(file_name, kaggle_working_dir)
  except shutil.SameFileError:
    # The file already lives in the target directory; nothing to do.
    pass
  except OSError as err:
    print(f"could not copy {file_name} to {kaggle_working_dir}: {err}", file=sys.stderr)

# Get the path to the Kaggle/working directory.
kaggle_working_dir = os.getcwd()

# Download the local CSV file to the Kaggle/working directory.
label_file_name = 'series_label.csv'
download_local_csv_file(label_file_name, kaggle_working_dir)

# Print a confirmation that names the file that was actually copied
# (the message previously referred to a non-existent 'data.csv').
print(f'File {label_file_name} downloaded successfully to {kaggle_working_dir}.')

# Python
# from kaggle.api.kaggle_api_extended import KaggleApi

# # Create a Kaggle API object.
# api = KaggleApi()

# # Get the path to the Kaggle/working directory.
# kaggle_working_dir = api.get_working_directory()

# # Upload the local CSV file to the Kaggle/working directory.
# api.upload_file(file_name, kaggle_working_dir)

# # Print a message confirming that the file was uploaded.
# print(f'File {file_name} uploaded successfully to {kaggle_working_dir}.')

In [None]:
# x_train = train_series_x.to_numpy()
# y_train = series_label.to_numpy()
# features are the transformed series rows, targets the generated labels
# (both kept as DataFrames here)
x_train = train_series_x
y_train = series_label
# sanity check: the two lengths are expected to match
ic(len(x_train), len(y_train))

In [None]:
import tensorflow as tf

# Define the CNN architecture
class CNN(tf.keras.Model):
    """1D CNN classifier: two Conv1D/MaxPooling1D stages and a dense softmax head.

    Args:
        num_classes: Number of softmax outputs. Defaults to 5 so every code
            of the notebook's event enumeration (0=NAN, 1=ONSET, 2=SLEEP,
            3=WAKEUP, 4=WAKE) is a valid target for sparse categorical
            cross-entropy; with only 4 outputs the label 4 is out of range.
    """

    def __init__(self, num_classes=5):
        super().__init__()

        # Define the convolutional layers
        self.conv1 = tf.keras.layers.Conv1D(32, 3, activation='relu')
        self.conv2 = tf.keras.layers.Conv1D(64, 3, activation='relu')

        # Define the pooling layers
        self.pool1 = tf.keras.layers.MaxPooling1D(2)
        self.pool2 = tf.keras.layers.MaxPooling1D(2)

        # Created once here rather than on every forward pass
        self.flatten = tf.keras.layers.Flatten()

        # Define the fully connected layers
        self.fc1 = tf.keras.layers.Dense(128, activation='relu')
        self.fc2 = tf.keras.layers.Dense(num_classes, activation='softmax')

    def call(self, inputs):
        """Run the forward pass.

        Args:
            inputs: Batch of sequences; Conv1D expects a 3D tensor
                (batch, steps, channels).

        Returns:
            Class probabilities with shape (batch, num_classes).
        """
        # Pass the inputs through the convolutional and pooling layers
        x = self.conv1(inputs)
        x = self.pool1(x)
        x = self.conv2(x)
        x = self.pool2(x)

        # Flatten the output of the pooling layer
        x = self.flatten(x)

        # Pass the flattened output through the fully connected layers
        x = self.fc1(x)
        x = self.fc2(x)

        return x

# Create a CNN model
model = CNN()

# Compile the model
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

# x_test / y_test were never defined, so the evaluate call raised NameError.
# Hold out the final 20% of the rows (kept in time order, no shuffling) as
# the evaluation set and train on the rest.
# NOTE(review): x_train still carries the series_id column and is 2D, while
# Conv1D expects numeric (batch, steps, channels) input -- windowing of the
# series into sequences still has to be added before this can train.
holdout_start = int(len(x_train) * 0.8)
x_fit, y_fit = x_train[:holdout_start], y_train[:holdout_start]
x_test, y_test = x_train[holdout_start:], y_train[holdout_start:]

# Train the model on the data
model.fit(x_fit, y_fit, epochs=10)

# Evaluate the model on the test data
model.evaluate(x_test, y_test)