In [1]:
import numpy as np
import pandas as pd
import torch
import gc
from sklearn.metrics import f1_score,accuracy_score
import matplotlib.pyplot as plt
from tqdm import tqdm
import copy
import lightgbm
from sklearn.metrics import f1_score, accuracy_score
import warnings
warnings.filterwarnings("ignore")

# Pick the torch device string: 'cuda' when a CUDA device is available, else 'cpu'.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [2]:
def data_preprocessing(df, window):
    """Add rolling std/mean features for the ``anglez`` and ``enmo`` columns.

    For each of the two signals and each statistic (std, mean) two columns
    are appended:

    * ``{signal}_bf_{window}_{stat}`` - rolling statistic over the ``window``
      rows ending at the current row.
    * ``{signal}_at_{window}_{stat}`` - the same statistic computed on the
      reversed series and re-sorted by index, i.e. over the ``window`` rows
      starting at the current row.

    Rows containing a NaN in any column are then dropped and the index is
    reset to 0..n-1.

    Args:
        df: DataFrame with ``anglez`` and ``enmo`` columns. It is modified
            in place.
        window: Rolling window length, in rows.

    Returns:
        The same ``df`` object, after the in-place modifications.
    """
    for signal in ('anglez', 'enmo'):
        for stat in ('std', 'mean'):
            # Window ending at the current row.
            trailing = getattr(df[signal].rolling(window), stat)()
            # Window starting at the current row: roll over the reversed
            # series, then restore the original row order.
            leading = getattr(df[signal].iloc[::-1].rolling(window), stat)().sort_index()
            df[f'{signal}_bf_{window}_{stat}'] = trailing
            df[f'{signal}_at_{window}_{stat}'] = leading

    # The first/last (window - 1) rows have incomplete windows and hold NaN.
    df.dropna(inplace=True)
    df.reset_index(drop=True, inplace=True)
    return df

In [3]:
# Distinct series identifiers in the training parquet, in order of first
# appearance; only the `series_id` column is requested from the file.
series_ids = pd.read_parquet(
    '/kaggle/input/zzzs-lightweight-training-dataset-target/Zzzs_train.parquet',
    columns=['series_id'],
)['series_id'].unique()

In [4]:
# Build the training feature matrix / target from the first 30 series.
train_series_ids = series_ids[:30]

# Collect one preprocessed frame per series and concatenate once at the end.
# Calling pd.concat inside the loop re-copies everything accumulated so far
# on every iteration, and seeding it with an empty object-dtype frame makes
# the result dtypes depend on how pandas treats empty entries.
train_frames = []
for _id in tqdm(train_series_ids):
    train_df_tmp = pd.read_parquet(
        "/kaggle/input/zzzs-lightweight-training-dataset-target/Zzzs_train.parquet",
        filters=[('series_id', '=', _id)],
        columns=['anglez', 'enmo', 'awake'],
    )
    # Fixed affine scaling of the raw signals.
    # NOTE(review): constants look like dataset-level mean/std - confirm their origin.
    train_df_tmp.anglez = (train_df_tmp.anglez + 8.8104) / 35.5218
    train_df_tmp.enmo = (train_df_tmp.enmo - 0.0413) / 0.1018
    train_frames.append(data_preprocessing(train_df_tmp, 30))

train_all = pd.concat(train_frames, ignore_index=True)
# Column order of train_x: anglez, enmo, then the eight rolling features.
train_x = train_all.drop(columns='awake')
train_y = train_all[['awake']].astype(int)
del train_frames, train_all

100%|██████████| 30/30 [00:35<00:00,  1.19s/it]


In [5]:
# Build the validation feature matrix / target from the remaining series.
# NOTE(review): training uses series_ids[:30] (indices 0-29) while this slice
# starts at 31, so the series at index 30 is used by neither split - confirm
# whether that is intentional or should be series_ids[30:].
valid_series_ids = series_ids[31:]

# Collect one preprocessed frame per series and concatenate once at the end.
# Calling pd.concat inside the loop re-copies everything accumulated so far
# on every iteration, and seeding it with an empty object-dtype frame makes
# the result dtypes depend on how pandas treats empty entries.
valid_frames = []
for _id in tqdm(valid_series_ids):
    valid_df_tmp = pd.read_parquet(
        "/kaggle/input/zzzs-lightweight-training-dataset-target/Zzzs_train.parquet",
        filters=[('series_id', '=', _id)],
        columns=['anglez', 'enmo', 'awake'],
    )
    # Same fixed affine scaling as applied to the training series.
    valid_df_tmp.anglez = (valid_df_tmp.anglez + 8.8104) / 35.5218
    valid_df_tmp.enmo = (valid_df_tmp.enmo - 0.0413) / 0.1018
    valid_frames.append(data_preprocessing(valid_df_tmp, 30))

valid_all = pd.concat(valid_frames, ignore_index=True)
# Column order of valid_x: anglez, enmo, then the eight rolling features.
valid_x = valid_all.drop(columns='awake')
valid_y = valid_all[['awake']].astype(int)
del valid_frames, valid_all

100%|██████████| 4/4 [00:03<00:00,  1.15it/s]


In [6]:
# LightGBM classifier for the integer `awake` target, evaluated on the
# validation split with AUC (the default binary log-loss is reported too).
model = lightgbm.LGBMClassifier(random_state=42, max_depth=5, learning_rate=0.05, n_estimators=100)
model.fit(
    train_x,
    train_y['awake'],                        # 1-D target instead of a one-column frame
    eval_set=[(valid_x, valid_y['awake'])],  # canonical list-of-tuples form
    eval_metric='auc',
    # The `verbose=` argument of fit() was deprecated in LightGBM 3.3 and
    # removed in 4.0; the log_evaluation callback prints every 2nd iteration.
    callbacks=[lightgbm.log_evaluation(period=2)],
)

[2]	valid_0's auc: 0.978607	valid_0's binary_logloss: 0.582542
[4]	valid_0's auc: 0.980683	valid_0's binary_logloss: 0.521073
[6]	valid_0's auc: 0.982441	valid_0's binary_logloss: 0.470365
[8]	valid_0's auc: 0.982683	valid_0's binary_logloss: 0.427902
[10]	valid_0's auc: 0.982932	valid_0's binary_logloss: 0.391908
[12]	valid_0's auc: 0.98301	valid_0's binary_logloss: 0.361129
[14]	valid_0's auc: 0.983175	valid_0's binary_logloss: 0.334347
[16]	valid_0's auc: 0.983327	valid_0's binary_logloss: 0.311199
[18]	valid_0's auc: 0.983404	valid_0's binary_logloss: 0.291113
[20]	valid_0's auc: 0.983459	valid_0's binary_logloss: 0.273457
[22]	valid_0's auc: 0.983591	valid_0's binary_logloss: 0.257984
[24]	valid_0's auc: 0.983647	valid_0's binary_logloss: 0.244588
[26]	valid_0's auc: 0.983858	valid_0's binary_logloss: 0.23255
[28]	valid_0's auc: 0.983934	valid_0's binary_logloss: 0.221933
[30]	valid_0's auc: 0.984073	valid_0's binary_logloss: 0.212395
[32]	valid_0's auc: 0.984116	valid_0's binary_

In [7]:
# preds = model.predict(valid_x)  # scored against valid_y in the next cell

In [8]:
# print('f1_score : ',f1_score(valid_y.values, preds))
# print('accuracy :' , accuracy_score(valid_y.values, preds))

In [9]:
# Hours of day at which a detected event is considered not normal:
# onset  : 05 ~ 18 not_normal
# wakeup : 00 ~ 03, 12 ~ 23 not_normal

In [10]:
def get_events(_id, model, file_root=None, feature_columns=None):
    """Predict onset / wakeup events for a single series.

    The series is read from ``file_root``, scaled and featurised the same way
    as the training data, classified row by row, and the smoothed prediction
    is turned into events wherever it changes value.

    Args:
        _id: Value of ``series_id`` to load.
        model: Fitted classifier exposing ``predict`` and ``predict_proba``.
        file_root: Path of the parquet file holding the series. Required.
        feature_columns: Columns passed to the model. Defaults to the columns
            of the notebook-level ``train_x`` frame.

    Returns:
        DataFrame sorted by ``timestamp`` that holds, per calendar date, the
        first ``'wakeup'`` row and the last ``'onset'`` row, including the
        ``event`` and ``score`` columns.

    Raises:
        ValueError: If ``file_root`` is None.
    """
    if file_root is None:
        # Previously None was formatted into the literal path "None".
        raise ValueError("file_root must be the path of the parquet file to read")
    if feature_columns is None:
        feature_columns = train_x.columns

    test = pd.read_parquet(f"{file_root}", filters=[('series_id','=',_id)])
    # Drop timezone information element-wise, keeping the local wall-clock time.
    test['timestamp'] = pd.to_datetime(test['timestamp']).apply(lambda t: t.tz_localize(None))
    test['hour'] = test['timestamp'].dt.hour
    # Same fixed affine scaling as applied to the training series.
    test.anglez = (test.anglez + 8.8104) / 35.5218
    test.enmo = (test.enmo - 0.0413) / 0.1018
    # data_preprocessing already drops NaN rows and resets the index, so no
    # further dropna / reset_index is needed here.
    test = data_preprocessing(test, 30)

    test_x = test[feature_columns]
    preds, probs = model.predict(test_x), model.predict_proba(test_x)[:, 1]
    test['prediction'] = preds
    # Smooth the predicted label with a centred 361-row rolling median; rows
    # without a full window become NaN.
    test['prediction'] = test['prediction'].rolling(360+1, center=True).median()
    test['probability'] = probs
    # .copy() so the assignments below write to an independent frame rather
    # than to a filtered slice (avoids chained-assignment ambiguity).
    test = test[test['prediction']!=2].copy()

    # Where the smoothed label is 0, use the probability of class 0 instead.
    pred_is_zero = test['prediction']==0
    test.loc[pred_is_zero, 'probability'] = 1-test.loc[pred_is_zero, 'probability']
    test['score'] = test['probability'].rolling(60*12*5, center=True, min_periods=10).mean().bfill().ffill()

    # A +1 step in the smoothed label marks a wakeup, a -1 step an onset.
    test['pred_diff'] = test['prediction'].diff()
    test['event'] = test['pred_diff'].replace({1:'wakeup', -1:'onset', 0:np.nan})

    # Keep one event of each kind per calendar date.
    test_wakeup = test[test['event']=='wakeup'].groupby(test['timestamp'].dt.date).agg('first')
    test_onset = test[test['event']=='onset'].groupby(test['timestamp'].dt.date).agg('last')
    test = pd.concat([test_wakeup, test_onset], ignore_index=True).sort_values('timestamp')
    return test

In [11]:
# Detect events for every series in the test parquet and write submission.csv.
file_root = '/kaggle/input/child-mind-institute-detect-sleep-states/test_series.parquet'
submit_columns = ['series_id','step','event','score']

# Distinct series identifiers present in the test file.
series_id = pd.read_parquet(file_root, columns=['series_id'])['series_id'].unique()

# One frame of events per series, stacked; the fresh 0..n-1 index becomes `row_id`.
submission = pd.concat(
    [get_events(_id, model, file_root)[submit_columns] for _id in series_id],
    ignore_index=True,
).reset_index(names='row_id')

submission.to_csv('submission.csv', index=False)
submission

Unnamed: 0,row_id,series_id,step,event,score


In [12]:
# submit_columns = ['series_id','step','event','score']
# valid_sub = []
# valid_series_ids = series_ids[31:]
# for _id in tqdm(valid_series_ids):
#     valid_tmp = get_events(_id, model,file_root = '/kaggle/input/zzzs-lightweight-training-dataset-target/Zzzs_train.parquet')
#     valid_tmp = valid_tmp[submit_columns]
#     valid_sub.append(valid_tmp)
# valid_sub = pd.concat(valid_sub, ignore_index=True).reset_index(names='row_id')
# valid_sub

In [13]:
# uqid = submission.series_id.unique()

In [14]:
# submission[submission['series_id'] == uqid[1]]

In [15]:
# submission_exception[submission_exception['series_id'] == uqid[1]]