In [1]:
import numpy as np
import pandas as pd
import torch
import gc
from sklearn.metrics import f1_score,accuracy_score
import matplotlib.pyplot as plt
from tqdm import tqdm
import copy
import lightgbm
from sklearn.metrics import f1_score, accuracy_score
import warnings
# Suppress every warning for the rest of the session (this also hides
# deprecation notices raised by pandas / scikit-learn / LightGBM).
warnings.filterwarnings("ignore")

# Torch device string: CUDA when a GPU is visible, CPU otherwise.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [2]:
def data_preprocessing(df, window):
    """Add backward- and forward-looking rolling statistics for ``anglez`` and ``enmo``.

    For each signal column, four feature columns are written onto ``df`` in
    this order: std-before, std-after, mean-before, mean-after.

    * ``<signal>_bf_<window>_<stat>``: rolling statistic over the current row
      and the rows before it.
    * ``<signal>_at_<window>_<stat>``: the same statistic computed on the
      reversed series and re-sorted by index, i.e. over the current row and
      the rows after it.

    Every feature is rounded to 4 decimals. Afterwards, rows holding a NaN in
    any column are dropped and the index is renumbered from 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``anglez`` and ``enmo`` columns. It is modified in place.
    window : int
        Rolling window length, in rows.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the feature columns added.
    """
    # At least two observations are required before a statistic is emitted.
    rolling_kwargs = dict(window=window, min_periods=2)

    for signal in ('anglez', 'enmo'):
        for stat in ('std', 'mean'):
            # Backward-looking window.
            past = df[signal].rolling(**rolling_kwargs)
            df[f'{signal}_bf_{window}_{stat}'] = getattr(past, stat)().round(4)

            # Forward-looking window: roll over the reversed series, then
            # restore the original row order.
            future = df[signal].iloc[::-1].rolling(**rolling_kwargs)
            df[f'{signal}_at_{window}_{stat}'] = getattr(future, stat)().round(4).sort_index()

    df.dropna(inplace=True)
    df.reset_index(drop=True, inplace=True)
    return df

In [3]:
# Distinct series identifiers in the training parquet; only the
# `series_id` column is read.
series_ids = pd.read_parquet(
    '/kaggle/input/zzzs-lightweight-training-dataset-target/Zzzs_train.parquet',
    columns=['series_id'],
)['series_id'].unique()

In [4]:
# Every available series is used for training.
train_series_ids = series_ids
window_list = [60, 360, 720] # 5m, 30m, 1h
train_list = []
for _id in tqdm(train_series_ids):
    # Read one series at a time, keeping only the two signals and the label.
    train_df_tmp = pd.read_parquet(
        "/kaggle/input/zzzs-lightweight-training-dataset-target/Zzzs_train.parquet",
        columns=['anglez', 'enmo', 'awake'],
        filters=[('series_id', '=', _id)],
    )
    # Fixed offset/scale normalisation; get_events applies the same constants
    # at inference time, so the two must stay in sync.
    train_df_tmp = train_df_tmp.assign(
        anglez=(train_df_tmp['anglez'] + 8.8104) / 35.5218,
        enmo=(train_df_tmp['enmo'] - 0.0413) / 0.1018,
    )
    # One set of rolling features per window length.
    for window in window_list:
        train_df_tmp = data_preprocessing(train_df_tmp, window)
    train_list.append(train_df_tmp)

# Stack all series, then split the features from the `awake` label.
train = pd.concat(train_list, ignore_index=True)
train_y = train[['awake']]
train_x = train.drop(columns='awake')
del train_list, train

100%|██████████| 35/35 [00:46<00:00,  1.33s/it]


In [5]:
# valid_series_ids = series_ids[31:]
# valid_list = []
# for _id in tqdm(valid_series_ids):
#     valid_df_tmp = pd.read_parquet("/kaggle/input/zzzs-lightweight-training-dataset-target/Zzzs_train.parquet", filters=[('series_id','=',_id)], columns = ['anglez', 'enmo', 'awake'])
#     valid_df_tmp.anglez = (valid_df_tmp.anglez + 8.8104) / 35.5218
#     valid_df_tmp.enmo = (valid_df_tmp.enmo - 0.0413) / 0.1018
#     for window in window_list:
#         valid_df_tmp = data_preprocessing(valid_df_tmp, window)
#     valid_list.append(valid_df_tmp)

# valid = pd.concat(valid_list, ignore_index=True)
# valid_x = valid.drop('awake',axis = 1)
# valid_y = valid[['awake']]
# del valid, valid_list

In [6]:
# Shallow gradient-boosted classifier trained on the rolling-window features.
model = lightgbm.LGBMClassifier(random_state=42, max_depth=5,learning_rate=0.05, n_estimators=60)
# `fit` no longer accepts a `verbose` argument in LightGBM >= 4.0 (it raises
# TypeError there), so it is not passed here.
# The label is flattened from an (n, 1) DataFrame to the 1-D vector that the
# scikit-learn API expects.
# To monitor a validation split, add: eval_set=[(valid_x, valid_y)], eval_metric='auc'
model.fit(train_x, train_y.values.ravel())

In [7]:
# preds = model.predict(valid_x)

In [8]:
# print('f1_score : ',f1_score(valid_y.values, preds))
# print('accuracy :' , accuracy_score(valid_y.values, preds))

In [9]:
# onset : 19 ~ 03 -> 04 ~ 18 not_normal
# wakeup : 04 ~ 11 -> 00 ~ 03, 12 ~ 23 not_normal
# hours 00 ~ 06 -> counted as belonging to the previous date

In [10]:
def get_events(_id, model,file_root = None):
    """Predict sleep ``onset`` / ``wakeup`` events for a single series.

    The rows of series ``_id`` are read from the parquet file at ``file_root``,
    normalised and featurised the same way as the training data, classified
    with ``model``, smoothed, and reduced to at most one ``wakeup`` and one
    ``onset`` row per day.

    Relies on module-level names defined in other cells: ``window_list``,
    ``data_preprocessing`` and ``train_x`` (for the feature column order).

    Parameters
    ----------
    _id : value compared against the ``series_id`` column
        Series to process.
    model : fitted classifier exposing ``predict`` and ``predict_proba``
        Column 1 of ``predict_proba`` is used as the probability.
    file_root : str
        Path of the parquet file. The default ``None`` is formatted into the
        literal path ``"None"``, so a real path must always be supplied.

    Returns
    -------
    pandas.DataFrame
        The selected ``wakeup`` and ``onset`` rows, sorted by ``timestamp``,
        with ``event`` and ``score`` columns among others. It can be empty
        when no transition is detected.
    """
    test = pd.read_parquet(f"{file_root}", filters=[('series_id','=',_id)])
    # Parse the timestamps and drop the timezone element by element, keeping
    # the local wall-clock time that the hour-based rules below rely on.
    test['timestamp'] = pd.to_datetime(test['timestamp']).apply(lambda t: t.tz_localize(None))
    test['date'] = test['timestamp'].dt.date
    test['hour'] = test['timestamp'].dt.hour
    # Same normalisation constants as the ones applied to the training data.
    test.anglez = (test.anglez + 8.8104) / 35.5218
    test.enmo = (test.enmo - 0.0413) / 0.1018
    for window in window_list:
        test = data_preprocessing(test,window)
    # data_preprocessing already drops NaN rows and resets the index; repeating
    # it here only matters if window_list is empty.
    test.dropna(inplace=True)
    test.reset_index(drop=True,inplace=True)
    # Select the features in exactly the column order used for training.
    test_x = test[train_x.columns]
    preds, probs = model.predict(test_x), model.predict_proba(test_x)[:, 1]
    test['prediction'] = preds
    # Smooth the label sequence with a centred 360-row rolling median. With no
    # min_periods given, the rows near both ends of the series become NaN.
    test['prediction'] = test['prediction'].rolling(360, center=True).median()
    test['probability'] = probs
    # NOTE(review): this keeps every row unless the smoothed prediction is
    # exactly 2; presumably 2 is an "unlabelled" class from another variant of
    # the training data. Confirm it is still needed.
    test = test[test['prediction']!=2]
    # Turn the class-1 probability into a confidence in the predicted class.
    test.loc[test['prediction']==0, 'probability'] = 1-test.loc[test['prediction']==0, 'probability']
    # Score = centred rolling mean of that confidence over 60*12*5 = 3600 rows,
    # with gaps at the edges filled from the nearest valid value.
    test['score'] = test['probability'].rolling(60*12*5, center=True, min_periods=10).mean().bfill().ffill()
    # A +1 step in the smoothed prediction is labelled wakeup and a -1 step
    # onset; unchanged rows get NaN.
    # NOTE(review): a median over an even window may be 0.5, producing +/-0.5
    # steps that are not mapped to any event. Confirm this is acceptable.
    test['pred_diff'] = test['prediction'].diff()
    test['event'] = test['pred_diff'].replace({1:'wakeup', -1:'onset', 0:np.nan})
    # Rows in hours 0-6 are assigned to the previous calendar date, so an
    # onset after midnight is grouped with the night that started the day before.
    test.loc[test.hour.isin([0,1,2,3,4,5,6]),'date'] = test.loc[test.hour.isin([0,1,2,3,4,5,6]),'date'] + pd.Timedelta(days=-1)
    # Wakeup: first wakeup of each calendar date (unshifted timestamp date).
    test_wakeup = test[test['event']=='wakeup'].groupby(test['timestamp'].dt.date).agg('first')
    # Onset: last onset of each shifted "night" date.
    test_onset = test[test['event']=='onset'].groupby(test['date']).agg('last')
    test = pd.concat([test_wakeup, test_onset], ignore_index=True).sort_values('timestamp')
    return test

In [11]:
# Test data location and the distinct series it contains.
file_root = '/kaggle/input/child-mind-institute-detect-sleep-states/test_series.parquet'
series_id = pd.read_parquet(file_root, columns=['series_id'])['series_id'].unique()

# Columns kept for the submission; `row_id` is added from the final row position.
submit_columns = ['series_id','step','event','score']

# Detect events per series, keep the submission columns, and stack the results.
submission = pd.concat(
    [get_events(_id, model, file_root)[submit_columns] for _id in series_id],
    ignore_index=True,
).reset_index(names='row_id')
submission.to_csv('submission.csv', index=False)
submission

Unnamed: 0,row_id,series_id,step,event,score


In [12]:
# submit_columns = ['series_id','step','event','score','timestamp']
# valid_sub = []
# valid_series_ids = series_ids[31:]
# for _id in tqdm(valid_series_ids):
#     valid_tmp = get_events(_id, model,file_root = '/kaggle/input/zzzs-lightweight-training-dataset-target/Zzzs_train.parquet')
#     valid_tmp = valid_tmp[submit_columns]
#     valid_sub.append(valid_tmp)
# valid_sub = pd.concat(valid_sub, ignore_index=True).reset_index(names='row_id')
# valid_sub