In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input directory and print the full path of every file found.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/child-mind-institute-detect-sleep-states/train_series.parquet
/kaggle/input/child-mind-institute-detect-sleep-states/sample_submission.csv
/kaggle/input/child-mind-institute-detect-sleep-states/train_events.csv
/kaggle/input/child-mind-institute-detect-sleep-states/test_series.parquet


In [2]:
# Explained Baseline Solution 💨
# polars rather than pq
# Importing data 
import polars as pl

# Column transformations

# Parsed form of the raw 'timestamp' string column; every calendar feature below is
# derived from this one expression.
# NOTE(review): the displayed train_events dtype is datetime[μs, UTC], so year/month/day/hour
# look like UTC values rather than local wall-clock time — confirm that is intended.
_parsed_ts = pl.col('timestamp').str.to_datetime()

dt_transforms = [
    # Replaces the string column with its parsed datetime (keeps the name 'timestamp')
    _parsed_ts,
    # Year is stored as an offset from 2000 so that it fits in an unsigned 8-bit integer
    (_parsed_ts.dt.year() - 2000).cast(pl.UInt8).alias('year'),
    _parsed_ts.dt.month().cast(pl.UInt8).alias('month'),
    _parsed_ts.dt.day().cast(pl.UInt8).alias('day'),
    _parsed_ts.dt.hour().cast(pl.UInt8).alias('hour'),
]

data_transforms = [
    # anglez is cast straight to a signed 16-bit integer    #TODO *1000 -> cast
    pl.col('anglez').cast(pl.Int16),
    # enmo is scaled by 1000 first, then stored as an unsigned 16-bit integer
    (pl.col('enmo') * 1000).cast(pl.UInt16),
]

# Accelerometer series are scanned (not read) from parquet, with the datetime and
# sensor column transformations attached to the scan.
train_series = (
    pl.scan_parquet('/kaggle/input/child-mind-institute-detect-sleep-states/train_series.parquet')
    .with_columns(dt_transforms + data_transforms)
)

test_series = (
    pl.scan_parquet('/kaggle/input/child-mind-institute-detect-sleep-states/test_series.parquet')
    .with_columns(dt_transforms + data_transforms)
)

# Events come from CSV; only the datetime transformations apply, and any row that
# contains a null is discarded afterwards.
train_events = (
    pl.read_csv('/kaggle/input/child-mind-institute-detect-sleep-states/train_events.csv')
    .with_columns(dt_transforms)
    .drop_nulls()
)

# Removing null events and nights with mismatched onset/wakeup counts from train_events
night_keys = ['series_id', 'night']

# One row per (series_id, night) whose number of 'onset' events differs from its number
# of 'wakeup' events. Kept as a frame so the dropped nights can be inspected.
mismatches = train_events.drop_nulls().group_by(night_keys).agg([
    ((pl.col('event') == 'onset').sum() == (pl.col('event') == 'wakeup').sum()).alias('balanced')
    ]).sort(by=night_keys).filter(~pl.col('balanced'))

# Count onsets and wakeups per night as window expressions and keep only the rows that
# belong to balanced nights. This is one vectorised filter that preserves row order; it
# replaces a Python loop that re-filtered the whole frame once per mismatched night.
onsets_per_night = (pl.col('event') == 'onset').sum().over(night_keys)
wakeups_per_night = (pl.col('event') == 'wakeup').sum().over(night_keys)
train_events = train_events.filter(onsets_per_night == wakeups_per_night)


# Distinct series ids that still have events, in order of first appearance
series_ids = train_events.get_column('series_id').unique(maintain_order=True).to_list()

# Restrict train_series to the series that survived the event filtering
has_kept_series_id = pl.col('series_id').is_in(series_ids)
train_series = train_series.filter(has_kept_series_id)

In [None]:
# Display the training series built from scan_parquet (this cell has no recorded output).
train_series

In [3]:
# Display the events table after null rows and unbalanced nights were removed.
train_events

series_id,night,event,step,timestamp,year,month,day,hour
str,i64,str,i64,"datetime[μs, UTC]",u8,u8,u8,u8
"""038441c925bb""",1,"""onset""",4992,2018-08-15 02:26:00 UTC,18,8,15,2
"""038441c925bb""",1,"""wakeup""",10932,2018-08-15 10:41:00 UTC,18,8,15,10
"""038441c925bb""",2,"""onset""",20244,2018-08-15 23:37:00 UTC,18,8,15,23
"""038441c925bb""",2,"""wakeup""",27492,2018-08-16 09:41:00 UTC,18,8,16,9
"""038441c925bb""",3,"""onset""",39996,2018-08-17 03:03:00 UTC,18,8,17,3
"""038441c925bb""",3,"""wakeup""",44400,2018-08-17 09:10:00 UTC,18,8,17,9
"""038441c925bb""",4,"""onset""",57240,2018-08-18 03:00:00 UTC,18,8,18,3
"""038441c925bb""",4,"""wakeup""",62856,2018-08-18 10:48:00 UTC,18,8,18,10
"""038441c925bb""",6,"""onset""",91296,2018-08-20 02:18:00 UTC,18,8,20,2
"""038441c925bb""",6,"""wakeup""",97860,2018-08-20 11:25:00 UTC,18,8,20,11


In [4]:
# Display the kept series ids, in the order they first appear in train_events.
series_ids

['038441c925bb',
 '03d92c9f6f8a',
 '0402a003dae9',
 '04f547b8017d',
 '05e1944c3818',
 '062cae666e2a',
 '062dbd4c95e6',
 '08db4255286f',
 '0a96f4993bd7',
 '0cd1e3d0ed95',
 '0ce74d6d2106',
 '0cfc06c129cc',
 '0d0ad1e77851',
 '0dee4fda51c3',
 '0ec9fc461819',
 '0ef7d94fde99',
 '0f572d690310',
 '10469f6765bf',
 '1087d7b0ff2e',
 '10f8bc1f7b07',
 '12d01911d509',
 '1319a1935f48',
 '137771d19ca2',
 '137b99e936ab',
 '13b4d6a01d27',
 '148471991ffb',
 '154fe824ed87',
 '16fe2798ed0f',
 '1716cd4163b2',
 '1762ab70ec76',
 '188d4b7cd28b',
 '18a0ca03431d',
 '18b61dd5aae8',
 '1955d568d987',
 '1b92be89db4c',
 '1c7c0bad1263',
 '1d4569cbac0f',
 '1e6717d93c1d',
 '1f96b9668bdf',
 '207eded97727',
 '25e2b3dd9c3b',
 '2654a87be968',
 '27f09a6a858f',
 '280e08693c6d',
 '292a75c0b94e',
 '29c75c018220',
 '29d3469bd15d',
 '2b0a1fa8eba8',
 '2b8d87addea9',
 '2cd2340ca14d',
 '2e9ced2c7976',
 '2f7504d0f426',
 '2fbbee1a38e3',
 '31011ade7c0a',
 '3318a0e3ed6f',
 '33ceeba8918a',
 '3452b878e596',
 '349c5562ee2c',
 '35826366dfc7