In [1]:
import gc

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from IPython.display import Markdown, display




## Exploratory Data Analysis (EDA)

In [2]:
# Load the per-step series data (parquet) and the onset/wakeup event annotations (CSV).
train_series = pd.read_parquet("../data/train_series.parquet")
train_events = pd.read_csv("../data/train_events.csv")

### Train Series

In [3]:
# Preview the first rows of the series data.
train_series.head()

Unnamed: 0,series_id,step,timestamp,anglez,enmo
0,038441c925bb,0,2018-08-14T15:30:00-0400,2.6367,0.0217
1,038441c925bb,1,2018-08-14T15:30:05-0400,2.6368,0.0215
2,038441c925bb,2,2018-08-14T15:30:10-0400,2.637,0.0216
3,038441c925bb,3,2018-08-14T15:30:15-0400,2.6368,0.0213
4,038441c925bb,4,2018-08-14T15:30:20-0400,2.6368,0.0215


In [4]:
# Column dtypes, row count and memory footprint of the series data.
train_series.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 127946340 entries, 0 to 127946339
Data columns (total 5 columns):
 #   Column     Dtype  
---  ------     -----  
 0   series_id  object 
 1   step       uint32 
 2   timestamp  object 
 3   anglez     float32
 4   enmo       float32
dtypes: float32(2), object(2), uint32(1)
memory usage: 3.3+ GB


In [5]:
# Summary statistics for every column (include='all' covers the object columns too).
train_series.describe(include='all')

Unnamed: 0,series_id,step,timestamp,anglez,enmo
count,127946340,127946300.0,127946340,127946300.0,127946300.0
unique,277,,12159540,,
top,78569a801a38,,2018-12-27T10:03:30-0500,,
freq,1433880,,19,,
mean,,254804.8,,-8.810453,0.04131503
std,,177893.0,,35.52188,0.1018289
min,,0.0,,-90.0,0.0
25%,,115812.0,,-31.8589,0.0013
50%,,234519.0,,-9.5979,0.0172
75%,,357196.0,,11.3002,0.0437


In [6]:
# Flag every column that holds at least one missing value
train_series.isna().any()

series_id    False
step         False
timestamp    False
anglez       False
enmo         False
dtype: bool

### Train Events

In [7]:
# Preview the first rows of the event annotations.
train_events.head()

Unnamed: 0,series_id,night,event,step,timestamp
0,038441c925bb,1,onset,4992.0,2018-08-14T22:26:00-0400
1,038441c925bb,1,wakeup,10932.0,2018-08-15T06:41:00-0400
2,038441c925bb,2,onset,20244.0,2018-08-15T19:37:00-0400
3,038441c925bb,2,wakeup,27492.0,2018-08-16T05:41:00-0400
4,038441c925bb,3,onset,39996.0,2018-08-16T23:03:00-0400


In [8]:
# Column dtypes and non-null counts of the event annotations.
train_events.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14508 entries, 0 to 14507
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   series_id  14508 non-null  object 
 1   night      14508 non-null  int64  
 2   event      14508 non-null  object 
 3   step       9585 non-null   float64
 4   timestamp  9585 non-null   object 
dtypes: float64(1), int64(1), object(3)
memory usage: 566.8+ KB


In [9]:
# Summary statistics for every column (include='all' covers the object columns too).
train_events.describe(include='all')

Unnamed: 0,series_id,night,event,step,timestamp
count,14508,14508.0,14508,9585.0,9585
unique,277,,2,,9360
top,78569a801a38,,onset,,2017-11-27T21:37:00-0500
freq,168,,7254,,3
mean,,15.120072,,214352.123944,
std,,10.286758,,141268.408192,
min,,1.0,,936.0,
25%,,7.0,,95436.0,
50%,,14.0,,200604.0,
75%,,21.0,,317520.0,


In [10]:
# Flag every column that holds at least one missing value
train_events.isna().any()

series_id    False
night        False
event        False
step          True
timestamp     True
dtype: bool

## Pre-processing of Data

In [11]:
# Discard every event row that has a missing value and renumber the index from 0
train_events = train_events.dropna().reset_index(drop=True)
train_events.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9585 entries, 0 to 9584
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   series_id  9585 non-null   object 
 1   night      9585 non-null   int64  
 2   event      9585 non-null   object 
 3   step       9585 non-null   float64
 4   timestamp  9585 non-null   object 
dtypes: float64(1), int64(1), object(3)
memory usage: 374.5+ KB


### Invalid events

Search for "bad nights": nights whose number of recorded events is not exactly two, i.e. nights that deviate from the single onset/wakeup pattern.

In [12]:
# Count the events recorded for each (series_id, night) pair, then show the
# pairs whose count is anything other than two.
bad_nights = train_events.groupby(['series_id', 'night'])[['event']].count()
bad_nights.loc[bad_nights['event'].ne(2)]

Unnamed: 0_level_0,Unnamed: 1_level_0,event
series_id,night,Unnamed: 2_level_1
0ce74d6d2106,20,1
154fe824ed87,30,1
44a41bba1ee7,10,1
efbfc4526d58,7,1
f8a8da8bdd00,17,1


Remove the events belonging to these problematic nights.

In [13]:
# Keep only the nights that have exactly two recorded events. The filter is
# derived from the per-(series_id, night) event count rather than from a
# hand-copied list of offending pairs, so it stays correct if the events file
# changes and is safe to re-run.
events_per_night = train_events.groupby(['series_id', 'night'])['event'].transform('count')
train_events = train_events[events_per_night == 2].reset_index(drop=True)

train_events.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9580 entries, 0 to 9579
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   series_id  9580 non-null   object 
 1   night      9580 non-null   int64  
 2   event      9580 non-null   object 
 3   step       9580 non-null   float64
 4   timestamp  9580 non-null   object 
dtypes: float64(1), int64(1), object(3)
memory usage: 374.3+ KB


In [14]:
# Distinct series ids that still have events after the clean-up, in order of appearance
series_ID_list = train_events['series_id'].unique().tolist()
print(f'Total number of valid series in training dataset: {len(series_ID_list)}')

Total number of valid series in training dataset: 269


In [15]:
# Keep only the series rows whose series_id appears in the cleaned event list
train_series = (
    train_series
    .loc[train_series['series_id'].isin(series_ID_list)]
    .reset_index(drop=True)
)
# Check whether any column of the filtered frame has missing values
train_series.isna().any()

series_id    False
step         False
timestamp    False
anglez       False
enmo         False
dtype: bool

### Sample encoding and merge data 

In [16]:
# Event steps are read as floats (the column contained NaNs before cleaning);
# cast them to integers so they can be matched against the series steps.
train_events["step"] = train_events["step"].astype("int")
# Marker placed on each event row: 1 at an onset, 0 at a wakeup. After the
# backward fill below, every step leading up to an onset is labelled 1 (awake)
# and every step leading up to a wakeup is labelled 0 (asleep).
train_events["awake"] = train_events["event"].map({"onset": 1, "wakeup": 0})

# Step counters restart for every series, so the join key must include
# series_id; joining on step alone attaches other series' events to each row
# and duplicates rows.
train = pd.merge(
    train_series,
    train_events[["series_id", "step", "awake"]],
    on=["series_id", "step"],
    how="left",
)
# Back-fill within each series only, so labels never leak across the boundary
# between two consecutive series.
train["awake"] = train.groupby("series_id")["awake"].bfill()

# Steps after the last event of a series have nothing to back-fill from;
# label them as awake.
train["awake"] = train["awake"].fillna(1).astype("int")


In [17]:
# Spot-check ten random labelled rows; the fixed seed keeps the displayed
# sample the same across Restart & Run All.
train.sample(10, random_state=42)

Unnamed: 0,series_id,step,timestamp,anglez,enmo,awake
112280629,e6ddbaaf0639,279998,2017-10-13T18:08:10-0400,16.243099,0.2674,1
104337635,dc80ca623d71,495827,2019-06-18T03:53:55-0400,47.297001,0.0,0
63016473,89bd631d1769,105535,2018-01-22T19:34:35-0500,32.337502,0.01,1
113908767,ebb6fae8ed43,108590,2018-05-17T23:04:10-0400,-77.011497,0.0,1
28736224,3664fe9233f9,110424,2018-02-12T03:52:00-0500,40.6898,0.0,1
25634228,31011ade7c0a,29890,2017-08-17T10:15:50-0400,29.7535,0.0,0
34059593,44a41bba1ee7,92163,2018-02-28T02:00:15-0500,7.0698,0.0151,1
94160466,ce85771a714c,40570,2018-07-19T20:35:50-0400,35.776299,0.85,1
38376109,51b23d177971,101840,2017-09-12T09:41:40-0400,-24.7705,0.0094,0
85466694,bf00506437aa,307692,2019-06-18T08:51:00-0400,-27.0228,0.0501,0


## Visualization

In [18]:
# For every valid series, draw anglez and enmo against step, coloured by the
# awake label.
for series_ID in series_ID_list:
    # Slice the labelled frame down to the current series; the plots need the
    # "awake" column, which only exists on `train`.
    train_series_sample = train[train['series_id'] == series_ID]
    for signal in ('anglez', 'enmo'):
        display(Markdown(f'###  {signal} for series {series_ID}'))
        fig, ax = plt.subplots(figsize=(20, 3))
        sns.lineplot(data=train_series_sample, x="step", y=signal, hue="awake",
                     linewidth=0.5, ax=ax)
        plt.show()
        # Close the figure explicitly so hundreds of figures do not pile up in memory.
        plt.close(fig)
    # Release the per-series slice (not `train`, which later iterations still need).
    del train_series_sample
    gc.collect()
