In [1]:
import pandas as pd
import pyarrow.parquet as pq
import pyarrow.dataset as ds
import datetime
import numpy as np
import matplotlib.pyplot as plt

## sample data to test

In [2]:
file_path = '../data/train_20231021'
DEBUG = False
series_list = ds.dataset(file_path).to_table(columns=['series_id']).to_pandas().series_id.unique().tolist()

In [3]:
sample = pq.read_table(file_path, filters=[('series_id', 'in', series_list[1:10])]).to_pandas()
sample.head()

Unnamed: 0,step,anglez,enmo,hour,minute,seconds,day,month,year,seconds_from_midnight,awake,wearable_on,series_id,onset,wakeup
26181720,0,-82.680603,0.0,16,45,0,15,8,2017,60300,1.0,0.0,31011ade7c0a,0,0
26181721,1,-82.680603,0.0,16,45,5,15,8,2017,60305,1.0,0.0,31011ade7c0a,0,0
26181722,2,-82.680603,0.0,16,45,10,15,8,2017,60310,1.0,0.0,31011ade7c0a,0,0
26181723,3,-82.680603,0.0,16,45,15,15,8,2017,60315,1.0,0.0,31011ade7c0a,0,0
26181724,4,-82.680603,0.0,16,45,20,15,8,2017,60320,1.0,0.0,31011ade7c0a,0,0


In [4]:
sample.series_id.unique()

array(['31011ade7c0a', '3318a0e3ed6f', '33ceeba8918a', '3452b878e596',
       '349c5562ee2c', '35826366dfc7', '361366da569e', '3664fe9233f9',
       '3665c86afaf5'], dtype=object)

## drop rows with wearable not worn to simulate the data from model

In [5]:
sample.series_id.unique()


array(['31011ade7c0a', '3318a0e3ed6f', '33ceeba8918a', '3452b878e596',
       '349c5562ee2c', '35826366dfc7', '361366da569e', '3664fe9233f9',
       '3665c86afaf5'], dtype=object)

In [6]:
# date will be generated by fx
#sample['timestamp'] = pd.to_datetime(sample[['year', 'month', 'day', 'hour', 'minute', 'seconds']])
#sample.drop(['year', 'month', 'day', 'hour', 'minute', 'seconds'], axis=1, inplace=True)

In [7]:
sample = sample[sample['wearable_on'] != 0]

In [8]:
# mask = (df2['awake_changes'] == 1)

# before = mask.shift(1, fill_value=False)
# after = mask.shift(-1, fill_value=False)

# combined_mask = mask | before | after

# df2[combined_mask]

## heuristic function

In [9]:
def delete_small_breaks(df):
    series = df.series_id.unique().tolist()
    for serie in series:
        #onset_events = df[(df['series_id'] == serie) & (df['onset'] == 1)]
        wakeup_events = df[(df['series_id'] == serie) & (df['wakeup'] == 1)]
        
        for index, wakeup_row in wakeup_events.iterrows():
            next_onset_rows = df.iloc[index:].loc[df['onset'] == 1]
            if not next_onset_rows.empty:
                next_onset_index = next_onset_rows.iloc[0].name
                time_diff = (df.loc[next_onset_index]['timestamp'] - df.loc[index]['timestamp']).total_seconds() / 60
                if time_diff < 30:
                    df.loc[index, 'wakeup'] = 0
                    df.loc[next_onset_index, 'onset'] = 0
            else:
                if DEBUG: print("No row found with 'onset' == 1 after index", index)

In [10]:
def delete_too_small_periods(df):
    series = df.series_id.unique().tolist()
    for serie in series:
        onset_events = df[(df['series_id'] == serie) & (df['onset'] == 1)]
        #wakeup_events = df[(df['series_id'] == serie) & (df['wakeup'] == 1)]   
        
        for index, onset_row in onset_events.iterrows():
            #print(df.loc[index].name)
            next_wakeup_rows = df.iloc[index:].loc[df['wakeup'] == 1]
            if not next_wakeup_rows.empty:
                next_wakeup_index = next_wakeup_rows.iloc[0].name
                #print(next_onset_index)
                time_diff = (df.loc[next_wakeup_index]['timestamp'] - df.loc[index]['timestamp']).total_seconds() / 60
                if time_diff < 30:
                    df.loc[index, 'onset'] = 0
                    df.loc[next_wakeup_index, 'wakeup'] = 0
            else:
                if DEBUG: print("No row found with 'onset' == 1 after index", index)

In [11]:
def add_column_night(df):
    series = df.series_id.unique().tolist()
    for serie in series:
        counter = 1
        df.loc[df['series_id'] == serie, 'night'] = 1
        next_index = 0
        
        # Check if there are any occurrences for series_id and hour is 15
        while (df[(df['series_id'] == serie) & (df.index > next_index + 60)]['timestamp'].dt.hour == 15).any():
            counter += 1
            next_index = df.loc[(df['series_id'] == serie) & (df.index > next_index + 60)].loc[df['timestamp'].dt.hour == 15].index[0]
            df.loc[(df['series_id'] == serie) & (df.index >= next_index), 'night'] = counter
        
        if DEBUG: print("No occurrences found that meet the conditions.")

In [12]:
def one_sleep_widow(df):
    series = df.series_id.unique().tolist()
    df['event'] = np.NAN
    for serie in series:
        nights = df[df['series_id'] == serie]['night'].unique().tolist()
        for night in nights:
            max_window_duration = pd.Timedelta(0)
            current_window_start = None
            current_window_end = None
            onset_index = None
            wakeup_index = None
            
            for index, row in df[(df['series_id'] == serie) & (df['night'] == night)].iterrows():
                if row['onset'] == 1:
                    # Start of a potential sleeping window
                    current_window_start = row['timestamp']
                    onset_index = index
                elif row['wakeup'] == 1 and current_window_start is not None:
                    # End of a potential sleeping window
                    current_window_end = row['timestamp']
                    wakeup_index = index
                    window_duration = current_window_end - current_window_start
                    if window_duration > max_window_duration:
                        max_window_duration = window_duration
                        max_onset_index = onset_index
                        max_wakeup_index = wakeup_index

            if DEBUG: print(f"The longest sleeping window duration is: {max_window_duration}")
            if DEBUG: print(f"Starts at index: {max_onset_index}, Ends at index: {max_wakeup_index}")
            # Check if the longest window duration is longer than 30 minutes
            if max_window_duration > pd.Timedelta(minutes=30):
                # Assign 'onset' and 'wakeup' events based on indexes
                df.loc[max_onset_index, 'event'] = 'onset'
                df.loc[max_wakeup_index, 'event'] = 'wakeup'

In [13]:
def calculate_score(row):
    if row['event'] == 'onset' or row['event'] == 'wakeup':
        return 1
    else:
        return None

In [14]:
def add_score(df):
    
    df['score'] = df.apply(calculate_score, axis=1)
    
    for index, row in df.iterrows():
        if row['score'] == 1:
            score_value = 1.0
            event_value = row['event']
            for i in range(index, -1, -1):
                df.at[i, 'score'] = score_value
                df.at[i, 'event'] = event_value
                score_value -= 0.1
                if score_value < 0:
                    break
            
    for index, row in df.iterrows():
        if row['score'] == 1:
            score_value = 1.0
            event_value = row['event']
            for i in range(index, len(df)):
                df.at[i, 'score'] = score_value
                df.at[i, 'event'] = event_value
                score_value -= 0.1
                if score_value < 0:
                    break
                
    df['score'] = df['score'].map('{:.1f}'.format)

In [15]:
def heuristic_function(df):
    #df.rename(columns={'pred_awake': 'awake'}, inplace=True)
    # step1: restore timestamp information
    #restore_timestamp(df)
    df['timestamp'] = pd.to_datetime(df[['year', 'month', 'day', 'hour', 'minute', 'seconds']])
    #df.drop(['year', 'month', 'day', 'hour', 'minute', 'seconds'], axis=1, inplace=True)

    # step 2: only keep necessary columns
    # IF MEMORY ISSUE DO A INPLACE DROP WITH SET(KEEP) ^ SET(DF.COLUMNS)
    df = df[['step', 'awake', 'series_id', 'timestamp']]

    # step 2.1: fill missing rows
    #processed = missing_rows(df)

    # step 3: binning the data
    #result = binning(processed)
    df.set_index('timestamp', inplace=True)
    df = df.groupby('series_id').resample('1T').agg({'step': 'first', 'awake': 'mean'}).reset_index() # we use mean and round to get whether the person is awake or not
    df = df[df['awake'].notna()] # drop rows where awake is na, meaning that there is no data for this minute
    df.loc[:, 'awake'] = df['awake'].round().astype(int)


    # step 4: find changes
    df['awake_changes'] = df['awake'].diff().ne(0).astype(int)

    # # step 5: find onset and wakeup
    #add_onset_add_wakeup(df)
    onset_mask = (df['awake_changes'] == 1) & (df['awake'] == 0)
    wakeup_mask = (df['awake_changes'] == 1) & (df['awake'] == 1)
    df['onset'] = np.where(onset_mask, 1, 0)
    df['wakeup'] = np.where(wakeup_mask, 1, 0)

    # step 6: if break between two sleeping windows is smaller than 30 min make one window out of it
    delete_small_breaks(df)

    # step 7: if sleeping period is smaller than 30 min delete it
    delete_too_small_periods(df)

    # step 8: get a column for night
    add_column_night(df)

    # step 9: only keep 1 sleeping window per night
    one_sleep_widow(df)

    # step 10: add score around the onset and wakeup
    add_score(df)

    # step 11: delete not necessary rows and columns
    df.drop(['timestamp', 'awake', 'awake_changes', 'wakeup', 'onset', 'night'], axis = 1, inplace=True)
    df =df.dropna(subset=['event'])

    # step 12: reset index
    df.reset_index(drop=True, inplace=True)
    df['row_id'] = df.reset_index().index

    return df

In [16]:
raise ValueError('stop here')

ValueError: stop here

## test the sample data

In [17]:
versuch = heuristic_function(sample)

  df.loc[max_onset_index, 'event'] = 'onset'


In [18]:
versuch.head(20)

Unnamed: 0,series_id,step,event,score,row_id
0,31011ade7c0a,3552.0,onset,1.0,0
1,31011ade7c0a,3564.0,onset,0.9,1
2,31011ade7c0a,3576.0,onset,0.8,2
3,31011ade7c0a,3588.0,onset,0.7,3
4,31011ade7c0a,3600.0,onset,0.6,4
5,31011ade7c0a,3612.0,onset,0.5,5
6,31011ade7c0a,3624.0,onset,0.4,6
7,31011ade7c0a,3636.0,onset,0.3,7
8,31011ade7c0a,3648.0,onset,0.2,8
9,31011ade7c0a,3660.0,onset,0.1,9
