In [347]:
import pandas as pd
import pyarrow.parquet as pq
import datetime
import numpy as np
import matplotlib.pyplot as plt

## sample data to test

In [368]:
file_path = '/Users/ra/Library/CloudStorage/OneDrive-HochschuleLuzern/AICH/data/Prepared/V2/train_20231021'

table = pq.read_table(file_path)

df = table.to_pandas()

series_list =df.series_id.unique().tolist()


Unnamed: 0,step,anglez,enmo,hour,minute,seconds,day,month,year,seconds_from_midnight,awake,wearable_on,series_id,onset,wakeup
26181720,0,-82.680603,0.0,16,45,0,15,8,2017,60300,1.0,0.0,31011ade7c0a,0,0
26181721,1,-82.680603,0.0,16,45,5,15,8,2017,60305,1.0,0.0,31011ade7c0a,0,0
26181722,2,-82.680603,0.0,16,45,10,15,8,2017,60310,1.0,0.0,31011ade7c0a,0,0
26181723,3,-82.680603,0.0,16,45,15,15,8,2017,60315,1.0,0.0,31011ade7c0a,0,0
26181724,4,-82.680603,0.0,16,45,20,15,8,2017,60320,1.0,0.0,31011ade7c0a,0,0


In [None]:

sample = df[df['series_id'].isin(series_list[1:3])]
sample.head()

## drop rows with wearable not worn to simulate the data from model

In [369]:
sample.series_id.unique()


array(['31011ade7c0a', '3318a0e3ed6f'], dtype=object)

In [370]:
sample = sample.copy()  # Create a copy of the DataFrame
sample.drop(sample[sample['wearable_on'] == 0].index, inplace=True)

## heuristic function

In [351]:
def restore_timestamp(df):
    df['hour'] = df['hour'].apply(lambda x: f'{x:02d}')
    df['minute'] = df['minute'].apply(lambda x: f'{x:02d}')
    df['seconds'] = df['seconds'].apply(lambda x: f'{x:02d}')
    df['day'] = df['day'].apply(lambda x: f'{x:02d}')
    df['month'] = df['month'].apply(lambda x: f'{x:02d}')
    df['year'] = df['year'].astype(str)

    df['timestamp'] = (
        df['year'] + '-' +
        df['month'] + '-' +
        df['day'] + ' ' +
        df['hour'] + ':' +
        df['minute'] + ':' +
        df['seconds']
    )

    # Convert 'timestamp' to datetime format
    df['timestamp'] = pd.to_datetime(df['timestamp'])

In [352]:
def process_dataframe(df):
    df.loc[:, 'timestamp'] = pd.to_datetime(df['timestamp'])
    df.set_index('timestamp', inplace=True)

    df_resampled = df.resample('5S').asfreq().fillna({'awake': 3, 'step': 0.1, 'series_id': '3318a0e3ed6f'})
    df_resampled = df_resampled.reset_index()

    df_resampled['step'].fillna(0, inplace=True)

    for index, row in df_resampled.iterrows():
        if row['step'] == 0.1 and index-1 in df_resampled.index:
            df_resampled.at[index, 'step'] = df_resampled.at[index-1, 'step'] + 1

    return df_resampled

def missing_rows(df):
    processed_dataframes = []
    series = df.series_id.unique().tolist()
    for serie in series:
        processed_df = process_dataframe(df[df['series_id'] == serie])
        processed_dataframes.append(processed_df)
    
    final_result = pd.concat(processed_dataframes, ignore_index=True)
    return final_result

    

In [353]:
def binning(df):
    # Split the DataFrame by 'series_id'
    separated_dfs = [group for _, group in df.groupby('series_id')]
    print(separated_dfs)
    # Apply binning to each separate DataFrame
    custom_agg_function = lambda x: x.value_counts().idxmax()
    bin_size = '1Min'

    binned_dfs = []

    for df_subset in separated_dfs:
        try:
            binned_df = df_subset.resample(bin_size, on='timestamp').agg({
                'series_id': 'first',
                'step': 'first',
                'awake': custom_agg_function,
                #'wearable_on': custom_agg_function
            }).reset_index()
            
            binned_dfs.append(binned_df)
        except ValueError as e:
            print(f"Error processing DataFrame: {e}")
            print(df_subset)
            continue
    # Concatenate the DataFrames back together
    result = pd.concat(binned_dfs)
    result.reset_index(drop=True, inplace=True)
    return result
    

In [354]:
def add_onset_add_wakeup(df):
    # Initialize a new column 'wakeup' with zeros
    df['wakeup'] = 0
    df['onset'] = 0

    for index, row in df.iterrows():
        if row['awake_changes'] == 1:
            if index > 0:
                previous_row = df.loc[index - 1]
                # Check if same series
                if row['series_id'] == previous_row['series_id']:
                # Check condition for setting 'onset' column
                    if row['awake'] == 0.0 and previous_row['awake'] == 1.0:
                        df.at[index, 'onset'] = 1
                    
                    # Check condition for setting 'wakeup' column
                    elif row['awake'] == 1.0 and previous_row['awake'] == 0.0:
                        df.at[index, 'wakeup'] = 1


In [355]:
def delete_small_breaks(df):
    series = df.series_id.unique().tolist()
    for serie in series:
        onset_events = df[(df['series_id'] == serie) & (df['onset'] == 1)]
        wakeup_events = df[(df['series_id'] == serie) & (df['wakeup'] == 1)]
        
        for index, wakeup_row in wakeup_events.iterrows():
            next_onset_rows = df.iloc[index:].loc[df['onset'] == 1]
            if not next_onset_rows.empty:
                next_onset_index = next_onset_rows.iloc[0].name
                time_diff = (df.loc[next_onset_index]['timestamp'] - df.loc[index]['timestamp']).total_seconds() / 60
                if time_diff < 30:
                    df.loc[index, 'wakeup'] = 0
                    df.loc[next_onset_index, 'onset'] = 0
            else:
                print("No row found with 'onset' == 1 after index", index)

In [356]:
def delete_too_small_periods(df):
    series = df.series_id.unique().tolist()
    for serie in series:
        onset_events = df[(df['series_id'] == serie) & (df['onset'] == 1)]
        wakeup_events = df[(df['series_id'] == serie) & (df['wakeup'] == 1)]   
        
        for index, onset_row in onset_events.iterrows():
            #print(df.loc[index].name)
            next_wakeup_rows = df.iloc[index:].loc[df['wakeup'] == 1]
            if not next_wakeup_rows.empty:
                next_wakeup_index = next_wakeup_rows.iloc[0].name
                #print(next_onset_index)
                time_diff = (df.loc[next_wakeup_index]['timestamp'] - df.loc[index]['timestamp']).total_seconds() / 60
                if time_diff < 30:
                    df.loc[index, 'onset'] = 0
                    df.loc[next_wakeup_index, 'wakeup'] = 0
            else:
                print("No row found with 'onset' == 1 after index", index)

In [357]:
def add_column_night(df):
    series = df.series_id.unique().tolist()
    for serie in series:
        counter = 1
        df.loc[df['series_id'] == serie, 'night'] = 1
        next_index = 0
        
        # Check if there are any occurrences for series_id and hour is 15
        while (df[(df['series_id'] == serie) & (df.index > next_index + 60)]['timestamp'].dt.hour == 15).any():
            counter += 1
            next_index = df.loc[(df['series_id'] == serie) & (df.index > next_index + 60)].loc[df['timestamp'].dt.hour == 15].index[0]
            df.loc[(df['series_id'] == serie) & (df.index >= next_index), 'night'] = counter
        
        print("No occurrences found that meet the conditions.")

In [358]:
def one_sleep_widow(df):
    series = df.series_id.unique().tolist()
    df['event'] = np.NAN
    for serie in series:
        nights = df[df['series_id'] == serie]['night'].unique().tolist()
        for night in nights:
            max_window_duration = pd.Timedelta(0)
            current_window_start = None
            current_window_end = None
            onset_index = None
            wakeup_index = None
            
            for index, row in df[(df['series_id'] == serie) & (df['night'] == night)].iterrows():
                if row['onset'] == 1:
                    # Start of a potential sleeping window
                    current_window_start = row['timestamp']
                    onset_index = index
                elif row['wakeup'] == 1 and current_window_start is not None:
                    # End of a potential sleeping window
                    current_window_end = row['timestamp']
                    wakeup_index = index
                    window_duration = current_window_end - current_window_start
                    if window_duration > max_window_duration:
                        max_window_duration = window_duration
                        max_onset_index = onset_index
                        max_wakeup_index = wakeup_index

            print(f"The longest sleeping window duration is: {max_window_duration}")
            print(f"Starts at index: {max_onset_index}, Ends at index: {max_wakeup_index}")
            # Check if the longest window duration is longer than 30 minutes
            if max_window_duration > pd.Timedelta(minutes=30):
                # Assign 'onset' and 'wakeup' events based on indexes
                df.loc[max_onset_index, 'event'] = 'onset'
                df.loc[max_wakeup_index, 'event'] = 'wakeup'

In [359]:
def add_score(df):
    
    df['score'] = df.apply(calculate_score, axis=1)
    
    for index, row in df.iterrows():
        if row['score'] == 1:
            score_value = 1.0
            event_value = row['event']
            for i in range(index, -1, -1):
                df.at[i, 'score'] = score_value
                df.at[i, 'event'] = event_value
                score_value -= 0.1
                if score_value < 0:
                    break
            
    for index, row in df.iterrows():
        if row['score'] == 1:
            score_value = 1.0
            event_value = row['event']
            for i in range(index, len(df)):
                df.at[i, 'score'] = score_value
                df.at[i, 'event'] = event_value
                score_value -= 0.1
                if score_value < 0:
                    break
                
    df['score'] = df['score'].map('{:.1f}'.format)

In [360]:
def calculate_score(row):
    if row['event'] == 'onset' or row['event'] == 'wakeup':
        return 1
    else:
        return None

In [361]:
def heuristic_function(dataframe):
    df = dataframe.copy()
    
    # step1: restore timestamp information
    restore_timestamp(df)
    
    # step 2: only keep necessary columns
    df = df[['step', 'awake', 'series_id', 'timestamp']]
    
    # step 2.1: fill missing rows
    processed = missing_rows(df)

    # step 3: binning the data
    result = binning(processed)
    
    # step 4: find changes
    result['awake_changes'] = result['awake'].diff().ne(0).astype(int)

    # step 5: find onset and wakeup
    add_onset_add_wakeup(result)
    
    # step 6: if break between two sleeping windows is smaller than 30 min make one window out of it
    delete_small_breaks(result)

    # step 7: if sleeping period is smaller than 30 min delete it
    delete_too_small_periods(result)
    
    # step 8: get a column for night
    add_column_night(result)
    
    # step 9: only keep 1 sleeping window per night
    one_sleep_widow(result)
    
    # step 10: add score around the onset and wakeup
    add_score(result)
    
    # step 11: delete not necessary rows and columns
    result.drop(['timestamp', 'awake', 'awake_changes', 'wakeup', 'onset', 'night'], axis = 1, inplace=True)
    result =result.dropna(subset=['event'])
    
    # step 12: reset index
    result.reset_index(drop=True, inplace=True)
    result['row_id'] = result.reset_index().index
    
    return result
    
    

## test the sample data

In [371]:
sample.head(3)

Unnamed: 0,step,anglez,enmo,hour,minute,seconds,day,month,year,seconds_from_midnight,awake,wearable_on,series_id,onset,wakeup
26185272,3552,-74.877899,0.0188,21,41,0,15,8,2017,78060,1.0,1.0,31011ade7c0a,0,0
26185273,3553,-73.750504,0.0289,21,41,5,15,8,2017,78065,0.0,1.0,31011ade7c0a,0,1
26185274,3554,-75.431198,0.019,21,41,10,15,8,2017,78070,0.0,1.0,31011ade7c0a,0,0


In [372]:
sample.series_id.unique()

array(['31011ade7c0a', '3318a0e3ed6f'], dtype=object)

In [373]:
sample.tail(3)

Unnamed: 0,step,anglez,enmo,hour,minute,seconds,day,month,year,seconds_from_midnight,awake,wearable_on,series_id,onset,wakeup
27418317,671757,-6.411,0.0684,9,29,45,26,6,2018,34185,1.0,1.0,3318a0e3ed6f,0,0
27418318,671758,-9.1411,0.0692,9,29,50,26,6,2018,34190,1.0,1.0,3318a0e3ed6f,0,0
27418319,671759,-6.261,0.064,9,29,55,26,6,2018,34195,1.0,1.0,3318a0e3ed6f,0,0


In [374]:
versuch = heuristic_function(sample)
versuch.head()

[                 timestamp      step  awake     series_id
0      2017-08-15 21:41:00    3552.0    1.0  31011ade7c0a
1      2017-08-15 21:41:05    3553.0    0.0  31011ade7c0a
2      2017-08-15 21:41:10    3554.0    0.0  31011ade7c0a
3      2017-08-15 21:41:15    3555.0    0.0  31011ade7c0a
4      2017-08-15 21:41:20    3556.0    0.0  31011ade7c0a
...                    ...       ...    ...           ...
561283 2017-09-17 09:14:35  564835.0    1.0  31011ade7c0a
561284 2017-09-17 09:14:40  564836.0    1.0  31011ade7c0a
561285 2017-09-17 09:14:45  564837.0    1.0  31011ade7c0a
561286 2017-09-17 09:14:50  564838.0    1.0  31011ade7c0a
561287 2017-09-17 09:14:55  564839.0    1.0  31011ade7c0a

[561288 rows x 4 columns],                   timestamp      step  awake     series_id
561288  2018-05-18 12:30:00       0.0    1.0  3318a0e3ed6f
561289  2018-05-18 12:30:05       1.0    1.0  3318a0e3ed6f
561290  2018-05-18 12:30:10       2.0    1.0  3318a0e3ed6f
561291  2018-05-18 12:30:15       3.0  

UnboundLocalError: cannot access local variable 'max_onset_index' where it is not associated with a value

In [375]:
sample.series_id.unique()

array(['31011ade7c0a', '3318a0e3ed6f'], dtype=object)

In [376]:
versuch.series_id.unique()

array(['31011ade7c0a', '3318a0e3ed6f', '33ceeba8918a', '3452b878e596',
       '349c5562ee2c', '35826366dfc7', '361366da569e'], dtype=object)