### The following code snippet helps
+ Identify user-annotated episodes that are consecutive in nature
+ For our purposes, we define `consecutive` in one of two ways:
    - **Episodes annotated with the same stressor with no non-stress annotations in between**
    - **Episodes annotated with the same stressor with zero time-gap between them**

In [1]:
from cerebralcortex import Kernel
from datetime import datetime, date, timedelta
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import warnings
import pickle

# Notebook-wide display and warning settings.
pd.set_option('display.float_format', '{:.3f}'.format)  # render floats with three decimals
pd.options.display.max_columns = None  # never truncate the column list when displaying frames
warnings.simplefilter("ignore")  # silence every warning for the rest of the session

### Fetch data

In [68]:
# Load the pickled episode table (pickle runs code on load, so only open trusted files),
# then add a `date` column holding the calendar date of each episode's start time.
final_df_path = '/home/jupyter/sneupane/MOODS/analysis/papers/CHI/dataframe/final_df.pickle'
with open(final_df_path, 'rb') as handle:
    final_df = pickle.load(handle)
final_df = final_df.assign(date=final_df['starttime'].dt.date)

In [None]:
# display(final_df.head())

### Data cleaning and filtering

In [29]:
# Columns kept from the raw episode table by tweak_ds_users_episodes.
cols = [
    'user',
    'stress_id',
    'starttime',
    'endtime',
    'episode_class',
    'user_generated',
    'user_rating',
    'selected',
    'Stressor',
]

In [30]:
def tweak_ds_users_episodes(df_users_episodes):
    """Select the analysis columns and add derived per-episode fields.

    Keeps the columns listed in the module-level ``cols`` and adds/overwrites:

    * ``starttime`` / ``endtime`` -- parsed with ``pd.to_datetime``
    * ``eps_dur`` -- episode duration in minutes (``endtime - starttime``)
    * ``date`` -- calendar date of ``starttime``
    * ``rated`` -- 1 where ``user_rating`` is non-null, else 0

    The result is sorted by ``user`` and then ``endtime``.
    """
    # Convert once and derive everything from the converted values, so the
    # function also works when the timestamps arrive as strings/objects.
    starttime = pd.to_datetime(df_users_episodes['starttime'])
    endtime = pd.to_datetime(df_users_episodes['endtime'])
    return (df_users_episodes[cols]
            .assign(starttime=starttime,
                    endtime=endtime,
                    eps_dur=(endtime - starttime).dt.total_seconds() / 60,
                    date=starttime.dt.date,
                    rated=np.where(df_users_episodes['user_rating'].notnull(), 1, 0))
            .sort_values(by=['user', 'endtime'])
           )

In [31]:
def filter_df_users_episodes(df_users_episodes):
    """Drop episodes that should not enter the consecutive-episode analysis.

    Expects rows ordered by ``user`` and ``endtime``. Applies, in order:

    1. Drop a row whose ``endtime`` equals the ``endtime`` of the preceding
       row of the *same user* (duplicate episode).
    2. Keep only rows with ``episode_class`` in ``[0, 2]``,
       ``user_generated == 0`` and ``rated == 1``.
    3. Drop rows that carry a ``Stressor`` although the user rated the
       episode 'Probably not stressed' or 'Not stressed'.
    """
    not_stressed_ratings = ['Probably not stressed', 'Not stressed']
    return (df_users_episodes
            # Compare with the previous row within each user; a frame-wide shift
            # would compare a user's first episode with the previous user's last.
            .loc[lambda df: df.groupby('user')['endtime'].shift() != df['endtime']]
            .loc[lambda df: (df['episode_class'].isin([0, 2]))
                 & (df['user_generated'] == 0)
                 & (df['rated'] == 1)]
            .loc[lambda df: ~((df['Stressor'].notnull())
                 & (df['user_rating'].isin(not_stressed_ratings)))]
           )

In [32]:
def filter_users_with_min_annations(df_users_episodes, min_annot):
    """Keep only the rows of users whose ``rated`` values sum to at least ``min_annot``."""
    rated_totals = df_users_episodes.groupby('user')['rated'].sum()
    qualifying_users = rated_totals.index[rated_totals >= min_annot]
    return df_users_episodes[df_users_episodes['user'].isin(qualifying_users)]

In [33]:
# Cleaning pipeline: derive per-episode fields, drop unusable episodes, then keep
# only users with at least one rated episode.
final_df = (final_df
            .pipe(tweak_ds_users_episodes)
            .pipe(filter_df_users_episodes)
            .pipe(filter_users_with_min_annations, 1))

In [67]:
# Load the pickled table that carries the cleaned stressor labels
# (pickle runs code on load, so only open trusted files).
category_df_path = '/home/jupyter/sneupane/MOODS/analysis/papers/CHI/dataframe/final_df_stressed_with_category.pickle'
with open(category_df_path, 'rb') as handle:
    df_with_category = pickle.load(handle)

In [None]:
# display(df_with_category.head(5))

### Update final_df with cleaned stressors

In [66]:
# Attach the cleaned stressor label to each episode by stress_id; the left join keeps
# every row of final_df, and rows without a match get a missing mod_stressor.
mod_stressors = df_with_category.loc[:, ['stress_id', 'mod_stressor']]
final_df = final_df.merge(mod_stressors, how='left', on='stress_id')

In [None]:
# Preview the first rows of the merged frame.
final_df.head()

In [57]:
## A two-pointer approach is used. seq_len is the target length of a consecutive
## sequence. seq_match is incremented each time successive stressors match. The inner loop
## terminates if a mismatch is found, the end of the dataframe is reached, or seq_match + 1 equals seq_len.
## If seq_match + 1 equals seq_len, the stress episode pointed to by lp marks the start of a
## consecutive sequence of length seq_len.
## lp and rp are then moved past the episodes just scanned and seq_match is reset.

def anchor_of_consecutive_episodes(df_users_episodes, seq_len):
    """Return the first episode of every run of ``seq_len`` same-stressor episodes.

    Episodes are grouped by ``(user, date)`` and ordered by ``starttime``.
    Within a group, a two-pointer scan looks for ``seq_len`` successive rows
    whose ``mod_stressor`` values are equal. When such a run is found, the row
    at its start (the anchor) is recorded and the scan resumes right after the
    run, so the reported runs never overlap. Rows with a missing
    ``mod_stressor`` never match anything and therefore break a run.

    Parameters
    ----------
    df_users_episodes : pandas.DataFrame
        Needs the columns ``user``, ``date``, ``starttime`` and ``mod_stressor``.
    seq_len : int
        Length of the consecutive sequence to look for. Values below 2 yield
        no anchors.

    Returns
    -------
    pandas.DataFrame
        The anchor rows with the same columns as the input and a fresh
        0..n-1 index; empty when no run of that length exists.
    """
    ordered = df_users_episodes.sort_values(by=['user', 'starttime'])
    # Collect anchors per group and concatenate once: growing a frame row by
    # row copies it on every insert and degrades column dtypes to object.
    anchor_frames = []

    for _, gdf in ordered.groupby(['user', 'date']):
        gdf = gdf.sort_values(by='starttime').reset_index(drop=True)
        stressors = gdf['mod_stressor'].tolist()
        n_rows = len(stressors)
        anchor_positions = []
        lp, rp, seq_match = 0, 1, 0
        while rp < n_rows:
            # Both labels must be present: NaN never equals NaN, but None == None
            # is True and would wrongly pair two unlabelled episodes.
            while (rp < n_rows
                   and pd.notna(stressors[rp])
                   and pd.notna(stressors[rp - 1])
                   and stressors[rp] == stressors[rp - 1]):
                rp += 1
                seq_match += 1
                if seq_match + 1 == seq_len:
                    anchor_positions.append(lp)
                    break
            # Restart the scan at the first row not consumed by this attempt.
            lp = rp
            rp = lp + 1
            seq_match = 0
        if anchor_positions:
            anchor_frames.append(gdf.iloc[anchor_positions])

    if not anchor_frames:
        return pd.DataFrame(columns=df_users_episodes.columns)
    return pd.concat(anchor_frames, ignore_index=True)

In [59]:
anchors = final_df.copy()
## Start with a seq_len of 2 and grow it by one per pass.
seq_len = 2
while True:
    anchor_episodes = anchor_of_consecutive_episodes(anchors, seq_len)
    ## Stop as soon as no anchor episodes exist for the current seq_len: there cannot be
    ## any anchor episodes for a higher seq_len either.
    if not len(anchor_episodes):
        break
    ## Add a column con_seq_<seq_len> that is 1 for each stress episode marking the start of a
    ## consecutive sequence of length seq_len and 0 otherwise. isin is vectorised, whereas a
    ## per-row `sid in list` test scans the whole list for every episode.
    col = "con_seq_" + str(seq_len)
    stress_ids = anchor_episodes['stress_id'].to_list()
    anchors[col] = anchors['stress_id'].isin(stress_ids).astype(int)
    seq_len += 1

In [65]:
# anchors.head()

In [60]:
# List every column now present, including the con_seq_<n> flag columns added by the loop.
anchors.columns

Index(['user', 'stress_id', 'starttime', 'endtime', 'episode_class',
       'user_generated', 'user_rating', 'selected', 'Stressor', 'eps_dur',
       'date', 'rated', 'mod_stressor', 'tgap', 'con_seq_2', 'con_seq_3',
       'con_seq_4', 'con_seq_5', 'con_seq_6', 'con_seq_7', 'con_seq_8',
       'con_seq_9', 'con_seq_10', 'con_seq_11', 'con_seq_12', 'con_seq_13',
       'con_seq_14', 'con_seq_15', 'con_seq_16', 'con_seq_17', 'con_seq_18',
       'con_seq_19', 'con_seq_20', 'con_seq_21'],
      dtype='object')

In [61]:
# Inspect one participant-day: identifying columns plus a trailing slice of the flag
# columns (the slice stops one short of the last column).
day_rows = ((anchors['user'] == '*************************')
            & (anchors['date'] == date(2023,1,4))
            & (anchors['stress_id'] >= 215910))
shown_columns = ['stress_id', 'user_rating', 'mod_stressor'] + anchors.columns[-20:-1].tolist()
anchors.loc[day_rows, shown_columns]

Unnamed: 0,stress_id,user_rating,mod_stressor,con_seq_2,con_seq_3,con_seq_4,con_seq_5,con_seq_6,con_seq_7,con_seq_8,con_seq_9,con_seq_10,con_seq_11,con_seq_12,con_seq_13,con_seq_14,con_seq_15,con_seq_16,con_seq_17,con_seq_18,con_seq_19,con_seq_20
542,215936,Unsure,figuring out what to do,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
543,215942,Unsure,figuring out what to do,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
544,215943,Unsure,figuring out what to do,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
545,215945,Unsure,figuring out what to do,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
546,215947,Unsure,figuring out what to do,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
547,215948,Unsure,figuring out what to do,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
548,215951,Unsure,figuring out what to do,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
549,215954,Probably not stressed,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
550,215963,Probably not stressed,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [64]:
# tgap: seconds between the end of an episode and the start of the same user's next
# episode. The shift is done per user so that a user's last episode gets a missing gap
# instead of one measured against the next user's first episode.
final_df = (final_df.sort_values(by=['user', 'date', 'starttime'])
                     .assign(tgap=lambda df: (df.groupby('user')['starttime'].shift(-1)
                                              - df['endtime'])
                             .dt.total_seconds()))

In [63]:
# final_df.head()

In [53]:
## This is similar to the previous code, with the added condition that the time gap between
## successive annotated stress episodes has to be 0.

def anchor_of_consecutive_episodes(df_users_episodes, seq_len):
    """Return the first episode of every run of ``seq_len`` back-to-back same-stressor episodes.

    Episodes are grouped by ``(user, date)`` and ordered by ``starttime``.
    Two successive rows belong to the same run only when the later row starts
    exactly when the earlier one ends (zero time gap) *and* both carry the
    same, non-missing ``mod_stressor``. When ``seq_len`` such rows are found in
    a row, the first one (the anchor) is recorded and the scan resumes right
    after the run, so the reported runs never overlap.

    Every ``seq_len`` (including 2) goes through the same scan, so the stressor
    check always applies and no precomputed ``tgap`` column is required.

    Parameters
    ----------
    df_users_episodes : pandas.DataFrame
        Needs the columns ``user``, ``date``, ``starttime``, ``endtime`` and
        ``mod_stressor``.
    seq_len : int
        Length of the consecutive sequence to look for. Values below 2 yield
        no anchors.

    Returns
    -------
    pandas.DataFrame
        The anchor rows with the same columns as the input and a fresh
        0..n-1 index; empty when no run of that length exists.
    """
    ordered = df_users_episodes.sort_values(by=['user', 'starttime'])
    # Collect anchors per group and concatenate once: growing a frame row by
    # row copies it on every insert and degrades column dtypes to object.
    anchor_frames = []

    for _, gdf in ordered.groupby(['user', 'date']):
        gdf = gdf.sort_values(by='starttime').reset_index(drop=True)
        starts = gdf['starttime'].tolist()
        ends = gdf['endtime'].tolist()
        stressors = gdf['mod_stressor'].tolist()
        n_rows = len(gdf)
        anchor_positions = []
        lp, rp, seq_match = 0, 1, 0
        while rp < n_rows:
            # Both labels must be present: NaN never equals NaN, but None == None
            # is True and would wrongly pair two unlabelled episodes.
            while (rp < n_rows
                   and (starts[rp] - ends[rp - 1]).total_seconds() == 0
                   and pd.notna(stressors[rp])
                   and pd.notna(stressors[rp - 1])
                   and stressors[rp] == stressors[rp - 1]):
                rp += 1
                seq_match += 1
                if seq_match + 1 == seq_len:
                    anchor_positions.append(lp)
                    break
            # Restart the scan at the first row not consumed by this attempt.
            lp = rp
            rp = lp + 1
            seq_match = 0
        if anchor_positions:
            anchor_frames.append(gdf.iloc[anchor_positions])

    if not anchor_frames:
        return pd.DataFrame(columns=df_users_episodes.columns)
    return pd.concat(anchor_frames, ignore_index=True)

In [54]:
anchors = final_df.copy()
## Start with a seq_len of 2 and grow it by one per pass.
seq_len = 2
while True:
    anchor_episodes = anchor_of_consecutive_episodes(anchors, seq_len)
    ## Stop as soon as no anchor episodes exist for the current seq_len: there cannot be
    ## any anchor episodes for a higher seq_len either.
    if not len(anchor_episodes):
        break
    ## Add a column con_seq_<seq_len> that is 1 for each stress episode marking the start of a
    ## consecutive sequence of length seq_len and 0 otherwise. isin is vectorised, whereas a
    ## per-row `sid in list` test scans the whole list for every episode.
    col = "con_seq_" + str(seq_len)
    stress_ids = anchor_episodes['stress_id'].to_list()
    anchors[col] = anchors['stress_id'].isin(stress_ids).astype(int)
    seq_len += 1

In [56]:
# Inspect one participant-day ordered by start time: timing columns plus a trailing
# slice of the flag columns (the slice stops one short of the last column).
day_rows = ((anchors['user'] == '*************************')
            & (anchors['date'] == date(2023,1,13))
            & (anchors['stress_id'] >= 219376))
shown_columns = (['stress_id', 'starttime', 'endtime', 'mod_stressor', 'tgap']
                 + anchors.columns[-9:-1].tolist())
anchors.loc[day_rows, shown_columns].sort_values(by='starttime')

Unnamed: 0,stress_id,starttime,endtime,mod_stressor,tgap,con_seq_3,con_seq_4,con_seq_5,con_seq_6,con_seq_7,con_seq_8,con_seq_9,con_seq_10
5806,219376,2023-01-13 22:37:13,2023-01-13 22:41:11,grieving,0.0,1,1,1,1,1,1,1,1
5807,219377,2023-01-13 22:41:11,2023-01-13 22:54:11,grieving,0.0,0,0,0,0,0,0,0,0
5808,219378,2023-01-13 22:54:11,2023-01-13 23:00:11,grieving,0.0,0,0,0,0,0,0,0,0
5809,219379,2023-01-13 23:00:11,2023-01-13 23:04:11,grieving,0.0,1,0,0,0,0,0,0,0
5810,219380,2023-01-13 23:04:11,2023-01-13 23:06:11,grieving,0.0,0,1,0,0,0,0,0,0
5811,219381,2023-01-13 23:06:11,2023-01-13 23:12:12,grieving,0.0,0,0,1,0,0,0,0,0
5812,219382,2023-01-13 23:12:12,2023-01-13 23:19:11,grieving,0.0,1,0,0,0,0,0,0,0
5813,219383,2023-01-13 23:19:11,2023-01-13 23:26:11,grieving,0.0,0,0,0,0,0,0,0,0
5814,219384,2023-01-13 23:26:11,2023-01-13 23:33:12,grieving,0.0,0,0,0,0,0,0,0,0
5815,219385,2023-01-13 23:33:12,2023-01-13 23:35:11,grieving,0.0,0,0,0,0,0,0,0,0
