In [32]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [33]:
# Load the four raw CSV exports from the shared data directory.
data_dir = '../data'
files = ['gfsteps.csv', 'jbsteps.csv', 'suggestions.csv', 'users.csv']

# Name each path once (same order as `files`) instead of indexing the list by position.
gfsteps_path, jbsteps_path, suggestions_path, users_path = (
    os.path.join(data_dir, name) for name in files
)

users_df = pd.read_csv(users_path)
gfsteps_df = pd.read_csv(gfsteps_path)
jbsteps_df = pd.read_csv(jbsteps_path)
suggestions_df = pd.read_csv(suggestions_path)

  suggestions_df = pd.read_csv(os.path.join(data_dir, files[2]))


1. Identify Time of Notification: Using the `sugg.select.utime` field from `suggestions.csv`, we pinpoint when each suggestion was given.

2. Align Step Data: We align the step data from `jbsteps.csv` and `gfsteps.csv` with the time of each notification, matching records by user. We then calculate the total number of steps before and after the notification over fixed time windows (e.g., 30 minutes before and 30 minutes after the notification).

3. Calculate Step Difference: For each suggestion, we compute the step counts in the defined time windows before and after the notification; comparing the two counts shows how activity changed around that notification.

In [34]:
# Parse the suggestion timestamps into datetimes; format='mixed' lets pandas
# infer the format of each value individually.
suggestions_df['sugg.select.utime'] = pd.to_datetime(
    suggestions_df['sugg.select.utime'],
    format='mixed',
)

# Both step exports keep their timestamps in 'steps.utime' and get the same
# default parsing, so convert them in one pass.
for steps_frame in (jbsteps_df, gfsteps_df):
    steps_frame['steps.utime'] = pd.to_datetime(steps_frame['steps.utime'])

# Length of the window examined on each side of a notification (30 minutes
# before and 30 minutes after).
time_window = pd.Timedelta(minutes=30)

# Function to calculate steps before and after notification
def calculate_steps(df_steps, suggestions_df, time_window):
    """Sum each user's steps in the window before and after every suggestion.

    Parameters
    ----------
    df_steps : pandas.DataFrame
        Step records. The columns 'user.index', 'steps.utime' and 'steps'
        are read.
    suggestions_df : pandas.DataFrame
        One row per suggestion. The columns 'user.index' and
        'sugg.select.utime' are read.
    time_window : pandas.Timedelta
        Length of the window examined on each side of the notification.

    Returns
    -------
    pandas.DataFrame
        One row per row of ``suggestions_df``, in the same order, with the
        columns 'user.index', 'notification_time', 'steps_before' and
        'steps_after'. An empty ``suggestions_df`` yields an empty frame that
        still carries these four columns, so later merges on them keep working.

    Notes
    -----
    With notification time ``t`` and window ``w``, 'steps_before' covers
    ``[t - w, t)`` and 'steps_after' covers ``(t, t + w]``. A step record
    stamped exactly at ``t`` is counted in neither total.
    """
    result_columns = ['user.index', 'notification_time', 'steps_before', 'steps_after']

    # Split the step table by user once, instead of re-scanning the whole
    # table with a boolean mask for every suggestion row.
    steps_by_user = {user: frame for user, frame in df_steps.groupby('user.index')}
    # Zero-row slice with the original dtypes: used for users that have no
    # step records so their sums come out as 0 of the same dtype.
    no_steps = df_steps.iloc[0:0]

    results = []
    for user_index, notification_time in zip(suggestions_df['user.index'],
                                             suggestions_df['sugg.select.utime']):
        user_steps = steps_by_user.get(user_index, no_steps)
        stamps = user_steps['steps.utime']

        # Window immediately preceding the notification: [t - w, t)
        before_mask = (stamps >= notification_time - time_window) & (stamps < notification_time)
        # Window immediately following the notification: (t, t + w]
        after_mask = (stamps > notification_time) & (stamps <= notification_time + time_window)

        results.append({
            'user.index': user_index,
            'notification_time': notification_time,
            'steps_before': user_steps.loc[before_mask, 'steps'].sum(),
            'steps_after': user_steps.loc[after_mask, 'steps'].sum(),
        })

    # Passing the column list keeps the schema stable even when `results` is empty.
    return pd.DataFrame(results, columns=result_columns)

# Run the before/after step computation once per tracker export
# (jbsteps first, then gfsteps), against the same suggestions and window.
jbsteps_results, gfsteps_results = (
    calculate_steps(tracker_steps, suggestions_df, time_window)
    for tracker_steps in (jbsteps_df, gfsteps_df)
)

# Stack the two per-tracker result tables into a single frame.
all_steps_results = pd.concat([jbsteps_results, gfsteps_results])

# Suggestion-level context to attach to every result row.
# NOTE(review): an earlier draft also carried 'dec.precipitation.chance'
# (coerced to numeric with missing values filled as 0); it is currently excluded.
suggestion_columns = [
    'user.index',
    'sugg.select.utime',
    'send.active',
    'send.sedentary',
    'dec.location.category',
]
suggestions_filtered = suggestions_df[suggestion_columns]

# User-level attributes; every column except the join key gets a 'user.' prefix.
user_attributes = ['age', 'gender', 'children', 'screentime']
users_filtered = users_df[['user.index'] + user_attributes].rename(
    columns={name: 'user.' + name for name in user_attributes}
)

# Attach the suggestion context (matched on user and notification time), then
# the user attributes, and finally drop 'sugg.select.utime', which duplicates
# 'notification_time' after the first merge.
# NOTE(review): if (user.index, sugg.select.utime) is not unique in
# suggestions_df, the first merge will repeat rows - confirm against the data.
all_steps_results = (
    all_steps_results
    .merge(
        suggestions_filtered,
        how='left',
        left_on=['user.index', 'notification_time'],
        right_on=['user.index', 'sugg.select.utime'],
    )
    .merge(users_filtered, how='left', on=['user.index'])
    .drop(columns=['sugg.select.utime'])
)

# Preview the first few rows of the final dataframe.
print(all_steps_results.head())

   user.index   notification_time  steps_before  steps_after send.active  \
0           1 2015-07-22 16:30:00             0         1311       False   
1           1 2015-07-22 18:30:00           418          414        True   
2           1 2015-07-22 21:30:00           261          341       False   
3           1 2015-07-22 23:30:00          3030          369       False   
4           1 2015-07-23 09:30:00             0            0       False   

  send.sedentary                              dec.location.category  user.age  \
0           True  restaurant,meal_takeaway,food,point_of_interes...        48   
1          False                                               work        48   
2          False                                               home        48   
3           True                                               home        48   
4          False                                               home        48   

  user.gender  user.children  user.screentime  
0      f

In [35]:
# Persist the merged per-suggestion table next to the raw inputs
# (without the DataFrame index) and report where it was written.
write_to = f'{data_dir}/preprocessed_all_steps_results.csv'
all_steps_results.to_csv(write_to, index=False)
print(f'Wrote to {write_to}')

Wrote to ../data/preprocessed_all_steps_results.csv
