In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from functions.inference_fns import *

In [None]:
# One-time setup, if the `permute` package is not installed: %pip install permute

In [None]:
# Folder containing the CSV inputs (relative path); reused when building file names.
path = './../data/'

# Raw input tables, one DataFrame per CSV file.
users_df = pd.read_csv(f'{path}users.csv')
suggestions_df = pd.read_csv(f'{path}suggestions.csv')
jbsteps_df = pd.read_csv(f'{path}jbsteps.csv')
gfsteps_df = pd.read_csv(f'{path}gfsteps.csv')

# For a quick look at any table, call .head() on it (e.g. users_df.head()).

1. Identify Time of Notification: Using the sugg.select.utime field from the suggestions.csv file, we can pinpoint when a suggestion was given.

2. Align Step Data: We'll align the step data from jbsteps.csv and gfsteps.csv with the time of notification. We'll calculate the total number of steps before and after the notification by choosing appropriate time windows (e.g., 30 minutes before and after the notification).

3. Calculate Step Difference: For each suggestion, we will compute the step counts in the defined time windows before and after the notification, and then take the difference between the two counts.

In [None]:
# Read the preprocessed results table from the data folder.
all_steps_results = pd.read_csv(f'{path}preprocessed_all_steps_results.csv')

In [None]:
# Subgroup definitions: each key is a display label and each value is a
# predicate that takes the results frame and returns a boolean row mask.
hypothesis_suite = {
    # Flag columns compared against True.
    'user was sedentary': lambda df: df['send.sedentary'].eq(True),
    'user was active': lambda df: df['send.active'].eq(True),

    # Location category.
    'location category is home': lambda df: df['dec.location.category'].eq('home'),
    'location category is work': lambda df: df['dec.location.category'].eq('work'),
    # NOTE(review): the negated isin() also selects rows whose category is
    # missing — confirm that this is intended.
    'location category is neither home nor work': lambda df: ~df['dec.location.category'].isin(['home', 'work']),

    # Gender.
    'user gender is male': lambda df: df['user.gender'].eq('male'),
    'user gender is female': lambda df: df['user.gender'].eq('female'),

    # Children count: strictly positive vs. exactly zero.
    'user has children': lambda df: df['user.children'].gt(0),
    'user has no children': lambda df: df['user.children'].eq(0),

    # Age bands: below 30, 30 up to (not including) 50, and 50 or above.
    'user is under 30': lambda df: df['user.age'].lt(30),
    'user is between 30 and 50': lambda df: df['user.age'].ge(30) & df['user.age'].lt(50),
    # NOTE(review): the label says "over 50" but the mask includes age 50 exactly.
    'user is over 50': lambda df: df['user.age'].ge(50),
}

# Run the test once per subgroup and collect the returned value under its label.
p_values = {}
for hypothesis_name, hypothesis_fn in hypothesis_suite.items():
    print("Testing hypothesis for when:", hypothesis_name)
    p_values[hypothesis_name] = test_hypothesis_for_criterion(
        all_steps_results,
        hypothesis_fn,
        reps=10**4,  # presumably the number of resampling repetitions — confirm in functions.inference_fns
    )
    print("=====================================")

In [None]:
# Pass the collected p-values to the project's benjamini_hochberg helper at
# alpha = 0.05 with plotting enabled.
# NOTE(review): presumably returns the hypotheses that pass the correction —
# confirm against functions.inference_fns.
successful_hypotheses = benjamini_hochberg(
    p_values,
    alpha=0.05,
    plot=True,
)

In [None]:
# Report, for every subgroup in hypothesis_suite, how many distinct users
# (distinct values of the 'user.index' column) have at least one matching row.
print("Number of unique users when... ")
for hypothesis_name, hypothesis_fn in hypothesis_suite.items():
    # Boolean-mask selection already yields a new object, so the full frame does
    # not need to be copied on every iteration; only the one column is selected.
    mask = hypothesis_fn(all_steps_results)
    num = all_steps_results.loc[mask, 'user.index'].nunique()
    # Label is left-aligned in a 50-character field so the counts line up.
    print(f"\t...{hypothesis_name:50}: {num}")