# Sample Size Calculator (WIP)

In [1]:
import tubi_data_runtime as tdr
import math
import pandas as pd
import numpy as np
from datetime import date
from statsmodels.stats.power import tt_ind_solve_power
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets

from ssc_utils.filter_generator import filter_generator
from ssc_utils.raw_user_data import raw_user_data
from ssc_utils.metric_switcher import metric_switcher
from ssc_utils.metric_summary import metric_summary
from ssc_utils.cuped import cuped
import ssc_utils.calculator as c

# Option lists for the widgets further down: `event_name_choices` feeds the two
# event multi-selects, `filter_metrics_choices` feeds the metric-filter dropdown.
# Each list comes from its own freshly constructed filter_generator.
event_name_choices, filter_metrics_choices = (
    filter_generator().event_name_choices(),
    filter_generator().filter_metrics_choices(),
)

## Interactive calculator (end user starts here)

In [2]:
# Experiment-design inputs. Each slider wraps a helper from ssc_utils.calculator
# and is given a (min, max, step) range; the selected value is read in the
# results cell through the widget's `.result` attribute.
print('relative effect size')
EFFECT_SIZE_RELATIVE = interactive(c.effect, x=(0.0, 1.0, 0.01))
display(EFFECT_SIZE_RELATIVE)

print('')
print('number of treatments')
# Lower bound is 1, not 0: the results cell computes
# ALPHA.result / NUMBER_VARIATIONS.result, so a selection of 0 would raise
# ZeroDivisionError.
NUMBER_VARIATIONS = interactive(c.variations, x=(1, 8, 1))
display(NUMBER_VARIATIONS)

print('')
print('allocation per variation (including control)')
ALLOCATION = interactive(c.allocation, x=(0.0, 1.0, 0.01))
display(ALLOCATION)

print('')
print('power')
POWER = interactive(c.power, x=(0.0, 1.0, 0.01))
display(POWER)

print('')
print('alpha')
ALPHA = interactive(c.alpha, x=(0.0, 1.0, 0.01))
display(ALPHA)

relative effect size


interactive(children=(FloatSlider(value=0.01, description='x', max=1.0, step=0.01), Output()), _dom_classes=('…


number of treatments


interactive(children=(IntSlider(value=1, description='x', max=8), Output()), _dom_classes=('widget-interact',)…


allocation per variation (including control)


interactive(children=(FloatSlider(value=0.5, description='x', max=1.0, step=0.01), Output()), _dom_classes=('w…


power


interactive(children=(FloatSlider(value=0.8, description='x', max=1.0, step=0.01), Output()), _dom_classes=('w…


alpha


interactive(children=(FloatSlider(value=0.05, description='x', max=1.0, step=0.01), Output()), _dom_classes=('…

In [3]:
# Primary-metric dropdown. The options come from metric_switcher.possible_metrics();
# the selection is read later as `metric_str.result` when the SQL is assembled.
print('')
print('choose your primary metric')
metric_str = interactive(
    metric_switcher().choose_metric,
    metric=metric_switcher().possible_metrics(),
)
display(metric_str)


choose your primary metric


interactive(children=(Dropdown(description='metric', options=('capped_tvt', 'new_viewer_first_day_capped_tvt',…

In [4]:
# Attribute filter controls: field, condition and a free-text value, wired to
# filter_generator.make_sql_condition_string with filter_type pinned to 'attribute'.
# The widget itself is handed to generate_filter_cte when the SQL is assembled.
print('choose attribute filter')
attribute_filter = interactive(
    filter_generator().make_sql_condition_string,
    field=filter_generator().filter_attributes_choices(),
    condition=filter_generator().attribute_conditions_choices(),
    value='',
    filter_type=fixed('attribute'),
)
display(attribute_filter)

choose attribute filter


interactive(children=(Dropdown(description='field', options=('no filters', 'user_id', 'device_id', 'platform',…

In [5]:
# Metric filter controls: same shape as the attribute filter, but the field list
# is `filter_metrics_choices` (loaded in the first cell) and filter_type is
# pinned to 'metric'.
print('')
print('choose metric filter')
metric_filter = interactive(
    filter_generator().make_sql_condition_string,
    field=filter_metrics_choices,
    condition=filter_generator().metric_conditions_choices(),
    value='',
    filter_type=fixed('metric'),
)
display(metric_filter)


choose metric filter


interactive(children=(Dropdown(description='field', options=('no filters', 'visit_total_count', 'tvt_sec', 'mo…

In [6]:
def _event_selector():
    """Return a multi-select over `event_name_choices`, preselecting 'no event filter'."""
    return widgets.SelectMultiple(
        options=event_name_choices,
        value=('no event filter',),
        description='Events',
        disabled=False,
    )


def _event_sub_condition():
    """Return field/condition/value controls with filter_type pinned to 'event'."""
    return interactive(
        filter_generator().make_sql_condition_string,
        field=filter_generator().event_sub_cond_field_choices(),
        condition=filter_generator().metric_conditions_choices(),
        value='',
        filter_type=fixed('event'),
    )


# --- primary event and its sub-condition ---
print('choose your primary event')
primary_event = _event_selector()
display(primary_event)

print('choose event sub-condition')
primary_event_sub_cond = _event_sub_condition()
display(primary_event_sub_cond)

# --- pre-requisite event and its sub-condition ---
print('')
print('choose your pre-requisite event (does not work if primary event is empty)')
pre_event = _event_selector()
display(pre_event)

print('choose event sub-condition')
pre_event_sub_cond = _event_sub_condition()
display(pre_event_sub_cond)

# --- optional time interval, entered as free text (defaults to 'NULL') ---
print('choose time interval (leave NULL if not needed)')
time_interval = interactive(filter_generator().choose, choice='NULL')
display(time_interval)

choose your primary event


SelectMultiple(description='Events', index=(0,), options=('no event filter', 'AccountEvent', 'ActiveEvent', 'A…

choose event sub-condition


interactive(children=(Dropdown(description='field', options=('no filters', 'content_completion_pct', 'componen…


choose your pre-requisite event (does not work if primary event is empty)


SelectMultiple(description='Events', index=(0,), options=('no event filter', 'AccountEvent', 'ActiveEvent', 'A…

choose event sub-condition


interactive(children=(Dropdown(description='field', options=('no filters', 'content_completion_pct', 'componen…

choose time interval (leave NULL if not needed)


interactive(children=(Text(value='NULL', description='choice'), Output()), _dom_classes=('widget-interact',))

## After the user specifies the settings, run everything below

In [7]:
# Assemble the final query by concatenating the fragments produced by the
# ssc_utils generators, in order: filters -> raw user data -> per-user metric ->
# metric summary -> CUPED.
# The filter generator receives the widgets themselves, not their current values.
filters = filter_generator().generate_filter_cte(
    attribute_condition_interact=attribute_filter,
    metric_condition_interact=metric_filter,
    event1_condition_interact=pre_event,
    event1_sub_condition_interact=pre_event_sub_cond,
    event2_condition_interact=primary_event,
    event2_sub_condition_interact=primary_event_sub_cond,
    event_time_interval_interact=time_interval,
)
raw_user = raw_user_data().generate_raw_user_data_cte(prev_cte_sql=filters)
user = metric_switcher().generate_user_data_cte(metric_str.result)
summary = metric_summary().generate_metric_summary_cte()
# Stored as `cuped_cte`, not `cuped`: rebinding `cuped` would shadow the imported
# class, and re-running this cell would then fail when calling `cuped()`
# (TypeError: object is not callable).
cuped_cte = cuped().generate_cuped_cte()

FINAL_SQL = filters + raw_user + user + summary + cuped_cte

# output SQL for debugging purposes; can copy and manually run this elsewhere
print(FINAL_SQL)

WITH raw_user_data AS (
              SELECT 
                  a.device_id,
                  device_first_seen_ts,
                  ds,
                  platform_type,
                  platform,
                  GETDATE() AS last_exposure_ds,
                  DATEADD('week', -2, DATE_TRUNC('week', last_exposure_ds)) AS first_exposure_ds,
                  -- Metrics
                  sum(tvt_sec) as tvt_sec,
                  sum(user_signup_count) as user_signup_count,
                  sum(device_registration_count) as device_registration_count,
                  sum(signup_or_registration_activity_count) as signup_or_registration_activity_count,
                  sum(visit_total_count) as visit_total_count
              FROM tubidw.device_metric_daily as a
        
              WHERE DATE_TRUNC('week',ds) >= dateadd('week', -4, DATE_TRUNC('week',GETDATE()))
                AND DATE_TRUNC('week',ds) < DATE_TRUNC('week', GETDATE())
              GROUP BY 1,2,3,4,5,6,7
        

In [8]:
# Execute the assembled SQL through the tubi_data_runtime Redshift client and
# load the result into `df`, which the results cell treats as a pandas DataFrame
# (it reads the avg_cuped_result, std_cuped_result, observations and platform columns).
df = tdr.query_redshift(FINAL_SQL).to_df()

## Sample size results

In [9]:
import warnings
from statsmodels.tools.sm_exceptions import ConvergenceWarning
# Suppress statsmodels ConvergenceWarning (presumably emitted by the power solver
# inside c.sample_power_ttest - TODO confirm) so it does not flood the output.
warnings.simplefilter('ignore', ConvergenceWarning)

# ---------- Constants ---------- #
COL_NAME_P = 'avg_cuped_result'
STD_COL_NAME = 'std_cuped_result'
RATIO = 1
SAMPLING = 1  # TODO: make this dynamic between 1 and 1000 for sampled analytics

# Fail with a clear message instead of a ZeroDivisionError when the
# "number of treatments" control is set to 0.
n_treatments = NUMBER_VARIATIONS.result
if n_treatments < 1:
    raise ValueError('number of treatments must be at least 1')

# Bonferroni-style correction: split alpha evenly across the treatments.
CORRECTED_ALPHA = ALPHA.result / n_treatments
# Treatment mean expressed as a multiple of the baseline mean.
P2_MULTIPLICATIVE_FACTOR = 1 + EFFECT_SIZE_RELATIVE.result

# ---------- Implementation ---------- #
# One power calculation per row of the query result: baseline mean versus
# baseline mean scaled by the relative effect, using that row's std.
df['sample_required'] = df.apply(
    lambda row: c.sample_power_ttest(
        p1=row[COL_NAME_P],
        p2=row[COL_NAME_P] * P2_MULTIPLICATIVE_FACTOR,
        sd_diff=row[STD_COL_NAME],
        alpha=CORRECTED_ALPHA,
        power=POWER.result,
        ratio=RATIO,
    ),
    axis=1,
)

# Required sample divided by the observations available to one variation.
# NOTE(review): the 0.5 factor presumably converts `observations` to a per-week
# count (i.e. the query window is two weeks) - confirm against the SQL.
df['weeks_required'] = np.divide(
    df['sample_required'],
    df['observations'] * 0.5 * ALLOCATION.result * SAMPLING,
)
df['sample_required'] = df['sample_required'].astype('float')
df['weeks_required'] = df['weeks_required'].astype('float')

# Last expression of the cell, so the styled table is rendered as the output.
# NOTE(review): Styler.hide_index and Styler.set_precision are deprecated and were
# removed in pandas 2.0; when upgrading, use .hide(axis='index') and
# .format(precision=3) instead.
(
    df.sort_values('platform')
    .style
    .hide_index()
    .set_precision(3)
)

metric_name,platform,observations,avg_cuped_result,std_cuped_result,sample_required,weeks_required
capped_tvt,ALL,37657811,2.15,4.516,692529.0,0.074
capped_tvt,AMAZON,3811975,4.017,6.396,397923.0,0.418
capped_tvt,ANDROID,6984032,1.406,3.554,1002875.0,0.574
capped_tvt,COMCAST,2394550,2.963,4.688,392829.0,0.656
capped_tvt,COX,250823,3.385,5.039,347892.0,5.548
capped_tvt,IPAD,571331,1.588,3.678,841686.0,5.893
capped_tvt,IPHONE,3433708,0.791,2.491,1555674.0,1.812
capped_tvt,MOBILE,11125461,1.236,3.284,1107785.0,0.398
capped_tvt,OTT,20103202,3.192,5.429,453923.0,0.09
capped_tvt,PS4,543675,2.865,5.392,555882.0,4.09
