## Code exploration and check up of higher level metrics

Source code that produces a TV influence (uplift) number similar to the one shown on the InnovidXP website.

In [None]:
import os
# Change the working directory to the part of the current path that precedes the
# last 'tv2ds/' segment -- presumably so the `tv2ds` / `tvsquared` imports below resolve
# (TODO confirm). If the path contains no 'tv2ds/', rsplit returns it unchanged and
# the chdir is a no-op.
currentdir = os.getcwd()
correctdir = currentdir.rsplit('tv2ds/',1)[0]
os.chdir(correctdir)

from tv2ds.ds_lib import notebook_prodrun
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime
import tvsquared.settings
from tvsquared.lib.request import Request
from tvsquared.lib.athena import AthenaDatabase
pd.options.display.float_format = '{:.4f}'.format
pd.set_option('display.max_colwidth', None)

In [None]:
notebook_prodrun.set_env('US', prodrunenv='PROD')
crosswalk_suffix = os.environ.get('TV2PRODRUNENV').lower()
crosswalk_suffix

'produsa'

In [None]:
def make_request(clientid, brandid = 1, datefrom=False, dateto=False):
    """Create and initialise a ``Request`` for the given client and brand.

    Parameters
    ----------
    clientid : client identifier forwarded to ``Request.init``.
    brandid : brand identifier forwarded to ``Request.init`` (default 1).
    datefrom, dateto : optional 'YYYY-MM-DD' strings. They are parsed and forwarded
        only when BOTH are truthy; otherwise ``None`` is sent for both dates.

    Returns
    -------
    Whatever ``Request.init`` returns.
    """
    blank_request = Request(clientarg=False, brandarg=False, datesarg=False)

    if datefrom and dateto:
        start = datetime.datetime.strptime(datefrom, '%Y-%m-%d')
        end = datetime.datetime.strptime(dateto, '%Y-%m-%d')
    else:
        # A single date on its own is ignored, exactly like no dates at all.
        start = end = None

    return blank_request.init(
        partnerid=None,
        clientid=clientid,
        brandid=brandid,
        datefrom=start,
        dateto=end,
        loglevel=-1,
        extargs=None,
        request=None,
        usespark=None,
        readPreference=None,
        prodrun=False,
    )


def query_athena(request, query, copy_to_local=False):
    """Execute ``query`` on the Athena database resolved from ``request``.

    The rows returned by ``AthenaDatabase.execute_query`` are wrapped in a
    ``pandas.DataFrame``; ``copy_to_local`` is forwarded unchanged.
    """
    client_db = AthenaDatabase.get_client_database(request)
    rows = AthenaDatabase.execute_query(client_db, query, copy_to_local=copy_to_local)
    return pd.DataFrame(rows)

In [None]:
client = {'clientid': 9306}
request = make_request(client['clientid'])

### ➡️ Client and information used to extract sample data:
**Drizly**<br>
vendor_name='inscape', yy='2022', mm='03',dd='28',crosswalk_suffix='produsa',dateto='2022-03-31', datefrom='2022-03-01', clientid='c9306_drizly'<br>

**Therealreal**<br>
vendor_name='inscape', yy='2022', mm='01',dd='24',crosswalk_suffix='produsa',dateto='2022-01-30', datefrom='2022-01-01', clientid='c16319_the_realreal'<br>

**Uti**<br>
vendor_name='inscape', yy='2022', mm='01',dd='24',crosswalk_suffix='produsa',dateto='2022-01-30', datefrom='2022-01-01', clientid='c9534_uti'<br>

In [None]:
# Inputs needed for queries in Athena

yy_input='2022' 
mm_input='03'
dd_input='28'
dateto_input='2022-03-31' 
datefrom_input='2022-03-01'  
clientid_input='c9306_drizly'
# clientid_input='c9534_uti'
# clientid_input='c16319_the_realreal'

### ➡️ Original high level query 

In [None]:
def query(subquery_name:str, yy_value:str, mm_value:str, dd_value:str, dateto_value:str, datefrom_value:str, clientid_value:str) -> str:
   """Build the Athena SQL for the exposed-vs-control visit rates and uplift.

   Parameters
   ----------
   subquery_name : name of the CTE to return. ``'final_results'`` yields the single
       summary row (ctrl_hh, ctrl_visited, exp_hh, exp_visited, ctrl_vr, exp_vr,
       uplift). Any other value yields ``select * from <subquery_name>`` so an
       intermediate CTE can be inspected. (Previously this argument was ignored.)
   yy_value, mm_value, dd_value : crosswalk partition (yy / mm / dd) to read.
   dateto_value, datefrom_value : campaign window as 'YYYY-MM-DD' strings.
   clientid_value : client schema prefix, e.g. 'c9306_drizly'.

   Returns
   -------
   str
       The formatted SQL. Vendor ('inscape') and environment suffix ('produsa')
       are hard-coded.

   Notes
   -----
   NOTE(review): ``client_eventlog`` only keeps rows from ``datefrom`` onwards, so the
   ``- interval '30' day`` lookback in ``hh_impressed_30days`` cannot reach earlier
   impressions. Left as is to keep results comparable -- confirm intent.
   NOTE(review): ``in_scope in (TRUE, null)`` never matches rows whose ``in_scope`` is
   NULL (SQL three-valued logic), so it behaves like ``in_scope = TRUE``.
   """
   # CTEs shared by every possible output.
   shared_ctes = '''
   with filtered_hh as (
      select
         key_value as mapped_tv2_hhid
      from {crosswalk_suffix}_modeldata.crosswalk
      where
         yy='{yy}'
         and mm='{mm}'
         and dd='{dd}'
         and key_name ='tv2_hhid'
         and vendor_name = '{vendor_name}'
         and excluded_stamp is null
   ),
   campaign_universe as (
      SELECT
         complex_ranges.mapped_tv2_hhid
      FROM {crosswalk_suffix}_modeldata.crosswalk cw
      CROSS JOIN UNNEST(complex_range) AS t (complex_ranges)
      join filtered_hh fh on fh.mapped_tv2_hhid = complex_ranges.mapped_tv2_hhid
      where
         vendor_name = '{vendor_name}'
         and complex_ranges.first_seen <= timestamp '{dateto}' + interval '1' day
         and complex_ranges.last_seen >= timestamp '{datefrom}'
         and key_name = 'tv2_hhid'
         and yy='{yy}'
         and mm='{mm}'
         and dd='{dd}'
   ),
   client_eventlog as (
   select
      *
   from {clientid}_{crosswalk_suffix}.eventlog
      where
         datadatetime between timestamp '{datefrom}'
         and timestamp '{dateto}' + interval '7' day
   ),
   hh_impressed_30days as (
      select
         distinct crosswalk_link_id as mapped_tv2_hhid
      from campaign_universe ex
      join client_eventlog ev on ev.crosswalk_link_id = ex.mapped_tv2_hhid
      where
         event_class='impression'
         and datadatetime between timestamp '{datefrom}' - interval '30' day
         and timestamp '{dateto}' + interval '1' day + interval '6' day
         and in_scope
   ),
   hh_impressed_in_campaign as (
      select
         distinct crosswalk_link_id as mapped_tv2_hhid
      from campaign_universe ex
      join client_eventlog ev on ev.crosswalk_link_id = ex.mapped_tv2_hhid
      where
         event_class='impression'
         and event = 'linear'
         and datadatetime between timestamp '{datefrom}'
         and timestamp '{dateto}' + interval '1' day
         and in_scope
   ),
   hh_control as (
      select
         distinct cu.mapped_tv2_hhid
      from campaign_universe cu
      left join hh_impressed_30days hh on hh.mapped_tv2_hhid  = cu.mapped_tv2_hhid
      where hh.mapped_tv2_hhid is null
   ),
   n_hh_impressed_visited as (
      select
         count(distinct mapped_tv2_hhid) as exp_visited
      from client_eventlog
      join hh_impressed_in_campaign on crosswalk_link_id = mapped_tv2_hhid
      where
         event_class ='response' and event= 'all response'
         and datadatetime between timestamp '{datefrom}'
         and timestamp '{dateto}' + interval '6' day + interval '1' day
         and in_scope in (TRUE, null)
   ),
   n_hh_control_visited as (
      select
         count(distinct mapped_tv2_hhid) as ctrl_visited
      from client_eventlog
      join hh_control on crosswalk_link_id = mapped_tv2_hhid
      where
         event_class ='response' and event= 'all response'
         and datadatetime between timestamp '{datefrom}'
         and timestamp '{dateto}' + interval '6' day + interval '1' day
         and in_scope in (TRUE, null)
   ),
   n_hh_impressed_in_campaign as (
      select
         CAST(count(distinct mapped_tv2_hhid) AS double) as exp_hh
      from hh_impressed_in_campaign
   ),
   n_hh_control as (
      select
         CAST(count(distinct mapped_tv2_hhid) AS double) as ctrl_hh
      from hh_control
   ),
   final_results as (
      select *
      from n_hh_impressed_visited
      cross join n_hh_control_visited
      cross join n_hh_impressed_in_campaign
      cross join n_hh_control
   )
'''
   if subquery_name == 'final_results':
      # Default output: visit rates and relative uplift of exposed over control.
      final_select = '''   select
      ctrl_hh,
      ctrl_visited,
      exp_hh,
      exp_visited,
      ctrl_visited/ctrl_hh as ctrl_vr,
      exp_visited/exp_hh as exp_vr,
      (exp_visited/exp_hh - ctrl_visited/ctrl_hh)/(ctrl_visited/ctrl_hh) as uplift
   from final_results
   '''
   else:
      # Debug output: dump the requested intermediate CTE, like the sibling builders do.
      final_select = '''   select *
   from {result}
   '''

   return (shared_ctes + final_select).format(
      vendor_name='inscape',
      yy=yy_value, mm=mm_value, dd=dd_value,
      crosswalk_suffix='produsa',
      dateto=dateto_value,
      datefrom=datefrom_value,
      clientid=clientid_value,
      result=subquery_name,
   )

In [None]:
# --> 34s to run
df_filtered_hh= query_athena(
    request, 
    query(
        subquery_name='final_results',
        yy_value=yy_input, mm_value=mm_input,dd_value=dd_input,
        dateto_value=dateto_input, 
        datefrom_value=datefrom_input,  
        clientid_value=clientid_input,
        )) 

In [None]:
df_filtered_hh

Unnamed: 0,ctrl_hh,ctrl_visited,exp_hh,exp_visited,ctrl_vr,exp_vr,uplift
0,8040334.0,64506,1007152.0,16069,0.0080228010428422,0.0159548906222695,0.988693292662928


### ➡️ Query for control group 

In [None]:
def query_control(subquery_name: str, yy_value:str, mm_value:str, dd_value:str, dateto_value:str, datefrom_value:str, clientid_value:str):
   """Return the Athena SQL for the control-group side of the uplift calculation.

   Control households are campaign-universe households with no in-scope impression
   row in ``client_eventlog``. ``final_results`` has one row with ``ctrl_hh``
   (distinct control households) and ``ctrl_visited`` (those with an 'all response'
   response event). ``subquery_name`` selects which CTE the statement returns.
   Vendor ('inscape') and environment suffix ('produsa') are hard-coded.

   NOTE(review): ``client_eventlog`` starts at ``datefrom``, so the 30-day lookback in
   ``hh_impressed_30days`` cannot see earlier impressions; and
   ``in_scope in (TRUE, null)`` never matches NULL ``in_scope`` rows.
   """
   sql_template = '''
   with filtered_hh as (
   select 
        key_value as mapped_tv2_hhid
   from {crosswalk_suffix}_modeldata.crosswalk
   where 
        yy='{yy}' 
        and mm='{mm}' 
        and dd='{dd}'
        and key_name ='tv2_hhid' 
        and vendor_name = '{vendor_name}'
        and excluded_stamp is null
   ),
   campaign_universe as (
      select 
         complex_ranges.mapped_tv2_hhid 
      FROM {crosswalk_suffix}_modeldata.crosswalk cw
      CROSS JOIN UNNEST(complex_range) AS t (complex_ranges)
      join filtered_hh fh on fh.mapped_tv2_hhid = complex_ranges.mapped_tv2_hhid
      where 
         vendor_name = '{vendor_name}'
         and complex_ranges.first_seen <= timestamp '{dateto}' + interval '1' day
         and complex_ranges.last_seen >= timestamp '{datefrom}' 
         and key_name = 'tv2_hhid'
         and yy='{yy}'
         and mm='{mm}' 
         and dd='{dd}'
   ),
   client_eventlog as (
      select 
      *
      from {clientid}_{crosswalk_suffix}.eventlog
      where 
         datadatetime between timestamp '{datefrom}' 
         and timestamp '{dateto}' + interval '7' day
   ),
   hh_impressed_30days as (
      select
         distinct crosswalk_link_id as mapped_tv2_hhid
      from campaign_universe ex
      join client_eventlog ev on ev.crosswalk_link_id = ex.mapped_tv2_hhid
      where
         event_class='impression'
         and datadatetime between timestamp '{datefrom}' - interval '30' day    
         and timestamp '{dateto}' + interval '1' day + interval '6' day
         and in_scope
   ),
   hh_control as (
      select
         distinct cu.mapped_tv2_hhid
      from campaign_universe cu
      left join hh_impressed_30days hh on hh.mapped_tv2_hhid  = cu.mapped_tv2_hhid
      where hh.mapped_tv2_hhid is null
   ),
   n_hh_control_visited as (
      select distinct 
         mapped_tv2_hhid
      from client_eventlog
      join hh_control on crosswalk_link_id = mapped_tv2_hhid
      where
         event_class ='response' and event= 'all response'
         and datadatetime between timestamp '{datefrom}' 
         and timestamp '{dateto}' + interval '6' day + interval '1' day
         and in_scope in (TRUE, null)
   ),
   hh_control_visited as (
      select
         ctrl.mapped_tv2_hhid 
         ,(case when visited.mapped_tv2_hhid is not null then 1 else 0 end) as visited 
      from hh_control ctrl
      left join n_hh_control_visited visited on ctrl.mapped_tv2_hhid = visited.mapped_tv2_hhid
   ),
   n_visited_control as (
      select 
         SUM(visited) as ctrl_visited
      from hh_control_visited
   ),
   n_hh_control as (
      select 
         CAST(count(distinct mapped_tv2_hhid) AS double) as ctrl_hh
      from hh_control_visited
   ),
   final_results as (
      select *
      from n_hh_control
      cross join n_visited_control
   )
   select
      *
   from {result}

'''
   substitutions = {
      'vendor_name': 'inscape',
      'crosswalk_suffix': 'produsa',
      'yy': yy_value,
      'mm': mm_value,
      'dd': dd_value,
      'datefrom': datefrom_value,
      'dateto': dateto_value,
      'clientid': clientid_value,
      'result': subquery_name,
   }
   return sql_template.format(**substitutions)

In [None]:
#--> 33s to run
df_control_results= query_athena(
    request, 
    query_control(
        subquery_name='final_results',
        yy_value=yy_input, mm_value=mm_input,dd_value=dd_input,
        dateto_value=dateto_input, 
        datefrom_value=datefrom_input,  
        clientid_value=clientid_input,
        )) 

In [None]:
df_control_results

Unnamed: 0,ctrl_hh,ctrl_visited
0,8040334.0,64506


In [None]:
#-->34m  to run, modify query so that query_control brings back hh_control
# df_control= query_athena(request, query_control(subquery_name='hh_control_visited')) 

In [None]:
# df_control.visited.sum() -- why does this take so long??? doesn't make sense!! for some reason it looks as if the visited column created messes up the df

In [None]:
def query_control_perday(subquery_name: str, yy_value:str, mm_value:str, dd_value:str, dateto_value:str, datefrom_value:str, clientid_value:str):
   """Return the Athena SQL that counts response events for the control group.

   Same control-group construction as ``query_control``, but
   ``n_hh_control_visited_per_day`` keeps one row per matching response event
   (household + truncated day, no DISTINCT / GROUP BY). ``num_visits`` in
   ``final_results`` is therefore a count of visit events, not of households or
   household-days. ``subquery_name`` selects which CTE the statement returns.
   Vendor ('inscape') and environment suffix ('produsa') are hard-coded.

   NOTE(review): a later cell in this notebook defines ``query_control_perday``
   again; after that cell runs, this version is shadowed.
   """
   sql_template = '''
   with filtered_hh as (
      select 
         key_value as mapped_tv2_hhid
      from {crosswalk_suffix}_modeldata.crosswalk
      where 
         yy='{yy}' 
         and mm='{mm}' 
         and dd='{dd}'
         and key_name ='tv2_hhid' 
         and vendor_name = '{vendor_name}'
         and excluded_stamp is null
      ),
      campaign_universe as (
         select 
            complex_ranges.mapped_tv2_hhid 
         FROM {crosswalk_suffix}_modeldata.crosswalk cw
         CROSS JOIN UNNEST(complex_range) AS t (complex_ranges)
         join filtered_hh fh on fh.mapped_tv2_hhid = complex_ranges.mapped_tv2_hhid
         where 
            vendor_name = '{vendor_name}'
            and complex_ranges.first_seen <= timestamp '{dateto}' + interval '1' day
            and complex_ranges.last_seen >= timestamp '{datefrom}' 
            and key_name = 'tv2_hhid'
            and yy='{yy}'
            and mm='{mm}' 
            and dd='{dd}'
      ),
      client_eventlog as (
         select 
         *
         from {clientid}_{crosswalk_suffix}.eventlog
         where 
            datadatetime between timestamp '{datefrom}' 
            and timestamp '{dateto}' + interval '7' day
      ),
      hh_impressed_30days as (
         select
            distinct crosswalk_link_id as mapped_tv2_hhid
         from campaign_universe ex
         join client_eventlog ev on ev.crosswalk_link_id = ex.mapped_tv2_hhid
         where
            event_class='impression'
            and datadatetime between timestamp '{datefrom}' - interval '30' day    
            and timestamp '{dateto}' + interval '1' day + interval '6' day
            and in_scope
      ),
      hh_control as (
         select
            distinct cu.mapped_tv2_hhid
         from campaign_universe cu
         left join hh_impressed_30days hh on hh.mapped_tv2_hhid  = cu.mapped_tv2_hhid
         where hh.mapped_tv2_hhid is null
      ),
      n_hh_control_visited_per_day as (
         select
            mapped_tv2_hhid, 
            date_trunc('day', datadatetime) as day
         from client_eventlog
         join hh_control on crosswalk_link_id = mapped_tv2_hhid
         where
            event_class ='response' and event= 'all response'
            and datadatetime between timestamp '{datefrom}' 
            and timestamp '{dateto}' + interval '6' day + interval '1' day
            and in_scope in (TRUE, null)
      ),
      hh_control_visited as (
         select 
            ctrl.mapped_tv2_hhid,
            (case when visited.mapped_tv2_hhid is not null then 1 else 0 end) as visits
         from hh_control ctrl
         left join n_hh_control_visited_per_day visited on ctrl.mapped_tv2_hhid = visited.mapped_tv2_hhid
      ),
      n_visited_control as (
         select 
            SUM(visits) as num_visits
         from hh_control_visited
      ),
      n_hh_control as (
         select 
            CAST(count(distinct mapped_tv2_hhid) AS double) as ctrl_hh
         from hh_control_visited
      ),
      final_results as (
         select *
         from n_hh_control
         cross join n_visited_control
      )
      select
         *
      from {result}


'''
   substitutions = {
      'vendor_name': 'inscape',
      'crosswalk_suffix': 'produsa',
      'yy': yy_value,
      'mm': mm_value,
      'dd': dd_value,
      'datefrom': datefrom_value,
      'dateto': dateto_value,
      'clientid': clientid_value,
      'result': subquery_name,
   }
   return sql_template.format(**substitutions)

In [None]:
df_control_pday_results= query_athena(
    request, 
    query_control_perday(
        subquery_name='final_results',
        yy_value=yy_input, mm_value=mm_input,dd_value=dd_input,
        dateto_value=dateto_input, 
        datefrom_value=datefrom_input,  
        clientid_value=clientid_input,
        )) 

In [None]:
df_control_pday_results

Unnamed: 0,ctrl_hh,num_visits
0,8040334.0,175337


In [None]:
# Notes on what I wanted to achieve:
# n_hh_control_visited_per_day as (
#          select  
#             mapped_tv2_hhid
#             , date_trunc('day', datadatetime) as day
#             , count(*) as num_visits
#          from client_eventlog
#          join hh_control on crosswalk_link_id = mapped_tv2_hhid
#          where
#             event_class ='response' and event= 'all response'
#             and datadatetime between timestamp '{datefrom}' 
#             and timestamp '{dateto}' + interval '6' day + interval '1' day
#             and in_scope in (TRUE, null)
#          group by mapped_tv2_hhid, date_trunc('day', datadatetime)
#       ),
#       hh_control_visited as (
#          select 
#             ctrl.mapped_tv2_hhid
#             , num_visits
#             , (case when visited.mapped_tv2_hhid is not null then 1 else 0 end) as visited 
#          from hh_control ctrl
#          left join n_hh_control_visited_per_day visited on ctrl.mapped_tv2_hhid = visited.mapped_tv2_hhid
#       ),
#       n_visited_control as (
#          select 
#             SUM(num_visits) as num_visits,
#             SUM(visited) as ctrl_visited 
#          from hh_control_visited
#       ),

### ➡️ Query for exposed group

In [None]:
def query_exposed(subquery_name: str, yy_value:str, mm_value:str, dd_value:str, dateto_value:str, datefrom_value:str, clientid_value:str):
   """Return the Athena SQL for the exposed-group side of the uplift calculation.

   Exposed households are campaign-universe households with an in-scope 'linear'
   impression between ``datefrom`` and ``dateto`` + 1 day. ``final_results`` has one
   row with ``exp_hh`` (distinct exposed households) and ``exp_visited`` (those with
   an 'all response' response event between ``datefrom`` and ``dateto`` + 7 days).
   ``subquery_name`` selects which CTE the statement returns. Vendor ('inscape')
   and environment suffix ('produsa') are hard-coded.

   NOTE(review): ``in_scope in (TRUE, null)`` never matches NULL ``in_scope`` rows.
   """
   sql_template = '''
   with filtered_hh as (
   select 
        key_value as mapped_tv2_hhid
   from {crosswalk_suffix}_modeldata.crosswalk
   where 
        yy='{yy}' 
        and mm='{mm}' 
        and dd='{dd}'
        and key_name ='tv2_hhid' 
        and vendor_name = '{vendor_name}'
        and excluded_stamp is null
   ),
   campaign_universe as (
      select 
         complex_ranges.mapped_tv2_hhid 
      FROM {crosswalk_suffix}_modeldata.crosswalk cw
      CROSS JOIN UNNEST(complex_range) AS t (complex_ranges)
      join filtered_hh fh on fh.mapped_tv2_hhid = complex_ranges.mapped_tv2_hhid
      where 
         vendor_name = '{vendor_name}'
         and complex_ranges.first_seen <= timestamp '{dateto}' + interval '1' day
         and complex_ranges.last_seen >= timestamp '{datefrom}' 
         and key_name = 'tv2_hhid'
         and yy='{yy}'
         and mm='{mm}' 
         and dd='{dd}'
   ),
   client_eventlog as (
      select 
      *
      from {clientid}_{crosswalk_suffix}.eventlog
      where 
         datadatetime between timestamp '{datefrom}' 
         and timestamp '{dateto}' + interval '7' day
   ),
   hh_impressed_in_campaign as (
      select
         distinct crosswalk_link_id as mapped_tv2_hhid
      from campaign_universe ex
      join client_eventlog ev on ev.crosswalk_link_id = ex.mapped_tv2_hhid
      where
         event_class='impression'
         and event = 'linear'
         and datadatetime between timestamp '{datefrom}' 
         and timestamp '{dateto}' + interval '1' day
         and in_scope
   ),
   n_hh_impressed_visited as (
      select distinct 
         mapped_tv2_hhid
      from client_eventlog
      join hh_impressed_in_campaign on crosswalk_link_id = mapped_tv2_hhid
      where
         event_class ='response' and event= 'all response'
         and datadatetime between timestamp '{datefrom}' 
         and timestamp '{dateto}' + interval '6' day + interval '1' day
         and in_scope in (TRUE, null)
   ),
   hh_impressed_visited as (
      select
         impressed.mapped_tv2_hhid 
         ,(case when visited.mapped_tv2_hhid is not null then 1 else 0 end) as visited 
      from hh_impressed_in_campaign impressed
      left join n_hh_impressed_visited visited on impressed.mapped_tv2_hhid = visited.mapped_tv2_hhid
   ),
   n_visited_impressed as (
      select SUM(visited) as exp_visited
      from hh_impressed_visited
   ),
   n_hh_impressed as (
      select CAST(count(distinct mapped_tv2_hhid) AS double) as exp_hh
      from hh_impressed_visited
   ),
   final_results as (
      select *
      from n_hh_impressed
      cross join n_visited_impressed
   )
   select
      *
   from {result}

'''
   substitutions = {
      'vendor_name': 'inscape',
      'crosswalk_suffix': 'produsa',
      'yy': yy_value,
      'mm': mm_value,
      'dd': dd_value,
      'datefrom': datefrom_value,
      'dateto': dateto_value,
      'clientid': clientid_value,
      'result': subquery_name,
   }
   return sql_template.format(**substitutions)

In [None]:
#--> 33s to run
df_exposed_results= query_athena(
    request, 
    query_exposed(
        subquery_name='final_results',
        yy_value=yy_input, mm_value=mm_input,dd_value=dd_input,
        dateto_value=dateto_input, 
        datefrom_value=datefrom_input,  
        clientid_value=clientid_input,
    )) 

In [None]:
df_exposed_results

Unnamed: 0,exp_hh,exp_visited
0,1007152.0,16069


In [None]:
#--> m modify query so that query_exposed_march brings back hh_impressed_in_campaign
# df_exposed= query_athena(request, query_exposed(subquery_name='hh_impressed_visited')) 

In [None]:
# df_exposed.visited.sum()

In [None]:
def query_exposed_perday(subquery_name: str, yy_value:str, mm_value:str, dd_value:str, dateto_value:str, datefrom_value:str, clientid_value:str):
   """Return the Athena SQL that counts response events for the exposed group.

   Same exposed-group construction as ``query_exposed``, but
   ``n_hh_impressed_visited_per_day`` keeps one row per matching response event
   (household + truncated day, no DISTINCT / GROUP BY). ``num_visits`` in
   ``final_results`` is therefore a count of visit events, not of households.
   ``subquery_name`` selects which CTE the statement returns. Vendor ('inscape')
   and environment suffix ('produsa') are hard-coded.

   NOTE(review): ``in_scope in (TRUE, null)`` never matches NULL ``in_scope`` rows.
   """
   sql_template = '''
   with filtered_hh as (
   select 
        key_value as mapped_tv2_hhid
   from {crosswalk_suffix}_modeldata.crosswalk
   where 
        yy='{yy}' 
        and mm='{mm}' 
        and dd='{dd}'
        and key_name ='tv2_hhid' 
        and vendor_name = '{vendor_name}'
        and excluded_stamp is null
   ),
   campaign_universe as (
      select 
         complex_ranges.mapped_tv2_hhid 
      FROM {crosswalk_suffix}_modeldata.crosswalk cw
      CROSS JOIN UNNEST(complex_range) AS t (complex_ranges)
      join filtered_hh fh on fh.mapped_tv2_hhid = complex_ranges.mapped_tv2_hhid
      where 
         vendor_name = '{vendor_name}'
         and complex_ranges.first_seen <= timestamp '{dateto}' + interval '1' day
         and complex_ranges.last_seen >= timestamp '{datefrom}' 
         and key_name = 'tv2_hhid'
         and yy='{yy}'
         and mm='{mm}' 
         and dd='{dd}'
   ),
   client_eventlog as (
      select 
      *
      from {clientid}_{crosswalk_suffix}.eventlog
      where 
         datadatetime between timestamp '{datefrom}' 
         and timestamp '{dateto}' + interval '7' day
   ),
   hh_impressed_in_campaign as (
      select
         distinct crosswalk_link_id as mapped_tv2_hhid
      from campaign_universe ex
      join client_eventlog ev on ev.crosswalk_link_id = ex.mapped_tv2_hhid
      where
         event_class='impression'
         and event = 'linear'
         and datadatetime between timestamp '{datefrom}' 
         and timestamp '{dateto}' + interval '1' day
         and in_scope
   ),
   n_hh_impressed_visited_per_day as (
      select 
         mapped_tv2_hhid,
         date_trunc('day', datadatetime) as day
      from client_eventlog
      join hh_impressed_in_campaign on crosswalk_link_id = mapped_tv2_hhid
      where
         event_class ='response' and event= 'all response'
         and datadatetime between timestamp '{datefrom}' 
         and timestamp '{dateto}' + interval '6' day + interval '1' day
         and in_scope in (TRUE, null)
   ),
   hh_impressed_visited as (
      select
         impressed.mapped_tv2_hhid 
         ,(case when visited.mapped_tv2_hhid is not null then 1 else 0 end) as visits 
      from hh_impressed_in_campaign impressed
      left join n_hh_impressed_visited_per_day visited on impressed.mapped_tv2_hhid = visited.mapped_tv2_hhid
   ),
   n_visited_impressed as (
      select SUM(visits) as num_visits
      from hh_impressed_visited
   ),
   n_hh_impressed as (
      select CAST(count(distinct mapped_tv2_hhid) AS double) as exp_hh
      from hh_impressed_visited
   ),
   final_results as (
      select *
      from n_hh_impressed
      cross join n_visited_impressed
   )
   select
      *
   from {result}

'''
   substitutions = {
      'vendor_name': 'inscape',
      'crosswalk_suffix': 'produsa',
      'yy': yy_value,
      'mm': mm_value,
      'dd': dd_value,
      'datefrom': datefrom_value,
      'dateto': dateto_value,
      'clientid': clientid_value,
      'result': subquery_name,
   }
   return sql_template.format(**substitutions)

In [None]:
df_exposed_pday_results= query_athena(
    request, 
    query_exposed_perday(
        subquery_name='final_results',
        yy_value=yy_input, mm_value=mm_input,dd_value=dd_input,
        dateto_value=dateto_input, 
        datefrom_value=datefrom_input,  
        clientid_value=clientid_input,
        )) 

In [None]:
df_exposed_pday_results

Unnamed: 0,exp_hh,num_visits
0,1007152.0,40529


### ➡️ Query for control group per day

#### ➜ Version using *'event'* var - but hhids that have done nothing but stay active and valid will have no event on the event log

In [None]:
# Modification for bringing back the control group by date.
# I thought this might bring back the control group per day, but it returns 252,331 rows: the ~175k visits plus other events that are not recorded as visits.
# visited = 1 means a visit and visited = 0 means any other event.
# Having looked at another way of getting the control group per day, this does in a way return it, but only for the households that have done
# something in the event log; some households have done nothing and are simply present, so I am not sure what to do about those.
def query_control_perday(subquery_name: str, yy_value:str, mm_value:str, dd_value:str, dateto_value:str, datefrom_value:str, clientid_value:str):
   """Return the Athena SQL listing control-household events with a ``visited`` flag.

   ``n_hh_control_visited_per_day`` emits one row per in-window event-log row of a
   control household: the household id, the event day, and ``visited`` = 1 when the
   row is an 'all response' response event, else 0. Households with no event-log
   row in the window do not appear. There is no ``final_results`` CTE in this
   statement, so ``subquery_name`` must name one of the CTEs defined here.
   Vendor ('inscape') and environment suffix ('produsa') are hard-coded.

   NOTE(review): this redefines ``query_control_perday`` from an earlier cell and
   shadows it.
   """
   sql_template = '''
   with filtered_hh as (
      select 
         key_value as mapped_tv2_hhid
      from {crosswalk_suffix}_modeldata.crosswalk
      where 
         yy='{yy}' 
         and mm='{mm}' 
         and dd='{dd}'
         and key_name ='tv2_hhid' 
         and vendor_name = '{vendor_name}'
         and excluded_stamp is null
      ),
      campaign_universe as (
         select 
            complex_ranges.mapped_tv2_hhid 
         FROM {crosswalk_suffix}_modeldata.crosswalk cw
         CROSS JOIN UNNEST(complex_range) AS t (complex_ranges)
         join filtered_hh fh on fh.mapped_tv2_hhid = complex_ranges.mapped_tv2_hhid
         where 
            vendor_name = '{vendor_name}'
            and complex_ranges.first_seen <= timestamp '{dateto}' + interval '1' day
            and complex_ranges.last_seen >= timestamp '{datefrom}' 
            and key_name = 'tv2_hhid'
            and yy='{yy}'
            and mm='{mm}' 
            and dd='{dd}'
      ),
      client_eventlog as (
         select 
            *
         from {clientid}_{crosswalk_suffix}.eventlog
         where 
            datadatetime between timestamp '{datefrom}' 
            and timestamp '{dateto}' + interval '7' day
      ),
      hh_impressed_30days as (
         select
            distinct crosswalk_link_id as mapped_tv2_hhid
         from campaign_universe ex
         join client_eventlog ev on ev.crosswalk_link_id = ex.mapped_tv2_hhid
         where
            event_class='impression'
            and datadatetime between timestamp '{datefrom}' - interval '30' day    
            and timestamp '{dateto}' + interval '1' day + interval '6' day
            and in_scope
      ),
      hh_control as (
         select
            distinct cu.mapped_tv2_hhid
         from campaign_universe cu
         left join hh_impressed_30days hh on hh.mapped_tv2_hhid  = cu.mapped_tv2_hhid
         where hh.mapped_tv2_hhid is null
      ),
      n_hh_control_visited_per_day as (
         select
            mapped_tv2_hhid, 
            date_trunc('day', datadatetime) as day,
            CASE WHEN (event_class = 'response' AND event = 'all response') THEN 1 ELSE 0 END AS visited
         from client_eventlog
         join hh_control on crosswalk_link_id = mapped_tv2_hhid
         where
            datadatetime between timestamp '{datefrom}' 
            and timestamp '{dateto}' + interval '6' day + interval '1' day
            and in_scope in (TRUE, null)
      )
      select * 
      from {result}

'''
   substitutions = {
      'vendor_name': 'inscape',
      'crosswalk_suffix': 'produsa',
      'yy': yy_value,
      'mm': mm_value,
      'dd': dd_value,
      'datefrom': datefrom_value,
      'dateto': dateto_value,
      'clientid': clientid_value,
      'result': subquery_name,
   }
   return sql_template.format(**substitutions)

In [None]:
df_control_visited_pday= query_athena(
    request, 
    query_control_perday(
        subquery_name='n_hh_control_visited_per_day',
        yy_value=yy_input, mm_value=mm_input,dd_value=dd_input,
        dateto_value=dateto_input, 
        datefrom_value=datefrom_input,  
        clientid_value=clientid_input,
        )).astype({"visited": "int"})

df_control_visited_pday.day = pd.to_datetime(df_control_visited_pday.day)

In [None]:
df_control_visited_pday.shape

(252331, 3)

In [None]:
df_control_visited_pday.mapped_tv2_hhid.nunique()

64506

In [None]:
control_n_total = df_control_visited_pday.groupby(['day']).agg(n_total=("mapped_tv2_hhid", "count"))
control_n_total.head()

Unnamed: 0_level_0,n_total
day,Unnamed: 1_level_1
2022-03-01,9164
2022-03-02,7092
2022-03-03,6622
2022-03-04,8205
2022-03-05,8448


In [None]:
control_n_visited = df_control_visited_pday.groupby(
    ['mapped_tv2_hhid', 'day']
    ).agg(n_visited=("visited", "sum")).reset_index(0, drop=False)

control_n_visited.head()

In [None]:
control_n_visited.merge(control_n_total, right_index=True, left_index=True).head(10)

#### ➜ Michael's function

In [None]:
from tvsquared.lib.athena import AthenaDatabase
import pandas as pd

def get_control_group(
    request, datefrom_value:str, dateto_value:str, yy_value:str, mm_value:str, dd_value:str,clientid_value:str,
    granularity:str, lookback_window:int, filter_linear=True,):
    """
    Slow version to get us off the ground. Split a date range into periods and run
    one Athena query per period to get the eligible control households and how many
    of them visited.

    Each period starts on a date produced by ``pd.date_range(datefrom_value,
    dateto_value, freq=granularity)`` and ends the day before the next period
    starts; the last period ends on ``dateto_value``. (Earlier revisions ignored the
    loop date and the ``*_value`` arguments, so every row was the same full-range
    result.)

    Parameters
    ----------
    request : Request
        request object; used to resolve the Athena database and for logging
    datefrom_value : str
        start date of the overall range, 'YYYY-MM-DD'
    dateto_value : str
        end date of the overall range, 'YYYY-MM-DD' (inclusive)
    yy_value, mm_value, dd_value : str
        crosswalk partition (year / month / day) to read
    clientid_value : str
        client schema prefix, e.g. 'c9306_drizly'
    granularity : str
        pandas frequency for the period starts, e.g. 'D' or 'W-MON'. Must be daily
        or coarser because only the date part is sent to Athena.
    lookback_window : int
        number of days before the period start in which an impression disqualifies
        a household from the control group
    filter_linear : bool
        True (default): any in-scope impression disqualifies a household.
        False: only impressions with ``event = 'vod'`` disqualify it.

    Returns
    -------
    DataFrame
        one row per period with columns date (period start), ctrl_hh,
        ctrl_visited and ctrl_vr; empty when the range yields no period.

    Notes
    -----
    Vendor ('inscape') and environment suffix ('produsa') are hard-coded.
    NOTE(review): ``in_scope in (TRUE, null)`` never matches rows whose ``in_scope``
    is NULL, so it behaves like ``in_scope = TRUE`` -- confirm intent.
    """
    # get db from request object
    athena_db = AthenaDatabase.get_client_database(request)

    # period starts come from the function arguments, not from notebook globals
    range_end = pd.Timestamp(dateto_value)
    period_starts = pd.date_range(start=datefrom_value, end=dateto_value, freq=granularity).to_list()

    # with filter_linear=True no event filter is applied to the impressions
    overlap = ''
    if not filter_linear:
        overlap = "and event = 'vod'"

    # client_eventlog reaches back `lookback_window` days so the exclusion window in
    # hh_impressed_30days can actually see impressions before the period start.
    query_template = """
        -- select universe of unfiltered people for time range


        with filtered_hh as (
        select
            key_value as mapped_tv2_hhid
        from {crosswalk_suffix}_modeldata.crosswalk
        where
            yy='{yy}'
            and mm='{mm}'
            and dd='{dd}'
            and key_name ='tv2_hhid'
            and vendor_name = '{vendor_name}'
            and excluded_stamp is null
        ),
        campaign_universe as (
            select
                complex_ranges.mapped_tv2_hhid
            FROM {crosswalk_suffix}_modeldata.crosswalk cw
            CROSS JOIN UNNEST(complex_range) AS t (complex_ranges)
            join filtered_hh fh on fh.mapped_tv2_hhid = complex_ranges.mapped_tv2_hhid
            where
                vendor_name = '{vendor_name}'
                and complex_ranges.first_seen <= timestamp '{dateto}' + interval '1' day
                and complex_ranges.last_seen >= timestamp '{datefrom}'
                and key_name = 'tv2_hhid'
                and yy='{yy}'
                and mm='{mm}'
                and dd='{dd}'
        ),
        client_eventlog as (
            select
            *
            from {clientid}_{crosswalk_suffix}.eventlog
            where
                datadatetime between timestamp '{datefrom}' - interval '{lookback_window}' day
                and timestamp '{dateto}' + interval '7' day
        ),
        hh_impressed_30days as (
            select
                distinct crosswalk_link_id as mapped_tv2_hhid
            from campaign_universe ex
            join client_eventlog ev on ev.crosswalk_link_id = ex.mapped_tv2_hhid
            where
                event_class='impression'
                {overlap}
                and datadatetime between timestamp '{datefrom}' - interval '{lookback_window}' day
                and timestamp '{dateto}' + interval '7' day
                and in_scope
        ),
        hh_control as (
            select
                distinct cu.mapped_tv2_hhid
            from campaign_universe cu
            left join hh_impressed_30days hh on hh.mapped_tv2_hhid  = cu.mapped_tv2_hhid
            where hh.mapped_tv2_hhid is null
        ),
        n_hh_control_visited as (
            select
                count(distinct mapped_tv2_hhid) as ctrl_visited
            from client_eventlog
            join hh_control on crosswalk_link_id = mapped_tv2_hhid
            where
                event_class ='response' and event= 'all response'
                and datadatetime between timestamp '{datefrom}'
                and timestamp '{dateto}' + interval '7' day
                and in_scope in (TRUE, null)
        ),
        n_hh_control as (
            select CAST(count(distinct mapped_tv2_hhid) AS double) as ctrl_hh
            from hh_control
        ),
        final_results as (
            select *
            from n_hh_control_visited
            cross join n_hh_control
        )
        select
            '{datefrom}' as date,
            ctrl_hh,
            ctrl_visited,
            ctrl_visited/ctrl_hh as ctrl_vr
        from final_results

        """

    # one single-row frame per period; concatenated once at the end
    frames = []
    one_day = pd.Timedelta(days=1)

    for idx, period_start in enumerate(period_starts):

        request.log.info('Getting Data for {}...'.format(period_start))

        if idx == len(period_starts) - 1:
            period_end = range_end
        else:
            # never let a period end before it starts
            period_end = max(period_start, period_starts[idx + 1] - one_day)

        query = query_template.format(
            yy=yy_value, mm=mm_value, dd=dd_value,
            dateto=period_end.strftime('%Y-%m-%d'),
            datefrom=period_start.strftime('%Y-%m-%d'),
            clientid=clientid_value,
            lookback_window=lookback_window,
            overlap=overlap,
            vendor_name='inscape',
            crosswalk_suffix='produsa',
            )

        query_results = AthenaDatabase.execute_query(athena_db, query)

        frames.append(pd.DataFrame(query_results))

        request.log.info(query)

    if not frames:
        # empty date range: return an empty frame instead of failing on an unbound name
        return pd.DataFrame(columns=['date', 'ctrl_hh', 'ctrl_visited', 'ctrl_vr'])

    return pd.concat(frames, ignore_index=True)

In [None]:

yy_input='2022' 
mm_input='03'
dd_input='28'
dateto_input='2022-03-04' 
datefrom_input='2022-03-01'  
clientid_input='c9306_drizly'


df_control_test= get_control_group(
    request, 
    yy_value=yy_input, mm_value=mm_input,dd_value=dd_input,
    granularity='D',
    lookback_window=3,
    dateto_value=dateto_input, 
    datefrom_value=datefrom_input,  
    clientid_value=clientid_input,
    )


In [None]:
df_control_test #why is this bringing the same date??? 

Unnamed: 0,date,ctrl_hh,ctrl_visited,ctrl_vr
0,2022-03-01,8535380.0,23349,0.0027355548317708
1,2022-03-01,8535380.0,23349,0.0027355548317708
2,2022-03-01,8535380.0,23349,0.0027355548317708
3,2022-03-01,8535380.0,23349,0.0027355548317708


In [None]:
# Turn this into weekly!
datelist = pd.date_range(start=datefrom_input, end=dateto_input, freq='D').to_list() 
datelist

[Timestamp('2022-03-01 00:00:00', freq='D'),
 Timestamp('2022-03-02 00:00:00', freq='D'),
 Timestamp('2022-03-03 00:00:00', freq='D'),
 Timestamp('2022-03-04 00:00:00', freq='D')]