In [None]:
# Install the packages listed in requirements.txt (quiet mode) via the %pip magic.
%pip install -q -r requirements.txt

In [None]:
# Standard library
import gc
import importlib
import json
import os
import uu  # NOTE(review): not used in the visible cells; `uu` was removed from the stdlib in Python 3.13
import uuid
from concurrent.futures import ProcessPoolExecutor
from functools import partial

# Third-party
import cudf
import dask
import dask.dataframe as dd
import numpy as np
import pandas as pd
import pandas_gbq
import yaml
from dask.dataframe.utils import assert_eq

# Project-local
import functions.core_functions as core_functions
import functions.pyarrow_functions as pyarrow_functions

# Select cuDF as the dask dataframe backend.
dask.config.set({"dataframe.backend": "cudf"})

# Re-import the project helper modules so edits made on disk are picked up
# without restarting the kernel.
for _helper_module in (core_functions, pyarrow_functions):
    importlib.reload(_helper_module)

In [None]:
# Load the table schemas. The context manager guarantees the file handle is
# closed (the bare open() call previously leaked it).
with open('table-schemas.yaml') as schema_file:
    table_schema = yaml.safe_load(schema_file)

# List of {'name': ..., 'type': ...} entries describing the detections table.
detections_python = table_schema['detections_python_schema']

# Column name -> type for every column, and for every column whose name does
# not contain 'geo_'.
n90_schema_dict = {item['name']: item['type'] for item in detections_python}
veil_schema_dict = {item['name']: item['type'] for item in detections_python if 'geo_' not in item['name']}

# Column names in schema order.
final_detections_cols = [item['name'] for item in detections_python]

In [None]:
# Initialise one client bundle per service account.
resp = core_functions.initialize_clients(service_account_secret_name='SA_ADHOC_BILLING')
resp2 = core_functions.initialize_clients(service_account_secret_name='SA_N90_CORE_APPS')

# Everything except the N90 BigQuery client comes from the ad-hoc billing bundle.
adhoc_clients = resp.get('clients')
config = resp.get('config')

bigquery_client = adhoc_clients.get('bigquery_client')
n90_bigquery_client = resp2.get('clients').get('bigquery_client')
storage_client = adhoc_clients.get('storage_client')
sf_client = adhoc_clients.get('sf_client')

# Veil billing configuration and the variables nested under it.
veil_billing = config.get('veil_billing')
veil_vars = veil_billing.get('vars')
sfdc_adv_account_cols = veil_vars.get('sfdc_adv_account_cols')
sfdc_rate_card_cols = veil_vars.get('sfdc_rate_card_cols')
unknown_dma_overrides = config.get('national_dma_overrides_to_us_national')

In [None]:
# os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = config.get('SA_ADHOC_BILLING')
# broadcast_cal_sql = f"""
#     SELECT id as bcw_id, bcw_index, bcm_index, bcw_start_date, bcw_end_date FROM `adhoc-billing.avs_billing_process.lu_broadcast_week`
# """
# broadcast_cal_df = core_functions.fetch_gbq_data(query=broadcast_cal_sql, bigquery_client=bigquery_client)
# os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = config.get('SA_N90_CORE_APPS')

In [None]:
# Point the Google client libraries at the N90 core-apps credentials.
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = config.get('SA_N90_CORE_APPS')

# One template for both lookups; only the geo_type literal differs, and the
# rendered SQL text is identical to the two hand-written queries it replaces.
GEO_SQL_TEMPLATE = """
    SELECT *
    from `next90-core-applications.next90_analytics.geos` WHERE geo_type = '{geo_type}'
    AND geo_country in ('United States', 'Canada')
"""

# Zip-level geos for the US and Canada.
geo_zip_sql = GEO_SQL_TEMPLATE.format(geo_type='zip')
geo_zip_raw = core_functions.fetch_gbq_data(query=geo_zip_sql, bigquery_client=n90_bigquery_client)
geo_zip_df = core_functions.fix_df_dtypes(geo_zip_raw)

# DMA-level geos for the US and Canada.
geo_dma_sql = GEO_SQL_TEMPLATE.format(geo_type='dma')
geo_dma_raw = core_functions.fetch_gbq_data(query=geo_dma_sql, bigquery_client=n90_bigquery_client)
geo_dma_df = core_functions.fix_df_dtypes(geo_dma_raw)


# int_cols = ['geo_neustar_id','geo_us_msa_id', 'geo_us_county_fips_id','geo_ca_cma_id']
# for col in int_cols:
#     geo_zip_df[col] = geo_df[col].fillna(-1).astype('Int64')

In [None]:
# Use the N90 core-apps credentials for the calendar lookup.
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = config.get('SA_N90_CORE_APPS')

# Broadcast-week calendar: week id, week/month indexes and the week's date bounds.
broadcast_cal_sql = """
    SELECT id as bcw_id, bcw_index, bcm_index, bcw_start_date, bcw_end_date FROM `next90-core-applications.n90_data_lake.lu_broadcast_week`
"""
broadcast_cal_df = core_functions.fetch_gbq_data(
    query=broadcast_cal_sql,
    bigquery_client=n90_bigquery_client,
)

In [None]:
# Inspect the column dtypes of the DMA geo lookup returned by fix_df_dtypes.
geo_dma_df.dtypes

In [None]:
# Half-open date window [data_start_date, data_end_date) to scan for activity.
data_start_date = "2024-11-01"
data_end_date = "2024-12-01"

# Distinct calendar days inside the window that have at least one activity session.
process_sql = f"""
    SELECT distinct EXTRACT(YEAR FROM created_time) as year, EXTRACT(MONTH FROM created_time) as month, EXTRACT(DAY FROM created_time) as  day
 from `next90-core-applications.n90_data_lake.activity_sessions`
    WHERE created_time >= '{data_start_date}' and created_time < '{data_end_date}'
    order by year, month, day
"""

process_df = core_functions.fetch_gbq_data(
    query=process_sql,
    bigquery_client=n90_bigquery_client,
)
process_df

In [None]:
# One {'year', 'month', 'day'} dict per day found by the discovery query.
process_days = process_df.to_dict(orient='records')

# Echo each day so the processing plan is visible in the cell output.
for record in process_days:
    print(f'year: {record["year"]}, month: {record["month"]}, day: {record["day"]}')

In [None]:
# Peek at the first day record to confirm its shape (raises IndexError if no days were found).
process_days[0]

In [None]:
def _attach_broadcast_week(sessions, calendar):
    """Append to every session the calendar row whose broadcast week contains it.

    A session belongs to the week with the latest ``bcw_start_date`` that is not
    after its ``created_time``, provided ``created_time`` falls before the end of
    the ``bcw_end_date`` day. Sessions with no such week keep their row and get
    missing values in the calendar columns.

    The calendar is copied, so the caller's frame is left untouched. ``sessions``
    must have a unique index.
    """
    # A stale 'key' helper column may be present from earlier cross-join code.
    weeks = calendar.drop(columns=['key'], errors='ignore').copy()
    weeks['bcw_start_date'] = pd.to_datetime(weeks['bcw_start_date'])
    weeks['bcw_end_date'] = pd.to_datetime(weeks['bcw_end_date'])
    weeks = (
        weeks.dropna(subset=['bcw_start_date'])
        .sort_values('bcw_start_date')
        .reset_index(drop=True)
    )
    # Derive the index columns once per calendar row instead of once per session.
    weeks['bc_year_index'] = weeks['bcm_index'].astype(str).str[:4].astype('Int64')
    weeks['bcm_index'] = weeks['bcm_index'].astype('Float64')
    weeks['bcw_index'] = weeks['bcw_index'].astype('Float64')

    # Compare tz-naive values so tz-aware session timestamps can be matched
    # against calendar dates without a TypeError.
    created_ns = (
        pd.to_datetime(sessions['created_time'])
        .dt.tz_localize(None)
        .to_numpy(dtype='datetime64[ns]')
    )
    start_ns = weeks['bcw_start_date'].dt.tz_localize(None).to_numpy(dtype='datetime64[ns]')
    end_ns = (
        weeks['bcw_end_date'].dt.tz_localize(None).dt.normalize() + pd.Timedelta(days=1)
    ).to_numpy(dtype='datetime64[ns]')

    # Position of the last week starting at or before each session (-1 = none).
    pos = np.searchsorted(start_ns, created_ns, side='right') - 1
    if len(weeks):
        # Comparisons against NaT are False, so null timestamps never match.
        inside = (pos >= 0) & (created_ns < end_ns[np.clip(pos, 0, None)])
    else:
        inside = np.zeros(len(created_ns), dtype=bool)

    # Label -1 is absent from the RangeIndex, so unmatched sessions get an all-missing row.
    matched = weeks.reindex(index=np.where(inside, pos, -1))
    matched.index = sessions.index
    return sessions.join(matched, lsuffix='_x', rsuffix='_y')


def _fill_geos_from_dma(enriched, geo_z_df, geo_d_df):
    """Fill geo columns from the DMA lookup for US/CA rows whose zip code found no geo.

    ``enriched`` is modified in place and returned. Only columns that exist in
    both geo lookups and in ``enriched`` are written, and values are aligned
    row-by-row on the index of ``enriched``.
    """
    needs_dma = enriched['neustar_country'].isin(['us', 'ca']) & enriched['geo_country'].isnull()
    if not needs_dma.any():
        return enriched

    geo_cols = [c for c in geo_d_df.columns if c in geo_z_df.columns and c in enriched.columns]
    lookup = (
        geo_d_df.dropna(subset=['geo_location'])
        .drop_duplicates(subset=['geo_location'])
        .copy()
    )
    lookup['geo_location'] = lookup['geo_location'].astype('string')

    # The DMA id is matched as text against geo_location.
    dma_keys = enriched.loc[needs_dma, 'neustar_dma_id'].astype('string')
    fallback = dma_keys.to_frame('_dma_key').merge(
        lookup, how='left', left_on='_dma_key', right_on='geo_location'
    )
    # A left merge on a de-duplicated lookup keeps one output row per key, in order,
    # so the original row labels can be restored for aligned assignment.
    fallback.index = dma_keys.index
    for col in geo_cols:
        enriched.loc[needs_dma, col] = fallback[col]
    return enriched


def process_activity_sessions_segments(df, _bcc_df, geo_z_df, geo_d_df, media='DIGITAL'):
    """Enrich sessions with broadcast-calendar, geo and reporting-segment columns.

    NOTE(review): a later cell defines a function with this same name, which
    shadows this definition once that cell has run.

    Parameters
    ----------
    df : pandas.DataFrame
        Sessions. Must contain ``id``, ``created_time``, ``zip_code``,
        ``neustar_dma_id``, ``neustar_country``, ``date_time`` and
        ``billing_last_updated``; the three time columns must be datetime-like.
    _bcc_df : pandas.DataFrame
        Broadcast calendar with ``bcw_index``, ``bcm_index``, ``bcw_start_date``
        and ``bcw_end_date``.
    geo_z_df, geo_d_df : pandas.DataFrame
        Zip-level and DMA-level geo lookups keyed by ``geo_location`` and
        carrying ``geo_country``.
    media : str
        Value written to ``segments_media``.

    Returns
    -------
    pandas.DataFrame
        One row per ``id`` (the latest ``created_time`` wins), ordered by
        ``created_time``. Neither ``df`` nor ``_bcc_df`` is modified.

    Raises
    ------
    KeyError
        If ``df`` lacks one of the required columns.

    Fixes relative to the previous body: the merged frame was deleted before
    being used (UnboundLocalError); the caller's frames were mutated; every
    session received an arbitrary calendar week from a cross join; and the DMA
    fallback assigned a re-indexed merge result, misaligning rows and columns.
    """
    required = (
        'id', 'created_time', 'zip_code', 'neustar_dma_id',
        'neustar_country', 'date_time', 'billing_last_updated',
    )
    missing = [col for col in required if col not in df.columns]
    if missing:
        raise KeyError(f'activity sessions frame is missing required columns: {missing}')

    # Work on a copy with a clean RangeIndex so index-aligned joins are safe.
    sessions = df.reset_index(drop=True).copy()
    sessions = _attach_broadcast_week(sessions, _bcc_df)

    created = pd.to_datetime(sessions['created_time'])
    sessions['_YEAR'] = created.dt.year.astype('Int64')
    sessions['_MONTH'] = created.dt.month.astype('Int64')
    sessions['_DAY'] = created.dt.day.astype('Int64')

    # Zip-level geos first, then DMA-level geos for US/CA rows that are still unresolved.
    enriched = sessions.merge(geo_z_df, how='left', left_on='zip_code', right_on='geo_location')
    enriched['neustar_dma_id'] = enriched['neustar_dma_id'].astype('Int64')
    enriched = _fill_geos_from_dma(enriched, geo_z_df, geo_d_df)

    # Keep the most recent row per id, then order chronologically.
    enriched = enriched.sort_values(by=['id', 'created_time'], ascending=[True, False])
    out = (
        enriched.drop_duplicates(subset=['id'], keep='first')
        .sort_values(by='created_time')
        .reset_index(drop=True)
    )

    stamp = pd.to_datetime(out['created_time'])
    naive = stamp.dt.tz_localize(None)  # periods and day names need tz-naive values

    out['segments_date'] = stamp.dt.date.astype('string')
    out['segments_day_of_week'] = naive.dt.day_name().astype('string')
    out['segments_media'] = media

    out['segments_month_label'] = naive.dt.to_period('M').astype('string')
    out['segments_quarter_label'] = naive.dt.to_period('Q').astype('string')
    out['segments_week_label'] = naive.dt.to_period('W').astype('string')

    # First day of the containing month / quarter / week.
    out['segments_month'] = naive.dt.to_period('M').dt.start_time.dt.date.astype('string')
    out['segments_quarter'] = naive.dt.to_period('Q').dt.start_time.dt.date.astype('string')
    out['segments_week'] = naive.dt.to_period('W').dt.start_time.dt.date.astype('string')

    out['segments_year'] = naive.dt.to_period('Y').astype('string')
    out['segments_broadcast_year'] = out['bc_year_index'].astype('Int64')
    out['segments_broadcast_month_index'] = out['bcm_index'].astype('Float64')
    out['segments_broadcast_week_index'] = out['bcw_index'].astype('Float64')

    # Timestamps are emitted as 'YYYY-MM-DD HH:MM:SS' text.
    stamp_format = '%Y-%m-%d %H:%M:%S'
    formatted_date_time = out['date_time'].dt.strftime(stamp_format)
    out['session_timestamp'] = formatted_date_time
    out['date_time'] = formatted_date_time
    out['last_updated'] = out['billing_last_updated'].dt.strftime(stamp_format)
    # A single audit id is shared by every row produced by this call.
    out['last_updated_audit_id'] = uuid.uuid4()
    out['bcw_start_date'] = out['bcw_start_date'].dt.strftime(stamp_format)
    out['bcw_end_date'] = out['bcw_end_date'].dt.strftime(stamp_format)

    return out

In [None]:

def process_activity_sessions_segments(_df, _bcc_df, geo_z_df, geo_d_df, media='DIGITAL'):
    """Debugging variant: run only the broadcast-calendar step, then return `_df`.

    This definition replaces any earlier function of the same name in the
    kernel. The geo enrichment, segment labelling and timestamp formatting
    steps are disabled here; `geo_z_df`, `geo_d_df` and `media` are accepted
    but not used.

    Side effects on the caller's frames:
      * `_df` gains a constant `key` column and an `activity_date_time` column.
      * `_bcc_df` gains a constant `key` column and has `bcw_start_date` /
        `bcw_end_date` converted with `pd.to_datetime`.

    NOTE(review): the calendar join is computed into `merged_df` but the
    function returns `_df`, so that result is discarded. Confirm whether
    `merged_df` was meant to be returned.
    """
    for message in (
        'starting process_activity_sessions_segments',
        'adding broadcast cal details',
        f'length of df: {len(_df)}',
    ):
        print(message)

    # Constant key on both sides so the merge below is a cross join.
    _df['key'] = 1
    _bcc_df['key'] = 1
    _df['activity_date_time'] = pd.to_datetime(_df['created_time'])
    for bound in ('bcw_start_date', 'bcw_end_date'):
        _bcc_df[bound] = pd.to_datetime(_bcc_df[bound])

    # Calendar weeks lying entirely between the earliest and latest session.
    first_seen = _df['created_time'].min()
    last_seen = _df['created_time'].max()
    in_range = (_bcc_df['bcw_start_date'] >= first_seen) & (_bcc_df['bcw_end_date'] <= last_seen)
    ref_df = _bcc_df.loc[in_range]

    merged_df = _df.merge(ref_df, on='key').drop(columns=['key', 'created_time'])
    merged_df = merged_df.assign(
        # First four characters of the month index, as a nullable integer.
        bc_year_index=merged_df['bcm_index'].astype(str).str[:4].astype('Int64'),
        bcm_index=merged_df['bcm_index'].astype('Float64'),
        bcw_index=merged_df['bcw_index'].astype('Float64'),
    )

    # One row per id after the cross join.
    merged_df = merged_df.sort_values(by=['id']).drop_duplicates(subset=['id'], keep='first')

    return _df


# Hard-coded override of the discovered day list: a single record for November 2024.
process_days = [{'year': 2024, 'month': 11, 'day': 1}]
for record in process_days:
    year, month, day = record['year'], record['month'], record['day']
    print(f'Processing year: {year}, month: {month}')

    # Month-level pull from the temp table; `day` is not part of the filter.
    # An earlier per-day query that joined pageviews and brand-to-encoding
    # data was retired in favour of this one.
    activity_sessions_sql = f"""
        SELECT *,
            TIMESTAMP_TRUNC(created_time, SECOND) AS activity_session_timestamp,
            --- CONCAT(EXTRACT(YEAR FROM created_time), '-', FORMAT('%02d', EXTRACT(MONTH FROM created_time))) AS process_month_group
        FROM `next90-core-applications.omniData.temp_activity_sessions`
        WHERE EXTRACT(YEAR FROM created_time) = {year} 
        AND EXTRACT(MONTH FROM created_time) = {month}
    """

    print('loading activity sessions')
    activity_sessions_df = core_functions.fetch_gbq_data(activity_sessions_sql, n90_bigquery_client)
    # Keep the most recent row per id.
    activity_sessions_df = (
        activity_sessions_df
        .sort_values(by=['id', 'created_time'], ascending=[True, False])
        .drop_duplicates(subset=['id'], keep='first')
    )

    print('processing with function process_activity_sessions_segments')
    # Enrichment and audit stamping are currently disabled:
    # activity_sessions_df = process_activity_sessions_segments(activity_sessions_df, broadcast_cal_df, geo_zip_df, geo_dma_df)
    # activity_sessions_df['activity_session_last_updated'] = pd.Timestamp.utcnow()
    # activity_sessions_df['activity_session_last_audit_id'] = core_functions.generate_uuid()


In [None]:
# Author's scratch figures (presumably row counts seen while checking the
# de-duplication; verify): 27331840, 27331844, 27331840
# Keep the most recent row per id, then display the frame.
activity_sessions_df = (
    activity_sessions_df
    .sort_values(by=['id', 'created_time'], ascending=[True, False])
    .drop_duplicates(subset=['id'], keep='first')
)
activity_sessions_df

In [None]:
# Calendar parts of created_time as nullable integers.
created_parts = activity_sessions_df['created_time'].dt
activity_sessions_df['_YEAR'] = created_parts.year.astype('Int64')
activity_sessions_df['_MONTH'] = created_parts.month.astype('Int64')
activity_sessions_df['_DAY'] = created_parts.day.astype('Int64')

In [None]:
# Attach zip-level geo attributes; sessions without a matching zip keep their
# row and get missing geo columns.
activity_sessions_with_geos_df = pd.merge(
    activity_sessions_df,
    geo_zip_df,
    how='left',
    left_on='zip_code',
    right_on='geo_location',
)
activity_sessions_with_geos_df

In [None]:
# US/CA sessions whose zip code found no geo: fall back to the DMA-level lookup.
#
# The previous version assigned the output of a merge straight into
# `.loc[mask]`. A merge result has a fresh 0..n-1 index and suffixed column
# names, so the assignment aligned on the wrong labels and did not fill the
# geo columns. The values are now aligned explicitly, column by column.
needs_dma_geo = (
    activity_sessions_with_geos_df['neustar_country'].isin(['us', 'ca'])
    & activity_sessions_with_geos_df['geo_country'].isnull()
)
activity_sessions_with_geos_df['neustar_dma_id'] = activity_sessions_with_geos_df['neustar_dma_id'].astype('Int64')

if needs_dma_geo.any():
    dma_lookup = (
        geo_dma_df.dropna(subset=['geo_location'])
        .drop_duplicates(subset=['geo_location'])
        .copy()
    )
    dma_lookup['geo_location'] = dma_lookup['geo_location'].astype('string')

    # The DMA id is matched as text against geo_location.
    dma_keys = activity_sessions_with_geos_df.loc[needs_dma_geo, 'neustar_dma_id'].astype('string')
    dma_geos = dma_keys.to_frame('_dma_key').merge(
        dma_lookup, how='left', left_on='_dma_key', right_on='geo_location'
    )
    # One output row per key, in order, so the original row labels can be restored.
    dma_geos.index = dma_keys.index

    shared_geo_cols = [
        col for col in geo_dma_df.columns
        if col in geo_zip_df.columns and col in activity_sessions_with_geos_df.columns
    ]
    for geo_col in shared_geo_cols:
        activity_sessions_with_geos_df.loc[needs_dma_geo, geo_col] = dma_geos[geo_col]

# Rows that are still unresolved after the DMA fallback.
activity_sessions_with_geos_df.loc[
    activity_sessions_with_geos_df['neustar_country'].isin(['us', 'ca'])
    & activity_sessions_with_geos_df['geo_country'].isnull()
]

In [None]:
# Cast the DMA id to pandas' nullable Int64 dtype.
# NOTE(review): the preceding cell already ends with this same cast, so this cell looks redundant.
activity_sessions_with_geos_df['neustar_dma_id'] = activity_sessions_with_geos_df['neustar_dma_id'].astype('Int64')

In [None]:
# Scratch datetime copies (suffix _2) so the source columns stay untouched.
activity_sessions_df['created_time_2'] = pd.to_datetime(activity_sessions_df['created_time'])
for bound in ('bcw_start_date', 'bcw_end_date'):
    broadcast_cal_df[f'{bound}_2'] = pd.to_datetime(broadcast_cal_df[bound])

# NOTE(review): these overwrite the Int64 _YEAR/_MONTH columns set in an earlier cell with strings.
activity_sessions_df['_YEAR'] = activity_sessions_df['created_time'].dt.year.astype('string')
activity_sessions_df['_MONTH'] = activity_sessions_df['created_time'].dt.month.astype('string')

# Weeks that start on or before the earliest session / end after the latest session.
starts_before_first = broadcast_cal_df['bcw_start_date_2'] <= activity_sessions_df['created_time_2'].min()
ends_after_last = broadcast_cal_df['bcw_end_date_2'] > activity_sessions_df['created_time_2'].max()
ref_1_ids_df = broadcast_cal_df.loc[starts_before_first]
ref_2_ids_df = broadcast_cal_df.loc[ends_after_last]
ref_1_ids_df
# common_ids = []
# for id in ref_1_ids:
#     if id in ref_2_ids:
#         common_ids.append(id)
# common_ids


In [None]:
# Render created_time as 'YYYY-MM-DD HH:MM:SS' text. The vectorised accessor
# replaces a per-row Python lambda and yields a missing value for NaT instead
# of raising.
# NOTE(review): not idempotent. Running this twice fails because the column is
# then text, not datetimes.
activity_sessions_df['created_time'] = activity_sessions_df['created_time'].dt.strftime('%Y-%m-%d %H:%M:%S')



In [None]:
# Preview the first five sessions.
activity_sessions_df.head(5)

In [None]:
# Make sure the calendar bounds and the session timestamp are datetimes
# (created_time was turned into text by an earlier cell).
for date_col in ('bcw_start_date', 'bcw_end_date'):
    broadcast_cal_df[date_col] = pd.to_datetime(broadcast_cal_df[date_col])
activity_sessions_df['created_time'] = pd.to_datetime(activity_sessions_df['created_time'])


In [None]:
# Earliest and latest session timestamps in the loaded data.
created_times = pd.to_datetime(activity_sessions_df['created_time'])
ref_min_date = created_times.min()
ref_max_date = created_times.max()
ref_max_date

In [None]:

# Calendar weeks that start on or before the earliest session.
# (An additional upper-bound filter on bcw_end_date was tried and left disabled.)
starts_on_or_before_min = broadcast_cal_df['bcw_start_date'] <= ref_min_date
ref_1 = broadcast_cal_df.loc[starts_on_or_before_min]
ref_1
# broadcast_cal_df.loc[(broadcast_cal_df['bcw_start_date'] >= activity_sessions_df['created_time'].min()) & (broadcast_cal_df['bcw_end_date'] <= activity_sessions_df['created_time'].max())]
# # Get scalar values for the date range
# min_created_time = activity_sessions_df['created_time'].min()
# max_created_time = activity_sessions_df['created_time'].max()

# # Filter the DataFrame
# filtered_broadcast_cal_df = broadcast_cal_df.loc[
#     (broadcast_cal_df['bcw_start_date'] >= min_created_time) &
#     (broadcast_cal_df['bcw_end_date'] <= max_created_time)
# ]
# filtered_broadcast_cal_df
# # reff_df
# # activity_sessions_df['created_time'].max()

In [None]:
# Print each calendar column with its dtype and first value.
# `ddf` is just an alias for the pandas calendar frame, not a dask dataframe.
ddf = broadcast_cal_df
for col, column_values in ddf.items():
    print(f'col: {col}, dtype: {column_values.dtype}, sample: {column_values.iloc[0]}')