In [None]:
# Install the packages listed in requirements.txt; -q keeps pip's output quiet.
%pip install -q -r requirements.txt

In [None]:
import importlib
import functions.core_functions as core_functions
import functions.pyarrow_functions as pyarrow_functions
import dask
import dask.dataframe as dd
from dask.dataframe.utils import assert_eq
import pandas as pd
import numpy as np
import pandas_gbq
import json
import gc
import cudf
import os
import yaml
from concurrent.futures import ProcessPoolExecutor
from functools import partial

# Select cudf as the dask dataframe backend.
dask.config.set({"dataframe.backend": "cudf"})

# Re-import the project helper modules so edits made on disk are picked up
# without restarting the kernel.
for _helper_module in (core_functions, pyarrow_functions):
    importlib.reload(_helper_module)

In [None]:
# Load the table schema definitions. The context manager guarantees the file
# handle is closed as soon as parsing finishes.
with open('table-schemas.yaml') as schema_file:
    table_schema = yaml.safe_load(schema_file)

# List of {'name': ..., 'type': ...} entries describing the detections table.
detections_python = table_schema['detections_python_schema']

# Column name -> type for every detections column.
n90_schema_dict = {item['name']: item['type'] for item in detections_python}
# Same mapping, without the columns whose name contains 'geo_'.
veil_schema_dict = {item['name']: item['type'] for item in detections_python if 'geo_' not in item['name']}
# Column names in schema order.
final_detections_cols = [item['name'] for item in detections_python]

In [None]:
# Initialise clients for the two service accounts used by this notebook.
resp = core_functions.initialize_clients(service_account_secret_name='SA_ADHOC_BILLING')
resp2 = core_functions.initialize_clients(service_account_secret_name='SA_N90_CORE_APPS')

# Configuration and most clients come from the SA_ADHOC_BILLING response.
config = resp.get('config')
clients = resp.get('clients')
bigquery_client = clients.get('bigquery_client')
storage_client = clients.get('storage_client')
sf_client = clients.get('sf_client')

# BigQuery client built from the SA_N90_CORE_APPS response.
n90_bigquery_client = resp2.get('clients').get('bigquery_client')

# Look up each nested config section once and reuse it.
veil_billing = config.get('veil_billing')
veil_vars = veil_billing.get('vars')
sfdc_adv_account_cols = veil_vars.get('sfdc_adv_account_cols')
sfdc_rate_card_cols = veil_vars.get('sfdc_rate_card_cols')
unknown_dma_overrides = config.get('national_dma_overrides_to_us_national')

In [None]:

# Fetch the broadcast-week lookup rows using the billing BigQuery client.
# The query has no interpolated values, so a plain string literal is enough.
broadcast_cal_sql = """
    SELECT id as bcw_id, bcw_index, bcm_index, bcw_start_date, bcw_end_date FROM `next90-core-applications.n90_data_lake.lu_broadcast_week`
"""
broadcast_cal_df = core_functions.fetch_gbq_data(
    query=broadcast_cal_sql,
    bigquery_client=bigquery_client,
)

In [None]:
# Set GOOGLE_APPLICATION_CREDENTIALS from the SA_N90_CORE_APPS config entry.
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = config.get('SA_N90_CORE_APPS')

# Zip-level geo rows for the United States and Canada.
geo_zip_sql = """
    SELECT *
    from `next90-core-applications.next90_analytics.geos` WHERE geo_type = 'zip'
    AND geo_country in ('United States', 'Canada')
"""
geo_zip_df = core_functions.fix_df_dtypes(
    core_functions.fetch_gbq_data(
        query=geo_zip_sql,
        bigquery_client=n90_bigquery_client,
    )
)

# DMA-level geo rows for the same two countries.
geo_dma_sql = """
    SELECT *
    from `next90-core-applications.next90_analytics.geos` WHERE geo_type = 'dma'
    AND geo_country in ('United States', 'Canada')
"""
geo_dma_df = core_functions.fix_df_dtypes(
    core_functions.fetch_gbq_data(
        query=geo_dma_sql,
        bigquery_client=n90_bigquery_client,
    )
)


# int_cols = ['geo_neustar_id','geo_us_msa_id', 'geo_us_county_fips_id','geo_ca_cma_id']
# for col in int_cols:
#     geo_zip_df[col] = geo_df[col].fillna(-1).astype('Int64')

In [None]:
# Set GOOGLE_APPLICATION_CREDENTIALS from the SA_N90_CORE_APPS config entry.
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = config.get('SA_N90_CORE_APPS')

# Fetch the broadcast-week lookup rows, this time with the n90 BigQuery client.
# The query has no interpolated values, so a plain string literal is enough.
broadcast_cal_sql = """
    SELECT id as bcw_id, bcw_index, bcm_index, bcw_start_date, bcw_end_date FROM `next90-core-applications.n90_data_lake.lu_broadcast_week`
"""
broadcast_cal_df = core_functions.fetch_gbq_data(
    query=broadcast_cal_sql,
    bigquery_client=n90_bigquery_client,
)

In [None]:
# Display the column dtypes of the DMA geo lookup frame.
geo_dma_df.dtypes

In [None]:
# Date window for the run: start is inclusive (>=), end is exclusive (<).
data_start_date = '2024-11-01'
data_end_date = '2024-12-01'

# One row per distinct (year, month, day) of created_time inside the window.
process_sql = f"""
    SELECT distinct EXTRACT(YEAR FROM created_time) as year, EXTRACT(MONTH FROM created_time) as month, EXTRACT(DAY FROM created_time) as  day
 from `next90-core-applications.n90_data_lake.activity_sessions`
    WHERE created_time >= '{data_start_date}' and created_time < '{data_end_date}'
    order by year, month, day
"""

process_df = core_functions.fetch_gbq_data(process_sql, n90_bigquery_client)
# Last expression of the cell: renders the frame as the cell output.
process_df

In [None]:
# Turn the distinct-day frame into a list of {'year', 'month', 'day'} records
# and print one line per record.
process_days = process_df.to_dict(orient='records')
for record in process_days:
    print(f'year: {record["year"]}, month: {record["month"]}, day: {record["day"]}')

In [None]:
# Show the first record as a quick look at its structure.
process_days[0]

In [None]:
# NOTE(review): this hard-coded list replaces the `process_days` built from
# `process_df` in an earlier cell, so only this one record is processed.
process_days = [{'year': 2024, 'month': 11, 'day': 1}]

# Columns selected by the broadcast-calendar query.
broadcast_week_cols = ['bcw_id', 'bcw_index', 'bcm_index', 'bcw_start_date', 'bcw_end_date']

# Loop-invariant: parse the calendar dates once and build a lookup that is
# sorted by week start with unique, non-null keys, as pd.merge_asof requires.
broadcast_cal_df['bcw_start_date'] = pd.to_datetime(broadcast_cal_df['bcw_start_date'])
broadcast_cal_df['bcw_end_date'] = pd.to_datetime(broadcast_cal_df['bcw_end_date'])
broadcast_weeks = (
    broadcast_cal_df[broadcast_week_cols]
    .dropna(subset=['bcw_start_date'])
    .sort_values('bcw_start_date')
    .drop_duplicates(subset=['bcw_start_date'], keep='first')
    .reset_index(drop=True)
)

for record in process_days:
    year = record['year']
    month = record['month']
    day = record['day']
    print(f'Processing year: {year}, month: {month}')

    # NOTE(review): the query filters on year and month only; `day` is not part
    # of the WHERE clause, so every record of the same month fetches the same
    # rows. Confirm whether a per-day filter was intended.
    activity_sessions_sql = f"""
        WITH brandToEncoding AS (
        SELECT distinct
        t.brand_id, t.aeis_id, e.encoding_id, e.sfdc_account_id, e.sfdc_account_name, e.sfdc_advertiser_id
        FROM `next90-core-applications.omniData.first_party_triggers` t
        join `next90-core-applications.omniData.avs_encodings` e
        on cast(t.aeis_id as INT64) = e.aeis_id
        )

        SELECT distinct a.*, sfdc_account_id, sfdc_account_name, sfdc_advertiser_id,  p.ip_address
        FROM `next90-core-applications.omniData.activity_sessions` a
        JOIN `next90-core-applications.omniData.pageviews` p
        on a.session_id = p.activity_session_id
        left join brandToEncoding b
        on a.brand_id = b.brand_id
        WHERE EXTRACT(YEAR FROM created_time) = {year} and EXTRACT(MONTH FROM created_time) = {month} 
    """
    activity_sessions_df = core_functions.fetch_gbq_data(activity_sessions_sql, n90_bigquery_client)

    print('adding broadcast cal details')

    # Keep one row per session id, as the original keep-first de-duplication did.
    sessions_df = (
        activity_sessions_df
        .sort_values(by=['id'], kind='stable')
        .drop_duplicates(subset=['id'], keep='first')
    )

    # Timestamp used to find the broadcast week. A tz-aware column is converted
    # to naive UTC so it can be compared with the naive calendar dates.
    # created_time is non-null here because the query's WHERE clause filters on it.
    activity_time = pd.to_datetime(sessions_df['created_time'])
    if activity_time.dt.tz is not None:
        activity_time = activity_time.dt.tz_convert(None)
    sessions_df = (
        sessions_df
        .assign(activity_date_time=activity_time)
        .sort_values('activity_date_time', kind='stable')
    )

    # merge_asof needs both keys to share a dtype, so the match key is a copy of
    # the week start cast to the session timestamp's dtype.
    week_lookup = broadcast_weeks.assign(
        _bcw_match_key=broadcast_weeks['bcw_start_date'].astype(activity_time.dtype)
    )

    # Attach to each session the latest broadcast week starting at or before its
    # timestamp. The previous cross join + keep-first gave every session an
    # arbitrary week and returned no rows when no week lay fully inside the data.
    # NOTE(review): assumes the calendar covers the data range; a session later
    # than the last calendar week would attach to that last week.
    sessions_df = pd.merge_asof(
        sessions_df,
        week_lookup,
        left_on='activity_date_time',
        right_on='_bcw_match_key',
        direction='backward',
    )
    activity_sessions_df = (
        sessions_df
        .drop(columns=['activity_date_time', '_bcw_match_key'])
        .sort_values(by=['id'], kind='stable')
        .reset_index(drop=True)
    )

    # Integer date-part columns derived from the session timestamp.
    activity_sessions_df['_YEAR'] = activity_sessions_df['created_time'].dt.year.astype('Int64')
    activity_sessions_df['_MONTH'] = activity_sessions_df['created_time'].dt.month.astype('Int64')
    activity_sessions_df['_DAY'] = activity_sessions_df['created_time'].dt.day.astype('Int64')

    # First geo lookup: by zip code.
    activity_sessions_with_geos_df = activity_sessions_df.merge(geo_zip_df, how='left', left_on='zip_code', right_on='geo_location')

    # The DMA lookup joins on the DMA id as text.
    activity_sessions_with_geos_df['neustar_dma_id'] = activity_sessions_with_geos_df['neustar_dma_id'].astype('string')

    # Second geo lookup: us/ca rows that got no zip match fall back to the DMA.
    needs_dma_geo = (
        activity_sessions_with_geos_df['neustar_country'].isin(['us', 'ca'])
        & activity_sessions_with_geos_df['geo_country'].isnull()
    )
    if needs_dma_geo.any():
        # Left merge against unique DMA keys yields exactly one output row per
        # input row, in input order, so the result can take the index of the
        # rows being filled.
        dma_geo_fill = activity_sessions_with_geos_df.loc[needs_dma_geo, ['neustar_dma_id']].merge(
            geo_dma_df.drop_duplicates(subset=['geo_location'], keep='first'),
            how='left',
            left_on='neustar_dma_id',
            right_on='geo_location',
        )
        dma_geo_fill.index = activity_sessions_with_geos_df.index[needs_dma_geo]

        # Fill only the geo columns, column by column with index alignment.
        # Assigning the whole merged frame to .loc[mask] cannot align: the merge
        # resets the index and suffixes the overlapping geo columns.
        geo_fill_cols = [
            geo_col for geo_col in geo_dma_df.columns
            if geo_col in activity_sessions_with_geos_df.columns and geo_col in dma_geo_fill.columns
        ]
        for geo_col in geo_fill_cols:
            activity_sessions_with_geos_df.loc[needs_dma_geo, geo_col] = dma_geo_fill[geo_col]

    # Store the DMA id as a nullable integer again.
    activity_sessions_with_geos_df['neustar_dma_id'] = activity_sessions_with_geos_df['neustar_dma_id'].astype('Int64')


In [None]:
# Display the activity sessions frame left by the processing loop.
activity_sessions_df

In [None]:
# Add the year, month and day of created_time as nullable-integer columns.
_created_dt = activity_sessions_df['created_time'].dt
for _partition_col, _date_part in (
    ('_YEAR', _created_dt.year),
    ('_MONTH', _created_dt.month),
    ('_DAY', _created_dt.day),
):
    activity_sessions_df[_partition_col] = _date_part.astype('Int64')

In [None]:
# Left-join the zip-level geo rows onto the sessions by zip code, then show
# the result.
activity_sessions_with_geos_df = pd.merge(
    activity_sessions_df,
    geo_zip_df,
    how='left',
    left_on='zip_code',
    right_on='geo_location',
)
activity_sessions_with_geos_df

In [None]:
# The DMA lookup joins on the DMA id as text.
activity_sessions_with_geos_df['neustar_dma_id'] = activity_sessions_with_geos_df['neustar_dma_id'].astype('string')

# us/ca rows that got no zip-level geo match fall back to a DMA-level lookup.
needs_dma_geo = (
    activity_sessions_with_geos_df['neustar_country'].isin(['us', 'ca'])
    & activity_sessions_with_geos_df['geo_country'].isnull()
)
if needs_dma_geo.any():
    # Left merge against unique DMA keys yields exactly one output row per input
    # row, in input order, so the result can take the index of the rows being
    # filled.
    dma_geo_fill = activity_sessions_with_geos_df.loc[needs_dma_geo, ['neustar_dma_id']].merge(
        geo_dma_df.drop_duplicates(subset=['geo_location'], keep='first'),
        how='left',
        left_on='neustar_dma_id',
        right_on='geo_location',
    )
    dma_geo_fill.index = activity_sessions_with_geos_df.index[needs_dma_geo]

    # Fill only the geo columns, column by column with index alignment.
    # Assigning the whole merged frame to .loc[mask] cannot align: the merge
    # resets the index and suffixes the overlapping geo columns.
    geo_fill_cols = [
        geo_col for geo_col in geo_dma_df.columns
        if geo_col in activity_sessions_with_geos_df.columns and geo_col in dma_geo_fill.columns
    ]
    for geo_col in geo_fill_cols:
        activity_sessions_with_geos_df.loc[needs_dma_geo, geo_col] = dma_geo_fill[geo_col]

# Store the DMA id as a nullable integer again.
activity_sessions_with_geos_df['neustar_dma_id'] = activity_sessions_with_geos_df['neustar_dma_id'].astype('Int64')

# Show the us/ca rows that still have no geo match after the DMA fallback.
activity_sessions_with_geos_df.loc[activity_sessions_with_geos_df['neustar_country'].isin(['us', 'ca']) & activity_sessions_with_geos_df['geo_country'].isnull()]

In [None]:
# Cast the DMA id to nullable Int64. The previous cell ends with this same
# cast, so this only matters when this cell is run on its own.
activity_sessions_with_geos_df['neustar_dma_id'] = activity_sessions_with_geos_df['neustar_dma_id'].astype('Int64')

In [None]:
# Print every column's dtype together with its first value as a quick schema
# check. An empty frame has no first row, so the sample is reported as None
# instead of raising IndexError from iloc[0].
_has_rows = len(activity_sessions_with_geos_df) > 0
for col in activity_sessions_with_geos_df.columns:
    _column = activity_sessions_with_geos_df[col]
    _sample = _column.iloc[0] if _has_rows else None
    print(f'col: {col}, dtype: {_column.dtype}, sample: {_sample}')