In [None]:
%pip install -q -r requirements.txt

In [None]:

import importlib
import functions.core_functions as core_functions
import dask
import dask.dataframe as dd
from dask.dataframe.utils import assert_eq
import pandas as pd
import numpy as np
import pandas_gbq
import json
import gc
import cudf
import os

# Select cuDF as the backend for Dask DataFrame collections created after this point.
dask.config.set({"dataframe.backend": "cudf"})

# Re-import core_functions so edits made to the module on disk are picked up
# without restarting the kernel.
importlib.reload(core_functions)

In [None]:
# Column name -> dtype string for the Veil billing output.
# Later cells use only the keys of this mapping (`list(veil_schema.keys())`) to
# select the column subset written to the Veil billing bucket; every key here
# also appears in `n90_schema` below.
veil_schema = {
    'occurrence_id': 'int64',
    'encoding_id': 'int64',
    'broadcaster_id': 'int64',
    'cost': 'float64',
    'tv_show_id': 'int64',
    'origin': 'string',
    'group_occurrence_id': 'int64',
    'last_updated': 'datetime64[ns, UTC]',
    'last_audit_id': 'string',
    'date_time': 'datetime64[ns, UTC]',
    # Partition columns derived from date_time.
    'year': 'int64',
    'month': 'int64',
    'day': 'int64',
    'affiliate': 'string',
    'callsign': 'string',
    'dma_id': 'string',
    'kantar_region': 'string',
    'neustar_id': 'int64',
    'name': 'string',
    'timezone': 'string',
    'dma_rank': 'int64',
    'id': 'string',
    'show_id': 'int64',
    'is_barter': 'bool',
    'is_cable': 'bool',
    'is_network': 'bool',
    'is_spot': 'bool',
    'is_canadian': 'bool',
    'is_hispanic': 'bool',
    'is_local_cable': 'bool',
    'is_active': 'bool',
    'length': 'float64',
    'show_name': 'string',
    'format_id': 'int64',
    'customer_id': 'int64',
    'sfdc_account_id': 'string',
    'sfdc_advertiser_id': 'string',
    'aeis_id': 'int64',
    'billing_det_last_updated': 'datetime64[ns, UTC]',
}

# Column name -> dtype string for the full N90 detections output.
# Passed to `core_functions.enforce_schema` before the frame is written to the
# N90 bucket. NOTE(review): the geo_* id columns are declared 'string' here while
# the geo loading cell casts several of them to int — confirm which is intended.
n90_schema = {
   'occurrence_id': 'int64',
   'encoding_id': 'int64',
   'broadcaster_id': 'int64',
   'cost': 'float64',
   'tv_show_id': 'int64',
   'origin': 'string',
   'group_occurrence_id': 'int64',
   'last_updated': 'datetime64[ns, UTC]',
   'last_audit_id': 'string',
   'date_time': 'datetime64[ns, UTC]',
   # Partition columns derived from date_time.
   'year': 'int64',
   'month': 'int64',
   'day': 'int64',
   'affiliate': 'string',
   'callsign': 'string',
   'dma_id': 'string',
   'kantar_region': 'string',
   'neustar_id': 'int64',
   'name': 'string',
   'timezone': 'string',
   'dma_rank': 'int64',
   'id': 'string',
   'show_id': 'int64',
   'is_barter': 'bool',
   'is_cable': 'bool',
   'is_network': 'bool',
   'is_spot': 'bool',
   'is_canadian': 'bool',
   'is_hispanic': 'bool',
   'is_local_cable': 'bool',
   'is_active': 'bool',
   'length': 'float64',
   'show_name': 'string',
   # geo_* columns.
   'geo_location': 'string',
   'geo_type': 'string',
   'geo_latitude': 'float64',
   'geo_longitude': 'float64',
   'geo_city': 'string',
   'geo_state': 'string',
   'geo_country': 'string',
   'geo_neustar_id': 'string',
   'geo_us_msa_id': 'string',
   'geo_us_county_fips_id': 'string',
   'geo_ca_cma_id': 'string',
   'geo_gm_zip_code': 'string',
   'format_id': 'int64',
   'encoder_group_id': 'int64',
   'encoded_timestamp': 'datetime64[ns, UTC]',
   'clone_of': 'int64',
   'status': 'string',
   'last_updated_encoding': 'datetime64[ns, UTC]',
   'last_audit_id_encoding': 'string',
   'encoder_id': 'int64',
   'detection_end_date': 'datetime64[ns, UTC]',
   'encoded_timestamp_epoch': 'int64',
   # attributes_* columns (all strings).
   'attributes_advertiser': 'string',
   'attributes_audience': 'string',
   'attributes_audience_2': 'string',
   'attributes_cable_estimate': 'string',
   'attributes_campaign': 'string',
   'attributes_category': 'string',
   'attributes_client_code': 'string',
   'attributes_commercial_id': 'string',
   'attributes_contour_id': 'string',
   'attributes_creative_offer': 'string',
   'attributes_description': 'string',
   'attributes_donovan_agency_advertiser_code': 'string',
   'attributes_donovan_agency_estimate_code': 'string',
   'attributes_donovan_agency_product_code': 'string',
   'attributes_eid': 'string',
   'attributes_group': 'string',
   'attributes_hd_sd': 'string',
   'attributes_id': 'string',
   'attributes_isci': 'string',
   'length_in_seconds': 'string',
   'attributes_length': 'string',
   'attributes_lob': 'string',
   'attributes_media_type': 'string',
   'attributes_message': 'string',
   'attributes_misc': 'string',
   'attributes_module_code': 'string',
   'attributes_offer': 'string',
   'attributes_offer_2': 'string',
   'attributes_phone_number': 'string',
   'attributes_product_code': 'string',
   'attributes_product_name': 'string',
   'attributes_project_name': 'string',
   'attributes_quality': 'string',
   'attributes_revision': 'string',
   'attributes_show_name': 'string',
   'attributes_slug': 'string',
   'attributes_sport_id': 'string',
   'attributes_sport_show_sub_category': 'string',
   'attributes_spot_estimate': 'string',
   'attributes_spot_name': 'string',
   'attributes_tag': 'string',
   'attributes_text': 'string',
   'attributes_title': 'string',
   'attributes_veil_id': 'string',
   'attributes_version_name': 'string',
   'attributes_year': 'string',
   'product_code': 'string',
   'isci': 'string',
   'advertiser': 'string',
   # encoder_group / aeis / format / profile columns.
   'encoder_group_name': 'string',
   'encoder_group__deleted': 'bool',
   'encoder_group__last_audit_id': 'string',
   'encoder_group__last_updated': 'datetime64[ns, UTC]',
   'aeis_id': 'int64',
   'aeis__encoding_id': 'int64',
   'aeis__encoding_offset': 'string',
   'aeis__last_updated': 'datetime64[ns, UTC]',
   'aeis__last_audit_id': 'string',
   'format_name': 'string',
   'format__profile_id': 'int64',
   'format__customer_id': 'int64',
   'format__report_breakup': 'string',
   'format__deleted': 'bool',
   'format__last_updated': 'datetime64[ns, UTC]',
   'format__last_audit_id': 'string',
   'sfdc_account_name': 'string',
   'customer_id': 'int64',
   'account_id': 'string',
   'contract_item': 'string',
   'customer_name': 'string',
   'contract_number': 'string',
   'sales_person_code': 'string',
   'deleted': 'bool',
   'profile_id': 'int64',
   'profile_name': 'string',
   'profile__deleted': 'bool',
   'profile__default_asset_code': 'string',
   'profile__last_updated': 'datetime64[ns, UTC]',
   'profile__last_audit_id': 'string',
   'ad_prod_campaign': 'string',
   'campaign': 'string',
   'sfdc_advertiser_id': 'string',
   'billing_last_updated': 'datetime64[ns, UTC]',
   'billing_last_audit_id': 'string',
   'sfdc_account_id': 'string',
   'billing_det_last_updated': 'datetime64[ns, UTC]'
}

In [None]:
# Initialise one set of clients per service account: adhoc-billing (resp) and
# N90 core apps (resp2).
resp = core_functions.initialize_clients(service_account_secret_name='SA_ADHOC_BILLING')
resp2 = core_functions.initialize_clients(service_account_secret_name='SA_N90_CORE_APPS')

# Configuration comes from the adhoc-billing response.
config = resp.get('config')
veil_billing = config.get('veil_billing')
veil_vars = veil_billing.get('vars')
sfdc_adv_account_cols = veil_vars.get('sfdc_adv_account_cols')
sfdc_rate_card_cols = veil_vars.get('sfdc_rate_card_cols')
unknown_dma_overrides = config.get('national_dma_overrides_to_us_national')

# Clients: BigQuery for both accounts, storage and sf_client from adhoc-billing only.
adhoc_clients = resp.get('clients')
bigquery_client = adhoc_clients.get('bigquery_client')
storage_client = adhoc_clients.get('storage_client')
sf_client = adhoc_clients.get('sf_client')
n90_bigquery_client = resp2.get('clients').get('bigquery_client')

In [None]:
# Use the adhoc-billing service account for the queries in this cell.
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = config.get('SA_ADHOC_BILLING')

# Lookup tables from the `mongo` dataset.
mongo_tables = ['dmas', 'shows']
mongo_data = core_functions.fetch_table_data(
    bigquery_client=bigquery_client,
    table_names=mongo_tables,
    dataset_id='mongo',
    project_id=veil_billing.get('avs_project_id'),
)
dmas_df = mongo_data['dmas']
shows_df = mongo_data['shows']

# Broadcaster/channel lookup.
master_channel_sql = """
    SELECT * from `adhoc-billing.avs_billing_process.master_channels_expanded`
"""
master_channel_df = core_functions.fetch_gbq_data(master_channel_sql, bigquery_client)

# Missing neustar ids, and ids listed in unknown_dma_overrides, are all mapped
# to the sentinel 808080.
dmas_df['neustar_id'] = dmas_df['neustar_id'].fillna(808080).astype(int)
mask = dmas_df['neustar_id'].isin(unknown_dma_overrides)
dmas_df.loc[mask, 'neustar_id'] = 808080

# Replace missing keys with sentinels so the columns can be cast to int.
dmas_df['dma_rank'] = dmas_df['dma_rank'].fillna(0).astype(int)
shows_df['show_id'] = shows_df['show_id'].fillna(-6).astype(int)
master_channel_df['broadcaster_id'] = master_channel_df['broadcaster_id'].fillna(-10).astype(int)


In [None]:
# Use the N90 core-apps service account for the geo lookup.
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = config.get('SA_N90_CORE_APPS')

geo_sql = """
    SELECT * from `next90-core-applications.next90_analytics.geos` WHERE geo_type = 'dma'
"""
geo_df = core_functions.fetch_gbq_data(query=geo_sql, bigquery_client=n90_bigquery_client)

int_cols = ['geo_location', 'geo_neustar_id','geo_us_msa_id', 'geo_us_county_fips_id','geo_ca_cma_id']
float_cols = ['geo_latitude', 'geo_longitude']
string_cols = ['geo_type', 'geo_city', 'geo_state', 'geo_country', 'geo_gm_zip_code']

# Numeric columns: fill missing values with a per-type default, then cast.
numeric_casts = [(int_cols, -1, int), (float_cols, 0.0, float)]
for columns, fill_value, target_type in numeric_casts:
    for col in columns:
        geo_df[col] = geo_df[col].fillna(fill_value).astype(target_type)

# String columns: missing values and the literal text 'nan' both become ''.
for col in string_cols:
    blanked = geo_df[col].fillna('')
    geo_df[col] = blanked.replace('nan', '').astype(str)

geo_df

In [None]:
# os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = config.get('SA_ADHOC_BILLING')
# min_date_sql = f"""
#     SELECT cast(cast(MIN(date_time) as date) as string) as min_date, cast(cast(MAX(date_time) as date) as string) as max_date FROM `{veil_billing.get('avs_project_id')}.{veil_billing.get('avs_dataset_id')}.detections`
#     """
# min_max = core_functions.fetch_gbq_data(query=min_date_sql, bigquery_client=bigquery_client)
# min_date = min_max['min_date'][0]
# max_date = min_max['max_date'][0]

In [None]:
# from datetime import datetime
# from dateutil.relativedelta import relativedelta

# def generate_month_list(start_date, end_date):
#     """
#     Generate a list of months in 'YYYY-MM' format from start_date to end_date (inclusive).
    
#     Args:
#         start_date (str): The start date in 'YYYY-MM-DD' format.
#         end_date (str): The end date in 'YYYY-MM-DD' format.
    
#     Returns:
#         list: List of months in 'YYYY-MM' format.
#     """
#     start = datetime.strptime(start_date, "%Y-%m-%d")
#     end = datetime.strptime(end_date, "%Y-%m-%d")
    
#     if start > end:
#         raise ValueError("start_date must be before or equal to end_date")
    
#     month_list = []
#     current = start
#     while current <= end:
#         month_list.append(current.strftime("%Y-%m"))
#         current += relativedelta(months=1)
    
#     return month_list

In [None]:
# Fetch the encodings table from the billing dataset named in the config.
billing_tables = ['encodings']
billing_data = core_functions.fetch_table_data(
    bigquery_client=bigquery_client,
    table_names=billing_tables,
    dataset_id=veil_billing.get('billing_dataset_id'),
    project_id=veil_billing.get('billing_project_id'),
)

# fetch_table_data returns a mapping keyed by table name.
encodings_df = billing_data['encodings']

In [None]:
# os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = config.get('SA_N90_CORE_APPS')
# geo_sql = f"""
#     SELECT * from `next90-core-applications.next90_analytics.geos` WHERE geo_type = 'dma'
# """
# geo_df = core_functions.fetch_gbq_data(query=geo_sql, bigquery_client=n90_bigquery_client)
# # n90_bigquery_client

In [None]:

# detections_df['date_time'] = pd.to_datetime(detections_df['date_time'], utc=True)
# detections_df['year'] = detections_df['date_time'].dt.year
# detections_df['month'] = detections_df['date_time'].dt.month
# detections_df['day'] = detections_df['date_time'].dt.day
# detections_df['tv_show_id'].fillna(-10, inplace=True) 
# detections_df['group_occurrence_id'].fillna(-6, inplace=True)
# dmas_df['dma_rank'].replace('', 0).fillna(0, inplace=True)

# detections_df['occurrence_id'] = detections_df['occurrence_id'].astype(float)
# # detections_df = detections_df.merge(master_channels_df, how='left', left_on='broadcaster_id', right_on='broadcaster_id')
# dmas_df.loc[mask, 'neustar_id'] = 808080
# dmas_df['neustar_id'] = dmas_df['neustar_id'].fillna(808080).astype(int)

# detections_df = detections_df.merge(dmas_df, how='left', left_on='dma_id', right_on='dma_id')

# shows_df['show_id'].fillna(-5, inplace=True)
# mask = ((dmas_df['dma_id'].isin(unknown_dma_overrides)) )
# detections_df = detections_df.merge(shows_df, how='left', left_on='tv_show_id', right_on='show_id')


# geo_df['geo_location'] = geo_df['geo_location'].astype(int)
# detections_df = detections_df.merge(geo_df, how='left', left_on='neustar_id', right_on='geo_location')
# gc.collect()
# # df_1 = df_1.merge(encodings_df[['encoding_id', 'format_id', 'customer_id', 'sfdc_account_id', 'sfdc_advertiser_id', 'aeis_id']], how='left', left_on='encoding_id', right_on='encoding_id')
# detections_df = detections_df.merge(encodings_df, how='left', left_on='encoding_id', right_on='encoding_id', suffixes=('', '_encoding'))
# gc.collect()



In [None]:
# df_1_date_cols = ['date_time', 'last_updated','billing_det_last_updated']
# df_1_int_cols = ['occurrence_id', 'encoding_id', 'broadcaster_id', 'tv_show_id', 'group_occurrence_id', 'year', 'month', 'day', 'neustar_id', 'dma_rank', 'show_id', 'format_id', 'customer_id', 'aeis_id']
# df_1_float_cols = ['cost', 'length']
# df_1_bool_cols = ['is_barter', 'is_cable', 'is_network', 'is_spot', 'is_canadian', 'is_hispanic', 'is_local_cable', 'is_active']


# df_2_date_cols = ['date_time', 'last_updated','billing_det_last_updated', 'encoded_timestamp','last_updated_encoding','detection_end_date','encoder_group__last_updated','aeis__last_updated',
#              'format__last_updated','profile__last_updated','billing_last_updated']
# df_2_int_cols = ['occurrence_id', 'encoding_id', 'broadcaster_id', 'tv_show_id', 'group_occurrence_id', 'year', 'month', 'day', 'neustar_id', 'dma_rank', 'show_id', 'format_id', 'customer_id', 'aeis_id',
#             'encoder_group_id', 'clone_of', 'encoder_id','encoded_timestamp_epoch','aeis__encoding_id','format__profile_id','format__customer_id',
#             'profile_id']
# df_2_float_cols = ['cost', 'length', 'geo_latitude', 'geo_longitude']
# df_2_bool_cols = ['is_barter', 'is_cable', 'is_network', 'is_spot', 'is_canadian', 'is_hispanic', 'is_local_cable', 'is_active','encoder_group__deleted',
#              'format__deleted','deleted','profile__deleted']

In [None]:
# for col in df_2.columns:
#     print(f"'{col}': '{df_2[col].dtype}',")

In [None]:
# # Enforce schema

# def enforce_schema(df, schema):
#     for column, dtype in schema.items():
#         if dtype == 'string':
#             df[column] = df[column].astype(str).fillna('')
#         elif dtype == 'int64':
#             df[column] = pd.to_numeric(df[column], errors='coerce').fillna(-1).astype('int64')
#         elif dtype == 'float64':
#             df[column] = pd.to_numeric(df[column], errors='coerce').fillna(-1.0).astype('float64')
#         elif dtype == 'datetime64[ns, UTC]':
#             df[column] = pd.to_datetime(df[column], errors='coerce', utc=True).fillna(pd.Timestamp.min.tz_localize('UTC'))
#     return df

# df_1 = enforce_schema(df_1, df_1_schema)
# df_2 = enforce_schema(df_2, df_2_schema)
# # for column, dtype in df_1_schema.items():
# #     if dtype == 'string':
# #         df_1[column] = df_1[column].astype(str).fillna('')
# #     elif dtype == 'int64':
# #         df_1[column] = pd.to_numeric(df_1[column], errors='coerce').fillna(-1).astype('int64')
# #     elif dtype == 'float64':
# #         df_1[column] = pd.to_numeric(df_1[column], errors='coerce').fillna(-1.0).astype('float64')
# #     elif dtype == 'datetime64[ns, UTC]':
# #         df_1[column] = pd.to_datetime(df_1[column], errors='coerce', utc=True).fillna(pd.Timestamp.min.tz_localize('UTC'))

# # Validate column types
# print(df_1.dtypes)

In [None]:
# GCS storage options for each destination, read from config.
veil_storage_options = config.get('VEIL_GCS_STORAGE_OPTIONS')
n90_storage_options = config.get('N90_GCS_STORAGE_OPTIONS')

# Destination buckets.
veil_billing_bucket = config.get('veil_billing').get('billing_gcs_bucket_id')
n90_bucket = 'n90_veil_partner'

# Object prefixes inside each bucket.
veil_output_prefix = 'detections'
n90_output_prefix = 'advocado-looker/avs_prod/detections'

# Hive partition columns, outermost first.
partition_cols = ['year', 'month', 'day']

# Pick up any edits made to core_functions since the last reload.
importlib.reload(core_functions)

## New Monthly Start

In [None]:
def process_detections(df, encodings_df=None):
    """Enrich one batch of detections and write it to both GCS destinations.

    Steps, in order:
      1. Parse ``date_time`` as UTC, derive the ``year``/``month``/``day``
         partition columns, fill missing ``tv_show_id`` (-10) and
         ``group_occurrence_id`` (-6), and cast ``occurrence_id`` to float.
      2. Left-join the notebook-level lookup frames ``dmas_df`` (on ``dma_id``),
         ``shows_df`` (``tv_show_id`` -> ``show_id``) and ``geo_df``
         (``neustar_id`` -> ``geo_location``).
      3. Left-join the encodings row with the latest ``billing_last_updated``
         for each ``encoding_id`` present in the batch; clashing column names
         from the encodings side get the ``_encoding`` suffix.
      4. Stamp ``billing_det_last_updated`` with the current UTC time, pass the
         frame through ``core_functions.enforce_schema`` with ``n90_schema``,
         and write it hive-partitioned to the N90 bucket, then write the
         ``veil_schema`` column subset to the Veil billing bucket.

    Besides the lookup frames, the function reads these notebook globals:
    ``config``, ``n90_schema``, ``veil_schema``, ``partition_cols`` and the
    bucket / prefix / storage-option variables. It sets
    ``GOOGLE_APPLICATION_CREDENTIALS`` before each write.

    Args:
        df: Detections frame for one batch. The columns listed in step 1 are
            assigned on the frame that is passed in, so pass a copy if the
            caller still needs the untouched original.
        encodings_df: Encodings frame containing at least ``encoding_id`` and
            ``billing_last_updated``. When omitted, the notebook-level
            ``encodings_df`` is used. The frame is not modified.

    Returns:
        None. The results are the two parquet writes.

    Raises:
        ValueError: if ``encodings_df`` is not passed and no notebook-level
            ``encodings_df`` exists.
    """
    # The original body assigned to `encodings_df`, which made the name local
    # and raised UnboundLocalError on first use. It is now an explicit
    # parameter, with a fallback to the notebook global for one-argument calls.
    if encodings_df is None:
        encodings_df = globals().get('encodings_df')
    if encodings_df is None:
        raise ValueError(
            "process_detections needs an encodings frame: pass encodings_df "
            "or define encodings_df at notebook level"
        )

    # 1. Partition columns and sentinel values for missing ids.
    df['date_time'] = pd.to_datetime(df['date_time'], utc=True)
    df['year'] = df['date_time'].dt.year
    df['month'] = df['date_time'].dt.month
    df['day'] = df['date_time'].dt.day
    df['tv_show_id'] = df['tv_show_id'].fillna(-10)
    df['group_occurrence_id'] = df['group_occurrence_id'].fillna(-6)
    df['occurrence_id'] = df['occurrence_id'].astype(float)

    # 2. Lookup joins. The key clean-up is applied to local views so the shared
    #    notebook frames are not rewritten on every call.
    shows_lookup = shows_df.assign(show_id=shows_df['show_id'].fillna(-5))
    geo_lookup = geo_df.assign(geo_location=geo_df['geo_location'].astype(int))

    df = df.merge(dmas_df, how='left', on='dma_id')
    df = df.merge(shows_lookup, how='left', left_on='tv_show_id', right_on='show_id')
    df = df.merge(geo_lookup, how='left', left_on='neustar_id', right_on='geo_location')
    gc.collect()

    # 3. Keep only encodings referenced by this batch, newest row per encoding.
    needed_encodings = df['encoding_id'].unique().tolist()
    needed_encodings_df = (
        encodings_df[encodings_df['encoding_id'].isin(needed_encodings)]
        .sort_values(by=['encoding_id', 'billing_last_updated'], ascending=[True, False])
        .drop_duplicates(subset=['encoding_id'], keep='first')
    )
    df = df.merge(needed_encodings_df, how='left', on='encoding_id', suffixes=('', '_encoding'))

    df = df.sort_values(by=['occurrence_id', 'year', 'month', 'day'])

    # 4. Processing timestamp.
    # NOTE(review): the original code then generated an audit UUID and assigned
    # it to this same column, replacing the timestamp with a string. Neither
    # schema declares a dedicated audit-id column for detections, so the UUID is
    # no longer written; add a column to the schemas if one is required.
    df['billing_det_last_updated'] = pd.Timestamp.now(tz='UTC')

    df_clean = core_functions.enforce_schema(df, n90_schema)

    # Full schema -> N90 bucket.
    os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = config.get('SA_N90_CORE_APPS')
    core_functions.write_hive_partitioned_parquet(df_clean, n90_bucket, n90_output_prefix, partition_cols, n90_storage_options)
    print(f"Finished writing to {n90_bucket}/{n90_output_prefix}")

    # new data starts 4/9/2024 - check if mid-day or midnight
    #  2024-04-08 13:10:23 UTC is the first detection in the new data

    # Veil column subset -> Veil billing bucket.
    veil_keys = list(veil_schema.keys())
    os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = config.get('SA_ADHOC_BILLING')
    core_functions.write_hive_partitioned_parquet(df_clean[veil_keys], veil_billing_bucket, veil_output_prefix, partition_cols, veil_storage_options)
    print(f"Finished writing to {veil_billing_bucket}/{veil_output_prefix}")

    # Drop the large intermediates before returning so the next batch starts
    # with as much free memory as possible.
    del df_clean
    del needed_encodings_df
    gc.collect()
    



In [None]:
# Use the adhoc-billing service account for this query.
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = config.get('SA_ADHOC_BILLING')

# Placeholder window; both values are replaced by the query result below.
min_date, max_date = '2024-12-01', '2025-01-01'

# The window runs from the newest date_time in the detections table (returned
# under the name min_date) to today's date (max_date).
dates_sql = """
    SELECT  cast(MAX(date_time) as string) as min_date , cast(current_date() as string) as max_date
    FROM `adhoc-billing.avs_billing_process.detections`
    """
dates_df = core_functions.fetch_gbq_data(query=dates_sql, bigquery_client=bigquery_client)

max_date = dates_df['max_date'][0]
min_date = dates_df['min_date'][0]


In [None]:
# Keep the queried upper bound, but pin the lower bound to a fixed date instead
# of the value returned by the query.
max_date = dates_df['max_date'][0]
min_date = '2024-12-01'
min_date

In [None]:


# Release the previous batch, if one exists, before pulling a new one. Only a
# missing name is expected here, so nothing else is swallowed.
try:
    del detections_df
except NameError:
    pass
gc.collect()

# Detections strictly inside the (min_date, max_date) window.
detections_sql = f"""
    select * from 
    `{veil_billing.get('avs_project_id')}.{veil_billing.get('avs_dataset_id')}.detections` d

    WHERE date_time > '{min_date}' AND date_time < '{max_date}'
    """
detections_df = core_functions.fetch_gbq_data(query=detections_sql, bigquery_client=bigquery_client)
detections_df['occurrence_id'] = detections_df['occurrence_id'].astype(float)

min_occurrence_id = detections_df['occurrence_id'].min()
max_occurrence_id = detections_df['occurrence_id'].max()

# Archive lookup for the same occurrence-id range. The query text is built but
# not executed in this cell (the fetch below is disabled).
archive_sql = f"""
select occurrence_id , any_value(affiliate) as affiliate, any_value(callsign) as callsign, any_value(dma_id) as dma_id
from `adhoc-billing.avs_billing_process.billing_records_archive`
where occurrence_id >= {min_occurrence_id} and occurrence_id <= {max_occurrence_id}
group by occurrence_id
"""

# archive_df = core_functions.fetch_gbq_data(query=archive_sql, bigquery_client=bigquery_client)
print(len(detections_df))

# Attach channel attributes by broadcaster. merge() already returns a new
# frame, so rebinding the name avoids holding a second full copy in memory.
detections_df = detections_df.merge(master_channel_df, how='left', on='broadcaster_id')
gc.collect()

# prod is good from april 2024 on the 8th

# prod is good from april 2024 on the 8th


In [None]:
# Preview the first rows of detections_df.
detections_df.head()
# detections_df.dtypes

In [None]:
# Second preview of detections_df (same expression as the preceding cell).
detections_df.head()

In [None]:
importlib.reload(core_functions)

# One row per occurrence_id: when an id appears more than once, keep the row
# with the greatest date_time.
df = (
    detections_df
    .sort_values(by=['occurrence_id', 'date_time'], ascending=[True, False])
    .drop_duplicates(subset=['occurrence_id'], keep='first')
    .reset_index(drop=True)
)

# Partition columns and sentinel values for missing ids.
df['date_time'] = pd.to_datetime(df['date_time'], utc=True)
df['year'] = df['date_time'].dt.year
df['month'] = df['date_time'].dt.month
df['day'] = df['date_time'].dt.day
df['tv_show_id'] = df['tv_show_id'].fillna(-10)
df['group_occurrence_id'] = df['group_occurrence_id'].fillna(-6)
df['occurrence_id'] = df['occurrence_id'].astype(float)

# Lookup joins: DMA, show, geo.
df = df.merge(dmas_df, how='left', on='dma_id')

shows_df['show_id'] = shows_df['show_id'].fillna(-5)
df = df.merge(shows_df, how='left', left_on='tv_show_id', right_on='show_id')

geo_df['geo_location'] = geo_df['geo_location'].astype(int)
df = df.merge(geo_df, how='left', left_on='neustar_id', right_on='geo_location')
gc.collect()

# Reduce encodings to the newest row per encoding_id (by billing_last_updated),
# then join only the encodings referenced by this batch.
needed_encodings = df['encoding_id'].unique().tolist()
encodings_df = (
    encodings_df
    .sort_values(by=['encoding_id', 'billing_last_updated'], ascending=[True, False])
    .drop_duplicates(subset=['encoding_id'], keep='first')
)
needed_encodings_df = encodings_df[encodings_df['encoding_id'].isin(needed_encodings)].copy()
df = df.merge(needed_encodings_df, how='left', on='encoding_id', suffixes=('', '_encoding'))

df = df.sort_values(by=['occurrence_id', 'year', 'month', 'day'])

# Processing timestamp.
# NOTE(review): the original cell then generated an audit UUID and assigned it
# to this same column, replacing the timestamp with a string. Neither schema
# declares a dedicated audit-id column for detections, so the UUID is no longer
# written; add a column to the schemas if one is required.
billing_last_updated = pd.Timestamp.now(tz='UTC')
df['billing_det_last_updated'] = billing_last_updated

df_clean = core_functions.enforce_schema(df, n90_schema)

# Full schema -> N90 bucket.
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = config.get('SA_N90_CORE_APPS')
core_functions.write_hive_partitioned_parquet(df_clean, n90_bucket, n90_output_prefix, partition_cols, n90_storage_options)
print(f"Finished writing to {n90_bucket}/{n90_output_prefix}")

# new data starts 4/9/2024 - check if mid-day or midnight
#  2024-04-08 13:10:23 UTC is the first detection in the new data

# Veil column subset -> Veil billing bucket.
veil_keys = list(veil_schema.keys())

os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = config.get('SA_ADHOC_BILLING')
core_functions.write_hive_partitioned_parquet(df_clean[veil_keys], veil_billing_bucket, veil_output_prefix, partition_cols, veil_storage_options)
print(f"Finished writing to {veil_billing_bucket}/{veil_output_prefix}")

# Free the large intermediates; `df` is kept for inspection in later cells.
del df_clean
del needed_encodings_df
gc.collect()
    

In [None]:
# Preview the first rows of the enriched frame `df`.
df.head()


In [None]:
# NOTE(review): this cell repeats the schema enforcement and both parquet writes
# that the processing cell further up already performs on the same `df`.
# Running both writes the batch twice — confirm that is intended.
df_clean = core_functions.enforce_schema(df, n90_schema)

# Full schema -> N90 bucket.
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = config.get('SA_N90_CORE_APPS')
core_functions.write_hive_partitioned_parquet(df_clean, n90_bucket, n90_output_prefix, partition_cols, n90_storage_options)
print(f"Finished writing to {n90_bucket}/{n90_output_prefix}")

# new data starts 4/9/2024 - check if mid-day or midnight
#  2024-04-08 13:10:23 UTC is the first detection in the new data

# Veil column subset -> Veil billing bucket.
veil_keys = list(veil_schema.keys())

os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = config.get('SA_ADHOC_BILLING')
core_functions.write_hive_partitioned_parquet(df_clean[veil_keys], veil_billing_bucket, veil_output_prefix, partition_cols, veil_storage_options)
print(f"Finished writing to {veil_billing_bucket}/{veil_output_prefix}")

del df_clean
# The processing cell deletes needed_encodings_df when it finishes, so a plain
# `del` here raised NameError on a top-to-bottom run. pop() removes the name if
# it is still present and is a no-op otherwise.
globals().pop('needed_encodings_df', None)
gc.collect()
    


In [None]:

# Monthly [start, end) windows for the archive backfill: 2022-01-01 up to
# 2024-03-01 (26 windows), each a ['YYYY-MM-DD', 'YYYY-MM-DD'] pair.
month_starts = pd.date_range('2022-01-01', '2024-03-01', freq='MS').strftime('%Y-%m-%d').tolist()
date_list = [[window_start, window_end] for window_start, window_end in zip(month_starts[:-1], month_starts[1:])]


for dates in date_list:
    new_min, new_max = dates

    # Archive rows for this window, with columns renamed to the detections layout.
    archive_sql = f"""
    select occurrence_id , detection_encoding_id as encoding_id, broadcaster_id, detection_timestamp as date_time, cost, tv_show_id, origin, group_occurrence_id, last_updated, last_audit_id, 

    affiliate, callsign,  dma_id
    from `adhoc-billing.avs_billing_process.billing_records_archive`
    where detection_timestamp >= '{new_min}' and detection_timestamp < '{new_max}'
    and clone_of is null
    """

    detections_df = core_functions.fetch_gbq_data(query=archive_sql, bigquery_client=bigquery_client)

    # Cast the encodings join key to int.
    # NOTE(review): the original line assigned this cast to `group_occurrence_id`,
    # which replaced the group id selected above with the encoding id. Assumed to
    # be a copy/paste slip — confirm.
    detections_df['encoding_id'] = detections_df['encoding_id'].astype(int)

    gc.collect()
    process_detections(detections_df, encodings_df)


In [None]:
# NOTE(review): this loop repeats the backfill loop of the preceding cell over
# the same `date_list`; running both processes and writes every month twice.
# Confirm whether this cell is still needed.
for dates in date_list:
    new_min, new_max = dates

    # Archive rows for this window, with columns renamed to the detections layout.
    archive_sql = f"""
    select occurrence_id , detection_encoding_id as encoding_id, broadcaster_id, detection_timestamp as date_time, cost, tv_show_id, origin, group_occurrence_id, last_updated, last_audit_id, 

    affiliate, callsign,  dma_id
    from `adhoc-billing.avs_billing_process.billing_records_archive`
    where detection_timestamp >= '{new_min}' and detection_timestamp < '{new_max}'
    and clone_of is null
    """

    detections_df = core_functions.fetch_gbq_data(query=archive_sql, bigquery_client=bigquery_client)

    # Cast the encodings join key to int.
    # NOTE(review): the original line assigned this cast to `group_occurrence_id`,
    # which replaced the group id selected above with the encoding id. Assumed to
    # be a copy/paste slip — confirm.
    detections_df['encoding_id'] = detections_df['encoding_id'].astype(int)

    gc.collect()
    process_detections(detections_df, encodings_df)


In [None]:
from datetime import datetime
from dateutil.relativedelta import relativedelta

def generate_overlapping_date_sets(start_date, num_sets):
    """
    Generate consecutive one-month date windows.

    Despite the name, the windows do not overlap in range; adjacent windows
    only share a boundary date (window i's end is window i+1's start).

    Args:
        start_date (str): Start date in 'YYYY-MM-DD' format.
        num_sets (int): Number of windows to generate. Zero or a negative
            value yields an empty list.

    Returns:
        list: List of date sets in the format [['YYYY-MM-DD', 'YYYY-MM-DD'], ...].

    Raises:
        ValueError: If start_date does not match 'YYYY-MM-DD'.
    """
    first = datetime.strptime(start_date, "%Y-%m-%d")

    # Offset every boundary from the original start instead of from the
    # previous boundary. relativedelta clamps to the last valid day of a short
    # month, so chaining "+1 month" from a late-month start would drift
    # (01-31 -> 02-28 -> 03-28); offsetting from the start gives 03-31.
    boundaries = [first + relativedelta(months=offset) for offset in range(num_sets + 1)]

    return [
        [window_start.strftime("%Y-%m-%d"), window_end.strftime("%Y-%m-%d")]
        for window_start, window_end in zip(boundaries, boundaries[1:])
    ]

# Build 30 one-month windows starting at 2022-01-01 and show them.
start_date = "2022-01-01"
num_sets = 30  # how many consecutive date sets to generate
date_sets = generate_overlapping_date_sets(start_date=start_date, num_sets=num_sets)

print(date_sets)

In [None]:
# detections_df2 = detections_df.merge(master_channel_df, how='left', left_on='broadcaster_id', right_on='broadcaster_id')
# # detections_df2['affiliate'].isna()
# detections_df2


In [None]:

# Enrich the current detections_df with DMA, show, geo and encoding attributes,
# stamp it with audit metadata, enforce the n90 schema, and write the result as
# hive-partitioned parquet to the n90 bucket and then (veil columns only) to the
# veil billing bucket.

# Derive calendar columns from the UTC detection timestamp.
detections_df['date_time'] = pd.to_datetime(detections_df['date_time'], utc=True)
detections_df['year'] = detections_df['date_time'].dt.year
detections_df['month'] = detections_df['date_time'].dt.month
detections_df['day'] = detections_df['date_time'].dt.day

# Replace missing ids with sentinel values (-10 for shows, -6 for groups).
detections_df['tv_show_id'] = detections_df['tv_show_id'].fillna(-10)
detections_df['group_occurrence_id'] = detections_df['group_occurrence_id'].fillna(-6)

# NOTE(review): occurrence_id is cast to float here while the schemas declare it
# int64; very large ids could lose precision in float64 -- confirm this is intended.
detections_df['occurrence_id'] = detections_df['occurrence_id'].astype(float)

# Attach DMA attributes.
detections_df = detections_df.merge(dmas_df, how='left', left_on='dma_id', right_on='dma_id')

# Attach show attributes; missing show ids on the lookup side become -5.
shows_df['show_id'] = shows_df['show_id'].fillna(-5)
detections_df = detections_df.merge(shows_df, how='left', left_on='tv_show_id', right_on='show_id')

# Attach geo attributes; geo_location is cast to int before joining on neustar_id.
geo_df['geo_location'] = geo_df['geo_location'].astype(int)
detections_df = detections_df.merge(geo_df, how='left', left_on='neustar_id', right_on='geo_location')
gc.collect()

# Keep one row per encoding_id (the one with the greatest billing_last_updated),
# then join only the encodings that actually occur in this batch.
needed_encodings = detections_df['encoding_id'].unique().tolist()
encodings_df = (
    encodings_df
    .sort_values(by=['encoding_id', 'billing_last_updated'], ascending=[True, False])
    .drop_duplicates(subset=['encoding_id'], keep='first')
)
needed_encodings_df = encodings_df[encodings_df['encoding_id'].isin(needed_encodings)].copy()
detections_df = detections_df.merge(needed_encodings_df, how='left', left_on='encoding_id', right_on='encoding_id', suffixes=('', '_encoding'))

detections_df = detections_df.sort_values(by=['occurrence_id', 'year', 'month', 'day'])

# Audit metadata: one UTC timestamp and one UUID shared by every row of this run.
billing_last_updated = pd.Timestamp.now(tz='UTC')
detections_df['billing_det_last_updated'] = billing_last_updated
detections_df['billing_det_last_updated'] = pd.to_datetime(detections_df['billing_det_last_updated'], utc=True)
billing_last_audit_id = core_functions.generate_uuid()
# The audit id gets its own column. It was previously assigned to
# 'billing_det_last_updated', which replaced the timestamp written just above.
# NOTE(review): the column name 'billing_det_last_audit_id' is inferred from the
# naming pattern -- confirm it against n90_schema.
detections_df['billing_det_last_audit_id'] = billing_last_audit_id

detections_df_clean = core_functions.enforce_schema(detections_df, n90_schema)

# Write the full n90-schema frame using the credentials configured under SA_N90_CORE_APPS.
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = config.get('SA_N90_CORE_APPS')
core_functions.write_hive_partitioned_parquet(detections_df_clean, n90_bucket, n90_output_prefix, partition_cols, n90_storage_options)
print(f"Finished writing to {n90_bucket}/{n90_output_prefix}")

# new data starts 4/9/2024 - check if mid-day or midnight
#  2024-04-08 13:10:23 UTC is the first detection in the new data

# Write the veil-format subset (veil_schema columns only) using the credentials
# configured under SA_ADHOC_BILLING.
veil_keys = list(veil_schema.keys())

os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = config.get('SA_ADHOC_BILLING')
core_functions.write_hive_partitioned_parquet(detections_df_clean[veil_keys], veil_billing_bucket, veil_output_prefix, partition_cols, veil_storage_options)
print(f"Finished writing to {veil_billing_bucket}/{veil_output_prefix}")

# Release the large intermediates before the next cell runs.
del detections_df_clean
del needed_encodings_df
gc.collect()


In [None]:
# month_list = generate_month_list(min_date, max_date)
# month_list

# start_date = f'{month_list[0]}-01'
# end_date = f'{month_list[1]}-01'
# detections_sql = f"""
#     with cleanMasterChannels AS (
#     SELECT channel_id, affiliate, callsign, monitored_channels, dma_id, safe_cast(broadcaster_id as int64) as broadcaster_id 
#     FROM `{veil_billing.get('avs_project_id')}.{veil_billing.get('mongo_dataset_id')}.master_channels`

#     )
#     select * from 
#     `{veil_billing.get('avs_project_id')}.{veil_billing.get('avs_dataset_id')}.detections` d
#     left join 
#     cleanMasterChannels mc
#     using(broadcaster_id)
#     WHERE date_time >= '{start_date}' AND date_time < '{end_date}'
#     """
# detections_df = core_functions.fetch_gbq_data(query=detections_sql, bigquery_client=bigquery_client)






In [None]:

# Report the value currently exported in GOOGLE_APPLICATION_CREDENTIALS.
# .get() with a placeholder keeps this diagnostic cell from raising KeyError
# when the variable has not been set yet (e.g. on a fresh kernel).
print("GOOGLE_APPLICATION_CREDENTIALS:", os.environ.get('GOOGLE_APPLICATION_CREDENTIALS', '<not set>'))