In [None]:

import importlib
import functions.core.core_functions as core_functions
import dask
import dask.dataframe as dd
from dask.dataframe.utils import assert_eq
import pandas as pd
import numpy as np
import pandas_gbq
import json
import gc
import cudf
import yaml

# Set dask's "dataframe.backend" option to cudf for the rest of the session.
dask.config.set({"dataframe.backend": "cudf"})

# Re-import the helper module so edits made to it are picked up without a kernel restart.
importlib.reload(core_functions)

# initialize_clients() hands back a mapping with a 'config' entry and a 'clients' entry.
resp = core_functions.initialize_clients()

config = resp.get('config')
clients = resp.get('clients')

bigquery_client = clients.get('bigquery_client')
storage_client = clients.get('storage_client')
sf_client = clients.get('sf_client')

# Billing-specific configuration and the 'vars' sub-section nested inside it.
veil_billing = config.get('veil_billing')
veil_vars = veil_billing.get('vars')
sfdc_adv_account_cols = veil_vars.get('sfdc_adv_account_cols')
sfdc_rate_card_cols = veil_vars.get('sfdc_rate_card_cols')

# Load the table schemas; the context manager closes the file handle once parsing is done
# (the bare open() used previously left it open).
with open('table-schemas.yaml') as schema_file:
    table_schema = yaml.safe_load(schema_file)

encodings_python = table_schema['encodings_python_schema']
encodings_parquet = table_schema['encodings_parquet_schema']



In [None]:
# Billing period selectors used by the cells below.
# encoding_month_to_process is compared against the encodings' segments_month column;
# broadcast_month_to_process is interpolated into the detections query and stamped on
# the output rows as _BROADCAST_MONTH. Keep the two in step when changing the period.
encoding_month_to_process = '2024-11-01'
broadcast_month_to_process = 2024.11

In [None]:
# Detections for the broadcast month being billed; the month value is interpolated
# directly into the WHERE clause.
detections_for_billing_sql = f"""
SELECT occurrence_id, encoding_id, detection_timestamp, broadcaster_id, origin, group_occurrence_id, affiliate, callsign, dma_rank, show_record_id, show_id, is_barter, is_cable, is_network, is_spot, is_canadian, is_hispanic, is_local_cable, is_active, show_name, isci, aeis_id, format_id, format_name, customer_id, customer_name, sfdc_account_id, sfdc_account_name, sfdc_advertiser_id, attributes_cable_estimate, attributes_spot_estimate, length_in_seconds, clone_of, segments_format_id_group, bcw_id, bcw_index, bcm_index, bcw_start_date, bcw_end_date, bc_year_index, segments_date, segments_day_of_week, segments_media, segments_broadcast_week_index, segments_broadcast_month_index, segments_broadcast_year 
FROM `adhoc-billing.avs_billing_datalake.p_detections` 
WHERE segments_broadcast_month_index = {broadcast_month_to_process}
"""

# Run the query, wait for it to finish and pull the rows into a DataFrame.
detections_query_job = bigquery_client.query(detections_for_billing_sql)
detections_for_billing_df = detections_query_job.result().to_dataframe()

# Distinct encoding ids seen in the detections, as a plain Python list.
detected_encoding_ids = detections_for_billing_df['encoding_id'].unique()
encodings_in_detections = detected_encoding_ids.tolist()

In [None]:
# Encodings left-joined to their advertiser and rate-card records.
# The statement has no placeholders, so it is a plain string; it also has no WHERE
# clause, so every encoding row is fetched and the filtering happens in later cells.
encodings_for_billing_sql = """
    SELECT encoding_id, format_id, encoder_group_id, encoded_timestamp, encoder_id, length_in_seconds, e.description, product_code, product_name, isci, project_name, advertiser, client_code, encoder_group_name, aeis_id, format_name, e.sfdc_account_id, sfdc_account_name, customer_id, account_id, customer_name, profile_name, ad_prod_campaign, campaign, sfdc_advertiser_id, 
    billing_frequency, po_number, po_required, active_po, advertiser_status, invoice_format, sfdc_rate_card_id, match_type, product_code_list, dish_included, directv_included,
    sfdc_rate_card_name, business_type, rate_card_type, currency, billing_type, rc.description as rate_card_desc, media, default_rate_card, media_ocean_formatting, sfdc_product_id, sfdc_product_code, rate_card_diginet_handling, no_spot_ordered, rate_currency, seasonal_only, min_max_type, min_value, min_label, min_applied_at, always_bill_min, usage_incl_in_min, max_value, max_label, max_applied_at, usage_incl_in_max, usage_usage_per_unit_rate, usage_usage_per_unit_label, usage_usage_billing_unit, usage_tier_plat_detection_num_tiers, usage_tier_plat_detection_billing_unit, usage_tier1_from, usage_tier1_price, usage_tier1_label, usage_tier2_from, usage_tier2_price, usage_tier2_label, usage_tier3_from, usage_tier3_price, usage_tier3_label, usage_tier4_from, usage_tier4_price, usage_tier4_label, usage_tier5_from, usage_tier5_price, usage_tier5_label, weekly_cable_rate, weekly_spot_rate, weekly_canadian_rate, weekly_network_rate, weekly_hispanic_rate, segments_month


    FROM `adhoc-billing.avs_billing_datalake.p_encodings` e 
    left join `adhoc-billing.avs_billing_process.sfdc_advertiser` adv
    using(sfdc_advertiser_id)
    left join `adhoc-billing.avs_billing_process.sfdc_rate_card` rc
    using (sfdc_rate_card_id)
"""

# Run the query, wait for it to finish and pull the rows into a DataFrame.
encodings_query_job = bigquery_client.query(encodings_for_billing_sql)
encodings_for_billing_df = encodings_query_job.result().to_dataframe()




In [None]:
# Encodings whose segments_month equals the encoding month being billed.
# NOTE(review): the comparison is against a 'YYYY-MM-DD' string, so this assumes
# segments_month arrives from BigQuery as a string column — TODO confirm.
in_encoding_month = encodings_for_billing_df['segments_month'].eq(encoding_month_to_process)
encodings_for_encoder_billing = encodings_for_billing_df[in_encoding_month].copy().reset_index(drop=True)
encodings_for_encoder_billing

In [None]:
# Keep only the encodings that actually occur in this month's detections.
# Boolean .loc selection and reset_index each return a new frame, so no extra copy is needed.
encodings_for_detection_billing = (
    encodings_for_billing_df
    .loc[encodings_for_billing_df['encoding_id'].isin(encodings_in_detections)]
    .reset_index(drop=True)
)

# Left join so every detection row is kept. For column names present on both sides the
# detection value keeps its name and the encoding-side copy is tagged with '_dupe'.
# NOTE(review): if encodings_for_billing_df can hold more than one row per encoding_id,
# this join repeats the matching detections — TODO confirm uniqueness
# (merge(..., validate='m:1') would enforce it).
ad_hoc_billing_master = detections_for_billing_df.merge(
    encodings_for_detection_billing,
    on='encoding_id',
    how='left',
    suffixes=('', '_dupe'),
)

# Drop the encoding-side duplicates. Match on the suffix only, so a column that merely
# contains '_dupe' somewhere in its name is not removed by accident.
dupe_cols = [col for col in ad_hoc_billing_master.columns if col.endswith('_dupe')]
ad_hoc_billing_master = (
    ad_hoc_billing_master
    .drop(columns=dupe_cols)
    # ignore_index=True yields a fresh 0..n-1 index, same as reset_index(drop=True) after the sort.
    .sort_values(by=['sfdc_account_id', 'sfdc_advertiser_id', 'occurrence_id'], ascending=True, ignore_index=True)
)

# Audit columns: one timestamp (UTC, floored to whole seconds) and one audit id shared by every row.
# Timestamp.now(tz='UTC') is the replacement recent pandas recommends for Timestamp.utcnow().
record_last_updated = pd.Timestamp.now(tz='UTC').floor('s')
record_last_audit_id = core_functions.generate_uuid()

# _BROADCAST_MONTH is the column named in partition_cols when the frame is written out.
ad_hoc_billing_master['_BROADCAST_MONTH'] = broadcast_month_to_process
ad_hoc_billing_master['record_last_updated'] = record_last_updated
ad_hoc_billing_master['record_last_audit_id'] = record_last_audit_id
ad_hoc_billing_master

In [None]:
# Print each column's dtype together with the value from the first row.
# The sample is taken positionally and only when the frame has rows, so an empty
# result set prints 'None' instead of raising KeyError on the label lookup [0].
has_rows = len(ad_hoc_billing_master) > 0
for col in ad_hoc_billing_master.columns:
    column = ad_hoc_billing_master[col]
    sample = column.iloc[0] if has_rows else None
    print(f'col: {col} - datatype: {column.dtype} sample: {sample}')


In [None]:
# Destination layout for the parquet output: files go under this prefix and are
# partitioned on the _BROADCAST_MONTH column.
veil_output_prefix = 'ad_hoc_billing_master'
partition_cols = ['_BROADCAST_MONTH']

# Bucket id and storage options are read from the loaded configuration.
veil_billing_bucket = config.get('veil_billing').get('billing_gcs_bucket_id')
veil_storage_options = config.get('VEIL_GCS_STORAGE_OPTIONS')



In [None]:

# Hand the master frame to the shared parquet writer together with the bucket, prefix,
# partition columns and storage options prepared in the previous cell.
core_functions.write_hive_partitioned_parquet(
    ad_hoc_billing_master,
    veil_billing_bucket,
    veil_output_prefix,
    partition_cols,
    veil_storage_options,
    max_records_per_file=10_000_000,
)


In [None]:

# Report the bucket/prefix the output was written under.
completion_message = f"Finished writing to {veil_billing_bucket}/{veil_output_prefix}"
print(completion_message)

In [None]:
# Show the assembled master frame once more as this cell's output (bare last expression).
ad_hoc_billing_master