In [None]:
%pip install -q -r requirements.txt

In [None]:

import importlib
import functions.core_functions as core_functions
import dask
import dask.dataframe as dd
from dask.dataframe.utils import assert_eq
import pandas as pd
import numpy as np
import pandas_gbq
import json
import gc
import cudf
import yaml

# Select cudf as the dask DataFrame backend for everything created after this point.
dask.config.set({"dataframe.backend": "cudf"})

# Reload the helper module so that edits to functions/core_functions.py are
# picked up without restarting the kernel.
importlib.reload(core_functions)


In [None]:
# Build the API clients and read the billing configuration in one call.
resp = core_functions.initialize_clients()

# Top-level sections of the response.
config = resp.get('config')
clients = resp.get('clients')

# Service clients used by the rest of the notebook.
bigquery_client = clients.get('bigquery_client')
storage_client = clients.get('storage_client')
sf_client = clients.get('sf_client')

# Billing settings and the column lists used when joining Salesforce objects.
veil_billing = config.get('veil_billing')
veil_vars = veil_billing.get('vars')
sfdc_adv_account_cols = veil_vars.get('sfdc_adv_account_cols')
sfdc_rate_card_cols = veil_vars.get('sfdc_rate_card_cols')

In [None]:
# Load the table schema definitions. The context manager guarantees the file
# handle is closed even if YAML parsing fails (the bare open() call leaked it).
with open('table-schemas.yaml') as schema_file:
    table_schema = yaml.safe_load(schema_file)

In [None]:
# Pull the two encodings schema sections out of the loaded YAML document.
# A missing key raises KeyError here rather than failing later.
encodings_python = table_schema['encodings_python_schema']
encodings_parquet = table_schema['encodings_parquet_schema']

In [None]:
# Print one "name = type" line per field of the python-side encodings schema.
for field in encodings_python:
    field_name, field_type = field['name'], field['type']
    print(f"{field_name} = {field_type}")

In [None]:

reload_encodings = True
#  Need to refactor code updates to take last_updated into consideration and add a view to bq to only keep one record per encoding_id

# Reference tables are always fetched; 'encodings' is prepended only on a full reload.
avs_tables = ['encoders', 'encoder_groups', 'formats', 'customers', 'profiles', 'aeismaps']
if reload_encodings:
    avs_tables.insert(0, 'encodings')

# Result is indexed by table name further down (avs_data['encoders'], ...).
avs_data = core_functions.fetch_table_data(
    project_id=veil_billing.get('avs_project_id'),
    dataset_id=veil_billing.get('avs_dataset_id'),
    table_names=avs_tables,
    bigquery_client=bigquery_client,
)

# Incremental-load query, used only when reload_encodings is False.
# It returns (a) encodings whose encoding_id is not yet in encodings_complete,
# plus (b) encodings whose last_updated is later than the newest
# billing_last_updated in encodings_complete and that are not already in (a).
# NOTE(review): project/dataset names are hard-coded in the string rather than
# taken from veil_billing — confirm this is intentional.
encoding_sql = f"""
    with includedEncodings as (
    select distinct encoding_id from `adhoc-billing.avs_billing_process.encodings_complete` 

    ),
    lastUpdated as (
    select 
    max(billing_last_updated) 
    -- max(TIMESTAMP_seconds(cast((last_updated/1000000000) as int)))
    as billing_last_updated
    from `adhoc-billing.avs_billing_process.encodings_complete` 
    ),
    newEncodings AS (
    select e.* from `bigquery-sandbox-393916.prod_avs.encodings` e
    left join includedEncodings ie
    using (encoding_id)
    where ie.encoding_id is null
    )
    select * from newEncodings
    union all
    select e.* from `bigquery-sandbox-393916.prod_avs.encodings` e
    where cast(e.last_updated as timestamp) > (select billing_last_updated from lastUpdated)
    and encoding_id not in (select encoding_id from newEncodings)
"""
if reload_encodings:
    # Full reload: read every encoded row from the master table.
    encodings_sql = f""" select * from `adhoc-billing.avs_billing_process.avs_encodings_master` where status = 'encoded'"""
    encodings_df = core_functions.fix_df_dtypes(
        core_functions.fetch_gbq_data(encodings_sql, bigquery_client)
    )
    # aeis_id is removed when present; errors='ignore' makes this a no-op otherwise.
    encodings_df.drop(columns=['aeis_id'], inplace=True, errors='ignore')
else:
    # Incremental load: run the delta query and discard rows without an encoded_timestamp.
    encodings_df = core_functions.fix_df_dtypes(
        core_functions.fetch_gbq_data(encoding_sql, bigquery_client).dropna(subset=['encoded_timestamp'])
    )

# Keep only encoded rows, ordered by format then encoding, with a fresh 0..n-1 index.
encodings_df = (
    encodings_df.loc[encodings_df['status'] == 'encoded']
    .sort_values(by=['format_id', 'encoding_id'], ascending=[True, True])
    .copy()
    .reset_index(drop=True)
)
encodings_df['segments_format_id_group'] = encodings_df['format_id'].apply(core_functions.assign_segment_group)

# Normalise dtypes of each AVS reference table (same order as the original assignments).
encoders_df, encoder_groups_df, formats_df, customers_df, profiles_df, aeismaps_df = [
    core_functions.fix_df_dtypes(avs_data[table_name])
    for table_name in ('encoders', 'encoder_groups', 'formats', 'customers', 'profiles', 'aeismaps')
]

# Sorted list of the distinct format_ids present in the encodings.
sorted_format_ids = encodings_df['format_id'].sort_values()
encoding_format_ids = sorted_format_ids.unique().tolist()


In [None]:
encodings_df



In [None]:
encodings_df.dtypes


In [None]:

# test_mask = encodings_df['format_id'] == 13568
# encodings_df = encodings_df[test_mask].copy()
# encoding_format_ids
len(encodings_df)
# 1666255
# 2070-01-01 05:00:00+00:00

In [None]:

# Salesforce-derived tables needed for billing.
billing_tables = [
    'sfdc_bvs_customer__c_obj',
    'sfdc_bvs_format__c_obj',
    'sfdc_account_obj',
    'sfdc_advertiser__c_obj',
    'sfdc_rate_card__c_obj',
]
billing_data = core_functions.fetch_table_data(
    project_id=veil_billing.get('billing_project_id'),
    dataset_id=veil_billing.get('billing_dataset_id'),
    table_names=billing_tables,
    bigquery_client=bigquery_client,
)

# Normalise dtypes of each fetched table (same order as the original assignments).
(
    sfdc_bvs_customer_df,
    sfdc_account_df,
    sfdc_advertiser_df,
    sfdc_rate_card_df,
    sfdc_bvs_format_df,
) = [
    core_functions.fix_df_dtypes(billing_data[table_name])
    for table_name in (
        'sfdc_bvs_customer__c_obj',
        'sfdc_account_obj',
        'sfdc_advertiser__c_obj',
        'sfdc_rate_card__c_obj',
        'sfdc_bvs_format__c_obj',
    )
]

In [None]:


# Run the pre-prep step, then normalise dtypes of its result.
pre_prepped = core_functions.pre_prep_dataframe(encodings_df)
prepped_encodings_df = core_functions.fix_df_dtypes(pre_prepped)

In [None]:

print(prepped_encodings_df.dtypes)

In [None]:
len(encoding_format_ids)

In [None]:
len(prepped_encodings_df)

prepped_encodings_df

In [None]:
# Working copy ordered by format then encoding, with a fresh 0..n-1 index.
processed_encodings_df = (
    prepped_encodings_df
    .sort_values(['format_id', 'encoding_id'])
    .reset_index(drop=True)
    .copy()
)

In [None]:
# Placeholder so the name exists; the batch-processing code later in this cell
# assigns the final value.
clean_processed_encodings_df = pd.DataFrame()

# Earlier one-format_id-at-a-time implementation, kept for reference only.
# It is superseded by the batched loop further down in this cell.
# for format_id in encoding_format_ids:
#     if format_id == 0:
#         continue
#     else:
#         mask = processed_encodings_df['format_id'] == format_id
#         print(f"Processing format_id: {format_id}...")
#         format_df = processed_encodings_df.loc[processed_encodings_df['format_id'] == format_id].copy()
#         # df = core_functions.clean_encodings_df(format_df)
#         df = format_df
#         clean_processed_encodings_df = pd.concat([clean_processed_encodings_df, df], ignore_index=True)

# Whole-frame alternative (also disabled):
# clean_processed_encodings_df = core_functions.clean_encodings_df(processed_encodings_df)
# clean_processed_encodings_df

from itertools import islice

def chunks(lst, n):
    """Yield successive n-sized chunks from lst.

    Args:
        lst: A sliceable sequence (e.g. a list).
        n: Maximum chunk length; must be a positive integer.

    Yields:
        Consecutive slices of ``lst`` of length ``n``; the last slice may be
        shorter. Nothing is yielded for an empty ``lst``.

    Raises:
        ValueError: If ``n`` is not positive. Previously a negative ``n``
            silently produced no chunks, which dropped all the data.
    """
    if n <= 0:
        raise ValueError(f"chunk size must be a positive integer, got {n!r}")
    for start in range(0, len(lst), n):
        yield lst[start:start + n]

# Number of format_ids handled per batch.
batch_size = 1000

# Distinct format_ids in ascending order; format_id 0 is excluded up front so
# the loop does not need to check for it.
encoding_format_ids = encodings_df['format_id'].sort_values().unique().tolist()
encoding_format_ids = [fid for fid in encoding_format_ids if fid != 0]

# Lazily split the format_id list into batches.
format_id_batches = chunks(encoding_format_ids, batch_size)

# Collect each cleaned batch and concatenate ONCE after the loop. Concatenating
# inside the loop re-copies all earlier rows on every iteration (quadratic), and
# seeding with an empty DataFrame triggers pandas' empty-entry FutureWarning.
cleaned_batches = []
for batch_num, format_id_batch in enumerate(format_id_batches, start=1):
    print(f"Processing batch {batch_num} with {len(format_id_batch)} format_ids...")

    # Rows belonging to the format_ids of this batch.
    mask = processed_encodings_df['format_id'].isin(format_id_batch)
    format_df = processed_encodings_df.loc[mask].copy()

    # Hook for the cleaning step (currently disabled, so rows pass through unchanged).
    # df = core_functions.clean_encodings_df(format_df)
    df = format_df

    cleaned_batches.append(df)

if cleaned_batches:
    clean_processed_encodings_df = pd.concat(cleaned_batches, ignore_index=True)
else:
    # No batches: keep the column layout so later cells can still select columns.
    clean_processed_encodings_df = processed_encodings_df.iloc[0:0].copy()

print("Batch processing completed.")



In [None]:
# Report the post-prep dtype of every original encodings column.
# The loop walks encodings_df's columns but reads from processed_encodings_df,
# so a column that the prep step removed or renamed used to raise KeyError;
# it is now reported explicitly instead.
for col in encodings_df.columns:
    if col in processed_encodings_df.columns:
        print(f"{col}: {processed_encodings_df[col].dtype}")
    else:
        print(f"{col}: <not present in processed_encodings_df>")

In [None]:
# Start from the batch-processed frame and expose selected attributes_* columns
# under short names, with missing values replaced by empty strings.
processed_encodings_df = clean_processed_encodings_df.copy()

attribute_names = [
    'description',
    'product_code',
    'product_name',
    'donovan_agency_product_code',
    'isci',
    'project_name',
    'advertiser',
    'client_code',
    'donovan_agency_advertiser_code',
]
for attribute_name in attribute_names:
    source_column = f'attributes_{attribute_name}'
    processed_encodings_df[attribute_name] = processed_encodings_df[source_column].fillna('')



In [None]:
clean_processed_encodings_df

In [None]:
# Derive product_code, isci and advertiser by taking the first usable source:
# the dedicated attribute columns in priority order, then fixed-position slices
# of the description.
#
# Two defects of the previous version are fixed here:
#   1. `a & b & s.str.len() > 10 & ~c` was parsed as `(a & b & s.str.len()) > (10 & ~c)`
#      because `&` binds tighter than `>`; the comparison is now parenthesised.
#   2. The source columns were filled with '' in the previous cell, so `.notnull()`
#      was always True and every fallback branch was unreachable. A value now
#      counts as present only when it is non-null AND not blank.


def _has_text(series):
    """Return a boolean Series: True where the value is non-null and not blank."""
    return series.fillna('').astype(str).str.strip().ne('')


def _coalesce_from_sources(frame, source_columns, generic_slice, tv_ra_slice):
    """Pick, per row, the first usable value among source_columns, else a description slice.

    Args:
        frame: DataFrame holding `source_columns` and a 'description' column.
        source_columns: Column names in priority order.
        generic_slice: Slice of the description used when it does NOT start with 'TV'/'RA'.
        tv_ra_slice: Slice of the description used when it starts with 'TV' or 'RA'.

    Returns:
        Object ndarray with one value per row; None where nothing applies.
    """
    description = frame['description']
    # The description is only parsed when it is longer than 10 characters.
    description_usable = description.notnull() & (description.str.len() > 10)
    starts_tv_ra = description.str.startswith(('TV', 'RA'), na=False).astype(bool)

    # np.select takes the first matching condition, so each later condition
    # implicitly means "all earlier sources were empty".
    conditions = [_has_text(frame[column]) for column in source_columns]
    choices = [frame[column] for column in source_columns]
    conditions += [description_usable & ~starts_tv_ra, description_usable & starts_tv_ra]
    choices += [
        description.str[generic_slice].str.strip(),
        description.str[tv_ra_slice].str.strip(),
    ]
    return np.select(conditions, choices, default=None)


# product_code <- product_code, product_name, donovan_agency_product_code, description[26:30] / [6:10]
processed_encodings_df['product_code'] = _coalesce_from_sources(
    processed_encodings_df,
    ['product_code', 'product_name', 'donovan_agency_product_code'],
    generic_slice=slice(26, 30),
    tv_ra_slice=slice(6, 10),
)

# isci <- isci, project_name, description[8:18] / [18:38]
processed_encodings_df['isci'] = _coalesce_from_sources(
    processed_encodings_df,
    ['isci', 'project_name'],
    generic_slice=slice(8, 18),
    tv_ra_slice=slice(18, 38),
)

# advertiser <- advertiser, client_code, donovan_agency_advertiser_code, description[22:26] / [2:6]
processed_encodings_df['advertiser'] = _coalesce_from_sources(
    processed_encodings_df,
    ['advertiser', 'client_code', 'donovan_agency_advertiser_code'],
    generic_slice=slice(22, 26),
    tv_ra_slice=slice(2, 6),
)
processed_encodings_df

In [None]:

# Clean each Salesforce frame, naming the id/name columns the helper should use.
clean_sfdc_bvs_customer_df = core_functions.clean_sfdc_df(
    sfdc_bvs_customer_df,
    id_col='sfdc_bvs_customer_id',
    name_col='sfdc_bvs_customer_name',
)
clean_sfdc_account_df = core_functions.clean_sfdc_df(
    sfdc_account_df,
    id_col='sfdc_account_id',
    name_col='sfdc_account_name',
)
clean_sfdc_advertiser_df = core_functions.clean_sfdc_df(
    sfdc_advertiser_df,
    id_col='sfdc_advertiser_id',
    name_col='sfdc_advertiser_name',
)
clean_sfdc_rate_card_df = core_functions.clean_sfdc_df(
    sfdc_rate_card_df,
    id_col='sfdc_rate_card_id',
    name_col='rate_card_name',
)

In [None]:
# processed_encodings_df['attributes_product_code']
processed_encodings_df.head()


In [None]:
clean_sfdc_advertiser_df['sfdc_advertiser_id']

In [None]:
# Order accounts and advertisers in place; later cells read these frames in this order.
clean_sfdc_account_df.sort_values(by='sfdc_account_id', inplace=True)

advertiser_sort_keys = [
    'sfdc_account_id',
    'sfdc_advertiser_id',
    'encoding_format_id',
    'encoding_advertiser',
    'encoding_product_code',
    'encoding_module_code',
]
clean_sfdc_advertiser_df.sort_values(by=advertiser_sort_keys, inplace=True)

# Advertiser -> account (left join), reduced to the configured column list.
advertiser_with_account = clean_sfdc_advertiser_df.merge(
    clean_sfdc_account_df,
    how='left',
    on='sfdc_account_id',
    suffixes=('_adv', '_acc'),
)
clean_sfdc_adv_account_df = advertiser_with_account[sfdc_adv_account_cols].copy()

# Rate cards reduced to the configured column list.
clean_sfdc_rate_card_df = clean_sfdc_rate_card_df[sfdc_rate_card_cols].copy()

# Advertiser/account -> rate card (left join) with a fresh 0..n-1 index.
clean_sfdc_adv_account_rate_card_df = (
    clean_sfdc_adv_account_df[sfdc_adv_account_cols]
    .merge(
        clean_sfdc_rate_card_df[sfdc_rate_card_cols],
        how='left',
        on='sfdc_rate_card_id',
        suffixes=('_adv', '_rc'),
    )
    .copy()
    .reset_index(drop=True)
)


# .columns.to_list()

In [None]:
# attributes_list = ['product_code', 'product_name', 'donovan_agency_product_code', 'description', 'isci', 'project_name', 'advertiser', 'client_code',
#                    'cable_estimate', 'spot_estimate', 'campaign', 'audience', 'audience2', 'category', 'comercial_id', 'contour_id', 'creative_offer',
#                    'donovan_agency_advertiser_code', 'donovan_agency_estimate_code', 'eid', 'group', 'hd_sd', 'id', 'length', 'lob', 'media_type',
#                    'message', 'misc', 'module_code', 'offer', 'offer_2', 'phone_number', 'quality', 'revision', 'show_name', 'slug', 'sport_id',
#                    'sport_show_sub_category', 'spot_name', 'tag', 'text', 'title', 'veil_id', 'version_name', 'year']
# for attr in attributes_list:
#     encodings_df[attr] = encodings_df['attributes'].apply(lambda x: x.get(attr))
# encodings_df['product_code'] = encodings_df['attributes'].apply(lambda x: x.get('product_code'))

# Define conditions
# Ensure 'description' column does not contain None values


#  new cell

# sfdc_account_df['sfdc_account_id'] = sfdc_account_df['Id']
# sfdc_account_df['sfdc_account_name'] = sfdc_account_df['Name']
# sfdc_bvs_customer_df['customer_id'] = sfdc_bvs_customer_df['customer_id__c'].astype(int)
# sfdc_bvs_customer_df['sfdc_account_id'] = sfdc_bvs_customer_df['Account__c']
# Join Salesforce accounts to BVS customers (inner join on sfdc_account_id),
# keeping only the id/name columns needed downstream.
account_cols = ['sfdc_account_id', 'sfdc_account_name']
customer_cols = ['customer_id', 'sfdc_account_id']
clean_sfdc_cust_account_df = clean_sfdc_account_df[account_cols].merge(clean_sfdc_bvs_customer_df[customer_cols], on='sfdc_account_id', how='inner', suffixes=('_sfdc_account', '_sfdc_customer'))

# Bare expression: not the last statement of the cell, so nothing is displayed.
clean_sfdc_cust_account_df
# Attach the AVS customers table (inner join on customer_id). This runs BEFORE
# customers_df is passed to rename_columns below.
sfdc_bvs_cust_account_df = clean_sfdc_cust_account_df.merge(customers_df, left_on='customer_id', right_on='customer_id', how='inner', suffixes=('_sfdc', '_avs'))

# Apply a per-table prefix to each AVS reference frame.
# NOTE(review): the return values are ignored, so rename_columns presumably
# mutates its argument in place; if so, re-running this cell would apply the
# prefix twice and break the customer_id merge above — confirm against
# core_functions.rename_columns.
core_functions.rename_columns(encoder_groups_df, 'encoder_group_')
core_functions.rename_columns(encoders_df, 'encoder_')
core_functions.rename_columns(formats_df, 'format_')
core_functions.rename_columns(customers_df, 'customer_')
core_functions.rename_columns(profiles_df, 'profile_')
core_functions.rename_columns(aeismaps_df, 'aeis_')

# Attach encoder-group columns to every encoding (left join on encoder_group_id)
# and keep the first row per encoding_id.
encoders_groups_df = processed_encodings_df.merge(encoder_groups_df, left_on='encoder_group_id', right_on='encoder_group_id', how='left').drop_duplicates(subset=['encoding_id']).copy()
encoders_groups_df.head()


In [None]:
print(encoders_groups_df.dtypes)

In [None]:
len(encoders_groups_df.drop_duplicates(subset=['encoding_id']))

In [None]:
encoders_groups_df[encoders_groups_df['encoding_id'].isna()]

In [None]:
processed_encodings_df.head()
len(processed_encodings_df)

In [None]:
# Every processed_encodings_df column except the join key.
cols_in_processed_encodings_df = list(set(processed_encodings_df.columns.to_list()) - {'encoding_id'})

# Remove those columns from encoders_groups_df so the next merge does not duplicate them.
overlapping_columns = [
    column for column in cols_in_processed_encodings_df
    if column in encoders_groups_df.columns
]
encoders_groups_df = encoders_groups_df.drop(columns=overlapping_columns)
encoders_groups_df.head()



In [None]:
len(encoders_groups_df)

In [None]:

# Re-attach the encoder-group columns to each encoding (left join on encoding_id).
encodings_encoders_df = pd.merge(
    processed_encodings_df,
    encoders_groups_df,
    how='left',
    left_on='encoding_id',
    right_on='encoding_id',
    suffixes=('', '_dupe'),
)


In [None]:
encodings_encoders_df.head()
len(encodings_encoders_df)

In [None]:

# One aeis row per encoding: order in place, then keep the first row per aeis__encoding_id.
aeismaps_df.sort_values(by=['aeis__encoding_id', 'aeis_id'], inplace=True)
aeismaps_df.drop_duplicates(subset='aeis__encoding_id', inplace=True)
print(len(aeismaps_df))

# Encodings -> aeis map (left join).
encodings_aeis_df = encodings_encoders_df.merge(
    aeismaps_df,
    how='left',
    left_on='encoding_id',
    right_on='aeis__encoding_id',
)

# Formats -> customer/account -> profile (left joins).
formats_customers_df = formats_df.merge(
    sfdc_bvs_cust_account_df,
    how='left',
    left_on='format__customer_id',
    right_on='customer_id',
)
formats_customers_profiles_df = formats_customers_df.merge(
    profiles_df,
    how='left',
    left_on='format__profile_id',
    right_on='profile_id',
)

# Encodings -> format bundle. Columns that clash get the '_drop' suffix on the
# right-hand side and are removed straight away.
encodings_bvs_df = encodings_aeis_df.merge(
    formats_customers_profiles_df,
    on='format_id',
    how='left',
    suffixes=('', '_drop'),
)
clashing_columns = [column for column in encodings_bvs_df.columns if column.endswith('_drop')]
encodings_bvs_df.drop(columns=clashing_columns, inplace=True)

# Build the "<advertiser>-<product_code>-<campaign>" key with spaces replaced by '_'.
encodings_bvs_df['ad_prod_campaign'] = None  # create the column first so column order is unchanged
encodings_bvs_df['advertiser'] = encodings_bvs_df['advertiser'].fillna('')
encodings_bvs_df['product_code'] = encodings_bvs_df['product_code'].fillna('')
encodings_bvs_df['campaign'] = encodings_bvs_df['attributes_campaign'].fillna('')

# Vectorised replacement for the previous row-wise apply(axis=1). The old
# `pd.isnull(row['ad_prod_campaign'])` guard was always True because the column
# had just been set to None, so every row takes the formatted value.
# astype(str) lets non-string values be formatted instead of raising.
encodings_bvs_df['ad_prod_campaign'] = (
    encodings_bvs_df['advertiser'].astype(str).str.strip()
    + '-'
    + encodings_bvs_df['product_code'].astype(str).str.strip()
    + '-'
    + encodings_bvs_df['campaign'].astype(str).str.strip()
).str.replace(' ', '_', regex=False)

# Order rows so the following drop_duplicates(keep='last') keeps the most
# recently updated row per encoding_id.
encodings_bvs_df.sort_values(by=[ 'encoding_id', 'last_updated','sfdc_account_id', 'format_id', 'profile_id', 'customer_id'], inplace=True)

# new cell




In [None]:
# Keep a single row per encoding_id: the last one in the current row order.
encodings_bvs_df = encodings_bvs_df.drop_duplicates(subset=['encoding_id'], keep='last')

len(encodings_bvs_df)

In [None]:
# Final row order: by format, then encoding.
encodings_bvs_df = encodings_bvs_df.sort_values(by=['format_id', 'encoding_id'])

In [None]:
print(f"Starting with {len(encodings_bvs_df)} rows")
accounted_for = 0
print(f"Accounted for {accounted_for} rows")

In [None]:
clean_sfdc_advertiser_df['match_type'].head()

In [None]:
# Split clean_sfdc_advertiser_df into one frame per match_type and check that
# every row lands in exactly one of them. This replaces nine copy-pasted stanzas.


def _rows_with_match_type(advertisers, match_type):
    """Return a copy of the rows whose match_type equals `match_type`.

    Passing None selects the rows where match_type is null.
    """
    column = advertisers['match_type']
    selector = column.isna() if match_type is None else column == match_type
    return advertisers.loc[selector].copy()


# The bookkeeping is against the advertiser frame (the old message reported the
# size of encodings_bvs_df, which is unrelated to the counts below).
print(f"Starting with {len(clean_sfdc_advertiser_df)} rows")
accounted_for = 0
print(f"Accounted for {accounted_for} rows")

# Processing order is preserved from the original cell; None means "null match_type".
match_type_order = [
    'encoding_product_code_multiple',
    'encoding_product_code',
    'encoding_advertiser',
    'encoding_product_code_ignore_format',
    'encoding_advertiser_ignore_format',
    'encoding_format',
    'encoder_group',
    'Clone',
    None,
]

match_type_frames = {}
for match_type in match_type_order:
    subset = _rows_with_match_type(clean_sfdc_advertiser_df, match_type)
    print(len(subset))
    accounted_for += len(subset)
    print(f"Accounted for {accounted_for} rows")
    if match_type == 'encoding_format':
        print("finished normal matches")
    match_type_frames[match_type] = subset

# Names used by the matching cells further down.
encoding_product_code_multiple_sfdc_advertiser_df = match_type_frames['encoding_product_code_multiple']
encoding_product_code_sfdc_advertiser_df = match_type_frames['encoding_product_code']
encoding_advertiser_sfdc_advertiser_df = match_type_frames['encoding_advertiser']
encoding_product_code_ignore_format_sfdc_advertiser_df = match_type_frames['encoding_product_code_ignore_format']
encoding_advertiser_ignore_format_sfdc_advertiser_df = match_type_frames['encoding_advertiser_ignore_format']
encoding_format_sfdc_advertiser_df = match_type_frames['encoding_format']
encoder_group_sfdc_advertiser_df = match_type_frames['encoder_group']
clone_sfdc_advertiser_df = match_type_frames['Clone']
null_match_sfdc_advertiser_df = match_type_frames[None]

print(f"Started with {len(clean_sfdc_advertiser_df)} rows and accounted for {accounted_for} rows")

# Surface match types that none of the frames above cover.
known_match_types = [match_type for match_type in match_type_order if match_type is not None]
unhandled = clean_sfdc_advertiser_df.loc[
    clean_sfdc_advertiser_df['match_type'].notna()
    & ~clean_sfdc_advertiser_df['match_type'].isin(known_match_types),
    'match_type',
]
if len(unhandled):
    print(f"WARNING: {len(unhandled)} rows have unhandled match types: {sorted(unhandled.unique().tolist())}")

# Counts recorded from an earlier run (14327 rows in total):
#   encoding_format: 14174
#   encoder_group: 32
#   Clone: 2
#   encoding_product_code_multiple: 17
#   encoding_advertiser: 7
#   encoding_product_code: 37
#   encoding_advertiser_ignore_format: 45
#   null: 13

In [None]:
# Left-join encodings to the multi-product-code advertiser rules on format id,
# then keep only the rows that found an advertiser.
encodings_joined_on_format = encodings_bvs_df.merge(
    encoding_product_code_multiple_sfdc_advertiser_df,
    how='left',
    left_on='format_id',
    right_on='encoding_format_id',
    suffixes=('', '_encoding_format'),
)
encodings_bvs_encoding_product_code_multiple_sfdc_advertiser_df = encodings_joined_on_format.dropna(
    subset=['sfdc_advertiser_id']
)
print(f"Starting with {len(clean_sfdc_advertiser_df)} rows")
print(f"Accounted for {len(encoding_product_code_multiple_sfdc_advertiser_df)} rows")
# Example product_code_list value: ["ALDR", "ALGM", "ALHP"]

# Names of the match-type frames already applied; the cells below append to it
# and skip themselves when re-run.
processed = []


In [None]:
processed

In [None]:
# new cell

# Turn the comma-separated product_code_list string into a Python list per row.
# Non-string values (e.g. null) used to raise AttributeError on .split; they now
# become an empty list, which the later explode() turns into a single NaN row.
encoding_product_code_multiple_sfdc_advertiser_df['product_code_list_list'] = (
    encoding_product_code_multiple_sfdc_advertiser_df['product_code_list']
    .apply(lambda value: value.split(',') if isinstance(value, str) else [])
)



In [None]:
# new cell

# Filter the main DataFrame to include only relevant sfdc_account_ids
# Assign advertisers to encodings using the rules that list several product
# codes per format ('encoding_product_code_multiple').
meta_df = encoding_product_code_multiple_sfdc_advertiser_df
df_name = 'encoding_product_code_multiple_sfdc_advertiser_df'
if df_name in processed:
    print(f"Already processed {df_name}")
else:
    print(f"Processing {df_name}...")

    # Only encodings belonging to accounts that have such a rule are candidates.
    sfdc_account_ids = meta_df['sfdc_account_id'].unique().tolist()
    working_df = encodings_bvs_df[encodings_bvs_df['sfdc_account_id'].isin(sfdc_account_ids)].copy()

    # Null product codes become '' so they can take part in the join key.
    working_df['product_code'] = working_df['product_code'].fillna('')
    # The advertiser id is re-derived below; drop any stale column first.
    if 'sfdc_advertiser_id' in working_df.columns:
        working_df.drop(columns=['sfdc_advertiser_id'], inplace=True)

    # One row per (rule, product code).
    encoding_expanded_df = meta_df.explode('product_code_list_list')
    # Remove quotes, then surrounding whitespace and list brackets, so that a
    # value such as ["ALDR", "ALGM"] yields ALDR / ALGM rather than `[ALDR` / ` ALGM]`.
    # NOTE(review): assumes product codes never legitimately start or end with
    # '[', ']' or a space — confirm against the Salesforce data.
    encoding_expanded_df['product_code_list_list'] = (
        encoding_expanded_df['product_code_list_list']
        .str.replace('"', '', regex=False)
        .str.strip(' []')
    )
    encoding_expanded_df['encoding_format_id'] = encoding_expanded_df['encoding_format_id'].astype('Int64')
    # Rules without an advertiser cannot match; keep the first rule per (format, product code).
    encoding_expanded_df = (
        encoding_expanded_df
        .dropna(subset=['sfdc_advertiser_id'])
        .drop_duplicates(subset=['encoding_format_id', 'product_code_list_list'], keep='first')
    )

    # Align the key dtype with the rule frame before joining.
    working_df['format_id'] = working_df['format_id'].astype('Int64')
    merged_df = working_df.merge(
        encoding_expanded_df,
        left_on=['format_id', 'product_code'],
        right_on=['encoding_format_id', 'product_code_list_list'],
        how='left', suffixes=('', '_encoding_format')
    )
    # Explicit copy so the dtype cast below does not write into a slice.
    merged_df = merged_df.dropna(subset=['sfdc_advertiser_id']).copy()
    merged_df['encoding_id'] = merged_df['encoding_id'].astype('Int64')
    working_df['encoding_id'] = working_df['encoding_id'].astype('Int64')

    print(f"Starting with {len(working_df)} rows")
    # Bring only the resolved advertiser id back onto the candidate rows.
    working_df2 = working_df.merge(merged_df[['encoding_id', 'sfdc_advertiser_id']], on='encoding_id', how='left')
    working_df2 = working_df2.dropna(subset=['sfdc_advertiser_id'])
    # Reported after the dropna so the number is the count of matched rows
    # (it used to be printed before filtering and always equalled the input size).
    print(f"Accounted for {len(working_df2)} rows")

    # First matcher to run: its result seeds the accumulated frame. A plain
    # re-index replaces the old concat with an empty object-typed frame, which
    # pandas warns about and which can degrade column dtypes.
    new_encodings_bvs_df = working_df2.reset_index(drop=True)
    new_encodings_encodings_ids = new_encodings_bvs_df['encoding_id'].unique().tolist()
    processed.append(df_name)



In [None]:
processed

In [None]:
# new cell

# Assign advertisers using the single-product-code rules ('encoding_product_code'),
# matching on account, format and product code.
meta_df = encoding_product_code_sfdc_advertiser_df
df_name = 'encoding_product_code_sfdc_advertiser_df'
if df_name in processed:
    print(f"Already processed {df_name}")
else:
    # Candidates: encodings of accounts that have such a rule and were not
    # matched by an earlier matcher.
    sfdc_account_ids = meta_df['sfdc_account_id'].unique().tolist()
    working_df = encodings_bvs_df[(encodings_bvs_df['sfdc_account_id'].isin(sfdc_account_ids)) & ~(encodings_bvs_df['encoding_id'].isin(new_encodings_encodings_ids))].copy()

    # Null product codes become '' so they can take part in the join key.
    working_df['product_code'] = working_df['product_code'].fillna('')
    if 'sfdc_advertiser_id' in working_df.columns:
        working_df.drop(columns=['sfdc_advertiser_id'], inplace=True)

    # The rule's own product code must become the 'product_code' join key.
    # The old code renamed 'enc_product_code', which silently does nothing when
    # that column is absent and then fails on ['product_code'] below; fall back
    # to 'encoding_product_code' in that case.
    # NOTE(review): confirm which of the two columns clean_sfdc_df actually emits.
    product_code_source = (
        'enc_product_code' if 'enc_product_code' in meta_df.columns else 'encoding_product_code'
    )
    encoding_expanded_df = (
        meta_df
        .drop(columns=['product_code'])
        .rename(columns={'encoding_format_id': 'format_id', product_code_source: 'product_code'})
        .dropna(subset=['sfdc_advertiser_id'])
        .copy()
    )
    encoding_expanded_slim_df = encoding_expanded_df[['sfdc_account_id', 'sfdc_advertiser_id', 'format_id', 'product_code']].copy()

    # NOTE(review): the rule frame is not de-duplicated on the join keys, so two
    # rules with the same (account, format, product code) would duplicate an
    # encoding here — verify this cannot happen.
    merged_df = working_df.merge(encoding_expanded_slim_df, on=['sfdc_account_id', 'format_id', 'product_code'], how='left')
    merged_df.dropna(subset=['sfdc_advertiser_id'], inplace=True)

    # Append the new matches to the accumulated result.
    new_encodings_bvs_df = pd.concat([new_encodings_bvs_df, merged_df], ignore_index=True)
    new_encodings_encodings_ids = new_encodings_bvs_df['encoding_id'].unique().tolist()

    processed.append(df_name)

In [None]:
# new cell

# encoding_product_code_ignore_format
# Assign advertisers using the 'encoding_product_code_ignore_format' rules,
# matching on account and product code only (format is ignored).
meta_df = encoding_product_code_ignore_format_sfdc_advertiser_df
df_name = 'encoding_product_code_ignore_format_sfdc_advertiser_df'
if df_name in processed:
    print(f"Already processed {df_name}")
else:
    # Candidates: encodings of accounts that have such a rule and were not
    # matched by an earlier matcher.
    sfdc_account_ids = meta_df['sfdc_account_id'].unique().tolist()
    working_df = encodings_bvs_df[(encodings_bvs_df['sfdc_account_id'].isin(sfdc_account_ids)) & ~(encodings_bvs_df['encoding_id'].isin(new_encodings_encodings_ids))].copy()

    # Null product codes become '' so they can take part in the join key.
    working_df['product_code'] = working_df['product_code'].fillna('')
    if 'sfdc_advertiser_id' in working_df.columns:
        working_df.drop(columns=['sfdc_advertiser_id'], inplace=True)

    # The rule's own product code must become the 'product_code' join key.
    # The old code renamed 'enc_product_code', which silently does nothing when
    # that column is absent and then fails on ['product_code'] below; fall back
    # to 'encoding_product_code' in that case.
    # NOTE(review): confirm which of the two columns clean_sfdc_df actually emits.
    product_code_source = (
        'enc_product_code' if 'enc_product_code' in meta_df.columns else 'encoding_product_code'
    )
    encoding_expanded_df = (
        meta_df
        .drop(columns=['product_code'])
        .rename(columns={product_code_source: 'product_code'})
        .dropna(subset=['sfdc_advertiser_id'])
        .copy()
    )
    encoding_expanded_slim_df = encoding_expanded_df[['sfdc_account_id', 'sfdc_advertiser_id',  'product_code']].copy()

    # NOTE(review): the rule frame is not de-duplicated on the join keys, so two
    # rules with the same (account, product code) would duplicate an encoding
    # here — verify this cannot happen.
    merged_df = working_df.merge(encoding_expanded_slim_df, on=['sfdc_account_id',  'product_code'], how='left')
    merged_df.dropna(subset=['sfdc_advertiser_id'], inplace=True)

    # Append the new matches to the accumulated result.
    new_encodings_bvs_df = pd.concat([new_encodings_bvs_df, merged_df], ignore_index=True)
    new_encodings_encodings_ids = new_encodings_bvs_df['encoding_id'].unique().tolist()

    processed.append(df_name)

In [None]:
# new cell

# encoding_advertiser
# Pass that assigns sfdc_advertiser_id by matching on
# (sfdc_account_id, format_id, advertiser).
meta_df = encoding_advertiser_sfdc_advertiser_df
df_name = 'encoding_advertiser_sfdc_advertiser_df'
if df_name in processed:
    print(f"Already processed {df_name}")
else:
    # Candidate rows: encodings whose account appears in the mapping and that no
    # earlier pass has already placed in new_encodings_bvs_df.
    sfdc_account_ids = meta_df['sfdc_account_id'].unique().tolist()
    working_df = encodings_bvs_df[(encodings_bvs_df['sfdc_account_id'].isin(sfdc_account_ids)) & ~(encodings_bvs_df['encoding_id'].isin(new_encodings_encodings_ids))].copy()

    # Ensure 'product_code' column is filled with empty strings for null values.
    # This used to assign working_df['advertiser'] here, which replaced the real
    # product codes with advertiser names in every row this pass sends to the output.
    # The join below is on 'advertiser', so the set of matched rows is unchanged.
    working_df['product_code'] = working_df['product_code'].fillna('')
    if 'sfdc_advertiser_id' in working_df.columns:
        working_df.drop(columns=['sfdc_advertiser_id'], inplace=True)

    encoding_expanded_df = meta_df.copy()
    # Result is discarded; the lookup raises KeyError if any listed column is absent.
    encoding_expanded_df[['sfdc_account_id', 'sfdc_advertiser_name', 'sfdc_advertiser_id', 'encoding_format_id', 'enc_advertiser']]
    # Align the mapping's column names with the encodings frame.
    encoding_expanded_df.rename(columns={'encoding_format_id': 'format_id'}, inplace=True)
    encoding_expanded_df.drop(columns=['product_code'], inplace=True)
    encoding_expanded_df.rename(columns={'enc_advertiser': 'advertiser'}, inplace=True)
    encoding_expanded_df = encoding_expanded_df.dropna(subset=['sfdc_advertiser_id']).copy()
    encoding_expanded_slim_df = encoding_expanded_df[['sfdc_account_id', 'sfdc_advertiser_id', 'format_id', 'advertiser']].copy()

    # Left merge followed by dropna keeps only the encodings that found an advertiser.
    merged_df = working_df.merge(encoding_expanded_slim_df, on=['sfdc_account_id', 'format_id', 'advertiser'], how='left')
    merged_df.dropna(subset=['sfdc_advertiser_id'], inplace=True)

    # Accumulate the matches and refresh the id list later passes use to skip them.
    new_encodings_bvs_df = pd.concat([new_encodings_bvs_df, merged_df], ignore_index=True)
    new_encodings_encodings_ids = new_encodings_bvs_df['encoding_id'].unique().tolist()

    processed.append(df_name)

In [None]:
# new cell

# encoding_advertiser_ignore_format
# Pass that assigns sfdc_advertiser_id by matching on (sfdc_account_id, advertiser);
# format_id is not part of the join key in this pass.
meta_df = encoding_advertiser_ignore_format_sfdc_advertiser_df
df_name = 'encoding_advertiser_ignore_format_sfdc_advertiser_df'
if df_name in processed:
    print(f"Already processed {df_name}")
else:
    # Candidate rows: encodings whose account appears in the mapping and that no
    # earlier pass has already placed in new_encodings_bvs_df.
    sfdc_account_ids = meta_df['sfdc_account_id'].unique().tolist()
    working_df = encodings_bvs_df[(encodings_bvs_df['sfdc_account_id'].isin(sfdc_account_ids)) & ~(encodings_bvs_df['encoding_id'].isin(new_encodings_encodings_ids))].copy()

    # Ensure 'product_code' column is filled with empty strings for null values.
    # This used to assign working_df['advertiser'] here, which replaced the real
    # product codes with advertiser names in every row this pass sends to the output.
    # The join below is on 'advertiser', so the set of matched rows is unchanged.
    working_df['product_code'] = working_df['product_code'].fillna('')
    if 'sfdc_advertiser_id' in working_df.columns:
        working_df.drop(columns=['sfdc_advertiser_id'], inplace=True)

    encoding_expanded_df = meta_df.copy()
    # Result is discarded; the lookup raises KeyError if any listed column is absent.
    encoding_expanded_df[['sfdc_account_id', 'sfdc_advertiser_name', 'sfdc_advertiser_id',  'enc_advertiser']]
    # Align the mapping's column names with the encodings frame.
    encoding_expanded_df.drop(columns=['product_code'], inplace=True)
    encoding_expanded_df.rename(columns={'enc_advertiser': 'advertiser'}, inplace=True)
    encoding_expanded_df = encoding_expanded_df.dropna(subset=['sfdc_advertiser_id']).copy()
    encoding_expanded_slim_df = encoding_expanded_df[['sfdc_account_id', 'sfdc_advertiser_id',  'advertiser']].copy()

    # Left merge followed by dropna keeps only the encodings that found an advertiser.
    merged_df = working_df.merge(encoding_expanded_slim_df, on=['sfdc_account_id', 'advertiser'], how='left')
    merged_df.dropna(subset=['sfdc_advertiser_id'], inplace=True)

    # Accumulate the matches and refresh the id list later passes use to skip them.
    new_encodings_bvs_df = pd.concat([new_encodings_bvs_df, merged_df], ignore_index=True)
    new_encodings_encodings_ids = new_encodings_bvs_df['encoding_id'].unique().tolist()

    processed.append(df_name)

In [None]:
# new cell

# encoding_format
# Pass that assigns sfdc_advertiser_id by matching on (sfdc_account_id, format_id).
meta_df = encoding_format_sfdc_advertiser_df
df_name = 'encoding_format_sfdc_advertiser_df'
if df_name in processed:
    print(f"Already processed {df_name}")
else:
    # Candidate rows: encodings whose account appears in the mapping and that no
    # earlier pass has already placed in new_encodings_bvs_df.
    sfdc_account_ids = meta_df['sfdc_account_id'].unique().tolist()
    working_df = encodings_bvs_df[(encodings_bvs_df['sfdc_account_id'].isin(sfdc_account_ids)) & ~(encodings_bvs_df['encoding_id'].isin(new_encodings_encodings_ids))].copy()

    # Ensure 'product_code' column is filled with empty strings for null values.
    # This used to assign working_df['advertiser'] here, which replaced the real
    # product codes with advertiser names in every row this pass sends to the output.
    # Neither column is a join key in this pass, so the matched rows are unchanged.
    working_df['product_code'] = working_df['product_code'].fillna('')
    if 'sfdc_advertiser_id' in working_df.columns:
        working_df.drop(columns=['sfdc_advertiser_id'], inplace=True)

    encoding_expanded_df = meta_df.copy()
    # Result is discarded; the lookup raises KeyError if any listed column is absent.
    encoding_expanded_df[['sfdc_account_id', 'sfdc_advertiser_name', 'sfdc_advertiser_id', 'encoding_format_id']]
    # Align the mapping's column name with the encodings frame.
    encoding_expanded_df.rename(columns={'encoding_format_id': 'format_id'}, inplace=True)
    encoding_expanded_df = encoding_expanded_df.dropna(subset=['sfdc_advertiser_id']).copy()
    encoding_expanded_slim_df = encoding_expanded_df[['sfdc_account_id', 'sfdc_advertiser_id', 'format_id']].copy()

    # Left merge followed by dropna keeps only the encodings that found an advertiser.
    merged_df = working_df.merge(encoding_expanded_slim_df, on=['sfdc_account_id', 'format_id'], how='left')
    merged_df.dropna(subset=['sfdc_advertiser_id'], inplace=True)

    # Accumulate the matches and refresh the id list used to skip them later.
    new_encodings_bvs_df = pd.concat([new_encodings_bvs_df, merged_df], ignore_index=True)
    new_encodings_encodings_ids = new_encodings_bvs_df['encoding_id'].unique().tolist()
    # Value noted on an earlier run, presumably len(new_encodings_bvs_df): 1686537

    processed.append(df_name)

In [None]:
# new cell

# Sort the matched encodings in place by format_id, then encoding_id.
new_encodings_bvs_df.sort_values(by=['format_id', 'encoding_id'], inplace=True)



In [None]:
# new cell

# new_encodings_bvs_merge_df = new_encodings_bvs_df[['encoding_id', 'sfdc_advertiser_id']].drop_
# new_encodings_bvs_merge_df
# encodings_bvs_df_to_write = encodings_bvs_df.merge(new_encodings_bvs_merge_df, on='encoding_id', how='left')
# encodings_bvs_df_to_write
# Recombine the encodings that no pass matched (blank advertiser id) with the
# matched rows collected in new_encodings_bvs_df.
# Row counts noted on an earlier run (the notebook does not say which expression
# each one belongs to):
# 1686537
# 1686866
encodings_bvs_df['sfdc_advertiser_id'] = ''
unmatched_encodings = encodings_bvs_df[~encodings_bvs_df['encoding_id'].isin(new_encodings_encodings_ids)]
# ignore_index=True already yields a fresh 0..n-1 index.
encodings_bvs_df_to_write = pd.concat([unmatched_encodings, new_encodings_bvs_df], ignore_index=True)

# 

In [None]:


# encodings_bvs_df_to_write.sort_values(by=['sfdc_account_id'], inplace=True)
# Stamp every row with the same run timestamp (UTC, truncated to whole seconds)
# and the same audit id.
# pd.Timestamp.utcnow() is deprecated since pandas 2.2; Timestamp.now('UTC')
# returns the same tz-aware value.
billing_last_updated = pd.Timestamp.now('UTC').floor('s')
encodings_bvs_df_to_write['billing_last_updated'] = billing_last_updated
# generate_uuid is a core_functions helper; one id is shared by all rows of this run.
billing_last_audit_id = core_functions.generate_uuid()
encodings_bvs_df_to_write['billing_last_audit_id'] = billing_last_audit_id
# encodings_bvs_df_to_write



In [None]:
# Schema inspection for encodings_bvs_df_to_write. The helper lives in
# core_functions (not visible here); presumably it prints a parquet schema
# labelled 'encodings'.
core_functions.print_dataframe_parquet_schema(encodings_bvs_df_to_write, 'encodings')

In [None]:
# Schema inspection for encodings_bvs_df_to_write. The helper lives in
# core_functions (not visible here); presumably it prints a BigQuery schema
# as YAML labelled 'encodings'.
core_functions.print_dataframe_bigquery_schema_yaml(encodings_bvs_df_to_write, 'encodings')

In [None]:
# def process_encodings_segments(df):
#     df['segments_date'] = pd.to_datetime(df['encoded_timestamp']).dt.date
#     df['segments_day_of_week'] = pd.to_datetime(df['encoded_timestamp']).dt.day_name()
#     df['segments_device'] = 'tv'
#     df['segments_month'] = pd.to_datetime(df['encoded_timestamp']).dt.to_period('M')
#     df['segments_quarter'] = pd.to_datetime(df['encoded_timestamp']).dt.to_period('Q')
#     df['segments_week'] = pd.to_datetime(df['encoded_timestamp']).dt.to_period('W')
#     df['segments_year'] = pd.to_datetime(df['encoded_timestamp']).dt.to_period('Y')
#     return df





In [None]:
# Run the frame through core_functions.process_encodings_segments; presumably this
# adds the segments_* columns that later cells sort and partition on.
# NOTE(review): the helper is not visible here. If it mutates and returns its
# argument (as the commented-out draft in the preceding cell does), then
# encodings_bvs_df_to_write2 is the same object as encodings_bvs_df_to_write - confirm.
encodings_bvs_df_to_write2 = core_functions.process_encodings_segments(encodings_bvs_df_to_write)
# Last expression of the cell: displays the resulting frame.
encodings_bvs_df_to_write2

In [None]:
# Same schema inspection as before, now on the frame returned by
# process_encodings_segments (helper output format not visible here).
core_functions.print_dataframe_parquet_schema(encodings_bvs_df_to_write2, 'encodings')

In [None]:
# Cast the id columns to pandas' nullable Int64 dtype, then replace missing values
# with a per-column sentinel.
# NOTE(review): 'clone_of' is filled with -11 while the commented-out draft further
# down uses -1 for integer columns - confirm -11 is intentional.
int64_fill_values = {'clone_of': -11, 'customer_id': 0}
for id_col, fill_value in int64_fill_values.items():
    encodings_bvs_df_to_write2[id_col] = encodings_bvs_df_to_write2[id_col].astype('Int64').fillna(fill_value)

# Timestamp-like columns; only referenced by the disabled conversion loop below.
string_cols = [
    'encoded_timestamp',
    'last_updated',
    'detection_end_date',
    'encoder_group__last_updated',
    'aeis__last_updated',
    'format__last_updated',
    'profile__last_updated',
    'billing_last_updated',
]

# for col in encodings_bvs_df_to_write2.columns:
#     if encodings_bvs_df_to_write2[col].dtype == 'object':
#         encodings_bvs_df_to_write2[col] = encodings_bvs_df_to_write2[col].astype('string')
#     if col in string_cols:
#         encodings_bvs_df_to_write2[col] = encodings_bvs_df_to_write2[col].astype('string')

# Helper from core_functions (not visible here); presumably prints the frame's
# schema in the Python form used by table-schemas.yaml.
core_functions.print_dataframe_python_schema(encodings_bvs_df_to_write2, 'encodings')



In [None]:
# Print the dtype of every column after the casts above.
print(encodings_bvs_df_to_write2.dtypes)

In [None]:
# skipping to test
# Ordered list of output column names. In this part of the notebook it is only
# referenced by the commented-out cleanup loop in the next cell.
valid_final_cols = [
    # encoding core fields
    "encoding_id",
    "format_id",
    "encoder_group_id",
    "encoded_timestamp",
    "clone_of",
    "status",
    "last_updated",
    "last_audit_id",
    "encoder_id",
    "detection_end_date",
    "encoded_timestamp_epoch",
    # flattened attributes (plus length_in_seconds, kept at its original position)
    "attributes_advertiser",
    "attributes_audience",
    "attributes_audience_2",
    "attributes_cable_estimate",
    "attributes_campaign",
    "attributes_category",
    "attributes_client_code",
    "attributes_commercial_id",
    "attributes_contour_id",
    "attributes_creative_offer",
    "attributes_description",
    "attributes_donovan_agency_advertiser_code",
    "attributes_donovan_agency_estimate_code",
    "attributes_donovan_agency_product_code",
    "attributes_eid",
    "attributes_group",
    "attributes_hd_sd",
    "attributes_id",
    "attributes_isci",
    "length_in_seconds",
    "attributes_length",
    "attributes_lob",
    "attributes_media_type",
    "attributes_message",
    "attributes_misc",
    "attributes_module_code",
    "attributes_offer",
    "attributes_offer_2",
    "attributes_phone_number",
    "attributes_product_code",
    "attributes_product_name",
    "attributes_project_name",
    "attributes_quality",
    "attributes_revision",
    "attributes_show_name",
    "attributes_slug",
    "attributes_sport_id",
    "attributes_sport_show_sub_category",
    "attributes_spot_estimate",
    "attributes_spot_name",
    "attributes_tag",
    "attributes_text",
    "attributes_title",
    "attributes_veil_id",
    "attributes_version_name",
    "attributes_year",
    # derived matching keys
    "product_code",
    "isci",
    "advertiser",
    # encoder group
    "encoder_group_name",
    "encoder_group__deleted",
    "encoder_group__last_audit_id",
    "encoder_group__last_updated",
    # aeis
    "aeis_id",
    "aeis__encoding_id",
    "aeis__encoding_offset",
    "aeis__last_updated",
    "aeis__last_audit_id",
    # format
    "format_name",
    "format__profile_id",
    "format__customer_id",
    "format__report_breakup",
    "format__deleted",
    "format__last_updated",
    "format__last_audit_id",
    # account / customer
    "sfdc_account_id",
    "sfdc_account_name",
    "customer_id",
    "account_id",
    "contract_item",
    "customer_name",
    "contract_number",
    "sales_person_code",
    "deleted",
    # profile
    "profile_id",
    "profile_name",
    "profile__deleted",
    "profile__default_asset_code",
    "profile__last_updated",
    "profile__last_audit_id",
    # campaign and billing
    "ad_prod_campaign",
    "campaign",
    "sfdc_advertiser_id",
    "billing_last_updated",
    "billing_last_audit_id",
]

In [None]:
# int_cols = ['encoding_id','format_id','encoder_group_id','clone_of','encoder_id','encoded_timestamp_epoch', 'length_in_seconds', 'aeis_id','aeis__encoding_id','aeis__encoding_offset','format__profile_id','format__customer_id','format__last_updated','customer_id','account_id','contract_item','contract_number']
# date_cols = ['encoded_timestamp', 'last_updated','detection_end_date','encoder_group__last_updated','aeis__last_updated','profile__last_updated','billing_last_updated']
# drop_cols = ['attributes','profile__attributes']
# bool_cols = ['encoder_group__deleted','format__deleted','deleted']
# for col in encodings_bvs_df_to_write.columns:
#     if col in encodings_bvs_df_to_write.columns and col not in valid_final_cols:
#         encodings_bvs_df_to_write.drop(columns=col, inplace=True)
#     if col in int_cols:
#         encodings_bvs_df_to_write[col] = encodings_bvs_df_to_write[col].fillna(-1).astype(int)
#     if col in date_cols:
#         encodings_bvs_df_to_write[col] = pd.to_datetime(encodings_bvs_df_to_write[col], errors='coerce', utc=True)
#     if col in drop_cols and col in encodings_bvs_df_to_write.columns:
#         encodings_bvs_df_to_write.drop(columns=col, inplace=True)
#     if col in bool_cols:
#         encodings_bvs_df_to_write[col] = encodings_bvs_df_to_write[col].fillna(False).astype(bool)
#     if col in valid_final_cols and col not in encodings_bvs_df_to_write.columns:
#         encodings_bvs_df_to_write[col] = ''
#     if col in valid_final_cols and col not in int_cols and col not in date_cols and col not in bool_cols:
#         encodings_bvs_df_to_write[col] = encodings_bvs_df_to_write[col].fillna('').astype(str)
# for col in encodings_bvs_df_to_write.columns:
#     print(f"{col}: type: {encodings_bvs_df_to_write[col].dtype}")
# # encodings_bvs_df_to_write.columns.to_list()

In [None]:
# Last expression of the cell: displays the frame for a visual check.
encodings_bvs_df_to_write2

In [None]:
# from pyspark.sql.types import StructType, StructField, StringType, IntegerType

# for col in encodings_bvs_df_to_write2.columns:
#     if encodings_bvs_df_to_write2[col].dtype == StructType:
#         print(col)
        

In [None]:
# from collections import defaultdict

# # Function to detect mixed types in each column
# def detect_mixed_types(df):
#     mixed_type_columns = []
#     type_counts = defaultdict(set)
    
#     for col in df.columns:
#         for val in df[col].dropna().unique():
#             type_counts[col].add(type(val))
#             if len(type_counts[col]) > 1:
#                 mixed_type_columns.append(col)
#                 break  # No need to check further once mixed types are found
                
#     return mixed_type_columns, type_counts

# mixed_cols, type_counts = detect_mixed_types(df)

# print("Columns with mixed types:")
# for col in mixed_cols:
#     print(f"{col}: {type_counts[col]}")

In [None]:
# from pyhive import hive


# # Connect to Hive Metastore
# conn = hive.Connection(host='10.11.0.10', port=10000, username='anonymous')

# cursor = conn.cursor()
# cursor.execute('CREATE DATABASE IF NOT EXISTS stl_data_lake')
# cursor.execute("USE stl_data_lake")
# cursor.execute('SHOW DATABASES')

# print("Databases:")
# for database in cursor.fetchall():
#     print(database)

In [None]:
# dtype_map = {
#     'object': 'STRING',
#     'string': 'STRING',
#     'int64': 'BIGINT',
#     'Int64': 'BIGINT',
#     'float64': 'DOUBLE',
#     'Float64': 'DOUBLE',
#     'datetime64[ns]': 'TIMESTAMP',
#     'bool': 'BOOLEAN'
# }

# # Create a function to map dtype
# def map_dtype(pandas_dtype):
#     pandas_str = str(pandas_dtype)
#     if pandas_str in dtype_map:
#         return dtype_map[pandas_str]
#     else:
#         # Default fallback, adjust as needed
#         return 'STRING'

# # Build the column definitions for Hive
# column_defs = []
# for col in encodings_bvs_df_to_write2.columns:
#     hive_type = map_dtype(encodings_bvs_df_to_write2[col].dtype)
#     column_defs.append(f"`{col}` {hive_type}")

# # Join all column definitions into a single string
# columns_str = ",\n    ".join(column_defs)

# # Build the final CREATE TABLE statement
# table_name = "expanded_encodings_bvs"
# create_table_statement = f"""
# CREATE TABLE IF NOT EXISTS {table_name} (
#     {columns_str}
# )
# STORED AS PARQUET
# """

# print(create_table_statement)
# # cursor.execute(create_table_statement)

In [None]:
# cursor.execute('SHOW TABLES')

# print("Tables:")
# for table in cursor.fetchall():
#     print(table)

In [None]:
# test utility to check the data types of the columns

# Print each column's dtype together with the value from its first row.
for col in encodings_bvs_df_to_write2.columns:
    column_values = encodings_bvs_df_to_write2[col]
    print(f'Column: {col} - Type: {column_values.dtype}')
    # Use positional access: series[0] is a label lookup and raises KeyError when
    # the index has no label 0 (e.g. after filtering) or when the frame is empty.
    sample_value = column_values.iloc[0] if len(column_values) else '<no rows>'
    print(f'Sample Data: {sample_value}')

In [None]:
# encodings_bvs_df_to_write2['attributes'] = encodings_bvs_df_to_write2['attributes'].apply(lambda x: json.dumps(x)).astype('string')
# encodings_bvs_df_to_write2['profile__attributes'] = encodings_bvs_df_to_write2['profile__attributes'].apply(lambda x: json.dumps(x)).astype('string')
# Rename the grouping column in place to '_FORMAT_ID_GROUP', the name the write
# cell below uses as its partition column.
encodings_bvs_df_to_write2.rename(columns={'segments_format_id_group': '_FORMAT_ID_GROUP'}, inplace=True)

In [None]:
# encodings_bvs_df_to_write2.drop(columns=['attributes', 'profile__attributes'], inplace=True)

In [None]:
# new cell

# import json
# from datetime import datetime

# Work on a copy so the sort/reset below does not touch encodings_bvs_df_to_write2.
process_df = encodings_bvs_df_to_write2.copy()

# nested_columns = ['attributes']
# for col in nested_columns:
#     process_df[col] = process_df[col].apply(lambda x: json.dumps(x) if pd.notna(x) else '')
    
# def parse_iso_with_timezone(ts):
#     # Replace timezone colon
#     ts = ts.replace(":", "", 1) if "+" in ts or "-" in ts else ts
#     return datetime.strptime(ts, "%Y-%m-%dT%H:%M:%S.%f%z")

# # parsed_timestamps = [parse_iso_with_timezone(ts) for ts in process_df['encoded_timestamp'] ]
# process_df['encoded_timestamp'] = process_df['encoded_timestamp'].apply(parse_iso_with_timezone)

# print(parsed_timestamps)
# process_df['encoded_timestamp'] = pd.to_datetime(process_df['encoded_timestamp'])
# process_df = core_functions.convert_to_string_except_exclusions(process_df, exclude_columns=['encoded_timestamp', 'encoding_id', 'format_id','customer_id', 'profile_id', 'billing_last_updated', 'deleted', 'profile_deleted'])
# process_df.drop(columns=['attributes'], inplace=True)

# Order rows by year, format and encoding id (ascending) and rebuild a 0..n-1 index.
process_df = process_df.sort_values(by=['segments_year', 'format_id', 'encoding_id']).reset_index(drop=True)

# Storage options for each destination, read from the loaded config.
s4_storage_options = config.get('S4_STORAGE_OPTIONS')
veil_storage_options = config.get('VEIL_GCS_STORAGE_OPTIONS')
n90_storage_options = config.get('N90_GCS_STORAGE_OPTIONS')

# Destination buckets and key prefixes.
s4_bucket = 'n90-data-lake-stl'
s4_output_prefix = 'veil/encodings_v2b'
veil_billing_bucket = config.get('veil_billing').get('billing_gcs_bucket_id')
# Result is discarded; the lookup raises KeyError if 'encoded_timestamp' is absent.
process_df['encoded_timestamp']
# process_df['profile__attributes']
n90_bucket = 'n90_veil_partner'
n90_bucket_2 = 'n90-data-lake'
veil_output_prefix = 'encodings_v2b'
n90_output_prefix = 'advocado-looker/avs_prod/encodings_v2b'
n90_output_prefix_2 = 'avs_prod/encodings_v2b'
partition_cols = ['_FORMAT_ID_GROUP']

# Each entry: (writer, bucket, prefix, storage options, extra keyword arguments).
# The writes run in list order and each is followed by a confirmation line.
# NOTE(review): the S4 destination is written twice - first with spec='s3', last
# with the helper's default spec. Confirm both writes are needed.
parquet_writer = core_functions.write_hive_partitioned_parquet
parquet_writer_s4 = core_functions.write_hive_partitioned_parquet_s4
write_plan = [
    (parquet_writer_s4, s4_bucket, s4_output_prefix, s4_storage_options, {'spec': 's3'}),
    (parquet_writer, veil_billing_bucket, veil_output_prefix, veil_storage_options, {}),
    (parquet_writer, n90_bucket, n90_output_prefix, n90_storage_options, {}),
    (parquet_writer, n90_bucket_2, n90_output_prefix_2, n90_storage_options, {}),
    (parquet_writer_s4, s4_bucket, s4_output_prefix, s4_storage_options, {}),
]
for write_fn, bucket, prefix, storage_options, extra_kwargs in write_plan:
    write_fn(process_df, bucket, prefix, partition_cols, storage_options, **extra_kwargs)
    print(f"Finished writing to {bucket}/{prefix}")


# # new cell

# print(encodings_bvs_df_to_write.dtypes)


