In [None]:

import importlib
import functions.core.core_functions as core_functions
import functions.pyarrow_functions as pyarrow_functions
import dask
import dask.dataframe as dd
from dask.dataframe.utils import assert_eq
import pandas as pd
import numpy as np
import pandas_gbq
import json
import gc
import cudf
import os
import yaml
from concurrent.futures import ProcessPoolExecutor
from functools import partial
from google.cloud import bigquery, storage, bigquery_storage

# Use cudf as the dask.dataframe backend.
dask.config.set({"dataframe.backend": "cudf"})

# Reload the project helper modules so the kernel picks up their current source.
for _helper_module in (core_functions, pyarrow_functions):
    importlib.reload(_helper_module)

In [None]:
# Build the shared config and service clients from the service-account secret.
resp = core_functions.initialize_clients(service_account_secret_name='SA_N90_CORE_APPS')

# Index the two required top-level keys (instead of chaining .get) so a malformed
# response fails with a KeyError naming the missing key rather than an
# AttributeError raised on None further down.
config = resp['config']
clients = resp['clients']
n90_bigquery_client = clients.get('bigquery_client')
storage_client = clients.get('storage_client')

# Optional config entry: stays None when the key is absent.
unknown_dma_overrides = config.get('national_dma_overrides_to_us_national')

In [None]:
# Filters shared by both queries, defined once so the two extracts always cover the
# same advertiser and the same half-open date window [WINDOW_START, WINDOW_END).
ADVERTISER_ID = 'a1mUV000000FiYbYAK'
WINDOW_START = '2024-09-01'
WINDOW_END = '2024-12-01'

# One client for both queries.
# NOTE(review): this is a default-credential client, not the n90_bigquery_client
# created earlier in the notebook — confirm that is intentional.
bq_client = bigquery.Client()

# Detections for the advertiser inside the window, restricted to origin 'L' and
# excluding the DIRECTV / DISH names.
detections_sql = f"""
    SELECT * FROM `next90-core-applications.n90_data_lake.avs_detections`
    WHERE sfdc_advertiser_id = '{ADVERTISER_ID}'
    AND detection_timestamp >= '{WINDOW_START}' and detection_timestamp < '{WINDOW_END}'
    AND origin = 'L'
    AND name not in ('DIRECTV', 'DISH')
    """

detections_df = bq_client.query(detections_sql).result().to_dataframe()

# Activity sessions for the same advertiser and window, with an explicit column list.
activity_session_sql = f"""
    SELECT activity_session_id, uuid, created_time, neustar_dma_id, neustar_country, neustar_state, url, domain, zip_code, lat, lon, calculated_source, is_bot, device_type, device_model, hour, city, session_id, numPvs, gclid, brand_id, year, month, day, sfdc_account_id, sfdc_account_name, sfdc_advertiser_id, activity_session_timestamp, process_month_group, bcw_monday_match, week_within_month, activity_date_time, click_view_gclid, bcw_id, bcw_index, bcm_index, geo_location, geo_type, geo_neustar_id, segments_date, segments_day_of_week, segments_media, segments_month_label, segments_quarter_label, segments_week_label, segments_month, segments_quarter, segments_week, segments_year, segments_broadcast_year, segments_broadcast_month_index, segments_broadcast_week_index, session_timestamp, activity_session_last_updated, activity_session_last_audit_id
    FROM `next90-core-applications.n90_data_lake.activity_sessions`
    WHERE sfdc_advertiser_id = '{ADVERTISER_ID}'
    AND activity_date_time >= '{WINDOW_START}' and activity_date_time < '{WINDOW_END}'
    """

activity_session_df = bq_client.query(activity_session_sql).result().to_dataframe()

In [None]:
# Length substituted when a detection has no length_in_seconds value.
DEFAULT_SPOT_LENGTH_SECONDS = 30
# Seconds added on top of the spot length to form the attribution window.
# NOTE(review): presumably a 5-minute response allowance — confirm with the analysis owner.
ATTRIBUTION_BUFFER_SECONDS = 300

# Nullable integer dtype so missing lengths can be filled with the default.
detections_df['length_in_seconds'] = (
    detections_df['length_in_seconds'].astype('Int64').fillna(DEFAULT_SPOT_LENGTH_SECONDS)
)
detections_df['attribution_window'] = (
    detections_df['length_in_seconds'] + ATTRIBUTION_BUFFER_SECONDS
).astype('Int64')
detections_df['detection_timestamp'] = pd.to_datetime(detections_df['detection_timestamp'])
# The window closes attribution_window seconds after the detection timestamp.
detections_df['attribution_end_time'] = (
    detections_df['detection_timestamp']
    + pd.to_timedelta(detections_df['attribution_window'], unit='s')
)

In [None]:
# Parse the raw activity-session timestamp into a datetime column named session_time.
raw_session_timestamps = activity_session_df['activity_session_timestamp']
activity_session_df['session_time'] = pd.to_datetime(raw_session_timestamps)

In [None]:
# Distinct geo_neustar_id values present in the detections, in first-seen order.
distinct_geo_ids = detections_df['geo_neustar_id'].drop_duplicates()
distinct_geo_ids.to_list()

In [None]:
# Show the dtype of every column in activity_session_df.
activity_session_df.dtypes

In [None]:
# Inputs for the matching step.
# airings_df columns read by match_sessions:
#   ['detection_timestamp', 'attribution_end_time', 'geo_neustar_id', 'airing_time']
# web_sessions_df columns read by match_sessions:
#   ['session_time', 'geo_neustar_id']

# Step 1: Prepare airings and sessions
# attribution_end_time is already computed in the detections preparation cell, so it is
# not recomputed here; airing_time is an alias of detection_timestamp that gets carried
# onto every matched session.
detections_df['airing_time'] = detections_df['detection_timestamp']
# Work on copies so the matching step cannot modify the loaded frames.
airings_df = detections_df.copy()
web_sessions_df = activity_session_df.copy()
# Step 2: Match sessions to airings
def match_sessions(airings_df, web_sessions_df):
    """Attach web sessions to the airings whose attribution window they fall in.

    A session matches an airing when its ``session_time`` lies in the inclusive
    range ``[detection_timestamp, attribution_end_time]`` of the airing and both
    rows carry the same ``geo_neustar_id``. A session that matches several
    airings appears once per matching airing.

    Parameters
    ----------
    airings_df : pandas.DataFrame
        Must contain ``detection_timestamp``, ``attribution_end_time``,
        ``geo_neustar_id`` and ``airing_time``.
    web_sessions_df : pandas.DataFrame
        Must contain ``session_time`` and ``geo_neustar_id``.

    Returns
    -------
    pandas.DataFrame
        The matching rows of ``web_sessions_df`` (all columns) plus an
        ``airing_time`` column taken from the matched airing, with a fresh
        0..n-1 index. Empty, but with the same columns, when ``airings_df``
        has no rows.
    """
    matches = []
    for _, airing in airings_df.iterrows():
        # Sessions inside this airing's window (both ends inclusive) and in the same geo.
        in_window = (
            (web_sessions_df['session_time'] >= airing['detection_timestamp']) &
            (web_sessions_df['session_time'] <= airing['attribution_end_time'])
        )
        same_geo = web_sessions_df['geo_neustar_id'] == airing['geo_neustar_id']
        # Copy before adding a column so the caller's frame is never written to.
        valid_sessions = web_sessions_df[in_window & same_geo].copy()
        valid_sessions['airing_time'] = airing['airing_time']
        matches.append(valid_sessions)

    if not matches:
        # pd.concat raises ValueError on an empty list, which happens when there
        # are no airings at all; return an empty frame with the expected columns.
        if 'airing_time' in airings_df.columns:
            airing_dtype = airings_df['airing_time'].dtype
        else:
            airing_dtype = 'datetime64[ns]'
        no_matches = web_sessions_df.iloc[0:0].reset_index(drop=True)
        no_matches['airing_time'] = pd.Series(dtype=airing_dtype)
        return no_matches

    # Combine all per-airing matches into one frame.
    return pd.concat(matches, ignore_index=True)

# Step 3: Apply function
matched_sessions = match_sessions(airings_df, web_sessions_df)

# Step 4: Analyze results
# Response curve: number of matched sessions at each offset (in seconds) from the airing.
matched_sessions['time_since_airing'] = (
    matched_sessions['session_time'] - matched_sessions['airing_time']
).dt.total_seconds()
response_curve = matched_sessions.groupby('time_since_airing').size()

# Label both axes so the figure can be read on its own; the trailing ';' hides the repr.
ax = response_curve.plot(kind='line', title='Response Curve')
ax.set(xlabel='Seconds since airing', ylabel='Matched sessions');