In [None]:

import importlib
import functions.core.core_functions as core_functions
import functions.pyarrow_functions as pyarrow_functions
import dask
import dask.dataframe as dd
from dask.dataframe.utils import assert_eq
import pandas as pd
import numpy as np
import pandas_gbq
import json
import gc
import cudf
import os
import yaml
from concurrent.futures import ProcessPoolExecutor
from functools import partial
from google.cloud import bigquery, storage, bigquery_storage
import ace_tools_open as tools
import matplotlib.pyplot as plt

# Select "cudf" as the Dask DataFrame backend for the rest of the session.
dask.config.set({"dataframe.backend": "cudf"})

# Reload the project helper modules so that re-running this cell picks up
# edits made to them on disk after the kernel first imported them.
importlib.reload(core_functions)
importlib.reload(pyarrow_functions)

In [None]:
# Ask the project helper for its response dict; the code below reads a
# 'config' entry and a 'clients' entry from it.
resp = core_functions.initialize_clients(service_account_secret_name='SA_N90_CORE_APPS')

config = resp.get('config')

# Both client handles live under the same 'clients' entry.
clients = resp.get('clients')
n90_bigquery_client = clients.get('bigquery_client')
storage_client = clients.get('storage_client')

unknown_dma_overrides = config.get('national_dma_overrides_to_us_national')
national_media_only = True

In [None]:
# When True, the detections pull is limited to rows with geo_neustar_id > 800000.
national_media_only = True

# One default-credential client serves both queries; the original built a new
# client per query, repeating the setup work for no benefit.
# NOTE(review): `n90_bigquery_client` from the initialisation cell is not used
# here; the default client is kept so the credentials in play do not change.
bq_client = bigquery.Client()

# The national filter is the only part of the query that varies, so it is
# spliced in as an optional clause instead of duplicating the ORDER BY.
national_filter = "AND geo_neustar_id > 800000" if national_media_only else ""

detections_sql = f"""
    SELECT * FROM `next90-core-applications.n90_data_lake.p_avs_detections`
    WHERE sfdc_advertiser_id = 'a1mUV000000FiYbYAK'
    AND detection_timestamp >= '2024-09-01' and detection_timestamp < '2024-12-01'
    AND origin = 'L'
    AND name not in ('DIRECTV', 'DISH')
    {national_filter}
    ORDER BY detection_timestamp ASC
    """

detections_df = bq_client.query(detections_sql).result().to_dataframe()

# Plain string: there is nothing to interpolate in the sessions query.
activity_session_sql = """
    SELECT activity_session_id, uuid, created_time, neustar_dma_id, neustar_country, neustar_state, url, domain, zip_code, lat, lon, calculated_source, is_bot, device_type, device_model, hour, city, session_id, numPvs, gclid, brand_id, year, month, day, sfdc_account_id, sfdc_account_name, sfdc_advertiser_id, activity_session_timestamp, process_month_group, bcw_monday_match, week_within_month, activity_date_time, click_view_gclid, bcw_id, bcw_index, bcm_index, geo_location, geo_type, geo_neustar_id, segments_date, segments_day_of_week, segments_media, segments_month_label, segments_quarter_label, segments_week_label, segments_month, segments_quarter, segments_week, segments_year, segments_broadcast_year, segments_broadcast_month_index, segments_broadcast_week_index, session_timestamp, activity_session_last_updated, activity_session_last_audit_id
    FROM `next90-core-applications.n90_data_lake.p_activity_sessions`
    WHERE sfdc_advertiser_id = 'a1mUV000000FiYbYAK'
    AND activity_date_time >= '2024-09-01' and activity_date_time < '2024-12-01'
    ORDER BY activity_date_time ASC
    """

activity_session_df = bq_client.query(activity_session_sql).result().to_dataframe()

In [None]:
# Spot length: missing and zero lengths are both replaced with 30 seconds.
detections_df['length_in_seconds'] = detections_df['length_in_seconds'].astype('Int64').fillna(30)
detections_df.loc[detections_df['length_in_seconds'] == 0, 'length_in_seconds'] = 30

# Attribution window (seconds) = spot length + 300.
detections_df['attribution_window'] = (detections_df['length_in_seconds'] + 300).astype('Int64')

# utc=True localises tz-naive values to UTC and converts tz-aware values to
# UTC, which is what the original try tz_convert / bare-except tz_localize
# pairs were emulating. Unlike the bare except, a genuine parsing error or a
# missing column is no longer swallowed.
detections_df['detection_timestamp'] = pd.to_datetime(detections_df['detection_timestamp'], utc=True)

# Derived from the UTC start, so the end time is UTC as well.
detections_df['attribution_end_time'] = (
    detections_df['detection_timestamp']
    + pd.to_timedelta(detections_df['attribution_window'], unit='s')
)

# Sessions are timestamped from activity_date_time.
activity_session_df['session_timestamp'] = pd.to_datetime(activity_session_df['activity_date_time'], utc=True)


In [None]:
# Run the URL helper on the first 10,000 session rows only, then show the
# non-null values of the `url_gclid` column.
# NOTE(review): `url_gclid` is presumably added by extract_url_data - confirm.
test_activity_session_df = core_functions.extract_url_data(
    activity_session_df.iloc[:10000].copy(),
    'url',
)
test_activity_session_df['url_gclid'].dropna()



In [None]:
# Row-wise helper call whose result is spread over the two tracking columns.
tracking_info = test_activity_session_df.apply(core_functions.set_tracking_info, axis=1)
test_activity_session_df[['tracking_id', 'tracking_type']] = tracking_info


def _activity_source_for_row(row):
    """Pass one row's referrer and tracking type to core_functions.calculate_source."""
    return core_functions.calculate_source(row['referrer'], row['tracking_type'])


test_activity_session_df['activity_source'] = test_activity_session_df.apply(_activity_source_for_row, axis=1)

In [None]:
# Display the `referrer` column of the sample frame.
# NOTE(review): `referrer` is not in the sessions query's select list, so it is
# presumably added by extract_url_data - confirm.
test_activity_session_df['referrer']


In [None]:
# List the distinct geo_neustar_id values present in the detections.
detections_df['geo_neustar_id'].drop_duplicates().to_list()

In [None]:
# Show the affiliate of every detection whose geo_neustar_id is 602 or 504.
detections_df.loc[detections_df['geo_neustar_id'].isin([602,504]),['affiliate']]

In [None]:
# Order detections chronologically, in place; ascending is the default.
detections_df.sort_values('detection_timestamp', inplace=True)

In [None]:
# Check for overlaps
# A detection is flagged when at least one OTHER detection starts inside
# [its own start, its attribution end). The detection itself always falls in
# that range, hence the "> 1" below.
#
# The original looped with `detections_df.loc[i, ...]` for i in range(len(df)).
# `.loc` is label-based, so that only works while the index is exactly 0..n-1
# in row order, and each iteration rescanned the whole frame. Counting via
# binary search on the sorted start times is positional and O(n log n).
# to_numpy(dtype='datetime64[ns]') yields plain UTC instants for tz-aware data.
airing_starts = detections_df['detection_timestamp'].to_numpy(dtype='datetime64[ns]')
attribution_ends = detections_df['attribution_end_time'].to_numpy(dtype='datetime64[ns]')
sorted_starts = np.sort(airing_starts)

# Number of starts in [start, end) = (# starts < end) - (# starts < start).
starts_before_end = np.searchsorted(sorted_starts, attribution_ends, side='left')
starts_before_start = np.searchsorted(sorted_starts, airing_starts, side='left')

overlap_flags = ((starts_before_end - starts_before_start) > 1).tolist()
detections_df['overlaps'] = overlap_flags


In [None]:
# Add overlap details
# For every detection ("parent"), record each other detection that starts
# inside [parent start, parent attribution end), together with how many
# minutes after the parent it started.
#
# Everything is positional. The original used `.loc[i, ...]` with
# i in range(len(df)), which silently depends on a pristine 0..n-1 index, and
# rescanned the whole frame for every row.
start_values = detections_df['detection_timestamp'].to_numpy(dtype='datetime64[ns]')
end_values = detections_df['attribution_end_time'].to_numpy(dtype='datetime64[ns]')
occurrence_ids = detections_df['occurrence_id'].tolist()

# `order` maps sorted rank -> row position, so a rank range found by binary
# search can be translated back to row positions.
order = np.argsort(start_values, kind='stable')
sorted_starts = start_values[order]
first_rank = np.searchsorted(sorted_starts, start_values, side='left')
stop_rank = np.searchsorted(sorted_starts, end_values, side='left')

one_second = np.timedelta64(1, 's')
overlap_details = []
for position, parent_id in enumerate(occurrence_ids):
    # Sort the candidate positions so overlaps are listed in frame row order,
    # as the original's filtered-frame iteration did.
    candidate_positions = np.sort(order[first_rank[position]:stop_rank[position]])
    overlap_details.append([
        {
            'overlap_id': occurrence_ids[other],
            # Gap in minutes between the parent start and the overlapping start.
            'overlap_gap': float((start_values[other] - start_values[position]) / one_second) / 60,
        }
        for other in candidate_positions
        # Excludes the parent itself and any other row sharing its occurrence_id.
        if occurrence_ids[other] != parent_id
    ])

# Add overlap details to the DataFrame
detections_df['overlap_details'] = overlap_details

# Display the DataFrame with overlap details
tools.display_dataframe_to_user(name="Detections with Overlap Details", dataframe=detections_df)

In [None]:
# Keep only sessions whose neustar_country is 'us' or 'ca', as an independent
# frame with a fresh 0..n-1 index.
is_us_or_ca = activity_session_df['neustar_country'].isin(['us', 'ca'])
activity_session_na_df = activity_session_df.loc[is_us_or_ca].copy().reset_index(drop=True)

In [None]:
# Preview the first rows of the US/CA session frame.
activity_session_na_df.head()

In [None]:
# Ensure timestamps are consistent in their time zones.
# The sessions query does not select a `session_time` column, so reading it
# unconditionally raises KeyError (the bare except simply re-raised it from
# its own branch). Use it when present, otherwise fall back to
# `activity_date_time`, the column the preparation cell builds
# `session_timestamp` from.
session_time_col = 'session_time' if 'session_time' in activity_session_na_df.columns else 'activity_date_time'
# utc=True localises naive values and converts aware ones - no try/except needed.
activity_session_na_df['session_timestamp'] = pd.to_datetime(activity_session_na_df[session_time_col], utc=True)
detections_df['detection_timestamp'] = pd.to_datetime(detections_df['detection_timestamp'], utc=True)

# Match sessions with airings
time_window = pd.Timedelta(minutes=10)  # 10 minutes before and after
session_times = activity_session_na_df['session_timestamp']
session_curves = []

# The unused `detections_dict` (a full to_dict of the frame) was dropped: it
# was never read and raises when occurrence_id is not unique.
for airing_id, airing_time in zip(detections_df['occurrence_id'], detections_df['detection_timestamp']):
    # Sessions within 10 minutes either side of the airing (inclusive bounds).
    in_window = (
        (session_times >= (airing_time - time_window)) &
        (session_times <= (airing_time + time_window))
    )
    # assign() builds a new frame, avoiding the SettingWithCopyWarning that
    # column assignment on a filtered slice produces.
    session_curves.append(
        activity_session_na_df.loc[in_window].assign(
            time_diff=(session_times[in_window] - airing_time).dt.total_seconds(),  # seconds from airing
            airing_id=airing_id,
        )
    )

# Combine all session curves
all_sessions_df = pd.concat(session_curves)


In [None]:

def _plot_source_histograms(sessions_df, plot_sources, y_max):
    """Overlay one density histogram of `time_diff` per traffic source.

    Args:
        sessions_df: frame with `calculated_source` and `time_diff` columns.
        plot_sources: iterable of source values, one histogram each.
        y_max: upper limit of the y axis.
    """
    # Explicit figure per chart: after plt.show() the previous figure is done,
    # so without this the second chart lands on an implicit default-size figure.
    plt.figure(figsize=(10, 6))
    for source in plot_sources:
        source_data = sessions_df[sessions_df['calculated_source'] == source]
        plt.hist(
            source_data['time_diff'],
            bins=range(-600, 601, 10),  # Binning every 10 seconds
            alpha=0.5,
            label=str(source),  # every series is labelled so the legend is never empty
            density=True,
        )
    plt.ylim(0, y_max)
    plt.title('Web Session Response Curves (10 Min Before and After Airing)')
    plt.xlabel('Time Difference (seconds)')
    plt.ylabel('Density')
    plt.legend()
    plt.show()


# Plot bell curve for each source
sources = ['google-ppc', 'direct', 'other', 'bing', 'google-organic', 'yahoo']
_plot_source_histograms(all_sessions_df, sources, y_max=0.00225)

# Statistics for each source
statistics = all_sessions_df.loc[all_sessions_df['calculated_source'].isin(sources)].groupby('calculated_source')['time_diff'].agg(['mean', 'std', 'count'])
print(statistics)

# Second chart: every source that was not plotted above. The original drew
# these histograms without labels, so its plt.legend() call had nothing to show.
sources2 = all_sessions_df['calculated_source'].unique()
remaining_sources = [source for source in sources2 if source not in sources]
_plot_source_histograms(all_sessions_df, remaining_sources, y_max=0.0225)

# Statistics for each source
# NOTE(review): this covers every source in sources2, including the ones
# plotted on the first chart - confirm that is intended.
statistics = all_sessions_df.loc[all_sessions_df['calculated_source'].isin(sources2)].groupby('calculated_source')['time_diff'].agg(['mean', 'std', 'count'])
print(statistics)



In [None]:
from concurrent.futures import ProcessPoolExecutor
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Ensure timestamps are consistent in their time zones.
# `session_time` is not selected by the sessions query, so it is used only
# when present; otherwise `activity_date_time` is used. utc=True replaces the
# try tz_convert / bare-except tz_localize pairs.
session_time_col = 'session_time' if 'session_time' in activity_session_na_df.columns else 'activity_date_time'
activity_session_na_df['session_timestamp'] = pd.to_datetime(activity_session_na_df[session_time_col], utc=True)
detections_df['detection_timestamp'] = pd.to_datetime(detections_df['detection_timestamp'], utc=True)

# Identify airings with overlaps: an airing has an overlap when another airing
# starts strictly after it and strictly less than 5 minutes later.
time_window_overlap = pd.Timedelta(minutes=5)

# Binary search on the sorted start times replaces the original per-row rescan
# of the whole frame (O(n^2) through iterrows).
airing_starts = detections_df['detection_timestamp'].to_numpy(dtype='datetime64[ns]')
sorted_starts = np.sort(airing_starts)
# Rank of the first start strictly greater than this airing's start ...
first_later = np.searchsorted(sorted_starts, airing_starts, side='right')
# ... and rank of the first start at or beyond start + 5 minutes.
first_outside = np.searchsorted(
    sorted_starts,
    airing_starts + time_window_overlap.to_timedelta64(),
    side='left',
)
detections_df['has_overlap'] = first_outside > first_later

# Split airings into overlapping and non-overlapping subsets
overlapping_airings = detections_df[detections_df['has_overlap']]
non_overlapping_airings = detections_df[~detections_df['has_overlap']]

# Function to calculate response curves for a subset of airings
def calculate_response_curve_for_airing(airing, time_window, sources, sessions_df=None):
    """Return the sessions surrounding one airing, with their offset from it.

    Args:
        airing: mapping/row providing 'detection_timestamp'.
        time_window: timedelta; sessions from `airing time - time_window` to
            `airing time + time_window` (both inclusive) are kept.
        sources: collection of `calculated_source` values to keep.
        sessions_df: frame with 'session_timestamp' and 'calculated_source'
            columns. Defaults to the notebook-level `activity_session_na_df`,
            which is what the original always read.

    Returns:
        A new DataFrame of the matching session rows plus a `time_diff`
        column (seconds from the airing; negative means before it). The input
        frame is not modified.
    """
    if sessions_df is None:
        sessions_df = activity_session_na_df

    airing_time = airing['detection_timestamp']
    session_times = sessions_df['session_timestamp']
    in_window = (
        (session_times >= (airing_time - time_window)) &
        (session_times <= (airing_time + time_window)) &
        (sessions_df['calculated_source'].isin(sources))
    )
    # assign() returns a fresh frame; the original wrote a column onto a
    # filtered slice, which triggers SettingWithCopyWarning on every call.
    return sessions_df.loc[in_window].assign(
        time_diff=(session_times[in_window] - airing_time).dt.total_seconds()  # Seconds
    )

def calculate_response_curve_parallel(airings_subset, time_window, sources):
    """Compute the per-airing response curves in worker processes and stack them.

    One task per row of `airings_subset` is submitted to a ProcessPoolExecutor;
    each task runs calculate_response_curve_for_airing with the same
    `time_window` and `sources`. Results are collected in submission order.

    Returns:
        The concatenation of all per-airing frames, or an empty DataFrame when
        `airings_subset` has no rows.
    """
    with ProcessPoolExecutor() as pool:
        pending = [
            pool.submit(calculate_response_curve_for_airing, airing_row, time_window, sources)
            for _, airing_row in airings_subset.iterrows()
        ]
        # Waiting on the futures in submission order keeps the output order
        # aligned with the airing order.
        per_airing_curves = [task.result() for task in pending]

    if not per_airing_curves:
        return pd.DataFrame()
    return pd.concat(per_airing_curves)

# Response curves for the two airing subsets, over every source present in the
# US/CA sessions, using a +/- 10 minute window.
time_window = pd.Timedelta(minutes=10)
sources = activity_session_na_df['calculated_source'].unique()

response_curve_overlap = calculate_response_curve_parallel(overlapping_airings, time_window, sources)
response_curve_no_overlap = calculate_response_curve_parallel(non_overlapping_airings, time_window, sources)

# One figure: for each plotted source, the overlapping and non-overlapping
# histograms are drawn on top of each other.
plt.figure(figsize=(12, 6))
bins = range(-600, 601, 10)  # 10-second bins across the +/- 600 s window

sources = ['google-ppc']
for source in sources:
    source_data_overlap = response_curve_overlap[response_curve_overlap['calculated_source'] == source]
    source_data_no_overlap = response_curve_no_overlap[response_curve_no_overlap['calculated_source'] == source]

    # Overlapping first, then non-overlapping, so the legend order is stable.
    for series_name, curve in (
        ("Overlapping", source_data_overlap),
        ("Non-Overlapping", source_data_no_overlap),
    ):
        plt.hist(
            curve['time_diff'],
            bins=bins,
            alpha=0.5,
            label=f"{series_name} - {source}",
            density=True
        )

# Fixed y-axis limit so this chart can be compared with the other charts.
plt.ylim(0, 0.00225)

plt.title('Response Curve Comparison by Source: Overlapping vs Non-Overlapping Airings')
plt.xlabel('Time Difference (seconds)')
plt.ylabel('Density')
plt.legend()
plt.show()

# Mean / standard deviation / count of time_diff per source, per subset.
summary_columns = ['mean', 'std', 'count']
overlap_stats = response_curve_overlap.groupby('calculated_source')['time_diff'].agg(summary_columns)
no_overlap_stats = response_curve_no_overlap.groupby('calculated_source')['time_diff'].agg(summary_columns)

print("Statistics for Overlapping Airings by Source:\n", overlap_stats)
print("Statistics for Non-Overlapping Airings by Source:\n", no_overlap_stats)

In [None]:
# List the distinct calculated_source values in the US/CA sessions.
activity_session_na_df['calculated_source'].unique()

In [None]:
# NOTE(review): identical to the preceding cell (distinct calculated_source
# values); one of the two can probably be removed.
activity_session_na_df['calculated_source'].unique()

In [None]:
sources = ['google-ppc', 'direct', 'other', 'bing', 'google-organic', 'yahoo']

# Two charts with the same layout: first the sessions whose source is in
# `sources`, then the sessions whose source is not. Each chart overlays the
# overlapping and non-overlapping airing curves.
# `bins` is expected to exist already (set by an earlier cell).
for use_targeted, chart_title in (
    (True, f'Response Curve Comparison for Targeted Sources ({sources}): Overlapping vs Non-Overlapping Airings'),
    (False, f'Response Curve Comparison for non Targeted Sources (not {sources}): Overlapping vs Non-Overlapping Airings'),
):
    overlap_pick = response_curve_overlap['calculated_source'].isin(sources)
    no_overlap_pick = response_curve_no_overlap['calculated_source'].isin(sources)
    if not use_targeted:
        overlap_pick = ~overlap_pick
        no_overlap_pick = ~no_overlap_pick

    # Overlapping airings
    source_data_overlap2 = response_curve_overlap.loc[overlap_pick].copy().reset_index(drop=True)
    plt.hist(
        source_data_overlap2['time_diff'],
        bins=bins,
        alpha=0.5,
        label="Overlapping - Airings",
        density=True
    )

    # Non-overlapping airings
    source_data_no_overlap2 = response_curve_no_overlap[no_overlap_pick].copy().reset_index(drop=True)
    plt.hist(
        source_data_no_overlap2['time_diff'],
        bins=bins,
        alpha=0.5,
        label="Non-Overlapping - Airings",
        density=True
    )

    # Same fixed y-axis limit on both charts.
    plt.ylim(0, 0.00125)

    plt.title(chart_title)
    plt.xlabel('Time Difference (seconds)')
    plt.ylabel('Density')
    plt.legend()
    plt.show()

In [None]:
sources = ['google-ppc', 'direct', 'other', 'bing', 'google-organic', 'yahoo', 'youtube', 'instagram', 'reddit', 'trustpilot', 'yelp', 'print', 'display', 'pinterest', 'amazon']

# One chart per source, each overlaying the overlapping and non-overlapping
# airing curves. `bins` is expected to exist already (set by an earlier cell).
for source in sources:
    source_data_overlap2 = response_curve_overlap.loc[response_curve_overlap['calculated_source'] == source].copy().reset_index(drop=True)
    source_data_no_overlap2 = response_curve_no_overlap[response_curve_no_overlap['calculated_source'] == source].copy().reset_index(drop=True)

    # Overlapping first, then non-overlapping.
    for legend_label, curve in (
        ("Overlapping - Airings", source_data_overlap2),
        ("Non-Overlapping - Airings", source_data_no_overlap2),
    ):
        plt.hist(
            curve['time_diff'],
            bins=bins,
            alpha=0.5,
            label=legend_label,
            density=True
        )

    # Fixed y-axis limit so the per-source charts are comparable.
    plt.ylim(0, 0.00225)

    plt.title(f'Response Curve Comparison by Targeted Source ({source}): Overlapping vs Non-Overlapping Airings')
    plt.xlabel('Time Difference (seconds)')
    plt.ylabel('Density')
    plt.legend()
    plt.show()

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import gaussian_kde

# Timestamps to UTC. `session_time` is not selected by the sessions query, so
# it is used only when present; otherwise `activity_date_time` is used.
# utc=True replaces the try tz_convert / bare-except tz_localize pairs.
session_time_col = 'session_time' if 'session_time' in activity_session_na_df.columns else 'activity_date_time'
activity_session_na_df['session_timestamp'] = pd.to_datetime(activity_session_na_df[session_time_col], utc=True)
detections_df['detection_timestamp'] = pd.to_datetime(detections_df['detection_timestamp'], utc=True)

# The original read `ad_length` defensively in one loop (.get with a default)
# but indexed detections_df['ad_length'] unconditionally in the next, which
# raises KeyError when the column is absent. Use it when it exists, otherwise
# fall back to `length_in_seconds`, which the preparation cell populates.
# NOTE(review): confirm length_in_seconds is the intended ad-length measure.
ad_length_col = 'ad_length' if 'ad_length' in detections_df.columns else 'length_in_seconds'

# Match sessions with airings and calculate lag
time_window = pd.Timedelta(minutes=10)
time_range = np.linspace(-600, 600, 1200)  # KDE evaluation grid (seconds); loop-invariant
session_times = activity_session_na_df['session_timestamp']

session_curves = []
lag_data = []

for _, airing in detections_df.iterrows():
    airing_time = airing['detection_timestamp']
    ad_length = airing.get(ad_length_col, 'Unknown')

    # Sessions within the 10-minute window either side of the airing. Each
    # window is tagged with the airing's ad length so the second chart can
    # reuse it instead of recomputing every window.
    in_window = (
        (session_times >= (airing_time - time_window)) &
        (session_times <= (airing_time + time_window))
    )
    relevant_sessions = activity_session_na_df.loc[in_window].assign(
        time_diff=(session_times[in_window] - airing_time).dt.total_seconds(),  # seconds from airing
        ad_length=ad_length,
    )

    # gaussian_kde needs at least two distinct observations; with one point
    # (or all-identical points) it raises, so those airings get no lag entry.
    if relevant_sessions['time_diff'].nunique() > 1:
        kde = gaussian_kde(relevant_sessions['time_diff'])
        kde_values = kde(time_range)
        peak_time = time_range[np.argmax(kde_values)]  # Peak time relative to airing

        lag_data.append({
            'airing_id': airing['occurrence_id'],
            'ad_length': ad_length,
            'lag': peak_time  # Lag in seconds
        })
    session_curves.append(relevant_sessions)

# Combine all session curves
all_sessions_df = pd.concat(session_curves)
# Explicit columns keep lag_df usable even when no airing produced a lag.
lag_df = pd.DataFrame(lag_data, columns=['airing_id', 'ad_length', 'lag'])

# Plot lag distribution by ad length
plt.figure(figsize=(10, 6))
for ad_length in lag_df['ad_length'].unique():
    subset = lag_df[lag_df['ad_length'] == ad_length]
    plt.hist(subset['lag'], bins=30, alpha=0.5, label=f"{ad_length}s Ad Length", density=True)

plt.title('Lag Distribution by Ad Length')
plt.xlabel('Lag (seconds)')
plt.ylabel('Density')
plt.legend()
plt.show()

# Compare lag statistics by ad length
lag_stats = lag_df.groupby('ad_length')['lag'].agg(['mean', 'std', 'count'])
print("Lag Statistics by Ad Length:\n", lag_stats)

# Response curve comparison by ad length, built from the windows tagged above.
plt.figure(figsize=(12, 6))
bins = range(-600, 601, 10)  # Bin every 10 seconds

for ad_length in all_sessions_df['ad_length'].unique():
    sessions_for_length = all_sessions_df[all_sessions_df['ad_length'] == ad_length]
    if sessions_for_length.empty:
        continue
    plt.hist(
        sessions_for_length['time_diff'],
        bins=bins,
        alpha=0.5,
        label=f"{ad_length}s Ad Length",
        density=True
    )

plt.title('Response Curves by Ad Length')
plt.xlabel('Time Difference (seconds)')
plt.ylabel('Density')
plt.legend()
plt.show()

In [None]:
# Define targeted sources
targeted_sources = ['google-ppc', 'direct', 'other', 'bing', 'google-organic', 'yahoo']

# Filter web sessions to only include targeted sources. The explicit copy makes
# this an independent frame, so adding a column below does not write onto a
# slice of activity_session_na_df (SettingWithCopyWarning in the original).
filtered_sessions = activity_session_na_df[
    activity_session_na_df['calculated_source'].isin(targeted_sources)
].copy()

# Step 1: Filter to only non-overlapping airings
# NOTE(review): this subset is computed here, but the attribution call at the
# end of this cell is applied to the full detections_df - confirm the intent.
non_overlapping_airings = detections_df[detections_df['has_overlap'] == False]

# Timestamps to UTC. `session_time` is not selected by the sessions query, so
# it is used only when present; otherwise `activity_date_time` is used.
# utc=True replaces the try tz_convert / bare-except tz_localize pairs.
session_time_col = 'session_time' if 'session_time' in filtered_sessions.columns else 'activity_date_time'
filtered_sessions['session_timestamp'] = pd.to_datetime(filtered_sessions[session_time_col], utc=True)
detections_df['detection_timestamp'] = pd.to_datetime(detections_df['detection_timestamp'], utc=True)
detections_df['attribution_end_time'] = pd.to_datetime(detections_df['attribution_end_time'], utc=True)

# Time-decay function
def calculate_credit(possible_sessions, airing_time, decay_factor=0.01):
    """Score candidate sessions with an exponential time-decay weight.

    Args:
        possible_sessions: iterable of mappings, each with
            'activity_session_id', 'calculated_source' and 'session_timestamp'.
        airing_time: the airing's start; subtracted from each session timestamp.
        decay_factor: decay rate per second.

    Returns:
        A list with one dict per session (same order) holding the session id,
        its source, `time_diff` in seconds and `credit_weight`:
        exp(-decay_factor * time_diff) for sessions at or after the airing,
        0 for sessions before it.
    """

    def _weight(seconds_after_airing):
        # Sessions that precede the airing earn no credit at all.
        if seconds_after_airing < 0:
            return 0
        return np.exp(-decay_factor * seconds_after_airing)

    scored = []
    for candidate in possible_sessions:
        elapsed = (candidate['session_timestamp'] - airing_time).total_seconds()
        scored.append({
            'activity_session_id': candidate['activity_session_id'],
            'calculated_source': candidate['calculated_source'],
            'time_diff': elapsed,
            'credit_weight': _weight(elapsed),
        })
    return scored

# Step 1: Collect all possible sessions
def collect_possible_sessions(airing_row, sessions_df):
    """List the sessions that fall inside one airing's attribution window.

    Args:
        airing_row: mapping/row with 'detection_timestamp' (window start) and
            'attribution_end_time' (window end); both bounds are inclusive.
        sessions_df: frame with 'activity_session_id', 'calculated_source'
            and 'session_timestamp' columns.

    Returns:
        A list of dicts (one per matching session, in frame order) containing
        only those three columns.
    """
    window_start = airing_row['detection_timestamp']
    window_end = airing_row['attribution_end_time']

    stamps = sessions_df['session_timestamp']
    inside_window = (stamps >= window_start) & (stamps <= window_end)

    kept_columns = ['activity_session_id', 'calculated_source', 'session_timestamp']
    return sessions_df.loc[inside_window, kept_columns].to_dict(orient='records')

# Step 2: Update detections_df with full context
def add_context_to_detections(detections_df, sessions_df):
    """Attach candidate sessions and their credit weights to every detection.

    Adds two columns to `detections_df` in place and returns the same frame:
    'possible_attributed_sessions' (output of collect_possible_sessions per
    row) and 'attributed_sessions_with_context' (output of calculate_credit
    on that list, relative to the row's detection_timestamp).
    """

    def _sessions_in_window(detection_row):
        return collect_possible_sessions(detection_row, sessions_df)

    def _credited_sessions(detection_row):
        return calculate_credit(
            detection_row['possible_attributed_sessions'],
            detection_row['detection_timestamp'],
        )

    # The second pass reads the column written by the first, so order matters.
    detections_df['possible_attributed_sessions'] = detections_df.apply(_sessions_in_window, axis=1)
    detections_df['attributed_sessions_with_context'] = detections_df.apply(_credited_sessions, axis=1)
    return detections_df

# Enrich every detection with its candidate sessions and credit weights.
detections_df = add_context_to_detections(detections_df, filtered_sessions)

# Print the identifying columns next to the newly added context column.
preview_columns = ['occurrence_id', 'detection_timestamp', 'attribution_end_time', 'attributed_sessions_with_context']
print(detections_df[preview_columns])

In [None]:

# Plot bell curve for each source
# NOTE(review): `all_sessions_pd` is not defined anywhere in the visible
# notebook (other cells build `all_sessions_df`), and the `source` column and
# the 'Direct'/'Search'/'Unknown' values do not match the `calculated_source`
# values used elsewhere. This cell looks stale - confirm before relying on it.
sources = ['Direct', 'Search', 'Unknown']
for source in sources:
    # One density histogram of time_diff per source value, 30 bins each.
    source_data = all_sessions_pd[all_sessions_pd['source'] == source]
    plt.hist(source_data['time_diff'], bins=30, alpha=0.5, label=source, density=True)

plt.title('Web Session Response Curves')
# NOTE(review): the axis says minutes, while other cells compute time_diff in
# seconds - verify the unit of all_sessions_pd['time_diff'].
plt.xlabel('Time Difference (minutes)')
plt.ylabel('Density')
plt.legend()
plt.show()

# Statistics for each source
statistics = all_sessions_pd.groupby('source')['time_diff'].agg(['mean', 'std', 'count'])
print(statistics)

In [None]:


# Show only the detections whose `overlaps` flag is True.
tools.display_dataframe_to_user(name="Overlapping Airings Detection", dataframe=detections_df.loc[detections_df['overlaps'] == True, :])


In [None]:
import pandas as pd

# Example dataframes: airings_df and web_sessions_df
# airings_df: ['airing_time', 'dma', 'attribution_window']
# web_sessions_df: ['session_time', 'dma']

# Step 1: Calculate attribution windows
detections_df['attribution_end_time'] = detections_df['detection_timestamp'] + pd.to_timedelta(detections_df['attribution_window'], unit='s')
detections_df['airing_time'] = detections_df['detection_timestamp']
airings_df = detections_df.copy()
web_sessions_df = activity_session_df.copy()

# match_sessions() and the response-curve step read a `session_time` column,
# but the sessions query does not select one, so they would fail with
# KeyError. Provide it from `session_timestamp` (set by the preparation cell)
# when it is missing; an existing `session_time` column is left untouched.
if 'session_time' not in web_sessions_df.columns:
    web_sessions_df['session_time'] = web_sessions_df['session_timestamp']
# Step 2: Match sessions to airings
def match_sessions(airings_df, web_sessions_df):
    """Pair each airing with the web sessions inside its attribution window.

    A session matches an airing when its `session_time` lies between the
    airing's `detection_timestamp` and `attribution_end_time` (both inclusive)
    and its `geo_neustar_id` equals the airing's.

    Args:
        airings_df: frame with 'detection_timestamp', 'attribution_end_time',
            'geo_neustar_id' and 'airing_time' columns.
        web_sessions_df: frame with 'session_time' and 'geo_neustar_id' columns.

    Returns:
        A new frame (fresh 0..n-1 index) of matched session rows plus an
        `airing_time` column. A session matching several airings appears once
        per airing. When `airings_df` has no rows, an empty frame with the
        same columns is returned; the original raised from pd.concat([]).
    """
    matches = []
    for _, airing in airings_df.iterrows():
        # Filter web sessions within time range and matching DMA
        in_window = (
            (web_sessions_df['session_time'] >= airing['detection_timestamp']) &
            (web_sessions_df['session_time'] <= airing['attribution_end_time']) &
            (web_sessions_df['geo_neustar_id'] == airing['geo_neustar_id'])
        )
        # assign() returns a new frame, so the caller's sessions are untouched.
        matches.append(web_sessions_df[in_window].assign(airing_time=airing['airing_time']))

    if not matches:
        # No airings: keep the output schema so downstream column access works.
        empty = web_sessions_df.iloc[0:0].copy()
        empty['airing_time'] = pd.Series(dtype=airings_df['airing_time'].dtype)
        return empty.reset_index(drop=True)

    # Combine all matches
    return pd.concat(matches, ignore_index=True)

# Step 3: pair sessions with airings.
matched_sessions = match_sessions(airings_df, web_sessions_df)

# Step 4: response curve = number of matched sessions at each distinct number
# of seconds since the airing, drawn as a line.
elapsed_since_airing = matched_sessions['session_time'] - matched_sessions['airing_time']
matched_sessions['time_since_airing'] = elapsed_since_airing.dt.total_seconds()
response_curve = matched_sessions.groupby('time_since_airing').size()
response_curve.plot(kind='line', title='Response Curve')