In [1]:
import pandas as pd 
import numpy as np

import os
from tqdm import tqdm

In [2]:
from config_GAM2025 import gam_info

import test_functions 
import functions 

In [3]:
platformID = 'WSC'
# country
pop_size_col = 'Population Size (We are social)'
pop_size_col = 'Population2020'

country_codes_cols = ['PlaceID', pop_size_col]
country_codes = pd.read_excel(f"../../{gam_info['lookup_file']}", sheet_name='CountryID')[country_codes_cols]

# week 
week_tester = pd.read_excel(f"../../{gam_info['lookup_file']}", sheet_name='GAM Period',)
week_tester['w/c'] = pd.to_datetime(week_tester['w/c'])

service_tester = pd.read_excel(f"../../{gam_info['lookup_file']}", sheet_name='ServiceID',)
service_hierarchy = pd.read_excel(f"../../{gam_info['lookup_file']}", sheet_name='Service Hierarchy',)

platform_tester = pd.read_excel(f"../../{gam_info['lookup_file']}", sheet_name='PlatformID',)

overlap_SocWebOverlap = pd.read_excel("helper/Final Overlaps 2021.xlsx", sheet_name='SocWebOverlap').drop(columns=['Population 2020']).drop_duplicates()
overlap_SocWebOverlap['PlaceID'] = overlap_SocWebOverlap['PlaceID'].replace('MYT', 'MAY').replace('WLF', 'WFI')
#overlap_SocWebOverlap = overlap_SocWebOverlap.merge(country_codes, on='PlaceID', how='left')


# functions 

In [4]:
def compute_combined_reach(df, services, label, pop_size_col, country_codes, deal_with_zero=True, 
                          calc_type='sainsbury'):
    """
    Filters, merges, aggregates, and applies the Sainsbury formula to compute combined reach.

    Parameters:
    df (pd.DataFrame): Source DataFrame with weekly reach data.
    services (list): List of ServiceIDs to include.
    label (str): Label to assign to the resulting ServiceID.
    pop_size_col (str): Column name for population size.
    country_codes (pd.DataFrame): Mapping DataFrame for PlaceID enrichment.
    deal_with_zero (bool): Whether to apply shortcut logic in the formula.

    Returns:
    pd.DataFrame: Aggregated and transformed DataFrame with combined reach.
    """
    filtered_df = df[df['ServiceID'].isin(services)].merge(country_codes, on='PlaceID', how='left')
    
    pivot_df = pd.crosstab(
        index=[filtered_df['PlaceID'], filtered_df[pop_size_col], 
               filtered_df['w/c']],
        columns=filtered_df['ServiceID'],
        values=filtered_df['Reach'],
        aggfunc='sum'
    ).reset_index().fillna(0)

    if calc_type == 'add':
        pivot_df['Reach'] = pivot_df[services].sum(axis=1)
    elif calc_type == 'sainsbury':
        pivot_df = functions.sainsbury_formula(pivot_df, pop_size_col, services, 
                                               'Reach', deal_with_zero=deal_with_zero)
        
    else: 
        print('error')
        
    pivot_df['ServiceID'] = label
    return pivot_df[['w/c', 'ServiceID', 'PlaceID', 'Reach']]

# ingestion 

## workflow 5

In [5]:
weekly_folder = "../data/singlePlatform/output/weekly/"
singlePlatform_df_list = []

valid_platforms = set(platform_tester['PlatformID']) 
valid_services = set(service_tester['ServiceID'])| {'WSL'}
            
for file in tqdm(os.listdir(weekly_folder)):
    if file == ".DS_Store" or "podcast" in file.lower() or "site" in file.lower():
        continue
    parts = file.split("_")
    if len(parts) >= 4:
        platform_id = parts[2]
        service_id = parts[3].replace("byCountry.xlsx", "")
        
        if (platform_id in valid_platforms) and (service_id in valid_services):
            print(f"Valid file: {file}")
            file_path = os.path.join(weekly_folder, file)
            temp = pd.read_excel(file_path)
            temp['w/c'] = pd.to_datetime(temp['w/c'])
            temp['source'] = file
            singlePlatform_df_list.append(temp)

singlePlatform_df_raw = pd.concat(singlePlatform_df_list, ignore_index=True)


  0%|                                                    | 0/30 [00:00<?, ?it/s]

Valid file: GAM2025_WEEKLY_YT-_AX2byCountry.xlsx


  3%|█▍                                          | 1/30 [00:01<00:31,  1.10s/it]

Valid file: GAM2025_WEEKLY_YT-_FOAbyCountry.xlsx


  7%|██▉                                         | 2/30 [00:02<00:28,  1.01s/it]

Valid file: GAM2025_WEEKLY_YT-_GNLbyCountry.xlsx


 10%|████▍                                       | 3/30 [00:03<00:26,  1.01it/s]

Valid file: GAM2025_WEEKLY_YT-_EN2byCountry.xlsx


 13%|█████▊                                      | 4/30 [00:04<00:26,  1.02s/it]

Valid file: GAM2025_WEEKLY_TWI_GNLbyCountry.xlsx
Valid file: GAM2025_WEEKLY_YT-_ANWbyCountry.xlsx


 20%|████████▊                                   | 6/30 [00:05<00:18,  1.27it/s]

Valid file: GAM2025_WEEKLY_TTK_ANYbyCountry.xlsx


 23%|██████████▎                                 | 7/30 [00:05<00:17,  1.35it/s]

Valid file: GAM2025_WEEKLY_YT-_TOTbyCountry.xlsx


 27%|███████████▋                                | 8/30 [00:06<00:18,  1.19it/s]

Valid file: GAM2025_WEEKLY_YT-_WSLbyCountry.xlsx


 33%|██████████████▎                            | 10/30 [00:30<01:53,  5.66s/it]

Valid file: GAM2025_WEEKLY_TTK_WORbyCountry.xlsx


 37%|███████████████▊                           | 11/30 [00:31<01:25,  4.49s/it]

Valid file: GAM2025_WEEKLY_YT-_ALLbyCountry.xlsx


 40%|█████████████████▏                         | 12/30 [00:32<01:05,  3.63s/it]

Valid file: GAM2025_WEEKLY_TTK_MA-byCountry.xlsx


 43%|██████████████████▋                        | 13/30 [00:32<00:46,  2.72s/it]

Valid file: GAM2025_WEEKLY_YT-_ENGbyCountry.xlsx


 47%|████████████████████                       | 14/30 [00:33<00:36,  2.27s/it]

Valid file: GAM2025_WEEKLY_TTK_ALLbyCountry.xlsx


 50%|█████████████████████▌                     | 15/30 [00:34<00:28,  1.88s/it]

Valid file: GAM2025_WEEKLY_YT-_MA-byCountry.xlsx


 53%|██████████████████████▉                    | 16/30 [00:34<00:20,  1.43s/it]

Valid file: GAM2025_WEEKLY_YT-_ANYbyCountry.xlsx


 57%|████████████████████████▎                  | 17/30 [00:35<00:17,  1.33s/it]

Valid file: GAM2025_WEEKLY_TTK_ANWbyCountry.xlsx


 60%|█████████████████████████▊                 | 18/30 [00:36<00:13,  1.12s/it]

Valid file: GAM2025_WEEKLY_TTK_TOTbyCountry.xlsx


 63%|███████████████████████████▏               | 19/30 [00:37<00:10,  1.02it/s]

Valid file: GAM2025_WEEKLY_TTK_WSLbyCountry.xlsx


 67%|████████████████████████████▋              | 20/30 [00:38<00:11,  1.14s/it]

Valid file: GAM2025_WEEKLY_YT-_WORbyCountry.xlsx


 70%|██████████████████████████████             | 21/30 [00:39<00:09,  1.07s/it]

Valid file: GAM2025_WEEKLY_TTK_GNLbyCountry.xlsx


 73%|███████████████████████████████▌           | 22/30 [00:40<00:07,  1.10it/s]

Valid file: GAM2025_WEEKLY_YT-_WSEbyCountry.xlsx


 77%|████████████████████████████████▉          | 23/30 [00:40<00:06,  1.11it/s]

Valid file: GAM2025_WEEKLY_TTK_AX2byCountry.xlsx


 80%|██████████████████████████████████▍        | 24/30 [00:41<00:04,  1.24it/s]

Valid file: GAM2025_WEEKLY_YT-_AXEbyCountry.xlsx


 83%|███████████████████████████████████▊       | 25/30 [00:42<00:04,  1.13it/s]

Valid file: GAM2025_WEEKLY_YT-_ENWbyCountry.xlsx


 87%|█████████████████████████████████████▎     | 26/30 [00:43<00:03,  1.09it/s]

Valid file: GAM2025_WEEKLY_TTK_AXEbyCountry.xlsx


100%|███████████████████████████████████████████| 30/30 [00:44<00:00,  1.47s/it]


# processing 

## test columns (& remove total )

In [6]:
def test_merge(df, tester_df, key, label):
    merged = df.merge(tester_df, on=key, how='left', indicator=True)
    print(f"\nMissing {label}:", merged['_merge'].value_counts())
    print(f"Unmatched {label}s:", merged[merged['_merge'] == 'left_only'][key].unique())
    return merged[merged['_merge'] == 'both'].drop(columns=['_merge'])

singlePlatform_df = test_merge(singlePlatform_df_raw, country_codes, 'PlaceID', 'PlaceID')
singlePlatform_df = test_merge(singlePlatform_df, week_tester, 'w/c', 'w/c')
singlePlatform_df = test_merge(singlePlatform_df, platform_tester, 'PlatformID', 'PlatformID')
singlePlatform_df = test_merge(singlePlatform_df, service_tester, 'ServiceID', 'ServiceID')

reach_issues = singlePlatform_df[(singlePlatform_df['Reach'] == 0) | (singlePlatform_df['Reach'].isna())]
print("Rows with zero or missing Reach:", reach_issues.shape[0])

duplicates = singlePlatform_df[singlePlatform_df.duplicated(subset=['ServiceID', 'PlatformID', 'PlaceID', 'w/c'])]
print("Duplicate rows:", duplicates.shape[0])



Missing PlaceID: _merge
both          550384
left_only          0
right_only         0
Name: count, dtype: int64
Unmatched PlaceIDs: []

Missing w/c: _merge
both          550384
left_only          0
right_only         0
Name: count, dtype: int64
Unmatched w/cs: <DatetimeArray>
[]
Length: 0, dtype: datetime64[ns]

Missing PlatformID: _merge
both          550384
left_only          0
right_only         0
Name: count, dtype: int64
Unmatched PlatformIDs: []

Missing ServiceID: _merge
both          550384
left_only          0
right_only         0
Name: count, dtype: int64
Unmatched ServiceIDs: []
Rows with zero or missing Reach: 5877
Duplicate rows: 0


## workflow 6

In [7]:
full_service_df = pd.crosstab(
        index=[ singlePlatform_df['PlaceID'], 
                singlePlatform_df['w/c'], 
                singlePlatform_df['ServiceID']],
        columns=singlePlatform_df['PlatformID'],
        values= singlePlatform_df['Reach'],
        aggfunc='sum'
    ).reset_index().fillna(0)

full_service_df = full_service_df.merge(country_codes, on='PlaceID', how='left', )
full_service_df.head()
cols = full_service_df.columns

### WSL

In [8]:
# remove MA / WOR and agg services 
exclude_ids = ['WOR', 'MA-', 
               'ENG', 'EN2', 'ENW', 
               'ANW', 'TOT', 'AX2', 'ANY', 'ALL', ]
weekly_ws_df = full_service_df[~full_service_df['ServiceID'].isin(exclude_ids)]

# add overlaps
weekly_ws_df = weekly_ws_df.merge(overlap_SocWebOverlap, on='PlaceID', how='left', indicator=True)
weekly_ws_df.ServiceID.unique()
weekly_ws_df.columns

Index(['PlaceID', 'w/c', 'ServiceID', 'TTK', 'TWI', 'YT-', 'Population2020',
       'Tapestry Market', 'Country Name', 'FB & YT Factor',
       'Own Web & Social Factor', 'Web', 'Facebook Incremental',
       'YouTube Incremental', 'Social Incremental if YouTube bigger',
       'Social Incremental if Facebook bigger', 'Social Incremental',
       '% Twitter', '% Instagram', '% socialdedup Factor', 'Unnamed: 15',
       'Unnamed: 16', '_merge'],
      dtype='object')

Version 1

In [9]:
# Define expected platform columns
platform_cols = ['FBE', 'INS', 'TWI', 'YT-', 'TTK', 'TEL', 'WEI']
                
# Add missing columns with default value 0
for col in platform_cols:
    if col not in weekly_ws_df.columns:
        print(f'{col} missing!')
        weekly_ws_df[col] = 0
        
# Step 1: Calculate Max Reach
weekly_ws_df['Max Reach'] = weekly_ws_df[platform_cols].max(axis=1)

FBE missing!
INS missing!
TEL missing!
WEI missing!


In [10]:
# Step 2: Identify Max Platform
def get_max_platform(row):
    if row['Max Reach'] == row['FBE']:
        return 'Facebook'
    elif row['Max Reach'] == row['YT-']:
        return 'YouTube'
    elif row['Max Reach'] == row['TWI']:
        return 'Twitter'
    elif row['Max Reach'] == row['TTK']:
        return 'Tiktok'
    else:
        return 'Instagram'

weekly_ws_df['Max Platform'] = weekly_ws_df.apply(get_max_platform, axis=1)

In [11]:
# Step 3: Calculate WSC1
def calculate_wsc1(row):
    if row['Max Platform'] == 'Facebook':
        return (row['FBE'] + row['YT-'] * row['YouTube Incremental'] +
                row['INS'] * row['% Instagram'] +
                row['TWI'] * row['% Twitter'] +
                0.28 * row['TTK'])
    elif row['Max Platform'] == 'YouTube':
        return (row['YT-'] + row['FBE'] * row['Facebook Incremental'] +
                row['INS'] * row['% Instagram'] +
                row['TWI'] * row['% Twitter'] +
                0.28 * row['TTK'])
    elif row['Max Platform'] == 'Instagram':
        return (row['INS'] + row['YT-'] * row['YouTube Incremental'] +
                row['FBE'] * 0.03030303 +
                0.28 * row['TTK'])
    elif row['Max Platform'] == 'Tiktok':
        return (row['TTK'] + row['YT-'] * row['YouTube Incremental'] +
                row['INS'] * row['% Instagram'] +
                row['TWI'] * row['% Twitter'] +
                row['FBE'] * row['Facebook Incremental'])
    else:  # Twitter
        return (row['TWI'] + row['YT-'] * row['YouTube Incremental'] +
                row['INS'] * row['% Instagram'] +
                row['FBE'] * row['Facebook Incremental'])

weekly_ws_df[f'{platformID}1'] = weekly_ws_df.apply(calculate_wsc1, axis=1)

Version 2

In [12]:
# Ensure all required columns exist, fill missing ones with 0
required_cols = ['FBE', 'YT-', 'INS', 'TWI', 'TTK', 'WEI', 'TEL', '% socialdedup Factor']
for col in required_cols:
    if col not in weekly_ws_df.columns:
        print(f'{col} missing!')
        weekly_ws_df[col] = 0

# Calculate WSC2
weekly_ws_df[f'{platformID}2'] = (
    (weekly_ws_df['FBE'] + weekly_ws_df['YT-'] + weekly_ws_df['INS'] + weekly_ws_df['TWI']) * weekly_ws_df['% socialdedup Factor']
    + 0.28 * weekly_ws_df['TTK']
)


combined

In [13]:
# Ensure all required columns exist
required_cols = [f'{platformID}1', f'{platformID}2', 'FBE', 'YT-', 'INS', 'TWI', 'TTK', 'WEI', 'TEL']
for col in required_cols:
    if col not in weekly_ws_df.columns:
        print(f'{col} missing!')
        weekly_ws_df[col] = 0

# Compute WSC Final
def compute_wsc_final(row):
    wsc1 = row[f'{platformID}1']
    wsc2 = row[f'{platformID}2']
    if (
        wsc2 < wsc1 or
        wsc2 < row['FBE'] or
        wsc2 < row['YT-'] or
        wsc2 < row['INS'] or
        wsc2 < row['TWI'] or
        wsc2 < row['TTK']
    ):
        return wsc1
    else:
        return wsc2

weekly_ws_df['Reach'] = weekly_ws_df.apply(compute_wsc_final, axis=1)
weekly_ws_df = weekly_ws_df[['w/c', 'PlaceID', 'ServiceID', 'Reach']]
weekly_ws_df.head()

Unnamed: 0,w/c,PlaceID,ServiceID,Reach
0,2024-04-01,AFG,ARA,859.991595
1,2024-04-01,AFG,AXE,51803.55176
2,2024-04-01,AFG,AZE,2.763772
3,2024-04-01,AFG,BEN,12.841184
4,2024-04-01,AFG,BUR,7.664414


### MA & Studios

In [14]:
ma_wor_df = full_service_df[full_service_df['ServiceID'].isin(['WOR', 'MA-'])]
required_cols = ['FBE', 'YT-', 'INS', 'TWI', 'TTK', 'WEI', 'TEL']
for col in required_cols:
    if col not in ma_wor_df.columns:
        print(f'{col} missing!')
        ma_wor_df[col] = 0

ma_wor_df = functions.sainsbury_formula(ma_wor_df, pop_size_col, 
                                        required_cols, 
                                        'Reach')

weekly_ma_wor_df = ma_wor_df[['w/c', 'PlaceID', 'ServiceID', 'Reach']]
weekly_ma_wor_df.head()

FBE missing!
INS missing!
WEI missing!
TEL missing!


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ma_wor_df[col] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ma_wor_df[col] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ma_wor_df[col] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the d

Unnamed: 0,w/c,PlaceID,ServiceID,Reach
39,2024-04-01,AFG,WOR,4588.206711
80,2024-04-08,AFG,WOR,5157.029512
121,2024-04-15,AFG,WOR,5317.983034
165,2024-04-22,AFG,WOR,5866.91863
209,2024-04-29,AFG,WOR,6581.134325


prep the aggregate calculation

In [15]:
weekly_df = pd.concat([weekly_ws_df, weekly_ma_wor_df])


### ENW

In [16]:
# Usage
enw_services = ['FOA', 'WSE']
enw_df = compute_combined_reach(weekly_df, enw_services, 'ENW', pop_size_col, country_codes)

### ENG

In [17]:
# Usage
eng_services = ['GNL', 'WSE']
eng_df = compute_combined_reach(weekly_df, eng_services, 'ENG', pop_size_col, country_codes)

### EN2

In [18]:

en2_services = ['ENG', 'WOR']
en2_df = compute_combined_reach(pd.concat([weekly_df, eng_df]), en2_services, 'EN2', pop_size_col, country_codes)


### AX2

In [19]:
cols = ['PlaceID', 'digiGAM_FOA_WT-']
africa_dedup_countries = pd.read_excel(f"../../{gam_info['lookup_file']}", sheet_name='CountryID')[cols]

ax2_services = [
    'AFA','AMH','ARA','AZE','BEN','BUR','DAR','ECH','ELT','PER','FRE','GUJ','HAU','HIN','IGB','INO',
    'KOR','KRW','KYR','MAN','MAR','NEP','PAS','PDG','POR','PUN','RUS','SER','SIN','SOM','SPA','SWA',
    'TAM','TEL','THA','TIG','TUR','UKR','URD','UZB','VIE','YOR', 'FOA', 'UKPS'
]

ax2_df = weekly_df[weekly_df['ServiceID'].isin(ax2_services)].merge(country_codes, on='PlaceID', how='left')
ax2_df = pd.crosstab(
                    index = [ ax2_df['PlaceID'], 
                              ax2_df[pop_size_col], 
                              ax2_df['w/c'],],
                    columns = ax2_df['ServiceID'],
                    values =  ax2_df['Reach'],
                    aggfunc='sum'
                ).reset_index()
ax2_df = ax2_df.fillna(0)

for col in ax2_services:
    if col not in ax2_df.columns:
        print(f'{col} missing!')
        ax2_df[col] = 0

temp2 = ax2_df.merge(africa_dedup_countries, on='PlaceID', how='outer')
africa_df = temp2[~temp2['digiGAM_FOA_WT-'].isna()]
nonAfrica_df = temp2[temp2['digiGAM_FOA_WT-'].isna()]

# Apply the logic row-wise
def compute_value(row):
    others_sum = sum(row.get(code, 0) for code in ax2_services)
    if row['FOA'] > others_sum:
        return row['FOA'] + 0.60745497 * others_sum
    else:
        return others_sum + row['FOA'] * 0.60745497

africa_df['Reach'] = africa_df.apply(compute_value, axis=1)
nonAfrica_df = functions.sainsbury_formula(nonAfrica_df, 'Population2020', ax2_services, 'Reach')
ax2_df = pd.concat([africa_df, nonAfrica_df])
ax2_df['ServiceID'] = 'AX2'
ax2_df = ax2_df[['w/c', 'ServiceID', 'PlaceID', 'Reach']]

ax2_df.head()

AFA missing!
AMH missing!
DAR missing!
IGB missing!
KRW missing!
PDG missing!
TIG missing!
YOR missing!
UKPS missing!


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  africa_df['Reach'] = africa_df.apply(compute_value, axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col_name] = df.apply(calculate_formula, axis=1)


Unnamed: 0,w/c,ServiceID,PlaceID,Reach
208,2024-04-01,AX2,ALG,518320.581417
209,2024-04-08,AX2,ALG,855570.059536
210,2024-04-15,AX2,ALG,911421.061368
211,2024-04-22,AX2,ALG,857463.025015
212,2024-04-29,AX2,ALG,651868.102728


### ANW

In [20]:
anw_services = ['AX2', 'WSE']
anw_df = compute_combined_reach(pd.concat([weekly_df, ax2_df]), anw_services, 'ANW', 
                                pop_size_col, country_codes)

### ANY

In [21]:

any_services = ['ANW', 'GNL']
any_df = compute_combined_reach(pd.concat([weekly_df, anw_df]), any_services, 'ANY', 
                                pop_size_col, country_codes)


### TOT

In [22]:

tot_services = ['ANY', 'MA-']
tot_df = compute_combined_reach(pd.concat([weekly_df, any_df]), tot_services, 'TOT', 
                                pop_size_col, country_codes, calc_type='add')

### ALL

In [23]:

all_services = ['TOT', 'WOR']
all_df = compute_combined_reach(pd.concat([weekly_df, tot_df]), all_services, 'ALL', 
                                pop_size_col, country_codes, calc_type='add')

## finalising

In [24]:
final_weekly_df = pd.concat([weekly_df, weekly_ma_wor_df, 
                             enw_df, eng_df, en2_df, 
                             ax2_df, anw_df, any_df, tot_df, all_df])

final_weekly_df['PlatformID'] = platformID

final_weekly_df.to_csv(f"../data/combinePlatforms/{gam_info['file_timeinfo']}_weekly_{platformID}.csv", 
                       index=None)

# store dataset