In [1]:
import pandas as pd
from config_GAM2025 import gam_info
import functions

In [2]:
# Load country mapping
country_map = pd.read_excel(f"../../{gam_info['lookup_file']}", sheet_name='CountryID')[['PlaceID', 'YouTube Codes']]
# Load country mapping
week_map = pd.read_excel(f"../../{gam_info['lookup_file']}", sheet_name='GAM Period')[['w/c', 'WeekNumber_finYear']]


In [3]:
# Utility functions
def load_excel(path):
    return pd.read_excel(path, engine='openpyxl')

def load_csv(path):
    return pd.read_csv(path)

def standardize_country_codes(df, column='Country Code'):
    return df.replace({column: {'WLF': 'WFI', '* Total': 'Total'}})

def run_comparison(original_df, new_df, column_mapping, key_columns, method='integer', threshold=0.0001):
    if method == 'integer':
        return compare_dataframes_integer(original_df, new_df, column_mapping, key_columns)
    elif method == 'percentage':
        return compare_dataframes_percentage(original_df, new_df, column_mapping, key_columns, threshold)
    else:
        raise ValueError("Unknown comparison method")

In [4]:
def compare_dataframes_integer(original_df, new_df, column_mapping, key_columns_new):
    """
    Compare two DataFrames and return rows that are missing or different.

    Parameters:
    - original_df: DataFrame from the original source
    - new_df: DataFrame from the new source
    - column_mapping: dict mapping original_df column names to new_df column names
    - key_columns_new: list of key columns using new_df naming

    Returns:
    - missing_from_new: rows in original_df not found in new_df
    - differing_rows: rows where key matches but mapped columns differ
    """

    # Rename original_df to match new_df column names
    original_df_renamed = original_df.rename(columns=column_mapping)

    # Ensure all required columns exist
    all_columns = list(column_mapping.values())
    original_subset = original_df_renamed[all_columns].copy()
    new_subset = new_df[all_columns].copy()

    # Round numeric columns to nearest integer
    for col in all_columns:
        if pd.api.types.is_numeric_dtype(original_subset[col]) and pd.api.types.is_numeric_dtype(new_subset[col]):
            original_subset[col] = original_subset[col].round(0).astype('Int64')
            new_subset[col] = new_subset[col].round(0).astype('Int64')
        
    # Merge to find differences
    merged = pd.merge(
        original_subset,
        new_subset,
        on=key_columns_new,
        how='outer',
        suffixes=('_orig', '_new'),
        indicator=True
    )

    # Missing rows: present in original but not in new
    missing_from_new = merged[merged['_merge'] == 'left_only']

    # Differing rows: same keys but different values
    comparison_cols = [col for col in all_columns if col not in key_columns_new]
        
    differing_rows = merged[
        (merged['_merge'] == 'both') &
        merged[[f"{col}_orig" for col in comparison_cols]].ne(
            merged[[f"{col}_new" for col in comparison_cols]].values
        ).any(axis=1)
    ]

    return missing_from_new, differing_rows

In [5]:
def compare_dataframes_percentage(original_df, new_df, column_mapping, key_columns_new, threshold=0.0001):
    """
    Compare two DataFrames and return rows that are missing or have percentage differences.

    Parameters:
    - original_df: DataFrame from the original source
    - new_df: DataFrame from the new source
    - column_mapping: dict mapping original_df column names to new_df column names
    - key_columns_new: list of key columns using new_df naming
    - threshold: minimum absolute difference to consider as significant

    Returns:
    - missing_from_new: rows in original_df not found in new_df
    - differing_rows: rows where key matches but mapped columns differ beyond threshold
    """

    # Rename original_df to match new_df column names
    original_df_renamed = original_df.rename(columns=column_mapping)

    # Ensure all required columns exist
    all_columns = list(column_mapping.values())
    original_subset = original_df_renamed[all_columns].copy()
    new_subset = new_df[all_columns].copy()

    # Merge to find differences
    merged = pd.merge(
        original_subset,
        new_subset,
        on=key_columns_new,
        how='outer',
        suffixes=('_orig', '_new'),
        indicator=True
    )

    # Missing rows: present in original but not in new
    missing_from_new = merged[merged['_merge'] == 'left_only']

    # Compute differences
    comparison_cols = [col for col in all_columns if col not in key_columns_new]
    for col in comparison_cols:
        merged[f"{col}_diff"] = merged[f"{col}_new"] - merged[f"{col}_orig"]

    # Filter rows where any difference exceeds threshold
    diff_mask = merged['_merge'] == 'both'
    for col in comparison_cols:
        diff_mask &= merged[f"{col}_diff"].abs() > threshold

    differing_rows = merged[diff_mask]

    return missing_from_new, differing_rows


In [21]:
'''{
        "name": "Unique Viewers",
        "original_path": "../../../../Research Projects/GAM/Digital GAM/2025/Social Media/data/Final Raw/YouTube Unique Viewers.xlsx",
        "new_path": f"../data/processed/YT-/_{gam_info['file_timeinfo']}_uniqueViewer_withAds.csv",
        "column_mapping": {
            "Channel": "Channel ID",
            "YT Service Code": "ServiceID",
            "w/c": "w/c",
            "Unique viewers": "Unique viewers"
        },
        "key_columns": ["Channel ID", "ServiceID", "w/c"],
        "method": "integer",
        "preprocess": {
            "standardize_country": False,
            "week_mapping": False
        }
    },
    {
        "name": "Country Percentage",
        "original_path": "../data/minnie_country_YT_data_2025.csv",
        "new_path": f"../data/processed/YT-/{gam_info['file_timeinfo']}_country.csv",
        "column_mapping": {
            "Channel": "Channel ID",
            "Country": "PlaceID",
            "Date": "w/c",
            "Country %": "country_%"
        },
        "key_columns": ["Channel ID", "w/c", "PlaceID"],
        "method": "percentage",
        "threshold": 0.0001,
        "preprocess": {
            "standardize_country": False,
            "week_mapping": False
        }
    },
    {
        "name": "GNL Weekly",
        "original_path": "../../../../Research Projects/GAM/Digital GAM/2025/Social Media/Output/Weekly/WEEKLY YouTube - GNL by country.xlsx",
        "new_path": "../data/singlePlatform/output/weekly/GAM2025_WEEKLY_YT-_GNLbyCountry.xlsx",
        "column_mapping": {
            "Service Code": "ServiceID",
            "Country Code": "PlaceID",
            "YouTube Engaged Reach": "Reach",
            "w/c": "w/c"
        },
        "key_columns": ["ServiceID", "PlaceID", "w/c"],
        "method": "integer",
        "preprocess": {
            "standardize_country": True,
            "week_mapping": True
        }
    },
    {
        "name": "GNL Annual",
        "original_path": "../../../../Research Projects/GAM/Digital GAM/2025/Social Media/Output/YouTube - (GNL).xlsx",
        "new_path": "../data/singlePlatform/output/GAM2025_YT-_GNL.xlsx",
        "column_mapping": {
            "Service Code": "ServiceID",
            "Country Code": "PlaceID",
            "YouTube Engaged Reach": "Reach"
        },
        "key_columns": ["ServiceID", "PlaceID"],
        "method": "integer",
        "preprocess": {
            "standardize_country": True,
            "week_mapping": False
        }
    },
    {
        "name": "WSL Weekly",
        "original_path": "../../../../Research Projects/GAM/Digital GAM/2025/Social Media/Output/Weekly/WEEKLY YouTube - WSL by country.xlsx",
        "new_path": "../data/singlePlatform/output/weekly/GAM2025_WEEKLY_YT-_WSLbyCountry.xlsx",
        "column_mapping": {
            "Service Code": "ServiceID",
            "Country Code": "PlaceID",
            "YouTube Engaged Reach": "Reach",
            "w/c": "w/c"
        },
        "key_columns": ["ServiceID", "PlaceID", "w/c"],
        "method": "integer",
        "preprocess": {
            "standardize_country": True,
            "week_mapping": True
        }
    },
    {
        "name": "WSL Annual",
        "original_path": "../../../../Research Projects/GAM/Digital GAM/2025/Social Media/Output/YouTube - (WSL).xlsx",
        "new_path": "../data/singlePlatform/output/GAM2025_YT-_WSL.xlsx",
        "column_mapping": {
            "Service Code": "ServiceID",
            "Country Code": "PlaceID",
            "YouTube Engaged Reach": "Reach"
        },
        "key_columns": ["ServiceID", "PlaceID"],
        "method": "integer",
        "preprocess": {
            "standardize_country": True,
            "week_mapping": False
        },
        "notes": [
            "Ser Sin country info was lost in Minnie's workflow but retained in original.",
            "Differences due to averaging vs dividing by number of weeks."
        ]
    },
    '''

# Dataset configuration
datasets = [
    {
        "name": "WOR Weekly",
        "original_path": "../../../../Research Projects/GAM/Digital GAM/2025/Social Media/Output/Weekly/WEEKLY YouTube - WOR by country.xlsx",
        "new_path": "../data/singlePlatform/output/weekly/GAM2025_WEEKLY_YT-_WORbyCountry.xlsx",
        "column_mapping": {
            'Service Code': 'ServiceID', 
            'Country Code': 'PlaceID',
            'YouTube Engaged Reach': 'Reach',
            'w/c': 'w/c'
        },
        "key_columns": ["ServiceID", "PlaceID", "w/c"],
        "method": "integer",
        "preprocess": {
            "standardize_country": True,
            "week_mapping": True
        }
    },
    {
        "name": "WOR Annualy",
        "original_path": "../../../../Research Projects/GAM/Digital GAM/2025/Social Media/Output/YouTube - (WOR).xlsx",
        "new_path": "../data/singlePlatform/output/GAM2025_YT-_Studios.xlsx",
        "column_mapping": {
            'Service Code': 'ServiceID', 
            'Country Code': 'PlaceID',
            'YouTube Engaged Reach': 'Reach',
        },
        "key_columns": ["ServiceID", "PlaceID"],
        "method": "integer",
        "preprocess": {
            "standardize_country": True,
            "week_mapping": False
        }
    },
    {
        "name": "WSE Weekly",
        "original_path": "../../../../Research Projects/GAM/Digital GAM/2025/Social Media/Output/Weekly/WEEKLY YouTube - WSE by country.xlsx",
        "new_path": "../data/singlePlatform/output/weekly/GAM2025_WEEKLY_YT-_WSEbyCountry.xlsx",
        "column_mapping": {
            'Service Code': 'ServiceID', 
            'Country Code': 'PlaceID',
            'YouTube Engaged Reach': 'Reach',
            'w/c': 'w/c'
        },
        "key_columns": ["ServiceID", "PlaceID", "w/c"],
        "method": "integer",
        "preprocess": {
            "standardize_country": True,
            "week_mapping": True
        }
    },
    {
        "name": "WSE Annualy",
        "original_path": "../../../../Research Projects/GAM/Digital GAM/2025/Social Media/Output/YouTube - (WSE).xlsx",
        "new_path": "../data/singlePlatform/output/GAM2025_YT-_WSE.xlsx",
        "column_mapping": {
            'Service Code': 'ServiceID', 
            'Country Code': 'PlaceID',
            'YouTube Engaged Reach': 'Reach',
        },
        "key_columns": ["ServiceID", "PlaceID"],
        "method": "integer",
        "preprocess": {
            "standardize_country": True,
            "week_mapping": False
        }
    },
    {
        "name": "MA- Weekly",
        "original_path": "../test/alteryx_datasets/mk_weekly_MA_YT.csv",
        "new_path": "../data/singlePlatform/output/weekly/GAM2025_WEEKLY_YT-_MA-byCountry.xlsx",
        "column_mapping": {
            'Service Code': 'ServiceID', 
            'Country Code': 'PlaceID',
            'YouTube Engaged Reach': 'Reach',
            'w/c': 'w/c'
        },
        "key_columns": ["ServiceID", "PlaceID", "w/c"],
        "method": "integer",
        "preprocess": {
            "standardize_country": True,
            "week_mapping": True
        }
    },
    {
        "name": "MA- Annualy",
        "original_path": "../test/alteryx_datasets/mk_annualy_MA_YT.csv",
        "new_path": "../data/singlePlatform/output/GAM2025_YT-_MA-.xlsx",
        "column_mapping": {
            'YT Service Code': 'ServiceID', 
            'Country': 'PlaceID',
            'YouTube Engaged Reach': 'Reach',
        },
        "key_columns": ["ServiceID", "PlaceID"],
        "method": "integer",
        "preprocess": {
            "standardize_country": True,
            "week_mapping": False
        }
    },
{
        "name": "FOA Weekly",
        "original_path": "../../../../Research Projects/GAM/Digital GAM/2025/Social Media/Output/Weekly/WEEKLY YouTube - FOA by country.xlsx",
        "new_path": "../data/singlePlatform/output/weekly/GAM2025_WEEKLY_YT-_FOAbyCountry.xlsx",
        "column_mapping": {
            'Service Code': 'ServiceID', 
            'Country Code': 'PlaceID',
            'YouTube Engaged Reach': 'Reach',
            'w/c': 'w/c'
        },
        "key_columns": ["ServiceID", "PlaceID", "w/c"],
        "method": "integer",
        "preprocess": {
            "standardize_country": True,
            "week_mapping": True
        }
    },
    {
        "name": "FOA Annualy",
        "original_path": "../../../../Research Projects/GAM/Digital GAM/2025/Social Media/Output/YouTube - (FOA).xlsx",
        "new_path": "../data/singlePlatform/output/GAM2025_YT-_FOA.xlsx",
        "column_mapping": {
            'Service Code': 'ServiceID', 
            'Country Code': 'PlaceID',
            'YouTube Engaged Reach': 'Reach',
        },
        "key_columns": ["ServiceID", "PlaceID"],
        "method": "integer",
        "preprocess": {
            "standardize_country": True,
            "week_mapping": False
        }
    },
    {
        "name": "AXE Weekly",
        "original_path": "../../../../Research Projects/GAM/Digital GAM/2025/Social Media/Output/Weekly/WEEKLY YouTube - AXE by country.xlsx",
        "new_path": "../data/singlePlatform/output/weekly/GAM2025_WEEKLY_YT-_AXEbyCountry.xlsx",
        "column_mapping": {
            'Service Code': 'ServiceID', 
            'Country Code': 'PlaceID',
            'YouTube Engaged Reach': 'Reach',
            'w/c': 'w/c'
        },
        "key_columns": ["ServiceID", "PlaceID", "w/c"],
        "method": "integer",
        "preprocess": {
            "standardize_country": True,
            "week_mapping": True
        }
    },
    {
        "name": "AXE Annualy",
        "original_path": "../../../../Research Projects/GAM/Digital GAM/2025/Social Media/Output/YouTube - (AXE).xlsx",
        "new_path": "../data/singlePlatform/output/GAM2025_YT-_AXE.xlsx",
        "column_mapping": {
            'Service Code': 'ServiceID', 
            'Country Code': 'PlaceID',
            'YouTube Engaged Reach': 'Reach',
        },
        "key_columns": ["ServiceID", "PlaceID"],
        "method": "integer",
        "preprocess": {
            "standardize_country": True,
            "week_mapping": False
        }
    },
    {
        "name": "AX2 Weekly",
        "original_path": "../../../../Research Projects/GAM/Digital GAM/2025/Social Media/Output/Weekly/WEEKLY YouTube - AX2 by country.xlsx",
        "new_path": "../data/singlePlatform/output/weekly/GAM2025_WEEKLY_YT-_AX2byCountry.xlsx",
        "column_mapping": {
            'Service Code': 'ServiceID', 
            'Country Code': 'PlaceID',
            'YouTube Engaged Reach': 'Reach',
            'w/c': 'w/c'
        },
        "key_columns": ["ServiceID", "PlaceID", "w/c"],
        "method": "integer",
        "preprocess": {
            "standardize_country": True,
            "week_mapping": True
        }
    },
    {
        "name": "AX2 Annualy",
        "original_path": "../../../../Research Projects/GAM/Digital GAM/2025/Social Media/Output/YouTube - (AX2) WS inc FOA.xlsx",
        "new_path": "../data/singlePlatform/output/GAM2025_YT-_AX2.xlsx",
        "column_mapping": {
            'Service Code': 'ServiceID', 
            'Country Code': 'PlaceID',
            'YouTube Engaged Reach': 'Reach',
        },
        "key_columns": ["ServiceID", "PlaceID"],
        "method": "integer",
        "preprocess": {
            "standardize_country": True,
            "week_mapping": False
        }
    },
    {
        "name": "ANW Weekly",
        "original_path": "../../../../Research Projects/GAM/Digital GAM/2025/Social Media/Output/Weekly/WEEKLY YouTube - ANW by country.xlsx",
        "new_path": "../data/singlePlatform/output/weekly/GAM2025_WEEKLY_YT-_ANWbyCountry.xlsx",
        "column_mapping": {
            'Service Code': 'ServiceID', 
            'Country Code': 'PlaceID',
            'YouTube Engaged Reach': 'Reach',
            'w/c': 'w/c'
        },
        "key_columns": ["ServiceID", "PlaceID", "w/c"],
        "method": "integer",
        "preprocess": {
            "standardize_country": True,
            "week_mapping": True
        }
    },
    {
        "name": "ANW Annualy",
        "original_path": "../../../../Research Projects/GAM/Digital GAM/2025/Social Media/Output/YouTube - (ANW) Any WS inc WSE.xlsx",
        "new_path": "../data/singlePlatform/output/GAM2025_YT-_ANW.xlsx",
        "column_mapping": {
            'Service Code': 'ServiceID', 
            'Country Code': 'PlaceID',
            'YouTube Engaged Reach': 'Reach',
        },
        "key_columns": ["ServiceID", "PlaceID"],
        "method": "integer",
        "preprocess": {
            "standardize_country": True,
            "week_mapping": False
        }
    },
    {
        "name": "ANY Weekly",
        "original_path": "../../../../Research Projects/GAM/Digital GAM/2025/Social Media/Output/Weekly/WEEKLY YouTube - ANY by country.xlsx",
        "new_path": "../data/singlePlatform/output/weekly/GAM2025_WEEKLY_YT-_ANYbyCountry.xlsx",
        "column_mapping": {
            'Service Code': 'ServiceID', 
            'Country Code': 'PlaceID',
            'YouTube Engaged Reach': 'Reach',
            'w/c': 'w/c'
        },
        "key_columns": ["ServiceID", "PlaceID", "w/c"],
        "method": "integer",
        "preprocess": {
            "standardize_country": True,
            "week_mapping": True
        }
    },
    {
        "name": "ANY Annualy",
        "original_path": "../../../../Research Projects/GAM/Digital GAM/2025/Social Media/Output/YouTube - (ANY) GNL & Any WS.xlsx",
        "new_path": "../data/singlePlatform/output/GAM2025_YT-_ANY.xlsx",
        "column_mapping": {
            'Service Code': 'ServiceID', 
            'Country Code': 'PlaceID',
            'YouTube Engaged Reach': 'Reach',
        },
        "key_columns": ["ServiceID", "PlaceID"],
        "method": "integer",
        "preprocess": {
            "standardize_country": True,
            "week_mapping": False
        }
    },
    {
        "name": "TOT Weekly",
        "original_path": "../../../../Research Projects/GAM/Digital GAM/2025/Social Media/Output/Weekly/WEEKLY YouTube - TOT by country.xlsx",
        "new_path": "../data/singlePlatform/output/weekly/GAM2025_WEEKLY_YT-_TOTbyCountry.xlsx",
        "column_mapping": {
            'Service Code': 'ServiceID', 
            'Country Code': 'PlaceID',
            'YouTube Engaged Reach': 'Reach',
            'w/c': 'w/c'
        },
        "key_columns": ["ServiceID", "PlaceID", "w/c"],
        "method": "integer",
        "preprocess": {
            "standardize_country": True,
            "week_mapping": True
        }
    },
    {
        "name": "TOT Annualy",
        "original_path": "../../../../Research Projects/GAM/Digital GAM/2025/Social Media/Output/YouTube - (TOT) WS GNL MA by country.xlsx",
        "new_path": "../data/singlePlatform/output/GAM2025_YT-_TOT.xlsx",
        "column_mapping": {
            'Service Code': 'ServiceID', 
            'Country Code': 'PlaceID',
            'YouTube Engaged Reach': 'Reach',
        },
        "key_columns": ["ServiceID", "PlaceID"],
        "method": "integer",
        "preprocess": {
            "standardize_country": True,
            "week_mapping": False
        }
    },  
    {
        "name": "ALL Weekly",
        "original_path": "../../../../Research Projects/GAM/Digital GAM/2025/Social Media/Output/Weekly/WEEKLY YouTube - ALL by country.xlsx",
        "new_path": "../data/singlePlatform/output/weekly/GAM2025_WEEKLY_YT-_ALLbyCountry.xlsx",
        "column_mapping": {
            'Service Code': 'ServiceID', 
            'Country Code': 'PlaceID',
            'YouTube Engaged Reach': 'Reach',
            'w/c': 'w/c'
        },
        "key_columns": ["ServiceID", "PlaceID", "w/c"],
        "method": "integer",
        "preprocess": {
            "standardize_country": True,
            "week_mapping": True
        }
    },
    {
        "name": "ALL Annualy",
        "original_path": "../../../../Research Projects/GAM/Digital GAM/2025/Social Media/Output/Youtube - (ALL) WS GNL MA WOR by country.xlsx",
        "new_path": "../data/singlePlatform/output/GAM2025_YT-_ALL.xlsx",
        "column_mapping": {
            'Service Code': 'ServiceID', 
            'Country Code': 'PlaceID',
            'YouTube Engaged Reach': 'Reach',
        },
        "key_columns": ["ServiceID", "PlaceID"],
        "method": "integer",
        "preprocess": {
            "standardize_country": True,
            "week_mapping": False
        }
    },
    {
        "name": "ENG Weekly",
        "original_path": "../../../../Research Projects/GAM/Digital GAM/2025/Social Media/Output/Weekly/WEEKLY YouTube - ENG by country.xlsx",
        "new_path": "../data/singlePlatform/output/weekly/GAM2025_WEEKLY_YT-_ENGbyCountry.xlsx",
        "column_mapping": {
            'w/c': 'w/c',
            'Service Code': 'ServiceID', 
            'Country Code': 'PlaceID',
            'YouTube Engaged Reach': 'Reach',
        },
        "key_columns": ["w/c", "ServiceID", "PlaceID"],
        "method": "integer",
        "preprocess": {
            "standardize_country": True,
            "week_mapping": True
        }
    },
    {
        "name": "ENG Annualy",
        "original_path": "../../../../Research Projects/GAM/Digital GAM/2025/Social Media/Output/YouTube - (ENG).xlsx",
        "new_path": "../data/singlePlatform/output/GAM2025_YT-_ENG.xlsx",
        "column_mapping": {
            'Service Code': 'ServiceID', 
            'Country Code': 'PlaceID',
            'YouTube Engaged Reach': 'Reach',
        },
        "key_columns": ["ServiceID", "PlaceID"],
        "method": "integer",
        "preprocess": {
            "standardize_country": True,
            "week_mapping": False
        }
    },
    {
        "name": "ENW Weekly",
        "original_path": "../../../../Research Projects/GAM/Digital GAM/2025/Social Media/Output/Weekly/WEEKLY YouTube - ENW by country.xlsx",
        "new_path": "../data/singlePlatform/output/weekly/GAM2025_WEEKLY_YT-_ENWbyCountry.xlsx",
        "column_mapping": {
            'w/c': 'w/c',
            'Service Code': 'ServiceID', 
            'Country Code': 'PlaceID',
            'YouTube Engaged Reach': 'Reach',
        },
        "key_columns": ["w/c", "ServiceID", "PlaceID"],
        "method": "integer",
        "preprocess": {
            "standardize_country": True,
            "week_mapping": True
        }
    },
    {
        "name": "ENW Annualy",
        "original_path": "../../../../Research Projects/GAM/Digital GAM/2025/Social Media/Output/YouTube - (ENW).xlsx",
        "new_path": "../data/singlePlatform/output/GAM2025_YT-_ENW.xlsx",
        "column_mapping": {
            'Service Code': 'ServiceID', 
            'Country Code': 'PlaceID',
            'YouTube Engaged Reach': 'Reach',
        },
        "key_columns": ["ServiceID", "PlaceID"],
        "method": "integer",
        "preprocess": {
            "standardize_country": True,
            "week_mapping": False
        }
    },
    {
        "name": "EN2 Weekly",
        "original_path": "../../../../Research Projects/GAM/Digital GAM/2025/Social Media/Output/Weekly/WEEKLY YouTube - EN2 by country.xlsx",
        "new_path": "../data/singlePlatform/output/weekly/GAM2025_WEEKLY_YT-_EN2byCountry.xlsx",
        "column_mapping": {
            'w/c': 'w/c',
            'Service Code': 'ServiceID', 
            'Country Code': 'PlaceID',
            'YouTube Engaged Reach': 'Reach',
        },
        "key_columns": ["w/c", "ServiceID", "PlaceID"],
        "method": "integer",
        "preprocess": {
            "standardize_country": True,
            "week_mapping": True
        }
    },
    {
        "name": "EN2 Annualy",
        "original_path": "../../../../Research Projects/GAM/Digital GAM/2025/Social Media/Output/YouTube - (EN2).xlsx",
        "new_path": "../data/singlePlatform/output/GAM2025_YT-_EN2.xlsx",
        "column_mapping": {
            'Service Code': 'ServiceID', 
            'Country Code': 'PlaceID',
            'YouTube Engaged Reach': 'Reach',
        },
        "key_columns": ["ServiceID", "PlaceID"],
        "method": "integer",
        "preprocess": {
            "standardize_country": True,
            "week_mapping": False
        }
    },
]

In [22]:

# Execute comparisons
for ds in datasets:
    # TODO - test currently doesn't catch additional things in my dataset that are not in minnie's 
    # e.g. I included Studios for UK / Youtube and Minnie did not - that did not show up here
    print(f"\n--- Processing {ds['name']} ---")

    orig = load_excel(ds["original_path"]) if ds["original_path"].endswith(".xlsx") else load_csv(ds["original_path"])
    new  = load_excel(ds["new_path"]) if ds["new_path"].endswith(".xlsx") else load_csv(ds["new_path"])

    # Special preprocessing for Country Percentage dataset
    if ds["name"] == "Country Percentage":
        
        # Rename 'Country' to 'YouTube Codes' in original data and merge with mapping
        orig = orig.rename(columns={'Country': 'YouTube Codes'})
        orig = orig.merge(country_map, on='YouTube Codes', how='left').drop(columns=['YouTube Codes'])

    if "Country Code" in orig.columns:
        orig = standardize_country_codes(orig)
    if "Country Code" in new.columns:
        new = standardize_country_codes(new)

    # Rename columns according to mapping
    orig = orig.rename(columns={k: v for k, v in ds["column_mapping"].items() if k in orig.columns})
    new  = new.rename(columns={k: v for k, v in ds["column_mapping"].items() if k in new.columns})

    # Special preprocessing for Country Percentage dataset
    if ds['preprocess']['week_mapping']:
        # add w/c using Week Number
        orig = orig.merge(week_map, left_on='Week Number', right_on='WeekNumber_finYear',
                                              how='left').drop(columns=['Week Number', 'WeekNumber_finYear'])

    '''# Special preprocessing for Country Percentage dataset
    if ds["name"] in ["GNL Weekly", "WSL Weekly", "WOR Weekly", 
                      "WSE Weekly", "MA- Weekly", "FOA Weekly", 
                      "AXE Weekly", "AX2 Weekly", "ANW Weekly",
                      "ANY Weekly", "TOT Weekly", "ALL Weekly",
                     ]:
        
        # Rename 'Country' to 'YouTube Codes' in original data and merge with mapping
        orig = orig.merge(week_map, left_on='Week Number', right_on='WeekNumber_finYear',
                                              how='left').drop(columns=['Week Number', 'WeekNumber_finYear'])
    '''
    # Ensure 'w/c' columns are datetime in both DataFrames
    if 'w/c' in orig.columns:
        orig['w/c'] = pd.to_datetime(orig['w/c'], errors='coerce')
    if 'w/c' in new.columns:
        new['w/c'] = pd.to_datetime(new['w/c'], errors='coerce')

    missing, different = run_comparison(
        orig, new,
        ds["column_mapping"],
        ds["key_columns"],
        method=ds.get("method", "integer"),
        threshold=ds.get("threshold", 0.0001)
    )

    print("Rows missing from new:")
    display(missing)
    print("Rows with differences:")
    if len(different) > 0:
        different['diff'] = different['Reach_orig'] - different['Reach_new']
        display(different.sort_values('diff', ascending=False))
    else:
        display(different)


--- Processing WOR Weekly ---
Rows missing from new:


Unnamed: 0,ServiceID,PlaceID,Reach_orig,w/c,Reach_new,_merge
12550,WOR,,,2024-04-01,,left_only
12551,WOR,,,2024-04-08,,left_only
12552,WOR,,,2024-04-15,,left_only
12553,WOR,,,2024-04-22,,left_only
12554,WOR,,,2024-04-29,,left_only
12555,WOR,,,2024-05-06,,left_only
12556,WOR,,,2024-05-13,,left_only
12557,WOR,,,2024-05-20,,left_only
12558,WOR,,,2024-05-27,,left_only
12559,WOR,,,2024-06-03,,left_only


Rows with differences:


Unnamed: 0,ServiceID,PlaceID,Reach_orig,w/c,Reach_new,_merge



--- Processing WOR Annualy ---
Rows missing from new:


Unnamed: 0,ServiceID,PlaceID,Reach_orig,Reach_new,_merge
247,WOR,,0,,left_only


Rows with differences:


Unnamed: 0,ServiceID,PlaceID,Reach_orig,Reach_new,_merge,diff
190,WOR,SGS,0,1,both,-1



--- Processing WSE Weekly ---
Rows missing from new:


Unnamed: 0,ServiceID,PlaceID,Reach_orig,w/c,Reach_new,_merge


Rows with differences:


Unnamed: 0,ServiceID,PlaceID,Reach_orig,w/c,Reach_new,_merge



--- Processing WSE Annualy ---
Rows missing from new:


Unnamed: 0,ServiceID,PlaceID,Reach_orig,Reach_new,_merge


Rows with differences:


Unnamed: 0,ServiceID,PlaceID,Reach_orig,Reach_new,_merge,diff
238,WSE,VAT,0,1,both,-1



--- Processing MA- Weekly ---
Rows missing from new:


Unnamed: 0,ServiceID,PlaceID,Reach_orig,w/c,Reach_new,_merge
3068,MA-,Total,66566,2024-04-01,,left_only
3069,MA-,Total,53222,2024-04-08,,left_only
3070,MA-,Total,70504,2024-04-15,,left_only
3071,MA-,Total,69262,2024-04-22,,left_only
3072,MA-,Total,92958,2024-04-29,,left_only
3073,MA-,Total,72930,2024-05-06,,left_only
3074,MA-,Total,83665,2024-05-13,,left_only
3075,MA-,Total,84758,2024-05-20,,left_only
3076,MA-,Total,76328,2024-05-27,,left_only
3077,MA-,Total,187175,2024-06-03,,left_only


Rows with differences:


Unnamed: 0,ServiceID,PlaceID,Reach_orig,w/c,Reach_new,_merge



--- Processing MA- Annualy ---
Rows missing from new:


Unnamed: 0,ServiceID,PlaceID,Reach_orig,Reach_new,_merge
0,MA-,* Total,141082,,left_only


Rows with differences:


Unnamed: 0,ServiceID,PlaceID,Reach_orig,Reach_new,_merge



--- Processing FOA Weekly ---
Rows missing from new:


Unnamed: 0,ServiceID,PlaceID,Reach_orig,w/c,Reach_new,_merge


Rows with differences:


Unnamed: 0,ServiceID,PlaceID,Reach_orig,w/c,Reach_new,_merge



--- Processing FOA Annualy ---
Rows missing from new:


Unnamed: 0,ServiceID,PlaceID,Reach_orig,Reach_new,_merge


Rows with differences:


Unnamed: 0,ServiceID,PlaceID,Reach_orig,Reach_new,_merge,diff
46,FOA,CHR,0,1,both,-1
189,FOA,SGS,0,1,both,-1
217,FOA,TK,0,1,both,-1



--- Processing AXE Weekly ---
Rows missing from new:


Unnamed: 0,ServiceID,PlaceID,Reach_orig,w/c,Reach_new,_merge
12442,AXE,,19566,2024-04-01,,left_only
12443,AXE,,12305,2024-04-08,,left_only
12444,AXE,,46857,2024-04-15,,left_only
12445,AXE,,46758,2024-04-22,,left_only
12446,AXE,,40455,2024-04-29,,left_only
12447,AXE,,94361,2024-05-06,,left_only
12448,AXE,,40575,2024-05-13,,left_only
12449,AXE,,58900,2024-05-20,,left_only
12450,AXE,,36068,2024-05-27,,left_only
12451,AXE,,26795,2024-06-03,,left_only


Rows with differences:


Unnamed: 0,ServiceID,PlaceID,Reach_orig,w/c,Reach_new,_merge



--- Processing AXE Annualy ---
Rows missing from new:


Unnamed: 0,ServiceID,PlaceID,Reach_orig,Reach_new,_merge
247,AXE,,38490,,left_only


Rows with differences:


Unnamed: 0,ServiceID,PlaceID,Reach_orig,Reach_new,_merge,diff
226,AXE,Total,47095129,47056634,both,38495
46,AXE,CHR,1,0,both,1
160,AXE,NRK,1,0,both,1
189,AXE,SGS,1,0,both,1
238,AXE,VAT,1,0,both,1
242,AXE,WFI,3,2,both,1



--- Processing AX2 Weekly ---
Rows missing from new:


Unnamed: 0,ServiceID,PlaceID,Reach_orig,w/c,Reach_new,_merge
12510,AX2,,19566,2024-04-01,,left_only
12511,AX2,,12305,2024-04-08,,left_only
12512,AX2,,46857,2024-04-15,,left_only
12513,AX2,,46758,2024-04-22,,left_only
12514,AX2,,40455,2024-04-29,,left_only
12515,AX2,,94361,2024-05-06,,left_only
12516,AX2,,40575,2024-05-13,,left_only
12517,AX2,,58900,2024-05-20,,left_only
12518,AX2,,36068,2024-05-27,,left_only
12519,AX2,,26795,2024-06-03,,left_only


Rows with differences:


Unnamed: 0,ServiceID,PlaceID,Reach_orig,w/c,Reach_new,_merge,diff
3262,AX2,EGY,429879,2024-07-29,429878,both,1
5072,AX2,IND,18110495,2024-05-20,18110494,both,1



--- Processing AX2 Annualy ---
Rows missing from new:


Unnamed: 0,ServiceID,PlaceID,Reach_orig,Reach_new,_merge
248,AX2,,37750,,left_only


Rows with differences:


Unnamed: 0,ServiceID,PlaceID,Reach_orig,Reach_new,_merge,diff
227,AX2,Total,47721057,47683308,both,37749



--- Processing ANW Weekly ---
Rows missing from new:


Unnamed: 0,ServiceID,PlaceID,Reach_orig,w/c,Reach_new,_merge
12584,ANW,,19566,2024-04-01,,left_only
12585,ANW,,12305,2024-04-08,,left_only
12586,ANW,,46857,2024-04-15,,left_only
12587,ANW,,46758,2024-04-22,,left_only
12588,ANW,,40455,2024-04-29,,left_only
12589,ANW,,94361,2024-05-06,,left_only
12590,ANW,,40575,2024-05-13,,left_only
12591,ANW,,58900,2024-05-20,,left_only
12592,ANW,,36068,2024-05-27,,left_only
12593,ANW,,26795,2024-06-03,,left_only


Rows with differences:


Unnamed: 0,ServiceID,PlaceID,Reach_orig,w/c,Reach_new,_merge,diff
715,ANW,AUS,293555,2024-12-30,293554,both,1
870,ANW,BAN,1792265,2024-12-23,1792264,both,1
2045,ANW,CAN,487222,2024-07-29,487221,both,1
2072,ANW,CAN,636626,2025-02-03,636625,both,1
2073,ANW,CAN,621037,2025-02-10,621036,both,1
4306,ANW,GRE,31888,2024-07-15,31887,both,1
5102,ANW,IND,13637958,2024-11-04,13637957,both,1
5114,ANW,IND,19342822,2025-01-27,19342821,both,1
5116,ANW,IND,20236022,2025-02-10,20236021,both,1
5117,ANW,IND,15622104,2025-02-17,15622103,both,1



--- Processing ANW Annualy ---
Rows missing from new:


Unnamed: 0,ServiceID,PlaceID,Reach_orig,Reach_new,_merge
248,ANW,,37750,,left_only


Rows with differences:


Unnamed: 0,ServiceID,PlaceID,Reach_orig,Reach_new,_merge,diff
227,ANW,Total,49214990,49177241,both,37749



--- Processing ANY Weekly ---
Rows missing from new:


Unnamed: 0,ServiceID,PlaceID,Reach_orig,w/c,Reach_new,_merge
12669,ANY,,19566,2024-04-01,,left_only
12670,ANY,,12305,2024-04-08,,left_only
12671,ANY,,46857,2024-04-15,,left_only
12672,ANY,,46758,2024-04-22,,left_only
12673,ANY,,40455,2024-04-29,,left_only
12674,ANY,,94361,2024-05-06,,left_only
12675,ANY,,40575,2024-05-13,,left_only
12676,ANY,,58900,2024-05-20,,left_only
12677,ANY,,36068,2024-05-27,,left_only
12678,ANY,,26795,2024-06-03,,left_only


Rows with differences:


Unnamed: 0,ServiceID,PlaceID,Reach_orig,w/c,Reach_new,_merge,diff
707,ANY,AUS,751045,2024-11-04,751044,both,1
1619,ANY,BRA,1368522,2024-05-20,1368521,both,1
2036,ANY,CAN,739510,2024-05-27,739509,both,1
2040,ANY,CAN,698034,2024-06-24,698033,both,1
2054,ANY,CAN,939441,2024-09-30,939440,both,1
3868,ANY,FIN,59331,2024-08-05,59330,both,1
4199,ANY,GER,1242763,2024-12-16,1242762,both,1
4904,ANY,HK,567607,2024-07-08,567606,both,1
5117,ANY,IND,17020550,2024-08-12,17020549,both,1
5120,ANY,IND,15618738,2024-09-02,15618737,both,1



--- Processing ANY Annualy ---
Rows missing from new:


Unnamed: 0,ServiceID,PlaceID,Reach_orig,Reach_new,_merge
248,ANY,,37750,,left_only
249,ANY,,57449433,,left_only


Rows with differences:


Unnamed: 0,ServiceID,PlaceID,Reach_orig,Reach_new,_merge



--- Processing TOT Weekly ---
Rows missing from new:


Unnamed: 0,ServiceID,PlaceID,Reach_orig,w/c,Reach_new,_merge


Rows with differences:


Unnamed: 0,ServiceID,PlaceID,Reach_orig,w/c,Reach_new,_merge,diff
5163,TOT,INO,4212752,2024-07-01,734479,both,3478273
5179,TOT,INO,3383714,2024-10-21,1240250,both,2143464
5155,TOT,INO,3424182,2024-05-06,1471208,both,1952974
5154,TOT,INO,3104940,2024-04-29,1390596,both,1714344
5178,TOT,INO,2314827,2024-10-14,775412,both,1539415
...,...,...,...,...,...,...,...
7927,TOT,NEP,374314,2024-08-26,409188,both,-34874
7928,TOT,NEP,334150,2024-09-02,376447,both,-42297
5160,TOT,INO,482141,2024-06-10,577068,both,-94927
5159,TOT,INO,585143,2024-06-03,684623,both,-99480



--- Processing TOT Annualy ---
Rows missing from new:


Unnamed: 0,ServiceID,PlaceID,Reach_orig,Reach_new,_merge


Rows with differences:


Unnamed: 0,ServiceID,PlaceID,Reach_orig,Reach_new,_merge,diff
227,TOT,Total,58126391,57552034,both,574357
100,TOT,INO,1470503,944436,both,526067
48,TOT,CMB,72681,38241,both,34440
129,TOT,MAL,339408,328852,both,10556
192,TOT,SIN,243551,242906,both,645
214,TOT,TAI,416707,416103,both,604
235,TOT,USA,3650750,3650437,both,313
108,TOT,JAP,1034194,1033891,both,303
185,TOT,SAU,876087,875799,both,288
153,TOT,NEP,400480,400207,both,273



--- Processing ALL Weekly ---
Rows missing from new:


Unnamed: 0,ServiceID,PlaceID,Reach_orig,w/c,Reach_new,_merge


Rows with differences:


Unnamed: 0,ServiceID,PlaceID,Reach_orig,w/c,Reach_new,_merge,diff
5194,ALL,INO,4740326,2024-07-01,1270869,both,3469457
5210,ALL,INO,4149460,2024-10-21,2013851,both,2135609
5186,ALL,INO,4026366,2024-05-06,2079021,both,1947345
5185,ALL,INO,3784267,2024-04-29,2075488,both,1708779
5209,ALL,INO,3383347,2024-10-14,1851764,both,1531583
...,...,...,...,...,...,...,...
7960,ALL,NEP,414045,2024-08-26,448855,both,-34810
7961,ALL,NEP,398624,2024-09-02,440794,both,-42170
5191,ALL,INO,1279813,2024-06-10,1374383,both,-94570
5190,ALL,INO,2006742,2024-06-03,2105554,both,-98812



--- Processing ALL Annualy ---
Rows missing from new:


Unnamed: 0,ServiceID,PlaceID,Reach_orig,Reach_new,_merge


Rows with differences:


Unnamed: 0,ServiceID,PlaceID,Reach_orig,Reach_new,_merge,diff
228,ALL,Total,94141336,93568777,both,572559
100,ALL,INO,2096863,1572331,both,524532
48,ALL,CMB,108624,74272,both,34352
130,ALL,MAL,740446,730026,both,10420
193,ALL,SIN,351220,350588,both,632
215,ALL,TAI,490615,490013,both,602
108,ALL,JAP,1209367,1209065,both,302
236,ALL,USA,14109905,14109604,both,301
186,ALL,SAU,1068354,1068068,both,286
154,ALL,NEP,461526,461255,both,271



--- Processing ENG Weekly ---
Rows missing from new:


Unnamed: 0,w/c,ServiceID,PlaceID,Reach_orig,Reach_new,_merge


Rows with differences:


Unnamed: 0,w/c,ServiceID,PlaceID,Reach_orig,Reach_new,_merge,diff
356,2024-04-08,ENG,KRG,4352,4351,both,1
1572,2024-05-13,ENG,KRG,3393,3392,both,1
4244,2024-07-29,ENG,KRG,2812,2811,both,1
8615,2024-12-02,ENG,KRG,7288,7287,both,1
9100,2024-12-16,ENG,KRG,4968,4967,both,1
9826,2025-01-06,ENG,KRG,6658,6657,both,1
10068,2025-01-13,ENG,KRG,5702,5701,both,1
12255,2025-03-17,ENG,KRG,3464,3463,both,1
12495,2025-03-24,ENG,KRG,5620,5619,both,1



--- Processing ENG Annualy ---
Rows missing from new:


Unnamed: 0,ServiceID,PlaceID,Reach_orig,Reach_new,_merge


Rows with differences:


Unnamed: 0,ServiceID,PlaceID,Reach_orig,Reach_new,_merge,diff
38,ENG,CAM,4416,4415,both,1
47,ENG,CHR,2,1,both,1
116,ENG,KRG,3632,3631,both,1
225,ENG,TUR,56855,56854,both,1
52,ENG,COO,192,193,both,-1
174,ENG,POR,34012,34013,both,-1
217,ENG,TIM,1292,1293,both,-1



--- Processing ENW Weekly ---
Rows missing from new:


Unnamed: 0,w/c,ServiceID,PlaceID,Reach_orig,Reach_new,_merge


Rows with differences:


Unnamed: 0,w/c,ServiceID,PlaceID,Reach_orig,Reach_new,_merge



--- Processing ENW Annualy ---
Rows missing from new:


Unnamed: 0,ServiceID,PlaceID,Reach_orig,Reach_new,_merge


Rows with differences:


Unnamed: 0,ServiceID,PlaceID,Reach_orig,Reach_new,_merge,diff
190,ENW,SGS,0,1,both,-1
239,ENW,VAT,0,1,both,-1



--- Processing EN2 Weekly ---
Rows missing from new:


Unnamed: 0,w/c,ServiceID,PlaceID,Reach_orig,Reach_new,_merge


Rows with differences:


Unnamed: 0,w/c,ServiceID,PlaceID,Reach_orig,Reach_new,_merge



--- Processing EN2 Annualy ---
Rows missing from new:


Unnamed: 0,ServiceID,PlaceID,Reach_orig,Reach_new,_merge


Rows with differences:


Unnamed: 0,ServiceID,PlaceID,Reach_orig,Reach_new,_merge


In [37]:

# Execute comparisons
for ds in datasets:
    # TODO - test currently doesn't catch additional things in my dataset that are not in minnie's 
    # e.g. I included Studios for UK / Youtube and Minnie did not - that did not show up here
    print(f"\n--- Processing {ds['name']} ---")

    orig = load_excel(ds["original_path"]) if ds["original_path"].endswith(".xlsx") else load_csv(ds["original_path"])
    new  = load_excel(ds["new_path"]) if ds["new_path"].endswith(".xlsx") else load_csv(ds["new_path"])

    # Special preprocessing for Country Percentage dataset
    if ds["name"] == "Country Percentage":
        
        # Rename 'Country' to 'YouTube Codes' in original data and merge with mapping
        orig = orig.rename(columns={'Country': 'YouTube Codes'})
        orig = orig.merge(country_map, on='YouTube Codes', how='left').drop(columns=['YouTube Codes'])

    if "Country Code" in orig.columns:
        orig = standardize_country_codes(orig)
    if "Country Code" in new.columns:
        new = standardize_country_codes(new)

    # Rename columns according to mapping
    orig = orig.rename(columns={k: v for k, v in ds["column_mapping"].items() if k in orig.columns})
    new  = new.rename(columns={k: v for k, v in ds["column_mapping"].items() if k in new.columns})

    # Special preprocessing for Country Percentage dataset
    if ds['preprocess']['week_mapping']:
        # add w/c using Week Number
        orig = orig.merge(week_map, left_on='Week Number', right_on='WeekNumber_finYear',
                                              how='left').drop(columns=['Week Number', 'WeekNumber_finYear'])

    '''# Special preprocessing for Country Percentage dataset
    if ds["name"] in ["GNL Weekly", "WSL Weekly", "WOR Weekly", 
                      "WSE Weekly", "MA- Weekly", "FOA Weekly", 
                      "AXE Weekly", "AX2 Weekly", "ANW Weekly",
                      "ANY Weekly", "TOT Weekly", "ALL Weekly",
                     ]:
        
        # Rename 'Country' to 'YouTube Codes' in original data and merge with mapping
        orig = orig.merge(week_map, left_on='Week Number', right_on='WeekNumber_finYear',
                                              how='left').drop(columns=['Week Number', 'WeekNumber_finYear'])
    '''
    # Ensure 'w/c' columns are datetime in both DataFrames
    if 'w/c' in orig.columns:
        orig['w/c'] = pd.to_datetime(orig['w/c'], errors='coerce')
    if 'w/c' in new.columns:
        new['w/c'] = pd.to_datetime(new['w/c'], errors='coerce')

    missing, different = run_comparison(
        orig, new,
        ds["column_mapping"],
        ds["key_columns"],
        method=ds.get("method", "integer"),
        threshold=ds.get("threshold", 0.0001)
    )

    print("Rows missing from new:")
    display(missing)
    print("Rows with differences:")
    if len(different) > 0:
        different['diff'] = different['Reach_orig'] - different['Reach_new']
        display(different.sort_values('diff', ascending=False))
    else:
        display(different)


--- Processing EN2 Annualy ---
Rows missing from new:


Unnamed: 0,ServiceID,PlaceID,Reach_orig,Reach_new,_merge


Rows with differences:


Unnamed: 0,ServiceID,PlaceID,Reach_orig,Reach_new,_merge


In [26]:
new.shape

(12686, 6)

In [25]:
orig.head()

Unnamed: 0,PlaceID,Platform,Reach,ServiceID,YearGAE,w/c
0,AFG,YT-,11534.765369,EN2,2020,2024-04-01
1,AFG,YT-,12087.009057,EN2,2020,2024-04-08
2,AFG,YT-,11685.018613,EN2,2020,2024-04-15
3,AFG,YT-,10129.126149,EN2,2020,2024-04-22
4,AFG,YT-,10513.376384,EN2,2020,2024-04-29
