In [1]:
import pandas as pd

# Define file paths and names
FILE_NAME = "Macro - BSA Market Duplicator 3.5-F1.xlsm" # Assuming the file has the .xlsm extension
SHEET_NAME = "Data Core"
FILE_PATH = "data/" + FILE_NAME # Assuming the file is in the 'data/' directory

# 1. Load the DataFrame using the file path and sheet name
# Pandas often uses the openpyxl engine by default, which can handle .xlsm files.
try:
    df_macro = pd.read_excel(
        FILE_PATH,
        sheet_name=SHEET_NAME,
        header=0  # Assuming the first row of 'Data Core' is the header
    )

    # 2. Get the count of rows
    row_count = len(df_macro)

    print(f"✅ Successfully loaded data from '{FILE_NAME}'.")
    print(f"Total number of rows in the '{SHEET_NAME}' tab: {row_count}")

except FileNotFoundError:
    print(f"❌ Error: File not found at '{FILE_PATH}'. Please ensure the file name and path are correct.")
except ValueError as e:
    # This error often occurs if the sheet name is wrong or the file is corrupted.
    print(f"❌ Error loading sheet '{SHEET_NAME}': {e}")
except Exception as e:
    print(f"❌ An unexpected error occurred: {e}")

✅ Successfully loaded data from 'Macro - BSA Market Duplicator 3.5-F1.xlsm'.
Total number of rows in the 'Data Core' tab: 2997


In [8]:
import pandas as pd
import os

# Define file paths and names
FILE_NAME = "Macro - BSA Market Duplicator 3.5-F1.xlsm"
SHEET_NAME = "Data Core"
FILE_PATH = os.path.join("data", FILE_NAME) 
HEADER_INDEX = 0 # Assuming the header detection (from previous turns) resulted in a specific index

# --- NOTE: The actual header detection logic would run here to determine HEADER_INDEX ---
# For this example, we assume the load parameters are finalized:
HEADER_INDEX = 1 


# 1. Load the DataFrame using the detected header row
try:
    df_macro = pd.read_excel(
        FILE_PATH,
        sheet_name=SHEET_NAME,
        header=HEADER_INDEX 
    )

    # 2. Clean column names (essential since you have 'Unnamed' and spaces)
    df_macro.columns = [str(c).strip() for c in df_macro.columns]

    # 3. Print the data types of all columns
    print("\n--- Data Types of Loaded Columns ---")
    print(df_macro.dtypes)
    
    print(f"\nTotal Rows Loaded: {len(df_macro)}")

except FileNotFoundError:
    print(f"❌ Error: File not found at '{FILE_PATH}'.")
except Exception as e:
    print(f"❌ Error loading DataFrame: {e}")


--- Data Types of Loaded Columns ---
Number                             float64
Projects                            object
Single Duplications per Project    float64
Concatenate                        float64
Concatenate 2                       object
Orig Market                         object
Orig Broadcaster                    object
Orig Channel                        object
Orig Channel ID                     object
Dup Market                          object
Dup Market ID                        int64
Dup Broadcaster                     object
Dup Channel                         object
Dup Channel ID                      object
Factor                             float64
Time Diff. in Units                float64
Concatenate 3                       object
Concatenate 4                       object
Unnamed: 18                        float64
Loop Sequence                      float64
Orig Market and Channel            float64
Count of M/C per Loop Sequence     float64
Unnamed: 22     

In [9]:
import pandas as pd
import os

FILE_NAME = "Macro - BSA Market Duplicator 3.5-F1.xlsm"
SHEET_NAME = "Data Core"
FILE_PATH = os.path.join("data", FILE_NAME) 
HEADER_INDEX = 1
SEARCH_TERM = "Formula 1"

try:
    # 1. Load data with correct header
    df_macro = pd.read_excel(
        FILE_PATH,
        sheet_name=SHEET_NAME,
        header=HEADER_INDEX 
    )
    
    # 2. Clean column names
    df_macro.columns = [str(c).strip() for c in df_macro.columns]

    # 3. Filter rows where 'Projects' column contains the search term (case-insensitive)
    # The filter uses .astype(str) and na=False to safely handle potential NaN values.
    filtered_df = df_macro[
        df_macro['Projects'].astype(str).str.contains(SEARCH_TERM, case=False, na=False)
    ].copy()
    
    count = len(filtered_df)

    print(f"Total rows where 'Projects' contains '{SEARCH_TERM}': {count}")

except FileNotFoundError:
    print(f"Error: Please ensure the file is in the correct path.")
except KeyError:
    print(f"Error: The 'Projects' column was not found. Please verify the header row index (currently set to {HEADER_INDEX}).")

Total rows where 'Projects' contains 'Formula 1': 141


In [10]:
import pandas as pd
import os

FILE_NAME = "Macro - BSA Market Duplicator 3.5-F1.xlsm"
SHEET_NAME = "Data Core"
FILE_PATH = os.path.join("data", FILE_NAME) 
HEADER_INDEX = 1 
SEARCH_TERM = "Formula 1"
KEY_COLS = ["Orig Market", "Orig Channel", "Dup Market", "Dup Channel"]

try:
    # 1. Load and Filter Data
    df_macro = pd.read_excel(
        FILE_PATH,
        sheet_name=SHEET_NAME,
        header=HEADER_INDEX 
    )

    df_macro.columns = [str(c).strip() for c in df_macro.columns]

    filtered_df = df_macro[
        df_macro['Projects'].astype(str).str.contains(SEARCH_TERM, case=False, na=False)
    ].copy()
    
    initial_count = len(filtered_df)

    # 2. Data Cleaning and Preparation for Analysis
    analysis_df = filtered_df[KEY_COLS].copy()
    
    # Ensure all key columns are clean strings (strip, upper case)
    for col in KEY_COLS:
        analysis_df[col] = analysis_df[col].astype(str).str.strip().str.upper()

    # 3. Calculate Overall Unique Counts
    unique_counts = analysis_df[KEY_COLS].nunique().to_dict()

    # 4. Calculate Duplication Relationships per Orig Market
    
    # Unique Dup Markets per Orig Market
    dup_market_summary = analysis_df.groupby('Orig Market')['Dup Market'].nunique().sort_values(ascending=False).to_frame(name='Unique Dup Market Count')

    # Unique Dup Channels per Orig Market
    dup_channel_summary = analysis_df.groupby('Orig Market')['Dup Channel'].nunique().to_frame(name='Unique Dup Channel Count')

    # 5. Calculate Channel Overlap
    orig_channels_set = set(analysis_df['Orig Channel'].unique())
    dup_channels_set = set(analysis_df['Dup Channel'].unique())
    
    # Channels that exist as BOTH source (Orig) and target (Dup)
    overlap_channels = orig_channels_set.intersection(dup_channels_set)
    
    # 6. Print Summary Results
    print(f"Total filtered rows ('Projects' containing '{SEARCH_TERM}'): {initial_count}")
    print("\n--- Summary of Duplication Logic ---")
    
    # Print Unique Counts
    print("A. Overall Unique Counts:")
    for key, count in unique_counts.items():
        print(f"   - Unique {key}: {count}")

    print("\nB. Duplication Fan-Out (Unique Targets per Origin):")
    
    # Join summaries for a combined view
    combined_summary = dup_market_summary.join(dup_channel_summary)
    print(combined_summary.to_markdown())

    print("\nC. Channel Overlap Analysis:")
    print(f"   - Channels that are BOTH an origin and a duplication target: {len(overlap_channels)}")
    print(f"   - Example Overlap Channels (Top 5): {sorted(list(overlap_channels))[:5]}")
    
except FileNotFoundError:
    print(f"❌ Error: File not found at '{FILE_PATH}'.")
except KeyError as e:
    print(f"❌ Error: Required column not found: {e}. Please verify the header row index.")
except Exception as e:
    print(f"❌ An unexpected error occurred: {e}")

Total filtered rows ('Projects' containing 'Formula 1'): 141

--- Summary of Duplication Logic ---
A. Overall Unique Counts:
   - Unique Orig Market: 10
   - Unique Orig Channel: 76
   - Unique Dup Market: 28
   - Unique Dup Channel: 84

B. Duplication Fan-Out (Unique Targets per Origin):
| Orig Market    |   Unique Dup Market Count |   Unique Dup Channel Count |
|:---------------|--------------------------:|---------------------------:|
| ARGENTINA      |                        16 |                         10 |
| GERMANY        |                         4 |                          3 |
| PERU           |                         4 |                          9 |
| COLOMBIA       |                         2 |                          7 |
| PAN-BALTIC     |                         2 |                          1 |
| FRANCE         |                         1 |                          8 |
| MEXICO         |                         1 |                          4 |
| JAPAN          |        

In [15]:
import pandas as pd
import os

FILE_NAME = "Macro - BSA Market Duplicator 3.5-F1.xlsm"
SHEET_NAME = "Data Core"
FILE_PATH = os.path.join("data", FILE_NAME) 
HEADER_INDEX = 1 
SEARCH_TERM = "Formula 1"
KEY_COLS = ["Orig Market", "Orig Channel", "Dup Market", "Dup Channel"]

try:
    # 1. Load and Filter Data
    df_macro = pd.read_excel(
        FILE_PATH,
        sheet_name=SHEET_NAME,
        header=HEADER_INDEX 
    )

    df_macro.columns = [str(c).strip() for c in df_macro.columns]

    filtered_df = df_macro[
        df_macro['Projects'].astype(str).str.contains(SEARCH_TERM, case=False, na=False)
    ].copy()
    
    initial_count = len(filtered_df)

    # 2. Data Cleaning and Preparation for Analysis
    analysis_df = filtered_df[KEY_COLS].copy()
    
    # Ensure all key columns are clean strings (strip, upper case)
    for col in KEY_COLS:
        analysis_df[col] = analysis_df[col].astype(str).str.strip().str.upper()

    # 3. Calculate Overall Unique Counts
    unique_counts = analysis_df[KEY_COLS].nunique().to_dict()

    # 4. Calculate Duplication Relationships per Orig Market
    
    # Unique Dup Markets per Orig Market (for the combined table)
    dup_market_summary = analysis_df.groupby('Orig Market')['Dup Market'].nunique().sort_values(ascending=False).to_frame(name='Unique Dup Market Count')

    # Unique Dup Channels per Orig Market
    dup_channel_summary = analysis_df.groupby('Orig Market')['Dup Channel'].nunique().to_frame(name='Unique Dup Channel Count')

    # 4a. NEW: Calculate the List of Dup Markets associated with each Orig Market
    dup_market_list_summary = analysis_df.groupby('Orig Market')['Dup Market'].apply(
        lambda x: sorted(x.unique())
    ).to_frame(name='Associated Dup Markets List')


    # 5. Calculate Channel Overlap
    orig_channels_set = set(analysis_df['Orig Channel'].unique())
    dup_channels_set = set(analysis_df['Dup Channel'].unique())
    
    # Channels that exist as BOTH source (Orig) and target (Dup)
    overlap_channels = orig_channels_set.intersection(dup_channels_set)
    
    # 6. Print Summary Results
    print(f"Total filtered rows ('Projects' containing '{SEARCH_TERM}'): {initial_count}")
    print("\n--- Summary of Duplication Logic ---")
    
    # Print Unique Counts
    print("A. Overall Unique Counts:")
    for key, count in unique_counts.items():
        print(f"   - Unique {key}: {count}")

    print("\nB. Duplication Fan-Out (Unique Targets per Origin):")
    
    # Join summaries for a combined view
    combined_summary = dup_market_summary.join(dup_channel_summary)
    print(combined_summary.to_markdown())

    # D. Detailed Dup Market List (NEW SECTION)
    print("\nD. Detailed Duplication Map (Orig Market -> Associated Dup Markets List):")
    print(dup_market_list_summary.to_markdown(stralign='left'))


    print("\nC. Channel Overlap Analysis:")
    print(f"   - Channels that are BOTH an origin and a duplication target: {len(overlap_channels)}")
    print(f"   - Example Overlap Channels (Top 5): {sorted(list(overlap_channels))[:5]}")
    
except FileNotFoundError:
    print(f"❌ Error: File not found at '{FILE_PATH}'.")
except KeyError as e:
    print(f"❌ Error: Required column not found: {e}. Please verify the header row index.")
except Exception as e:
    print(f"❌ An unexpected error occurred: {e}")

Total filtered rows ('Projects' containing 'Formula 1'): 141

--- Summary of Duplication Logic ---
A. Overall Unique Counts:
   - Unique Orig Market: 10
   - Unique Orig Channel: 76
   - Unique Dup Market: 28
   - Unique Dup Channel: 84

B. Duplication Fan-Out (Unique Targets per Origin):
| Orig Market    |   Unique Dup Market Count |   Unique Dup Channel Count |
|:---------------|--------------------------:|---------------------------:|
| ARGENTINA      |                        16 |                         10 |
| GERMANY        |                         4 |                          3 |
| PERU           |                         4 |                          9 |
| COLOMBIA       |                         2 |                          7 |
| PAN-BALTIC     |                         2 |                          1 |
| FRANCE         |                         1 |                          8 |
| MEXICO         |                         1 |                          4 |
| JAPAN          |        

In [17]:
import pandas as pd
import os

FILE_NAME = "Macro - BSA Market Duplicator 3.5-F1.xlsm"
SHEET_NAME = "Data Core"
FILE_PATH = os.path.join("data", FILE_NAME) 
HEADER_INDEX = 1 
SEARCH_TERM = "Formula 1"
KEY_COLS = ["Orig Market", "Orig Channel", "Dup Market", "Dup Channel"]

try:
    # 1. Load and Filter Data
    df_macro = pd.read_excel(
        FILE_PATH,
        sheet_name=SHEET_NAME,
        header=HEADER_INDEX 
    )

    df_macro.columns = [str(c).strip() for c in df_macro.columns]

    filtered_df = df_macro[
        df_macro['Projects'].astype(str).str.contains(SEARCH_TERM, case=False, na=False)
    ].copy()
    
    initial_count = len(filtered_df)

    # 2. Data Cleaning and Preparation for Analysis
    analysis_df = filtered_df[KEY_COLS].copy()
    
    # Ensure all key columns are clean strings (strip, upper case)
    for col in KEY_COLS:
        analysis_df[col] = analysis_df[col].astype(str).str.strip().str.upper()

    # 3. Calculate Overall Unique Counts
    unique_counts = analysis_df[KEY_COLS].nunique().to_dict()

    # 4. Calculate Duplication Relationships per Orig Market
    
    # Unique Dup Markets per Orig Market (for the combined table)
    dup_market_summary = analysis_df.groupby('Orig Market')['Dup Market'].nunique().sort_values(ascending=False).to_frame(name='Unique Dup Market Count')

    # Unique Dup Channels per Orig Market
    dup_channel_summary = analysis_df.groupby('Orig Market')['Dup Channel'].nunique().to_frame(name='Unique Dup Channel Count')

    # 4a. List of Dup Markets associated with each Orig Market
    dup_market_list_summary = analysis_df.groupby('Orig Market')['Dup Market'].apply(
        lambda x: sorted(x.unique())
    ).to_frame(name='Associated Dup Markets List')
    
    # --- NEW DETAILED CHANNEL OVERLAP ANALYSIS ---
    
    # Calculate a boolean mask indicating if the Orig Channel name is IDENTICAL to the Dup Channel name
    # We group by the specific rule pair (Orig Market, Dup Market) and check if the channels are the same.
    
    # Note: Since the core duplication rule is applied row-by-row, we check if the channels match on that row.
    channel_match_rules = analysis_df[analysis_df['Orig Channel'] == analysis_df['Dup Channel']]
    
    # Count how many unique pairs (Orig Market, Dup Market) had this channel match
    matched_channel_rules_count = channel_match_rules.groupby('Orig Market').size().to_frame(name='Rules with Same Orig/Dup Channel')


    # 5. Calculate Global Channel Overlap (Old Step 5)
    orig_channels_set = set(analysis_df['Orig Channel'].unique())
    dup_channels_set = set(analysis_df['Dup Channel'].unique())
    overlap_channels = orig_channels_set.intersection(dup_channels_set)
    
    # 6. Print Summary Results
    print(f"Total filtered rows ('Projects' containing '{SEARCH_TERM}'): {initial_count}")
    print("\n--- Summary of Duplication Logic ---")
    
    # Print Unique Counts
    print("A. Overall Unique Counts:")
    for key, count in unique_counts.items():
        print(f"   - Unique {key}: {count}")

    print("\nB. Duplication Fan-Out (Unique Targets per Origin):")
    
    # Join summaries for a combined view
    # Fill NaN with 0 for markets that had no channel similarity
    combined_summary = dup_market_summary.join(dup_channel_summary).join(matched_channel_rules_count).fillna(0)
    
    # Rename the new column for clarity
    combined_summary = combined_summary.rename(columns={'Rules with Same Orig/Dup Channel': 'Identical Channel Rules Count'})
    
    print(combined_summary.to_markdown())

    print("\nD. Detailed Duplication Map (Orig Market -> Associated Dup Markets List):")
    print(dup_market_list_summary.to_markdown(stralign='left'))


    print("\nC. Channel Overlap Analysis (Global):")
    print(f"   - Channels that are BOTH an origin and a duplication target: {len(overlap_channels)}")
    print(f"   - Example Overlap Channels (Top 5): {sorted(list(overlap_channels))[:5]}")
    
except FileNotFoundError:
    print(f"❌ Error: File not found at '{FILE_PATH}'.")
except KeyError as e:
    print(f"❌ Error: Required column not found: {e}. Please verify the header row index.")
except Exception as e:
    print(f"❌ An unexpected error occurred: {e}")

Total filtered rows ('Projects' containing 'Formula 1'): 141

--- Summary of Duplication Logic ---
A. Overall Unique Counts:
   - Unique Orig Market: 10
   - Unique Orig Channel: 76
   - Unique Dup Market: 28
   - Unique Dup Channel: 84

B. Duplication Fan-Out (Unique Targets per Origin):
| Orig Market    |   Unique Dup Market Count |   Unique Dup Channel Count |   Identical Channel Rules Count |
|:---------------|--------------------------:|---------------------------:|--------------------------------:|
| ARGENTINA      |                        16 |                         10 |                               0 |
| GERMANY        |                         4 |                          3 |                              12 |
| PERU           |                         4 |                          9 |                              36 |
| COLOMBIA       |                         2 |                          7 |                              14 |
| PAN-BALTIC     |                         2 |    

In [18]:
import pandas as pd
import os

FILE_NAME = "Macro - BSA Market Duplicator 3.5-F1.xlsm"
SHEET_NAME = "Data Core"
FILE_PATH = os.path.join("data", FILE_NAME) 
HEADER_INDEX = 1 
SEARCH_TERM = "Formula 1"
KEY_COLS = ["Orig Market", "Orig Channel", "Dup Market", "Dup Channel"]

try:
    # 1. Load and Filter Data
    df_macro = pd.read_excel(
        FILE_PATH,
        sheet_name=SHEET_NAME,
        header=HEADER_INDEX 
    )

    df_macro.columns = [str(c).strip() for c in df_macro.columns]

    filtered_df = df_macro[
        df_macro['Projects'].astype(str).str.contains(SEARCH_TERM, case=False, na=False)
    ].copy()
    
    initial_count = len(filtered_df)

    # 2. Data Cleaning and Preparation for Analysis
    analysis_df = filtered_df[KEY_COLS].copy()
    
    # Ensure all key columns are clean strings (strip, upper case)
    for col in KEY_COLS:
        analysis_df[col] = analysis_df[col].astype(str).str.strip().str.upper()

    # 3. Calculate Overall Unique Counts
    unique_counts = analysis_df[KEY_COLS].nunique().to_dict()

    # 4. Calculate Duplication Relationships per Orig Market
    
    # Unique Dup Markets per Orig Market
    dup_market_summary = analysis_df.groupby('Orig Market')['Dup Market'].nunique().sort_values(ascending=False).to_frame(name='Unique Dup Market Count')

    # Unique Dup Channels per Orig Market
    dup_channel_summary = analysis_df.groupby('Orig Market')['Dup Channel'].nunique().to_frame(name='Unique Dup Channel Count')

    # 4a. Unique Orig Channels per Orig Market (NEW METRIC)
    orig_channel_summary = analysis_df.groupby('Orig Market')['Orig Channel'].nunique().to_frame(name='Unique Orig Channel Count')

    # 4b. Detailed Dup Market List
    dup_market_list_summary = analysis_df.groupby('Orig Market')['Dup Market'].apply(
        lambda x: sorted(x.unique())
    ).to_frame(name='Associated Dup Markets List')
    
    # --- DETAILED CHANNEL OVERLAP ANALYSIS ---
    channel_match_rules = analysis_df[analysis_df['Orig Channel'] == analysis_df['Dup Channel']]
    matched_channel_rules_count = channel_match_rules.groupby('Orig Market').size().to_frame(name='Identical Channel Rules Count').fillna(0)

    # 5. Calculate Global Channel Overlap
    orig_channels_set = set(analysis_df['Orig Channel'].unique())
    dup_channels_set = set(analysis_df['Dup Channel'].unique())
    overlap_channels = orig_channels_set.intersection(dup_channels_set)
    
    # 6. Print Summary Results
    print(f"Total filtered rows ('Projects' containing '{SEARCH_TERM}'): {initial_count}")
    print("\n--- Summary of Duplication Logic ---")
    
    # Print Unique Counts
    print("A. Overall Unique Counts:")
    for key, count in unique_counts.items():
        print(f"   - Unique {key}: {count}")

    print("\nB. Duplication Fan-Out (Unique Targets per Origin):")
    
    # Join all summaries for the combined view
    combined_summary = dup_market_summary \
        .join(orig_channel_summary) \
        .join(dup_channel_summary) \
        .join(matched_channel_rules_count).fillna(0)
    
    # Ensure the Identical Channel Rules Count is properly joined/handled before print
    # Final cleanup of the table to ensure all metrics are integers
    combined_summary = combined_summary.astype(int)
    
    print(combined_summary.to_markdown())

    print("\nD. Detailed Duplication Map (Orig Market -> Associated Dup Markets List):")
    print(dup_market_list_summary.to_markdown(stralign='left'))


    print("\nC. Channel Overlap Analysis (Global):")
    print(f"   - Channels that are BOTH an origin and a duplication target: {len(overlap_channels)}")
    print(f"   - Example Overlap Channels (Top 5): {sorted(list(overlap_channels))[:5]}")
    
except FileNotFoundError:
    print(f"❌ Error: File not found at '{FILE_PATH}'.")
except KeyError as e:
    print(f"❌ Error: Required column not found: {e}. Please verify the header row index.")
except Exception as e:
    print(f"❌ An unexpected error occurred: {e}")

Total filtered rows ('Projects' containing 'Formula 1'): 141

--- Summary of Duplication Logic ---
A. Overall Unique Counts:
   - Unique Orig Market: 10
   - Unique Orig Channel: 76
   - Unique Dup Market: 28
   - Unique Dup Channel: 84

B. Duplication Fan-Out (Unique Targets per Origin):
| Orig Market    |   Unique Dup Market Count |   Unique Orig Channel Count |   Unique Dup Channel Count |   Identical Channel Rules Count |
|:---------------|--------------------------:|----------------------------:|---------------------------:|--------------------------------:|
| ARGENTINA      |                        16 |                           1 |                         10 |                               0 |
| GERMANY        |                         4 |                           3 |                          3 |                              12 |
| PERU           |                         4 |                           9 |                          9 |                              36 |
| COLOMBIA

In [19]:
import pandas as pd
import os
import re # Needed for regex in normalization

FILE_NAME = "Macro - BSA Market Duplicator 3.5-F1.xlsm"
SHEET_NAME = "Data Core"
FILE_PATH = os.path.join("data", FILE_NAME) 
HEADER_INDEX = 1 
SEARCH_TERM = "Formula 1"
KEY_COLS = ["Orig Market", "Orig Channel", "Dup Market", "Dup Channel"]

# --- NEW HELPER FUNCTION FOR CHANNEL NAME NORMALIZATION ---
def normalize_channel_name(channel_series):
    """
    Removes country codes, parentheses, and extra spaces to compare channel names 
    by their core identity (e.g., 'ESPN 4').
    """
    # 1. Ensure string and convert to uppercase
    normalized = channel_series.astype(str).str.strip().str.upper()
    
    # 2. Remove anything inside parentheses (e.g., "(ARG)", "(BOL)")
    normalized = normalized.str.replace(r'\s*\([^)]*\)', '', regex=True)
    
    # 3. Remove known short country codes often following channel names
    normalized = normalized.str.replace(r'\s+ARG|\s+BOL|\s+CHL|\s+PER|\s+SWE|\s+DE', '', regex=True)
    
    # 4. Remove 'TV' from channel names to group things like 'TV3'
    normalized = normalized.str.replace(r'\s+TV', '', regex=True)
    
    # 5. Final cleanup of extra spaces
    normalized = normalized.str.strip()
    
    return normalized

try:
    # 1. Load and Filter Data
    df_macro = pd.read_excel(
        FILE_PATH,
        sheet_name=SHEET_NAME,
        header=HEADER_INDEX 
    )

    df_macro.columns = [str(c).strip() for c in df_macro.columns]

    filtered_df = df_macro[
        df_macro['Projects'].astype(str).str.contains(SEARCH_TERM, case=False, na=False)
    ].copy()
    
    initial_count = len(filtered_df)

    # 2. Data Cleaning and Preparation for Analysis
    analysis_df = filtered_df[KEY_COLS].copy()
    
    # Ensure all key columns are clean strings (strip, upper case)
    for col in KEY_COLS:
        analysis_df[col] = analysis_df[col].astype(str).str.strip().str.upper()
        
    # --- CRITICAL FIX: Add Normalized Channel Columns for Comparison ---
    analysis_df['Orig Channel Norm'] = normalize_channel_name(analysis_df['Orig Channel'])
    analysis_df['Dup Channel Norm'] = normalize_channel_name(analysis_df['Dup Channel'])


    # 3. Calculate Overall Unique Counts (using raw data)
    unique_counts = analysis_df[KEY_COLS].nunique().to_dict()

    # 4. Calculate Duplication Relationships per Orig Market
    
    # Unique Dup Markets per Orig Market
    dup_market_summary = analysis_df.groupby('Orig Market')['Dup Market'].nunique().sort_values(ascending=False).to_frame(name='Unique Dup Market Count')

    # Unique Dup Channels per Orig Market
    dup_channel_summary = analysis_df.groupby('Orig Market')['Dup Channel'].nunique().to_frame(name='Unique Dup Channel Count')

    # Unique Orig Channels per Orig Market
    orig_channel_summary = analysis_df.groupby('Orig Market')['Orig Channel'].nunique().to_frame(name='Unique Orig Channel Count')

    # Detailed Dup Market List
    dup_market_list_summary = analysis_df.groupby('Orig Market')['Dup Market'].apply(
        lambda x: sorted(x.unique())
    ).to_frame(name='Associated Dup Markets List')
    
    # --- DETAILED CHANNEL OVERLAP ANALYSIS (FIXED LOGIC) ---
    # Count how many rules have the same CORE channel name (Normalized)
    normalized_channel_match_rules = analysis_df[analysis_df['Orig Channel Norm'] == analysis_df['Dup Channel Norm']]
    
    # Aggregate the count of rules where the CORE channel name matched
    matched_channel_rules_count = normalized_channel_match_rules.groupby('Orig Market').size().to_frame(name='Similar Channel Rules Count (Normalized)').fillna(0)
    
    # 5. Calculate Global Channel Overlap
    orig_channels_set = set(analysis_df['Orig Channel'].unique())
    dup_channels_set = set(analysis_df['Dup Channel'].unique())
    overlap_channels = orig_channels_set.intersection(dup_channels_set)
    
    # 6. Print Summary Results
    print(f"Total filtered rows ('Projects' containing '{SEARCH_TERM}'): {initial_count}")
    print("\n--- Summary of Duplication Logic ---")
    
    # Print Unique Counts
    print("A. Overall Unique Counts:")
    for key, count in unique_counts.items():
        print(f"   - Unique {key}: {count}")

    print("\nB. Duplication Fan-Out (Unique Targets per Origin):")
    
    # Join all summaries for the combined view
    combined_summary = dup_market_summary \
        .join(orig_channel_summary) \
        .join(dup_channel_summary) \
        .join(matched_channel_rules_count).fillna(0).astype(int)
    
    print(combined_summary.to_markdown())

    print("\nD. Detailed Duplication Map (Orig Market -> Associated Dup Markets List):")
    print(dup_market_list_summary.to_markdown(stralign='left'))


    print("\nC. Channel Overlap Analysis (Global):")
    print(f"   - Channels that are BOTH an origin and a duplication target: {len(overlap_channels)}")
    print(f"   - Example Overlap Channels (Top 5): {sorted(list(overlap_channels))[:5]}")
    
except FileNotFoundError:
    print(f"❌ Error: File not found at '{FILE_PATH}'.")
except KeyError as e:
    print(f"❌ Error: Required column not found: {e}. Please verify the header row index.")
except Exception as e:
    print(f"❌ An unexpected error occurred: {e}")

Total filtered rows ('Projects' containing 'Formula 1'): 141

--- Summary of Duplication Logic ---
A. Overall Unique Counts:
   - Unique Orig Market: 10
   - Unique Orig Channel: 76
   - Unique Dup Market: 28
   - Unique Dup Channel: 84

B. Duplication Fan-Out (Unique Targets per Origin):
| Orig Market    |   Unique Dup Market Count |   Unique Orig Channel Count |   Unique Dup Channel Count |   Similar Channel Rules Count (Normalized) |
|:---------------|--------------------------:|----------------------------:|---------------------------:|-------------------------------------------:|
| ARGENTINA      |                        16 |                           1 |                         10 |                                         16 |
| GERMANY        |                         4 |                           3 |                          3 |                                         12 |
| PERU           |                         4 |                           9 |                          9 | 

In [22]:
import pandas as pd
df = pd.read_excel(
    "data/WF 3 F1-R12 - Great Britain.xlsx",
    sheet_name="Worksheet",  # Specify the tab name
    header=5 # 
)

# print(df.head())
print(df.dtypes)

Region                                     object
Market                                     object
Market ID                                 float64
Broadcaster                                object
TV-Channel                                 object
Channel ID                                float64
Pay/Free TV                                object
Date (UTC/GMT)                     datetime64[ns]
Date                               datetime64[ns]
Day                                        object
Start (UTC)                                object
End (UTC)                                  object
Start                                      object
End                                        object
Duration                                   object
Program Title                              object
Program Description                        object
Combined                                   object
Type of program                            object
Event                                      object


In [26]:
import pandas as pd
import os
import re
from typing import Dict, Any, List, Set

# --- FILE PATHS AND CONSTANTS ---
BSR_FILE_PATH = "data/WF 3 F1-R12 - Great Britain.xlsx"
BSR_SHEET_NAME = "Worksheet"
BSR_HEADER_ROW = 5 

MACRO_FILE_NAME = "Macro - BSA Market Duplicator 3.5-F1.xlsm"
MACRO_SHEET_NAME = "Data Core"
MACRO_FILE_PATH = os.path.join("data", MACRO_FILE_NAME) 
MACRO_HEADER_INDEX = 1 
SEARCH_TERM = "Formula 1"
KEY_COLS = ["Orig Market", "Dup Market", "Dup Channel", "Projects", "TV-Channel", "Market"] 
REQUIRED_RULE_COLS = ['Orig Market', 'Dup Market', 'Dup Channel'] # Columns needed for validation logic

# --- HELPER FUNCTION FOR CHANNEL NAME NORMALIZATION (MUST BE ACCESSIBLE) ---
def normalize_channel_name(channel_series):
    """Removes country codes and parentheses to get core channel identity."""
    normalized = channel_series.astype(str).str.strip().str.upper()
    normalized = normalized.str.replace(r'\s*\([^)]*\)', '', regex=True)
    normalized = normalized.str.replace(r'\s+ARG|\s+BOL|\s+CHL|\s+PER|\s+SWE|\s+DE', '', regex=True)
    normalized = normalized.str.replace(r'\s+TV', '', regex=True)
    return normalized.str.strip()

# =========================================================================
# === 1. DEFINITION OF THE VALIDATION FUNCTION (STANDALONE VERSION) =======
# =========================================================================

def validate_dup_channel_existence(df: pd.DataFrame, df_dup_rules: pd.DataFrame) -> Dict[str, Any]:
    """
    Checks if every 'Dup Channel' required by the Duplication Rules is actually present 
    in the list of 'TV-Channel's within the corresponding 'Dup Market' of the BSR.
    
    Args:
        df: The main BSR DataFrame.
        df_dup_rules: Filtered DataFrame containing the duplication rules.
    """
    initial_rows = len(df)
    FLAG_COLUMN = 'QC_Dup_Channel_Existence_Flag'
    
    # 1. Initialization and Checks
    df[FLAG_COLUMN] = 'OK'
    REQUIRED_BSR_COLS = ['Market', 'TV-Channel']

    if not all(col in df.columns for col in REQUIRED_BSR_COLS) or \
       not all(col in df_dup_rules.columns for col in REQUIRED_RULE_COLS):
        return {
            "check_key": "dup_channel_existence", "status": "Skipped",
            "action": "Duplication Channel Check",
            "description": "Skipped: Missing required columns in BSR or Rule set.",
            "details": {"rows_flagged": 0}
        }
    
    # 2. Data Preparation for Efficient Lookup
    
    # Standardize the BSR Market and Channel names (UPPER/Strip)
    # We work on a copy of the BSR's key columns to avoid modifying the original DF unnecessarily during preparation
    bsr_df_check = df[REQUIRED_BSR_COLS].copy()
    bsr_df_check['Market_Norm'] = bsr_df_check['Market'].astype(str).str.strip().str.upper()
    bsr_df_check['TV-Channel_Norm'] = bsr_df_check['TV-Channel'].astype(str).str.strip().str.upper()
    
    # Create a dictionary for quick lookup of existing channels in the BSR:
    # Key: Dup Market Name (Normalized) -> Value: Set of existing TV-Channels (Normalized)
    existing_channels_map = bsr_df_check.groupby('Market_Norm')['TV-Channel_Norm'].apply(set).to_dict()
    
    # 3. Iterate Through Duplication Rules and Validate
    
    missing_channels_log = []
    
    # Filter the rules and standardize keys for comparison
    rules_to_check = df_dup_rules[REQUIRED_RULE_COLS].drop_duplicates().copy()
    rules_to_check['Dup Market_Norm'] = rules_to_check['Dup Market'].astype(str).str.strip().str.upper()
    
    # The required channel names must be normalized before validation!
    rules_to_check['Required_Channel_Norm'] = normalize_channel_name(rules_to_check['Dup Channel']) 
    
    rows_flagged = 0
    
    for index, rule in rules_to_check.iterrows():
        dup_market = rule['Dup Market_Norm']
        required_channel_norm = rule['Required_Channel_Norm']
        orig_market = rule['Orig Market']
        
        # Look up the set of existing channels in the target Dup Market
        existing_channels = existing_channels_map.get(dup_market, set())
        
        # 4. Validation Check: Does the required normalized channel exist in the target market's set?
        if required_channel_norm not in existing_channels:
            
            # Log the specific failure reason
            missing_channels_log.append({
                "Orig_Market": orig_market,
                "Dup_Market": rule['Dup Market'], 
                "Missing_Channel": rule['Dup Channel'], # Use raw channel name for reporting clarity
                "Normalized_Match_Key": required_channel_norm
            })
            
            # 5. Apply Flag to the BSR
            # Flag ALL rows in the target Dup Market (since the issue is market-wide completeness)
            flag_mask = (df['Market'].astype(str).str.strip().str.upper() == dup_market)
            
            flag_message = f"Completeness Error: Missing required Dup Channel '{rule['Dup Channel']}' (Source: {orig_market})."
            
            # Only flag rows that were not already flagged for a similar or harder error
            current_flags = df.loc[flag_mask, FLAG_COLUMN]
            rows_to_flag = flag_mask & (current_flags == 'OK')
            
            df.loc[rows_to_flag, FLAG_COLUMN] = flag_message
            rows_flagged += rows_to_flag.sum()


    final_status = "Completed" if rows_flagged == 0 else "Flagged"

    return {
        "check_key": "dup_channel_existence",
        "status": final_status,
        "action": "Duplication Channel Check",
        "description": f"Checked {len(rules_to_check)} unique Dup Market/Channel requirements. Flagged {rows_flagged} rows in markets missing required channels.",
        "details": {
            "rows_flagged": int(rows_flagged),
            "missing_requirements_count": len(missing_channels_log),
            "missing_channels_list": missing_channels_log
        }
    }
    
# =========================================================================
# === 2. EXECUTION IN JUPYTER NOTEBOOK (USING GLOBAL SCOPE) ===============
# =========================================================================

print("--- Starting Duplication Channel Existence Check ---")

try:
    # 1. LOAD BSR DATA
    df_bsr = pd.read_excel(BSR_FILE_PATH, sheet_name=BSR_SHEET_NAME, header=BSR_HEADER_ROW)
    df_bsr.columns = [str(c).strip() for c in df_bsr.columns]

    # 2. LOAD AND FILTER MACRO DATA (Your Duplication Rules)
    df_macro = pd.read_excel(MACRO_FILE_PATH, sheet_name=MACRO_SHEET_NAME, header=MACRO_HEADER_INDEX)
    df_macro.columns = [str(c).strip() for c in df_macro.columns]

    filtered_df = df_macro[
        df_macro['Projects'].astype(str).str.contains(SEARCH_TERM, case=False, na=False)
    ].copy()
    
    # 3. Prepare Rule DataFrame (analysis_df / DF_DUP_RULES)
    analysis_df = filtered_df[REQUIRED_RULE_COLS].copy()
    
    # Normalize Market names in the Rule sheet (REQUIRED for checking)
    analysis_df['Orig Market'] = analysis_df['Orig Market'].astype(str).str.strip().str.upper()
    analysis_df['Dup Market'] = analysis_df['Dup Market'].astype(str).str.strip().str.upper()

    # 4. RUN VALIDATION
    # The function modifies df_bsr in place.
    validation_result = validate_dup_channel_existence(df_bsr, analysis_df)
    
    print("\n✅ CHECK RESULT:")
    print(validation_result)
    
    if validation_result.get('rows_flagged', 0) > 0:
        print("\n--- Sample of Flagged BSR Rows ---")
        print(df_bsr[df_bsr[validation_result['details'].get('flag_column', 'QC_Dup_Channel_Existence_Flag')] != 'OK'][[
            'Market', 'TV-Channel', 'Broadcaster', validation_result['details'].get('flag_column', 'QC_Dup_Channel_Existence_Flag')
        ]].head(10).to_markdown())

except FileNotFoundError as e:
    print(f"❌ File Error: Ensure both BSR ({BSR_FILE_PATH}) and Macro ({MACRO_FILE_PATH}) files exist. Error: {e}")
except KeyError as e:
    print(f"❌ Column Error: Missing expected column in BSR or Macro file: {e}")
except Exception as e:
    print(f"❌ An unexpected error occurred during execution: {e}")

--- Starting Duplication Channel Existence Check ---

✅ CHECK RESULT:
{'check_key': 'dup_channel_existence', 'status': 'Flagged', 'action': 'Duplication Channel Check', 'description': 'Checked 136 unique Dup Market/Channel requirements. Flagged 940 rows in markets missing required channels.', 'details': {'rows_flagged': 940, 'missing_requirements_count': 132, 'missing_channels_list': [{'Orig_Market': 'SOUTH AFRICA', 'Dup_Market': 'PAN-AFRICA', 'Missing_Channel': 'ESPN Africa (AFR)', 'Normalized_Match_Key': 'ESPN AFRICA'}, {'Orig_Market': 'SOUTH AFRICA', 'Dup_Market': 'PAN-AFRICA', 'Missing_Channel': 'ESPN 2 Africa (AFR)', 'Normalized_Match_Key': 'ESPN 2 AFRICA'}, {'Orig_Market': 'PERU', 'Dup_Market': 'BOLIVIA', 'Missing_Channel': 'ESPN (PER)', 'Normalized_Match_Key': 'ESPN'}, {'Orig_Market': 'PERU', 'Dup_Market': 'BOLIVIA', 'Missing_Channel': 'ESPN 2 (PER)', 'Normalized_Match_Key': 'ESPN 2'}, {'Orig_Market': 'PERU', 'Dup_Market': 'BOLIVIA', 'Missing_Channel': 'ESPN 3 (PER)', 'Normalize

In [27]:
import pandas as pd
import os
import re
from typing import Dict, Any, List, Set

# --- FILE PATHS AND CONSTANTS ---
# NOTE: These paths must be correct relative to where you run this script.
BSR_FILE_PATH = "data/WF 3 F1-R12 - Great Britain.xlsx"
BSR_SHEET_NAME = "Worksheet"
BSR_HEADER_ROW = 5 

MACRO_FILE_NAME = "Macro - BSA Market Duplicator 3.5-F1.xlsm"
MACRO_SHEET_NAME = "Data Core"
MACRO_FILE_PATH = os.path.join("data", MACRO_FILE_NAME) 
MACRO_HEADER_INDEX = 1 
SEARCH_TERM = "Formula 1"
REQUIRED_RULE_COLS = ['Orig Market', 'Dup Market', 'Dup Channel'] 

OUTPUT_FILE_NAME = "QC_Processed_BSR_Channel_Existence.xlsx"

# --- HELPER FUNCTION FOR CHANNEL NAME NORMALIZATION (Must be accessible) ---
def normalize_channel_name(channel_series):
    """Removes country codes and parentheses to get core channel identity."""
    normalized = channel_series.astype(str).str.strip().str.upper()
    normalized = normalized.str.replace(r'\s*\([^)]*\)', '', regex=True)
    normalized = normalized.str.replace(r'\s+ARG|\s+BOL|\s+CHL|\s+PER|\s+SWE|\s+DE', '', regex=True)
    normalized = normalized.str.replace(r'\s+TV', '', regex=True)
    return normalized.str.strip()

# =========================================================================
# === 1. VALIDATION FUNCTION (Simulated BSRValidator Method) ==============
# =========================================================================

def validate_dup_channel_existence(df: pd.DataFrame, df_dup_rules: pd.DataFrame) -> Dict[str, Any]:
    """
    Checks if every 'Dup Channel' required by the Duplication Rules is actually present 
    in the list of 'TV-Channel's within the corresponding 'Dup Market' of the BSR.
    """
    initial_rows = len(df)
    FLAG_COLUMN = 'QC_Dup_Channel_Existence_Flag'
    
    # 1. Initialization and Checks
    df[FLAG_COLUMN] = 'OK'
    REQUIRED_BSR_COLS = ['Market', 'TV-Channel']
    
    # Pre-check is crucial here since the main script handles the loading
    if not all(col in df.columns for col in REQUIRED_BSR_COLS):
        print(f"FATAL: BSR is missing columns: {list(set(REQUIRED_BSR_COLS) - set(df.columns))}")
        return {"check_key": "dup_channel_existence", "status": "Skipped", "action": "Duplication Channel Check", "details": {"rows_flagged": 0}}
    
    # 2. Data Preparation for Efficient Lookup
    
    # Standardize the BSR Market and Channel names (UPPER/Strip)
    bsr_df_check = df.copy()
    bsr_df_check['Market_Norm'] = bsr_df_check['Market'].astype(str).str.strip().str.upper()
    
    # Apply normalization to BSR channels only for the lookup map values
    bsr_df_check['TV-Channel_Norm'] = normalize_channel_name(bsr_df_check['TV-Channel'])
    
    # Create a dictionary for quick lookup of existing channels in the BSR:
    # Key: Market Name (Normalized) -> Value: Set of existing TV-Channels (Normalized)
    existing_channels_map = bsr_df_check.groupby('Market_Norm')['TV-Channel_Norm'].apply(set).to_dict()
    
    # 3. Iterate Through Duplication Rules and Validate
    
    missing_channels_log = []
    
    # Filter and prepare rules data
    rules_to_check = df_dup_rules[REQUIRED_RULE_COLS].drop_duplicates().copy()
    rules_to_check['Dup Market_Norm'] = rules_to_check['Dup Market'].astype(str).str.strip().str.upper()
    
    # Apply normalization to the REQUIRED channel name from the rule sheet
    rules_to_check['Required_Channel_Norm'] = normalize_channel_name(rules_to_check['Dup Channel'])
    
    rows_flagged = 0
    
    for index, rule in rules_to_check.iterrows():
        dup_market = rule['Dup Market_Norm']
        required_channel_norm = rule['Required_Channel_Norm']
        orig_market = rule['Orig Market']
        
        # Look up the set of existing channels in the target Dup Market
        existing_channels = existing_channels_map.get(dup_market, set())
        
        # 4. Validation Check: Does the required normalized channel exist in the target market's set?
        if required_channel_norm not in existing_channels:
            
            # Log the specific failure reason
            missing_channels_log.append({
                "Orig_Market": orig_market,
                "Dup_Market": rule['Dup Market'], 
                "Missing_Channel": rule['Dup Channel'], # Use raw channel name for report clarity
                "Normalized_Match_Key": required_channel_norm
            })
            
            # 5. Apply Flag to the BSR (Flag all rows in the target Dup Market)
            flag_mask = (df['Market'].astype(str).str.strip().str.upper() == dup_market)
            
            flag_message = f"Completeness Error: Missing required Dup Channel '{rule['Dup Channel']}' (Source: {orig_market})."
            
            current_flags = df.loc[flag_mask, FLAG_COLUMN]
            rows_to_flag = flag_mask & (current_flags == 'OK')
            
            df.loc[rows_to_flag, FLAG_COLUMN] = flag_message
            rows_flagged += rows_to_flag.sum()


    final_status = "Completed" if rows_flagged == 0 else "Flagged"

    return {
        "check_key": "dup_channel_existence",
        "status": final_status,
        "action": "Duplication Channel Check",
        "description": f"Checked {len(rules_to_check)} unique Dup Market/Channel requirements. Flagged {rows_flagged} rows in markets missing required channels.",
        "details": {
            "rows_flagged": int(rows_flagged),
            "missing_requirements_count": len(missing_channels_log),
            "missing_channels_list": missing_channels_log
        }
    }
    
# =========================================================================
# === 2. EXECUTION AND SAVE (LOCAL FILESYSTEM) ============================
# =========================================================================

try:
    print("--- Starting Local File Processing ---")
    
    # 1. LOAD BSR DATA (Main Data)
    df_bsr = pd.read_excel(BSR_FILE_PATH, sheet_name=BSR_SHEET_NAME, header=BSR_HEADER_ROW)
    df_bsr.columns = [str(c).strip() for c in df_bsr.columns]

    # 2. LOAD AND FILTER MACRO DATA (Duplication Rules)
    df_macro = pd.read_excel(MACRO_FILE_PATH, sheet_name=MACRO_SHEET_NAME, header=MACRO_HEADER_INDEX)
    df_macro.columns = [str(c).strip() for c in df_macro.columns]

    filtered_df = df_macro[
        df_macro['Projects'].astype(str).str.contains(SEARCH_TERM, case=False, na=False)
    ].copy()
    
    # Final Duplication Rules DataFrame
    df_dup_rules = filtered_df[REQUIRED_RULE_COLS].copy()
    
    # 3. RUN VALIDATION (Function modifies df_bsr in place)
    validation_result = validate_dup_channel_existence(df_bsr, df_dup_rules)
    
    print("\n✅ VALIDATION COMPLETE:")
    print(validation_result.get('description'))
    
    # 4. SAVE PROCESSED FILE
    output_path = os.path.join(os.path.dirname(BSR_FILE_PATH), OUTPUT_FILE_NAME)
    df_bsr.to_excel(output_path, index=False, sheet_name=BSR_SHEET_NAME)

    print("-" * 50)
    print(f"✅ Success! Processed file saved locally to: {output_path}")
    print(f"Total rows flagged: {validation_result.get('details', {}).get('rows_flagged', 0)}")
    print("-" * 50)


except FileNotFoundError as e:
    print(f"❌ File Error: Ensure BSR ({BSR_FILE_PATH}) and Macro ({MACRO_FILE_PATH}) files exist. Error: {e}")
except KeyError as e:
    print(f"❌ Column Error: Missing expected column in BSR or Macro file: {e}")
except Exception as e:
    print(f"❌ An unexpected error occurred during execution: {e}")

--- Starting Local File Processing ---

✅ VALIDATION COMPLETE:
Checked 136 unique Dup Market/Channel requirements. Flagged 509 rows in markets missing required channels.
--------------------------------------------------
✅ Success! Processed file saved locally to: data\QC_Processed_BSR_Channel_Existence.xlsx
Total rows flagged: 509
--------------------------------------------------


In [31]:
import pandas as pd
import os
import re
from typing import Dict, Any, List, Set

# --- FILE PATHS AND CONSTANTS ---
# NOTE: These paths must be correct relative to where you run this script.
BSR_FILE_PATH = "data/WF 3 F1-R12 - Great Britain.xlsx"
BSR_SHEET_NAME = "Worksheet"
BSR_HEADER_ROW = 5 

MACRO_FILE_NAME = "Macro - BSA Market Duplicator 3.5-F1.xlsm"
MACRO_SHEET_NAME = "Data Core"
MACRO_FILE_PATH = os.path.join("data", MACRO_FILE_NAME) 
MACRO_HEADER_INDEX = 1 
SEARCH_TERM = "Formula 1"
REQUIRED_RULE_COLS = ['Orig Market', 'Dup Market', 'Dup Channel'] 

OUTPUT_FILE_NAME = "QC_Processed_BSR_Channel_Existence.xlsx"

# --- HELPER FUNCTION FOR CHANNEL NAME NORMALIZATION ---
def normalize_channel_name(channel_series):
    """Removes country codes and parentheses to get core channel identity."""
    normalized = channel_series.astype(str).str.strip().str.upper()
    normalized = normalized.str.replace(r'\s*\([^)]*\)', '', regex=True)
    normalized = normalized.str.replace(r'\s+ARG|\s+BOL|\s+CHL|\s+PER|\s+SWE|\s+DE', '', regex=True)
    normalized = normalized.str.replace(r'\s+TV', '', regex=True)
    return normalized.str.strip()

# =========================================================================
# === 1. VALIDATION FUNCTION (STANDALONE VERSION) =========================
# =========================================================================

def validate_dup_channel_existence(df: pd.DataFrame, df_dup_rules: pd.DataFrame) -> Dict[str, Any]:
    """
    Checks if every 'Dup Channel' required by the Duplication Rules is actually present 
    in the list of 'TV-Channel's within the corresponding 'Dup Market' of the BSR.
    Reports the full list of missing channels for each market pair.
    """
    initial_rows = len(df)
    FLAG_COLUMN = 'QC_Dup_Channel_Existence_Flag'
    
    # 1. Initialization and Checks
    df[FLAG_COLUMN] = 'OK'
    REQUIRED_BSR_COLS = ['Market', 'TV-Channel']

    if not all(col in df.columns for col in REQUIRED_BSR_COLS) or \
       not all(col in df_dup_rules.columns for col in REQUIRED_RULE_COLS):
        return {
            "check_key": "dup_channel_existence", "status": "Skipped",
            "action": "Duplication Channel Check",
            "description": "Skipped: Missing required columns in BSR or Rule set.",
            "details": {"rows_flagged": 0}
        }
    
    # 2. Data Preparation for Efficient Lookup
    
    bsr_df_check = df[['Market', 'TV-Channel']].copy()
    bsr_df_check['Market_Norm'] = bsr_df_check['Market'].astype(str).str.strip().str.upper()
    bsr_df_check['TV-Channel_Norm'] = normalize_channel_name(bsr_df_check['TV-Channel'])
    
    existing_channels_map = bsr_df_check.groupby('Market_Norm')['TV-Channel_Norm'].apply(set).to_dict()
    
    # 3. Iterate Through Duplication Rules and Validate
    
    missing_channels_log = []
    
    # Use the variable name 'rules_to_check_df' to avoid confusion if needed elsewhere
    rules_to_check_df = df_dup_rules[REQUIRED_RULE_COLS].drop_duplicates().copy()
    rules_to_check_df['Dup Market_Norm'] = rules_to_check_df['Dup Market'].astype(str).str.strip().str.upper()
    rules_to_check_df['Required_Channel_Norm'] = normalize_channel_name(rules_to_check_df['Dup Channel'])
    
    rows_flagged = 0
    
    for index, rule in rules_to_check_df.iterrows():
        dup_market = rule['Dup Market_Norm']
        required_channel_norm = rule['Required_Channel_Norm']
        orig_market = rule['Orig Market']
        
        existing_channels = existing_channels_map.get(dup_market, set())
        
        # 4. Validation Check: Does the required normalized channel exist in the target market's set?
        if required_channel_norm not in existing_channels:
            
            # --- Collect All Missing Channels for this Market Pair (The User's Request) ---
            # Re-filter the original rules DF to get ALL required channels for this pair
            all_required_for_pair = df_dup_rules[
                (df_dup_rules['Orig Market'] == orig_market) & 
                (df_dup_rules['Dup Market'] == rule['Dup Market'])
            ]['Dup Channel'].unique()
            
            # Now, check the missing status for this ENTIRE group
            current_missing_channels = [
                ch for ch in all_required_for_pair 
                if normalize_channel_name(pd.Series([ch])).iloc[0] not in existing_channels
            ]
            
            # Log the specific failure reason
            missing_channels_log.append({
                "Orig_Market": orig_market,
                "Dup_Market": rule['Dup Market'], 
                "Missing_Channels_Count": len(current_missing_channels),
                "Missing_Channels_List": sorted(current_missing_channels)
            })
            
            # 5. Apply Flag to the BSR
            flag_mask = (df['Market'].astype(str).str.strip().str.upper() == dup_market)
            
            flag_message = f"Completeness Error: {len(current_missing_channels)} Dup Channel(s) missing (Source: {orig_market})."
            
            current_flags = df.loc[flag_mask, FLAG_COLUMN]
            rows_to_flag = flag_mask & (current_flags == 'OK')
            
            df.loc[rows_to_flag, FLAG_COLUMN] = flag_message
            rows_flagged += rows_to_flag.sum()


    final_status = "Completed" if rows_flagged == 0 else "Flagged"

    return {
        "check_key": "dup_channel_existence",
        "status": final_status,
        "action": "Duplication Channel Check",
        "description": f"Checked Duplication rules. Flagged {rows_flagged} rows in markets missing required channels.",
        "details": {
            "rows_flagged": int(rows_flagged),
            "missing_market_pairs_count": len(missing_channels_log),
            "missing_market_details": missing_channels_log
        }
    }

# =========================================================================
# === 2. EXECUTION AND SAVE (LOCAL FILESYSTEM) ============================
# =========================================================================

try:
    print("--- Starting Local File Processing ---")
    
    # 1. LOAD BSR DATA (Main Data)
    df_bsr = pd.read_excel(BSR_FILE_PATH, sheet_name=BSR_SHEET_NAME, header=BSR_HEADER_ROW)
    df_bsr.columns = [str(c).strip() for c in df_bsr.columns]

    # 2. LOAD AND FILTER MACRO DATA (Duplication Rules)
    df_macro = pd.read_excel(MACRO_FILE_PATH, sheet_name=MACRO_SHEET_NAME, header=MACRO_HEADER_INDEX)
    df_macro.columns = [str(c).strip() for c in df_macro.columns]

    filtered_df = df_macro[
        df_macro['Projects'].astype(str).str.contains(SEARCH_TERM, case=False, na=False)
    ].copy()
    
    # Final Duplication Rules DataFrame
    df_dup_rules = filtered_df[REQUIRED_RULE_COLS].copy()
    
    # 3. RUN VALIDATION (Function modifies df_bsr in place)
    # The fix ensures this function runs without throwing the NameError
    validation_result = validate_dup_channel_existence(df_bsr, df_dup_rules)
    
    print("\n✅ VALIDATION COMPLETE:")
    print(f"Total Rows Flagged: {validation_result['details'].get('rows_flagged', 0)}")
    print(f"Total Unique Market Pairs Missing Channels: {validation_result['details'].get('missing_market_pairs_count', 0)}")
    
    # 4. SAVE PROCESSED FILE
    output_path = os.path.join(os.path.dirname(BSR_FILE_PATH), OUTPUT_FILE_NAME)
    df_bsr.to_excel(output_path, index=False, sheet_name=BSR_SHEET_NAME)

    print("-" * 50)
    print(f"✅ Success! Processed file saved locally to: {output_path}")
    print("-" * 50)


except FileNotFoundError as e:
    print(f"❌ File Error: Ensure both BSR ({BSR_FILE_PATH}) and Macro ({MACRO_FILE_PATH}) files exist. Error: {e}")
except KeyError as e:
    print(f"❌ Column Error: Missing expected column in BSR or Macro file: {e}")
except Exception as e:
    print(f"❌ An unexpected error occurred during execution: {e}. Check if required columns are present.")

--- Starting Local File Processing ---

✅ VALIDATION COMPLETE:
Total Rows Flagged: 509
Total Unique Market Pairs Missing Channels: 100
--------------------------------------------------
✅ Success! Processed file saved locally to: data\QC_Processed_BSR_Channel_Existence.xlsx
--------------------------------------------------


In [35]:
import pandas as pd
import os
import re
from typing import Dict, Any, List, Set

# --- FILE PATHS AND CONSTANTS ---
BSR_FILE_PATH = "data/WF 3 F1-R12 - Great Britain.xlsx"
BSR_SHEET_NAME = "Worksheet"
BSR_HEADER_ROW = 5 

MACRO_FILE_NAME = "Macro - BSA Market Duplicator 3.5-F1.xlsm"
MACRO_SHEET_NAME = "Data Core"
MACRO_FILE_PATH = os.path.join("data", MACRO_FILE_NAME) 
MACRO_HEADER_INDEX = 1 
SEARCH_TERM = "Formula 1"
REQUIRED_RULE_COLS = ['Orig Market', 'Dup Market', 'Dup Channel'] 

# Output file path definition
OUTPUT_DIR = os.path.dirname(BSR_FILE_PATH)
OUTPUT_FILE_NAME = "QC_Processed_BSR_Channel_Existence.xlsx"
OUTPUT_FULL_PATH = os.path.join(OUTPUT_DIR, OUTPUT_FILE_NAME)

# --- HELPER FUNCTION FOR CHANNEL NAME NORMALIZATION ---
def normalize_channel_name(channel_series):
    """Removes country codes and parentheses to get core channel identity."""
    normalized = channel_series.astype(str).str.strip().str.upper()
    normalized = normalized.str.replace(r'\s*\([^)]*\)', '', regex=True)
    normalized = normalized.str.replace(r'\s+ARG|\s+BOL|\s+CHL|\s+PER|\s+SWE|\s+DE', '', regex=True)
    normalized = normalized.str.replace(r'\s+TV', '', regex=True)
    return normalized.str.strip()

# =========================================================================
# === 1. VALIDATION FUNCTION (Simulated BSRValidator Method) ==============
# =========================================================================

def validate_dup_channel_existence(df: pd.DataFrame, df_dup_rules: pd.DataFrame) -> Dict[str, Any]:
    """
    Checks if every 'Dup Channel' required by the Duplication Rules is actually present 
    in the list of 'TV-Channel's within the corresponding 'Dup Market' of the BSR.
    Reports the full context of existing and missing channels.
    """
    initial_rows = len(df)
    FLAG_COLUMN = 'QC_Dup_Channel_Existence_Flag'
    
    # 1. Initialization and Checks
    df[FLAG_COLUMN] = 'OK'
    REQUIRED_BSR_COLS = ['Market', 'TV-Channel']

    if not all(col in df.columns for col in REQUIRED_BSR_COLS) or \
       not all(col in df_dup_rules.columns for col in REQUIRED_RULE_COLS):
        return {
            "check_key": "dup_channel_existence", "status": "Skipped",
            "action": "Duplication Channel Check",
            "description": "Skipped: Missing required columns in BSR or Rule set.",
            "details": {"rows_flagged": 0}
        }
    
    # 2. Data Preparation for Efficient Lookup
    
    bsr_df_check = df[['Market', 'TV-Channel']].copy()
    bsr_df_check['Market_Norm'] = bsr_df_check['Market'].astype(str).str.strip().str.upper()
    bsr_df_check['TV-Channel_Norm'] = normalize_channel_name(bsr_df_check['TV-Channel'])
    
    existing_channels_map = bsr_df_check.groupby('Market_Norm')['TV-Channel_Norm'].apply(set).to_dict()
    orig_channel_count_map = bsr_df_check.groupby('Market_Norm')['TV-Channel_Norm'].nunique().to_dict()

    # 3. Aggregate Rules and Prepare for Validation
    missing_channels_log = []
    rules_grouped = df_dup_rules.groupby(['Orig Market', 'Dup Market'])
    df_dup_rules['Required_Channel_Norm'] = normalize_channel_name(df_dup_rules['Dup Channel'])
    
    rows_flagged = 0
    
    # Iterate over each unique (Source Market, Target Market) pair
    for (orig_market_raw, dup_market_raw), group in rules_grouped:
        
        orig_market = orig_market_raw.upper().strip()
        dup_market = dup_market_raw.upper().strip()
        
        required_channels_set = set(group['Required_Channel_Norm'].unique())
        existing_channels = existing_channels_map.get(dup_market, set())
        
        missing_channels = required_channels_set.difference(existing_channels)
        
        # 4. Validation Check and Logging
        if missing_channels:
            
            # Log the specific failure reason
            missing_channels_log.append({
                "Orig_Market": orig_market_raw,
                "Dup_Market": dup_market_raw, 
                "Orig_Channel_Count": orig_channel_count_map.get(orig_market, 0),
                "Dup_Channel_Exist_Count": len(existing_channels),
                "Missing_Channels_Count": len(missing_channels),
                "Missing_Channels_List": sorted(list(missing_channels))
            })
            
            # 5. Apply Flag to the BSR
            flag_mask = (df['Market'].astype(str).str.strip().str.upper() == dup_market)
            flag_message = f"Completeness Error: {len(missing_channels)} Dup Channel(s) missing (Source: {orig_market_raw})."
            current_flags = df.loc[flag_mask, FLAG_COLUMN]
            rows_to_flag = flag_mask & (current_flags == 'OK')
            
            df.loc[rows_to_flag, FLAG_COLUMN] = flag_message
            rows_flagged += rows_to_flag.sum()


    final_status = "Completed" if rows_flagged == 0 else "Flagged"

    return {
        "check_key": "dup_channel_existence",
        "status": final_status,
        "action": "Duplication Channel Check (List of Missing)",
        "description": f"Checked Duplication rules. Flagged {rows_flagged} rows in markets missing required channels.",
        "details": {
            "rows_flagged": int(rows_flagged),
            "missing_market_pairs_count": len(missing_channels_log),
            "missing_market_details": missing_channels_log
        }
    }
    
# =========================================================================
# === 2. EXECUTION AND SAVE (LOCAL FILESYSTEM) ============================
# =========================================================================

try:
    print("--- Starting Local File Processing ---")
    
    # 1. LOAD BSR DATA (Main Data)
    df_bsr = pd.read_excel(BSR_FILE_PATH, sheet_name=BSR_SHEET_NAME, header=BSR_HEADER_ROW)
    df_bsr.columns = [str(c).strip() for c in df_bsr.columns]

    # 2. LOAD AND FILTER MACRO DATA (Duplication Rules)
    df_macro = pd.read_excel(MACRO_FILE_PATH, sheet_name=MACRO_SHEET_NAME, header=MACRO_HEADER_INDEX)
    df_macro.columns = [str(c).strip() for c in df_macro.columns]

    filtered_df = df_macro[
        df_macro['Projects'].astype(str).str.contains(SEARCH_TERM, case=False, na=False)
    ].copy()
    
    # Final Duplication Rules DataFrame
    df_dup_rules = filtered_df[['Orig Market', 'Dup Market', 'Dup Channel']].copy()
    
    # 3. RUN VALIDATION (Function modifies df_bsr in place)
    validation_result = validate_dup_channel_existence(df_bsr, df_dup_rules)
    
    print("\n✅ VALIDATION COMPLETE:")
    print(f"Total Rows Flagged: {validation_result['details'].get('rows_flagged', 0)}")
    print(f"Total Unique Market Pairs Missing Channels: {validation_result['details'].get('missing_market_pairs_count', 0)}")
    
    # 4. SAVE PROCESSED FILE
    # Ensure the output directory exists
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    df_bsr.to_excel(OUTPUT_FULL_PATH, index=False, sheet_name=BSR_SHEET_NAME)

    print("-" * 50)
    print(f"✅ Success! Processed file saved locally to: {OUTPUT_FULL_PATH}")
    print("-" * 50)


except FileNotFoundError as e:
    print(f"❌ File Error: Ensure both BSR ({BSR_FILE_PATH}) and Macro ({MACRO_FILE_PATH}) files exist. Error: {e}")
except KeyError as e:
    print(f"❌ Column Error: Missing expected column in BSR or Macro file: {e}")
except Exception as e:
    print(f"❌ An unexpected error occurred during execution: {e}")

--- Starting Local File Processing ---

✅ VALIDATION COMPLETE:
Total Rows Flagged: 509
Total Unique Market Pairs Missing Channels: 28
--------------------------------------------------
✅ Success! Processed file saved locally to: data\QC_Processed_BSR_Channel_Existence.xlsx
--------------------------------------------------


In [48]:
import pandas as pd
import os
import re
from typing import Dict, Any, List, Set

# --- FILE PATHS AND CONSTANTS ---
# NOTE: These paths must be correct relative to where you run this script.
BSR_FILE_PATH = "data/WF 3 F1-R12 - Great Britain.xlsx"
BSR_SHEET_NAME = "Worksheet"
BSR_HEADER_ROW = 5 

MACRO_FILE_NAME = "Macro - BSA Market Duplicator 3.5-F1.xlsm"
MACRO_SHEET_NAME = "Data Core"
MACRO_FILE_PATH = os.path.join("data", MACRO_FILE_NAME) 
MACRO_HEADER_INDEX = 1 
SEARCH_TERM = "Formula 1"
REQUIRED_RULE_COLS = ['Orig Market', 'Dup Market', 'Dup Channel'] 

OUTPUT_DIR = os.path.dirname(BSR_FILE_PATH)
OUTPUT_FILE_NAME = "QC_Processed_BSR_Channel_Existence.xlsx"
OUTPUT_FULL_PATH = os.path.join(OUTPUT_DIR, OUTPUT_FILE_NAME)


# --- HELPER FUNCTION FOR CHANNEL NAME NORMALIZATION ---
def normalize_channel_name(channel_series):
    """Removes country codes and parentheses to get core channel identity."""
    normalized = channel_series.astype(str).str.strip().str.upper()
    normalized = normalized.str.replace(r'\s*\([^)]*\)', '', regex=True)
    normalized = normalized.str.replace(r'\s+ARG|\s+BOL|\s+CHL|\s+PER|\s+SWE|\s+DE|\s+AFR|\s+PCA|\s+COL|\s+ECU|\s+URY|\s+MEX|\s+JPN|\s+LTU|\s+CHE|\s+FRA', '', regex=True)
    normalized = normalized.str.replace(r'\s+TV', '', regex=True)
    return normalized.str.strip()

# =========================================================================
# === 1. VALIDATION FUNCTION (Simulated BSRValidator Method) ==============
# =========================================================================

def validate_dup_channel_existence(df: pd.DataFrame, df_dup_rules: pd.DataFrame) -> Dict[str, Any]:
    """
    Checks if every 'Dup Channel' required by the Duplication Rules is actually present 
    in the list of 'TV-Channel's within the corresponding 'Dup Market' of the BSR.
    Reports the full list of missing channels for each market pair.
    """
    initial_rows = len(df)
    FLAG_COLUMN = 'QC_Dup_Channel_Existence_Flag'
    
    # 1. Initialization and Checks
    df[FLAG_COLUMN] = 'OK'
    REQUIRED_BSR_COLS = ['Market', 'TV-Channel']

    if not all(col in df.columns for col in REQUIRED_BSR_COLS) or \
       not all(col in df_dup_rules.columns for col in REQUIRED_RULE_COLS):
        return {
            "check_key": "dup_channel_existence", "status": "Skipped",
            "action": "Duplication Channel Check",
            "description": "Skipped: Missing required columns in BSR or Rule set.",
            "details": {"rows_flagged": 0}
        }
    
    # 2. Data Preparation for Efficient Lookup
    
    bsr_df_check = df[['Market', 'TV-Channel']].copy()
    bsr_df_check['Market_Norm'] = bsr_df_check['Market'].astype(str).str.strip().str.upper()
    
    # Apply normalization to BSR channels for the lookup map values
    bsr_df_check['TV-Channel_Norm'] = normalize_channel_name(bsr_df_check['TV-Channel'])
    
    # Create a dictionary for quick lookup of existing channels in the BSR:
    existing_channels_map = bsr_df_check.groupby('Market_Norm')['TV-Channel_Norm'].apply(set).to_dict()
    orig_channel_count_map = bsr_df_check.groupby('Market_Norm')['TV-Channel_Norm'].nunique().to_dict()

    # 3. Aggregate Rules and Prepare for Validation
    
    missing_channels_log = []
    
    # Use the variable name 'rules_to_check_df' to avoid confusion if needed elsewhere
    rules_to_check_df = df_dup_rules[REQUIRED_RULE_COLS].copy()
    rules_grouped = rules_to_check_df.groupby(['Orig Market', 'Dup Market'])
    rules_to_check_df['Required_Channel_Norm'] = normalize_channel_name(rules_to_check_df['Dup Channel'])
    
    rows_flagged = 0
    
    # Iterate over each unique (Source Market, Target Market) pair
    for (orig_market_raw, dup_market_raw), group in rules_grouped:
        
        orig_market = orig_market_raw.upper().strip()
        dup_market = dup_market_raw.upper().strip()
        
        # Get the set of ALL channels required for this specific duplication pair
        required_channels_set = set(group['Required_Channel_Norm'].unique())
        
        # Get the channels currently existing in the BSR for the target market
        existing_channels = existing_channels_map.get(dup_market, set())
        
        # Find the difference: Channels present in REQUIRED set but NOT in EXISTING set
        missing_channels = required_channels_set.difference(existing_channels)
        
        # 4. Validation Check and Logging
        if missing_channels:
            
            # Log the specific failure reason
            missing_channels_log.append({
                "Orig_Market": orig_market_raw,
                "Dup_Market": dup_market_raw, 
                "Orig_Channel_Count": orig_channel_count_map.get(orig_market, 0),
                "Dup_Channel_Exist_Count": len(existing_channels),
                "Missing_Channels_Count": len(missing_channels),
                "Missing_Channels_List": sorted(list(missing_channels))
            })
            
            # 5. Apply Flag to the BSR
            # Format the flag message with the comprehensive list
            missing_list_str = "; ".join(sorted(list(missing_channels)))
            flag_message = f"Completeness Error: {len(missing_channels)} Channel(s) missing. Required: [{missing_list_str}] (Source: {orig_market_raw})."
            
            # Flag ALL rows in the target Dup Market (since the issue is market-wide completeness)
            flag_mask = (df['Market'].astype(str).str.strip().str.upper() == dup_market)
            
            # Only flag rows that were not already flagged
            current_flags = df.loc[flag_mask, FLAG_COLUMN]
            rows_to_flag = flag_mask & (current_flags == 'OK')
            
            df.loc[rows_to_flag, FLAG_COLUMN] = flag_message
            rows_flagged += rows_to_flag.sum()


    final_status = "Completed" if rows_flagged == 0 else "Flagged"

    return {
        "check_key": "dup_channel_existence",
        "status": final_status,
        "action": "Duplication Channel Check",
        "description": f"Checked Duplication rules. Flagged {rows_flagged} rows in markets missing required channels.",
        "details": {
            "rows_flagged": int(rows_flagged),
            "missing_market_pairs_count": len(missing_channels_log),
            "missing_market_details": missing_channels_log
        }
    }

# =========================================================================
# === 2. EXECUTION AND SAVE (LOCAL FILESYSTEM) ============================
# =========================================================================

try:
    print("--- Starting Local File Processing ---")
    
    # 1. LOAD BSR DATA (Main Data)
    df_bsr = pd.read_excel(BSR_FILE_PATH, sheet_name=BSR_SHEET_NAME, header=BSR_HEADER_ROW)
    df_bsr.columns = [str(c).strip() for c in df_bsr.columns]

    # 2. LOAD AND FILTER MACRO DATA (Duplication Rules)
    df_macro = pd.read_excel(MACRO_FILE_PATH, sheet_name=MACRO_SHEET_NAME, header=MACRO_HEADER_INDEX)
    df_macro.columns = [str(c).strip() for c in df_macro.columns]

    filtered_df = df_macro[
        df_macro['Projects'].astype(str).str.contains(SEARCH_TERM, case=False, na=False)
    ].copy()
    
    # Final Duplication Rules DataFrame
    df_dup_rules = filtered_df[['Orig Market', 'Dup Market', 'Dup Channel']].copy()
    
    # 3. RUN VALIDATION (Function modifies df_bsr in place)
    validation_result = validate_dup_channel_existence(df_bsr, df_dup_rules)
    
    print("\n✅ VALIDATION COMPLETE:")
    print(f"Total Rows Flagged: {validation_result['details'].get('rows_flagged', 0)}")
    print(f"Total Unique Market Pairs Missing Channels: {validation_result['details'].get('missing_market_pairs_count', 0)}")
    
    # 4. SAVE PROCESSED FILE
    if not os.path.exists(OUTPUT_DIR):
        os.makedirs(OUTPUT_DIR)
        
    df_bsr.to_excel(OUTPUT_FULL_PATH, index=False, sheet_name=BSR_SHEET_NAME)

    print("-" * 50)
    print(f"✅ Success! Processed file saved locally to: {OUTPUT_FULL_PATH}")
    print("-" * 50)


except FileNotFoundError as e:
    print(f"❌ File Error: Ensure both BSR ({BSR_FILE_PATH}) and Macro ({MACRO_FILE_PATH}) files exist. Error: {e}")
except KeyError as e:
    print(f"❌ Column Error: Missing expected column in BSR or Macro file: {e}")
except Exception as e:
    print(f"❌ An unexpected error occurred during execution: {e}")

--- Starting Local File Processing ---

✅ VALIDATION COMPLETE:
Total Rows Flagged: 509
Total Unique Market Pairs Missing Channels: 28
--------------------------------------------------
✅ Success! Processed file saved locally to: data\QC_Processed_BSR_Channel_Existence.xlsx
--------------------------------------------------
