In [1]:
import pandas as pd
df = pd.read_excel(
    "data/WF 3 F1-R12 - Great Britain.xlsx",
    sheet_name="Worksheet",  # Specify the tab name
    header=5 # Specify the 4th row as the header
)

print(df.head())

                      Region     Market  Market ID Broadcaster  \
0  Central and South America  Argentina       74.0        ESPN   
1  Central and South America  Argentina       74.0    Mediapro   
2  Central and South America  Argentina       74.0    Mediapro   
3  Central and South America  Argentina       74.0    Mediapro   
4  Central and South America  Argentina       74.0    Mediapro   

         TV-Channel  Channel ID Pay/Free TV Date (UTC/GMT)       Date  \
0        ESPN 2 ARG      1210.0         NaN     2025-07-04 2025-07-03   
1  Fox Sports (ARG)      2732.0         NaN     2025-07-04 2025-07-04   
2  Fox Sports (ARG)      2732.0         NaN     2025-07-04 2025-07-04   
3  Fox Sports (ARG)      2732.0         NaN     2025-07-05 2025-07-04   
4  Fox Sports (ARG)      2732.0         NaN     2025-07-05 2025-07-04   

                   Day  ... Spot price in Euro [1 sec.] Fixture analysis  \
0  2025-07-03 00:00:00  ...                         NaN              NaN   
1  2025-07-0

In [4]:
import pandas as pd
from datetime import datetime
import pandas as pd
import numpy as np
import re
from typing import List ,Dict,Any
from openpyxl.styles import PatternFill
from pandas.api.types import is_object_dtype, is_categorical_dtype, CategoricalDtype 
from fuzzywuzzy import fuzz
from datetime import datetime

def _create_session_schedule(self):
    """Creates the definitive schedule DataFrame for mapping."""
    data = {
        'Session': [
            'Practice 1', 'Practice 2', 'Practice 3', 
            'Qualifying', 'GRAND PRIX (52 LAPS OR 120 MINS)'
        ],
        'Date': [
            '4-Jul-2025', '4-Jul-2025', '5-Jul-2025', 
            '5-Jul-2025', '6-Jul-2025'
        ],
        'Start Time': [
            '11:30:00', '15:00:00', '10:30:00', 
            '14:00:00', '14:00:00'
        ]
    }
    df_schedule = pd.DataFrame(data)

    # 1. Standardize the Session names to your target categories
    df_schedule['Target_Competition'] = np.select(
        [
            df_schedule['Session'].str.contains('Practice'),
            df_schedule['Session'] == 'Qualifying',
            df_schedule['Session'].str.contains('GRAND PRIX')
        ],
        ['Training', 'Qualifying', 'Race'],
        default='Support' # Default should theoretically not be hit here
    )

    # 2. Create the precise key for joining (Date + Time)
    # NOTE: This assumes BSR's Start (UTC) is a string representation of time
    df_schedule['Date_Key'] = pd.to_datetime(df_schedule['Date']).dt.date
    df_schedule['Start_Key'] = df_schedule['Start Time'].str.strip()

    return df_schedule[['Target_Competition', 'Date_Key', 'Start_Key']]



In [5]:
def _impute_competition_sessions(self) -> Dict[str, Any]:
    """
    Imputes blank values in the 'Competition' column by matching 'Date' and 'Start Time' 
    against the known GP schedule.
    """
    initial_blanks = self.df['Competition'].isna().sum()
    
    # Check for required BSR columns
    required_cols = ['Date (UTC/GMT)', 'Start (UTC)', 'Competition']
    if not all(col in self.df.columns for col in required_cols):
         return {
            "check_key": "impute_competition",
            "status": "Skipped",
            "action": "Competition Imputation",
            "description": f"Skipped: Missing required BSR columns {required_cols}.",
            "details": {"rows_imputed": 0}
        }
    
    # Get the schedule reference table
    df_schedule = self._create_session_schedule()
    
    # --- Prepare BSR for Join ---
    df_temp = self.df.copy()
    
    # 1. Convert BSR Date column to match schedule key format (date object)
    df_temp['Date_Key'] = pd.to_datetime(df_temp['Date (UTC/GMT)']).dt.date
    
    # 2. Use BSR Start Time column as the time key (assuming it was standardized earlier)
    # NOTE: Assuming Start (UTC) was converted to string/object earlier
    df_temp['Start_Key'] = df_temp['Start (UTC)'].astype(str).str.strip() 

    # --- Perform Left Merge ---
    # Merge BSR with Schedule on (Date, Start Time) to pull in 'Target_Competition'
    df_temp = df_temp.merge(
        df_schedule, 
        on=['Date_Key', 'Start_Key'], 
        how='left', 
        suffixes=('', '_Map')
    )
    
    # --- Imputation Logic ---
    
    # Identify rows that are currently blank in the Competition column
    blank_mask = self.df['Competition'].isna() | (self.df['Competition'].astype(str).str.strip() == '')

    # Identify rows where the BSR matched a session in the schedule
    matched_mask = blank_mask & df_temp['Target_Competition'].notna()
    
    # 1. Fill matched blanks with the correct session name
    self.df.loc[matched_mask, 'Competition'] = df_temp.loc[matched_mask, 'Target_Competition']
    
    # 2. Fill the remaining unmatched blanks with 'Support'
    remaining_blanks_mask = self.df['Competition'].isna() | (self.df['Competition'].astype(str).str.strip() == '')
    self.df.loc[remaining_blanks_mask, 'Competition'] = 'Support'
    
    # --- Clean Up and Report ---
    rows_imputed = matched_mask.sum()
    rows_defaulted_to_support = remaining_blanks_mask.sum() - rows_imputed # Rough count of those not matched
    
    return {
        "check_key": "impute_competition",
        "status": "Completed",
        "action": "Competition Imputation",
        "description": f"Imputed blank 'Competition' values: {rows_imputed} rows mapped to sessions, {rows_defaulted_to_support} defaulted to 'Support'.",
        "details": {
            "rows_imputed": int(rows_imputed),
            "rows_defaulted_to_support": int(rows_defaulted_to_support),
            "initial_blanks": int(initial_blanks)
        }
    }

In [9]:
import pandas as pd
import numpy as np

# --- 1. Define the Schedule Function ---
def _create_session_schedule():
    """Creates the definitive schedule DataFrame for mapping."""
    data = {
        'Session': [
            'Practice 1', 'Practice 2', 'Practice 3', 
            'Qualifying', 'GRAND PRIX (52 LAPS OR 120 MINS)'
        ],
        'Date': [
            '4-Jul-2025', '4-Jul-2025', '5-Jul-2025', 
            '5-Jul-2025', '6-Jul-2025'
        ],
        'Start Time': [
            '11:30:00', '15:00:00', '10:30:00', 
            '14:00:00', '14:00:00'
        ]
    }
    df_schedule = pd.DataFrame(data)

    # Standardize Session names to your target categories
    df_schedule['Target_Competition'] = np.select(
        [
            df_schedule['Session'].str.contains('Practice'),
            df_schedule['Session'] == 'Qualifying',
            df_schedule['Session'].str.contains('GRAND PRIX')
        ],
        ['Training', 'Qualifying', 'Race'],
        default='Support' # Default should not be hit if input is clean
    )

    # Create the precise key for joining: Date (as date object) + Time (as string)
    df_schedule['Date_Key'] = pd.to_datetime(df_schedule['Date']).dt.date
    df_schedule['Start_Key'] = df_schedule['Start Time'].str.strip()

    return df_schedule[['Target_Competition', 'Date_Key', 'Start_Key']]


# --- 2. Load and Prepare Data ---
try:
    df = pd.read_excel(
        "data/WF 3 F1-R12 - Great Britain.xlsx",
        sheet_name="Worksheet",
        header=5
    )
except FileNotFoundError:
    print("Execution halted: File not found.")
    df = pd.DataFrame() # Create an empty DF to avoid further errors


if not df.empty and 'Competition' in df.columns:
    
    # Preparation Steps for BSR (Needed for merging)
    df['Date_Key'] = pd.to_datetime(df['Date (UTC/GMT)']).dt.date
    df['Start_Key'] = df['Start (UTC)'].astype(str).str.strip() 

    # Get the schedule reference table
    df_schedule = _create_session_schedule()

    # --- 3. Perform Left Merge (Mapping) ---
    # Merge BSR with Schedule on (Date, Start Time) to pull in 'Target_Competition'
    df = df.merge(
        df_schedule, 
        on=['Date_Key', 'Start_Key'], 
        how='left', 
        suffixes=('', '_Map')
    )

    # --- 4. Imputation Logic: Create New Column ---

    # Initialize the new column with a placeholder value indicating no match was found
    df['Imputed_Competition'] = np.nan 
    
    # 1. Fill the new column with the successfully mapped session name (Training, Qualifying, Race)
    # This targets ALL rows, regardless of whether the original Competition column was blank.
    matched_mask = df['Target_Competition'].notna()
    df.loc[matched_mask, 'Imputed_Competition'] = df.loc[matched_mask, 'Target_Competition']
    rows_imputed = matched_mask.sum() # Total rows that matched the schedule

    # 2. Default the remaining NaNs in the new column to 'Support'
    rows_defaulted_to_support = df['Imputed_Competition'].isna().sum()
    df['Imputed_Competition'] = df['Imputed_Competition'].fillna('Support')

    # --- Clean Up and Display Results ---
    df = df.drop(columns=['Date_Key', 'Start_Key', 'Target_Competition'])

    # Calculate initial and final states for comparison
    initial_blanks = df['Competition'].isna().sum() 
    
    print(f"\n--- Competition Imputation Results ---")
    print(f"Total Rows Mapped to Sessions (Training/Qualifying/Race): {rows_imputed}")
    print(f"Total Rows Defaulted to 'Support': {rows_defaulted_to_support}")
    
    print("\nDataFrame head showing original and imputed values:")
    # Display the original Competition column alongside the new imputed one
    print(df[['Date (UTC/GMT)', 'Start (UTC)', 'Competition', 'Imputed_Competition', 'Program Title']].head(10))

else:
    print("DataFrame is empty or missing the 'Competition' column.")


--- Competition Imputation Results ---
Total Rows Mapped to Sessions (Training/Qualifying/Race): 139
Total Rows Defaulted to 'Support': 2855

DataFrame head showing original and imputed values:
  Date (UTC/GMT) Start (UTC) Competition Imputed_Competition  \
0     2025-07-04    01:00:00     Support             Support   
1     2025-07-04    19:00:03    Training             Support   
2     2025-07-04    20:09:29    Training             Support   
3     2025-07-05    00:01:16    Training             Support   
4     2025-07-05    01:11:25    Training             Support   
5     2025-07-05    18:01:27    Training             Support   
6     2025-07-05    19:12:10  Qualifying             Support   
7     2025-07-05    22:30:00    Training             Support   
8     2025-07-05    23:40:39  Qualifying             Support   
9     2025-07-06    10:30:09    Training             Support   

                                       Program Title  
0  El Show de la F1        -O El Show de la F

 'Training' 'Training' 'Qualifying' 'Race' 'Qualifying' 'Training'
 'Training' 'Training' 'Qualifying' 'Race' 'Training' 'Training'
 'Training' 'Training' 'Qualifying' 'Race' 'Race' 'Qualifying' 'Training'
 'Training' 'Training' 'Qualifying' 'Race' 'Training' 'Training'
 'Training' 'Qualifying' 'Training' 'Training' 'Training' 'Qualifying'
 'Race' 'Training' 'Training' 'Training' 'Qualifying' 'Race' 'Training'
 'Training' 'Training' 'Qualifying' 'Race' 'Training' 'Training'
 'Training' 'Qualifying' 'Race' 'Training' 'Training' 'Training'
 'Qualifying' 'Race' 'Training' 'Training' 'Training' 'Qualifying' 'Race'
 'Race' 'Training' 'Training' 'Training' 'Qualifying' 'Race' 'Training'
 'Training' 'Training' 'Qualifying' 'Race' 'Training' 'Training'
 'Training' 'Qualifying' 'Race' 'Qualifying' 'Race' 'Qualifying'
 'Training' 'Training' 'Training' 'Qualifying' 'Race' 'Training'
 'Training' 'Qualifying' 'Race' 'Training' 'Training' 'Training'
 'Qualifying' 'Race' 'Training' 'Training' 'Traini

In [15]:
import pandas as pd
import numpy as np
from datetime import timedelta

# Define the schedule function outside the main logic block
def _get_f1_live_schedule():
    """Creates a standardized DataFrame of official F1 live session windows (UTC)."""
    data = {
        'Session': ['Practice 1', 'Practice 2', 'Practice 3', 'Qualifying', 'GRAND PRIX'],
        'Date': ['4-Jul-2025', '4-Jul-2025', '5-Jul-2025', '5-Jul-2025', '6-Jul-2025'],
        'Start Time': ['11:30:00', '15:00:00', '10:30:00', '14:00:00', '14:00:00'],
        'End Time': ['12:30:00', '16:00:00', '11:30:00', '15:00:00', '16:00:00']
    }
    df_schedule = pd.DataFrame(data)

    df_schedule['Live_Start_UTC'] = pd.to_datetime(df_schedule['Date'] + ' ' + df_schedule['Start Time'])
    df_schedule['Live_End_UTC'] = pd.to_datetime(df_schedule['Date'] + ' ' + df_schedule['End Time'])

    return df_schedule

def get_time_string(series):
    dt_series = pd.to_datetime(series, errors='coerce', format='mixed')
    time_series = dt_series.dt.strftime('%H:%M:%S').fillna('00:00:00')
    return time_series

# --- 2. Load and Prepare Data (Mocking a BSR Load) ---
try:
    df = pd.read_excel(
        "data/WF 3 F1-R12 - Great Britain.xlsx",
        sheet_name="Worksheet",
        header=5
    )
except FileNotFoundError:
    print("Execution halted: File not found.")
    df = pd.DataFrame() 

if not df.empty:
    NEW_COL = 'Imputed_Program_Type'
    
    # 1. Clean and Prepare Time/Date Columns (Essential for logic)
    df['Program Title'] = df['Program Title'].astype(str).str.strip()
    df['Start_Time_Clean'] = get_time_string(df['Start (UTC)'])
    df['End_Time_Clean'] = get_time_string(df['End (UTC)'])
    
    try:
        # Create robust datetime objects for comparison
        date_dt = pd.to_datetime(df['Date (UTC/GMT)'], errors='coerce', format='mixed').dt.strftime('%Y-%m-%d')
        date_dt_clean = date_dt.fillna('1970-01-01') 

        bsr_dates = pd.to_datetime(date_dt_clean + ' ' + df['Start_Time_Clean'], errors='coerce')
        bsr_ends = pd.to_datetime(date_dt_clean + ' ' + df['End_Time_Clean'], errors='coerce')
        
        # Calculate BSR Duration in minutes
        bsr_duration_minutes = (bsr_ends - bsr_dates) / timedelta(minutes=1)
        
    except Exception as e:
        print(f"ERROR: Failed to parse BSR Date/Time columns: {e}")
        df = pd.DataFrame() 

if not df.empty:
    
    # Initialize the new column with the default category
    df[NEW_COL] = 'Magazine & Support'
    rows_imputed = 0
    
    df_schedule = _get_f1_live_schedule()

    # --- Logical Thresholds ---
    LIVE_TIME_TOLERANCE_MIN = 5      # Must start within 5 minutes of official live start
    LIVE_DURATION_TOLERANCE_PCT = 0.10 # Duration must be within 10% of official live duration
    HIGHLIGHT_MAX_DURATION_MIN = 60  # Highlights/Magazines are typically 1 hour or less

    # --- Primary Program Type Imputation ---
    
    for _, session in df_schedule.iterrows():
        live_start = session['Live_Start_UTC']
        live_end = session['Live_End_UTC']
        live_duration = (live_end - live_start) / timedelta(minutes=1)
        
        # 1. Calculate time differences and duration windows
        time_diff_abs = (bsr_dates - live_start).abs() / timedelta(minutes=1)
        time_diff_actual = (bsr_dates - live_start) / timedelta(minutes=1)
        
        duration_min = live_duration * (1 - LIVE_DURATION_TOLERANCE_PCT)
        duration_max = live_duration * (1 + LIVE_DURATION_TOLERANCE_PCT)
        
        is_long_duration = (bsr_duration_minutes >= duration_min)
        is_short_duration = (bsr_duration_minutes <= HIGHLIGHT_MAX_DURATION_MIN)
        
        # --- SEGREGATION MASKS (Hierarchical Logic) ---
        
        # MASK A: LIVE (Highest Confidence)
        # Starts very close to official time (before or slightly after) AND matches duration.
        LIVE_MATCH_MASK = is_long_duration & (time_diff_abs <= LIVE_TIME_TOLERANCE_MIN)

        # MASK B: REPEAT (Full session rebroadcast)
        # Matches duration but starts SIGNIFICANTLY later (e.g., more than 6 hours later, or next day)
        # This catches full-session reruns that are not live.
        REPEAT_MATCH_MASK = is_long_duration & (time_diff_actual > (6 * 60)) 

        # MASK C: HIGHLIGHTS
        # Short duration AND not a repeat AND occurs after the official session end.
        # It's an approximation, but captures post-session short-form content.
        HIGHLIGHTS_MATCH_MASK = is_short_duration & \
                                (bsr_dates > live_end) & \
                                (time_diff_actual > LIVE_TIME_TOLERANCE_MIN)
        
        # --- Apply Flags (Hierarchically) ---
        
        # Only target rows that still hold the default 'Magazine & Support' flag

        # 1. Apply LIVE Flag
        live_final_mask = LIVE_MATCH_MASK & (df[NEW_COL] == 'Magazine & Support')
        df.loc[live_final_mask, NEW_COL] = 'Live'
        rows_imputed += live_final_mask.sum()
        
        # 2. Apply REPEAT Flag (Only to rows NOT already marked Live)
        repeat_final_mask = REPEAT_MATCH_MASK & (df[NEW_COL] == 'Magazine & Support')
        df.loc[repeat_final_mask, NEW_COL] = 'Repeat'
        rows_imputed += repeat_final_mask.sum()
        
        # 3. Apply HIGHLIGHTS Flag (Only to remaining rows)
        highlights_final_mask = HIGHLIGHTS_MATCH_MASK & (df[NEW_COL] == 'Magazine & Support')
        df.loc[highlights_final_mask, NEW_COL] = 'Highlights'
        rows_imputed += highlights_final_mask.sum()


    # --- Final Cleanup and Display Results ---
    
    rows_defaulted_to_support = (df[NEW_COL] == 'Magazine & Support').sum()
    
    print(f"\n--- Program Type Imputation Results (Logical) ---")
    print(f"Total Rows Imputed (Live/Repeat/Highlights): {rows_imputed}")
    print(f"Total Rows Defaulted to 'Magazine & Support': {rows_defaulted_to_support}")
    
    print("\nDataFrame head showing Imputed Program Type:")
    print(df[['Date (UTC/GMT)', 'Start (UTC)', 'Program Title', 'Type of program', NEW_COL]].head(10))


--- Program Type Imputation Results (Logical) ---
Total Rows Imputed (Live/Repeat/Highlights): 2015
Total Rows Defaulted to 'Magazine & Support': 979

DataFrame head showing Imputed Program Type:
  Date (UTC/GMT) Start (UTC)  \
0     2025-07-04    01:00:00   
1     2025-07-04    19:00:03   
2     2025-07-04    20:09:29   
3     2025-07-05    00:01:16   
4     2025-07-05    01:11:25   
5     2025-07-05    18:01:27   
6     2025-07-05    19:12:10   
7     2025-07-05    22:30:00   
8     2025-07-05    23:40:39   
9     2025-07-06    10:30:09   

                                       Program Title     Type of program  \
0  El Show de la F1        -O El Show de la F1   ...  Magazine & Support   
1                             FORMULA 1 PRACTICAS(R)              Repeat   
2                            FORMULA 1 PRACTICAS(R2)              Repeat   
3                            FORMULA 1 PRACTICAS(R3)              Repeat   
4                            FORMULA 1 PRACTICAS(R4)              Repe