# IE423 Project: CUSUM-Based Early Warning System

This notebook implements a CUSUM (Cumulative Sum) control chart-based early warning system to detect significant in-match events such as goals and red cards using minute-by-minute soccer match data.

## Objectives
- Detect abnormal shifts in match statistics (e.g., possession, attacks, shots).
- Generate early warnings before goals or red cards.
- Evaluate alarms in terms of early detection, false alarms, and missed events.

The methodology follows the course project description and draws on the example solutions provided earlier.


In [1]:
import pandas as pd
import numpy as np

## Load and Prepare Data

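The dataset contains minute-by-minute statistics for multiple matches. First-half stoppage time can push first-half minute values past 45, so they may overlap with the minutes of the second half. To obtain a continuous timeline for each match, we shift every second-half minute forward by the amount of first-half stoppage time (the largest first-half minute minus 45, when positive).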


In [2]:
# Load the match data into a DataFrame; the path is relative to the current
# working directory, so the file must sit next to the notebook when it runs.
df = pd.read_csv("fall25_ie423_project_data_sample.txt")

def adjust_half_minutes(match_df):
    """Return a copy of ``match_df`` with second-half minutes shifted forward.

    The shift equals the first-half stoppage time, i.e. the largest
    ``minute`` among rows whose ``halftime`` is ``'1st-half'`` minus 45,
    floored at zero. When there are no first-half rows the first half is
    taken to end at minute 45, so nothing moves. Frames without a
    ``halftime`` column, or without any ``'2nd-half'`` row, are returned
    as an unchanged copy. The input frame is never modified.
    """
    shifted = match_df.copy()

    # Guard clauses: nothing to adjust without second-half rows.
    if 'halftime' not in shifted.columns:
        return shifted
    is_second_half = shifted['halftime'] == '2nd-half'
    if not is_second_half.any():
        return shifted

    is_first_half = shifted['halftime'] == '1st-half'
    last_first_half_minute = shifted.loc[is_first_half, 'minute'].max()
    if pd.isna(last_first_half_minute):
        last_first_half_minute = 45

    # Only a positive amount of stoppage time moves the second half.
    stoppage = last_first_half_minute - 45
    offset = stoppage if stoppage > 0 else 0
    shifted.loc[is_second_half, 'minute'] += offset
    return shifted

# Fix the half-time minute overlap match by match, order each match
# chronologically, then stitch everything back into a single frame.
df_list = [
    adjust_half_minutes(match_data).sort_values('minute')
    for _, match_data in df.groupby('match_id')
]
df = pd.concat(df_list, ignore_index=True)


FileNotFoundError: [Errno 2] No such file or directory: 'fall25_ie423_project_data_sample.txt'

## CUSUM Alarm Detection

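We apply a two-sided CUSUM control chart to each selected metric, taking first differences when the metric is cumulative. The in-control mean and standard deviation are estimated from the opening minutes of the match (the baseline window), and alarms can only be raised after that window. Each alarm is then matched against goals and red cards that occur within a few minutes of it: matched alarms count as detections, unmatched alarms as false alarms, and events with no matching alarm as missed.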


In [None]:
def _collect_events(match_df):
    """Return ``(label, minute)`` tuples for flagged goal/red-card rows, ordered by minute."""
    event_labels = (
        ("goal_home_event", "Goal - Home"),
        ("goal_away_event", "Goal - Away"),
        ("red_home_event", "Red Card - Home"),
        ("red_away_event", "Red Card - Away"),
    )
    events = []
    for _, row in match_df.iterrows():
        for column, label in event_labels:
            # A missing event column simply never flags an event.
            if row.get(column, 0) == 1.0:
                events.append((label, int(row['minute'])))
    # The sort is stable, so events in the same minute keep their row order.
    events.sort(key=lambda event: event[1])
    return events


def _baseline_stats(values, in_baseline):
    """Return ``(mu, sigma)`` of the finite baseline samples, or ``(0.0, 1.0)`` if there are none."""
    base = values[in_baseline]
    # Drop NaN/inf so one missing reading cannot turn mu and sigma into NaN,
    # which would silently disable the chart for the whole match.
    base = base[np.isfinite(base)]
    if base.size == 0:
        return 0.0, 1.0
    mu = np.mean(base)
    sigma = np.std(base, ddof=0)
    if sigma == 0:
        # A constant baseline gives no scale; fall back to a unit scale.
        sigma = 1.0
    return mu, sigma


def _cusum_alarm_rows(values, monitored, mu, k, h):
    """Return the row positions at which the upper or lower CUSUM crosses ``h``."""
    alarm_rows = []
    s_pos, s_neg = 0.0, 0.0
    for i, x in enumerate(values):
        deviation = x - mu
        s_pos = max(0, s_pos + (deviation - k))
        s_neg = min(0, s_neg + (deviation + k))
        # Baseline rows still update the sums but can never raise an alarm.
        if not monitored[i]:
            continue
        if s_pos > h:
            alarm_rows.append(i)
            s_pos = 0
        if s_neg < -h:
            alarm_rows.append(i)
            s_neg = 0
    return alarm_rows


def _match_alarms(events, alarm_minutes, lead_window, lag_window):
    """Pair each event with at most one unused alarm and label the leftover alarms."""
    results = []
    used_alarms = set()
    for event_type, event_min in events:
        candidates = [
            a for a in alarm_minutes
            if event_min - lead_window <= a <= event_min + lag_window
            and a not in used_alarms
        ]
        if not candidates:
            results.append((f"{event_type} (Missed)", event_min, None, None))
            continue
        # Prefer the latest alarm at or before the event; otherwise take the
        # earliest alarm after it.
        not_after = [a for a in candidates if a <= event_min]
        chosen_alarm = max(not_after) if not_after else min(candidates)
        used_alarms.add(chosen_alarm)
        results.append((event_type, event_min, chosen_alarm, event_min - chosen_alarm))

    for alarm_min in alarm_minutes:
        if alarm_min not in used_alarms:
            results.append(("False Alarm", None, alarm_min, None))
    return results


def process_match(match_df, metrics, baseline_minutes=15, *, k_factor=0.5,
                  h_factor=5.0, lead_window=5, lag_window=5):
    """Run two-sided CUSUM charts on one match and score the alarms against events.

    Parameters
    ----------
    match_df : pandas.DataFrame
        Rows of a single match with a ``minute`` column, the monitored
        metric columns and, optionally, the event flag columns
        ``goal_home_event``, ``goal_away_event``, ``red_home_event`` and
        ``red_away_event`` (a value of 1 marks an event in that row).
    metrics : dict
        Maps a metric column name to a boolean; ``True`` means the column is
        first-differenced before monitoring. Columns that are not present in
        ``match_df`` are skipped.
    baseline_minutes : int, default 15
        Rows with ``minute <= baseline_minutes`` (capped at the last minute
        of the match) estimate the in-control mean and standard deviation.
        No alarm is raised on those rows.
    k_factor, h_factor : float, default 0.5 and 5.0
        Reference value and decision interval, as multiples of the baseline
        standard deviation.
    lead_window, lag_window : int, default 5 and 5
        An alarm can be matched to an event when it falls within
        ``lead_window`` minutes before or ``lag_window`` minutes after it.

    Returns
    -------
    pandas.DataFrame
        Columns ``event_type``, ``event_minute``, ``alarm_minute`` and
        ``time_difference`` (event minute minus alarm minute, so a negative
        value means the alarm came after the event). Events without an
        available alarm are labelled ``"<type> (Missed)"`` and alarms that
        match no event ``"False Alarm"``. An empty ``match_df`` yields an
        empty frame with these columns.
    """
    columns = ["event_type", "event_minute", "alarm_minute", "time_difference"]
    if match_df.empty:
        return pd.DataFrame([], columns=columns)

    # A stable sort keeps the original order of rows that share a minute.
    match_df = match_df.sort_values('minute', kind='mergesort').reset_index(drop=True)
    minutes = match_df['minute'].to_numpy()
    events = _collect_events(match_df)

    baseline_end = baseline_minutes
    max_minute = match_df['minute'].max()
    if baseline_end > max_minute:
        baseline_end = max_minute
    in_baseline = minutes <= baseline_end
    monitored = ~in_baseline

    alarm_minutes = set()
    for metric, use_diff in metrics.items():
        if metric not in match_df.columns:
            continue
        series = match_df[metric].astype(float).to_numpy()
        # Prepending the first value gives the first row a difference of zero.
        values = np.diff(series, prepend=series[0]) if use_diff else series

        # Slicing the already-transformed values also covers a match with no
        # baseline rows at all; the stats then fall back to mu=0, sigma=1.
        mu, sigma = _baseline_stats(values, in_baseline)
        k = k_factor * sigma
        h = h_factor * sigma
        for i in _cusum_alarm_rows(values, monitored, mu, k, h):
            alarm_minutes.add(int(minutes[i]))

    results = _match_alarms(events, sorted(alarm_minutes), lead_window, lag_window)
    return pd.DataFrame(results, columns=columns)


## Example: Running CUSUM on a Sample Match

In [None]:
# Every monitored column is flagged True, i.e. it is first-differenced
# before the CUSUM chart is applied.
metrics_to_monitor = dict.fromkeys(
    (
        "BALL_POSSESSION - home",
        "ATTACKS - home",
        "SHOTS_ON_TARGET - home",
    ),
    True,
)

# Use whichever match appears first in the combined frame as the example.
first_match_id = df['match_id'].iat[0]
first_match_df = df.loc[df['match_id'] == first_match_id]
result_example = process_match(first_match_df, metrics_to_monitor)
result_example