This file...    
...contains code to implement the left-to-right transformation (*standardize_attack_direction_with_period()*),  
...labels scoring and conceding actions (*scores()* and *concedes()*),  
...creates the game states (*add_previous_actions_features()*).



In [1]:
import pandas as pd

In [None]:
df = pd.read_csv('../../processed_data/all_actions_4.0.csv')

In [3]:
df['team'] = df['team'].map({True: 1, False: 0})

In [4]:
print(df.columns)

Index(['game_id', 'timestamp', 'period', 'actiontype', 'actiontype_eventpass',
       'actiontype_eventshot', 'actiontype_eventturnover',
       'actionresult_success', 'actionresult_miss', 'time', 'team',
       'start_x_ai', 'start_z_ai', 'end_x_ai', 'end_z_ai',
       'start_dist_to_goal_ai', 'start_angle_to_angle_ai',
       'end_dist_to_goal_ai', 'end_angle_to_angle_ai', 'movement_ai',
       'goalscore_team', 'goalscore_opponent', 'goalscore_diff',
       'home_strength', 'visitor_strength', 'strength_difference',
       'defensive_density', 'speed', 'time_delta_i', 'dx_a0i', 'dy_a0i',
       'mov_a0i'],
      dtype='object')


In [5]:
df.head(5)

Unnamed: 0,game_id,timestamp,period,actiontype,actiontype_eventpass,actiontype_eventshot,actiontype_eventturnover,actionresult_success,actionresult_miss,time,...,goalscore_diff,home_strength,visitor_strength,strength_difference,defensive_density,speed,time_delta_i,dx_a0i,dy_a0i,mov_a0i
0,0,1729120000000.0,1,EventPass,1,0,0,1,0,6,...,0,6,6,0,0.428,6.161367,1.0,-0.9478,2.8805,3.032426
1,0,1729120000000.0,1,EventPass,1,0,0,1,0,7,...,0,6,6,0,0.4189,2.637392,7.0,19.3504,-20.9293,28.503922
2,0,1729120000000.0,1,EventPass,1,0,0,1,0,14,...,0,6,6,0,0.071,0.430129,3.0,-14.4556,11.1611,18.262928
3,0,1729120000000.0,1,EventShot,0,1,0,0,1,17,...,0,6,6,0,0.0,7.305552,1.0,0.0,0.0,0.0
4,0,1729120000000.0,1,EventShot,0,1,0,0,1,18,...,0,6,6,0,0.0,7.23801,7.0,0.0,0.0,0.0


In [6]:
unique_counts = df['actiontype'].value_counts()
print(unique_counts)
# 874 GOALS CONFIRMED

actiontype
EventPass        79135
EventShot        22708
EventTurnover       76
Name: count, dtype: int64


In [8]:
df = df.drop(columns=['timestamp'])

In [9]:
len(df)

101919

In [None]:
def standardize_attack_direction_with_period(df):
    df = df.copy()

    # Group the actions by each game
    for game_id, game_df in df.groupby('game_id'):
            
        first_period_shots = game_df[(game_df['period'] == 1) & (game_df['actiontype_eventshot'] == 1)]

        # Only proceed if both teams have at least one shot
        if first_period_shots['team'].nunique() < 2:
            print("WARNING")
            continue

        # Calculate mean or median start_x_ai per team
        team_avg_x = first_period_shots.groupby('team')['start_x_ai'].median()

        # Team with lower start_x_ai is attacking left
        att_left_team = team_avg_x.idxmin()
            
        # Iterate over the game data and apply the period-based flipping logic
        for idx, row in game_df.iterrows():
            flip = False
            
            # Get the period for this action
            period = row['period']
            att_team = row['team']
            
            # Determine which direction the team should be attacking based on period
            if (period % 2 == 1 and (att_team == att_left_team)) or (period % 2 == 0 and (att_team != att_left_team)):
                # This means the team should be flipped in this period
                flip = True
            # If the expected direction does not match the actual direction, flip the action
            if flip:
                # Flip the coordinates (x and z) for the action
                df.loc[idx, ['start_x_ai', 'end_x_ai']] *= -1
                df.loc[idx, ['start_z_ai', 'end_z_ai']] *= -1

    return df

In [11]:
actions_df = standardize_attack_direction_with_period(df)



In [12]:
successful_shots = actions_df[actions_df['actiontype_eventshot'] == 1]
successful_shots = successful_shots[successful_shots['actionresult_success'] == 1]

# Shots that end in left goal (x = -26.95)
wrong_direction = successful_shots[successful_shots['end_x_ai'] == -26.95]

# Percentage
error_rate = len(wrong_direction) / len(successful_shots) * 100
print(f"Percentage of wrong-direction goals: {error_rate:.2f}%")

Percentage of wrong-direction goals: 0.46%


In [14]:
def count_and_filter_possession_sequences(actions_df, max_gap_seconds=8, min_count=3):
    actions_df = actions_df.copy()
    actions_df["sequence_id"] = None

    current_team = None
    last_time = None
    sequence_id = -1
    count_in_sequence = 0
    indices_in_sequence = []

    for i in range(len(actions_df)):
        row = actions_df.iloc[i]
        team = row["team"]
        time = row["time"]

        if team == current_team and (time - last_time <= max_gap_seconds):
            count_in_sequence += 1
            indices_in_sequence.append(i)
        else:
            if count_in_sequence >= min_count:
                for idx in indices_in_sequence:
                    actions_df.at[idx, "sequence_id"] = sequence_id
            # reset sequence
            sequence_id += 1
            current_team = team
            count_in_sequence = 1
            indices_in_sequence = [i]

        last_time = time

    # Final sequence check
    if count_in_sequence >= min_count:
        for idx in indices_in_sequence:
            actions_df.at[idx, "sequence_id"] = sequence_id

    # Keep only valid sequences
    actions_df = actions_df[actions_df["sequence_id"].notna()].copy()
    actions_df["sequence_id"] = actions_df["sequence_id"].astype(int)

    return actions_df

In [15]:
clean_df = count_and_filter_possession_sequences(actions_df, 8, 3)

In [None]:
clean_df.to_pickle('../testing/clean_df.pkl')
actions_df.to_pickle('data/dirty_df.pkl')

In [17]:
print(len(clean_df))

67276


In [18]:
def scores(actions_df, nr_actions=4):
    goals = (actions_df["actiontype"] == "EventShot") & (actions_df["actionresult_success"] == 1)

    y = pd.DataFrame({
        "goal": goals,
        "team": actions_df["team"],
        "possession": actions_df["team"]  # assuming possession = team here
    })

    for i in range(1, nr_actions):
        for c in ["goal", "team", "possession"]:
            shifted = y[c].shift(-i)
            if c == "goal":
                shifted[-i:] = False
            else:
                shifted[-i:] = None
            y[f"{c}+{i}"] = shifted

    res = y["goal"].copy()

    # Track whether possession has stayed the same up to each step
    possession_mask = pd.Series(True, index=y.index)

    for i in range(1, nr_actions):
        still_same_possession = y[f"possession+{i}"] == y["possession"]
        possession_mask = possession_mask & still_same_possession.fillna(False)

        same_team_goal = y[f"goal+{i}"] & (y[f"team+{i}"] == y["team"])
        valid = possession_mask & same_team_goal
        res = res | valid

    return pd.DataFrame(res, columns=["scores"])

In [19]:
def concedes(actions_df, nr_actions=4):
    # Detect which actions are goals
    goals = (actions_df["actiontype"] == "EventShot") & (actions_df["actionresult_success"] == 1)

    # Current team (assumed to be in possession)
    team = actions_df["team"]

    # Initialize result as all False
    res = pd.Series(False, index=actions_df.index)

    for idx in range(len(actions_df)):
        original_team = team.iloc[idx]
        possession_changed = False

        for i in range(1, nr_actions):
            if idx + i >= len(actions_df):
                break

            future_team = team.iloc[idx + i]
            future_goal = goals.iloc[idx + i]

            if not possession_changed:
                if future_team != original_team:
                    possession_changed = True
            else:
                # Possession changed again – stop looking further
                if future_team == original_team:
                    break

            # If goal is scored by opponent after a single possession change
            if possession_changed and future_goal and future_team != original_team:
                res.iloc[idx] = True
                break  # No need to check further

    return pd.DataFrame(res, columns=["concedes"])

In [20]:
Y_scores = scores(actions_df)
Y_concedes = concedes(actions_df)
Y = pd.concat([Y_scores, Y_concedes], axis=1)

In [21]:
Y_scores_clean = scores(clean_df, 4)
Y_concedes_clean = concedes(clean_df, 4)
Y_clean = pd.concat([Y_scores_clean, Y_concedes_clean], axis=1)

In [22]:
n = 3

In [23]:
actions_df = actions_df.drop(['actiontype', 'game_id', 'home_strength', 'visitor_strength', 'goalscore_team', 'goalscore_opponent'], axis=1)

In [24]:
clean_df = clean_df.drop(['actiontype', 'game_id', 'home_strength', 'visitor_strength', 'goalscore_team', 'goalscore_opponent', 'sequence_id'], axis=1)

In [25]:
def add_previous_actions_features(df, n=1, exclude_columns=None):
    """
    Appends features from previous n actions to each row in the dataframe.

    Parameters:
    - df: pd.DataFrame of actions
    - n: number of previous actions to include
    - exclude_columns: list of column names to exclude from copying

    Returns:
    - pd.DataFrame with additional columns for previous actions
    """
    df = df.reset_index(drop=True)  # Ensure a clean index
    output = df.copy()
    
    if exclude_columns is None:
        exclude_columns = []

    for i in range(1, n + 1):
        shifted = df.shift(i).add_suffix(f"_prev{i}")
        # Drop excluded columns
        for col in exclude_columns:
            if col in df.columns:
                shifted.drop(f"{col}_prev{i}", axis=1, inplace=True, errors='ignore')
        output = pd.concat([output, shifted], axis=1)

    return output

In [26]:
gamestates = add_previous_actions_features(actions_df, n)
Y = Y.iloc[n:]
X = gamestates.iloc[n:]

In [27]:
print(len(X.columns))

100


In [28]:
gamestates_clean = add_previous_actions_features(clean_df, n)
Y_clean = Y_clean.iloc[n:]
X_clean = gamestates_clean.iloc[n:]

In [None]:
X.to_pickle('data/X.pkl')
Y.to_pickle('data/Y.pkl')

In [None]:
X_clean.to_pickle('../testing/X_clean.pkl')
Y_clean.to_pickle('../testing/Y_clean.pkl')

In [31]:
def calculate_average_possession_length(df, max_gap_seconds=5):
    # Initialize variables for possession sequence calculation
    sequence_lengths = []
    current_team = None
    last_time = None
    count_in_sequence = 0

    # Loop through the DataFrame to calculate sequences
    for i, row in df.iterrows():
        team = row['team']
        time = row['time']
        
        if current_team is None or (team == current_team and time - last_time <= max_gap_seconds):
            # Continue the current possession sequence
            count_in_sequence += 1
        else:
            # End the current sequence and start a new one
            sequence_lengths.append(count_in_sequence)
            count_in_sequence = 1  # Reset for new sequence

        # Update the team and last time
        current_team = team
        last_time = time
    
    # Append the final sequence length
    if count_in_sequence > 0:
        sequence_lengths.append(count_in_sequence)

    # Calculate and return the average possession length
    average_length = sum(sequence_lengths) / len(sequence_lengths) if sequence_lengths else 0
    print(f"Average possession sequence length: {average_length:.2f} actions")
    return average_length

In [32]:
print("Actions_df total actions:", len(actions_df))
print("Clean_df total actions:", len(clean_df))

Actions_df total actions: 101919
Clean_df total actions: 67276


In [33]:
# Calculate and print the average possession length for both dataframes
print("Average possession length (actions_df):")
calculate_average_possession_length(actions_df)

print("\nAverage possession length (clean_df):")
calculate_average_possession_length(clean_df)

Average possession length (actions_df):
Average possession sequence length: 2.17 actions

Average possession length (clean_df):
Average possession sequence length: 3.34 actions


3.3390907286082987

In [None]:
# Filter for only successful shots (goals)
filtered_df = clean_df[(clean_df['actiontype_eventshot'] == 1) & (clean_df['actionresult_success'] == 1)]

# Group by game_id and count goals
average_goals_per_game = filtered_df.groupby("game_id").size().mean()

print(f"Average goals per game: {average_goals_per_game:.2f}")

Average goals per game: 3.85
