## __Import__

In [3]:
import sys
import os
import pandas as pd
import numpy as np
from ast import literal_eval
# Run the notebook from inside the notebooks folder
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__name__), '..')))
from scripts.dataloader import Dataloader


dataloader = Dataloader(file_path="../../data/new_approach/all_leagues.parquet")
dataloader.load_data()
df = dataloader.get_dimension("defending")

# load standard stats
standard_stats = pd.read_csv("../../data/new_approach/standard_stats_all.csv").loc[:,["player","player_id","full_match_equivalents"]]
standard_stats

[2025-07-08 21:28:06] Loading data from local file system


Unnamed: 0,player,player_id,full_match_equivalents
0,Leroy Sané,3053.0,0.0
1,Jean-Eric Maxim Choupo-Moting,3499.0,0.0
2,Joël Andre Job Matip,3502.0,0.0
3,Sead Kolašinac,3510.0,0.0
4,Pierre-Emile Højbjerg,3570.0,0.0
5,Younès Belhanda,5242.0,0.0
6,Andrej Kramarić,5460.0,1.0
7,Fabian Lukas Schär,5537.0,1.0
8,Sebastian Rudy,6039.0,1.0
9,Johannes Geis,6717.0,0.0


## __Setting types__

In [5]:
def convert_to_list(input_data):
    if(isinstance(input_data, str)):
        try:
            return literal_eval(input_data)
        except (ValueError, SyntaxError):
            print(f"Error: The string {input_data} could not be converted to a list.")
            return None
    
    return input_data
    
df["location"] = df["location"].apply(convert_to_list)
df[["x", "y"]] = df["location"].apply(pd.Series)
df["under_pressure"] = df["under_pressure"].fillna(False)
df

Unnamed: 0,player,player_id,type,location,ball_recovery_recovery_failure,ball_recovery_offensive,bad_behaviour_card,foul_committed_card,counterpress,block_offensive,...,foul_committed_offensive,foul_committed_penalty,foul_won_penalty,interception_outcome,clearance_body_part,duel_type,duel_outcome,under_pressure,x,y
3097,Leroy Sané,3053.0,Pressure,"[55.4, 73.9]",,,,,,,...,,,,,,,,False,55.4,73.9
3098,Benedikt Höwedes,7016.0,Pressure,"[47.3, 73.7]",,,,,,,...,,,,,,,,False,47.3,73.7
3099,Kevin Volland,8215.0,Pressure,"[96.8, 25.5]",,,,,,,...,,,,,,,,False,96.8,25.5
3100,Mark Uth,8387.0,Pressure,"[109.8, 43.8]",,,,,,,...,,,,,,,,False,109.8,43.8
3101,Pirmin Schwegler,8556.0,Pressure,"[81.4, 27.2]",,,,,,,...,,,,,,,,False,81.4,27.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6395867,Marcelo Vieira da Silva Júnior,5552.0,Ball Recovery,"[99.0, 21.0]",,,,,,,...,,,,,,,,False,99.0,21.0
6395868,Danilo Luiz da Silva,3063.0,Ball Recovery,"[81.0, 60.0]",,,,,,,...,,,,,,,,False,81.0,60.0
6396036,Keylor Navas Gamboa,5597.0,Bad Behaviour,,,,Yellow Card,,,,...,,,,,,,,False,,
6396037,Gabriel Fernández Arenas,6384.0,Bad Behaviour,,,,Yellow Card,,,,...,,,,,,,,False,,


## __Generate Features__

In [12]:
def is_in_defensive_penalty_area(x, y):
    x_axis = ((0 <= x) and (x <= 18))  # Stop before goal area
    y_axis = ((40 - 22) <= y and y <= (40 + 22))
    return x_axis and y_axis

def analyze_defending(df, standard_stats):
    """
    This function Pre-compute all conditions for Vectorize operations.
    Returns results grouped by player and under_pressure  
    """
    # Pre-compute all conditions
    df_with_flags = df.copy()

    # Actions
    df_with_flags["is_ball_recovery"] = df["type"] == "Ball Recovery"
    df_with_flags["is_ball_recovery_successful"] = df["ball_recovery_recovery_failure"].isna()
    df_with_flags["is_offensive_ball_recovery"] = df["ball_recovery_offensive"] == True
    df_with_flags["is_ball_recovery_failed"] = df["ball_recovery_recovery_failure"] == True
    df_with_flags["is_bad_behaviour"] = df["type"]=="Bad Behaviour"
    df_with_flags["is_yellow_card_bad_behaviour"] = (df["bad_behaviour_card"] == "Yellow Card") 
    df_with_flags["is_second_yellow_card_bad_behaviour"] = df["bad_behaviour_card"] == "Second Yellow"
    df_with_flags["is_red_card_bad_behaviour"] = df["bad_behaviour_card"] == "Red Card"
    df_with_flags["is_yellow_card_foul"] = (df["foul_committed_card"] == "Yellow Card")
    df_with_flags["is_second_yellow_card_foul"] = (df["foul_committed_card"] == "Second Yellow")
    df_with_flags["is_red_card_foul"] = (df["foul_committed_card"] == "Red Card")    
    df_with_flags["is_counterpress"] = df["counterpress"].notna()
    df_with_flags["is_pressure_on_opponent"] = df["type"] == "Pressure"
    df_with_flags["is_block"] = df["type"] == "Block"
    df_with_flags["is_block_offensive"] = df["block_offensive"] == True
    df_with_flags["is_block_ball_deflection"] = df["block_deflection"] == True
    df_with_flags["is_block_shot_on_target"] = df["block_save_block"] == True
    df_with_flags["is_clearance"] = df["type"] == "Clearance"
    df_with_flags["is_duel"] = df["type"] == "Duel"
    df_with_flags["is_foul"] = df["type"] == "Foul Committed"
    df_with_flags["is_offensive_foul"] = df["foul_committed_offensive"] == True
    df_with_flags["is_foul_penalty_resulted"] = df["foul_committed_penalty"] == True
    df_with_flags["teammate_is_fouled_in_op_penalty"] = df["foul_won_penalty"] == True
    df_with_flags["is_interception"] =(df["interception_outcome"]=="Success In Play") | (df["interception_outcome"]=="Won") 
    df_with_flags["is_shield"] = df["type"] == "Shield"


    # Location-based flags
    df_with_flags['is_attacking_third'] = df['x'] >= 80
    df_with_flags['is_middle_third'] = (80 > df['x']) & (df['x'] > 40)
    df_with_flags['is_defending_third'] = df['x'] <= 40
    df_with_flags['is_in_defending_box'] = df[["x","y"]].apply(lambda row: is_in_defensive_penalty_area(row['x'], row['y']), axis=1)

    # Combinations
    df_with_flags["ball_recovery_successful"] = df_with_flags["is_ball_recovery"] & df_with_flags["is_ball_recovery_successful"]
    df_with_flags["ball_recovery_failed"] = (df_with_flags["is_ball_recovery"])  & (df_with_flags["is_ball_recovery_failed"])
    df_with_flags["ball_recovery_offensive_successful"] = df_with_flags["is_offensive_ball_recovery"] & df_with_flags["is_ball_recovery_successful"]
    df_with_flags["block_during_counterpress"] = df_with_flags["is_block"] & df_with_flags["is_block"]
    df_with_flags["is_clearance_with_head"] = (df_with_flags["is_clearance"]) & (df["clearance_body_part"] == "Head")
    df_with_flags["is_duel_tackling"] = (df_with_flags["is_duel"]) & (df["duel_type"] == "Tackle")
    df_with_flags["is_duel_tackling_won"] = (df_with_flags["is_duel"]) & (df["duel_type"] == "Tackle") & ((df["duel_outcome"] == "Won") | (df["duel_outcome"] == "Success In Play"))
    df_with_flags["is_pressure_on_opponent_defending_third"] = df_with_flags["is_pressure_on_opponent"] & df_with_flags['is_defending_third']
    df_with_flags["is_pressure_on_opponent_middle_third"] = df_with_flags["is_pressure_on_opponent"] & df_with_flags['is_middle_third']
    df_with_flags["is_pressure_on_opponent_attacking_third"] = df_with_flags["is_pressure_on_opponent"] & df_with_flags['is_attacking_third']
    df_with_flags["is_counterpress_on_opponent_defending_third"] = df_with_flags["is_counterpress"] & df_with_flags['is_defending_third']
    df_with_flags["is_counterpress_on_opponent_middle_third"] = df_with_flags["is_counterpress"] & df_with_flags['is_middle_third']
    df_with_flags["is_counterpress_on_opponent_attacking_third"] = df_with_flags["is_counterpress"] & df_with_flags['is_attacking_third']
    df_with_flags["is_tackling_in_defending_third"] = df_with_flags["is_duel_tackling"] & df_with_flags['is_defending_third']
    df_with_flags["is_tackling_in_middle_third"] = df_with_flags["is_duel_tackling"] & df_with_flags['is_middle_third']
    df_with_flags["is_tackling_in_attacking_third"] = df_with_flags["is_duel_tackling"] & df_with_flags['is_attacking_third']

    df_with_flags["is_foul_in_defending_third"] = df_with_flags["is_foul"] & df_with_flags['is_defending_third']
    df_with_flags["is_foul_in_middle_third"] = df_with_flags["is_foul"] & df_with_flags['is_middle_third']
    df_with_flags["is_foul_in_attacking_third"] = df_with_flags["is_foul"] & df_with_flags['is_attacking_third']

    df_with_flags["total_yellow_card"] = df_with_flags["is_yellow_card_foul"] | df_with_flags["is_yellow_card_foul"] | df_with_flags["is_second_yellow_card_bad_behaviour"] | df_with_flags["is_yellow_card_bad_behaviour"]
    df_with_flags["total_red_card"] = df_with_flags["is_red_card_foul"] | df_with_flags["is_red_card_bad_behaviour"]
    

    total_stats = df_with_flags.groupby('player_id').agg(
        # ball recovery
        ball_recovery_total=('is_ball_recovery', 'sum'),
        ball_recovery_successful=("ball_recovery_successful","sum"),
        ball_recovery_offensive_total=("is_offensive_ball_recovery", "sum"),
        ball_recovery_offensive_successful=("ball_recovery_offensive_successful", "sum"),
        ball_recovery_failed=("ball_recovery_failed", "sum"),
        # pressure 
        pressure_on_opponent=("is_pressure_on_opponent","sum"),
        pressure_on_opponent_defending_third=("is_pressure_on_opponent_defending_third","sum"),
        pressure_on_opponent_middle_third=("is_pressure_on_opponent_middle_third","sum"),
        pressure_on_opponent_attacking_third=("is_pressure_on_opponent_attacking_third","sum"),
        # counterpressing
        counterpressing_total = ("is_counterpress","sum"),
        counterpressing_opponent_defending_third = ("is_counterpress_on_opponent_defending_third","sum"),
        counterpressing_opponent_middle_third = ("is_counterpress_on_opponent_middle_third","sum"),
        counterpressing_attacking_third = ("is_counterpress_on_opponent_attacking_third","sum"),
        # shields
        shield_total=("is_shield","sum"),
        # blocking
        block_total = ("is_block", "sum"),
        block_offensive = ("is_block_offensive", "sum"),
        block_ball_deflection = ("is_block_ball_deflection", "sum"),
        block_counterpress = ("is_block_ball_deflection", "sum"),
        block_during_counterpress =("block_during_counterpress", "sum"),
        block_shot_on_target=("is_block_shot_on_target", "sum"),
        # clearance
        clearance_total=("is_clearance","sum"),
        clearance_with_head=("is_clearance_with_head","sum"),
        # interception
        interception=("is_interception","sum"),
        # duel
        tackling=("is_duel_tackling","sum"),
        tackling_won=("is_duel_tackling_won","sum"),
        tackling_in_defending_third=("is_tackling_in_defending_third","sum"),
        tackling_in_middle_third=("is_tackling_in_middle_third","sum"),
        tackling_in_attacking_third=("is_tackling_in_attacking_third","sum"),
        # fouls
        fouls_total=("is_foul","sum"),
        fouls_in_defending_third=("is_foul_in_defending_third","sum"),
        fouls_in_middle_third=("is_foul_in_middle_third","sum"),
        fouls_in_attacking_third=("is_foul_in_attacking_third","sum"),
        fouls_offensive=("is_offensive_foul","sum"),
        fouls_lead_to_penalty=("is_foul_penalty_resulted","sum"),
        fouls_wins_a_penalty=("teammate_is_fouled_in_op_penalty","sum"),
        fouls_yellow_card=("is_yellow_card_foul", "sum"),
        fouls_second_yellow_card=("is_second_yellow_card_foul", "sum"),
        fouls_red_card=("is_red_card_foul", "sum"),
        # bad_behaviour
        bad_behaviour_total=("is_bad_behaviour", "sum"),
        bad_behaviour_yellow_card=("is_yellow_card_bad_behaviour", "sum"),
        bad_behaviour_second_yellow_card=("is_second_yellow_card_bad_behaviour", "sum"),
        bad_behaviour_red_card=("is_red_card_bad_behaviour", "sum"),
        # total cards
        total_yellow = ("total_yellow_card", "sum"),
        total_red = ("total_red_card", "sum"),
    )

    ### calculate relative values ###

    calculation_pairs = [
        ("ball_recovery_successful","ball_recovery_total","ball_recovery_successful_%"),
        ("pressure_on_opponent_defending_third","pressure_on_opponent","pressure_on_opponent_defending_third_%"),
        ("pressure_on_opponent_middle_third","pressure_on_opponent","pressure_on_opponent_middle_third_%"),
        ("pressure_on_opponent_attacking_third","pressure_on_opponent","pressure_on_opponent_attacking_third_%"),
        ("counterpressing_opponent_defending_third","counterpressing_total","counterpressing_opponent_defending_third_%"),
        ("counterpressing_opponent_middle_third","counterpressing_total","counterpressing_opponent_middle_third_%"),
        ("counterpressing_attacking_third","counterpressing_total","counterpressing_opponent_attacking_third_%"),
        ("tackling_won","tackling","tackling_success_%"),
        ("tackling_in_defending_third","tackling","tackling_in_defending_third_%"),
        ("tackling_in_middle_third","tackling","tackling_in_middle_third_%"),
        ("tackling_in_attacking_third","tackling","tackling_in_attacking_third_%"),
        ("fouls_in_defending_third","fouls_total","fouls_in_defending_third_%"),
        ("fouls_in_middle_third","fouls_total","fouls_in_middle_third_%"),
        ("fouls_in_attacking_third","fouls_total","fouls_in_attacking_third_%"),
    ]

    for a, b, c in calculation_pairs:
        total_stats[c] = (total_stats[f'{a}'] / total_stats[f'{b}'])
        # replace inf
        total_stats[c] = total_stats[c].replace([np.inf, -np.inf], 0)
    total_stats = total_stats.fillna(0)

    ###  calcuate stats per match ###

    # merge standard stats with absolute values (result_df)
    absolute_column_values = [col for col in total_stats.columns if not col.endswith("_%") ]
    df_stats_per_game = pd.merge(left=standard_stats, right=total_stats[absolute_column_values],on="player_id",how="left")
    df_stats_per_game = df_stats_per_game.fillna(0)

    # calcuate stats per match and add to result_df
    for col in df_stats_per_game.drop(["player", "player_id", "full_match_equivalents"], axis=1).columns:
        col_name = f"{col}_per_match"
        try:
            df_stats_per_game[col_name] = (df_stats_per_game[col] / 90).round(3)
        except:
            print(col_name, "could not be calculated. Maybe it is not a numeric column?")

    # keep only per match stats
    column_per_match = [col for col in df_stats_per_game.columns if col.endswith("_per_match") or col=="player_id" or col=="player" ]
    df_stats_per_game = df_stats_per_game[column_per_match]

    # merge: abosulte, relative, per game values
    total_stats = pd.merge(left=total_stats, right=df_stats_per_game, on="player_id", how="right")
    total_stats = total_stats.fillna(0)


    return total_stats

result_df = analyze_defending(df, standard_stats)


Unnamed: 0,player_id,ball_recovery_total,ball_recovery_successful,ball_recovery_offensive_total,ball_recovery_offensive_successful,ball_recovery_failed,pressure_on_opponent,pressure_on_opponent_defending_third,pressure_on_opponent_middle_third,pressure_on_opponent_attacking_third,...,fouls_wins_a_penalty_per_match,fouls_yellow_card_per_match,fouls_second_yellow_card_per_match,fouls_red_card_per_match,bad_behaviour_total_per_match,bad_behaviour_yellow_card_per_match,bad_behaviour_second_yellow_card_per_match,bad_behaviour_red_card_per_match,total_yellow_per_match,total_red_per_match
0,3053.0,167.0,139.0,0,0,28,558.0,159.0,259.0,140.0,...,0.0,0.044,0.0,0.0,0.0,0.0,0.0,0.0,0.044,0.0
1,3499.0,115.0,104.0,1,1,11,315.0,92.0,157.0,66.0,...,0.0,0.011,0.0,0.0,0.0,0.0,0.0,0.0,0.011,0.0
2,3502.0,108.0,101.0,0,0,7,322.0,210.0,98.0,14.0,...,0.0,0.033,0.0,0.0,0.0,0.0,0.0,0.0,0.033,0.0
3,3510.0,77.0,71.0,0,0,6,313.0,153.0,118.0,42.0,...,0.0,0.056,0.0,0.0,0.0,0.0,0.0,0.0,0.056,0.0
4,3570.0,91.0,87.0,0,0,4,306.0,87.0,195.0,24.0,...,0.0,0.033,0.0,0.0,0.0,0.0,0.0,0.0,0.033,0.0
5,5242.0,60.0,55.0,0,0,5,204.0,43.0,107.0,54.0,...,0.0,0.022,0.0,0.0,0.0,0.0,0.0,0.0,0.022,0.0
6,5460.0,35.0,33.0,0,0,2,193.0,13.0,94.0,86.0,...,0.0,0.011,0.011,0.0,0.0,0.0,0.0,0.0,0.011,0.0
7,5537.0,70.0,68.0,0,0,2,172.0,95.0,66.0,11.0,...,0.0,0.033,0.0,0.0,0.0,0.0,0.0,0.0,0.033,0.0
8,6039.0,114.0,103.0,0,0,11,508.0,145.0,291.0,72.0,...,0.0,0.089,0.0,0.011,0.0,0.0,0.0,0.0,0.089,0.011
9,6717.0,119.0,110.0,0,0,9,456.0,176.0,252.0,28.0,...,0.0,0.044,0.0,0.011,0.0,0.0,0.0,0.0,0.044,0.011


## __Storing data__

In [14]:
import numpy as np
# Check for inf values in df_merged
has_inf = result_df.isin([np.inf, -np.inf]).any().any()

print(f"DataFrame contains inf values: {has_inf}")

DataFrame contains inf values: False


In [7]:
result_df.to_csv("../../data/defending.csv",index=True)