# Loading Data

In [1]:
import pandas as pd
from pathlib import Path

# Root directory: two levels above the current working directory.
PROJECT_ROOT_DIR = Path.cwd().parent.parent

# One CSV per league exists for each table; the path lists are derived from this.
LEAGUES = ["bundesliga", "la_liga", "ligue_1", "premier_league", "serie_a"]

# Paths are stored with a leading slash and are appended to PROJECT_ROOT_DIR
# when the files are read.
file_paths_defending = [
    f"/data/processed/{league}/defending.csv" for league in LEAGUES
]
file_paths_standard_stats = [
    f"/data/standard_stats_{league}.csv" for league in LEAGUES
]

In [2]:
def _load_player_table(relative_paths):
    """Read several CSV files and stack them into one frame indexed by player.

    Parameters
    ----------
    relative_paths : list of str
        File locations relative to ``PROJECT_ROOT_DIR``; a leading slash is
        tolerated and stripped before joining.

    Returns
    -------
    pandas.DataFrame
        All rows of all files, in list order, with the ``player`` column as index.
    """
    # Read everything first and concatenate once: growing a frame inside the
    # loop re-copies all previous rows on every iteration, and concatenating
    # onto an empty DataFrame is deprecated in recent pandas releases.
    frames = [
        pd.read_csv(PROJECT_ROOT_DIR / relative_path.lstrip("/"))
        for relative_path in relative_paths
    ]
    return pd.concat(frames, ignore_index=True).set_index("player")


defending = _load_player_table(file_paths_defending)
standard_stats = _load_player_table(file_paths_standard_stats)

standard_stats

Unnamed: 0_level_0,country,team,position,match_played,minutes_played,subbed_in,subbed_out,unique_positions_played,positions_played,full_match_equivalents,league
player,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Aaron Hunt,Germany,Hamburger SV,Midfielder,23.0,1742.0,3.0,8.0,"['Right Wing', 'Right Defensive Midfield', 'Le...","['Right Center Forward', 'Left Wing', 'Center ...",19.355556,bundesliga
Adam Hloušek,Czech Republic,VfB Stuttgart,Defender,7.0,425.0,3.0,0.0,"['Left Center Back', 'Right Center Midfield', ...","['Left Center Back', 'Left Wing', 'Left Center...",4.722222,bundesliga
Adelino André Vieira Freitas,Portugal,Wolfsburg,Forward,26.0,1908.0,4.0,8.0,"['Right Center Midfield', 'Right Back', 'Left ...","['Right Midfield', 'Right Back', 'Right Midfie...",21.200000,bundesliga
Admir Mehmedi,Switzerland,Bayer Leverkusen,Midfielder,28.0,1373.0,14.0,7.0,"['Left Midfield', 'Right Wing', 'Right Defensi...","['Right Center Forward', 'Left Midfield', 'Rig...",15.255556,bundesliga
Adnan Januzaj,Belgium,Borussia Dortmund,Forward,6.0,176.0,6.0,0.0,"['Center Forward', 'Right Midfield', 'Left Win...","['Right Wing', 'Right Wing', 'Right Wing', 'Le...",1.955556,bundesliga
...,...,...,...,...,...,...,...,...,...,...,...
Ľubomír Tupta,Slovakia,Hellas Verona,,0.0,0.0,0.0,0.0,[],[],0.000000,serie_a
Łukasz Skorupski,Poland,Empoli,Goalkeeper,31.0,2790.0,0.0,0.0,['Goalkeeper'],"['Goalkeeper', 'Goalkeeper', 'Goalkeeper', 'Go...",31.000000,serie_a
Ştefan Daniel Radu,Romania,Lazio,Defender,13.0,1097.0,0.0,3.0,"['Left Center Back', 'Right Center Forward', '...","['Left Center Back', 'Right Center Forward', '...",12.188889,serie_a
Šime Vrsaljko,Croatia,Sassuolo,Defender,35.0,3065.0,0.0,3.0,"['Center Attacking Midfield', 'Right Back', 'R...","['Right Back', 'Right Back', 'Right Wing Back'...",34.055556,serie_a


In [3]:
# Number of rows per position label in the raw standard stats.
standard_stats.groupby("position").size()

position
Defender      876
Forward       736
Goalkeeper    220
Midfielder    860
dtype: int64

In [4]:
# Collapse combined position labels into a single position. The pairs are
# applied strictly in this order, because every replacement works on the
# result of the previous one (plain substring match, case-insensitive).
position_replacements = [
    ("Forward, Defender", "Forward"),
    ("Forward, Midfielder, Defender", "Forward"),
    ("Midfielder, Defender", "Midfielder"),
    ("Midfielder, Forward", "Midfielder"),
    ("Forward, Midfielder", "Forward"),
    ("Defender, Forward", "Defender"),
]
for combined_label, single_label in position_replacements:
    standard_stats['position'] = standard_stats['position'].str.replace(
        combined_label, single_label, case=False, regex=False
    )


In [None]:
# Re-count the position labels to see the effect of the replacements above.
standard_stats.groupby("position").size()

position
Defender      876
Forward       736
Goalkeeper    220
Midfielder    860
dtype: int64

: 

In [6]:
# merge both df's
df = pd.merge(left=standard_stats[["position","match_played","minutes_played"]],
         right=defending,
         left_index=True, 
         right_index=True)

# keep in defedning only players with min. matches played
requirement_mask = df["match_played"] > 10
defending_columns = list(defending.columns) + ["position"]
defending = df.loc[requirement_mask, defending_columns]
df = df.loc[requirement_mask, :]

defending

Unnamed: 0_level_0,ball_recovery_total,ball_recovery_successful,ball_recovery_offensive_total,ball_recovery_offensive_successful,ball_recovery_failed,pressure_on_opponent,pressure_on_opponent_defending_third,pressure_on_opponent_middle_third,pressure_on_opponent_attacking_third,counterpressing_total,...,fouls_yellow_card_per_match,fouls_second_yellow_card_per_match,fouls_red_card_per_match,bad_behaviour_total_per_match,bad_behaviour_yellow_card_per_match,bad_behaviour_second_yellow_card_per_match,bad_behaviour_red_card_per_match,total_yellow_per_match,total_red_per_match,position
player,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Aaron Hunt,108.0,103.0,0,0,5,359.0,65.0,190.0,104.0,107.0,...,0.011,0.000,0.000,0.011,0.011,0.0,0.0,0.022,0.000,Midfielder
Adelino André Vieira Freitas,133.0,130.0,0,0,3,296.0,106.0,149.0,41.0,95.0,...,0.033,0.000,0.000,0.000,0.000,0.0,0.0,0.033,0.000,Forward
Admir Mehmedi,76.0,73.0,1,1,3,315.0,51.0,159.0,105.0,100.0,...,0.011,0.000,0.000,0.011,0.011,0.0,0.0,0.022,0.000,Midfielder
Albin Ekdal,69.0,58.0,0,0,11,230.0,83.0,122.0,25.0,73.0,...,0.044,0.000,0.000,0.000,0.000,0.0,0.0,0.044,0.000,Midfielder
Alejandro Gálvez Jimena,57.0,52.0,0,0,5,225.0,134.0,85.0,6.0,67.0,...,0.033,0.000,0.000,0.000,0.000,0.0,0.0,0.033,0.000,Defender
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Álvaro Borja Morata Martín,62.0,57.0,0,0,5,346.0,36.0,188.0,122.0,105.0,...,0.022,0.000,0.000,0.044,0.044,0.0,0.0,0.067,0.000,Forward
Édgar Osvaldo Barreto Cáceres,91.0,86.0,1,1,5,649.0,255.0,311.0,83.0,195.0,...,0.033,0.000,0.000,0.000,0.000,0.0,0.0,0.033,0.000,Midfielder
Łukasz Skorupski,148.0,148.0,0,0,0,3.0,3.0,0.0,0.0,0.0,...,0.000,0.000,0.000,0.022,0.022,0.0,0.0,0.022,0.000,Goalkeeper
Ştefan Daniel Radu,71.0,66.0,0,0,5,141.0,64.0,63.0,14.0,41.0,...,0.022,0.000,0.011,0.011,0.011,0.0,0.0,0.033,0.011,Defender


In [7]:
# Rows per position that remain after the minimum-matches filter.
defending.groupby("position").size()

position
Defender      654
Forward       549
Goalkeeper    117
Midfielder    645
dtype: int64

In [8]:
target_column = "position"
# Columns that are never used as absolute-value features.
non_feature_columns = {target_column, "match_played", "minutes_played"}

# Sort every column of `df` into one of three feature representations by suffix:
# "_%" -> relative, "_per_match" -> per match, anything else -> absolute.
columns_relative_values = []
columns_per_match = []
columns_absolute_values = []
for col in df.columns:
    if col == target_column:
        continue
    if col.endswith("_%"):
        columns_relative_values.append(col)
    elif col.endswith("_per_match"):
        columns_per_match.append(col)
    elif col not in non_feature_columns:
        columns_absolute_values.append(col)


# 1. Select k features

In [9]:
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression


# One entry per feature representation: a label for printing and the list of
# columns that belong to it. Keys are "conf_1", "conf_2", "conf_3" in this order.
config = {
    f"conf_{number}": {"columns_value_type": value_type, "columns": columns}
    for number, (value_type, columns) in enumerate(
        [
            ("absolute_values", columns_absolute_values),
            ("relative_values", columns_relative_values),
            ("per_match", columns_per_match),
        ],
        start=1,
    )
}

def feature_selection(X, y, model, scale_data=True, max_features=20, threshold="mean"):
    """Fit ``model`` on ``X`` and return the column names it weights most strongly.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; its column labels are what is returned.
    y : array-like
        Target labels handed to ``model.fit``.
    model : estimator
        Estimator that exposes ``coef_`` after fitting. It is fitted in place.
    scale_data : bool, default True
        Standardise ``X`` with ``StandardScaler`` before fitting.
    max_features : int, default 20
        Upper bound on the number of selected features (passed to ``SelectFromModel``).
    threshold : str or float, default "mean"
        Importance threshold (passed to ``SelectFromModel``).

    Returns
    -------
    pandas.Index
        Labels of the selected columns of ``X``.
    """
    feature_names = X.columns
    if scale_data:
        X = StandardScaler().fit_transform(X)

    model.fit(X, y)

    # The estimator is already fitted above and the selector itself is never
    # fitted, so it must be declared as prefit; otherwise get_support() on an
    # unfitted selector is not supported across scikit-learn versions.
    selector = SelectFromModel(
        model,
        prefit=True,
        max_features=max_features,
        threshold=threshold,
        importance_getter="coef_",
    )

    return feature_names[selector.get_support()]

def train_evaluate_model(X, y, model, scale=True, test_size=0.2, random_state=42):
    """Hold out a test split, fit ``model`` on the rest and score it on the hold-out.

    Parameters
    ----------
    X, y : features and target labels.
    model : estimator with ``fit`` and ``predict``; it is fitted in place.
    scale : bool, default True
        Standardise the features, with the scaler fitted on the training part only.
    test_size, random_state : forwarded to ``train_test_split``.

    Returns
    -------
    dict
        ``classification_report`` of the test-set predictions as a nested dict.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Optional scaling: statistics come from the training data only.
    if scale:
        scaler = StandardScaler().fit(X_train)
        X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    return classification_report(y_test, predictions, output_dict=True)

# Compare the three feature representations: select features with an
# L1-penalised logistic regression, then score the same model on those features.
# random_state is fixed because the liblinear solver shuffles the data, so
# unseeded runs can differ slightly.
model = LogisticRegression(penalty="l1", solver="liblinear", C=1, random_state=42)
results = dict()

for c in config:
    # Single quotes inside the f-string keep this valid on Python < 3.12.
    print(f"Feature representation: {config[c]['columns_value_type']}")
    X = defending[config[c]["columns"]]
    y = defending["position"]

    # NOTE(review): features are selected on the full data set, i.e. including
    # the rows that train_evaluate_model later holds out for testing, so the
    # reported scores may be optimistic.
    selected_columns = feature_selection(X, y, model, scale_data=True)
    X_selected = defending[selected_columns]

    prediction_results = train_evaluate_model(X_selected, y, model, scale=True)
    results[c] = {
        "f1_score": prediction_results["macro avg"]["f1-score"],
        "selected_columns": selected_columns
    }

# Pick the representation with the highest macro-averaged F1 score.
best_config = max(results, key=lambda c: results[c]["f1_score"])
print(f"Best performing feature representation: {config[best_config]['columns_value_type']} | macro f1-score: {results[best_config]['f1_score']:.2f}")
print(results[best_config]['selected_columns'])

Feature representation: absolute_values
Feature representation: relative_values
Feature representation: per_match
Best performing feature representation: absolute_values | macro f1-score: 0.91
Index(['ball_recovery_successful', 'ball_recovery_failed',
       'pressure_on_opponent', 'pressure_on_opponent_defending_third',
       'pressure_on_opponent_middle_third',
       'pressure_on_opponent_attacking_third', 'counterpressing_total',
       'counterpressing_opponent_middle_third', 'shield_total', 'block_total',
       'block_during_counterpress', 'clearance_total', 'tackling_won',
       'tackling_in_defending_third'],
      dtype='object')


# TODO: Logistic Regression penalizes too strongly
Check with deep research whether linear regression or lasso is appropriate for solving a multiclass task with label encoding.

In [10]:
# Show the macro F1 score and the selected columns for every configuration.
results

{'conf_1': {'f1_score': 0.9112913804422667,
  'selected_columns': Index(['ball_recovery_successful', 'ball_recovery_failed',
         'pressure_on_opponent', 'pressure_on_opponent_defending_third',
         'pressure_on_opponent_middle_third',
         'pressure_on_opponent_attacking_third', 'counterpressing_total',
         'counterpressing_opponent_middle_third', 'shield_total', 'block_total',
         'block_during_counterpress', 'clearance_total', 'tackling_won',
         'tackling_in_defending_third'],
        dtype='object')},
 'conf_2': {'f1_score': 0.8460560720555795,
  'selected_columns': Index(['pressure_on_opponent_defending_third_%',
         'pressure_on_opponent_middle_third_%',
         'pressure_on_opponent_attacking_third_%',
         'tackling_in_defending_third_%', 'tackling_in_middle_third_%'],
        dtype='object')},
 'conf_3': {'f1_score': 0.9112913804422667,
  'selected_columns': Index(['ball_recovery_successful_per_match', 'ball_recovery_failed_per_match',
   

Feature selection

In [15]:
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression


def feature_selection(X, y, model, scale_data=True, max_features=20, threshold="mean"):
    """Return the columns of ``X`` that ``model`` weights most strongly.

    ``model`` is fitted in place on all rows of ``X`` (optionally standardised
    first) and its ``coef_`` values are used as feature importances.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; its column labels are what is returned.
    y : array-like
        Target labels handed to ``model.fit``.
    model : estimator
        Estimator that exposes ``coef_`` after fitting.
    scale_data : bool, default True
        Standardise ``X`` with ``StandardScaler`` before fitting.
    max_features : int, default 20
        Upper bound on the number of selected features (passed to ``SelectFromModel``).
    threshold : str or float, default "mean"
        Importance threshold (passed to ``SelectFromModel``).

    Returns
    -------
    pandas.Index
        Labels of the selected columns of ``X``.
    """
    feature_names = X.columns
    if scale_data:
        X = StandardScaler().fit_transform(X)

    model.fit(X, y)

    # prefit=True tells the selector to use the estimator fitted above; the
    # selector itself is never fitted, and calling get_support() on an unfitted,
    # non-prefit selector is not supported across scikit-learn versions.
    selector = SelectFromModel(
        model,
        prefit=True,
        max_features=max_features,
        threshold=threshold,
        importance_getter="coef_",
    )

    return feature_names[selector.get_support()]

# Select among the absolute-value features with an L2-penalised logistic regression.
model = LogisticRegression(penalty="l2", solver="liblinear", C=1)
X, y = defending[columns_absolute_values], defending["position"]
selected_columns = feature_selection(X, y, model, scale_data=True)
selected_columns

Index(['ball_recovery_total', 'ball_recovery_successful',
       'ball_recovery_failed', 'pressure_on_opponent',
       'pressure_on_opponent_defending_third',
       'pressure_on_opponent_middle_third',
       'pressure_on_opponent_attacking_third', 'counterpressing_total',
       'counterpressing_opponent_middle_third',
       'counterpressing_attacking_third', 'shield_total', 'block_total',
       'block_during_counterpress', 'clearance_total', 'tackling',
       'tackling_won', 'tackling_in_defending_third'],
      dtype='object')

In [252]:
# Inspect the absolute-value feature columns of the filtered defending data.
defending[columns_absolute_values]

Unnamed: 0_level_0,ball_recovery_total,ball_recovery_successful,ball_recovery_offensive_total,ball_recovery_offensive_successful,ball_recovery_failed,pressure_on_opponent,pressure_on_opponent_defending_third,pressure_on_opponent_middle_third,pressure_on_opponent_attacking_third,counterpressing_total,...,fouls_wins_a_penalty,fouls_yellow_card,fouls_second_yellow_card,fouls_red_card,bad_behaviour_total,bad_behaviour_yellow_card,bad_behaviour_second_yellow_card,bad_behaviour_red_card,total_yellow,total_red
player,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Aaron Hunt,108.0,103.0,0,0,5,359.0,65.0,190.0,104.0,107.0,...,0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,2.0,0.0
Adelino André Vieira Freitas,133.0,130.0,0,0,3,296.0,106.0,149.0,41.0,95.0,...,0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0
Admir Mehmedi,76.0,73.0,1,1,3,315.0,51.0,159.0,105.0,100.0,...,0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,2.0,0.0
Albin Ekdal,69.0,58.0,0,0,11,230.0,83.0,122.0,25.0,73.0,...,0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0
Alejandro Gálvez Jimena,57.0,52.0,0,0,5,225.0,134.0,85.0,6.0,67.0,...,0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Álvaro Borja Morata Martín,62.0,57.0,0,0,5,346.0,36.0,188.0,122.0,105.0,...,0,2.0,0.0,0.0,4.0,4.0,0.0,0.0,6.0,0.0
Édgar Osvaldo Barreto Cáceres,91.0,86.0,1,1,5,649.0,255.0,311.0,83.0,195.0,...,0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0
Łukasz Skorupski,148.0,148.0,0,0,0,3.0,3.0,0.0,0.0,0.0,...,0,0.0,0.0,0.0,2.0,2.0,0.0,0.0,2.0,0.0
Ştefan Daniel Radu,71.0,66.0,0,0,5,141.0,64.0,63.0,14.0,41.0,...,0,2.0,0.0,1.0,1.0,1.0,0.0,0.0,3.0,1.0


# 2. Evaluate

In [295]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression


def train_evaluate_model(X, y, model, scale=True, test_size=0.2, random_state=42):
    """Fit ``model`` on a training split and report its performance on the test split.

    Parameters
    ----------
    X, y : features and target labels.
    model : estimator with ``fit`` and ``predict``; it is fitted in place.
    scale : bool, default True
        Standardise the features, with the scaler fitted on the training part only.
    test_size, random_state : forwarded to ``train_test_split``.

    Returns
    -------
    str
        Text ``classification_report`` for the test-set predictions.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Optional scaling: statistics come from the training data only.
    if scale:
        scaler = StandardScaler().fit(X_train)
        X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    return classification_report(y_test, predictions)

# Score an L1-penalised logistic regression on the previously selected columns.
X, y = defending[selected_columns], defending["position"]
model = LogisticRegression(penalty="l1", solver="liblinear", C=1, random_state=42)
results = train_evaluate_model(X, y, model, scale=True)
print(results)

              precision    recall  f1-score   support

    Defender       0.96      0.96      0.96       128
     Forward       0.80      0.90      0.85       112
  Goalkeeper       1.00      1.00      1.00        18
  Midfielder       0.88      0.79      0.84       135

    accuracy                           0.89       393
   macro avg       0.91      0.91      0.91       393
weighted avg       0.89      0.89      0.89       393



In [297]:
# check penalized parameters
import numpy as np
coeffcients = model.coef_
# (count of coefficients with magnitude below 0.5, largest value, smallest value)
near_zero = np.abs(coeffcients) < 0.5
np.sum(near_zero), coeffcients.max(), coeffcients.min()

(26, 4.14038781020555, -7.192375931473155)

In [87]:
# Evaluate a random forest on the current X and y, without feature scaling.
model = RandomForestClassifier(n_estimators=100, random_state=42)
results = train_evaluate_model(X, y, model, scale=False)
print(results)


              precision    recall  f1-score   support

    Defender       0.92      0.95      0.93       128
     Forward       0.82      0.87      0.84       112
  Goalkeeper       1.00      1.00      1.00        18
  Midfielder       0.86      0.79      0.83       135

    accuracy                           0.87       393
   macro avg       0.90      0.90      0.90       393
weighted avg       0.87      0.87      0.87       393

