# Import

In [1]:
import os
import json
import numpy as np
import pandas as pd
from pathlib import Path


# === Constants ===
# Project root, taken as three directory levels above the kernel's current
# working directory. All data and experiment paths below are built from it.
# NOTE(review): this depends on where the kernel was started - confirm the
# notebook is always run from its own folder.
PROJECT_ROOT_DIR = Path.cwd().parent.parent.parent

def get_data(target:str = "position_level_0", match_played=2, minutes_played=90, project_root=None):
    """Merge all feature dimensions, filter players and build one frame per feature config.

    The standard-stats table is left-joined with every dimension table on
    ``player_id``, rows below the playing-time thresholds are dropped, and the
    result is projected onto the columns selected for ``conf_1``, ``conf_2``
    and ``conf_3`` in the per-dimension feature-selection JSON files.

    Parameters
    ----------
    target : str
        Name of the target; selects the ``feature_selection_{target}`` folder
        the selected-column JSON files are read from.
    match_played : int or float
        Minimum value of the ``match_played`` column (inclusive) a row must have.
    minutes_played : int or float
        Minimum value of the ``minutes_played`` column (inclusive) a row must have.
    project_root : str or path-like, optional
        Root folder containing ``data/new_approach`` and ``experiment_results``.
        Defaults to the module-level ``PROJECT_ROOT_DIR``.

    Returns
    -------
    tuple of three pandas.DataFrame
        Independent copies of the filtered frame restricted to the columns of
        config 1, 2 and 3. Column order is deterministic: the four identifier
        columns first, then the selected features in the order they are
        listed (dimension by dimension), with repeated names kept once.

    Raises
    ------
    KeyError
        If a JSON file lacks an expected key, or a selected column is not
        present in the merged frame.
    """
    root = Path(PROJECT_ROOT_DIR if project_root is None else project_root)
    data_dir = root / "data" / "new_approach"
    features_dir = root / "experiment_results" / f"feature_selection_{target}"

    dimensions = ["defending", "possession", "passing", "shooting", "goal_keeping"]
    id_columns = ["player_id", "position_level_0", "position_level_1", "position_level_2"]
    config_keys = ("conf_1", "conf_2", "conf_3")

    df_standard_stats = pd.read_csv(
        data_dir / "standard_stats_all_final.csv", dtype={"player_id": "int32"}
    )

    # Merge all dimensions onto the identifier / playing-time columns.
    df = df_standard_stats[id_columns + ["match_played", "minutes_played"]].copy()
    for dim in dimensions:
        df_dimension = pd.read_csv(data_dir / f"{dim}_ex.csv", dtype={"player_id": "int32"})
        print(f"Dim {dim} shape{df_dimension.shape}")
        print("Columns:", df_dimension.columns.tolist())
        # The "player" column is dropped so only player_id and the features are joined.
        # NOTE(review): feature names shared by two dimension files would be
        # suffixed _x/_y by pandas here - confirm the files do not overlap.
        df = pd.merge(
            left=df,
            right=df_dimension.loc[:, df_dimension.columns != "player"],
            on="player_id",
            how="left",
        )
    print(f"Merge shape: {df.shape}")

    # Filter rows by the playing-time thresholds (both bounds inclusive).
    print(f"Apply filters: match_played={match_played} , minutes_played={minutes_played}")
    keep_rows = (df["match_played"] >= match_played) & (df["minutes_played"] >= minutes_played)
    df_filtered = df.loc[keep_rows, :].copy()

    # Collect the selected feature names per config, starting from the id columns.
    selected = {key: list(id_columns) for key in config_keys}
    for dim in dimensions:
        path = features_dir / f"automated_{dim}.json"
        print(f"Load features from: {path}")
        with open(path, "r", encoding="utf-8") as f:
            data = json.load(f)
        print("---")
        print(dim)
        for key in config_keys:
            selected[key].extend(data[key]["selected_columns"])

    frames = []
    for number, key in enumerate(config_keys, start=1):
        # dict.fromkeys removes duplicates while keeping first-seen order; a
        # set would reshuffle the columns differently on every kernel start.
        columns = list(dict.fromkeys(selected[key]))
        missing = [name for name in columns if name not in df_filtered.columns]
        if missing:
            raise KeyError(f"Config {number}: selected columns not found in merged data: {missing}")
        print(f"Config {number} columns:", len(columns))
        frames.append(df_filtered.loc[:, columns].copy())

    return tuple(frames)



# Engineered Features
Count the engineered features in each dimension file and in total (excluding the `player_id` and `player` identifier columns).

In [2]:
# Tally the feature columns of every dimension file; player_id and player are
# identifiers and are not counted.
dimensions = ["defending","possession", "passing", "shooting", "goal_keeping"]
total_features = 0
for dim in dimensions:
    df_dimension = pd.read_csv(f"{PROJECT_ROOT_DIR}/data/new_approach/{dim}_ex.csv",dtype={"player_id":"int32"})
    n_dim_features = sum(1 for c in df_dimension.columns if c not in ["player_id", "player"])
    total_features += n_dim_features
    print(f"Dim {dim} features: {n_dim_features}")
print("Total features:", total_features)
            

Dim defending features: 119
Dim possession features: 254
Dim passing features: 338
Dim shooting features: 214
Dim goal_keeping features: 94
Total features: 1019


# Aggregated Player Vector Dataframe
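Build the merged per-player dataframe (standard stats plus all five feature dimensions) and inspect its first rows and shape.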

In [3]:
# Standard stats indexed by player_id form the base of the aggregated frame.
df = pd.read_csv(
    f"{PROJECT_ROOT_DIR}/data/new_approach/standard_stats_all_final.csv",
    dtype={"player_id": "int32"},
).set_index("player_id")

# Left-join each dimension's features on the player_id index, so every player
# in the standard stats is kept even when a dimension file has no row for them.
for dim in ["defending","possession", "passing", "shooting", "goal_keeping"]:
    df_dimension = pd.read_csv(f"{PROJECT_ROOT_DIR}/data/new_approach/{dim}_ex.csv",dtype={"player_id":"int32"})
    df_dimension = df_dimension.set_index("player_id")
    df = pd.merge(
            left=df,
            # "player" is dropped from the dimension table before the join
            right=df_dimension.loc[:,df_dimension.columns != "player"],
            left_index=True,
            right_index=True,
            how="left"
        )
df.head(), df.shape

(                       player   country           team  match_played  \
 player_id                                                              
 2936       Christophe Kerbrat    France       Guingamp          30.0   
 2943              Lucas Deaux    France         Nantes          16.0   
 2944         Benjamin Corgnet    France  Saint-Étienne           9.0   
 2946        Frédéric Guilbert    France       Bordeaux          30.0   
 2947            Anthony Lopes  Portugal           Lyon          37.0   
 
            minutes_played  subbed_in  subbed_out  \
 player_id                                          
 2936               2613.0        1.0         1.0   
 2943               1181.0        3.0         3.0   
 2944                467.0        4.0         4.0   
 2946               2525.0        2.0         2.0   
 2947               3330.0        0.0         0.0   
 
                                      unique_positions_played  \
 player_id                                       

Number of players with a missing `position_level_0` label:

In [4]:
# Count the rows whose position_level_0 value is missing (NaN).
df["position_level_0"].isna().sum()

429

Distribution of the number of matches played per player:

In [5]:
# Number of players per distinct match_played value, ordered by match count.
df["match_played"].value_counts().sort_index()

match_played
0.0     646
1.0      71
2.0     118
3.0      66
4.0      86
5.0      52
6.0      63
7.0      51
8.0      55
9.0      57
10.0     57
11.0     60
12.0     47
13.0     49
14.0     47
15.0     55
16.0     54
17.0     47
18.0     44
19.0     63
20.0     43
21.0     50
22.0     56
23.0     55
24.0     53
25.0     62
26.0     46
27.0     61
28.0     48
29.0     58
30.0     66
31.0     70
32.0     61
33.0     76
34.0     71
35.0     74
36.0     48
37.0     42
38.0     47
39.0     24
40.0     30
41.0     22
42.0     21
43.0     15
44.0     12
45.0     18
46.0     12
47.0      8
48.0      6
49.0      8
50.0      5
52.0      3
53.0      1
54.0      1
55.0      1
56.0      3
58.0      1
60.0      1
62.0      2
Name: count, dtype: int64

In [6]:
# Players with 0, 1 or 2 matches played, computed from the data rather than
# re-typed from the value_counts output (646 + 71 + 118), so the figure
# cannot go stale when the underlying CSV changes.
int(df["match_played"].isin([0, 1, 2]).sum())

835

In [7]:
# Number of players per distinct minutes_played value, ordered by minutes.
df["minutes_played"].value_counts().sort_index()

minutes_played
0.0       646
90.0       77
91.0        3
92.0       19
93.0       26
         ... 
3429.0      1
3431.0      1
3436.0      1
3453.0      1
3464.0      1
Name: count, Length: 1045, dtype: int64

In [8]:
# Players with exactly 0 or exactly 90 minutes played, computed from the data
# rather than re-typed from the value_counts output (646 + 77).
int(df["minutes_played"].isin([0, 90]).sum())


723

In [9]:
# Display the merged player frame (the notebook renders a shortened preview
# of the rows and columns).
df

Unnamed: 0_level_0,player,country,team,match_played,minutes_played,subbed_in,subbed_out,unique_positions_played,positions_played,full_match_equivalents,...,sweeper_in_defending_third_per_match,sweeper_in_middle_third_per_match,sweeper_collects_ball_per_match,sweeper_clears_ball_per_match,collecting_ball_total_per_match,collecting_ball_failed_per_match,collecting_ball_in_second_attempt_per_match,avg_distance_to_goal_per_match,avg_distance_to_goal_saved_shots_per_match,avg_distance_to_goal_goals_conceded_per_match
player_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2936,Christophe Kerbrat,France,Guingamp,30.0,2613.0,1.0,1.0,['Right Center Back'],"['Right Center Back', 'Right Center Back', 'Ri...",29.033333,...,0.000,0.0,0.000,0.000,0.0,0.000,0.000,0.00,0.000,0.000
2943,Lucas Deaux,France,Nantes,16.0,1181.0,3.0,3.0,"['Right Defensive Midfield', 'Right Wing', 'Ce...","['Right Defensive Midfield', 'Left Defensive M...",13.122222,...,0.000,0.0,0.000,0.000,0.0,0.000,0.000,0.00,0.000,0.000
2944,Benjamin Corgnet,France,Saint-Étienne,9.0,467.0,4.0,4.0,"['Center Attacking Midfield', 'Right Defensive...","['Center Attacking Midfield', 'Center Attackin...",5.188889,...,0.000,0.0,0.000,0.000,0.0,0.000,0.000,0.00,0.000,0.000
2946,Frédéric Guilbert,France,Bordeaux,30.0,2525.0,2.0,2.0,"['Right Back', 'Right Center Back']","['Right Center Back', 'Right Back', 'Right Cen...",28.055556,...,0.000,0.0,0.000,0.000,0.0,0.000,0.000,0.00,0.000,0.000
2947,Anthony Lopes,Portugal,Lyon,37.0,3330.0,0.0,0.0,['Goalkeeper'],"['Goalkeeper', 'Goalkeeper', 'Goalkeeper', 'Go...",37.000000,...,0.178,0.0,0.489,0.167,0.4,0.011,0.011,0.07,0.055,0.064
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
403760,Victor Paillon,France,Troyes,0.0,0.0,0.0,0.0,[],[],0.000000,...,0.000,0.0,0.000,0.000,0.0,0.000,0.000,0.00,0.000,0.000
404019,Douti Gbampok,France,Gazélec Ajaccio,0.0,0.0,0.0,0.0,[],[],0.000000,...,0.000,0.0,0.000,0.000,0.0,0.000,0.000,0.00,0.000,0.000
404310,Gueïda Fofana,France,Lyon,0.0,0.0,0.0,0.0,[],[],0.000000,...,0.000,0.0,0.000,0.000,0.0,0.000,0.000,0.00,0.000,0.000
404319,Aristote N'Dongala,"Congo, (Kinshasa)",Nantes,0.0,0.0,0.0,0.0,[],[],0.000000,...,0.000,0.0,0.000,0.000,0.0,0.000,0.000,0.00,0.000,0.000


In [10]:
# Column dtypes of the merged frame.
df.dtypes

player                                            object
country                                           object
team                                              object
match_played                                     float64
minutes_played                                   float64
                                                  ...   
collecting_ball_failed_per_match                 float64
collecting_ball_in_second_attempt_per_match      float64
avg_distance_to_goal_per_match                   float64
avg_distance_to_goal_saved_shots_per_match       float64
avg_distance_to_goal_goals_conceded_per_match    float64
Length: 1032, dtype: object

In [16]:
# Keep players with at least 2 matches and at least 90 minutes played, then
# count the remaining players per top-level position.
df_filtered = df.query("match_played >= 2 and minutes_played >= 90")
df_filtered.loc[:, "position_level_0"].value_counts()

position_level_0
Defender      826
Midfielder    795
Forward       547
Goalkeeper    184
Name: count, dtype: int64

# Multicollinearity