# Analysis of extended player data for the 2023-2024 season

In [3]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

In [6]:
# Load the raw 2023-24 player season statistics from the raw-data folder.
raw_stats_path = '../../data/raw/player_season_stats_23-24.csv'
data = pd.read_csv(raw_stats_path)

In [7]:
# Number of columns in the raw table (second entry of the frame's shape).
data.shape[1]

178

In [8]:
# List every column label of the raw table to choose which ones to keep.
list(data.columns)

['league',
 'season',
 'team',
 'player',
 'nation',
 'pos',
 'age',
 'born',
 'MP Playing Time',
 'Starts Playing Time',
 'Min Playing Time',
 '90s Playing Time',
 'Gls Performance',
 'Ast Performance',
 'G+A Performance',
 'G-PK Performance',
 'PK Performance',
 'PKatt Performance',
 'CrdY Performance',
 'CrdR Performance',
 'xG Expected',
 'npxG Expected',
 'xAG Expected',
 'npxG+xAG Expected',
 'PrgC Progression',
 'PrgP Progression',
 'PrgR Progression',
 'Gls Per 90 Minutes',
 'Ast Per 90 Minutes',
 'G+A Per 90 Minutes',
 'G-PK Per 90 Minutes',
 'G+A-PK Per 90 Minutes',
 'xG Per 90 Minutes',
 'xAG Per 90 Minutes',
 'xG+xAG Per 90 Minutes',
 'npxG Per 90 Minutes',
 'npxG+xAG Per 90 Minutes',
 '90s',
 'G-xG Expected',
 'np:G-xG Expected',
 'npxG/Sh Expected',
 'Dist Standard',
 'FK Standard',
 'G/Sh Standard',
 'G/SoT Standard',
 'Gls Standard',
 'PK Standard',
 'PKatt Standard',
 'Sh Standard',
 'Sh/90 Standard',
 'SoT Standard',
 'SoT% Standard',
 'SoT/90 Standard',
 '1/3',
 'Ast

In [9]:
# Columns kept for the analysis, organised by theme. The dict's insertion order
# and the order inside each group determine the final column order.
_column_groups = {
    # Metadata (needed for identification)
    'metadata': [
        'league', 'season', 'team', 'player', 'nation', 'pos', 'age',
        'MP Playing Time', 'Min Playing Time', '90s Playing Time',
    ],

    # Attack generation (goals & assists)
    'attack_generation': [
        'npxG Per 90 Minutes',    # Non-penalty xG: the gold standard for scoring threat
        'G-PK Per 90 Minutes',    # Actual non-penalty goal output
        'xAG Per 90 Minutes',     # Quality of expected assists
        'xG+xAG Per 90 Minutes',  # Total offensive production
    ],

    # Shooting style
    'shooting_style': [
        'Sh/90 Standard',         # Shooting volume
        'SoT% Standard',          # Accuracy (shots on target %)
        'Dist Standard',          # Average distance (poachers vs. range shooters)
        'G/Sh Standard',          # Efficiency (conversion rate)
    ],

    # Playmaking & passing
    'playmaking_passing': [
        'KP',                     # Key passes (volume of chances created)
        'PrgP',                   # Progressive passes (verticality)
        'PPA',                    # Passes into the 18-yard box
        'CrsPA',                  # Crosses into the 18-yard box
        '1/3',                    # Passes into the final third
        'Cmp% Total',             # General retention / safety
        'Cmp% Long',              # Long passing quality
        'Att Long',               # Long passing intent (style indicator)
    ],

    # Pass types (tactical role)
    'pass_types': [
        'TB Pass Types',          # Through balls (playmaker indicator)
        'Sw Pass Types',          # Switches of play (range / distribution)
        'Crs Pass Types',         # Crosses (winger style)
    ],

    # Dribbling & ball carrying
    'dribbling_carrying': [
        'PrgC Carries',           # Progressive carries (driving forward)
        'CPA Carries',            # Carries into the penalty area
        'Succ Take-Ons',          # Completed dribbles
        'Succ% Take-Ons',         # Dribble efficiency
        'Dis Carries',            # Dispossessed (tackled by opponent)
        'Mis Carries',            # Miscontrols (unforced errors / clumsiness)
        'PrgR Receiving',         # Progressive passes received (outlet / target capability)
    ],

    # Defense & pressing (zone definition)
    'defense_pressing': [
        'TklW Tackles',           # Total tackles won
        'Att 3rd Tackles',        # High pressing (forwards / high midfielders)
        'Mid 3rd Tackles',        # Midfield engagement
        'Def 3rd Tackles',        # Deep defending
        'Int',                    # Interceptions (game reading)
        'Sh Blocks',              # Shots blocked (protective defending)
        'Recov Performance',      # Ball recoveries (hustle / work rate)
    ],

    # Aerial & physical
    'aerial_physical': [
        'Won Aerial Duels',       # Volume of aerials won
        'Won% Aerial Duels',      # Aerial dominance
        'Fls Performance',        # Fouls committed (aggression)
        'Fld Performance',        # Fouls drawn
    ],

    # Heatmap proxies (positioning - vital for clustering)
    'touch_zones': [
        'Att Pen Touches',        # Touches in opponent's box
        'Att 3rd Touches',        # Touches in offensive zone
        'Mid 3rd Touches',        # Touches in midfield
        'Def 3rd Touches',        # Touches in build-up zone
        'Def Pen Touches',        # Touches in own box
    ],

    # Overall activity
    'overall_activity': [
        'Live Touches',           # Overall involvement
        'Cmp Total',              # Total passes completed
    ],
}

# Flatten the themed groups into the single ordered list of column labels.
selected_columns = [
    column
    for group in _column_groups.values()
    for column in group
]

In [10]:
# Keep only the chosen columns; .copy() makes the subset an independent frame.
data_relevant = data.loc[:, selected_columns].copy()

In [11]:
# Preview the reduced table (pandas shows a truncated view of large frames).
data_relevant

Unnamed: 0,league,season,team,player,nation,pos,age,MP Playing Time,Min Playing Time,90s Playing Time,...,Won% Aerial Duels,Fls Performance,Fld Performance,Att Pen Touches,Att 3rd Touches,Mid 3rd Touches,Def 3rd Touches,Def Pen Touches,Live Touches,Cmp Total
0,ENG-Premier League,2324,Arsenal,Aaron Ramsdale,ENG,GK,25,6,540,6.0,...,,0,1,0,0,11,186,140,197,131
1,ENG-Premier League,2324,Arsenal,Ben White,ENG,DF,25,37,2988,33.2,...,62.1,19,12,64,676,1165,633,120,2456,1828
2,ENG-Premier League,2324,Arsenal,Bukayo Saka,ENG,FW,21,35,2919,32.4,...,39.0,41,69,271,1310,474,182,37,1946,1143
3,ENG-Premier League,2324,Arsenal,Cédric Soares,POR,DF,31,3,62,0.7,...,0.0,0,1,0,12,47,10,0,69,60
4,ENG-Premier League,2324,Arsenal,David Raya,ESP,GK,27,32,2880,32.0,...,83.3,0,4,0,0,39,1060,764,1099,800
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2847,ITA-Serie A,2324,Udinese,Sandi Lovrić,SVN,MF,25,29,1987,22.1,...,30.8,25,23,35,347,444,169,31,939,543
2848,ITA-Serie A,2324,Udinese,Simone Pafundi,ITA,MF,17,1,8,0.1,...,100.0,1,0,0,2,4,3,0,9,5
2849,ITA-Serie A,2324,Udinese,Thomas Kristensen,DEN,DF,21,26,2096,23.3,...,59.2,26,7,22,57,499,583,164,1124,662
2850,ITA-Serie A,2324,Udinese,Vivaldo Semedo,POR,FW,18,1,12,0.1,...,,1,2,1,1,2,0,0,3,0


In [12]:
# Spot-check a single player: the result has one row per club he appeared for.
data_relevant.loc[data_relevant['player'].eq('Eric García')]

Unnamed: 0,league,season,team,player,nation,pos,age,MP Playing Time,Min Playing Time,90s Playing Time,...,Won% Aerial Duels,Fls Performance,Fld Performance,Att Pen Touches,Att 3rd Touches,Mid 3rd Touches,Def 3rd Touches,Def Pen Touches,Live Touches,Cmp Total
703,ESP-La Liga,2324,Barcelona,Eric García,ESP,DF,22,2,57,0.6,...,100.0,1,0,1,2,33,18,2,53,46
869,ESP-La Liga,2324,Girona,Eric García,ESP,DF,22,30,2637,29.3,...,57.0,24,26,23,227,1378,972,180,2567,2064


In [14]:
# Persist the reduced table to the interim data folder; index=False keeps the
# row index out of the written file.
interim_csv_path = '../../data/interim/player_season_stats_23-24_relevant.csv'
data_relevant.to_csv(interim_csv_path, index=False)

In [None]:
# # 2. Filter out Goalkeepers (GKs) and players with low playing time
# df_normalized = data_relevant[data_relevant['pos'] != 'GK']
# df_normalized = df_normalized[df_normalized['Min Playing Time'] > 500]

# # 3. Handle missing values (NaNs)
# df_normalized = df_normalized.fillna(0)

# # 4. Separate Metadata from Numerical Data
# metadata = df_normalized[['player', 'team', 'pos', 'league']]
# features = df_normalized.drop(columns=['league', 'season', 'team', 'player', 
#                             'nation', 'pos', 'age', 'MP Playing Time', 
#                             'Min Playing Time', '90s Playing Time'])

# # 5. Standardize (Z-Score)
# scaler = StandardScaler()
# features_scaled = scaler.fit_transform(features)

# # NOW 'features_scaled' is what you pass to your SimHash function