In [4]:
import pandas as pd
import numpy as np
import json 
import sys
import os
from tqdm import tqdm
from datetime import datetime
from statsbombpy import sb
from pathlib import Path
from collections import Counter

# Run the notebook from inside the notebooks folder.
# The parent of the current working directory is put on sys.path so that the
# project-local `dataloader` module can be imported below.
# The previous form, os.path.dirname(__name__), passed a module name where a
# path is expected; it only worked because dirname("__main__") is "" and the
# join therefore collapsed to "..". Path.cwd().parent states the same
# directory explicitly.
_parent_dir = str(Path.cwd().parent)
if _parent_dir not in sys.path:  # avoid stacking duplicates when the cell is re-run
    sys.path.append(_parent_dir)

from dataloader import Dataloader

# Project root is taken to be three levels above the current working directory.
PROJECT_ROOT_DIR = Path.cwd().parent.parent.parent
CONFIG_DIR = PROJECT_ROOT_DIR / "config"

# The JSON files are read as UTF-8 explicitly: open() otherwise falls back to
# the platform's locale encoding, which can mis-decode non-ASCII content.
with open(CONFIG_DIR / "position_mapping.json", "r", encoding="utf-8") as f:
    position_mapping = json.load(f)

# The keys of this mapping are the league identifiers iterated by the loading loop.
with open(CONFIG_DIR / "competition_config.json", "r", encoding="utf-8") as f:
    league_mapping = json.load(f)

# Load the "standard_stats" dimension of every configured league and stack the
# results into one frame with a fresh 0..n-1 index.
# The per-league frames are collected in a list and concatenated once;
# calling pd.concat inside the loop re-copies all previously loaded rows on
# every iteration.
league_frames = []
for league in league_mapping.keys():
    print(f"Loading data for {league}...")
    dataloader = Dataloader(league)
    dataloader.load_data()
    temp_df = dataloader.get_dimension(dimension="standard_stats", row_filter=False)
    league_frames.append(temp_df)

# pd.concat raises on an empty list, so fall back to an empty frame when no
# league is configured (this matches the previous result in that case).
df = pd.concat(league_frames, ignore_index=True) if league_frames else pd.DataFrame()

Loading data for bundesliga...
[2025-07-08 12:06:30] Loading data form local file system
Loading data for la_liga...
[2025-07-08 12:06:40] Loading data form local file system


  self.df = pd.read_csv(file_path,dtype=self.dtypes)


Loading data for ligue_1...
[2025-07-08 12:06:55] Loading data form local file system


  self.df = pd.read_csv(file_path,dtype=self.dtypes)


Loading data for premier_league...
[2025-07-08 12:07:09] Loading data form local file system


  self.df = pd.read_csv(file_path,dtype=self.dtypes)


Loading data for serie_a...
[2025-07-08 12:07:22] Loading data form local file system


  self.df = pd.read_csv(file_path,dtype=self.dtypes)


Loading data for ucl...
[2025-07-08 12:07:36] Loading data form local file system


In [14]:
# Show the combined frame built by the league-loading loop; as the last
# expression of the cell it is rendered with pandas' (truncated) rich display.
df

Unnamed: 0,player,match_id,minute,substitution_replacement,substitution_outcome
0,,3890561,0,,
1,,3890561,0,,
2,,3890561,0,,
3,,3890561,0,,
4,,3890561,45,,
...,...,...,...,...,...
6396041,Kléper Laveran Lima Ferreira,18243,63,,
6396042,Cristiano Ronaldo dos Santos Aveiro,18243,95,,
6396043,Juan Francisco Torres Belén,18243,95,,
6396044,Francisco Román Alarcón Suárez,18243,119,,


In [27]:
# === Get player information from match lineups ===
# For every distinct match_id in `df`, fetch the lineups and keep a fixed set
# of player columns from each team's lineup frame. The result has one row per
# (match, team, player) appearance, so a player_id occurs many times and the
# index labels of the individual lineup frames are kept as-is (not reset).
LINEUP_COLUMNS = ["player_id", "player_name", "country", "positions", "jersey_number", "cards"]

match_ids = df["match_id"].unique()

# Collect the slices and concatenate once at the end; concatenating inside the
# loop copies the whole accumulated frame for every team of every match.
lineup_frames = []
for match_id in tqdm(match_ids, desc="Retrieving Lineups per game from API", unit="match"):
    lineups = sb.lineups(match_id=match_id)
    for team in lineups.keys():
        temp_df = lineups[team].loc[:, LINEUP_COLUMNS]
        lineup_frames.append(temp_df)

# pd.concat raises on an empty list; keep the previous "empty frame" result
# when there are no matches to process.
df_main_info = pd.concat(lineup_frames, axis=0) if lineup_frames else pd.DataFrame()

Retrieving Lineups per game from API: 100%|██████████| 1824/1824 [14:57<00:00,  2.03match/s]


In [34]:
# Show the stacked lineup rows; index labels repeat because the per-team
# lineup frames were concatenated without resetting the index.
df_main_info

Unnamed: 0,player_id,player_name,country,positions,jersey_number,cards
0,3053,Leroy Sané,Germany,"[{'position_id': 12, 'position': 'Right Midfie...",19,[]
1,3499,Jean-Eric Maxim Choupo-Moting,Cameroon,"[{'position_id': 16, 'position': 'Left Midfiel...",13,[]
2,3502,Joël Andre Job Matip,Cameroon,"[{'position_id': 3, 'position': 'Right Center ...",32,[]
3,3510,Sead Kolašinac,Bosnia and Herzegovina,"[{'position_id': 6, 'position': 'Left Back', '...",6,[]
4,3570,Pierre-Emile Højbjerg,Denmark,[],23,[]
...,...,...,...,...,...,...
13,6383,Thomas Teye Partey,Ghana,"[{'position_id': 11, 'position': 'Left Defensi...",22,[]
14,6384,Gabriel Fernández Arenas,Spain,"[{'position_id': 9, 'position': 'Right Defensi...",14,"[{'time': '92:32', 'card_type': 'Yellow Card',..."
15,7069,Miguel Ángel Moyà Rumbo,Spain,[],1,[]
16,19668,Augusto Matías Fernández,Argentina,"[{'position_id': 11, 'position': 'Left Defensi...",12,[]


In [39]:
# Keep the first lineup row per player_id, then tally the ids. After the
# de-duplication every count is 1, so the informative part of the output is
# its length, i.e. the number of distinct players.
(
    df_main_info
    .drop_duplicates(subset=["player_id"])
    .loc[:, "player_id"]
    .value_counts()
)

player_id
3053     1
22594    1
9463     1
9528     1
9578     1
        ..
6828     1
19431    1
6725     1
7797     1
38336    1
Name: count, Length: 3069, dtype: int64

In [38]:
# Spot-check all lineup rows of one player (player_id 401453) to see how the
# name, positions and cards fields vary between matches.
df_main_info.loc[df_main_info["player_id"].eq(401453)]

Unnamed: 0,player_id,player_name,country,positions,jersey_number,cards
16,401453,David N'Gog,France,[],24,[]
16,401453,"David N""Gog",France,[],24,[]
16,401453,"David N""Gog",France,[],24,[]
16,401453,"David N""Gog",France,"[{'position_id': 23, 'position': 'Center Forwa...",24,[]
16,401453,"David N""Gog",France,[],24,[]
14,401453,"David N""Gog",France,"[{'position_id': 23, 'position': 'Center Forwa...",24,[]
15,401453,"David N""Gog",France,[],24,[]
15,401453,"David N""Gog",France,"[{'position_id': 19, 'position': 'Center Attac...",24,[]
14,401453,"David N""Gog",France,[],24,[]
14,401453,"David N""Gog",France,"[{'position_id': 23, 'position': 'Center Forwa...",24,"[{'time': '39:00', 'card_type': 'Yellow Card',..."


In [33]:
# Fetch the events of match 3890561 and look at the player_id column; the
# recorded output shows it as float64 with NaN for events without a player.
sb.events(match_id=3890561).loc[:, "player_id"]



0          NaN
1          NaN
2          NaN
3          NaN
4          NaN
         ...  
3935    8508.0
3936    8517.0
3937    3510.0
3938    5537.0
3939       NaN
Name: player_id, Length: 3940, dtype: float64

# Position Assignment

In [1]:
from collections import Counter, defaultdict

# Maps a detailed position (abbreviation or full name) to one of four coarse
# roles: Goalkeeper, Defender, Midfielder, Forward.
position_mapping = {
    "GK": "Goalkeeper",
    "Goalkeeper": "Goalkeeper",

    "RB": "Defender",
    "RCB": "Defender",
    "CB": "Defender",
    "LCB": "Defender",
    "LB": "Defender",
    "RWB": "Defender",
    "LWB": "Defender",
    "Right Back": "Defender",
    "Right Center Back": "Defender",
    "Center Back": "Defender",
    "Left Center Back": "Defender",
    "Left Back": "Defender",
    "Right Wing Back": "Defender",
    "Left Wing Back": "Defender",

    "RDM": "Midfielder",
    "CDM": "Midfielder",
    "LDM": "Midfielder",
    "RM": "Midfielder",
    "RCM": "Midfielder",
    "CM": "Midfielder",
    "LCM": "Midfielder",
    "LM": "Midfielder",
    "RAM": "Midfielder",
    "CAM": "Midfielder",
    "LAM": "Midfielder",
    "Right Defensive Midfield": "Midfielder",
    "Center Defensive Midfield": "Midfielder",
    "Left Defensive Midfield": "Midfielder",
    "Right Midfield": "Midfielder",
    "Right Center Midfield": "Midfielder",
    "Center Midfield": "Midfielder",
    "Left Center Midfield": "Midfielder",
    "Left Midfield": "Midfielder",
    "Right Attacking Midfield": "Midfielder",
    "Center Attacking Midfield": "Midfielder",
    "Left Attacking Midfield": "Midfielder",

    "LW": "Forward",
    # "RW" used to map to "Midfielder", contradicting "LW", "Left Wing" and
    # "Right Wing", which are all Forward; it is aligned with them here.
    "RW": "Forward",
    "RCF": "Forward",
    "ST": "Forward",
    "LCF": "Forward",
    "SS": "Forward",
    "Left Wing": "Forward",
    "Right Wing": "Forward",
    "Right Center Forward": "Forward",
    "Striker": "Forward",
    "Left Center Forward": "Forward",
    "Secondary Striker": "Forward",
    "Center Forward": "Forward"
}


def assign_position(played_positions, mapping):
    """Return the coarse role a player occupied most often.

    Parameters
    ----------
    played_positions : sequence of str
        Detailed positions in chronological order (oldest first).
    mapping : dict
        Detailed position -> coarse role.

    Returns
    -------
    str
        The role with the highest count. If several roles share the highest
        count, the one played most recently wins. Positions missing from
        ``mapping`` are ignored; "" is returned when nothing can be mapped.
    """
    roles = [mapping.get(pos) for pos in played_positions]
    role_counts = Counter(role for role in roles if role is not None)
    if not role_counts:
        return ""

    # Only roles that reach the maximum count compete. A tie between two
    # less-frequent roles must not override a clear leader.
    top_count = max(role_counts.values())
    top_roles = {role for role, count in role_counts.items() if count == top_count}

    # Walking backwards returns the most recently played top role; with a
    # single top role this is simply that role.
    for role in reversed(roles):
        if role in top_roles:
            return role
    return ""


position = ""

played_positions = ['Right Defensive Midfield', 'Left Defensive Midfield', 'Left Defensive Midfield', 'Left Defensive Midfield', 'Left Defensive Midfield', 'Left Defensive Midfield', 'Right Defensive Midfield', 'Center Defensive Midfield', 'Right Center Back', 'Right Defensive Midfield', 'Left Defensive Midfield', 'Right Defensive Midfield', 'Right Wing', 'Right Defensive Midfield', 'Left Center Back', 'Right Defensive Midfield', 'Center Defensive Midfield', 'Left Center Back', 'Left Center Back', 'Left Center Back', 'Right Defensive Midfield', 'Left Center Back']
played_roles = [position_mapping.get(pos, None) for pos in played_positions]

# get global role count (unmapped positions are left out of the tally)
counter = Counter(role for role in played_roles if role is not None)

# most frequent role; if equal count - pick position of last game
position = assign_position(played_positions, position_mapping)

print(position)

Midfielder


In [55]:
# Inspect the role tally.
# NOTE(review): the recorded output sums to 30 roles while the
# played_positions list in the cell above has 22 entries, so this output
# appears to come from a different run — re-execute to confirm.
counter

{'Midfielder': 15, 'Defender': 14, 'Forward': 1}

In [62]:
# Hard-coded role tally used to try out the tie-break rule in isolation.
counter = {'Midfielder': 15, 'Defender': 14, 'Forward': 1}
position = ""

# Only roles that share the HIGHEST count compete in the tie-break. The
# earlier version grouped roles by any equal count, so two rarely played
# roles with the same tally could override a clear leader.
top_count = max(counter.values(), default=0)
tied_roles = [role for role, count in counter.items() if count == top_count]

if len(tied_roles) > 1:
    # if equal count - pick position of last game
    # (played_roles comes from the position-assignment cell above)
    for i in reversed(played_roles):
        if i in tied_roles:
            position = i
            print(f"Last position played: {position}")
            break
elif tied_roles:
    # exactly one role holds the maximum count
    position = tied_roles[0]

print(position)

Midfielder


In [1]:
import pandas as pd
# Read the aggregated per-player standard stats (path is relative to the
# notebook's working directory) and display the frame.
standard_stats_csv = "../../../data/new_approach/standard_stats_all.csv"
df = pd.read_csv(standard_stats_csv)
df

Unnamed: 0,player,player_id,country,team,position,match_played,minutes_played,subbed_in,subbed_out,unique_positions_played,positions_played,full_match_equivalents
0,Christophe Kerbrat,2936.0,France,Guingamp,Defender,30.0,2613.0,1.0,1.0,['Right Center Back'],"['Right Center Back', 'Right Center Back', 'Ri...",29.033333
1,Lucas Deaux,2943.0,France,Nantes,Midfielder,16.0,1181.0,3.0,3.0,"['Right Center Back', 'Center Defensive Midfie...","['Right Defensive Midfield', 'Left Defensive M...",13.122222
2,Benjamin Corgnet,2944.0,France,Saint-Étienne,Midfielder,9.0,467.0,4.0,4.0,"['Center Forward', 'Center Attacking Midfield'...","['Center Attacking Midfield', 'Center Attackin...",5.188889
3,Frédéric Guilbert,2946.0,France,Bordeaux,Defender,30.0,2525.0,2.0,2.0,"['Right Back', 'Right Center Back']","['Right Center Back', 'Right Back', 'Right Cen...",28.055556
4,Anthony Lopes,2947.0,Portugal,Lyon,Goalkeeper,37.0,3330.0,0.0,0.0,['Goalkeeper'],"['Goalkeeper', 'Goalkeeper', 'Goalkeeper', 'Go...",37.000000
...,...,...,...,...,...,...,...,...,...,...,...,...
3064,Victor Paillon,403760.0,France,Troyes,,0.0,0.0,0.0,0.0,[],[],0.000000
3065,Douti Gbampok,404019.0,France,Gazélec Ajaccio,,0.0,0.0,0.0,0.0,[],[],0.000000
3066,Gueïda Fofana,404310.0,France,Lyon,,0.0,0.0,0.0,0.0,[],[],0.000000
3067,Aristote N'Dongala,404319.0,"Congo, (Kinshasa)",Nantes,,0.0,0.0,0.0,0.0,[],[],0.000000


In [15]:
import pandas as pd 

import json

# This cell rewrites the CSV it reads, so it must tolerate being run again
# on its own output.
STATS_TEST_CSV = "../../../data/new_approach/standard_stats_all_test.csv"

df = pd.read_csv(STATS_TEST_CSV)

# errors="ignore": after the first run the "position" column is already gone
# from the file, and a plain drop would raise KeyError on a re-run.
df = df.drop(columns="position", errors="ignore")

# rename() skips labels that are absent, so this step is also safe on a re-run.
df = df.rename({
    "new_position": "position_level_0",
    "role": "position_level_2"
}, axis=1)

# The JSON maps each level-1 label to the list of level-2 positions it covers.
with open('../../../config/position_mapping_level_1.json', 'r', encoding="utf-8") as file:
    position_mapping_level_1 = json.load(file)

# Invert it to level-2 position -> level-1 label for use with Series.map;
# positions missing from the config end up as NaN.
reverse_mapping = {pos: level for level, positions in position_mapping_level_1.items() for pos in positions}
df["position_level_1"] = df["position_level_2"].map(reverse_mapping)

df.to_csv(STATS_TEST_CSV, index=False)


In [31]:
import pandas as pd 
from sklearn.model_selection import train_test_split


# Baseline: plain random 80/20 split, then the share (in %) of each
# position_level_0 class in the training part.
df = pd.read_csv("../../../data/new_approach/standard_stats_all_test.csv")

# Keep players with at least 2 matches and at least 90 minutes played.
eligible = (df["match_played"] >= 2) & (df["minutes_played"] >= 90)
df = df.loc[eligible, :]

# X deliberately still contains the target column: it is only used below to
# inspect the class distribution of the split.
X = df
y = df["position_level_0"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

train_counts = X_train["position_level_0"].value_counts()
train_counts / train_counts.sum() * 100

position_level_0
Defender      33.918129
Midfielder    33.705476
Forward       25.146199
Goalkeeper     7.230197
Name: count, dtype: float64

In [None]:
import pandas as pd 
from sklearn.model_selection import train_test_split


# Same 80/20 split as the baseline, but stratified on position_level_0 so the
# class shares of the training part follow those of the filtered data.
df = pd.read_csv("../../../data/new_approach/standard_stats_all_test.csv")

# Keep players with at least 2 matches and at least 90 minutes played.
df = df.loc[df["match_played"].ge(2) & df["minutes_played"].ge(90), :]

# X keeps the target column; it is only used to inspect the class shares.
X, y = df, df["position_level_0"]

# NOTE(review): the recorded ValueError mentions shuffle=False, which this
# call does not pass — the output looks like it belongs to an earlier version
# of the cell; re-run to confirm.
split_options = {"test_size": 0.2, "random_state": 42, "stratify": y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

class_counts = X_train["position_level_0"].value_counts()
class_counts / class_counts.sum() * 100

ValueError: Stratified train/test split is not implemented for shuffle=False