In [1]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris
from sklearn.pipeline import Pipeline, FunctionTransformer
from sklearn.compose import ColumnTransformer, make_column_selector
import pickle
import pandas as pd
import numpy as np

In [2]:
# 1 - Load data (file path readable by the api app ; )
df = pd.read_csv("../raw_data/FC26_20250921.csv")
df.head()

  df = pd.read_csv("../raw_data/FC26_20250921.csv")


Unnamed: 0,player_id,player_url,fifa_version,fifa_update,fifa_update_date,short_name,long_name,player_positions,overall,potential,...,cdm,rdm,rwb,lb,lcb,cb,rcb,rb,gk,player_face_url
0,252371,/player/252371/jude-bellingham/260004/,26,4,2025-09-19,J. Bellingham,Jude Victor William Bellingham,"CAM, CM",90,94,...,85+3,85+3,83+3,82+3,81+3,81+3,81+3,82+3,18+3,https://cdn.sofifa.net/players/252/371/26_120.png
1,239053,/player/239053/federico-valverde/260004/,26,4,2025-09-19,F. Valverde,Federico Santiago Valverde Dipetta,"CM, CDM, RB",89,90,...,87+3,87+3,86+3,86+3,83+3,83+3,83+3,86+3,18+3,https://cdn.sofifa.net/players/239/053/26_120.png
2,212622,/player/212622/joshua-kimmich/260004/,26,4,2025-09-19,J. Kimmich,Joshua Walter Kimmich,"CDM, RB, CM",89,89,...,87+2,87+2,86+3,85+3,82+3,82+3,82+3,85+3,21+3,https://cdn.sofifa.net/players/212/622/26_120.png
3,235212,/player/235212/achraf-hakimi/260004/,26,4,2025-09-19,A. Hakimi,Achraf Hakimi Mouhأشرف حكيمي,"RB, RM",89,90,...,83+3,83+3,86+3,86+3,81+3,81+3,81+3,86+3,17+3,https://cdn.sofifa.net/players/235/212/26_120.png
4,224232,/player/224232/nicolo-barella/260004/,26,4,2025-09-19,N. Barella,Nicolò Barella,CM,87,87,...,85+2,85+2,84+3,83+3,80+3,80+3,80+3,83+3,19+3,https://cdn.sofifa.net/players/224/232/26_120.png


In [3]:
# Fill in NaNs

# 0 for goalkeeping speed for all outfield players
df['goalkeeping_speed'] = df['goalkeeping_speed'].fillna(0)

# 0 for grouped outfield attribute scores for goalkeepers
columns_to_fill = ['pace', 'physic', 'defending', 'passing', 'shooting', 'dribbling']
df[columns_to_fill] = df[columns_to_fill].fillna(0)

In [4]:
# Create target from player_positions
def categorize_role(positions):
    if pd.isna(positions):
        return None
    
    # Get first position (primary)
    primary_pos = positions.split(',')[0].strip().upper()
    
    position_groups = {
        'ST': 'Forward', 'CF': 'Forward', 'LW': 'Forward', 'RW': 'Forward', 'LF': 'Forward', 'RF': 'Forward',
        'CAM': 'Midfielder', 'CM': 'Midfielder', 'CDM': 'Midfielder', 'LM': 'Midfielder', 'RM': 'Midfielder',
        'CB': 'Defender', 'LB': 'Defender', 'RB': 'Defender', 'LWB': 'Defender', 'RWB': 'Defender',
        'GK': 'Goalkeeper'
    }
    
    return position_groups.get(primary_pos) #string avec des valeurs séparées avec des virgules. Le .get permet d'extraire des valeurs pour certains valeurs

df['role'] = df['player_positions'].apply(categorize_role)
df = df[df['role'] != 'Goalkeeper']
df = df.dropna(subset=['role'])

df

Unnamed: 0,player_id,player_url,fifa_version,fifa_update,fifa_update_date,short_name,long_name,player_positions,overall,potential,...,rdm,rwb,lb,lcb,cb,rcb,rb,gk,player_face_url,role
0,252371,/player/252371/jude-bellingham/260004/,26,4,2025-09-19,J. Bellingham,Jude Victor William Bellingham,"CAM, CM",90,94,...,85+3,83+3,82+3,81+3,81+3,81+3,82+3,18+3,https://cdn.sofifa.net/players/252/371/26_120.png,Midfielder
1,239053,/player/239053/federico-valverde/260004/,26,4,2025-09-19,F. Valverde,Federico Santiago Valverde Dipetta,"CM, CDM, RB",89,90,...,87+3,86+3,86+3,83+3,83+3,83+3,86+3,18+3,https://cdn.sofifa.net/players/239/053/26_120.png,Midfielder
2,212622,/player/212622/joshua-kimmich/260004/,26,4,2025-09-19,J. Kimmich,Joshua Walter Kimmich,"CDM, RB, CM",89,89,...,87+2,86+3,85+3,82+3,82+3,82+3,85+3,21+3,https://cdn.sofifa.net/players/212/622/26_120.png,Midfielder
3,235212,/player/235212/achraf-hakimi/260004/,26,4,2025-09-19,A. Hakimi,Achraf Hakimi Mouhأشرف حكيمي,"RB, RM",89,90,...,83+3,86+3,86+3,81+3,81+3,81+3,86+3,17+3,https://cdn.sofifa.net/players/235/212/26_120.png,Defender
4,224232,/player/224232/nicolo-barella/260004/,26,4,2025-09-19,N. Barella,Nicolò Barella,CM,87,87,...,85+2,84+3,83+3,80+3,80+3,80+3,83+3,19+3,https://cdn.sofifa.net/players/224/232/26_120.png,Midfielder
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16945,275651,/player/275651/xiao-peng/260004/,26,4,2025-09-19,Peng Xiao,Peng Xiao彭啸,CB,48,61,...,42+2,42+2,44+2,48+2,48+2,48+2,44+2,13+2,https://cdn.sofifa.net/players/275/651/26_120.png,Defender
16969,77910,/player/77910/zhaozhi-zhang/260004/,26,4,2025-09-19,Zhang Zhaozhi,Zhang Zhaozhi张兆致,CB,48,55,...,41+2,43+2,45+2,48+2,48+2,48+2,45+2,15+2,https://cdn.sofifa.net/players/077/910/26_120.png,Defender
17049,273641,/player/273641/guoliang-sun/260004/,26,4,2025-09-19,Sun Guoliang,Sun Guoliang孙国梁,CB,51,51,...,48+2,47+2,50+1,51+0,51+0,51+0,50+1,12+2,https://cdn.sofifa.net/players/273/641/26_120.png,Defender
17060,274116,/player/274116/hyun-tae-jo/260004/,26,4,2025-09-19,Jo Hyun Tae,Hyun-tae Jo조현태,CB,49,61,...,43+2,42+2,44+2,49+2,49+2,49+2,44+2,12+2,https://cdn.sofifa.net/players/274/116/26_120.png,Defender


In [5]:
detailed_skill_attributes = [
    'attacking_crossing', 'attacking_finishing',
    'attacking_heading_accuracy', 'attacking_short_passing',
    'attacking_volleys', 'skill_dribbling', 'skill_curve',
    'skill_fk_accuracy', 'skill_long_passing', 'skill_ball_control',
    'movement_acceleration', 'movement_sprint_speed', 'movement_agility',
    'movement_reactions', 'movement_balance', 'power_shot_power',
    'power_jumping', 'power_stamina', 'power_strength', 'power_long_shots',
    'mentality_aggression', 'mentality_interceptions',
    'mentality_positioning', 'mentality_vision', 'mentality_penalties',
    'mentality_composure', 'defending_marking_awareness',
    'defending_standing_tackle', 'defending_sliding_tackle',
    'goalkeeping_diving', 'goalkeeping_handling', 'goalkeeping_kicking',
    'goalkeeping_positioning', 'goalkeeping_reflexes', 'goalkeeping_speed'
]

In [6]:
df

Unnamed: 0,player_id,player_url,fifa_version,fifa_update,fifa_update_date,short_name,long_name,player_positions,overall,potential,...,rdm,rwb,lb,lcb,cb,rcb,rb,gk,player_face_url,role
0,252371,/player/252371/jude-bellingham/260004/,26,4,2025-09-19,J. Bellingham,Jude Victor William Bellingham,"CAM, CM",90,94,...,85+3,83+3,82+3,81+3,81+3,81+3,82+3,18+3,https://cdn.sofifa.net/players/252/371/26_120.png,Midfielder
1,239053,/player/239053/federico-valverde/260004/,26,4,2025-09-19,F. Valverde,Federico Santiago Valverde Dipetta,"CM, CDM, RB",89,90,...,87+3,86+3,86+3,83+3,83+3,83+3,86+3,18+3,https://cdn.sofifa.net/players/239/053/26_120.png,Midfielder
2,212622,/player/212622/joshua-kimmich/260004/,26,4,2025-09-19,J. Kimmich,Joshua Walter Kimmich,"CDM, RB, CM",89,89,...,87+2,86+3,85+3,82+3,82+3,82+3,85+3,21+3,https://cdn.sofifa.net/players/212/622/26_120.png,Midfielder
3,235212,/player/235212/achraf-hakimi/260004/,26,4,2025-09-19,A. Hakimi,Achraf Hakimi Mouhأشرف حكيمي,"RB, RM",89,90,...,83+3,86+3,86+3,81+3,81+3,81+3,86+3,17+3,https://cdn.sofifa.net/players/235/212/26_120.png,Defender
4,224232,/player/224232/nicolo-barella/260004/,26,4,2025-09-19,N. Barella,Nicolò Barella,CM,87,87,...,85+2,84+3,83+3,80+3,80+3,80+3,83+3,19+3,https://cdn.sofifa.net/players/224/232/26_120.png,Midfielder
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16945,275651,/player/275651/xiao-peng/260004/,26,4,2025-09-19,Peng Xiao,Peng Xiao彭啸,CB,48,61,...,42+2,42+2,44+2,48+2,48+2,48+2,44+2,13+2,https://cdn.sofifa.net/players/275/651/26_120.png,Defender
16969,77910,/player/77910/zhaozhi-zhang/260004/,26,4,2025-09-19,Zhang Zhaozhi,Zhang Zhaozhi张兆致,CB,48,55,...,41+2,43+2,45+2,48+2,48+2,48+2,45+2,15+2,https://cdn.sofifa.net/players/077/910/26_120.png,Defender
17049,273641,/player/273641/guoliang-sun/260004/,26,4,2025-09-19,Sun Guoliang,Sun Guoliang孙国梁,CB,51,51,...,48+2,47+2,50+1,51+0,51+0,51+0,50+1,12+2,https://cdn.sofifa.net/players/273/641/26_120.png,Defender
17060,274116,/player/274116/hyun-tae-jo/260004/,26,4,2025-09-19,Jo Hyun Tae,Hyun-tae Jo조현태,CB,49,61,...,43+2,42+2,44+2,49+2,49+2,49+2,44+2,12+2,https://cdn.sofifa.net/players/274/116/26_120.png,Defender


In [7]:
# --- 1. Split train/test ---
X = df[detailed_skill_attributes] #1 car c'est une liste
y = df['role']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [8]:
# our similarity matching is only based on detailed skill attribtues
# attacking_skill_attributes = [
#     'attacking_crossing', 'attacking_finishing',
#     'attacking_heading_accuracy', 'attacking_short_passing',
#     'attacking_volleys']

# middling_skill_attributes = [
#     'skill_dribbling', 'skill_curve',
#     'skill_fk_accuracy', 'skill_long_passing', 'skill_ball_control',
#     'movement_acceleration', 'movement_sprint_speed', 'movement_agility',
#     'movement_reactions', 'movement_balance', 'power_shot_power',
#     'power_jumping', 'power_stamina', 'power_strength', 'power_long_shots',
# ]

# defending_skill_attributes = [
# 'mentality_aggression', 'mentality_interceptions',
#     'mentality_positioning', 'mentality_vision', 'mentality_penalties',
#     'mentality_composure', 'defending_marking_awareness',
#     'defending_standing_tackle', 'defending_sliding_tackle',
#     'goalkeeping_diving', 'goalkeeping_handling', 'goalkeeping_kicking',
#     'goalkeeping_positioning', 'goalkeeping_reflexes', 'goalkeeping_speed'
# ]

# column selection as part of pipeline
def select_skill_columns(X):
    return X[detailed_skill_attributes]

# column selector
#alternative pour la sélection : column_selector = FunctionTransformer(select_skill_columns)

# MinMax scale
scaling_pipe = Pipeline([
    ('mm_scaler', MinMaxScaler())
])

# imputer: only gk speed has NaNs, fill with 0, only affects outfiled players
imputing_pipe = Pipeline([
    ("gk_speed", SimpleImputer(strategy="constant", fill_value=0))
])

# preprocessing pipe (select columns, scale values, impute NaNs)
preprocessor_pipe = Pipeline([
    #("select_columns", column_selector),
    ("scaling", scaling_pipe),
    ("imputing", imputing_pipe)
])
preprocessor_pipe

0,1,2
,steps,"[('scaling', ...), ('imputing', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,steps,"[('mm_scaler', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,feature_range,"(0, ...)"
,copy,True
,clip,False

0,1,2
,steps,"[('gk_speed', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,missing_values,
,strategy,'constant'
,fill_value,0
,copy,True
,add_indicator,False
,keep_empty_features,False


In [15]:
preprocessor_pipe.set_output(transform='pandas')

0,1,2
,steps,"[('scaling', ...), ('imputing', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,steps,"[('mm_scaler', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,feature_range,"(0, ...)"
,copy,True
,clip,False

0,1,2
,steps,"[('gk_speed', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,missing_values,
,strategy,'constant'
,fill_value,0
,copy,True
,add_indicator,False
,keep_empty_features,False


In [16]:
# transform raw data
X_train_preproc = preprocessor_pipe.fit_transform(X_train)
X_train_preproc

Unnamed: 0,attacking_crossing,attacking_finishing,attacking_heading_accuracy,attacking_short_passing,attacking_volleys,skill_dribbling,skill_curve,skill_fk_accuracy,skill_long_passing,skill_ball_control,...,mentality_composure,defending_marking_awareness,defending_standing_tackle,defending_sliding_tackle,goalkeeping_diving,goalkeeping_handling,goalkeeping_kicking,goalkeeping_positioning,goalkeeping_reflexes,goalkeeping_speed
11088,0.493151,0.481013,0.347222,0.626866,0.389610,0.597403,0.500000,0.518987,0.602740,0.608108,...,0.523810,0.333333,0.469136,0.468354,0.136986,0.255814,0.068493,0.117647,0.051724,0.0
6507,0.342466,0.721519,0.680556,0.552239,0.662338,0.649351,0.564103,0.455696,0.328767,0.581081,...,0.587302,0.370370,0.209877,0.164557,0.150685,0.186047,0.164384,0.132353,0.120690,0.0
4069,0.493151,0.329114,0.486111,0.701493,0.207792,0.649351,0.653846,0.417722,0.712329,0.662162,...,0.619048,0.716049,0.716049,0.759494,0.136986,0.093023,0.095890,0.161765,0.189655,0.0
11454,0.547945,0.303797,0.388889,0.507463,0.311688,0.558442,0.346154,0.265823,0.465753,0.567568,...,0.238095,0.530864,0.592593,0.544304,0.150685,0.279070,0.123288,0.058824,0.137931,0.0
3325,0.397260,0.240506,0.708333,0.791045,0.168831,0.701299,0.602564,0.658228,0.753425,0.729730,...,0.746032,0.790123,0.814815,0.784810,0.136986,0.255814,0.178082,0.161765,0.155172,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13423,0.287671,0.151899,0.666667,0.611940,0.194805,0.298701,0.256410,0.164557,0.465753,0.567568,...,0.380952,0.728395,0.691358,0.683544,0.095890,0.279070,0.164384,0.191176,0.120690,0.0
5390,0.397260,0.316456,0.513889,0.671642,0.389610,0.610390,0.705128,0.468354,0.684932,0.662162,...,0.571429,0.629630,0.691358,0.594937,0.164384,0.302326,0.150685,0.161765,0.137931,0.0
860,0.589041,0.759494,0.763889,0.746269,0.766234,0.766234,0.679487,0.544304,0.684932,0.770270,...,0.714286,0.370370,0.654321,0.354430,0.150685,0.093023,0.068493,0.073529,0.172414,0.0
15858,0.082192,0.113924,0.708333,0.298507,0.181818,0.103896,0.128205,0.113924,0.191781,0.418919,...,0.238095,0.530864,0.641975,0.556962,0.095890,0.255814,0.068493,0.161765,0.137931,0.0


In [17]:
# transform raw data
X_test_preproc = preprocessor_pipe.transform(X_test)
X_test_preproc

Unnamed: 0,attacking_crossing,attacking_finishing,attacking_heading_accuracy,attacking_short_passing,attacking_volleys,skill_dribbling,skill_curve,skill_fk_accuracy,skill_long_passing,skill_ball_control,...,mentality_composure,defending_marking_awareness,defending_standing_tackle,defending_sliding_tackle,goalkeeping_diving,goalkeeping_handling,goalkeeping_kicking,goalkeeping_positioning,goalkeeping_reflexes,goalkeeping_speed
15782,0.328767,0.063291,0.583333,0.492537,0.103896,0.168831,0.102564,0.316456,0.520548,0.472973,...,0.222222,0.543210,0.679012,0.683544,0.109589,0.116279,0.068493,0.161765,0.155172,0.0
14735,0.315068,0.088608,0.583333,0.522388,0.129870,0.415584,0.371795,0.075949,0.479452,0.472973,...,0.206349,0.604938,0.728395,0.708861,0.178082,0.255814,0.123288,0.088235,0.189655,0.0
476,0.657534,0.594937,0.763889,0.805970,0.688312,0.727273,0.692308,0.468354,0.794521,0.783784,...,0.761905,0.777778,0.839506,0.784810,0.068493,0.232558,0.164384,0.147059,0.086207,0.0
13128,0.315068,0.518987,0.472222,0.298507,0.506494,0.558442,0.410256,0.392405,0.150685,0.472973,...,0.269841,0.185185,0.074074,0.063291,0.178082,0.255814,0.109589,0.088235,0.172414,0.0
10472,0.397260,0.594937,0.500000,0.447761,0.571429,0.571429,0.371795,0.202532,0.246575,0.527027,...,0.444444,0.160494,0.172840,0.126582,0.068493,0.255814,0.191781,0.102941,0.155172,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3613,0.520548,0.632911,0.597222,0.582090,0.324675,0.597403,0.512821,0.696203,0.616438,0.608108,...,0.365079,0.740741,0.629630,0.696203,0.054795,0.116279,0.082192,0.147059,0.224138,0.0
3411,0.589041,0.556962,0.361111,0.686567,0.545455,0.636364,0.666667,0.607595,0.684932,0.675676,...,0.666667,0.679012,0.691358,0.683544,0.109589,0.116279,0.136986,0.161765,0.241379,0.0
8494,0.643836,0.632911,0.486111,0.641791,0.662338,0.766234,0.641026,0.417722,0.616438,0.689189,...,0.492063,0.148148,0.148148,0.101266,0.095890,0.069767,0.123288,0.176471,0.224138,0.0
8641,0.315068,0.303797,0.472222,0.626866,0.207792,0.571429,0.333333,0.215190,0.575342,0.567568,...,0.349206,0.716049,0.679012,0.670886,0.123288,0.186047,0.041096,0.132353,0.172414,0.0


In [20]:
# # Model : OneClassSVM Unsupervised Outlier Detection.Estimate the support of a high-dimensional distribution.
# clf = OneClassSVM(kernel='rbf', degree=3, gamma='scale', tol=0.001, nu=0.5, shrinking=True, cache_size=200, verbose=False, max_iter=-1).fit(X)#faut-il un fit and transform ?
# clf.predict(X_train)
# clf.score_samples(X_train)
# )


clf = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,max_depth=20, random_state=42).fit(X_train, y_train)
clf.score(X_test, y_test)

clf.feature_importances_

array([0.06293275, 0.19736977, 0.05926791, 0.01409417, 0.00969711,
       0.00908285, 0.00761643, 0.01012304, 0.05531665, 0.00909025,
       0.00991673, 0.01796099, 0.00987914, 0.00449108, 0.00926628,
       0.01779702, 0.06844308, 0.01034654, 0.00859274, 0.01129455,
       0.0109521 , 0.01334644, 0.01078704, 0.10869781, 0.00887421,
       0.00626427, 0.03930527, 0.01080266, 0.15788436, 0.00663392,
       0.00624424, 0.00550136, 0.00726055, 0.00486669, 0.        ])

In [21]:
# save knn model as pickel file
with open("position_classifier.pkl", "wb") as file:
    pickle.dump(clf, file)

In [25]:
# Top 10 important features
feature_importance = pd.DataFrame({
    'feature': X_train.columns,
    'importance': clf.feature_importances_
}).sort_values('importance', ascending=True)

In [26]:
feature_importance

Unnamed: 0,feature,importance
34,goalkeeping_speed,0.0
13,movement_reactions,0.004491
33,goalkeeping_reflexes,0.004867
31,goalkeeping_kicking,0.005501
30,goalkeeping_handling,0.006244
25,mentality_composure,0.006264
29,goalkeeping_diving,0.006634
32,goalkeeping_positioning,0.007261
6,skill_curve,0.007616
18,power_strength,0.008593


In [27]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Evaluate
y_pred = clf.predict(X_test)
print(f"Test Accuracy: {accuracy_score(y_test, y_pred):.4f}")

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

print("\nConfusion Matrix:")
print(pd.DataFrame(
    confusion_matrix(y_test, y_pred),
    index=['True_' + c for c in clf.classes_],
    columns=['Pred_' + c for c in clf.classes_]
))

Test Accuracy: 0.8608

Classification Report:
              precision    recall  f1-score   support

    Defender       0.92      0.92      0.92      1213
     Forward       0.82      0.76      0.79       657
  Midfielder       0.82      0.86      0.84      1399

    accuracy                           0.86      3269
   macro avg       0.86      0.85      0.85      3269
weighted avg       0.86      0.86      0.86      3269


Confusion Matrix:
                 Pred_Defender  Pred_Forward  Pred_Midfielder
True_Defender             1113             1               99
True_Forward                 1           499              157
True_Midfielder             90           107             1202
