In [1]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris
from sklearn.pipeline import Pipeline, FunctionTransformer
from sklearn.compose import ColumnTransformer, make_column_selector
import pickle
import pandas as pd
import numpy as np

In [2]:
# 1 - Load the raw FC26 player export.
# The path is relative, so it resolves against the current working directory.
raw_csv_path = "../raw_data/FC26_20250921.csv"
df = pd.read_csv(raw_csv_path)
df.head()

  df = pd.read_csv("../raw_data/FC26_20250921.csv")


Unnamed: 0,player_id,player_url,fifa_version,fifa_update,fifa_update_date,short_name,long_name,player_positions,overall,potential,...,cdm,rdm,rwb,lb,lcb,cb,rcb,rb,gk,player_face_url
0,252371,/player/252371/jude-bellingham/260004/,26,4,2025-09-19,J. Bellingham,Jude Victor William Bellingham,"CAM, CM",90,94,...,85+3,85+3,83+3,82+3,81+3,81+3,81+3,82+3,18+3,https://cdn.sofifa.net/players/252/371/26_120.png
1,239053,/player/239053/federico-valverde/260004/,26,4,2025-09-19,F. Valverde,Federico Santiago Valverde Dipetta,"CM, CDM, RB",89,90,...,87+3,87+3,86+3,86+3,83+3,83+3,83+3,86+3,18+3,https://cdn.sofifa.net/players/239/053/26_120.png
2,212622,/player/212622/joshua-kimmich/260004/,26,4,2025-09-19,J. Kimmich,Joshua Walter Kimmich,"CDM, RB, CM",89,89,...,87+2,87+2,86+3,85+3,82+3,82+3,82+3,85+3,21+3,https://cdn.sofifa.net/players/212/622/26_120.png
3,235212,/player/235212/achraf-hakimi/260004/,26,4,2025-09-19,A. Hakimi,Achraf Hakimi Mouhأشرف حكيمي,"RB, RM",89,90,...,83+3,83+3,86+3,86+3,81+3,81+3,81+3,86+3,17+3,https://cdn.sofifa.net/players/235/212/26_120.png
4,224232,/player/224232/nicolo-barella/260004/,26,4,2025-09-19,N. Barella,Nicolò Barella,CM,87,87,...,85+2,85+2,84+3,83+3,80+3,80+3,80+3,83+3,19+3,https://cdn.sofifa.net/players/224/232/26_120.png


In [3]:
# Fill in NaNs with 0:
#   - goalkeeping_speed is missing for outfield players
#   - the grouped outfield attribute scores are missing for goalkeepers
columns_to_fill = ['pace', 'physic', 'defending', 'passing', 'shooting', 'dribbling']

for column in ['goalkeeping_speed', *columns_to_fill]:
    df[column] = df[column].fillna(0)

In [4]:
# Create target from player_positions
# Primary-position code -> broad role label. Built once here rather than on
# every call, because `categorize_role` is applied row by row.
_POSITION_GROUPS = {
    'ST': 'Forward', 'CF': 'Forward', 'LW': 'Forward', 'RW': 'Forward', 'LF': 'Forward', 'RF': 'Forward',
    'CAM': 'Midfielder', 'CM': 'Midfielder', 'CDM': 'Midfielder', 'LM': 'Midfielder', 'RM': 'Midfielder',
    'CB': 'Defender', 'LB': 'Defender', 'RB': 'Defender', 'LWB': 'Defender', 'RWB': 'Defender',
    'GK': 'Goalkeeper'
}


def categorize_role(positions):
    """Map a comma-separated position string to a broad role label.

    Only the first listed position (the primary one) is considered; it is
    stripped of surrounding whitespace and upper-cased before the lookup.

    Parameters
    ----------
    positions : str or missing value
        Value such as ``"CAM, CM"``.

    Returns
    -------
    str or None
        ``'Forward'``, ``'Midfielder'``, ``'Defender'`` or ``'Goalkeeper'``;
        ``None`` when the value is missing, is not a string, or its primary
        position is not in the mapping.
    """
    # Missing values (None / NaN) and any other non-string input cannot be
    # split into positions, so they carry no role.
    if not isinstance(positions, str):
        return None

    # Get first position (primary)
    primary_pos = positions.split(',')[0].strip().upper()

    # `.get` returns None for codes that are not in the mapping.
    return _POSITION_GROUPS.get(primary_pos)

# Derive the target label from each player's primary position.
role_labels = df['player_positions'].apply(categorize_role)
df = df.assign(role=role_labels)

# Keep only rows that have a role and are not goalkeepers.
has_outfield_role = df['role'].notna() & (df['role'] != 'Goalkeeper')
df = df[has_outfield_role]

df

Unnamed: 0,player_id,player_url,fifa_version,fifa_update,fifa_update_date,short_name,long_name,player_positions,overall,potential,...,rdm,rwb,lb,lcb,cb,rcb,rb,gk,player_face_url,role
0,252371,/player/252371/jude-bellingham/260004/,26,4,2025-09-19,J. Bellingham,Jude Victor William Bellingham,"CAM, CM",90,94,...,85+3,83+3,82+3,81+3,81+3,81+3,82+3,18+3,https://cdn.sofifa.net/players/252/371/26_120.png,Midfielder
1,239053,/player/239053/federico-valverde/260004/,26,4,2025-09-19,F. Valverde,Federico Santiago Valverde Dipetta,"CM, CDM, RB",89,90,...,87+3,86+3,86+3,83+3,83+3,83+3,86+3,18+3,https://cdn.sofifa.net/players/239/053/26_120.png,Midfielder
2,212622,/player/212622/joshua-kimmich/260004/,26,4,2025-09-19,J. Kimmich,Joshua Walter Kimmich,"CDM, RB, CM",89,89,...,87+2,86+3,85+3,82+3,82+3,82+3,85+3,21+3,https://cdn.sofifa.net/players/212/622/26_120.png,Midfielder
3,235212,/player/235212/achraf-hakimi/260004/,26,4,2025-09-19,A. Hakimi,Achraf Hakimi Mouhأشرف حكيمي,"RB, RM",89,90,...,83+3,86+3,86+3,81+3,81+3,81+3,86+3,17+3,https://cdn.sofifa.net/players/235/212/26_120.png,Defender
4,224232,/player/224232/nicolo-barella/260004/,26,4,2025-09-19,N. Barella,Nicolò Barella,CM,87,87,...,85+2,84+3,83+3,80+3,80+3,80+3,83+3,19+3,https://cdn.sofifa.net/players/224/232/26_120.png,Midfielder
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16945,275651,/player/275651/xiao-peng/260004/,26,4,2025-09-19,Peng Xiao,Peng Xiao彭啸,CB,48,61,...,42+2,42+2,44+2,48+2,48+2,48+2,44+2,13+2,https://cdn.sofifa.net/players/275/651/26_120.png,Defender
16969,77910,/player/77910/zhaozhi-zhang/260004/,26,4,2025-09-19,Zhang Zhaozhi,Zhang Zhaozhi张兆致,CB,48,55,...,41+2,43+2,45+2,48+2,48+2,48+2,45+2,15+2,https://cdn.sofifa.net/players/077/910/26_120.png,Defender
17049,273641,/player/273641/guoliang-sun/260004/,26,4,2025-09-19,Sun Guoliang,Sun Guoliang孙国梁,CB,51,51,...,48+2,47+2,50+1,51+0,51+0,51+0,50+1,12+2,https://cdn.sofifa.net/players/273/641/26_120.png,Defender
17060,274116,/player/274116/hyun-tae-jo/260004/,26,4,2025-09-19,Jo Hyun Tae,Hyun-tae Jo조현태,CB,49,61,...,43+2,42+2,44+2,49+2,49+2,49+2,44+2,12+2,https://cdn.sofifa.net/players/274/116/26_120.png,Defender


In [5]:
# Feature columns used as model input: age, the six grouped attribute scores,
# skill moves and weak foot.
# A disabled earlier variant used the per-skill columns instead (attacking_*,
# skill_*, movement_*, power_*, mentality_*, defending_* and goalkeeping_*).
detailed_skill_attributes = [
    'age',
    'pace',
    'dribbling',
    'passing',
    'defending',
    'shooting',
    'physic',
    'skill_moves',
    'weak_foot',
]

In [6]:
# Re-inspect the frame after the role column was added and goalkeepers / unmapped rows were removed.
df

Unnamed: 0,player_id,player_url,fifa_version,fifa_update,fifa_update_date,short_name,long_name,player_positions,overall,potential,...,rdm,rwb,lb,lcb,cb,rcb,rb,gk,player_face_url,role
0,252371,/player/252371/jude-bellingham/260004/,26,4,2025-09-19,J. Bellingham,Jude Victor William Bellingham,"CAM, CM",90,94,...,85+3,83+3,82+3,81+3,81+3,81+3,82+3,18+3,https://cdn.sofifa.net/players/252/371/26_120.png,Midfielder
1,239053,/player/239053/federico-valverde/260004/,26,4,2025-09-19,F. Valverde,Federico Santiago Valverde Dipetta,"CM, CDM, RB",89,90,...,87+3,86+3,86+3,83+3,83+3,83+3,86+3,18+3,https://cdn.sofifa.net/players/239/053/26_120.png,Midfielder
2,212622,/player/212622/joshua-kimmich/260004/,26,4,2025-09-19,J. Kimmich,Joshua Walter Kimmich,"CDM, RB, CM",89,89,...,87+2,86+3,85+3,82+3,82+3,82+3,85+3,21+3,https://cdn.sofifa.net/players/212/622/26_120.png,Midfielder
3,235212,/player/235212/achraf-hakimi/260004/,26,4,2025-09-19,A. Hakimi,Achraf Hakimi Mouhأشرف حكيمي,"RB, RM",89,90,...,83+3,86+3,86+3,81+3,81+3,81+3,86+3,17+3,https://cdn.sofifa.net/players/235/212/26_120.png,Defender
4,224232,/player/224232/nicolo-barella/260004/,26,4,2025-09-19,N. Barella,Nicolò Barella,CM,87,87,...,85+2,84+3,83+3,80+3,80+3,80+3,83+3,19+3,https://cdn.sofifa.net/players/224/232/26_120.png,Midfielder
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16945,275651,/player/275651/xiao-peng/260004/,26,4,2025-09-19,Peng Xiao,Peng Xiao彭啸,CB,48,61,...,42+2,42+2,44+2,48+2,48+2,48+2,44+2,13+2,https://cdn.sofifa.net/players/275/651/26_120.png,Defender
16969,77910,/player/77910/zhaozhi-zhang/260004/,26,4,2025-09-19,Zhang Zhaozhi,Zhang Zhaozhi张兆致,CB,48,55,...,41+2,43+2,45+2,48+2,48+2,48+2,45+2,15+2,https://cdn.sofifa.net/players/077/910/26_120.png,Defender
17049,273641,/player/273641/guoliang-sun/260004/,26,4,2025-09-19,Sun Guoliang,Sun Guoliang孙国梁,CB,51,51,...,48+2,47+2,50+1,51+0,51+0,51+0,50+1,12+2,https://cdn.sofifa.net/players/273/641/26_120.png,Defender
17060,274116,/player/274116/hyun-tae-jo/260004/,26,4,2025-09-19,Jo Hyun Tae,Hyun-tae Jo조현태,CB,49,61,...,43+2,42+2,44+2,49+2,49+2,49+2,44+2,12+2,https://cdn.sofifa.net/players/274/116/26_120.png,Defender


In [7]:
# --- 1. Split train/test ---
# `detailed_skill_attributes` is already a list, so it is passed as-is to select the feature columns.
y = df['role']
X = df.loc[:, detailed_skill_attributes]

# 80 % train / 20 % test, with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [8]:
# Inspect the training features (the columns listed in detailed_skill_attributes).
X_train

Unnamed: 0,age,pace,dribbling,passing,defending,shooting,physic,skill_moves,weak_foot
11088,19,56.0,63.0,62.0,42.0,57.0,46.0,3,2
6507,24,73.0,64.0,54.0,34.0,70.0,77.0,3,4
4069,21,67.0,68.0,67.0,67.0,49.0,69.0,2,5
11454,24,69.0,61.0,55.0,54.0,41.0,57.0,2,3
3325,28,68.0,70.0,69.0,74.0,44.0,72.0,3,3
...,...,...,...,...,...,...,...,...,...
13423,26,57.0,50.0,50.0,67.0,29.0,65.0,2,2
5390,30,53.0,64.0,65.0,62.0,52.0,70.0,3,3
860,28,78.0,75.0,70.0,51.0,75.0,80.0,3,4
15858,22,62.0,38.0,36.0,58.0,28.0,67.0,2,2


In [8]:
# our similarity matching is only based on detailed skill attributes
# attacking_skill_attributes = [
#     'attacking_crossing', 'attacking_finishing',
#     'attacking_heading_accuracy', 'attacking_short_passing',
#     'attacking_volleys']

# middling_skill_attributes = [
#     'skill_dribbling', 'skill_curve',
#     'skill_fk_accuracy', 'skill_long_passing', 'skill_ball_control',
#     'movement_acceleration', 'movement_sprint_speed', 'movement_agility',
#     'movement_reactions', 'movement_balance', 'power_shot_power',
#     'power_jumping', 'power_stamina', 'power_strength', 'power_long_shots',
# ]

# defending_skill_attributes = [
# 'mentality_aggression', 'mentality_interceptions',
#     'mentality_positioning', 'mentality_vision', 'mentality_penalties',
#     'mentality_composure', 'defending_marking_awareness',
#     'defending_standing_tackle', 'defending_sliding_tackle',
#     'goalkeeping_diving', 'goalkeeping_handling', 'goalkeeping_kicking',
#     'goalkeeping_positioning', 'goalkeeping_reflexes', 'goalkeeping_speed'
# ]

# column selection as part of pipeline
def select_skill_columns(X, columns=None):
    """Return only the requested feature columns of ``X``.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame containing at least the requested columns.
    columns : iterable of str, optional
        Column labels to keep, in the order given. When omitted, the
        notebook-level ``detailed_skill_attributes`` list is used, which is
        the original behaviour.

    Returns
    -------
    pandas.DataFrame
        ``X`` restricted to the selected columns.

    Raises
    ------
    KeyError
        If any requested column is absent from ``X``.
    """
    if columns is None:
        columns = detailed_skill_attributes
    # list(...) so that a tuple of labels is treated as several columns
    # rather than as one (multi-level) key.
    return X[list(columns)]

# Column selector (currently disabled). Alternative way to do the selection:
# column_selector = FunctionTransformer(select_skill_columns)

# Imputer: replace every remaining NaN with the constant 0.
# NOTE(review): the step is named "gk_speed", but goalkeeping_speed is not in
# detailed_skill_attributes — confirm which NaNs this is expected to catch.
imputing_pipe = Pipeline(steps=[
    ("gk_speed", SimpleImputer(strategy="constant", fill_value=0)),
])

# MinMax scale
scaling_pipe = Pipeline(steps=[
    ('mm_scaler', MinMaxScaler()),
])

# Preprocessing pipe: scale values, then impute NaNs.
# Because imputing runs after scaling, the fill value 0 is applied in the
# scaled space (presumably the bottom of the fitted range — verify intent).
preprocessor_pipe = Pipeline(steps=[
    # ("select_columns", column_selector),
    ("scaling", scaling_pipe),
    ("imputing", imputing_pipe),
])
preprocessor_pipe

0,1,2
,steps,"[('scaling', ...), ('imputing', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,steps,"[('mm_scaler', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,feature_range,"(0, ...)"
,copy,True
,clip,False

0,1,2
,steps,"[('gk_speed', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,missing_values,
,strategy,'constant'
,fill_value,0
,copy,True
,add_indicator,False
,keep_empty_features,False


In [9]:
# Make the pipeline's transform / fit_transform return pandas DataFrames instead of NumPy arrays.
preprocessor_pipe.set_output(transform='pandas')

0,1,2
,steps,"[('scaling', ...), ('imputing', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,steps,"[('mm_scaler', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,feature_range,"(0, ...)"
,copy,True
,clip,False

0,1,2
,steps,"[('gk_speed', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,missing_values,
,strategy,'constant'
,fill_value,0
,copy,True
,add_indicator,False
,keep_empty_features,False


In [10]:
# Fit the preprocessing pipeline on the training split and transform that split.
X_train_preproc = preprocessor_pipe.fit_transform(X_train)
X_train_preproc

Unnamed: 0,age,pace,dribbling,passing,defending,shooting,physic,skill_moves,weak_foot
11088,0.107143,0.388060,0.585714,0.552239,0.360000,0.507042,0.237288,0.333333,0.25
6507,0.285714,0.641791,0.600000,0.432836,0.253333,0.690141,0.762712,0.333333,0.75
4069,0.178571,0.552239,0.657143,0.626866,0.693333,0.394366,0.627119,0.000000,1.00
11454,0.285714,0.582090,0.557143,0.447761,0.520000,0.281690,0.423729,0.000000,0.50
3325,0.428571,0.567164,0.685714,0.656716,0.786667,0.323944,0.677966,0.333333,0.50
...,...,...,...,...,...,...,...,...,...
13423,0.357143,0.402985,0.400000,0.373134,0.693333,0.112676,0.559322,0.000000,0.25
5390,0.500000,0.343284,0.600000,0.597015,0.626667,0.436620,0.644068,0.333333,0.50
860,0.428571,0.716418,0.757143,0.671642,0.480000,0.760563,0.813559,0.333333,0.75
15858,0.214286,0.477612,0.228571,0.164179,0.573333,0.098592,0.593220,0.000000,0.25


In [11]:
# Apply the already-fitted preprocessing to the test split (transform only, no refit).
X_test_preproc = preprocessor_pipe.transform(X_test)
X_test_preproc

Unnamed: 0,age,pace,dribbling,passing,defending,shooting,physic,skill_moves,weak_foot
15782,0.178571,0.358209,0.300000,0.313433,0.600000,0.098592,0.338983,0.000000,0.25
14735,0.250000,0.373134,0.400000,0.343284,0.640000,0.070423,0.593220,0.000000,0.25
476,0.321429,0.552239,0.742857,0.731343,0.813333,0.633803,0.779661,0.333333,0.50
13128,0.392857,0.611940,0.542857,0.253731,0.093333,0.521127,0.508475,0.333333,0.50
10472,0.392857,0.731343,0.571429,0.343284,0.146667,0.535211,0.694915,0.000000,0.50
...,...,...,...,...,...,...,...,...,...
3613,0.464286,0.432836,0.628571,0.567164,0.680000,0.605634,0.372881,0.000000,0.50
3411,0.607143,0.268657,0.657143,0.641791,0.640000,0.563380,0.576271,0.000000,0.50
8494,0.178571,0.671642,0.714286,0.611940,0.146667,0.633803,0.271186,0.666667,1.00
8641,0.285714,0.537313,0.571429,0.462687,0.640000,0.323944,0.694915,0.000000,0.25


In [16]:
# Save the preprocessing pipeline (scaler + imputer) as a pickle file.
with open("preprocessor_position_classifier.pkl", "wb") as pickle_file:
    pickle_file.write(pickle.dumps(preprocessor_pipe))

In [12]:
# Model: gradient-boosted trees fitted on the preprocessed training features.
# NOTE(review): learning_rate=1.0 and max_depth=20 are far from scikit-learn's
# defaults (0.1 and 3) — confirm they were chosen deliberately.
clf = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=1.0,
    max_depth=20,
    random_state=42,
).fit(X_train_preproc, y_train)

# Mean accuracy on the held-out split. Printed explicitly because a notebook
# cell only displays its last expression, so a bare `clf.score(...)` here
# would be computed and then silently discarded.
test_accuracy = clf.score(X_test_preproc, y_test)
print(f"Test accuracy: {test_accuracy:.4f}")

# One importance value per input column, in the column order of X_train_preproc.
clf.feature_importances_

array([0.0549377 , 0.08316103, 0.05398165, 0.1350994 , 0.26597612,
       0.27157183, 0.10415277, 0.01311322, 0.01800629])

In [13]:
# Save the fitted gradient boosting classifier as a pickle file.
with open("position_classifier.pkl", "wb") as pickle_file:
    pickle_file.write(pickle.dumps(clf))

In [14]:
# Rank every input feature by the fitted classifier's importance score, highest first.
importance_by_feature = {
    'feature': X_train.columns,
    'importance': clf.feature_importances_,
}
feature_importance = pd.DataFrame(importance_by_feature).sort_values(
    by='importance', ascending=False
)

In [15]:
# Show the features ordered from most to least important.
feature_importance

Unnamed: 0,feature,importance
5,shooting,0.271572
4,defending,0.265976
3,passing,0.135099
6,physic,0.104153
1,pace,0.083161
0,age,0.054938
2,dribbling,0.053982
8,weak_foot,0.018006
7,skill_moves,0.013113


In [None]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Evaluate on the held-out split.
# The classifier was fitted on the preprocessed (MinMax-scaled) features, so
# predictions must be made on X_test_preproc; feeding the raw X_test would put
# every value on a different scale from the one the trees were split on.
y_pred = clf.predict(X_test_preproc)
print(f"Test Accuracy: {accuracy_score(y_test, y_pred):.4f}")

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Pass the label order explicitly so the matrix rows/columns are guaranteed to
# line up with the names built from clf.classes_ below.
class_labels = clf.classes_
print("\nConfusion Matrix:")
print(pd.DataFrame(
    confusion_matrix(y_test, y_pred, labels=class_labels),
    index=['True_' + c for c in class_labels],
    columns=['Pred_' + c for c in class_labels]
))

Test Accuracy: 0.8608

Classification Report:
              precision    recall  f1-score   support

    Defender       0.92      0.92      0.92      1213
     Forward       0.82      0.76      0.79       657
  Midfielder       0.82      0.86      0.84      1399

    accuracy                           0.86      3269
   macro avg       0.86      0.85      0.85      3269
weighted avg       0.86      0.86      0.86      3269


Confusion Matrix:
                 Pred_Defender  Pred_Forward  Pred_Midfielder
True_Defender             1113             1               99
True_Forward                 1           499              157
True_Midfielder             90           107             1202
