# Prédiction des valeurs d'un joueur de Football

### Import des bibliothèques

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import make_scorer, mean_absolute_error, root_mean_squared_error, mean_absolute_percentage_error

### Import des données

In [None]:
players = pd.read_csv("data/fifa_players.csv")

### Exploration des données

In [None]:
players.head()

In [None]:
players.info()

In [None]:
players.columns

In [None]:
players.describe()

#### Analysons les valeurs nulles de notre Dataset

In [None]:
print(players.isnull().sum()) 

In [None]:
print(players['national_team'].isnull().sum() / len(players) * 100)

In [None]:
# Remove the name / birth-date columns and the four national_* columns from
# the table, then re-inspect the remaining columns and their dtypes.
players = players.drop(columns=['name', 'full_name', 'birth_date', 'national_team_position', 'national_jersey_number','national_team', 'national_rating' ])
players.info()

On supprime les colonnes clause libératoire et salaire car elles risquent de donner trop d'indices à notre modèle

In [None]:
# release_clause_euro and wage_euro are removed so the model cannot lean on
# them (see the note preceding this cell).
players = players.drop(columns=['release_clause_euro', 'wage_euro'])
players.info()

In [None]:
print(players.isnull().sum()) 

On se débarrasse des lignes dont la valeur cible (`value_euro`) est nulle

In [None]:
# Rows whose target value_euro is missing are dropped, then the remaining
# per-column null counts are printed again.
players = players.dropna(subset=['value_euro'])
print(players.isnull().sum()) 

### Nettoyage et traitement des autres colonnes

In [None]:
print(players['positions'].unique())

In [None]:
# One 0/1 indicator column per position label; a player whose `positions`
# string lists several comma-separated labels gets a 1 in each of them.
positions = players['positions'].str.get_dummies(sep=',')
print(positions)

In [None]:
players = pd.concat([players, positions], axis=1)
players.head()

In [None]:
players.drop(columns=['positions'], inplace=True)
players.columns

Ici on a une fonction qui convertit les True/False obtenus en 1/0

In [None]:
def convert_columns_to_int(df, columns):
    """Cast the listed columns of ``df`` to integers, in place.

    Args:
        df: DataFrame to modify; it is mutated directly.
        columns: Iterable of column labels to convert.

    Returns:
        The same ``df`` object that was passed in.
    """
    for label in columns:
        as_int = df[label].astype('int')
        df[label] = as_int
    return df

In [None]:
print(sorted(players['nationality'].unique()))

Ici nous avons le TOP 10 des nations de l'année 2018 (la liste ci-dessous contient en réalité 11 pays) et le traitement de la colonne nationalité

In [None]:
# Nations flagged as "top" for 2018; note that the list holds 11 entries.
top_nations = ['Belgium', 'France', 'Brazil', 'Croatia', 'England', 'Portugal', 'Uruguay', 'Switzerland', 'Spain', 'Denmark', 'Argentina']
# Vectorised membership test instead of a per-row Python lambda. A missing
# nationality still maps to 0, exactly as with the former `x in top_nations`.
players['top_nation'] = players['nationality'].isin(top_nations).astype(int)
players.head()

In [None]:
print(players['preferred_foot'].unique())
print(players['preferred_foot'].value_counts())

In [None]:
# Replace preferred_foot by one indicator column per value; drop_first=False
# keeps every level. The two resulting columns are then cast to integers.
# NOTE(review): assumes preferred_foot only takes the values Left and Right
# (see the unique() output of the previous cell) — confirm.
players = pd.get_dummies(players, columns = ['preferred_foot'], prefix = 'foot', drop_first = False)
convert_columns_to_int(players, ['foot_Left', 'foot_Right'])
players.head()

In [None]:
print(players['body_type'].unique())
print(players['body_type'].value_counts()) 

In [None]:
# The player-named labels and the PLAYER_BODY_TYPE_25 placeholder are all
# mapped to 'Normal'; the two prints show the categories left afterwards.
players['body_type'] = players['body_type'].replace(['Messi','C. Ronaldo','Neymar', 'Courtois', 'Akinfenwa', 'Shaqiri', 'PLAYER_BODY_TYPE_25'], 'Normal')
print(players['body_type'].unique())
print(players['body_type'].value_counts())

In [None]:
players = pd.get_dummies(players, columns = ['body_type'], prefix = 'body', drop_first = False)
players.head()

In [None]:
convert_columns_to_int(players, ['body_Lean', 'body_Normal', 'body_Stocky'])
players.head()

In [None]:
players.columns

In [None]:
players.describe()

In [None]:
players = players.drop(columns=['nationality'])

In [None]:
print(players[['body_Lean', 'body_Normal', 'body_Stocky']].sum(axis=1).value_counts())

On affiche juste les minimums et maximums de chaque colonne 

In [None]:
for col in players.columns:
    print(f"{col}: {players[col].min(), players[col].max()}")

On passe la valeur de joueur en `Log` base 10

In [None]:
# Base-10 logarithm of the target, stored next to the raw value.
# NOTE(review): np.log10 returns -inf for 0 and NaN for negative inputs —
# confirm that value_euro is strictly positive for every remaining row.
players['log_value_euro'] = np.log10(players['value_euro'])
players.head()

Ici on a une fonction qui permet de visualiser la distribution d'une colonne 

In [None]:
def distribution(data, column):
    """Show a 50-bin histogram, with a KDE curve, of one column.

    Args:
        data: Object indexable by ``column`` (a DataFrame in this notebook).
        column: Column label; it is also used in the title and the x label.
    """
    axes = sns.histplot(data[column], bins=50, kde=True)
    axes.set_title("Distribution de " + column)
    axes.set_xlabel(column)
    axes.set_ylabel("Nombre de joueurs")
    plt.show()

In [None]:
distribution(players, 'value_euro')

In [None]:
distribution(players, 'log_value_euro')

Ici on a une fonction qui permet de visualiser, sous forme de nuage de points, la relation entre deux variables

In [None]:
def correlation_scatterplot(data, x, y):
    """Show a scatter plot of column ``y`` against column ``x``.

    Args:
        data: Table passed to seaborn as the data source.
        x: Column label for the horizontal axis.
        y: Column label for the vertical axis.
    """
    axes = sns.scatterplot(data=data, x=x, y=y)
    axes.set_title("Relation entre " + x + " et " + y)
    axes.set_xlabel(x)
    axes.set_ylabel(y)
    plt.show()

### Feature Engineering

In [None]:

# Ratio and interaction features derived from rating, potential, age,
# reputation and body measurements.
players['potential_to_rating'] = players['potential'] / players['overall_rating']
# Weight divided by squared height, with height converted from cm to metres.
players['bmi'] = players['weight_kgs'] / ((players['height_cm'] / 100) ** 2)
players['weighted_reputation'] = players['overall_rating'] * players['international_reputation(1-5)']
# Gap between potential and current rating, divided by age.
players['age_to_potential'] = (players['potential'] - players['overall_rating']) / players['age']
players['decline_risk'] = players['age'] / players['potential'] 
# Composite skill scores: row-wise mean of the listed raw attributes.
players['pace'] = players[['acceleration', 'sprint_speed']].mean(axis=1)
players['shoot'] = players[['finishing', 'penalties', 'volleys','shot_power', 'long_shots','positioning']].mean(axis=1)
players['pass'] = players[['vision', 'crossing', 'freekick_accuracy', 'short_passing', 'long_passing', 'curve']].mean(axis=1)
players['dribble'] = players[['agility', 'balance','reactions','ball_control', 'dribbling', 'composure']].mean(axis=1)
players['defend'] = players[['interceptions', 'heading_accuracy','standing_tackle','sliding_tackle', 'marking']].mean(axis=1)
players['physical'] = players[['jumping','stamina', 'strength', 'aggression']].mean(axis=1)

# Position indicator columns; expected to match the dummy columns created
# earlier from `positions` — TODO confirm against players.columns.
positions_cols = ['CAM','CB', 'CDM', 'CF', 'CM', 'GK', 'LB', 'LM', 'LW', 'LWB', 'RB', 'RM', 'RW', 'RWB', 'ST']

# Number of positions flagged for the player, and its product with the rating.
players['versatile'] = players[positions_cols].sum(axis=1)
players['versatile_x_rating'] = players['versatile'] * players['overall_rating']

In [None]:
# Raw attribute columns that were averaged into the composite scores
# (pace, shoot, pass, dribble, defend, physical); they are dropped in place.
columns_to_remove = ['crossing', 'finishing',
       'heading_accuracy', 'short_passing', 'volleys', 'dribbling', 'curve',
       'freekick_accuracy', 'long_passing', 'ball_control', 'acceleration',
       'sprint_speed', 'agility', 'reactions', 'balance', 'shot_power',
       'jumping', 'stamina', 'strength', 'long_shots', 'aggression',
       'interceptions', 'positioning', 'vision', 'penalties', 'composure',
       'marking', 'standing_tackle', 'sliding_tackle']

players.drop(columns = columns_to_remove, inplace=True)

In [None]:
players.columns

Ici on s'est rendu compte que les gardiens allaient être problématiques

In [None]:
print(players['GK'].sum())

On les supprime

In [None]:
# Keep only the rows whose GK indicator is 0. The explicit copy makes
# `players` an independent DataFrame, so the later in-place edits (blanking
# suspect heights, adding columns) do not write into a slice of the previous
# frame and do not raise SettingWithCopyWarning.
players = players[players['GK'] == 0].copy()

In [None]:
players.describe()

In [None]:
distribution(players, 'height_cm')

In [None]:
print(players['height_cm'].value_counts())

In [None]:
# Heights equal to 152.40 cm or 154.94 cm are treated as suspect and replaced
# by NaN; they are filled in again by the imputation step further down.
# NOTE(review): isin() matches on exact float equality with the values parsed
# from the CSV — confirm that both literals match the stored values.
suspect_sizes = [152.40, 154.94] 
players.loc[players['height_cm'].isin(suspect_sizes), 'height_cm'] = np.nan
# 1 when a height is still present after the blanking above, 0 otherwise.
players['known_height'] = players['height_cm'].notna().astype(int)

print(players['known_height'].value_counts())

On ne conserve que les lignes sans NaN dans les trois colonnes `height_cm`, `value_euro` et `log_value_euro`

In [None]:
valid_data = players[['height_cm', 'value_euro', 'log_value_euro']].dropna()

Ici on vérifie les corrélations avec pandas

In [None]:
corr_value = valid_data['height_cm'].corr(valid_data['value_euro'])
corr_log_value = valid_data['height_cm'].corr(valid_data['log_value_euro'])

print(f"Corrélation entre height_cm et value_euro : {corr_value:.4f}")
print(f"Corrélation entre height_cm et log_value_euro : {corr_log_value:.4f}")


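Ici nous avons une fonction qui impute une taille manquante par la médiane (simple, non pondérée) des tailles des joueurs partageant au moins un poste avec le joueur concerné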

In [None]:
def impute_height(row, data=None, position_columns=None):
    """Return the height of ``row``, imputing it when it is missing.

    A missing height is replaced by the median height of every player in
    ``data`` who shares at least one position with the row. When the row has
    no position flagged, or when that peer median is itself NaN, the median
    height of the whole table is used instead, so that a usable value is
    returned whenever any height is known.

    Args:
        row: One player record (a Series) holding ``height_cm`` and the
            position indicator columns.
        data: Table used to compute the medians. Defaults to the
            module-level ``players`` DataFrame, which keeps
            ``players.apply(impute_height, axis=1)`` working unchanged.
        position_columns: Labels of the 0/1 position indicator columns.
            Defaults to the module-level ``positions_cols`` list.

    Returns:
        The original height when it is present, otherwise the imputed median.
    """
    if not pd.isna(row['height_cm']):
        return row['height_cm']

    # The globals are resolved lazily so that explicit arguments fully
    # replace them.
    frame = players if data is None else data
    columns = positions_cols if position_columns is None else position_columns

    flags = row[columns]
    played = flags[flags == 1].index.tolist()

    if played:
        # Players flagged for at least one of the row's positions.
        shares_position = frame[played].sum(axis=1) > 0
        peer_median = frame.loc[shares_position, 'height_cm'].median()
        if not pd.isna(peer_median):
            return peer_median

    # Fallback: no position information, or no peer with a known height.
    return frame['height_cm'].median()

In [None]:
players['height_cm'] = players.apply(impute_height, axis=1)

Devrait afficher `0` car nous venons de traiter cette colonne

In [None]:
print(players['height_cm'].isna().sum())

In [None]:
print(players[['height_cm', 'known_height']].head(10))

In [None]:
players.columns

### Training / Validation / Evaluation

Séparation du dataset en deux ensembles: Train / Test

In [None]:
# Features: every column except the raw and the log10-transformed target.
X = players.drop(columns=['value_euro', 'log_value_euro'])
# Y (euro scale) is not used by the split below, which takes the log10 target.
Y = players['value_euro']
Y_log = players['log_value_euro']

# 70 % train / 30 % test, with a fixed seed.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y_log, test_size=0.3, random_state=42)

Ici nous avons défini des fonctions qui permettent de récupérer le résultat de ces métriques sur les valeurs réelles et non en `log scale`

In [None]:
def rmse_original(Y_true, Y_pred_log):
    """Root mean squared error on the original (non-log) scale.

    Both arguments hold log10 values; each is raised to the power of ten
    before ``root_mean_squared_error`` is applied.
    """
    return root_mean_squared_error(10 ** Y_true, 10 ** Y_pred_log)

def mae_original(Y_true, Y_pred_log):
    """Mean absolute error on the original (non-log) scale.

    Both arguments hold log10 values; each is raised to the power of ten
    before ``mean_absolute_error`` is applied.
    """
    return mean_absolute_error(10 ** Y_true, 10 ** Y_pred_log)

def mape_original(Y_true, Y_pred_log):
    """Mean absolute percentage error on the original (non-log) scale.

    Both arguments hold log10 values; each is raised to the power of ten
    before ``mean_absolute_percentage_error`` is applied.
    """
    return mean_absolute_percentage_error(10 ** Y_true, 10 ** Y_pred_log)

Petite `heatmap` pour voir de près la corrélation entre les différentes colonnes

In [None]:
sns.heatmap(X_train.corr(), cmap="coolwarm", annot=False)
plt.show()

Visualisation de la corrélation entre la cible et les colonnes que nous jugeons susceptibles d'être les plus impactantes pour l'entraînement du modèle

In [None]:
sns.pairplot(data=players, x_vars=['overall_rating','potential', 'decline_risk', 'weighted_reputation'], y_vars='log_value_euro', kind='scatter')
plt.show()

### 2 modèles sélectionnés : `Linear Regression` et `Random Forest` 
#### Validation croisée en utilisant différentes métriques

In [None]:
# Candidate estimators, compared under the same 5-fold cross-validation.
models = {
    "LinearRegression": LinearRegression(),
    "RandomForest": RandomForestRegressor(random_state=42)
}
# Every scorer is an error measure: greater_is_better=False makes scikit-learn
# report it negated, and the sign is flipped back when printing.
# The first three are computed on the targets as passed to cross_validate;
# the *_original ones convert targets and predictions with 10**x first.
scoring = {
    'MAE': make_scorer(mean_absolute_error, greater_is_better=False),
    'RMSE': make_scorer(root_mean_squared_error, greater_is_better=False),
    'MAPE': make_scorer(mean_absolute_percentage_error, greater_is_better=False),
    'RMSE_original': make_scorer(rmse_original, greater_is_better=False),
    'MAE_original': make_scorer(mae_original, greater_is_better=False),
    'MAPE_original': make_scorer(mape_original, greater_is_better=False)
}
# Metric names, in the order in which they are stored and printed.
metric_names = ("RMSE", "MAE", "MAPE", "RMSE_original", "MAE_original", "MAPE_original")
results = {}
for model_name, model in models.items():
    # Train-fold scores were requested before but never read; they are no
    # longer computed, which avoids scoring every training fold.
    cv_results = cross_validate(
        model,
        X_train,
        Y_train,
        cv=5,
        scoring=scoring
    )
    model_results = {}
    for metric_name in metric_names:
        fold_scores = cv_results[f"test_{metric_name}"]
        model_results[metric_name] = fold_scores
        model_results[f"{metric_name}_mean"] = np.mean(fold_scores)
    results[model_name] = model_results
print("\n--- Résumé des résultats ---")
for model_name, metrics in results.items():
    print(f"\n{model_name}:")
    for metric, values in metrics.items():
        if "mean" in metric:
            # Stored means are negated errors; print them as positive errors.
            print(f"  {metric}: {-values:.4f}")

Tuning du `Random Forest` afin d'optimiser les performances (cela peut être long... très long, en fonction de la machine)

In [None]:
# Exhaustive grid: 4 * 4 * 3 * 4 * 3 * 1 = 576 parameter combinations, each
# evaluated with 5-fold cross-validation (2880 fits before the final refit).
param_grid = {
    'n_estimators': [50, 100, 200, 500],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 3, 5, 10],
    'max_features': ['sqrt', 'log2', None],
    'bootstrap': [True]
}

# Two negated error scorers are tracked; the estimator refitted at the end is
# the one with the best MAPE score.
# NOTE(review): both scorers are applied to Y_train as passed below, i.e. the
# log10 target produced by the split cell — confirm that selecting on a
# log-scale MAPE is intended.
grid_search = GridSearchCV(
    estimator=RandomForestRegressor(random_state=42),
    param_grid=param_grid,
    scoring={
        'MAPE': 'neg_mean_absolute_percentage_error',
        'RMSE': 'neg_root_mean_squared_error',
    },
    refit='MAPE',
    cv=5,
    n_jobs=-1,
    verbose=2
)

grid_search.fit(X_train, Y_train)

# best_score_ is a negated error, hence the sign flip when printing.
print("Best parameters: ", grid_search.best_params_)
print("Best value for MAPE: ", -grid_search.best_score_)

cv_results = grid_search.cv_results_
rmse_scores = cv_results['mean_test_RMSE']

# Scores are negated RMSE values, so the largest entry is the smallest error.
best_rmse_idx = rmse_scores.argmax()
best_rmse = -rmse_scores[best_rmse_idx]
best_rmse_params = cv_results['params'][best_rmse_idx]
print(f"\nMeilleure valeur de RMSE : {best_rmse:.4f} pour les paramètres : {best_rmse_params}")

Meilleurs paramètres pour MAPE : {'n_estimators': 500, 'bootstrap': True, 'max_depth': 20, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 2} 

Meilleurs paramètres pour RMSE : {'bootstrap': True, 'max_depth': 30, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200} 

### Evaluation du model sur l'ensemble de `Test` 

In [None]:
# Final model built with the hyper-parameters reported as best for MAPE.
# random_state=42 matches the seed used by the cross-validation and
# grid-search forests, so the test metrics below are reproducible between runs.
model = RandomForestRegressor(
    n_estimators=500,
    bootstrap=True,
    max_depth=20,
    max_features=None,
    min_samples_leaf=1,
    min_samples_split=2,
    random_state=42,
)
model.fit(X_train, Y_train)
Y_pred = model.predict(X_test)

# Metrics on the scale the model was trained on (log10 target).
rmse_test_log = root_mean_squared_error(Y_test,Y_pred)
mae_test_log = mean_absolute_error(Y_test, Y_pred)
mape_test = mean_absolute_percentage_error(Y_test, Y_pred)

# Undo the log10 transform to express targets and predictions in euros.
Y_test_orig = 10 ** Y_test
Y_pred_orig = 10 ** Y_pred

rmse_test_orig = root_mean_squared_error(Y_test_orig, Y_pred_orig)
mae_test_orig = mean_absolute_error(Y_test_orig, Y_pred_orig)
mape_test_orig = mean_absolute_percentage_error(Y_test_orig, Y_pred_orig)

print(f"RMSE (log scale): {rmse_test_log:.4f}")
print(f"MAE (log scale): {mae_test_log:.4f}")
print(f"MAPE (log scale): {mape_test:.4f}")
print(f"RMSE (original scale): {rmse_test_orig:.4f}")
print(f"MAE (original scale): {mae_test_orig:.4f}")
print(f"MAPE (original scale): {mape_test_orig:.4f}")

Petite figure pour visualiser et résumer les résultats

In [None]:
# Predicted against actual values on the original scale; the dashed red
# diagonal spans the range of the actual values and marks perfect predictions.
value_range = [Y_test_orig.min(), Y_test_orig.max()]
plt.figure(figsize=(10, 6))
axes = plt.gca()
axes.scatter(Y_test_orig, Y_pred_orig, alpha=0.7)
axes.plot(value_range, value_range, 'r--')
axes.set_xlabel("Valeurs réelles (en euros)")
axes.set_ylabel("Valeurs prédites (en euros)")
axes.set_title("Comparaison des prédictions")
plt.show()