In [1]:
from pathlib import Path

PROJECT_ROOT = Path("..").resolve()
DATA_AGG = PROJECT_ROOT / "data" / "agregation"
DATA_PREP = PROJECT_ROOT / "data" / "preprocessed"

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

df = pd.read_csv(DATA_AGG / 'test_dataset.csv')
print(df.head())

print("\n Structure des données")
print(df.info())

  participant_id        date  calories_burned_day  distance_km_day  steps_day  \
0            p01  2019-11-01              4009.10         14.42400    17873.0   
1            p01  2019-11-02              3533.56         10.58480    13118.0   
2            p01  2019-11-03              3748.73         11.46085    14312.0   
3            p01  2019-11-04              3353.38          8.85970    10970.0   
4            p01  2019-11-05              3794.63         13.71470    16186.0   

   exercise_duration_min_day  exercise_calories_day  exercise_steps_day  \
0                  58.883333                  494.0              4664.0   
1                  40.966667                  354.0              3035.0   
2                        NaN                    NaN                 NaN   
3                  17.066667                  145.0              1284.0   
4                  40.633333                  517.0              4966.0   

   exercise_distance_km_day  exercise_avg_hr  ...  srpe_durati

### Prétraitement

In [3]:
# Correction des types de données et imputation des valeurs manquantes
df['date'] = pd.to_datetime(df['date'])

# Valeurs par defaut 0
events_cols = ['injured', 'injury_minor', 'injury_major', 'n_injuries', 'alcohol_consumed', 
               'n_exercise_sessions', 'n_srpe_sessions', 'high_intensity_minutes', 
               'hrz_fat_burn_minutes', 'hrz_cardio_minutes', 'hrz_peak_minutes']

df[events_cols] = df[events_cols].fillna(0)

In [4]:
# Imputation temporelle (Ffill + Interp) pour les mesures physiologiques/quotidiennes
# mon raisonnement : l'état du joueur ne change pas drastiquement du jour au lendemain
# malgré qu'il puisse y avoir des soucis de capteurs ou d'enregistrement
df = df.sort_values(by=['participant_id', 'date']).reset_index(drop=True)

measure_cols = [col for col in df.columns if col not in events_cols and col not in ['participant_id', 'date']]
# Forward Fill par participant
for col in measure_cols:
    df[col] = df.groupby('participant_id')[col].fillna(method='ffill')
    
# combler les trous entre deux observations valides
df[measure_cols] = df[measure_cols].interpolate(method='linear')
# plus robuste au outliers
df[measure_cols] = df[measure_cols].fillna(df[measure_cols].median()) 

  df[col] = df.groupby('participant_id')[col].fillna(method='ffill')
  df[col] = df.groupby('participant_id')[col].fillna(method='ffill')
  df[col] = df.groupby('participant_id')[col].fillna(method='ffill')
  df[col] = df.groupby('participant_id')[col].fillna(method='ffill')
  df[col] = df.groupby('participant_id')[col].fillna(method='ffill')
  df[col] = df.groupby('participant_id')[col].fillna(method='ffill')
  df[col] = df.groupby('participant_id')[col].fillna(method='ffill')
  df[col] = df.groupby('participant_id')[col].fillna(method='ffill')
  df[col] = df.groupby('participant_id')[col].fillna(method='ffill')
  df[col] = df.groupby('participant_id')[col].fillna(method='ffill')
  df[col] = df.groupby('participant_id')[col].fillna(method='ffill')
  df[col] = df.groupby('participant_id')[col].fillna(method='ffill')
  df[col] = df.groupby('participant_id')[col].fillna(method='ffill')
  df[col] = df.groupby('participant_id')[col].fillna(method='ffill')
  df[col] = df.groupby('participan

In [5]:
#Conversion en Int 
cols_to_int = [
    'steps_day', 'exercise_steps_day', 'n_exercise_sessions', 
    'hr_max', 'hr_p90', 'hr_p10', 
    'high_intensity_minutes', 'hrz_fat_burn_minutes', 'hrz_cardio_minutes', 'hrz_peak_minutes',
    'sleep_overall_score', 'sleep_composition_score', 'sleep_revitalization_score', 
    'sleep_duration_score', 'deep_sleep_min', 'sleep_resting_hr', 'sleep_restlessness',
    'glasses_of_fluid', 'n_meals', 'alcohol_consumed',
    'injured', 'injury_minor', 'injury_major', 'n_injuries',
    'n_srpe_sessions', 'hr_peak', 'hr_fat_burn', 'hr_cardio', 'hr_high_intensity'
]

cols_to_int = [c for c in cols_to_int if c in df.columns]

for col in cols_to_int:
    df[col] = df[col].round().astype('int64')

print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 457 entries, 0 to 456
Data columns (total 52 columns):
 #   Column                      Non-Null Count  Dtype         
---  ------                      --------------  -----         
 0   participant_id              457 non-null    object        
 1   date                        457 non-null    datetime64[ns]
 2   calories_burned_day         457 non-null    float64       
 3   distance_km_day             457 non-null    float64       
 4   steps_day                   457 non-null    int64         
 5   exercise_duration_min_day   457 non-null    float64       
 6   exercise_calories_day       457 non-null    float64       
 7   exercise_steps_day          457 non-null    int64         
 8   exercise_distance_km_day    457 non-null    float64       
 9   exercise_avg_hr             457 non-null    float64       
 10  high_intensity_minutes      457 non-null    int64         
 11  hrz_fat_burn_minutes        457 non-null    int64         

In [6]:
print("\n--- Aperçu des colonnes converties en int ---")
print(df[cols_to_int].head())


--- Aperçu des colonnes converties en int ---
   steps_day  exercise_steps_day  n_exercise_sessions  hr_max  hr_p90  hr_p10  \
0      17873                4664                    2     140      91      52   
1      13118                3035                    1     122      89      49   
2      14312                3035                    0     132      88      51   
3      10970                1284                    1     126      83      51   
4      16186                4966                    2     163      96      49   

   high_intensity_minutes  hrz_fat_burn_minutes  hrz_cardio_minutes  \
0                      57                    52                   0   
1                      41                    41                   0   
2                       0                     0                   0   
3                      16                    16                   0   
4                      25                    15                   5   

   hrz_peak_minutes  ...  alcohol_consu

### Indice de perforrmance

c'est la variable cible 

In [7]:
from sklearn.preprocessing import MinMaxScaler
df_preprocessed = df.copy()
# normaliser les colonnes importantes pour l'indice
index_cols = ['steps_day', 'hr_mean', 'sleep_overall_score', 'mood', 'readiness']
scaler = MinMaxScaler()
df_preprocessed[index_cols] = scaler.fit_transform(df_preprocessed[index_cols])


In [8]:

# Définition de la variable cible 'Performance_Index'

# On inverse hr_mean (plus la FC est basse => mieux c'est)
# Les poids sont subjectifs
df_preprocessed['Performance_Index'] = (
    # Forte pondération sur l'activité physique
    0.4 * df_preprocessed['steps_day'] + 
    
    # Moins d'effort cardiaque pour le meme travail 
    0.2 * (1 - df_preprocessed['hr_mean']) + 
    
    # qualité de sommeil
    0.2 * df_preprocessed['sleep_overall_score'] + 
    
    #l'humeur
    0.1 * df_preprocessed['mood'] +
    
    # préparation
    0.3 * df_preprocessed['readiness']
)

print(df_preprocessed['Performance_Index'].describe())

count    457.000000
mean       0.637525
std        0.130450
min        0.217424
25%        0.540039
50%        0.638157
75%        0.742564
max        0.918791
Name: Performance_Index, dtype: float64


### blessure
une variable binaire `injury_in_7_days`

In [9]:
# 1- Grouper par participant
# 2- Décaler la colonne 'injured' (qui est 1 ou 0) vers le haut de 7 jours
# 3- Prendre la valeur MAX de la colonne décalée pour les 7 jours suivants.

# si une blessure arrive dans les 7 jours suivant le jour J (J+1 à J+7)
df_preprocessed['injury_next_7d'] = df_preprocessed.groupby('participant_id')['injured'].transform(lambda x: x.shift(-1).rolling(window=7, min_periods=1).max()).fillna(0)

# to int
df_preprocessed['injury_next_7d'] = df_preprocessed['injury_next_7d'].astype(int)

print("\nDistribution de la variable cible blessure")
print(df_preprocessed['injury_next_7d'].value_counts(normalize=True))


Distribution de la variable cible blessure
injury_next_7d
0    0.873085
1    0.126915
Name: proportion, dtype: float64


In [10]:
OUTPUT_CSV = DATA_PREP / "test_dataset.csv"
df_preprocessed.to_csv(OUTPUT_CSV, index=False)