In [None]:
# Importation des modules et modèles 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold, RandomizedSearchCV, cross_val_score
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import RobustScaler, StandardScaler
from sklearn.feature_selection import mutual_info_classif, SelectKBest, VarianceThreshold, RFECV
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from imblearn.combine import SMOTEENN
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score
from sklearn.compose import ColumnTransformer
from imblearn.pipeline import Pipeline as ImbPipeline
import warnings
import os
import s3fs

warnings.filterwarnings("ignore")

#On crée un bucket pour pouvoir récupérer les données sur le serveur
S3_ENDPOINT_URL = "https://" + os.environ["AWS_S3_ENDPOINT"]
fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})

BUCKET = "atoubert-ensae"



In [30]:
#On fixe un random state pour toute la suite du code (on prend classiquement la valeur 42)
RANDOM_STATE = 42

In [31]:
def load_data(file_key):
    file_path_s3 = f"{BUCKET}/{file_key}"
    try:
        with fs.open(file_path_s3, mode="rb") as file_in:
            df = pd.read_csv(file_in, sep=",")
        return df
    except Exception as e:
        print(f"Error loading {file_key}: {e}")
        return pd.DataFrame()  # Return empty DataFrame on failure

train_home_team = load_data("train_home_team_statistics_df.csv")
train_away_team = load_data("train_away_team_statistics_df.csv")
train_home_player = load_data("train_home_player_statistics_df.csv")
train_away_player = load_data("train_away_player_statistics_df.csv")
train_scores = load_data("Y_train.csv")

test_home_team = load_data("test_home_team_statistics_df.csv")
test_away_team = load_data("test_away_team_statistics_df.csv")
test_home_player = load_data("test_home_player_statistics_df.csv")
test_away_player = load_data("test_away_player_statistics_df.csv")

In [32]:
train_home_team.head()

Unnamed: 0,ID,LEAGUE,TEAM_NAME,TEAM_SHOTS_TOTAL_season_sum,TEAM_SHOTS_INSIDEBOX_season_sum,TEAM_SHOTS_OFF_TARGET_season_sum,TEAM_SHOTS_ON_TARGET_season_sum,TEAM_SHOTS_OUTSIDEBOX_season_sum,TEAM_PASSES_season_sum,TEAM_SUCCESSFUL_PASSES_season_sum,...,TEAM_YELLOWCARDS_5_last_match_std,TEAM_REDCARDS_5_last_match_std,TEAM_OFFSIDES_5_last_match_std,TEAM_ATTACKS_5_last_match_std,TEAM_PENALTIES_5_last_match_std,TEAM_SUBSTITUTIONS_5_last_match_std,TEAM_BALL_SAFE_5_last_match_std,TEAM_DANGEROUS_ATTACKS_5_last_match_std,TEAM_INJURIES_5_last_match_std,TEAM_GOALS_5_last_match_std
0,0,Ligue 1,Toulouse,3.0,2.0,5.0,2.0,1.0,2.0,2.0,...,3.0,0.0,6.0,0.0,10.0,8.0,7.0,2.0,4.0,3.0
1,1,Ligue 2,Brest,6.0,8.0,3.0,6.0,5.0,8.0,7.0,...,4.0,0.0,4.0,3.0,10.0,0.0,1.0,2.0,8.0,4.0
2,2,Serie A,Sampdoria,4.0,2.0,5.0,2.0,8.0,1.0,1.0,...,4.0,5.0,6.0,3.0,6.0,7.0,2.0,3.0,2.0,4.0
3,3,League One,Coventry City,7.0,5.0,5.0,6.0,6.0,9.0,9.0,...,4.0,0.0,1.0,8.0,8.0,5.0,5.0,5.0,,6.0
4,4,Premier League,Wolverhampton Wanderers,3.0,3.0,2.0,3.0,4.0,4.0,3.0,...,1.0,0.0,2.0,5.0,8.0,7.0,2.0,6.0,4.0,4.0


In [33]:
train_home_team.shape

(12303, 143)

In [34]:
train_home_team.describe()

Unnamed: 0,ID,TEAM_SHOTS_TOTAL_season_sum,TEAM_SHOTS_INSIDEBOX_season_sum,TEAM_SHOTS_OFF_TARGET_season_sum,TEAM_SHOTS_ON_TARGET_season_sum,TEAM_SHOTS_OUTSIDEBOX_season_sum,TEAM_PASSES_season_sum,TEAM_SUCCESSFUL_PASSES_season_sum,TEAM_SAVES_season_sum,TEAM_CORNERS_season_sum,...,TEAM_YELLOWCARDS_5_last_match_std,TEAM_REDCARDS_5_last_match_std,TEAM_OFFSIDES_5_last_match_std,TEAM_ATTACKS_5_last_match_std,TEAM_PENALTIES_5_last_match_std,TEAM_SUBSTITUTIONS_5_last_match_std,TEAM_BALL_SAFE_5_last_match_std,TEAM_DANGEROUS_ATTACKS_5_last_match_std,TEAM_INJURIES_5_last_match_std,TEAM_GOALS_5_last_match_std
count,12303.0,12043.0,10918.0,12303.0,12301.0,10916.0,10890.0,10919.0,11523.0,12302.0,...,12302.0,12240.0,11297.0,12303.0,12290.0,12212.0,10508.0,12303.0,8985.0,12301.0
mean,6151.0,4.391182,4.299872,4.511339,4.150882,4.524643,4.350597,4.130049,4.674043,4.505284,...,3.959437,2.927859,3.590068,3.936276,3.559072,3.303144,3.797868,3.694302,3.865331,3.625559
std,3551.714516,2.871062,2.928012,2.806821,2.861291,2.84513,2.908079,2.900489,2.871876,2.802689,...,2.850569,3.834186,2.830877,2.83041,3.673394,3.349802,2.86621,2.778135,2.792247,2.851149
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,3075.5,2.0,2.0,2.0,2.0,2.0,2.0,2.0,3.0,2.0,...,2.0,0.0,1.0,2.0,0.0,0.0,1.0,2.0,2.0,1.0
50%,6151.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,5.0,4.0,...,4.0,0.0,3.0,4.0,4.0,3.0,3.0,3.0,3.0,3.0
75%,9226.5,6.0,6.0,6.0,6.0,6.0,6.0,6.0,7.0,6.0,...,6.0,6.0,5.0,6.0,6.0,6.0,6.0,5.0,6.0,5.0
max,12302.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,...,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0


In [35]:
train_home_team.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12303 entries, 0 to 12302
Columns: 143 entries, ID to TEAM_GOALS_5_last_match_std
dtypes: float64(140), int64(1), object(2)
memory usage: 13.4+ MB


In [36]:
train_home_player.head()

Unnamed: 0,ID,LEAGUE,TEAM_NAME,POSITION,PLAYER_NAME,PLAYER_ACCURATE_CROSSES_season_sum,PLAYER_ACCURATE_PASSES_season_sum,PLAYER_AERIALS_WON_season_sum,PLAYER_ASSISTS_season_sum,PLAYER_BIG_CHANCES_CREATED_season_sum,...,PLAYER_STARTING_LINEUP_5_last_match_std,PLAYER_SUCCESSFUL_DRIBBLES_5_last_match_std,PLAYER_TACKLES_5_last_match_std,PLAYER_TOTAL_CROSSES_5_last_match_std,PLAYER_TOTAL_DUELS_5_last_match_std,PLAYER_YELLOWCARDS_5_last_match_std,PLAYER_PUNCHES_5_last_match_std,PLAYER_LONG_BALLS_5_last_match_std,PLAYER_LONG_BALLS_WON_5_last_match_std,PLAYER_SHOTS_OFF_TARGET_5_last_match_std
0,0,Ligue 1,Toulouse,defender,Agustín Rogel,0.0,8.0,5.0,0.0,0.0,...,,,,,,,,,,
1,0,Ligue 1,Toulouse,defender,Mathieu Goncalves,0.0,7.0,4.0,0.0,0.0,...,63.0,14.0,13.0,0.0,36.0,77.0,,,,
2,0,Ligue 1,Toulouse,goalkeeper,Baptiste Reynet,0.0,33.0,2.0,0.0,0.0,...,0.0,0.0,0.0,0.0,3.0,0.0,,,,
3,0,Ligue 1,Toulouse,midfielder,Jean-Victor Makengo,8.0,27.0,2.0,0.0,8.0,...,63.0,0.0,13.0,6.0,17.0,0.0,,,,
4,0,Ligue 1,Toulouse,,Efthymios Koulouris,2.0,23.0,27.0,14.0,8.0,...,63.0,17.0,10.0,13.0,27.0,0.0,,,,


In [37]:
train_home_player.shape

(237079, 307)

In [38]:
train_home_player.describe()

Unnamed: 0,ID,PLAYER_ACCURATE_CROSSES_season_sum,PLAYER_ACCURATE_PASSES_season_sum,PLAYER_AERIALS_WON_season_sum,PLAYER_ASSISTS_season_sum,PLAYER_BIG_CHANCES_CREATED_season_sum,PLAYER_BIG_CHANCES_MISSED_season_sum,PLAYER_BLOCKED_SHOTS_season_sum,PLAYER_CAPTAIN_season_sum,PLAYER_CLEARANCES_season_sum,...,PLAYER_STARTING_LINEUP_5_last_match_std,PLAYER_SUCCESSFUL_DRIBBLES_5_last_match_std,PLAYER_TACKLES_5_last_match_std,PLAYER_TOTAL_CROSSES_5_last_match_std,PLAYER_TOTAL_DUELS_5_last_match_std,PLAYER_YELLOWCARDS_5_last_match_std,PLAYER_PUNCHES_5_last_match_std,PLAYER_LONG_BALLS_5_last_match_std,PLAYER_LONG_BALLS_WON_5_last_match_std,PLAYER_SHOTS_OFF_TARGET_5_last_match_std
count,237079.0,208790.0,208790.0,208790.0,233482.0,207184.0,207184.0,207184.0,25290.0,208790.0,...,229636.0,180164.0,205393.0,205393.0,205393.0,229636.0,66500.0,0.0,0.0,0.0
mean,6151.422454,7.926907,22.719675,12.71967,9.301141,9.298498,7.219274,10.449895,34.374575,13.541597,...,32.848983,14.63365,21.673538,12.412098,27.139586,25.537786,2.118481,,,
std,3552.223779,15.09838,20.848741,16.201509,16.382073,15.763732,14.599223,16.975008,35.10304,18.505913,...,36.46419,17.543558,19.338093,16.668662,19.41403,33.779798,11.188666,,,
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,
25%,3080.0,0.0,6.0,1.0,0.0,0.0,0.0,0.0,2.0,1.0,...,0.0,0.0,0.0,0.0,12.0,0.0,0.0,,,
50%,6152.0,1.0,18.0,7.0,0.0,0.0,0.0,3.0,20.0,6.0,...,0.0,12.0,19.0,7.0,26.0,0.0,0.0,,,
75%,9227.0,9.0,34.0,18.0,14.0,14.0,9.0,14.0,66.0,18.0,...,70.0,23.0,33.0,19.0,39.0,63.0,0.0,,,
max,12302.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,...,100.0,100.0,100.0,100.0,100.0,100.0,100.0,,,


In [39]:
train_home_player.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 237079 entries, 0 to 237078
Columns: 307 entries, ID to PLAYER_SHOTS_OFF_TARGET_5_last_match_std
dtypes: float64(302), int64(1), object(4)
memory usage: 555.3+ MB


In [None]:
#On rajoute des préfixes HOME_ et AWAY_ devant les colonnes et les on ajoute dans un même dataset pour les teams pour distinguer
def prefix_columns(df, prefix):
    df = df.rename(columns={col: f"{prefix}_{col}" if col != 'ID' else col for col in df.columns})
    return df

#On ajoute les préfixes
train_home_team_prefixed = prefix_columns(train_home_team, 'HOME')
train_away_team_prefixed = prefix_columns(train_away_team, 'AWAY')

#On merge selon ID
train_team = pd.merge(train_home_team_prefixed, train_away_team_prefixed, on='ID', how='inner')


In [41]:
#On aggrège les statistiques des joueurs pour chaque colonne en faisant la moyenne des stats des joueurs de chaque équipe
def aggregate_player_stats(df, prefix):
    season_total_columns = [col for col in df.columns if col.endswith('_season_sum')]
        
    team_stats = df.groupby('ID')[season_total_columns].agg(['mean', 'max', 'std']).reset_index()
        
    team_stats.columns = ['ID'] + [f"{prefix}_{col[0]}_{col[1]}" for col in team_stats.columns[1:]]
    return team_stats



#On applique notre fonction au home et away
train_home_player_agg = aggregate_player_stats(train_home_player, 'HOME')
train_away_player_agg = aggregate_player_stats(train_away_player, 'AWAY')



In [42]:
#Enfin, on aggrège toutes les données pour avoir une base de données train, en se basant sur ID

train_data = train_team.merge(train_home_player_agg, on='ID', how='inner').merge(train_away_player_agg, on='ID', how='inner')

In [43]:
#On transforme les variables catégorielles en une seule  variable numérique via un mapping pour pouvoir avoir une colonne target

train_scores['target'] = train_scores[['HOME_WINS', 'DRAW', 'AWAY_WINS']].idxmax(axis=1)
target_mapping = {'HOME_WINS': 0, 'DRAW': 1, 'AWAY_WINS': 2}
train_scores['target'] = train_scores['target'].map(target_mapping)
y = train_scores['target']

#On retire les colonnes inutiles
train_data = train_data.drop(['HOME_LEAGUE', 'HOME_TEAM_NAME', 'AWAY_LEAGUE', 'AWAY_TEAM_NAME'], axis=1)


In [44]:
#On ajoute des variables potentiellement utiles
def add_features(df):
    
    
    df['GOALS_DIFFERENCE'] = (
        df['HOME_TEAM_GOALS_season_sum'] - 
        df['AWAY_TEAM_GOALS_season_sum']
    )
    
    df['RECENT_WINS_DIFFERENCE'] = (
        df['HOME_TEAM_GAME_WON_5_last_match_sum'] - 
        df['AWAY_TEAM_GAME_WON_5_last_match_sum']
    )
    

    df['HOME_SCORING_RATIO'] = (
        df['HOME_TEAM_GOALS_season_sum'] / 
        (df['HOME_TEAM_SHOTS_ON_TARGET_season_sum'] )
    )
    
    df['AWAY_SCORING_RATIO'] = (
        df['AWAY_TEAM_GOALS_season_sum'] / 
        (df['AWAY_TEAM_SHOTS_ON_TARGET_season_sum'])
    )
    
    return df

train_data = add_features(train_data)




In [45]:
#On drop la colonne ID pour la suite
train_data = train_data.drop('ID', axis=1)

In [46]:
#On split en train et test
X_train_full, X_test_final, y_train_full, y_test_final = train_test_split(
    train_data, y, test_size=0.2, stratify=y, random_state=RANDOM_STATE
)



In [47]:
#On split en train et validation
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train_full, 
    y_train_full,
    test_size=0.25,
    stratify=y_train_full,
    random_state=RANDOM_STATE
)




In [48]:
#On crée des grilles de paramètres pour les modèles


xgb_param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.05, 0.1],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.8, 0.9, 1.0],
    'min_child_weight': [1, 3, 5],
    'gamma': [0, 0.1, 0.2]
}

rf_param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2']
}

In [49]:
#On crée des modèles de base, ici XGBoost et Random Forest
base_models = [
    ('xgb', xgb.XGBClassifier(
        objective='multi:softprob', 
        num_class=3, 
        use_label_encoder=False, 
        eval_metric='mlogloss',
        random_state=RANDOM_STATE
    )),
    ('rf', RandomForestClassifier(random_state=RANDOM_STATE)),
    ]

In [None]:
#On crée une fonction pour preprocesser les données avant de tune les paramètres, qui sert à remplacer les inf/nan et à les imputer
def preprocess_data(X_train, X_valid):

    X_train = X_train.replace([np.inf, -np.inf], np.nan)
    X_valid = X_valid.replace([np.inf, -np.inf], np.nan)
    X_train = X_train.dropna(axis=1, how='all')
    X_valid = X_valid[X_train.columns]  
    

    imputer = SimpleImputer(strategy='mean')

    X_train_processed = pd.DataFrame(
        imputer.fit_transform(X_train),
        columns=X_train.columns
    )
    X_valid_processed = pd.DataFrame(
        imputer.transform(X_valid),
        columns=X_valid.columns
    )
    
    return X_train_processed, X_valid_processed

#On crée une fonction qui permet de tuner les hyperparamètres des modèles
def hyperparameter_tuning(model, param_grid, X_train, y_train, X_valid, y_valid):
    #D'abord il faut faire attention à préprocesser les données avec la fonction ci-dessus
    X_train_processed, X_valid_processed = preprocess_data(X_train, X_valid)
    
    cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=2, random_state=RANDOM_STATE)
    
    if isinstance(model, xgb.XGBClassifier):
        search = RandomizedSearchCV(
            estimator=model,
            param_distributions=param_grid,
            n_iter=20,
            scoring='accuracy',
            cv=cv,
            verbose=1,
            random_state=RANDOM_STATE,
            n_jobs=-1
        )
        search.fit(
            X_train_processed, 
            y_train,
            eval_set=[(X_valid_processed, y_valid)],
            verbose=False
        )

    else:
        search = RandomizedSearchCV(
            estimator=model,
            param_distributions=param_grid,
            n_iter=20,
            scoring='accuracy',
            cv=cv,
            verbose=1,
            random_state=RANDOM_STATE,
            n_jobs=-1
        )
        search.fit(X_train_processed, y_train)
    
    print(f"The best parameters for {model.__class__.__name__} are :{search.best_params_}")
    print(f"The best cross-validation score for the model is: {search.best_score_:.4f}")
    
    return search.best_estimator_




In [51]:
#On crée une fonction pour évaluer les modèles en affichant l'accuracy et un classification report
def evaluate_model(model, X, y, name):
    
    y_pred = model.predict(X)
    y_pred_proba = model.predict_proba(X)
    
    print(f"{name} Results:")
    print(f"Accuracy : {accuracy_score(y, y_pred):.4f}")
    print("Classification Report:")
    print(classification_report(y, y_pred))



In [None]:
#On crée une fonction qui permet de créer un ensemble model en utilisant un stacking classifier
def create_ensemble_model(X_train, y_train, X_valid, y_valid, base_models):

    X_train_processed, X_valid_processed = preprocess_data(X_train, X_valid)
    

    meta_classifier = LogisticRegression(multi_class='ovr', random_state=RANDOM_STATE)
    

    ensemble = StackingClassifier(
        estimators=base_models,
        final_estimator=meta_classifier,
        cv=5,
        stack_method='predict_proba'
    )
    

    ensemble.fit(X_train_processed, y_train)
    
    ensemble_accuracy = accuracy_score(y_valid, ensemble.predict(X_valid_processed))
    
    return ensemble, ensemble_accuracy

In [None]:

#On tune les modèles de bases et on les stocke dans un dico pour pouvoir y accéder facilement plus tard
tuned_models = {}
for name, model in base_models:
    if name == 'xgb':
        best_model = hyperparameter_tuning(model, xgb_param_grid, X_train, y_train, X_valid, y_valid)
    elif name == 'rf':
        best_model = hyperparameter_tuning(model, rf_param_grid, X_train, y_train, X_valid, y_valid)
    tuned_models[name] = best_model







Fitting 10 folds for each of 20 candidates, totalling 200 fits


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encode

The best parameters for XGBClassifier are : {'subsample': 0.8, 'n_estimators': 300, 'min_child_weight': 5, 'max_depth': 3, 'learning_rate': 0.01, 'gamma': 0, 'colsample_bytree': 1.0}
The best cross-validation score for the model is: 0.4919
Fitting 10 folds for each of 20 candidates, totalling 200 fits
The best parameters for RandomForestClassifier are : {'n_estimators': 100, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': 10}
The best cross-validation score for the model is: 0.4931


In [54]:

#On preprocess les données de validation avant d'évaluer les modèles
X_valid_processed, _ = preprocess_data(X_valid, X_valid)

#On les évalue ensuite sur le validation set
model_scores = {}
for name, model in tuned_models.items():
    evaluate_model(model, X_valid_processed, y_valid, f"{name}")
    model_scores[name] = accuracy_score(y_valid, model.predict(X_valid_processed))




xgb Results:
Accuracy : 0.5002
Classification Report:
              precision    recall  f1-score   support

           0       0.51      0.85      0.63      1071
           1       0.29      0.00      0.01       637
           2       0.48      0.43      0.45       753

    accuracy                           0.50      2461
   macro avg       0.43      0.43      0.37      2461
weighted avg       0.44      0.50      0.42      2461

rf Results:
Accuracy : 0.4961
Classification Report:
              precision    recall  f1-score   support

           0       0.51      0.84      0.63      1071
           1       0.37      0.02      0.04       637
           2       0.47      0.40      0.44       753

    accuracy                           0.50      2461
   macro avg       0.45      0.42      0.37      2461
weighted avg       0.46      0.50      0.42      2461



In [55]:
#On crée l'ensemble model
ensemble_models = [(name, model) for name, model in tuned_models.items()]
ensemble, ensemble_accuracy = create_ensemble_model(X_train, y_train, X_valid, y_valid, ensemble_models)

#On compare les modèles de base en prenant le max
best_base_accuracy = max(model_scores.values())


#Puis on regarde si le meilleur des modèles de base sous performe ou non par rapport à l'ensemble
if ensemble_accuracy > best_base_accuracy:
    best_model = ensemble
    best_model_name = 'Ensemble'
else:
    best_model_name = max(model_scores, key=model_scores.get) #On prend le meilleur modèle de base à partir des clés du dico
    best_model = tuned_models[best_model_name]

print(f"The best model is {best_model_name} with accuracy {max(ensemble_accuracy, best_base_accuracy):.4f}")

The best model is Ensemble with accuracy 0.5030


In [56]:
#On définit une pipeline complète qu'on va appliquer sur tout le dataset de train
complete_pipeline = ImbPipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', RobustScaler()),
    ('variance_threshold', VarianceThreshold(threshold=0.01)),
    ('feature_selection', RFECV(
        estimator=RandomForestClassifier(random_state=RANDOM_STATE),
        step=1,
        cv=5
    )),
    ('resampling', SMOTEENN(random_state=RANDOM_STATE)),
    ('classifier', best_model)
])

#On définit une pipeline complète pour le dataset de test cette fois
test_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', RobustScaler()),
    ('variance_threshold', VarianceThreshold(threshold=0.01)),
    ('feature_selection', RFECV(
        estimator=RandomForestClassifier(random_state=RANDOM_STATE),
        step=1,
        cv=5
    )),
    ('classifier', best_model)
])


X_train_full_processed, _ = preprocess_data(X_train_full, X_train_full)

complete_pipeline.fit(X_train_full_processed, y_train_full)
test_pipeline.fit(X_train_full_processed, y_train_full)

In [None]:
#On applique le même préprocessing pour ne pas avoir d'inconsistency
def preprocess_test_data(test_home_team, test_away_team, test_home_player, test_away_player, test_pipeline):
    test_home_team_prefixed = prefix_columns(test_home_team, 'HOME')
    test_away_team_prefixed = prefix_columns(test_away_team, 'AWAY')
    
    test_team = pd.merge(test_home_team_prefixed, test_away_team_prefixed, on='ID', how='inner')
    
    test_home_player_agg = aggregate_player_stats(test_home_player, 'HOME')
    test_away_player_agg = aggregate_player_stats(test_away_player, 'AWAY')
    
    test_data = test_team.merge(test_home_player_agg, on='ID', how='inner').merge(test_away_player_agg, on='ID', how='inner')
    
    test_data = add_features(test_data)
    
    test_ids = test_data['ID'].copy()
    
    test_data = test_data.drop(['ID', 'HOME_LEAGUE', 'HOME_TEAM_NAME', 'AWAY_LEAGUE', 'AWAY_TEAM_NAME'], axis=1, errors='ignore')
    
    #Il faut  appliquer la fonction preprocess_data, car sinon quand on run add_features il peut y avoir des valeurs infinies dans la table
    test_data_processed, _ = preprocess_data(test_data, test_data)
    
    #Il faut faire attention à réindexer les colonnes pour que ça coincide avec celles des données d'entraînement
    test_data_processed = test_data_processed.reindex(columns=X_train_full_processed.columns, fill_value=0)

    test_predictions = test_pipeline.predict(test_data_processed)
    test_probabilities = test_pipeline.predict_proba(test_data_processed)
    
    return test_predictions, test_probabilities, test_ids

#Puis on prédit les résultats du test set
test_predictions, test_probabilities, test_ids = preprocess_test_data(
    test_home_team, 
    test_away_team, 
    test_home_player, 
    test_away_player, 
    test_pipeline 
)


In [None]:
#On crée le fichier csv avec les résultat


submission = pd.DataFrame({
    'ID': test_ids,
    'HOME_WINS': (test_predictions == 0).astype(int),
    'DRAW': (test_predictions == 1).astype(int),
    'AWAY_WINS': (test_predictions == 2).astype(int)
})


submission_file = 'submission_finale.csv'
submission.to_csv(submission_file, index=False)
print(f"Submission file saved.")

Submission file saved.
