In [78]:
import pandas as pd
import numpy as np
from sklearn import model_selection
from sklearn.metrics import accuracy_score
import xgboost as xgb
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
# Load the raw training tables: per-team statistics for each side, plus the match outcomes.
train_home_team_statistics_df = pd.read_csv('/Train_Data/train_home_team_statistics_df.csv', index_col=0)
train_away_team_statistics_df = pd.read_csv('/Train_Data/train_away_team_statistics_df.csv', index_col=0)
train_scores = pd.read_csv('/Y_train_football.csv', index_col=0)

# Skip the first two columns of each table and tag every remaining column with its side.
train_home = train_home_team_statistics_df.iloc[:, 2:].add_prefix('HOME_')
train_away = train_away_team_statistics_df.iloc[:, 2:].add_prefix('AWAY_')

# Place both sides next to each other (rows present in both tables only),
# then keep the outcome rows in the same order as the feature rows.
train_data = pd.concat([train_home, train_away], axis=1, join='inner')
train_scores = train_scores.loc[train_data.index]

# Treat +/- infinity as missing so it is handled together with NaN later on.
train_data = train_data.replace([np.inf, -np.inf], np.nan)


# Data preprocessing & EDA

In [None]:
# Preview the first rows of the combined HOME_/AWAY_ feature table.
train_data.head()

In [None]:
# Show every feature name as a plain array.
train_data.columns.to_numpy()

In [None]:
# Summary statistics of the feature columns.
train_data.describe()

In [None]:
# Column dtypes and non-null counts of the feature table.
train_data.info()

In [None]:
# Number of fully duplicated feature rows. A single count answers
# "are there duplicates?" directly; the raw per-row boolean Series is
# truncated on display and cannot be read at a glance.
train_data.duplicated().sum()

In [None]:
# Display the outcome table (already re-indexed to train_data's rows).
train_scores

In [None]:
# Count how often each value occurs in every outcome column,
# then transpose so that each outcome column becomes one group of bars.
df = train_scores
counts = df.apply(lambda column: column.value_counts()).T

ax = counts.plot(kind='bar', color=['skyblue', 'salmon'])
ax.set_title('Target variable distribution')
ax.set_xlabel('Classes (HOME_WINS-DRAW-AWAY_WINS)')
ax.set_ylabel('Count')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
# Percentage of missing values per feature, rounded to two decimals.
(train_data.isnull().sum() / len(train_data) * 100).round(2)

In [79]:
# Mean imputation: every missing entry takes the average of its own column.
column_means = train_data.mean()
train_data = train_data.fillna(value=column_means)

In [128]:
# Z-score standardisation: learn the per-feature statistics on train_data,
# then apply them to the same table.
scaler = StandardScaler().fit(train_data)
train_data_scaled = scaler.transform(train_data)

In [199]:
# Standardise each side's table with its OWN scaler. Refitting the shared
# `scaler` here would replace the statistics it learned from the full
# train_data matrix with those of a column subset, and every later
# `scaler.transform(...)` on the combined test features would then fail.
# +/- infinity is mapped to NaN first: StandardScaler ignores NaN when fitting
# but rejects infinite values.
home_scaler = StandardScaler()
away_scaler = StandardScaler()
train_home_scaled = home_scaler.fit_transform(train_home.replace([np.inf, -np.inf], np.nan))
train_away_scaled = away_scaler.fit_transform(train_away.replace([np.inf, -np.inf], np.nan))

In [204]:
# Mean-impute each side's own table. Filling `train_data` here would be a
# no-op copy (its gaps are already filled with the column means), and storing
# that copy under the *_scaled names would discard the standardised arrays, so
# the imputed tables get their own names.
train_home_imputed = train_home.replace([np.inf, -np.inf], np.nan)
train_home_imputed = train_home_imputed.fillna(train_home_imputed.mean())
train_away_imputed = train_away.replace([np.inf, -np.inf], np.nan)
train_away_imputed = train_away_imputed.fillna(train_away_imputed.mean())

In [None]:
# Sanity check of the standardisation: overall mean and standard deviation
# of the scaled matrix.
scaled_mean = train_data_scaled.mean()
scaled_std = train_data_scaled.std()
print('Mean value after z-score', scaled_mean)
print('Standard deviation value after z-score', scaled_std)

In [None]:
# Per-column summary statistics of the standardised matrix.
scaled_frame = pd.DataFrame(data=train_data_scaled)
scaled_frame.describe()

In [None]:
# For each outcome column, print the largest correlation any feature has with it.
for outcome in ('HOME_WINS', 'AWAY_WINS', 'DRAW'):
    correlations = train_data.corrwith(train_scores[outcome])
    print(correlations.max())

In [None]:
# Pairwise Pearson correlation between all features.
corr_train_data = train_data.corr(method='pearson')
corr_train_data

In [None]:
# Heatmap of the feature-to-feature correlation matrix (no cell annotations).
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(corr_train_data, annot=False, cmap='coolwarm', fmt='.2f', ax=ax)
ax.set_title("train data features Correlation Matrix", fontsize=16)
plt.show()

# Splitting data into train, test and validation sets

In [137]:
# One-hot outcome columns; their order defines class indices 0, 1, 2 for argmax.
train_new_y = train_scores[['HOME_WINS', 'DRAW', 'AWAY_WINS']]

# Both splits keep 80% on the "train" side and share the same seed.
split_options = dict(train_size=0.8, random_state=42)

# First split: hold out the test set.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    train_data_scaled, train_new_y, **split_options
)
# Second split: carve the validation set out of the remaining training rows.
X_train, X_valid, y_train, y_valid = model_selection.train_test_split(
    X_train, y_train, **split_options
)

# XGBoost Model

In [None]:
# XGBoost parameters
params_1 = {
    'booster': 'gbtree',
    'tree_method': 'hist',
    'max_depth': 4,
    'learning_rate': 0.025,
    'objective': 'multi:softprob',
    'num_class': 3,  # three classes: HOME_WINS, DRAW, AWAY_WINS
    'eval_metric': 'mlogloss'
}

# Feature names for the DMatrix columns. X_train / X_valid are row subsets of
# train_data_scaled, which has one column per train_data column in the same
# order, so train_data is the frame to take the names from.
original_columns = train_data.columns.tolist()

# Class index of each row = position of the 1 in its one-hot outcome.
train_labels = np.argmax(y_train.values, axis=1)
valid_labels = np.argmax(y_valid.values, axis=1)

# DMatrix for XGBoost
d_train = xgb.DMatrix(X_train, label=train_labels, feature_names=original_columns)
d_valid = xgb.DMatrix(X_valid, label=valid_labels, feature_names=original_columns)

# Upper bound on boosting rounds; early stopping ends training once the
# validation metric has not improved for 100 rounds.
num_round = 10000
evallist = [(d_train, 'train'), (d_valid, 'eval')]

# Filled by xgb.train with the per-round metric history of every eval set.
evals_result = {}

# Model training
bst = xgb.train(params_1, d_train, num_round, evallist, early_stopping_rounds=100, evals_result=evals_result)

In [None]:
# Per-round multi-class log-loss recorded during training for both eval sets.
train_logloss = evals_result['train']['mlogloss']
valid_logloss = evals_result['eval']['mlogloss']

# Draw both learning curves on one set of axes.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(train_logloss, label='Train Log-Loss', color='blue')
ax.plot(valid_logloss, label='Validation Log-Loss', color='red')
ax.set_xlabel('Number of Rounds')
ax.set_ylabel('Log-Loss')
ax.set_title('Log-Loss during Training')
ax.legend()
ax.grid(True)
plt.show()

In [None]:
# Importance of each feature, measured with the 'weight' importance type.
importance = bst.get_score(importance_type='weight')

# Order (feature, score) pairs from highest to lowest score and show the
# names of the first 100.
sorted_dict = sorted(importance.items(), key=lambda pair: pair[1], reverse=True)
top_features = dict(sorted_dict[:100])
top_features.keys()

In [None]:
# Plot the importance of the top 100 features on a tall figure.
importance_ax = xgb.plot_importance(bst, max_num_features=100)
importance_ax.figure.set_size_inches(15, 20)

In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate values; every combination is evaluated.
param_grid = {
    'max_depth': [4, 5],
    'learning_rate': [0.025, 0.1],
    'n_estimators': [150, 200],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.8,0.9,1.0]
}

# Base estimator whose hyperparameters are searched.
model = xgb.XGBClassifier(objective='multi:softmax', num_class=3, eval_metric='mlogloss')

# Class index of each training row (argmax over the one-hot outcome columns).
grid_labels = np.argmax(y_train.values, axis=1)

# 5-fold cross-validated search scored by accuracy.
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, scoring='accuracy', cv=5, verbose=1)
grid_search.fit(X_train, grid_labels)

# Report the winning combination and its mean cross-validation accuracy.
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Best Cross-Validation Accuracy: {grid_search.best_score_:.4f}")

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score

# True class index of each held-out row (argmax over the one-hot outcome columns).
y_test_labels = np.argmax(y_test.values, axis=1)

# Predictions on test dataset.
# The DMatrix carries the booster's own feature names so that prediction-time
# feature validation passes when the booster was trained with named features.
X_test_xgb = xgb.DMatrix(X_test, feature_names=bst.feature_names)
# iteration_range is half-open, so "+ 1" is needed to include the best round.
predictions = bst.predict(X_test_xgb, iteration_range=(0, bst.best_iteration + 1))

# Most probable class per row.
predicted_classes = np.argmax(predictions, axis=1)

# Micro-averaged precision/recall/F1 are all equal to accuracy for
# single-label multi-class data, so class-weighted averages are reported
# instead (the same averaging the decision-tree evaluation uses).
accuracy = accuracy_score(y_test_labels, predicted_classes)
precision = precision_score(y_test_labels, predicted_classes, average='weighted')
f1 = f1_score(y_test_labels, predicted_classes, average='weighted')
recall = recall_score(y_test_labels, predicted_classes, average='weighted')

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"f1 score: {f1:.4f}")
print(f"Recall: {recall:.4f}")


In [41]:
# Build the submission file from the XGBoost model.
test_home = pd.read_csv('/Test_Data/test_home_team_statistics_df.csv', index_col=0)
test_away = pd.read_csv('/Test_Data/test_away_team_statistics_df.csv', index_col=0)

test_home.columns = 'HOME_' + test_home.columns
test_away.columns = 'AWAY_' + test_away.columns

test_data = pd.concat([test_home, test_away], join='inner', axis=1)

# Apply the training preprocessing with TRAINING statistics only:
#   * keep exactly the training features, in training order (a missing
#     feature raises a KeyError instead of silently shifting columns);
#   * map +/- infinity to NaN and fill gaps with the training column means;
#   * standardise with a scaler fitted on train_data, the same table the
#     model's inputs were standardised from. Fitting it here keeps this cell
#     independent of whatever the global `scaler` was last fitted on.
test_features = test_data[train_data.columns].replace([np.inf, -np.inf], np.nan)
test_data_imputed = test_features.fillna(train_data.mean())
train_scaler = StandardScaler().fit(train_data)
test_data_scaled = train_scaler.transform(test_data_imputed)

# Predictions on the test data. The DMatrix carries the booster's feature
# names so feature validation passes; iteration_range is half-open, hence
# "+ 1" to include the best round.
X_bench = xgb.DMatrix(test_data_scaled, feature_names=bst.feature_names)
predictions = bst.predict(X_bench, iteration_range=(0, bst.best_iteration + 1))

# Pick the class with the highest probability.
predicted_classes = np.argmax(predictions, axis=1)

# One-hot encode the predicted class (0 = HOME_WINS, 1 = DRAW, 2 = AWAY_WINS),
# indexed like the test rows.
submission = pd.DataFrame(
    {
        'HOME_WINS': (predicted_classes == 0).astype(int),
        'DRAW': (predicted_classes == 1).astype(int),
        'AWAY_WINS': (predicted_classes == 2).astype(int),
    },
    index=test_data.index,
)

# Save the submission with the row identifier as the first column.
submission = submission.reset_index()
submission.to_csv('/benchmark_submission.csv', index=False)

# RandomForest Model

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score, KFold

# Class index per row (argmax over the one-hot outcome columns).
rf_train_labels = np.argmax(y_train.values, axis=1)
rf_valid_labels = np.argmax(y_valid.values, axis=1)

# Fit a 200-tree forest with depth capped at 10 on the training split.
rf_model = RandomForestClassifier(n_estimators=200, max_depth=10, random_state=42)
rf_model.fit(X_train, rf_train_labels)

# Score the fitted forest on the validation split.
y_pred = rf_model.predict(X_valid)
accuracy = accuracy_score(rf_valid_labels, y_pred)
print(f"Random Forest Model Accuracy: {accuracy:.4f}")

In [None]:
from sklearn.model_selection import GridSearchCV

# Shuffled 5-fold splitter with a fixed seed.
cv = KFold(n_splits=5, shuffle=True, random_state=42)

# Accuracy of rf_model on each fold of the training split.
cv_labels = np.argmax(y_train.values, axis=1)
cv_scores = cross_val_score(rf_model, X_train, cv_labels, cv=cv, scoring='accuracy')

# Per-fold scores, followed by their mean and spread.
print(f"Cross-Validation Accuracy Scores: {cv_scores}")
print(f"Mean Cross-Validation Accuracy: {cv_scores.mean():.4f}")
print(f"Standard Deviation of Cross-Validation Accuracy: {cv_scores.std():.4f}")

In [None]:
# Candidate hyperparameter values; every combination is evaluated.
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# Exhaustive 5-fold search over the grid, scored by accuracy, using all cores.
grid_search = GridSearchCV(rf_model, param_grid, cv=5, scoring='accuracy', verbose=1, n_jobs=-1)

# Run the search on the training split (labels = argmax of the one-hot outcomes).
search_labels = np.argmax(y_train.values, axis=1)
grid_search.fit(X_train, search_labels)

# Report the winning combination and its mean cross-validation accuracy.
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Best Cross-Validation Accuracy: {grid_search.best_score_:.4f}")

In [46]:
# Build the submission file from the random-forest model.
test_home = pd.read_csv('/Test_Data/test_home_team_statistics_df.csv', index_col=0)
test_away = pd.read_csv('/Test_Data/test_away_team_statistics_df.csv', index_col=0)

test_home.columns = 'HOME_' + test_home.columns
test_away.columns = 'AWAY_' + test_away.columns

test_data = pd.concat([test_home, test_away], join='inner', axis=1)

# Preprocess with TRAINING statistics, not test statistics: the forest was
# fitted on features imputed and standardised from train_data, so imputing or
# scaling with test-set means/variances would feed it differently-scaled
# inputs. A dedicated scaler is fitted on train_data here so the global
# `scaler` is neither required nor overwritten by this cell.
test_features = test_data[train_data.columns].replace([np.inf, -np.inf], np.nan)
test_data_cleaned = test_features.fillna(train_data.mean())
train_scaler = StandardScaler().fit(train_data)
test_data_scaled = train_scaler.transform(test_data_cleaned)

# Predicted class index per test row.
predictions = rf_model.predict(test_data_scaled)

# One-hot encode the predicted class (0 = HOME_WINS, 1 = DRAW, 2 = AWAY_WINS),
# indexed like the test rows.
submission = pd.DataFrame(
    {
        'HOME_WINS': (predictions == 0).astype(int),
        'DRAW': (predictions == 1).astype(int),
        'AWAY_WINS': (predictions == 2).astype(int),
    },
    index=test_data.index,
)

# Save the submission with the row identifier as the first column.
submission = submission.reset_index()
submission.to_csv('/benchmark_submission_randomforest.csv', index=False)

# Decision Tree Model

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Decision Tree model parameters.
# 'min_samples_split' is listed once, with 18: the dict previously held the
# key twice and Python silently kept only the later value (18).
# 'max_features' uses 'sqrt': the former 'auto' alias meant 'sqrt' for
# classifiers and is rejected by current scikit-learn releases.
dt_params = {
    'max_depth': 3,
    'criterion': 'entropy',
    'random_state': 42,
    'max_features': 'sqrt',
    'min_samples_leaf': 5,
    'min_samples_split': 18,
    'splitter': 'random'
}

dt_model = DecisionTreeClassifier(**dt_params)

# Train the model on the training data (labels = argmax of the one-hot outcomes).
dt_model.fit(X_train, np.argmax(y_train.values, axis=1))

# Make predictions on the validation set.
y_pred = dt_model.predict(X_valid)

# Calculate the accuracy score.
accuracy = accuracy_score(np.argmax(y_valid.values, axis=1), y_pred)

print(f"Decision Tree Model Accuracy: {accuracy}")

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix

# True class index per validation row (argmax over the one-hot outcome columns).
y_valid_labels = np.argmax(y_valid.values, axis=1)

# Class-weighted precision / recall / F1 of the current y_pred, plus the confusion matrix.
precision = precision_score(y_valid_labels, y_pred, average='weighted')
recall = recall_score(y_valid_labels, y_pred, average='weighted')
f1 = f1_score(y_valid_labels, y_pred, average='weighted')
conf_matrix = confusion_matrix(y_valid_labels, y_pred)

print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-Score: {f1}")
print("Confusion Matrix:")
print(conf_matrix)

In [None]:
from sklearn.model_selection import cross_validate

# 5-fold cross-validation of dt_model on the training split, scored on four metrics at once.
cv_metrics = ['accuracy', 'precision_weighted', 'recall_weighted', 'f1_weighted']
cv_results = cross_validate(
    dt_model,
    X_train,
    np.argmax(y_train.values, axis=1),
    cv=5,
    scoring=cv_metrics,
)

# Per-fold score arrays, one per metric.
fold_accuracy = cv_results['test_accuracy']
fold_precision = cv_results['test_precision_weighted']
fold_recall = cv_results['test_recall_weighted']
fold_f1 = cv_results['test_f1_weighted']

print("Cross-Validation Results:")
print(f"Accuracy: {fold_accuracy}")
print(f"Precision (Weighted): {fold_precision}")
print(f"Recall (Weighted): {fold_recall}")
print(f"F1-Score (Weighted): {fold_f1}")
print(f"Mean Accuracy: {fold_accuracy.mean()}")
print(f"Mean Precision: {fold_precision.mean()}")
print(f"Mean Recall: {fold_recall.mean()}")
print(f"Mean F1-Score: {fold_f1.mean()}")


In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

# Distributions / candidate lists sampled by RandomizedSearchCV.
# 'auto' is not offered for max_features: it was an alias of 'sqrt' for
# classifiers and current scikit-learn releases reject it, so candidates
# drawing it would fail to fit.
param_dist = {
    'max_depth': randint(1, 20),
    'min_samples_split': randint(2, 20),
    'min_samples_leaf': randint(1, 20),
    'max_features': ['sqrt', 'log2'],
    'criterion': ['gini', 'entropy'],
    'splitter': ['best', 'random']
}

# Set up RandomizedSearchCV with cross-validation: 100 sampled candidates,
# 5 folds each, fixed seed, all cores. No `scoring` is given, so candidates
# are ranked by the estimator's own score method.
random_search = RandomizedSearchCV(
    estimator=dt_model,
    param_distributions=param_dist,
    n_iter=100,
    cv=5,
    verbose=1,
    random_state=42,
    n_jobs=-1
)

# Fit RandomizedSearchCV on the training split (labels = argmax of the one-hot outcomes).
random_search.fit(X_train, np.argmax(y_train.values, axis=1))

# Get the best hyperparameters and the best score.
print("Best Hyperparameters:", random_search.best_params_)
print("Best Cross-Validation Accuracy:", random_search.best_score_)

In [74]:
# Build the submission file from the decision-tree model.
test_home = pd.read_csv('/Test_Data/test_home_team_statistics_df.csv', index_col=0)
test_away = pd.read_csv('/Test_Data/test_away_team_statistics_df.csv', index_col=0)

test_home.columns = 'HOME_' + test_home.columns
test_away.columns = 'AWAY_' + test_away.columns

test_data = pd.concat([test_home, test_away], join='inner', axis=1)

# Preprocess with TRAINING statistics: the tree was fitted on features imputed
# and standardised from train_data, so the test rows must go through the same
# means and scaling. A dedicated scaler is fitted on train_data here because
# the global `scaler` is reassigned by other cells and may no longer hold the
# training statistics when this cell runs.
test_features = test_data[train_data.columns].replace([np.inf, -np.inf], np.nan)
test_data_scaled = test_features.fillna(train_data.mean())
train_scaler = StandardScaler().fit(train_data)
test_data_scaled = train_scaler.transform(test_data_scaled)

# Predicted class index per test row.
predictions = dt_model.predict(test_data_scaled)

# One-hot encode the predicted class (0 = HOME_WINS, 1 = DRAW, 2 = AWAY_WINS),
# indexed like the test rows.
submission = pd.DataFrame(
    {
        'HOME_WINS': (predictions == 0).astype(int),
        'DRAW': (predictions == 1).astype(int),
        'AWAY_WINS': (predictions == 2).astype(int),
    },
    index=test_data.index,
)

# Save the submission with the row identifier as the first column.
submission = submission.reset_index()
submission.to_csv('/benchmark_submission_decisiontree.csv', index=False)