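# XGBOOST TRAINING

Trains an XGBoost regressor with 5-fold cross-validation, then writes a submission file and the out-of-fold predictions to disk.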

In [1]:
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
import numpy as np
import time
import pandas as pd 


In [2]:
# Read the training features, the training target and the test features,
# in that order, from the CSV files in the shared data directory.
X, y, X_test = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/X.csv', '../data/y.csv', '../data/X_test.csv')
)

In [3]:
# Cross validation
#
# Trains one XGBRegressor per fold, collects out-of-fold (OOF) predictions on
# the training rows and averages the per-fold predictions on the test set.
# Names produced for later cells: `test_predictions`, `oof_predictions`,
# `rmse_scores`, `avg_rmse`, `full_rmse` (plus `kf`, `params`, and the last
# fold's `model`).

# Single source of truth for the fold count: it drives the splitter, the
# progress banner and the divisor used to average the test predictions.
N_SPLITS = 5
kf = KFold(n_splits=N_SPLITS, shuffle=True, random_state=52)

rmse_scores = []  # validation RMSE of each fold, in fold order
test_predictions = np.zeros(len(X_test))  # running sum; averaged after the loop
oof_predictions = np.zeros(len(X))  # Out-of-fold predictions

params = {
    'max_depth': 10,
    'colsample_bytree': 0.7,
    'subsample': 0.9,
    'n_estimators': 2000,  # upper bound on boosting rounds; early stopping may end sooner
    'learning_rate': 0.02,
    'gamma': 0.01,
    'max_delta_step': 2,
    'eval_metric': "rmse",
    'early_stopping_rounds': 100,
    'random_state': 42,
    'enable_categorical': True
}

# Loop with cross validation.
# `start=1` makes `fold` 1-based so the banner and the per-fold summary agree.
for fold, (train_idx, val_idx) in enumerate(kf.split(X, y), start=1):
    print(f"\n{'='*20} Fold {fold}/{N_SPLITS} {'='*20}")
    # Split data (positional indices from KFold, hence .iloc)
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    start_time = time.time()

    # Model training; the validation fold is the early-stopping eval set.
    # NOTE(review): the same fold is also used for scoring below, so the
    # reported RMSE may be slightly optimistic.
    model = XGBRegressor(**params)
    model.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        verbose=100
    )

    # Validation predictions - store in out-of-fold array
    fold_val_predictions = model.predict(X_val)
    oof_predictions[val_idx] = fold_val_predictions

    # Test predictions: accumulate now, divide by N_SPLITS after the loop
    test_predictions += model.predict(X_test)

    # Metrics computation for this fold
    fold_rmse = np.sqrt(mean_squared_error(y_val, fold_val_predictions))
    rmse_scores.append(fold_rmse)

    # Measured from just before fit(), so it also includes the two predict()
    # calls and the metric computation above.
    elapsed_time = time.time() - start_time

    print(f"Fold {fold}:")
    print(f"RMSE: {fold_rmse:.4f}")
    print(f"Training time: {elapsed_time:.2f} seconds")

# Average test predictions across all folds
test_predictions /= N_SPLITS

# Calculate average RMSE across all folds
avg_rmse = np.mean(rmse_scores)
print(f"\nAverage Fold RMSE: {avg_rmse:.4f}")

# Calculate full CV RMSE using all out-of-fold predictions
full_rmse = np.sqrt(mean_squared_error(y, oof_predictions))
print(f"Final CV RMSE: {full_rmse:.4f}")


[0]	validation_0-rmse:0.94457
[100]	validation_0-rmse:0.14175
[200]	validation_0-rmse:0.06232
[300]	validation_0-rmse:0.05965
[400]	validation_0-rmse:0.05956
[500]	validation_0-rmse:0.05956
[593]	validation_0-rmse:0.05957
Fold 0:
RMSE: 0.0596
Training time: 15.20 seconds

[0]	validation_0-rmse:0.94421
[100]	validation_0-rmse:0.14197
[200]	validation_0-rmse:0.06237
[300]	validation_0-rmse:0.05964
[400]	validation_0-rmse:0.05954
[500]	validation_0-rmse:0.05954
[552]	validation_0-rmse:0.05955
Fold 1:
RMSE: 0.0595
Training time: 14.51 seconds

[0]	validation_0-rmse:0.94561
[100]	validation_0-rmse:0.14217
[200]	validation_0-rmse:0.06210
[300]	validation_0-rmse:0.05929
[400]	validation_0-rmse:0.05916
[500]	validation_0-rmse:0.05917
[556]	validation_0-rmse:0.05917
Fold 2:
RMSE: 0.0592
Training time: 14.34 seconds

[0]	validation_0-rmse:0.94566
[100]	validation_0-rmse:0.14200
[200]	validation_0-rmse:0.06385
[300]	validation_0-rmse:0.06147
[400]	validation_0-rmse:0.06140
[472]	validation_0-rms

## Submission 

In [7]:
import os

# Writes two artefacts that share one run index `i`:
#   * a CSV submission with the test predictions in the "Calories" column,
#   * a .npy file with the out-of-fold predictions.
submission_dir = "../predictions/submissions"
oof_dir = "../predictions/oof"

# Create the output directories up front so a fresh checkout does not fail
# at write time, after the expensive training has already run.
os.makedirs(submission_dir, exist_ok=True)
os.makedirs(oof_dir, exist_ok=True)

submission = pd.read_csv('../../dataset/sample_submission.csv')

# Store the back-transformed values under a new name instead of overwriting
# `test_predictions`: the cell is then idempotent, and re-running it no longer
# applies expm1 a second time to already-transformed values.
submission_predictions = np.expm1(test_predictions)

# Pick the first index that is free in BOTH output directories, so the CSV and
# the .npy of one run always share a number and no earlier file is overwritten.
i = 1
while (
    os.path.exists(f"{submission_dir}/xgb_submission_{i}.csv")
    or os.path.exists(f"{oof_dir}/xgb_oof_predictions_{i}.npy")
):
    i += 1

submission["Calories"] = submission_predictions

csv_filename = f"{submission_dir}/xgb_submission_{i}.csv"
submission.to_csv(csv_filename, index=False)
print(f"Fichier CSV enregistré : {csv_filename}")

# The OOF predictions are saved as produced by the CV cell, i.e. without the
# expm1 applied to the submission values.
# NOTE(review): confirm downstream consumers expect that scale.
npy_filename = f"{oof_dir}/xgb_oof_predictions_{i}.npy"
np.save(npy_filename, oof_predictions)
print(f"Fichier NumPy enregistré : {npy_filename}")

Fichier CSV enregistré : ../predictions/submissions/xgb_submission_2.csv
Fichier NumPy enregistré : ../predictions/oof/xgb_oof_predictions_2.npy
