# CATBOOST TRAINING 

In [1]:
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error
import numpy as np
import time
import pandas as pd 


In [2]:
# Read the training features, the training target and the test features from
# CSV files. The files are taken as-is: no preprocessing happens in this cell.
# NOTE(review): `y` is loaded as a DataFrame, not a Series — presumably it
# holds a single target column; confirm against the file that wrote y.csv.
X, y, X_test = (
    pd.read_csv('../data/X.csv'),
    pd.read_csv('../data/y.csv'),
    pd.read_csv('../data/X_test.csv'),
)

In [8]:
# 5-fold cross-validated CatBoost training.
#
# Produces:
#   oof_predictions  - one out-of-fold prediction per training row
#   test_predictions - test-set predictions averaged over all fold models
#   rmse_scores      - validation score of each fold
kf = KFold(n_splits=5, shuffle=True, random_state=52)
n_folds = kf.get_n_splits()

rmse_scores = []
test_predictions = np.zeros(len(X_test))  # Accumulates the fold-averaged test predictions
oof_predictions = np.zeros(len(X))  # Out-of-fold predictions

cat_features = ['Sex']

model_params = {
    'iterations': 2000,
    'learning_rate': 0.02,
    'depth': 10,
    'l2_leaf_reg': 3,
    'random_seed': 52,
    'eval_metric': 'RMSE',
    'early_stopping_rounds': 100,
    'cat_features': cat_features,
    'verbose': 100
}


print("=== Training CatBoost ===")


for fold, (train_idx, valid_idx) in enumerate(kf.split(X, y)):
    print(f"\nFold {fold+1}")

    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_val, y_val = X.iloc[valid_idx], y.iloc[valid_idx]

    # A fresh model per fold so that no state leaks between folds.
    model = CatBoostRegressor(**model_params)

    start_time = time.time()

    # The validation fold drives early stopping, and use_best_model keeps the
    # iteration with the best validation metric.
    model.fit(
        X_train, y_train,
        eval_set=(X_val, y_val),
        use_best_model=True
    )

    train_time = time.time() - start_time

    fold_val_predictions = model.predict(X_val)
    oof_predictions[valid_idx] = fold_val_predictions

    # Accumulate the average over folds. A plain assignment here would discard
    # every fold except the last one and leave the accumulator unused.
    test_predictions += model.predict(X_test) / n_folds

    # This is a plain RMSE on the target as stored in y.csv. It is reported as
    # RMSLE, which is only accurate if the target was log1p-transformed
    # upstream — TODO confirm.
    fold_rmsle = np.sqrt(mean_squared_error(y_val, fold_val_predictions))
    rmse_scores.append(fold_rmsle)

    print(f"Fold {fold+1} RMSLE: {fold_rmsle:.4f}")
    print(f"Training time: {train_time:.1f} sec")


mean_rmsle = np.mean(rmse_scores)
std_rmsle = np.std(rmse_scores)
print(f"\nCatBoost Performance:")
print(f"Mean RMSLE: {mean_rmsle:.4f} ± {std_rmsle:.4f}")

=== Training CatBoost ===

Fold 1
0:	learn: 0.9448048	test: 0.9442074	best: 0.9442074 (0)	total: 41.9ms	remaining: 1m 23s
100:	learn: 0.1541874	test: 0.1543566	best: 0.1543566 (100)	total: 3.98s	remaining: 1m 14s
200:	learn: 0.0664084	test: 0.0669874	best: 0.0669874 (200)	total: 8.29s	remaining: 1m 14s
300:	learn: 0.0608279	test: 0.0614700	best: 0.0614700 (300)	total: 12.2s	remaining: 1m 9s
400:	learn: 0.0597972	test: 0.0606018	best: 0.0606018 (400)	total: 16.7s	remaining: 1m 6s
500:	learn: 0.0592322	test: 0.0602382	best: 0.0602382 (500)	total: 22.4s	remaining: 1m 7s
600:	learn: 0.0587874	test: 0.0600039	best: 0.0600039 (600)	total: 26.8s	remaining: 1m 2s
700:	learn: 0.0584336	test: 0.0598613	best: 0.0598613 (700)	total: 31s	remaining: 57.4s
800:	learn: 0.0581117	test: 0.0597518	best: 0.0597518 (800)	total: 35.7s	remaining: 53.5s
900:	learn: 0.0578410	test: 0.0596629	best: 0.0596629 (900)	total: 40s	remaining: 48.7s
1000:	learn: 0.0575939	test: 0.0595991	best: 0.0595991 (1000)	total: 4

## Submissions

In [9]:
import os


# Write the test predictions into a copy of the sample submission and save the
# out-of-fold predictions next to it under the same run index.
submission = pd.read_csv('../../dataset/sample_submission.csv')
# expm1 is the inverse of log1p.
# NOTE(review): this assumes the model was trained on a log1p-transformed
# target — confirm against the preprocessing that produced y.csv.
predictions = np.expm1(test_predictions)

submission_dir = "../predictions/submissions"
oof_dir = "../predictions/oof"
# Create the output folders up front so a fresh checkout does not fail at save time.
os.makedirs(submission_dir, exist_ok=True)
os.makedirs(oof_dir, exist_ok=True)

# Pick the first run index that is unused by BOTH artifacts. Probing only the
# CSV path could silently overwrite an existing OOF file with the same index.
i = 1
while (
    os.path.exists(f"{submission_dir}/cat_submission_{i}.csv")
    or os.path.exists(f"{oof_dir}/cat_oof_predictions_{i}.npy")
):
    i += 1

submission["Calories"] = predictions

csv_filename = f"{submission_dir}/cat_submission_{i}.csv"
submission.to_csv(csv_filename, index=False)
print(f"Fichier CSV enregistré : {csv_filename}")

npy_filename = f"{oof_dir}/cat_oof_predictions_{i}.npy"
np.save(npy_filename, oof_predictions)
print(f"Fichier NumPy enregistré : {npy_filename}")

Fichier CSV enregistré : ../predictions/submissions/cat_submission_2.csv
Fichier NumPy enregistré : ../predictions/oof/cat_oof_predictions_2.npy
