In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_log_error
from lightgbm import LGBMRegressor
import warnings

In [2]:
train_df = pd.read_csv('/kaggle/input/playground-series-s5e5/train.csv')
test_df = pd.read_csv('/kaggle/input/playground-series-s5e5/test.csv')
sample_submission = pd.read_csv('/kaggle/input/playground-series-s5e5/sample_submission.csv')
print("Data loaded successfully!")
print(f"Train data shape: {train_df.shape}")
print(f"Test data shape: {test_df.shape}")

Data loaded successfully!
Train data shape: (750000, 9)
Test data shape: (250000, 8)


In [3]:
def feature_engineer(df):
    df_copy = df.copy()
    df_copy['BMI'] = df_copy['Weight'] / ((df_copy['Height'] / 100)**2)

    df_copy['Duration_HeartRate_Interaction'] = df_copy['Duration'] * df_copy['Heart_Rate']
    df_copy['Duration_BodyTemp_Interaction'] = df_copy['Duration'] * df_copy['Body_Temp']
    df_copy['HeartRate_BodyTemp_Interaction'] = df_copy['Heart_Rate'] * df_copy['Body_Temp']
    df_copy['Age_Duration_Interaction'] = df_copy['Age'] * df_copy['Duration']

    df_copy['Duration_sq'] = df_copy['Duration']**2
    df_copy['Heart_Rate_sq'] = df_copy['Heart_Rate']**2
    df_copy['Body_Temp_sq'] = df_copy['Body_Temp']**2
    return df_copy

In [4]:
train_df_fe = feature_engineer(train_df.copy())
test_df_fe = feature_engineer(test_df.copy())

In [5]:
X = train_df_fe.drop(['id', 'Calories'], axis=1)
y = train_df_fe['Calories']
X_test = test_df_fe.drop('id', axis=1)

In [6]:
train_cols = X.columns
test_cols = X_test.columns

missing_in_test = set(train_cols) - set(test_cols)
for c in missing_in_test:
    X_test[c] = 0

missing_in_train = set(test_cols) - set(train_cols)
for c in missing_in_train:
    X[c] = 0

X_test = X_test[train_cols]

In [7]:
numerical_cols = X.select_dtypes(include=np.number).columns.tolist()
categorical_cols = X.select_dtypes(include='object').columns.tolist()

In [8]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ],
    remainder='passthrough'
)

In [9]:
def rmsle(y_true, y_pred):
    y_pred[y_pred < 0] = 0
    return np.sqrt(mean_squared_log_error(y_true, y_pred))
y_transformed = np.log1p(y)

In [10]:
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
models = {
    'LGBM': LGBMRegressor(random_state=42, n_estimators=1000, learning_rate=0.05, num_leaves=31, n_jobs=-1),
    'XGBoost': XGBRegressor(random_state=42, n_estimators=1000, learning_rate=0.05, max_depth=6, n_jobs=-1, tree_method='hist'),
    'CatBoost': CatBoostRegressor(random_state=42, n_estimators=1000, learning_rate=0.05, verbose=0)
}


In [11]:
kf = KFold(n_splits=5, shuffle=True, random_state=42)

oof_preds_dict = {name: np.zeros(len(X)) for name in models.keys()}
test_preds_dict = {name: np.zeros(len(X_test)) for name in models.keys()}
model_scores = {name: [] for name in models.keys()}
for fold, (train_index, val_index) in enumerate(kf.split(X, y_transformed)):
    X_train, X_val = X.iloc[train_index], X.iloc[val_index]
    y_train, y_val = y_transformed.iloc[train_index], y_transformed.iloc[val_index]

    for model_name, model_instance in models.items():
        if model_name == 'CatBoost':
            catboost_preprocessor = ColumnTransformer(
                transformers=[
                    ('num', StandardScaler(), numerical_cols),
                ],
                remainder='passthrough'
            )

            X_train_transformed = catboost_preprocessor.fit_transform(X_train)
            X_val_transformed = catboost_preprocessor.transform(X_val)

            cat_feature_indices_for_catboost = []
            if 'Sex' in categorical_cols:
                cat_feature_indices_for_catboost.append(len(numerical_cols))

            model_instance.fit(X_train_transformed, y_train, cat_features=cat_feature_indices_for_catboost)

            val_preds_transformed = model_instance.predict(X_val_transformed)
            oof_preds_dict[model_name][val_index] = val_preds_transformed

            test_fold_preds_transformed = model_instance.predict(catboost_preprocessor.transform(X_test))
            test_preds_dict[model_name] += test_fold_preds_transformed / kf.n_splits

        else:
            lgbm_xgb_preprocessor = ColumnTransformer(
                transformers=[
                    ('num', StandardScaler(), numerical_cols),
                    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols) # Keep OneHotEncoder for LGBM/XGBoost
                ],
                remainder='passthrough'
            )
            current_pipeline = Pipeline(steps=[
                ('preprocessor', lgbm_xgb_preprocessor),
                ('regressor', model_instance)
            ])
            current_pipeline.fit(X_train, y_train)

            val_preds_transformed = current_pipeline.predict(X_val)
            oof_preds_dict[model_name][val_index] = val_preds_transformed

            test_fold_preds_transformed = current_pipeline.predict(X_test)
            test_preds_dict[model_name] += test_fold_preds_transformed / kf.n_splits

        val_preds_actual = np.expm1(val_preds_transformed)
        y_val_actual = np.expm1(y_val)
        val_preds_actual[val_preds_actual < 0] = 0

        fold_score = rmsle(y_val_actual, val_preds_actual)
        model_scores[model_name].append(fold_score)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.042128 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1760
[LightGBM] [Info] Number of data points in the train set: 600000, number of used features: 16
[LightGBM] [Info] Start training from score 4.141163
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.039204 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1765
[LightGBM] [Info] Number of data points in the train set: 600000, number of used features: 16
[LightGBM] [Info] Start training from score 4.141466
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.041340 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1760
[LightGBM] [Info] Number of data points in the train set: 600000, number of used features: 16
[LightGBM] [Info] Start 

In [12]:
print("\n--- Individual Model Average RMSLE Scores ---")
average_scores = {}
for name, scores in model_scores.items():
    avg_score = np.mean(scores)
    std_score = np.std(scores)
    average_scores[name] = avg_score
    print(f"  {name}: {avg_score:.5f} +/- {std_score:.5f}")


--- Individual Model Average RMSLE Scores ---
  LGBM: 0.06027 +/- 0.00020
  XGBoost: 0.06021 +/- 0.00014
  CatBoost: 0.05999 +/- 0.00024


In [13]:
total_inverse_score = sum(1/score for score in average_scores.values())
weights = {name: (1/score) / total_inverse_score for name, score in average_scores.items()}

print("\n--- Ensemble Weights ---")
for name, weight in weights.items():
    print(f"  {name}: {weight:.4f}")


--- Ensemble Weights ---
  LGBM: 0.3327
  XGBoost: 0.3330
  CatBoost: 0.3343


In [14]:
final_ensemble_test_preds_transformed = np.zeros(len(X_test))
for model_name, weight in weights.items():
    final_ensemble_test_preds_transformed += test_preds_dict[model_name] * weight

final_ensemble_predictions = np.expm1(final_ensemble_test_preds_transformed)
final_ensemble_predictions[final_ensemble_predictions < 0] = 0

In [15]:
ensemble_oof_preds_transformed = np.zeros(len(X))
for model_name, weight in weights.items():
    ensemble_oof_preds_transformed += oof_preds_dict[model_name] * weight

ensemble_oof_preds_actual = np.expm1(ensemble_oof_preds_transformed)
ensemble_oof_preds_actual[ensemble_oof_preds_actual < 0] = 0
overall_ensemble_rmsle = rmsle(y, ensemble_oof_preds_actual)
print(f"\nOverall Ensemble OOF RMSLE: {overall_ensemble_rmsle:.5f}")
submission_df = pd.DataFrame({'id': test_df['id'], 'Calories': final_ensemble_predictions})
submission_df.to_csv('submission.csv', index=False)

print("\nSubmission file 'submission.csv' created successfully!")
print(submission_df.head())


Overall Ensemble OOF RMSLE: 0.05951

Submission file 'submission.csv' created successfully!
       id    Calories
0  750000   27.412745
1  750001  108.022242
2  750002   87.190516
3  750003  125.665365
4  750004   76.022609
