In [1]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from lightgbm import LGBMRegressor
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_log_error
from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [2]:
# Directory holding the competition files. Change this one constant to run
# the notebook against a different location.
DATA_DIR = '/kaggle/input/playground-series-s5e5'

train_df = pd.read_csv(f'{DATA_DIR}/train.csv')
test_df = pd.read_csv(f'{DATA_DIR}/test.csv')
sample_submission = pd.read_csv(f'{DATA_DIR}/sample_submission.csv')
print("Data loaded successfully!")
print(f"Train data shape: {train_df.shape}")
print(f"Test data shape: {test_df.shape}")

Data loaded successfully!
Train data shape: (750000, 9)
Test data shape: (250000, 8)


In [3]:
# Print the column summary (dtypes, non-null counts, memory) of each split,
# train first and then test.
for split_frame in (train_df, test_df):
    split_frame.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 750000 entries, 0 to 749999
Data columns (total 9 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   id          750000 non-null  int64  
 1   Sex         750000 non-null  object 
 2   Age         750000 non-null  int64  
 3   Height      750000 non-null  float64
 4   Weight      750000 non-null  float64
 5   Duration    750000 non-null  float64
 6   Heart_Rate  750000 non-null  float64
 7   Body_Temp   750000 non-null  float64
 8   Calories    750000 non-null  float64
dtypes: float64(6), int64(2), object(1)
memory usage: 51.5+ MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250000 entries, 0 to 249999
Data columns (total 8 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   id          250000 non-null  int64  
 1   Sex         250000 non-null  object 
 2   Age         250000 non-null  int64  
 3   Height      250000 non-null  float64
 4 

In [4]:
# Preview the leading rows of each split as plain text, train first.
for split_frame in (train_df, test_df):
    print(split_frame.head())

   id     Sex  Age  Height  Weight  Duration  Heart_Rate  Body_Temp  Calories
0   0    male   36   189.0    82.0      26.0       101.0       41.0     150.0
1   1  female   64   163.0    60.0       8.0        85.0       39.7      34.0
2   2  female   51   161.0    64.0       7.0        84.0       39.8      29.0
3   3    male   20   192.0    90.0      25.0       105.0       40.7     140.0
4   4  female   38   166.0    61.0      25.0       102.0       40.6     146.0
       id     Sex  Age  Height  Weight  Duration  Heart_Rate  Body_Temp
0  750000    male   45   177.0    81.0       7.0        87.0       39.8
1  750001    male   26   200.0    97.0      20.0       101.0       40.5
2  750002  female   29   188.0    85.0      16.0       102.0       40.4
3  750003  female   39   172.0    73.0      20.0       107.0       40.6
4  750004  female   30   173.0    67.0      16.0        94.0       40.5


In [5]:
# Per-column count of missing values, train first and then test.
for split_frame in (train_df, test_df):
    print(split_frame.isnull().sum())

id            0
Sex           0
Age           0
Height        0
Weight        0
Duration      0
Heart_Rate    0
Body_Temp     0
Calories      0
dtype: int64
id            0
Sex           0
Age           0
Height        0
Weight        0
Duration      0
Heart_Rate    0
Body_Temp     0
dtype: int64


In [6]:
def feature_engineer(df):
    """Return a copy of ``df`` extended with derived feature columns.

    The input frame is not modified. The following columns are appended,
    in this order:

    - ``BMI``: Weight divided by the square of (Height / 100).
    - Pairwise products: Duration x Heart_Rate, Duration x Body_Temp,
      Heart_Rate x Body_Temp and Age x Duration.
    - Squares of Duration, Heart_Rate and Body_Temp.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``Age``, ``Height``, ``Weight``,
        ``Duration``, ``Heart_Rate`` and ``Body_Temp``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the original columns followed by the derived ones.
    """
    duration = df['Duration']
    heart_rate = df['Heart_Rate']
    body_temp = df['Body_Temp']
    height_scaled = df['Height'] / 100

    # `assign` works on a copy and keeps keyword order, so the resulting
    # column layout matches adding the columns one at a time.
    return df.assign(
        BMI=df['Weight'] / (height_scaled**2),
        Duration_HeartRate_Interaction=duration * heart_rate,
        Duration_BodyTemp_Interaction=duration * body_temp,
        HeartRate_BodyTemp_Interaction=heart_rate * body_temp,
        Age_Duration_Interaction=df['Age'] * duration,
        Duration_sq=duration**2,
        Heart_Rate_sq=heart_rate**2,
        Body_Temp_sq=body_temp**2,
    )

In [7]:
# feature_engineer() already works on its own copy of the input, so the raw
# frames are passed directly rather than being duplicated a second time.
train_df_fe = feature_engineer(train_df)
test_df_fe = feature_engineer(test_df)

In [8]:
# Separate the target from the predictors; the row identifier is removed
# from both feature matrices.
y = train_df_fe['Calories']
X = train_df_fe.drop(columns=['id', 'Calories'])
X_test = test_df_fe.drop(columns='id')

In [9]:
# Capture the column labels of both feature matrices.
train_cols, test_cols = X.columns, X_test.columns

In [10]:
# Columns present on only one side of the train/test split.
missing_in_test = set(train_cols) - set(test_cols)
missing_in_train = set(test_cols) - set(train_cols)

# Align the test matrix to the training schema in a single step:
#   * columns the test set lacks are created and filled with 0,
#   * columns only the test set has are dropped,
#   * column order follows `train_cols`.
# `X` is left untouched. Adding test-only columns to it (as zeros) would
# leave X and X_test with different columns, because X_test is restricted
# to `train_cols` anyway.
X_test = X_test.reindex(columns=train_cols, fill_value=0)

In [11]:
# Report the final feature list and how many features there are.
feature_names = X.columns.tolist()
print(f"\nFeatures after engineering and alignment: {feature_names}")
print(f"Number of features: {len(feature_names)}")


Features after engineering and alignment: ['Sex', 'Age', 'Height', 'Weight', 'Duration', 'Heart_Rate', 'Body_Temp', 'BMI', 'Duration_HeartRate_Interaction', 'Duration_BodyTemp_Interaction', 'HeartRate_BodyTemp_Interaction', 'Age_Duration_Interaction', 'Duration_sq', 'Heart_Rate_sq', 'Body_Temp_sq']
Number of features: 15


In [12]:
# Partition the feature names by dtype: numeric columns vs. object columns.
numerical_cols = list(X.select_dtypes(include=np.number).columns)
categorical_cols = list(X.select_dtypes(include='object').columns)

In [13]:
# Preprocessing: standardise the numeric columns and one-hot encode the
# categorical ones (handle_unknown='ignore' so categories not seen during
# fit do not raise at transform time).
numeric_step = ('num', StandardScaler(), numerical_cols)
categorical_step = ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

In [14]:
def rmsle(y_true, y_pred):
    """Root mean squared logarithmic error with predictions floored at zero.

    Negative predictions are clipped to 0 before scoring. The clipping is
    done on a new array, so the caller's ``y_pred`` is never modified, and
    any array-like (ndarray, Series, list) is accepted.

    Parameters
    ----------
    y_true : array-like
        Ground-truth target values.
    y_pred : array-like
        Predicted target values; may contain negatives.

    Returns
    -------
    float
        Square root of ``mean_squared_log_error(y_true, clipped_predictions)``.
    """
    # np.clip returns a new object instead of writing into y_pred.
    clipped_pred = np.clip(y_pred, 0, None)
    return np.sqrt(mean_squared_log_error(y_true, clipped_pred))

In [15]:
# LightGBM regressor chained behind the preprocessing step.
lgbm = LGBMRegressor(
    n_estimators=1000,
    learning_rate=0.05,
    num_leaves=31,
    random_state=42,
)
model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', lgbm),
])

In [16]:
# Five shuffled folds; the fixed seed keeps the splits reproducible.
kf = KFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)

In [17]:
# Zero-initialised buffers sized to the train and test matrices, plus an
# empty list for collecting models.
oof_preds = np.zeros(X.shape[0])
test_preds = np.zeros(X_test.shape[0])
models = list()

In [18]:
print("\n--- Starting K-Fold Cross-Validation ---")
fold_rmsle_scores = []
# Reset the accumulators in place so that re-running this cell starts clean
# instead of adding to the test predictions / model list of an earlier run.
models.clear()
test_preds[:] = 0.0
# The model is trained on log1p(target) and predictions are mapped back with expm1.
y_transformed = np.log1p(y)

for fold, (train_index, val_index) in enumerate(kf.split(X, y_transformed)):
    print(f"Fold {fold+1}")
    X_train, X_val = X.iloc[train_index], X.iloc[val_index]
    y_train = y_transformed.iloc[train_index]
    # Score against the original targets rather than expm1(log1p(y)).
    y_val_actual = y.iloc[val_index]

    # Fit a fresh clone for every fold. Re-fitting `model_pipeline` itself and
    # appending it would store the same object once per fold, every entry
    # reflecting only the most recent fit.
    fold_pipeline = clone(model_pipeline)
    fold_pipeline.fit(X_train, y_train)
    models.append(fold_pipeline)

    val_preds_transformed = fold_pipeline.predict(X_val)
    # Back to the original scale, floored at zero.
    val_preds = np.clip(np.expm1(val_preds_transformed), 0, None)
    fold_score = rmsle(y_val_actual, val_preds)
    fold_rmsle_scores.append(fold_score)
    print(f"  Fold {fold+1} RMSLE: {fold_score:.4f}")

    # Out-of-fold and test predictions are stored in log space; the test
    # predictions are averaged over the folds.
    oof_preds[val_index] = val_preds_transformed
    test_fold_preds_transformed = fold_pipeline.predict(X_test)
    test_preds += test_fold_preds_transformed / kf.n_splits
print(f"\nAverage K-Fold RMSLE: {np.mean(fold_rmsle_scores):.4f} +/- {np.std(fold_rmsle_scores):.4f}")



--- Starting K-Fold Cross-Validation ---
Fold 1
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.051272 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1760
[LightGBM] [Info] Number of data points in the train set: 600000, number of used features: 16
[LightGBM] [Info] Start training from score 4.141163
  Fold 1 RMSLE: 0.0604
Fold 2
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.087611 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1765
[LightGBM] [Info] Number of data points in the train set: 600000, number of used features: 16
[LightGBM] [Info] Start training from score 4.141466
  Fold 2 RMSLE: 0.0605
Fold 3
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.047566 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1760
[LightGBM

In [19]:
# Map the out-of-fold predictions back from log space, floor them at zero
# and score them against the untransformed target.
oof_raw = np.expm1(oof_preds)
oof_preds_actual = np.where(oof_raw < 0, 0, oof_raw)
overall_rmsle = rmsle(y, oof_preds_actual)
print(f"Overall OOF RMSLE: {overall_rmsle:.4f}")

# Same back-transformation for the averaged test predictions, then write
# the submission file keyed by the test ids.
test_raw = np.expm1(test_preds)
final_test_predictions = np.where(test_raw < 0, 0, test_raw)
submission_df = pd.DataFrame({
    'id': test_df['id'],
    'Calories': final_test_predictions,
})
submission_df.to_csv('submission.csv', index=False)
print("\nSubmission file 'submission.csv' created successfully!")
print(submission_df.head())

Overall OOF RMSLE: 0.0603

Submission file 'submission.csv' created successfully!
       id    Calories
0  750000   27.594661
1  750001  108.340359
2  750002   86.662665
3  750003  126.134795
4  750004   76.119193
