In [1]:
# Kaggle API Dataset Download
# This section handles downloading and accessing the dataset for the Playground Series S5E5 competition

import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# This cell detects whether we're running on Kaggle or locally
IN_KAGGLE = os.path.exists('/kaggle/input')

if IN_KAGGLE:
    # If running on Kaggle, the data is already available in the /kaggle/input directory
    print("Running on Kaggle - dataset already available")

    # Competition data paths
    BASE_DIR = '/kaggle/input/playground-series-s5e5'

else:
    # If running locally, we need to download the data via the Kaggle API
    print("Running locally - downloading data via Kaggle API")

    # First, check if kaggle module is installed
    try:
        import kaggle
    except ImportError:
        print("Kaggle API not found. Installing...")
        !pip install kaggle
        import kaggle

    # Create directory for data if it doesn't exist
    os.makedirs('kaggle_data', exist_ok=True)

    # Download competition data
    # Note: You need to have your Kaggle API credentials in ~/.kaggle/kaggle.json
    # If not already set up, run the following commands in a cell:
    """
    # Run this if you haven't set up Kaggle API credentials:
    !mkdir -p ~/.kaggle
    !echo '{"username":"YOUR_USERNAME","key":"YOUR_KEY"}' > ~/.kaggle/kaggle.json
    !chmod 600 ~/.kaggle/kaggle.json
    """

    # Download all competition files
    !kaggle competitions download -c playground-series-s5e5 -p kaggle_data

    # Unzip the downloaded files
    import zipfile
    with zipfile.ZipFile('kaggle_data/playground-series-s5e5.zip', 'r') as zip_ref:
        zip_ref.extractall('kaggle_data')

    print("Dataset downloaded successfully!")

    # Set the base directory for data access
    BASE_DIR = 'kaggle_data'

# Now let's define paths to access the files in a consistent way
# This will work both on Kaggle and locally
train_path = os.path.join(BASE_DIR, 'train.csv')
test_path = os.path.join(BASE_DIR, 'test.csv')
sample_submission_path = os.path.join(BASE_DIR, 'sample_submission.csv')

# Load the datasets
train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)
sample_submission = pd.read_csv(sample_submission_path)

# Display basic information about the datasets
print("\n--- Dataset Information ---")
print(f"Training set shape: {train_df.shape}")
print(f"Test set shape: {test_df.shape}")
print(f"Sample submission shape: {sample_submission.shape}")

# Display a few rows of the training data
print("\n--- First few rows of training data ---")
train_df.head()

Running locally - downloading data via Kaggle API
playground-series-s5e5.zip: Skipping, found more recently modified local copy (use --force to force download)
Dataset downloaded successfully!

--- Dataset Information ---
Training set shape: (750000, 9)
Test set shape: (250000, 8)
Sample submission shape: (250000, 2)

--- First few rows of training data ---


Unnamed: 0,id,Sex,Age,Height,Weight,Duration,Heart_Rate,Body_Temp,Calories
0,0,male,36,189.0,82.0,26.0,101.0,41.0,150.0
1,1,female,64,163.0,60.0,8.0,85.0,39.7,34.0
2,2,female,51,161.0,64.0,7.0,84.0,39.8,29.0
3,3,male,20,192.0,90.0,25.0,105.0,40.7,140.0
4,4,female,38,166.0,61.0,25.0,102.0,40.6,146.0


In [2]:
def preprocess_df(df):
    # Original preprocessing
    if 'id' in df.columns:
        df = df.drop('id', axis=1)
    df['Intensity'] = df['Heart_Rate'] * df['Duration']
    df['TotalTemp'] = df['Body_Temp'] * df['Duration']
    df['Sex'] = pd.Categorical(df.Sex)
    
    # Calculate VO₂-Based Calorie Estimate
    # Step 1: Estimate VO₂_max (mL/kg/min) - using a simplified age-based estimate
    # A common simple estimate is 45.2 - 0.35*Age for men, slightly lower for women
    # We'll use an average approach of 40 - 0.25*Age for simplicity
    vo2_max = 40 - 0.25 * df['Age']
    
    # Step 2: Calculate estimated VO₂ during exercise based on heart rate
    # Assuming HR is linearly related to VO₂ consumption (simplified model)
    # MaxHR estimated as 208 - 0.7 * Age
    max_hr = 208 - 0.7 * df['Age']
    hr_percentage = df['Heart_Rate'] / max_hr
    vo2_estimate = vo2_max * hr_percentage
    
    # Step 3: Calculate total O₂ consumed in liters
    # Total O₂ = VO₂ (mL/kg/min) × Weight(kg) × Duration(min) / 1000
    total_o2_consumed = vo2_estimate * df['Weight'] * df['Duration'] / 1000
    
    # Step 4: Convert O₂ to calories (5 kcal per liter of O₂)
    df['VO2_Calories'] = total_o2_consumed * 5
    
    return df

train_df = preprocess_df(train_df)
test_df = preprocess_df(test_df)
cats=['Sex']
train_df.head()

Unnamed: 0,id,Sex,Age,Height,Weight,Duration,Heart_Rate,Body_Temp,Calories,Intensity,TotalTemp,VO2_Calories
0,0,male,36,189.0,82.0,26.0,101.0,41.0,150.0,2626.0,1066.0,182.584573
1,1,female,64,163.0,60.0,8.0,85.0,39.7,34.0,680.0,317.6,30.0
2,2,female,51,161.0,64.0,7.0,84.0,39.8,29.0,588.0,278.6,29.758328
3,3,male,20,192.0,90.0,25.0,105.0,40.7,140.0,2625.0,1017.5,213.112113
4,4,female,38,166.0,61.0,25.0,102.0,40.6,146.0,2550.0,1015.0,130.76833


In [3]:
import sys
!{sys.executable} -m pip install xgboost
import xgboost as xgb
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_squared_log_error
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd

print("Training XGBoost with robust RMSLE optimization...")
X = train_df.drop('Calories', axis=1)
y = train_df['Calories']

# Create train/validation split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Identify categorical features
cat_features = ['Sex']

# Apply one-hot encoding for categorical features (XGBoost doesn't handle categorical features directly like CatBoost)
X_train_encoded = pd.get_dummies(X_train, columns=cat_features, drop_first=True)
X_val_encoded = pd.get_dummies(X_val, columns=cat_features, drop_first=True)

# Log-transform targets
y_train_log = np.log1p(np.maximum(0, y_train))
y_val_log = np.log1p(np.maximum(0, y_val))

# XGBoost configuration - parameters chosen to be similar to the CatBoost setup
xgb_model = XGBRegressor(
    n_estimators=1000,
    learning_rate=0.01,        # Reduced learning rate for stability
    max_depth=6,               # Similar depth as CatBoost
    objective='reg:squarederror',  # MSE objective for log-transformed data
    eval_metric='rmse',        # Standard RMSE evaluation
    random_state=42,
    verbosity=1,
    reg_lambda=5,              # L2 regularization similar to l2_leaf_reg
    min_child_weight=10,       # Similar to min_data_in_leaf
    subsample=0.8,             # Add some subsampling for robustness
    colsample_bytree=0.8       # Feature subsampling
)

# Train the model on log-transformed targets with early stopping
# Note: early_stopping_rounds should be provided as a parameter to fit_params, not directly to fit()
eval_set = [(X_val_encoded, y_val_log)]
xgb_model.fit(
    X_train_encoded, 
    y_train_log,
    eval_set=eval_set,
)

# Make predictions (on log scale) and transform back
val_predictions_log = xgb_model.predict(X_val_encoded)
val_predictions = np.expm1(val_predictions_log)  # expm1 is inverse of log1p

# Ensure predictions are non-negative (should already be due to exp transform)
val_predictions = np.maximum(0, val_predictions)

# Calculate RMSLE directly
def rmsle(y_true, y_pred):
    # Ensure inputs are positive
    y_true = np.maximum(0, y_true)
    y_pred = np.maximum(0, y_pred)
    return np.sqrt(mean_squared_log_error(y_true, y_pred))

# Evaluate the model
val_mse = mean_squared_error(y_val, val_predictions)
val_rmse = np.sqrt(val_mse)
val_rmsle = rmsle(y_val, val_predictions)
val_r2 = r2_score(y_val, val_predictions)

print(f"Validation MSE: {val_mse:.2f}")
print(f"Validation RMSE: {val_rmse:.2f}")
print(f"Validation RMSLE: {val_rmsle:.4f}")  # This is your target metric
print(f"Validation R²: {val_r2:.4f}")

# Feature importance
importance = xgb_model.feature_importances_
feature_names = X_train_encoded.columns
importance_df = sorted(zip(feature_names, importance), key=lambda x: x[1], reverse=True)

print("\nFeature Importance:")
for name, importance in importance_df:
    print(f"{name}: {importance}")

# If you need to predict on test data later
# test_encoded = pd.get_dummies(test_df, columns=cat_features, drop_first=True)
# test_predictions_log = xgb_model.predict(test_encoded)
# test_predictions = np.expm1(test_predictions_log)
# test_predictions = np.maximum(0, test_predictions)

Training XGBoost with robust RMSLE optimization...
[0]	validation_0-rmse:0.95309
[1]	validation_0-rmse:0.94376
[2]	validation_0-rmse:0.93448
[3]	validation_0-rmse:0.92531
[4]	validation_0-rmse:0.91623
[5]	validation_0-rmse:0.90723
[6]	validation_0-rmse:0.89838
[7]	validation_0-rmse:0.88957
[8]	validation_0-rmse:0.88090
[9]	validation_0-rmse:0.87231
[10]	validation_0-rmse:0.86377
[11]	validation_0-rmse:0.85536
[12]	validation_0-rmse:0.84701
[13]	validation_0-rmse:0.83871
[14]	validation_0-rmse:0.83051
[15]	validation_0-rmse:0.82238
[16]	validation_0-rmse:0.81436
[17]	validation_0-rmse:0.80648
[18]	validation_0-rmse:0.79860
[19]	validation_0-rmse:0.79087
[20]	validation_0-rmse:0.78319
[21]	validation_0-rmse:0.77555
[22]	validation_0-rmse:0.76802
[23]	validation_0-rmse:0.76058
[24]	validation_0-rmse:0.75321
[25]	validation_0-rmse:0.74591
[26]	validation_0-rmse:0.73864
[27]	validation_0-rmse:0.73147
[28]	validation_0-rmse:0.72435
[29]	validation_0-rmse:0.71732
[30]	validation_0-rmse:0.7103

In [4]:
print("\nGenerating predictions for test set...")
X_test = test_df

# We need to apply the same one-hot encoding to the test data
# First, identify categorical features
cat_features = ['Sex']

# Apply one-hot encoding for categorical features
X_test_encoded = pd.get_dummies(X_test, columns=cat_features, drop_first=True)

# Ensure the columns match exactly with training data
# Get the columns from the trained model (might be accessible via feature_names_in_)
train_columns = X_train_encoded.columns

# Check if any columns are missing in the test data
missing_cols = set(train_columns) - set(X_test_encoded.columns)
# Add missing columns with default value of 0
for col in missing_cols:
    X_test_encoded[col] = 0
    
# Ensure columns are in the same order as training data
X_test_encoded = X_test_encoded[train_columns]

# Make predictions (these are still in log space)
test_predictions_log = xgb_model.predict(X_test_encoded)

# Transform back from log space to original scale
test_predictions = np.expm1(test_predictions_log)  # This is the inverse of log1p

# Ensure predictions are non-negative (although expm1 should always give positive values)
test_predictions = np.maximum(0, test_predictions)

# Create the submission dataframe
submission = pd.DataFrame({
    'id': test_df.index if 'id' not in test_df.columns else test_df['id'],
    'Calories': test_predictions
})

# Save to CSV
submission.to_csv('submission.csv', index=False)
print(f"Submission file created: submission.csv with {submission.shape[0]} rows")

# Display the first few rows
print("\nFirst few rows of the submission file:")
display(submission.head())


Generating predictions for test set...
Submission file created: submission.csv with 250000 rows

First few rows of the submission file:


Unnamed: 0,id,Calories
0,750000,27.239355
1,750001,108.752502
2,750002,87.439392
3,750003,126.763222
4,750004,76.271507


In [5]:
# Check if running on Kaggle or locally
IN_KAGGLE = os.path.exists('/kaggle/working')

if IN_KAGGLE:
    # If running on Kaggle, use the built-in submission mechanism
    print("Running on Kaggle - please use the 'Submit' button in the UI to submit your results")
else:
    # If running locally, use the Kaggle API to submit
    print("Submitting via Kaggle API...")
    
    # Ensure Kaggle API is installed
    try:
        import kaggle
    except ImportError:
        print("Kaggle API not found. Installing...")
        !pip install kaggle
        import kaggle
    
    # Submit the file
    # Note: Make sure you have Kaggle API credentials set up (~/.kaggle/kaggle.json)
    competition_name = "playground-series-s5e5"
    submission_message = "xgboost attempt"
    
    # Command to submit 
    submission_command = f"kaggle competitions submit -c {competition_name} -f submission.csv -m \"{submission_message}\""
    
    print(f"Running command: {submission_command}")
    !{submission_command}
    
    # Check your submissions (optional)
    print("\nYour recent submissions:")
    !kaggle competitions submissions -c {competition_name}
    
print("\nDone! 🎉")

Submitting via Kaggle API...
Running command: kaggle competitions submit -c playground-series-s5e5 -f submission.csv -m "xgboost attempt"
400 Client Error: Bad Request for url: https://www.kaggle.com/api/v1/competitions/submissions/submit/playground-series-s5e5

Your recent submissions:



  0%|          | 0.00/4.21M [00:00<?, ?B/s]
  9%|8         | 368k/4.21M [00:00<00:01, 3.56MB/s]
 47%|####7     | 2.00M/4.21M [00:00<00:00, 11.2MB/s]
 92%|#########1| 3.86M/4.21M [00:00<00:00, 10.6MB/s]
100%|##########| 4.21M/4.21M [00:00<00:00, 5.18MB/s]


fileName        date                        description          status                     publicScore  privateScore  
--------------  --------------------------  -------------------  -------------------------  -----------  ------------  
submission.csv  2025-05-16 17:21:13.507000  xgboost attempt      SubmissionStatus.COMPLETE  0.05912                    
submission.csv  2025-05-16 17:17:48.697000  xgboost attempt      SubmissionStatus.COMPLETE  0.05913                    
submission.csv  2025-05-16 16:34:49.147000  xgboost attempt      SubmissionStatus.COMPLETE  0.05863                    
submission.csv  2025-05-16 16:32:27.227000  xgboost attempt      SubmissionStatus.COMPLETE  0.05870                    
submission.csv  2025-05-16 16:25:22.277000  Improved Catboost    SubmissionStatus.COMPLETE  0.05923                    
submission.csv  2025-05-14 17:46:12.483000  Random Forest model  SubmissionStatus.COMPLETE  0.06327                    
submission.csv  2025-05-14 17:44:25.0700