In [None]:
# Kaggle API Dataset Download
# This section handles downloading and accessing the dataset for the Playground Series S5E5 competition

import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# This cell detects whether we're running on Kaggle or locally
IN_KAGGLE = os.path.exists('/kaggle/input')

if IN_KAGGLE:
    # If running on Kaggle, the data is already available in the /kaggle/input directory
    print("Running on Kaggle - dataset already available")

    # Competition data paths
    BASE_DIR = '/kaggle/input/playground-series-s5e5'

else:
    # If running locally, we need to download the data via the Kaggle API
    print("Running locally - downloading data via Kaggle API")

    # First, check if kaggle module is installed
    try:
        import kaggle
    except ImportError:
        print("Kaggle API not found. Installing...")
        !pip install kaggle
        import kaggle

    # Create directory for data if it doesn't exist
    os.makedirs('kaggle_data', exist_ok=True)

    # Download competition data
    # Note: You need to have your Kaggle API credentials in ~/.kaggle/kaggle.json
    # If not already set up, run the following commands in a cell:
    """
    # Run this if you haven't set up Kaggle API credentials:
    !mkdir -p ~/.kaggle
    !echo '{"username":"YOUR_USERNAME","key":"YOUR_KEY"}' > ~/.kaggle/kaggle.json
    !chmod 600 ~/.kaggle/kaggle.json
    """

    # Download all competition files
    !kaggle competitions download -c playground-series-s5e5 -p kaggle_data

    # Unzip the downloaded files
    import zipfile
    with zipfile.ZipFile('kaggle_data/playground-series-s5e5.zip', 'r') as zip_ref:
        zip_ref.extractall('kaggle_data')

    print("Dataset downloaded successfully!")

    # Set the base directory for data access
    BASE_DIR = 'kaggle_data'

# Build the three file paths relative to BASE_DIR so the same code works
# both on Kaggle and locally.
train_path, test_path, sample_submission_path = (
    os.path.join(BASE_DIR, filename)
    for filename in ('train.csv', 'test.csv', 'sample_submission.csv')
)

# Read each CSV into its own DataFrame.
train_df, test_df, sample_submission = map(
    pd.read_csv, (train_path, test_path, sample_submission_path)
)

# Report the size of every loaded table.
print("\n--- Dataset Information ---")
print(f"Training set shape: {train_df.shape}")
print(f"Test set shape: {test_df.shape}")
print(f"Sample submission shape: {sample_submission.shape}")

# Preview the training data (the last expression is rendered by the notebook).
print("\n--- First few rows of training data ---")
train_df.head()

In [None]:
def preprocess_df(df):
    """Return a copy of ``df`` with engineered features added.

    The input frame is left untouched, so re-running a cell that calls this
    function never changes a frame that an earlier cell displayed.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``Heart_Rate``, ``Duration``, ``Body_Temp``
        and ``Sex``.

    Returns
    -------
    pandas.DataFrame
        A new frame with two extra columns appended, ``Intensity``
        (``Heart_Rate * Duration``) and ``TotalTemp``
        (``Body_Temp * Duration``), and with ``Sex`` converted to a pandas
        categorical in its original column position.
    """
    # assign() builds a new DataFrame instead of writing into the caller's.
    return df.assign(
        Intensity=df['Heart_Rate'] * df['Duration'],
        TotalTemp=df['Body_Temp'] * df['Duration'],
        Sex=pd.Categorical(df['Sex']),
    )

# Run the same feature engineering over the training and the test frame.
train_df, test_df = (preprocess_df(frame) for frame in (train_df, test_df))

# Names of the categorical columns.
cats = ['Sex']

In [None]:
%pip install lightgbm

In [None]:
from lightgbm import LGBMRegressor
import lightgbm as lgb
from sklearn.metrics import mean_squared_error, r2_score, mean_squared_log_error
from sklearn.model_selection import train_test_split
import numpy as np

print("Training LightGBM with robust RMSLE optimization...")

# Categorical feature columns; LightGBM is given their positions in X below.
cat_features = ['Sex']

# Features are every column except the target. The categorical cast is applied
# to X itself (the frame that is actually fitted) rather than to train_df, so
# it takes effect for training and train_df stays in step with test_df.
# Casting a column that is already categorical leaves it unchanged.
X = train_df.drop('Calories', axis=1).astype({col: 'category' for col in cat_features})
y = train_df['Calories']
cat_indices = [X.columns.get_loc(col) for col in cat_features]  # LightGBM needs indices

# Create train/validation split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Optimising RMSE on log1p(target) is equivalent to optimising RMSLE on the
# original scale. Negative targets are clipped to 0 so log1p is well defined.
y_train_log = np.log1p(np.maximum(0, y_train))
y_val_log = np.log1p(np.maximum(0, y_val))

lgb_model = LGBMRegressor(
    n_estimators=1000,
    learning_rate=0.01,        # Reduced learning rate for stability
    max_depth=6,               # Maximum tree depth
    objective='rmse',          # Standard RMSE on log-transformed data
    metric='rmse',             # Standard RMSE evaluation
    random_state=42,
    verbosity=1,
    reg_lambda=5,              # L2 regularization
    min_child_samples=10,      # Minimum number of samples per leaf
    subsample=0.8,             # Row subsampling (bagging) fraction
    subsample_freq=1,          # Bagging is only active when this is > 0
    colsample_bytree=0.8,      # Feature subsampling for robustness
)

# Train the model on log-transformed targets; stop once the validation RMSE
# has not improved for 50 rounds.
lgb_model.fit(
    X_train, y_train_log,
    eval_set=[(X_val, y_val_log)],
    categorical_feature=cat_indices,
    callbacks=[
        lgb.early_stopping(stopping_rounds=50),
        lgb.log_evaluation(period=200)
    ]
)

# Make predictions (on log scale) and transform back
val_predictions_log = lgb_model.predict(X_val)
val_predictions = np.expm1(val_predictions_log)  # expm1 is inverse of log1p

# expm1 is negative for negative log-scale predictions, so clamp at 0 to keep
# the predictions valid for RMSLE.
val_predictions = np.maximum(0, val_predictions)

# Calculate RMSLE directly
def rmsle(y_true, y_pred):
    """Root mean squared logarithmic error between targets and predictions.

    Both inputs are clamped at 0 before the error is computed, and the result
    is the square root of ``mean_squared_log_error`` on the clamped values.
    """
    clamped_true, clamped_pred = (
        np.maximum(0, values) for values in (y_true, y_pred)
    )
    msle = mean_squared_log_error(clamped_true, clamped_pred)
    return np.sqrt(msle)

# Validation metrics, all computed on the original (back-transformed) scale.
val_r2 = r2_score(y_val, val_predictions)
val_rmsle = rmsle(y_val, val_predictions)
val_mse = mean_squared_error(y_val, val_predictions)
val_rmse = np.sqrt(val_mse)

print(f"Validation MSE: {val_mse:.2f}")
print(f"Validation RMSE: {val_rmse:.2f}")
print(f"Validation RMSLE: {val_rmsle:.4f}")  # This is your target metric
print(f"Validation R²: {val_r2:.4f}")

# Pair every feature with its importance score and rank them, highest first.
feature_names = X.columns
feature_importance = lgb_model.feature_importances_
importance_df = sorted(
    zip(feature_names, feature_importance),
    key=lambda pair: pair[1],
    reverse=True,
)

print("\nFeature Importance:")
for name, importance in importance_df:
    print(f"{name}: {importance}")

In [None]:
print("\nGenerating predictions for test set...")
X_test = test_df

# The model was fitted on log1p(target), so its raw output is on the log scale.
test_predictions_log = lgb_model.predict(X_test)

# Invert log1p with expm1, then clamp at 0: expm1 of a negative log-scale
# prediction is negative, which is not a valid value for this target.
test_predictions = np.maximum(0, np.expm1(test_predictions_log))

# Use the test ids when present, otherwise fall back to a positional index.
if 'id' in test_df.columns:
    submission_ids = test_df['id']
else:
    submission_ids = range(len(test_predictions))

submission = pd.DataFrame({
    'id': submission_ids,
    'Calories': test_predictions
})

# Write the submission file without the DataFrame index.
submission.to_csv('submission.csv', index=False)
print(f"Submission file created: submission.csv with {submission.shape[0]} rows")

# Preview the first few rows
submission.head()

In [None]:
# Check if running on Kaggle or locally
IN_KAGGLE = os.path.exists('/kaggle/working')

if IN_KAGGLE:
    # If running on Kaggle, use the built-in submission mechanism
    print("Running on Kaggle - please use the 'Submit' button in the UI to submit your results")
else:
    # If running locally, use the Kaggle API to submit
    print("Submitting via Kaggle API...")
    
    # Ensure Kaggle API is installed
    try:
        import kaggle
    except ImportError:
        print("Kaggle API not found. Installing...")
        !pip install kaggle
        import kaggle
    
    # Submit the file
    # Note: Make sure you have Kaggle API credentials set up (~/.kaggle/kaggle.json)
    competition_name = "playground-series-s5e5"
    submission_message = "Improved Catboost"
    
    # Command to submit 
    submission_command = f"kaggle competitions submit -c {competition_name} -f submission.csv -m \"{submission_message}\""
    
    print(f"Running command: {submission_command}")
    !{submission_command}
    
    # Check your submissions (optional)
    print("\nYour recent submissions:")
    !kaggle competitions submissions -c {competition_name}
    
print("\nDone! 🎉")