In [None]:
# Kaggle API Dataset Download
# This section handles downloading and accessing the dataset for the Playground Series S5E5 competition

import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# This cell detects whether we're running on Kaggle or locally
IN_KAGGLE = os.path.exists('/kaggle/input')

if IN_KAGGLE:
    # If running on Kaggle, the data is already available in the /kaggle/input directory
    print("Running on Kaggle - dataset already available")

    # Competition data paths
    BASE_DIR = '/kaggle/input/playground-series-s5e5'

else:
    # If running locally, we need to download the data via the Kaggle API
    print("Running locally - downloading data via Kaggle API")

    # First, check if kaggle module is installed
    try:
        import kaggle
    except ImportError:
        print("Kaggle API not found. Installing...")
        !pip install kaggle
        import kaggle

    # Create directory for data if it doesn't exist
    os.makedirs('kaggle_data', exist_ok=True)

    # Download competition data
    # Note: You need to have your Kaggle API credentials in ~/.kaggle/kaggle.json
    # If not already set up, run the following commands in a cell:
    """
    # Run this if you haven't set up Kaggle API credentials:
    !mkdir -p ~/.kaggle
    !echo '{"username":"YOUR_USERNAME","key":"YOUR_KEY"}' > ~/.kaggle/kaggle.json
    !chmod 600 ~/.kaggle/kaggle.json
    """

    # Download all competition files
    !kaggle competitions download -c playground-series-s5e5 -p kaggle_data

    # Unzip the downloaded files
    import zipfile
    with zipfile.ZipFile('kaggle_data/playground-series-s5e5.zip', 'r') as zip_ref:
        zip_ref.extractall('kaggle_data')

    print("Dataset downloaded successfully!")

    # Set the base directory for data access
    BASE_DIR = 'kaggle_data'

# Resolve each competition CSV relative to BASE_DIR, so the same code serves
# both the Kaggle layout and the local download directory.
train_path, test_path, sample_submission_path = [
    os.path.join(BASE_DIR, csv_name)
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
]

# Read the three CSV files into DataFrames.
train_df, test_df, sample_submission = [
    pd.read_csv(csv_path)
    for csv_path in (train_path, test_path, sample_submission_path)
]

# Collect the shape summary and the preview heading, then emit them one per line.
summary_lines = [
    "\n--- Dataset Information ---",
    f"Training set shape: {train_df.shape}",
    f"Test set shape: {test_df.shape}",
    f"Sample submission shape: {sample_submission.shape}",
    "\n--- First few rows of training data ---",
]
print(*summary_lines, sep="\n")

# Last expression of the cell: rendered by the notebook as a table.
train_df.head()

In [None]:
def preprocess_df(df, sex_categories=None):
    """Return a copy of ``df`` with engineered features and an integer-coded ``Sex``.

    Two columns are added (or overwritten if they already exist):

    * ``Intensity`` = ``Heart_Rate * Duration``
    * ``TotalTemp`` = ``Body_Temp * Duration``

    If a ``Sex`` column with object or pandas string dtype is present, it is
    replaced by its categorical integer codes.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``Heart_Rate``, ``Duration`` and ``Body_Temp``.
    sex_categories : sequence, optional
        Explicit, ordered list of ``Sex`` values. Codes are the positions in
        this list; values not in the list are coded ``-1``. Pass the same list
        for every frame (e.g. train and test) to guarantee identical encoding.
        When omitted, categories are inferred from ``df`` alone, so frames
        holding different sets of values can receive different codes.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input ``df`` is left unmodified.

    Raises
    ------
    KeyError
        If one of the required numeric columns is missing.
    """
    # assign() builds a new frame, so the caller's DataFrame is never mutated.
    out = df.assign(
        Intensity=df['Heart_Rate'] * df['Duration'],
        TotalTemp=df['Body_Temp'] * df['Duration'],
    )

    if 'Sex' in out.columns:
        sex = out['Sex']
        # Text columns may be object dtype or a pandas StringDtype depending on
        # the pandas version/options; both need encoding.
        holds_text = pd.api.types.is_object_dtype(sex) or isinstance(sex.dtype, pd.StringDtype)
        if holds_text:
            # categories=None lets pandas infer them, matching astype('category').
            sex_dtype = pd.CategoricalDtype(categories=sex_categories)
            out['Sex'] = sex.astype(sex_dtype).cat.codes

    return out

# Run the shared feature engineering over both splits and rebind each name to its result.
train_df, test_df = preprocess_df(train_df), preprocess_df(test_df)

In [None]:
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
import numpy as np

print("Training a CatBoost Regression model...")

# 'Calories' is the prediction target. Only that column is removed from the
# feature matrix: every other column of train_df (including an id column, if
# the data has one) is passed to the model as a feature.
y = train_df['Calories']
X = train_df.drop(columns=['Calories'])

# Hold out 20% of the rows for validation; the fixed seed keeps the split reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Hyperparameters gathered in one mapping and unpacked into the constructor.
cb_params = {
    'iterations': 500,        # number of boosting iterations (trees)
    'learning_rate': 0.05,    # learning rate
    'depth': 6,               # depth of trees
    'loss_function': 'RMSE',  # loss function to optimize
    'eval_metric': 'RMSE',    # metric to evaluate model quality
    'random_seed': 42,        # random seed for reproducibility
    'verbose': 100,           # print training progress every 100 iterations
}
cb_model = CatBoostRegressor(**cb_params)

# Fit on the training split; the validation split serves as the evaluation set,
# with early stopping configured at 50 rounds.
cb_model.fit(
    X_train,
    y_train,
    eval_set=(X_val, y_val),
    early_stopping_rounds=50,
)

# Score the held-out validation split.
val_predictions = cb_model.predict(X_val)
val_mse = mean_squared_error(y_val, val_predictions)
val_rmse = np.sqrt(val_mse)  # RMSE is the square root of the MSE computed above
val_r2 = r2_score(y_val, val_predictions)

print(
    f"Validation MSE: {val_mse:.2f}",
    f"Validation RMSE: {val_rmse:.2f}",
    f"Validation R²: {val_r2:.4f}",
    sep="\n",
)

In [None]:
print("\nGenerating predictions for test set...")

# No transformation happens in this cell: the test frame is passed to the model
# unchanged, and X_test is simply another name for it.
X_test = test_df
test_predictions = cb_model.predict(X_test)

# Use the ids shipped with the test data when available; otherwise fall back
# to positional ids 0..n-1.
if 'id' in test_df.columns:
    submission_ids = test_df['id']
else:
    submission_ids = range(len(test_predictions))

submission = pd.DataFrame({
    'id': submission_ids,
    'Calories': test_predictions,
})

# Write the submission file without the DataFrame index.
submission.to_csv('submission.csv', index=False)
print(f"Submission file created: submission.csv with {submission.shape[0]} rows")

# Last expression of the cell: rendered by the notebook as a preview table.
submission.head()

In [None]:
# Check if running on Kaggle or locally
IN_KAGGLE = os.path.exists('/kaggle/working')

if IN_KAGGLE:
    # If running on Kaggle, use the built-in submission mechanism
    print("Running on Kaggle - please use the 'Submit' button in the UI to submit your results")
else:
    # If running locally, use the Kaggle API to submit
    print("Submitting via Kaggle API...")
    
    # Ensure Kaggle API is installed
    try:
        import kaggle
    except ImportError:
        print("Kaggle API not found. Installing...")
        !pip install kaggle
        import kaggle
    
    # Submit the file
    # Note: Make sure you have Kaggle API credentials set up (~/.kaggle/kaggle.json)
    competition_name = "playground-series-s5e5"
    submission_message = "Random Forest model"
    
    # Command to submit 
    submission_command = f"kaggle competitions submit -c {competition_name} -f submission.csv -m \"{submission_message}\""
    
    print(f"Running command: {submission_command}")
    !{submission_command}
    
    # Check your submissions (optional)
    print("\nYour recent submissions:")
    !kaggle competitions submissions -c {competition_name}
    
print("\nDone! 🎉")