In [None]:
# Kaggle API Dataset Download
# This section handles downloading and accessing the dataset for the Playground Series S5E5 competition
#
# Overview of this cell:
#   1. Decide whether the notebook runs on Kaggle (data already mounted) or locally
#      (data fetched with the Kaggle CLI and unzipped into ./kaggle_data).
#   2. Set BASE_DIR accordingly and derive the three CSV paths from it.
#   3. Load train / test / sample-submission into DataFrames and print their shapes.

import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# This cell detects whether we're running on Kaggle or locally
# (the presence of the /kaggle/input directory is used as the signal)
IN_KAGGLE = os.path.exists('/kaggle/input')

if IN_KAGGLE:
    # If running on Kaggle, the data is already available in the /kaggle/input directory
    print("Running on Kaggle - dataset already available")

    # Competition data paths
    BASE_DIR = '/kaggle/input/playground-series-s5e5'

else:
    # If running locally, we need to download the data via the Kaggle API
    print("Running locally - downloading data via Kaggle API")

    # First, check if kaggle module is installed
    try:
        import kaggle
    except ImportError:
        print("Kaggle API not found. Installing...")
        # `!` is an IPython shell escape, so this line only works inside a notebook session
        !pip install kaggle
        import kaggle

    # Create directory for data if it doesn't exist
    os.makedirs('kaggle_data', exist_ok=True)

    # Download competition data
    # Note: You need to have your Kaggle API credentials in ~/.kaggle/kaggle.json
    # If not already set up, run the following commands in a cell:
    # (the triple-quoted block below is a bare string expression: it is evaluated and
    # discarded, never executed - it only serves as a copy-paste template)
    """
    # Run this if you haven't set up Kaggle API credentials:
    !mkdir -p ~/.kaggle
    !echo '{"username":"YOUR_USERNAME","key":"YOUR_KEY"}' > ~/.kaggle/kaggle.json
    !chmod 600 ~/.kaggle/kaggle.json
    """

    # Download all competition files
    # NOTE(review): this runs on every local execution of the cell, even when the
    # archive is already present in kaggle_data
    !kaggle competitions download -c playground-series-s5e5 -p kaggle_data

    # Unzip the downloaded files
    # assumes the CLI saved the archive as <competition-slug>.zip inside kaggle_data - TODO confirm
    import zipfile
    with zipfile.ZipFile('kaggle_data/playground-series-s5e5.zip', 'r') as zip_ref:
        zip_ref.extractall('kaggle_data')

    print("Dataset downloaded successfully!")

    # Set the base directory for data access
    BASE_DIR = 'kaggle_data'

# Now let's define paths to access the files in a consistent way
# This will work both on Kaggle and locally
train_path = os.path.join(BASE_DIR, 'train.csv')
test_path = os.path.join(BASE_DIR, 'test.csv')
sample_submission_path = os.path.join(BASE_DIR, 'sample_submission.csv')

# Load the datasets
train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)
sample_submission = pd.read_csv(sample_submission_path)

# Display basic information about the datasets
print("\n--- Dataset Information ---")
print(f"Training set shape: {train_df.shape}")
print(f"Test set shape: {test_df.shape}")
print(f"Sample submission shape: {sample_submission.shape}")

# Display a few rows of the training data
print("\n--- First few rows of training data ---")
# Bare last expression: rendered by the notebook as the cell's output
train_df.head()

In [None]:
def preprocess_df(df):
    """Return a feature-engineered copy of a raw competition dataframe.

    The input frame is never modified: every column is added to a new frame,
    so calling this function repeatedly on the same raw data always gives the
    same result.

    Parameters
    ----------
    df : pd.DataFrame
        Frame providing the columns read below: 'Sex', 'Age', 'Height',
        'Weight', 'Duration', 'Heart_Rate' and 'Body_Temp'. An 'id' column,
        if present, is removed. Any other column (for example the target)
        is passed through unchanged.

    Returns
    -------
    pd.DataFrame
        New frame with 'Sex' converted to a pandas categorical and the
        engineered columns appended in this order: 'TotalTemp',
        'Temp_Heart_Interaction', 'HR_Times_Weight', 'Est_Max_HR',
        'Age_Based_HR_Percent', 'BMI', 'Temp_Heart_Interaction_Duration',
        'HR_Times_Weight_Times_Duration', 'Age_Based_Intensity',
        'BMI_Intensity', 'VO2_Calories', 'Calories_Per_Minute',
        'HR_Based_Calories'.

    Raises
    ------
    KeyError
        If one of the required input columns is missing.
    """
    # drop() returns a new frame even when nothing is removed, so the caller's
    # dataframe is left untouched in every case. 'Intensity' is only an
    # intermediate quantity here; dropping a pre-existing column of that name
    # keeps it out of the output, exactly as before.
    df = df.drop(columns=['id', 'Intensity'], errors='ignore')

    # Heart rate accumulated over the session; used below but not kept as a feature.
    intensity = df['Heart_Rate'] * df['Duration']

    df['TotalTemp'] = df['Body_Temp'] * df['Duration']
    df['Sex'] = pd.Categorical(df['Sex'])

    # Pairwise interactions and simple physiological ratios
    df['Temp_Heart_Interaction'] = df['Body_Temp'] * df['Heart_Rate']
    df['HR_Times_Weight'] = df['Heart_Rate'] * df['Weight']
    df['Est_Max_HR'] = 220 - df['Age']
    df['Age_Based_HR_Percent'] = df['Heart_Rate'] / df['Est_Max_HR']
    # NOTE(review): this is the textbook BMI only if Height is in metres; if the
    # data stores centimetres the column is a rescaled BMI - confirm units.
    df['BMI'] = df['Weight'] / (df['Height'] ** 2)

    # The same quantities scaled by how long the exercise lasted
    df['Temp_Heart_Interaction_Duration'] = df['Temp_Heart_Interaction'] * df['Duration']
    df['HR_Times_Weight_Times_Duration'] = df['HR_Times_Weight'] * df['Duration']
    df['Age_Based_Intensity'] = df['Age_Based_HR_Percent'] * df['Duration']
    df['BMI_Intensity'] = df['BMI'] * intensity

    # --- VO2-based calorie estimate -------------------------------------
    # Step 1: simplified age-based VO2_max (mL/kg/min), averaged over sexes
    vo2_max = 40 - 0.25 * df['Age']

    # Step 2: scale VO2_max by the fraction of the estimated maximum heart rate
    # (208 - 0.7 * Age), assuming VO2 grows linearly with heart rate
    max_hr = 208 - 0.7 * df['Age']
    hr_percentage = df['Heart_Rate'] / max_hr
    vo2_estimate = vo2_max * hr_percentage

    # Step 3: total O2 in litres = VO2 (mL/kg/min) * Weight * Duration / 1000
    total_o2_consumed = vo2_estimate * df['Weight'] * df['Duration'] / 1000

    # Step 4: 5 kcal per litre of O2
    df['VO2_Calories'] = total_o2_consumed * 5

    # --- Heart-rate based calorie estimate ------------------------------
    # Sex-specific linear formulas; the division by 4.184 is taken from the
    # original code (presumably a kJ -> kcal conversion - confirm).
    is_male = df['Sex'] == 'male'

    male_calories_per_min = (-55.0969 + 0.6309 * df['Heart_Rate'] +
                             0.1988 * df['Weight'] +
                             0.2017 * df['Age']) / 4.184

    female_calories_per_min = (-20.4022 + 0.4472 * df['Heart_Rate'] -
                               0.1263 * df['Weight'] +
                               0.074 * df['Age']) / 4.184

    # Every row whose Sex is not exactly 'male' uses the female formula
    df['Calories_Per_Minute'] = np.where(is_male, male_calories_per_min, female_calories_per_min)

    # Per-minute rate times session length
    df['HR_Based_Calories'] = df['Calories_Per_Minute'] * df['Duration']

    return df

# Keep a handle on the raw test frame: its 'id' column is needed later to build
# the submission files. The guard makes this cell safe to re-run - on a second
# run `test_df` has already been preprocessed (no 'id' column) and must not
# replace the raw frame saved on the first run.
if 'id' in test_df.columns:
    orig_test_df = test_df

# Replace both frames with their feature-engineered versions
train_df = preprocess_df(train_df)
test_df = preprocess_df(test_df)

# Names of the categorical feature columns
cats = ['Sex']

# Bare last expression: rendered by the notebook as the cell's output
train_df.head()

In [None]:
import sys
import xgboost as xgb
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_squared_log_error
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd

def train_xgboost_model(train_df, target_column='Calories', cat_features=['Sex'],
                        test_size=0.2, random_state=42, verbose=True):
    """
    Train an XGBoost model with RMSLE optimization using log transformation.

    The target is transformed with log1p and the model minimises squared error
    on that scale, so the RMSE it monitors on the validation set equals the
    RMSLE on the original scale. Training stops early once that validation
    RMSE has not improved for 50 consecutive rounds.

    Parameters:
    -----------
    train_df : pd.DataFrame
        Training dataframe containing features and target
    target_column : str, default='Calories'
        Name of the target column
    cat_features : list, default=['Sex']
        List of categorical feature column names (one-hot encoded here).
        The default list is only read, never mutated.
    test_size : float, default=0.2
        Proportion of data to use for validation
    random_state : int, default=42
        Random seed for reproducibility
    verbose : bool, default=True
        Whether to print training progress and results

    Returns:
    --------
    dict : Dictionary containing:
        - 'model': Trained XGBoost model
        - 'metrics': Dictionary with validation metrics
        - 'feature_importance': List of tuples (feature_name, importance)
        - 'predictions': Validation predictions
        - 'X_val': Validation features (encoded)
        - 'y_val': Validation targets
        - 'X_train': Training features (encoded)
    """

    if verbose:
        print("Training XGBoost with robust RMSLE optimization...")

    # Prepare features and target
    X = train_df.drop(target_column, axis=1)
    y = train_df[target_column]

    # One-hot encode BEFORE splitting so the training and validation frames are
    # guaranteed to share the same columns in the same order, even if a category
    # happens to be absent from one side of the split.
    X_encoded = pd.get_dummies(X, columns=cat_features, drop_first=True)

    # Create train/validation split
    X_train_encoded, X_val_encoded, y_train, y_val = train_test_split(
        X_encoded, y, test_size=test_size, random_state=random_state
    )

    # Log-transform targets (negative values are clamped to 0 first)
    y_train_log = np.log1p(np.maximum(0, y_train))
    y_val_log = np.log1p(np.maximum(0, y_val))

    # XGBoost configuration - parameters chosen to be similar to the CatBoost setup
    xgb_model = XGBRegressor(
        n_estimators=3000,
        learning_rate=0.01,        # Reduced learning rate for stability
        max_depth=8,               # Similar depth as CatBoost
        objective='reg:squarederror',  # MSE objective for log-transformed data
        # Targets are already on the log1p scale, so plain RMSE here *is* RMSLE
        # on the original scale ('rmsle' would take the logarithm a second time).
        eval_metric='rmse',
        early_stopping_rounds=50,  # Same patience as the CatBoost model
        random_state=random_state,
        verbosity=1 if verbose else 0,
        reg_lambda=5,              # L2 regularization similar to l2_leaf_reg
        min_child_weight=10,       # Similar to min_data_in_leaf
        subsample=0.8,             # Add some subsampling for robustness
        colsample_bytree=0.8       # Feature subsampling
    )

    # Train the model on log-transformed targets with early stopping.
    # Progress is logged every 200 rounds instead of on each of up to 3000 rounds.
    eval_set = [(X_val_encoded, y_val_log)]
    xgb_model.fit(
        X_train_encoded,
        y_train_log,
        eval_set=eval_set,
        verbose=200 if verbose else False,
    )

    # Make predictions (on log scale) and transform back
    val_predictions_log = xgb_model.predict(X_val_encoded)
    val_predictions = np.expm1(val_predictions_log)  # expm1 is inverse of log1p
    val_predictions = np.maximum(0, val_predictions)  # Ensure non-negative

    # Calculate metrics
    def rmsle(y_true, y_pred):
        """Calculate Root Mean Squared Logarithmic Error"""
        y_true = np.maximum(0, y_true)
        y_pred = np.maximum(0, y_pred)
        return np.sqrt(mean_squared_log_error(y_true, y_pred))

    val_mse = mean_squared_error(y_val, val_predictions)
    val_rmse = np.sqrt(val_mse)
    val_rmsle = rmsle(y_val, val_predictions)
    val_r2 = r2_score(y_val, val_predictions)

    metrics = {
        'mse': val_mse,
        'rmse': val_rmse,
        'rmsle': val_rmsle,
        'r2': val_r2
    }

    if verbose:
        print(f"Validation MSE: {val_mse:.2f}")
        print(f"Validation RMSE: {val_rmse:.2f}")
        print(f"Validation RMSLE: {val_rmsle:.4f}")  # Target metric
        print(f"Validation R²: {val_r2:.4f}")

    # Feature importance, most important first
    importance_list = sorted(
        zip(X_train_encoded.columns, xgb_model.feature_importances_),
        key=lambda x: x[1],
        reverse=True
    )

    if verbose:
        print("\nFeature Importance:")
        for name, score in importance_list:
            print(f"{name}: {score}")

    return {
        'model': xgb_model,
        'metrics': metrics,
        'feature_importance': importance_list,
        'predictions': val_predictions,
        'X_val': X_val_encoded,
        'y_val': y_val,
        'X_train': X_train_encoded
    }

In [None]:
# Fit the XGBoost model on the engineered training frame (20% hold-out, seed 42)
xgboost_1 = train_xgboost_model(
    train_df,
    target_column='Calories',
    cat_features=['Sex'],
    test_size=0.2,
    random_state=42,
    verbose=True,
)

# Summary banner
print("\n" + "=" * 50, "XGBOOST_1 RESULTS SUMMARY", "=" * 50, sep="\n")

# Which estimator was fitted and how many boosting rounds it was configured with
print(
    f"Model Type: {type(xgboost_1['model']).__name__}",
    f"Number of estimators: {xgboost_1['model'].n_estimators}",
    sep="\n",
)

# Hold-out metrics computed by the training function
print(
    "\nValidation Metrics:",
    f"  RMSLE: {xgboost_1['metrics']['rmsle']:.4f}",
    f"  RMSE:  {xgboost_1['metrics']['rmse']:.2f}",
    f"  R²:    {xgboost_1['metrics']['r2']:.4f}",
    sep="\n",
)

# 'feature_importance' is already sorted in descending order of importance
print("\nTop 5 Most Important Features:")
for i, (feature, importance) in enumerate(xgboost_1['feature_importance'][:5]):
    print(f"  {i+1}. {feature}: {importance:.4f}")

# Size and value ranges of the validation split
print(
    "\nValidation Set Info:",
    f"  Validation samples: {len(xgboost_1['y_val'])}",
    f"  Prediction range: {xgboost_1['predictions'].min():.1f} - {xgboost_1['predictions'].max():.1f}",
    f"  Actual range: {xgboost_1['y_val'].min():.1f} - {xgboost_1['y_val'].max():.1f}",
    f"  Encoded features: {len(xgboost_1['X_train'].columns)}",
    sep="\n",
)

In [None]:
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_squared_log_error
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd

def train_catboost_model(train_df, target_column='Calories', cat_features=['Sex'],
                        test_size=0.2, random_state=42, verbose=True):
    """
    Fit a CatBoost regressor on a log1p-transformed target and evaluate it on a hold-out split.

    The model minimises RMSE on the log scale; predictions are mapped back with
    expm1 and clamped at zero before the validation metrics are computed.

    Parameters:
    -----------
    train_df : pd.DataFrame
        Training dataframe containing features and target
    target_column : str, default='Calories'
        Name of the target column
    cat_features : list, default=['Sex']
        Categorical feature column names, passed straight to CatBoost
    test_size : float, default=0.2
        Fraction of rows held out for validation
    random_state : int, default=42
        Seed used for both the split and the model
    verbose : bool, default=True
        Whether to print training progress and results

    Returns:
    --------
    dict : Dictionary containing:
        - 'model': Trained CatBoost model
        - 'metrics': Dictionary with validation metrics ('mse', 'rmse', 'rmsle', 'r2')
        - 'feature_importance': List of tuples (feature_name, importance), highest first
        - 'predictions': Validation predictions on the original scale
        - 'X_val': Validation features
        - 'y_val': Validation targets
    """

    if verbose:
        print("Training CatBoost with robust RMSLE optimization...")

    # Separate the feature columns from the target column
    features = train_df.drop(target_column, axis=1)
    target = train_df[target_column]

    # Hold out part of the data for validation / early stopping
    X_train, X_val, y_train, y_val = train_test_split(
        features, target, test_size=test_size, random_state=random_state
    )

    # Work on the log1p scale; negative targets are clamped to 0 first
    y_train_log = np.log1p(np.maximum(0, y_train))
    y_val_log = np.log1p(np.maximum(0, y_val))

    # Model hyper-parameters
    catboost_params = dict(
        iterations=1000,
        learning_rate=0.01,                  # small steps for stability
        depth=6,
        loss_function='RMSE',                # RMSE on the log-transformed target
        eval_metric='RMSE',
        random_seed=random_state,
        verbose=200 if verbose else False,   # log every 200 iterations when verbose
        l2_leaf_reg=5,                       # L2 regularisation strength
        min_data_in_leaf=10,
        max_ctr_complexity=1,                # keep categorical handling simple
    )
    cb_model = CatBoostRegressor(**catboost_params)

    # Fit with the hold-out set as evaluation data; stop after 50 rounds without improvement
    cb_model.fit(
        X_train,
        y_train_log,
        eval_set=(X_val, y_val_log),
        cat_features=cat_features,
        early_stopping_rounds=50,
    )

    # Back to the original scale (expm1 inverts log1p), never below zero
    val_predictions = np.maximum(0, np.expm1(cb_model.predict(X_val)))

    # Validation metrics; RMSLE clamps both inputs at zero before taking logs
    val_mse = mean_squared_error(y_val, val_predictions)
    metrics = {
        'mse': val_mse,
        'rmse': np.sqrt(val_mse),
        'rmsle': np.sqrt(
            mean_squared_log_error(np.maximum(0, y_val), np.maximum(0, val_predictions))
        ),
        'r2': r2_score(y_val, val_predictions),
    }

    if verbose:
        print(f"Validation MSE: {metrics['mse']:.2f}")
        print(f"Validation RMSE: {metrics['rmse']:.2f}")
        print(f"Validation RMSLE: {metrics['rmsle']:.4f}")  # Target metric
        print(f"Validation R²: {metrics['r2']:.4f}")

    # Pair every feature with its importance and rank from highest to lowest
    ranked_importance = sorted(
        zip(features.columns, cb_model.get_feature_importance()),
        key=lambda pair: pair[1],
        reverse=True,
    )

    if verbose:
        print("\nFeature Importance:")
        for feature_name, score in ranked_importance:
            print(f"{feature_name}: {score}")

    return {
        'model': cb_model,
        'metrics': metrics,
        'feature_importance': ranked_importance,
        'predictions': val_predictions,
        'X_val': X_val,
        'y_val': y_val,
    }


In [None]:
# Fit the CatBoost model on the engineered training frame, using the same
# hold-out fraction and seed as the XGBoost run.
catboost_1 = train_catboost_model(
    train_df,
    target_column='Calories',
    cat_features=['Sex'],
    test_size=0.2,
    random_state=42,
    verbose=True,
)

In [None]:
# Summary banner
print("\n" + "=" * 50, "CATBOOST_1 RESULTS SUMMARY", "=" * 50, sep="\n")

# NOTE(review): get_best_iteration() is CatBoost's best-iteration value, which may
# differ from the number of iterations actually run - confirm the label is intended.
print(
    f"Model Type: {type(catboost_1['model']).__name__}",
    f"Number of iterations trained: {catboost_1['model'].get_best_iteration()}",
    sep="\n",
)

# Hold-out metrics computed by the training function
print(
    "\nValidation Metrics:",
    f"  RMSLE: {catboost_1['metrics']['rmsle']:.4f}",
    f"  RMSE:  {catboost_1['metrics']['rmse']:.2f}",
    f"  R²:    {catboost_1['metrics']['r2']:.4f}",
    sep="\n",
)

# 'feature_importance' is already sorted in descending order of importance
print("\nTop 5 Most Important Features:")
for i, (feature, importance) in enumerate(catboost_1['feature_importance'][:5]):
    print(f"  {i+1}. {feature}: {importance:.2f}")

# Size and value ranges of the validation split
print(
    "\nValidation Set Info:",
    f"  Validation samples: {len(catboost_1['y_val'])}",
    f"  Prediction range: {catboost_1['predictions'].min():.1f} - {catboost_1['predictions'].max():.1f}",
    f"  Actual range: {catboost_1['y_val'].min():.1f} - {catboost_1['y_val'].max():.1f}",
    sep="\n",
)

In [None]:
# =============================================================================
# CATBOOST PREDICTIONS
# =============================================================================
print("Generating CatBoost predictions...")

# The preprocessed test frame is passed to CatBoost as-is (no one-hot encoding step)
X_test = test_df

# The model predicts on the log1p scale ...
catboost_predictions_log = catboost_1['model'].predict(X_test)
# ... so invert with expm1 and clamp at zero to get the final predictions
catboost_predictions = np.maximum(0, np.expm1(catboost_predictions_log))

print(f"CatBoost predictions range: {catboost_predictions.min():.1f} - {catboost_predictions.max():.1f}")
print(f"CatBoost validation RMSLE: {catboost_1['metrics']['rmsle']:.4f}")

In [None]:
# =============================================================================
# XGBOOST PREDICTIONS
# =============================================================================
print("Generating XGBoost predictions...")

# The XGBoost model was fitted on one-hot encoded features, so the test frame
# goes through the same encoding before prediction.
X_test = test_df
cat_features = ['Sex']
X_test_encoded = pd.get_dummies(X_test, columns=cat_features, drop_first=True)

# Column layout the model was trained on
train_columns = xgboost_1['X_train'].columns

# Training columns that the encoded test frame does not have
missing_cols = set(train_columns) - set(X_test_encoded.columns)

# Align the test frame with the training layout in one step: missing columns
# are created and filled with 0, and the columns follow the training order.
X_test_encoded = X_test_encoded.reindex(columns=train_columns, fill_value=0)

# Predict on the log1p scale, then invert with expm1 and clamp at zero
xgboost_predictions_log = xgboost_1['model'].predict(X_test_encoded)
xgboost_predictions = np.maximum(0, np.expm1(xgboost_predictions_log))

print(f"XGBoost predictions range: {xgboost_predictions.min():.1f} - {xgboost_predictions.max():.1f}")
print(f"XGBoost validation RMSLE: {xgboost_1['metrics']['rmsle']:.4f}")

In [None]:
# =============================================================================
# MODEL COMPARISON AND ANALYSIS
# =============================================================================
print("Analyzing model predictions...")

# Equal-weight blend of the two models' test predictions
ensemble_predictions = (catboost_predictions + xgboost_predictions) / 2

print(f"Ensemble predictions range: {ensemble_predictions.min():.1f} - {ensemble_predictions.max():.1f}")

# Side-by-side view of the first ten test rows
print("\nPrediction Comparison (first 10 rows):")
comparison_df = pd.DataFrame({
    'id': orig_test_df['id'].head(10),
    'CatBoost': catboost_predictions[:10],
    'XGBoost': xgboost_predictions[:10],
    'Ensemble': ensemble_predictions[:10],
    'Difference': np.abs(catboost_predictions[:10] - xgboost_predictions[:10]),
})
print(comparison_df.round(2))

# How closely the two models agree across the whole test set
print(
    "\nModel Agreement Statistics:",
    f"Mean absolute difference: {np.mean(np.abs(catboost_predictions - xgboost_predictions)):.2f}",
    f"Max absolute difference: {np.max(np.abs(catboost_predictions - xgboost_predictions)):.2f}",
    f"Correlation between models: {np.corrcoef(catboost_predictions, xgboost_predictions)[0,1]:.4f}",
    sep="\n",
)

# Hold-out RMSLE of each model
print(
    "\nValidation Performance Comparison:",
    f"CatBoost RMSLE: {catboost_1['metrics']['rmsle']:.4f}",
    f"XGBoost RMSLE:  {xgboost_1['metrics']['rmsle']:.4f}",
    sep="\n",
)

# CatBoost wins only on a strictly lower RMSLE; a tie goes to XGBoost
best_model = "CatBoost" if catboost_1['metrics']['rmsle'] < xgboost_1['metrics']['rmsle'] else "XGBoost"
print(f"✓ {best_model} performed better on validation set")

print(f"Recommendation: Consider using {best_model} or the ensemble for final submission")

In [None]:
# =============================================================================
# MODEL COMPARISON AND ANALYSIS
# =============================================================================
# NOTE(review): this cell recomputes and reprints exactly what the preceding
# MODEL COMPARISON cell already produced (same inputs, same assignments to
# ensemble_predictions, comparison_df and best_model). It looks like an
# accidental copy and can most likely be deleted - confirm before removing.
print("Analyzing model predictions...")

# Create ensemble predictions (simple average)
ensemble_predictions = (catboost_predictions + xgboost_predictions) / 2

print(f"Ensemble predictions range: {ensemble_predictions.min():.1f} - {ensemble_predictions.max():.1f}")

# Display prediction comparison (first 10 rows)
print("\nPrediction Comparison (first 10 rows):")
comparison_df = pd.DataFrame({
    'id': orig_test_df['id'][:10],
    'CatBoost': catboost_predictions[:10],
    'XGBoost': xgboost_predictions[:10],
    'Ensemble': ensemble_predictions[:10],
    'Difference': np.abs(catboost_predictions[:10] - xgboost_predictions[:10])
})
print(comparison_df.round(2))

# Model agreement statistics
print(f"\nModel Agreement Statistics:")
print(f"Mean absolute difference: {np.mean(np.abs(catboost_predictions - xgboost_predictions)):.2f}")
print(f"Max absolute difference: {np.max(np.abs(catboost_predictions - xgboost_predictions)):.2f}")
print(f"Correlation between models: {np.corrcoef(catboost_predictions, xgboost_predictions)[0,1]:.4f}")

# Validation performance comparison
print(f"\nValidation Performance Comparison:")
print(f"CatBoost RMSLE: {catboost_1['metrics']['rmsle']:.4f}")
print(f"XGBoost RMSLE:  {xgboost_1['metrics']['rmsle']:.4f}")

# Strictly lower RMSLE selects CatBoost; a tie falls through to XGBoost
if catboost_1['metrics']['rmsle'] < xgboost_1['metrics']['rmsle']:
    print("✓ CatBoost performed better on validation set")
    best_model = "CatBoost"
else:
    print("✓ XGBoost performed better on validation set")
    best_model = "XGBoost"
    
print(f"Recommendation: Consider using {best_model} or the ensemble for final submission")

In [None]:
# =============================================================================
# CREATE SUBMISSION FILES
# =============================================================================
# Writes one CSV per prediction set. Each file pairs the 'id' column of the raw
# test frame with the corresponding predictions under the 'Calories' column.
print("Creating submission files...")

# CatBoost submission
catboost_submission = pd.DataFrame({
    'id': orig_test_df['id'],
    'Calories': catboost_predictions
})
catboost_submission.to_csv('catboost_submission.csv', index=False)

# XGBoost submission
xgboost_submission = pd.DataFrame({
    'id': orig_test_df['id'],
    'Calories': xgboost_predictions
})
xgboost_submission.to_csv('xgboost_submission.csv', index=False)

# Ensemble submission - written to 'submission.csv', the file the submit cell uploads
ensemble_submission = pd.DataFrame({
    'id': orig_test_df['id'],
    'Calories': ensemble_predictions
})
ensemble_submission.to_csv('submission.csv', index=False)

print(f"✓ CatBoost submission: catboost_submission.csv ({catboost_submission.shape[0]} rows)")
print(f"✓ XGBoost submission: xgboost_submission.csv ({xgboost_submission.shape[0]} rows)")
# Report the file name that was actually written above (previously the message
# named 'ensemble_submission.csv', which is never created).
print(f"✓ Ensemble submission: submission.csv ({ensemble_submission.shape[0]} rows)")

# Display sample of ensemble submission
print("\nEnsemble Submission Preview:")
print(ensemble_submission.head(10))

In [None]:
# Check if running on Kaggle or locally
IN_KAGGLE = os.path.exists('/kaggle/working')

if IN_KAGGLE:
    # If running on Kaggle, use the built-in submission mechanism
    print("Running on Kaggle - please use the 'Submit' button in the UI to submit your results")
else:
    # If running locally, use the Kaggle API to submit
    print("Submitting via Kaggle API...")
    
    # Ensure Kaggle API is installed
    try:
        import kaggle
    except ImportError:
        print("Kaggle API not found. Installing...")
        !pip install kaggle
        import kaggle
    
    # Submit the file
    # Note: Make sure you have Kaggle API credentials set up (~/.kaggle/kaggle.json)
    competition_name = "playground-series-s5e5"
    submission_message = "xgboost with feature engineering"
    
    # Command to submit 
    submission_command = f"kaggle competitions submit -c {competition_name} -f submission.csv -m \"{submission_message}\""
    
    print(f"Running command: {submission_command}")
    !{submission_command}
    
    # Check your submissions (optional)
    print("\nYour recent submissions:")
    !kaggle competitions submissions -c {competition_name}
    
print("\nDone! 🎉")