In [None]:
# Kaggle API Dataset Download
# This section handles downloading and accessing the dataset for the Playground Series S5E5 competition

import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# This cell detects whether we're running on Kaggle or locally
# (the check is simply whether the /kaggle/input directory exists).
IN_KAGGLE = os.path.exists('/kaggle/input')

if IN_KAGGLE:
    # If running on Kaggle, the data is already available in the /kaggle/input directory
    print("Running on Kaggle - dataset already available")

    # Competition data paths
    BASE_DIR = '/kaggle/input/playground-series-s5e5'

else:
    # If running locally, we need to download the data via the Kaggle API
    print("Running locally - downloading data via Kaggle API")

    # First, check if kaggle module is installed
    try:
        import kaggle
    except ImportError:
        print("Kaggle API not found. Installing...")
        # NOTE(review): `!pip` runs whichever pip is first on PATH, which may not be
        # the kernel's environment (`%pip` targets the kernel) - confirm before relying on this.
        !pip install kaggle
        import kaggle

    # Create directory for data if it doesn't exist
    os.makedirs('kaggle_data', exist_ok=True)

    # Download competition data
    # Note: You need to have your Kaggle API credentials in ~/.kaggle/kaggle.json
    # If not already set up, run the following commands in a cell:
    # (The triple-quoted block below is a bare string expression: it is never executed
    # as shell commands and only documents the one-time credential setup.)
    """
    # Run this if you haven't set up Kaggle API credentials:
    !mkdir -p ~/.kaggle
    !echo '{"username":"YOUR_USERNAME","key":"YOUR_KEY"}' > ~/.kaggle/kaggle.json
    !chmod 600 ~/.kaggle/kaggle.json
    """

    # Download all competition files
    !kaggle competitions download -c playground-series-s5e5 -p kaggle_data

    # Unzip the downloaded files
    # Assumes the CLI stored the archive as kaggle_data/playground-series-s5e5.zip - TODO confirm
    import zipfile
    with zipfile.ZipFile('kaggle_data/playground-series-s5e5.zip', 'r') as zip_ref:
        zip_ref.extractall('kaggle_data')

    print("Dataset downloaded successfully!")

    # Set the base directory for data access
    BASE_DIR = 'kaggle_data'

# Now let's define paths to access the files in a consistent way
# This will work both on Kaggle and locally
train_path = os.path.join(BASE_DIR, 'train.csv')
test_path = os.path.join(BASE_DIR, 'test.csv')
sample_submission_path = os.path.join(BASE_DIR, 'sample_submission.csv')

# Load the datasets
# (train_df and test_df are rebound to their preprocessed versions by a later cell.)
train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)
sample_submission = pd.read_csv(sample_submission_path)

# Display basic information about the datasets
print("\n--- Dataset Information ---")
print(f"Training set shape: {train_df.shape}")
print(f"Test set shape: {test_df.shape}")
print(f"Sample submission shape: {sample_submission.shape}")

# Display a few rows of the training data
# (the bare last expression is what the cell renders as its output)
print("\n--- First few rows of training data ---")
train_df.head()

An earlier run produced the feature importances listed below. Features with extremely low importance have been commented out of the preprocessing code.

Feature Importance:

Age_Based_Intensity: 0.6814467310905457
Temp_Heart_Interaction_Duration: 0.18323101103305817
Duration: 0.053071144968271255
BMI_Intensity: 0.038230136036872864
TotalTemp: 0.030490227043628693
Female_Intensity: 0.0040832036174833775
HR_Based_Calories: 0.0018469374626874924
Sex_male: 0.0016337857814505696
Age_Based_HR_Percent: 0.0015912051312625408
Male_Intensity: 0.0014280130853876472
Temp_Heart_Interaction: 0.0010874277213588357
Heart_Rate: 0.0005569902714341879
Calories_Per_Minute: 0.00046593035222031176
Est_Max_HR: 0.00021114895935170352
VO2_Calories: 0.00013731316721532494
Age: 0.00012986226647626609
Body_Temp: 8.561972208553925e-05
Weight: 7.448684482369572e-05
HR_Times_Weight_Times_Duration: 6.925689376657829e-05
HR_Times_Weight: 4.754386463901028e-05
Height: 4.337794962339103e-05
BMI: 3.855152681353502e-05

In [None]:
def preprocess_df(df):
    """
    Build the engineered feature set used by the models in this notebook.

    The input frame is never modified: all work happens on a new frame, so the
    function can safely be called on frames that other variables still refer to.

    Parameters:
    -----------
    df : pandas DataFrame
        Must contain the columns 'Sex', 'Age', 'Height', 'Weight', 'Duration',
        'Heart_Rate' and 'Body_Temp'. An 'id' column, if present, is dropped.
        Any other column (e.g. the target) is passed through untouched.

    Returns:
    --------
    pandas DataFrame
        A new frame without 'id', with 'Sex' converted to a categorical and the
        following columns appended in this order: 'TotalTemp',
        'Temp_Heart_Interaction', 'Est_Max_HR', 'Age_Based_HR_Percent', 'BMI',
        'Temp_Heart_Interaction_Duration', 'Age_Based_Intensity',
        'BMI_Intensity', 'VO2_Calories'.
    """
    # drop() already returns a new frame; copy explicitly otherwise so the column
    # assignments below can never write into the caller's frame.
    df = df.drop('id', axis=1) if 'id' in df.columns else df.copy()

    # Heart rate x duration. Only an intermediate for BMI_Intensity, never returned.
    intensity = df['Heart_Rate'] * df['Duration']

    df['TotalTemp'] = df['Body_Temp'] * df['Duration']
    df['Sex'] = pd.Categorical(df['Sex'])

    df['Temp_Heart_Interaction'] = df['Body_Temp'] * df['Heart_Rate']
    df['Est_Max_HR'] = 220 - df['Age']
    df['Age_Based_HR_Percent'] = df['Heart_Rate'] / df['Est_Max_HR']
    # NOTE(review): this is the usual BMI formula only if Height is in metres; with
    # centimetres it is BMI scaled by a constant factor - confirm the units.
    df['BMI'] = df['Weight'] / (df['Height'] ** 2)

    df['Temp_Heart_Interaction_Duration'] = df['Temp_Heart_Interaction'] * df['Duration']
    df['Age_Based_Intensity'] = df['Age_Based_HR_Percent'] * df['Duration']
    df['BMI_Intensity'] = df['BMI'] * intensity

    # Features deliberately no longer generated (they were disabled after the
    # feature-importance run): HR_Times_Weight, HR_Times_Weight_Times_Duration,
    # Male_Intensity, Female_Intensity, Calories_Per_Minute, HR_Based_Calories.

    # VO2-based calorie estimate
    # Step 1: simplified age-based VO2_max estimate (mL/kg/min): 40 - 0.25 * Age
    vo2_max = 40 - 0.25 * df['Age']

    # Step 2: scale VO2_max by the fraction of the estimated maximum heart rate
    # (MaxHR estimated as 208 - 0.7 * Age; VO2 assumed linear in heart rate)
    max_hr = 208 - 0.7 * df['Age']
    hr_percentage = df['Heart_Rate'] / max_hr
    vo2_estimate = vo2_max * hr_percentage

    # Step 3: total O2 in liters = VO2 (mL/kg/min) x Weight (kg) x Duration (min) / 1000
    total_o2_consumed = vo2_estimate * df['Weight'] * df['Duration'] / 1000

    # Step 4: convert O2 to calories (5 kcal per liter of O2)
    df['VO2_Calories'] = total_o2_consumed * 5

    # 'Intensity' is never part of the output, even if the input carried such a column.
    df = df.drop(columns=['Intensity'], errors='ignore')

    return df



In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
import seaborn as sns

def analyze_normalization_method(df, plot=True, sample_rows=None):
    """
    Analyzes each numeric column in a dataframe and provides stats to help decide
    between log transformation and division by max normalization.

    Besides returning the statistics, the function prints a per-column summary
    and, when requested, shows one histogram per numeric column.

    Parameters:
    -----------
    df : pandas DataFrame
        The dataframe to analyze
    plot : bool, default=True
        Whether to plot histograms of the distributions. Ignored when the frame
        has no numeric columns (there is nothing to draw).
    sample_rows : int, default=None
        Number of rows to sample for analysis (useful for large dataframes).
        Sampling uses a fixed random_state of 42.

    Returns:
    --------
    pd.DataFrame
        One row per non-empty numeric column with the columns 'Column', 'Min',
        'Max', 'Mean', 'Median', 'Std', 'Skewness', 'Kurtosis', 'Range_Ratio',
        'Has_Zeros', 'Has_Negatives', 'Outlier_Ratio', 'Recommendation' and
        'Confidence'. These columns are present even when no row is produced.
    """
    result_columns = [
        'Column', 'Min', 'Max', 'Mean', 'Median', 'Std', 'Skewness', 'Kurtosis',
        'Range_Ratio', 'Has_Zeros', 'Has_Negatives', 'Outlier_Ratio',
        'Recommendation', 'Confidence',
    ]

    if sample_rows is not None and sample_rows < len(df):
        df_sample = df.sample(sample_rows, random_state=42)
    else:
        df_sample = df

    # Select only numeric columns
    numeric_df = df_sample.select_dtypes(include=['number'])
    n_numeric = len(numeric_df.columns)

    # Without numeric columns there is nothing to draw; skipping the figure also
    # avoids dividing by a zero column count when sizing the grid.
    do_plot = plot and n_numeric > 0
    axes = []
    drawn = set()  # indices of the axes that actually received a histogram

    if do_plot:
        # Determine grid size for subplots
        n_cols = min(3, n_numeric)
        n_rows = int(np.ceil(n_numeric / n_cols))
        # squeeze=False always yields a 2-D array, so flatten() works for any grid shape
        _, axes = plt.subplots(n_rows, n_cols, figsize=(15, 4*n_rows), squeeze=False)
        axes = axes.flatten()

    results = []

    for i, col in enumerate(numeric_df.columns):
        data = numeric_df[col].dropna()

        # Skip if column has no data
        if len(data) == 0:
            continue

        # Basic statistics
        min_val = data.min()
        max_val = data.max()
        mean_val = data.mean()
        median_val = data.median()
        std_val = data.std()

        # Check for zeros and negative values
        has_zeros = (data == 0).any()
        has_negatives = (data < 0).any()

        # Calculate skewness and kurtosis
        skewness = stats.skew(data)
        kurtosis = stats.kurtosis(data)

        # Range ratio (max/min) detects orders of magnitude; it is only defined for
        # strictly positive data. NaN compares False in every threshold test below.
        range_ratio = max_val / min_val if min_val > 0 else np.nan

        # Share of values outside the 1.5 * IQR fences
        q1 = data.quantile(0.25)
        q3 = data.quantile(0.75)
        iqr = q3 - q1
        outlier_ratio = ((data < (q1 - 1.5 * iqr)) | (data > (q3 + 1.5 * iqr))).mean()

        # Generate recommendation
        wants_log = skewness > 1.0 or range_ratio > 100
        if has_negatives:
            recommendation = "Shift and normalize (can't use log directly)"
        elif not wants_log:
            recommendation = "Division by max"
        elif has_zeros:
            recommendation = "Log(x+1) transformation"
        else:  # Positive values only
            recommendation = "Log transformation"

        # Confidence score (simple heuristic)
        confidence = 0
        if abs(skewness) > 2:  # Highly skewed
            confidence += 2
        elif abs(skewness) > 1:  # Moderately skewed
            confidence += 1

        if range_ratio > 1000:  # Very wide range
            confidence += 2
        elif range_ratio > 100:  # Wide range
            confidence += 1

        if outlier_ratio > 0.05:  # Many outliers
            confidence += 1

        confidence = min(confidence, 5)  # Cap at 5

        # Plot if requested
        if do_plot:
            ax = axes[i]
            sns.histplot(data, ax=ax, kde=True)
            ax.set_title(f"{col}\nRecommendation: {recommendation}")
            ax.set_xlabel(f"Skewness: {skewness:.2f}, Range Ratio: {range_ratio:.1f}")
            drawn.add(i)

        results.append({
            'Column': col,
            'Min': min_val,
            'Max': max_val,
            'Mean': mean_val,
            'Median': median_val,
            'Std': std_val,
            'Skewness': skewness,
            'Kurtosis': kurtosis,
            'Range_Ratio': range_ratio,
            'Has_Zeros': has_zeros,
            'Has_Negatives': has_negatives,
            'Outlier_Ratio': outlier_ratio,
            'Recommendation': recommendation,
            'Confidence': confidence
        })

    if do_plot:
        # Hide every subplot that got no histogram: the unused grid cells at the
        # end as well as the slots of columns that were skipped for being empty.
        for j, ax in enumerate(axes):
            if j not in drawn:
                ax.set_visible(False)
        plt.tight_layout()
        plt.show()

    # Passing the column list keeps the schema stable even when `results` is empty
    result_df = pd.DataFrame(results, columns=result_columns)

    # Print summary
    print("\n--- Normalization Recommendation Summary ---")
    for _, row in result_df.iterrows():
        print(f"\n{row['Column']}:")
        print(f"  Range: {row['Min']:.4g} to {row['Max']:.4g} (Ratio: {row['Range_Ratio']:.4g})")
        print(f"  Distribution: Mean={row['Mean']:.4g}, Median={row['Median']:.4g}, Std={row['Std']:.4g}")
        print(f"  Skewness: {row['Skewness']:.4g}, Kurtosis: {row['Kurtosis']:.4g}")
        print(f"  Contains: {'Zeros' if row['Has_Zeros'] else 'No zeros'}, {'Negatives' if row['Has_Negatives'] else 'No negatives'}")
        print(f"  Outliers: {row['Outlier_Ratio']*100:.2f}% of values")
        print(f"  Recommendation: {row['Recommendation']} (Confidence: {row['Confidence']}/5)")

    return result_df

In [None]:
#train_df = preprocess_df(train_df)
#result_df = analyze_normalization_method(train_df)
#result_df

In [None]:
def normalize_df_for_nn(df, log_cols, excluded_cols, reference_df=None):
    """
    Normalizes all numeric features in a dataframe - using log normalization for
    specified columns and division by max for all other numeric columns.

    Parameters:
    -----------
    df : pandas DataFrame
        The dataframe containing features to normalize. It is not modified.
    log_cols : list
        List of column names to apply log normalization (log(1 + x)) to
    excluded_cols : list
        Numeric columns to leave untouched (e.g. the target column)
    reference_df : pandas DataFrame, default=None
        Optional frame that supplies the column maxima used for the
        division-by-max scaling, e.g. the training frame when normalizing test
        data so that both end up on the same scale. Columns missing from
        `reference_df`, and all columns when it is None, fall back to the
        maxima of `df` itself.

    Returns:
    --------
    pandas DataFrame
        A new dataframe with normalized features
    """
    # Create a copy of the dataframe
    normalized_df = df.copy()

    # Numeric columns that are eligible for normalization
    numeric_cols = [
        col for col in normalized_df.select_dtypes(include=['number']).columns
        if col not in excluded_cols
    ]

    for col in numeric_cols:
        if col in log_cols:
            normalized_df[col] = np.log1p(normalized_df[col])  # log(1+x)
            continue

        # Division by max, with the maximum taken from the reference frame when it has the column
        if reference_df is not None and col in reference_df.columns:
            max_val = reference_df[col].max()
        else:
            max_val = normalized_df[col].max()

        # A zero (or undefined) maximum would turn the column into NaN/inf;
        # leave such a column as it is instead.
        if pd.isna(max_val) or max_val == 0:
            continue

        normalized_df[col] = normalized_df[col] / max_val

    return normalized_df

In [None]:
# Keep a handle on the raw test frame: the submission cells read its 'id' column,
# which preprocess_df drops. Capture it only while 'id' is still present, so that
# re-running this cell (when test_df is already preprocessed) does not replace the
# raw frame with an id-less one.
if 'id' in test_df.columns:
    orig_test_df = test_df

log_cols = ['VO2_Calories']
excluded_cols = ['Calories']

train_df = preprocess_df(train_df)
test_df = preprocess_df(test_df)

# NOTE(review): each frame is scaled by its own column maxima here, so train and
# test features are only on the same scale if their maxima coincide - confirm
# this is acceptable for the TabNet model.
nn_train_df = normalize_df_for_nn(train_df, log_cols, excluded_cols)
nn_test_df = normalize_df_for_nn(test_df, log_cols, excluded_cols)

cats=['Sex']
train_df.head()

In [None]:
# Preview the normalized training frame that is later passed to TabNet.
nn_train_df.head()

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from pytorch_tabnet.tab_model import TabNetRegressor
import torch
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
from pytorch_tabnet.metrics import Metric

def train_simplified_tabnet(nn_train_df, nn_test_df, target_col='Calories', cat_features=['Sex']):
    """
    Train a small TabNet regressor on log1p(target) and predict the test set.

    The training frame is split 80/20 (random_state=42) into train and
    validation parts; the validation part drives early stopping and is used
    for the RMSLE that is printed at the end.

    Parameters:
    -----------
    nn_train_df : pandas DataFrame
        Training features plus the target column. Not modified.
    nn_test_df : pandas DataFrame
        Test features (a target column, if present, is ignored). Must contain
        every feature column of the training frame. Not modified.
    target_col : str, default='Calories'
        Name of the target column in `nn_train_df`.
    cat_features : list, default=['Sex']
        Columns that are label-encoded and fed to TabNet as categorical
        features. The default list is only read, never mutated.

    Returns:
    --------
    tuple
        (model, y_test_pred, feat_importances, label_encoders) where
        `y_test_pred` holds the test predictions transformed back with expm1,
        `feat_importances` is a DataFrame sorted by descending importance and
        `label_encoders` maps each encoded column to its fitted LabelEncoder.
    """
    print("Training simplified TabNet with MSE optimization on log-scale...")

    # Work on copies so the label encoding below never touches the caller's frames
    train_enc = nn_train_df.copy()
    test_enc = nn_test_df.copy()

    # Apply label encoding to categorical features
    label_encoders = {}
    for cat_col in cat_features:
        if cat_col not in train_enc.columns:
            continue
        in_test = cat_col in test_enc.columns

        # Fit on the labels of both frames so transform() cannot fail on a label
        # that only occurs in the test data.
        observed = [train_enc[cat_col].astype(str)]
        if in_test:
            observed.append(test_enc[cat_col].astype(str))

        le = LabelEncoder()
        le.fit(pd.concat(observed))
        train_enc[cat_col] = le.transform(train_enc[cat_col].astype(str))
        if in_test:
            test_enc[cat_col] = le.transform(test_enc[cat_col].astype(str))
        label_encoders[cat_col] = le

    # Split features and target
    X = train_enc.drop(target_col, axis=1)
    y = train_enc[target_col]

    # Create train/validation split
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

    # Log-transform targets (negative targets are clipped to 0 first) and reshape
    # to the (n_samples, 1) layout TabNet expects for regression targets
    y_train_log = np.log1p(np.maximum(0, y_train)).values.reshape(-1, 1)
    y_val_log = np.log1p(np.maximum(0, y_val)).values.reshape(-1, 1)

    # Categorical feature positions and cardinalities for TabNet. The cardinality
    # comes from the encoder (every label it can emit), not from the training
    # split, which may be missing a category that the validation or test rows use.
    cat_idxs = [i for i, col in enumerate(X.columns) if col in cat_features]
    cat_dims = [len(label_encoders[col].classes_) for col in X.columns if col in cat_features]

    # Create a simple MSE metric class
    class MSE(Metric):
        def __init__(self):
            self._name = "mse"
            self._maximize = False

        def __call__(self, y_true, y_pred):
            y_pred = y_pred.squeeze()
            y_true = y_true.squeeze()
            return mean_squared_error(y_true, y_pred)

    # Initialize simplified TabNet model
    model = TabNetRegressor(
        cat_idxs=cat_idxs,
        cat_dims=cat_dims,
        cat_emb_dim=1,
        # Simplified architecture
        n_d=32,  # Reduced from 64
        n_a=32,  # Reduced from 64
        n_steps=3,  # Reduced from 5
        gamma=1.3,  # Reduced from 1.5
        n_independent=1,  # Reduced from 2
        n_shared=1,  # Reduced from 2
        # Stronger regularization
        lambda_sparse=5e-3,  # Increased from 1e-4
        # Learning rate and optimizer
        optimizer_fn=torch.optim.Adam,
        optimizer_params=dict(lr=1e-2),  # Reduced from 2e-2
        scheduler_params={"step_size":25, "gamma":0.75},
        scheduler_fn=torch.optim.lr_scheduler.StepLR,
        mask_type='sparsemax',
        verbose=1
    )

    # Convert to numpy arrays
    X_train_array = X_train.values
    X_val_array = X_val.values

    # Train with larger batches for speed
    model.fit(
        X_train=X_train_array,
        y_train=y_train_log,
        eval_set=[(X_val_array, y_val_log)],
        eval_name=['valid'],
        eval_metric=[MSE],
        max_epochs=100,
        patience=10,  # Reduced patience for faster stopping
        batch_size=2048,  # Increased from 1024
        virtual_batch_size=256,  # Increased from 128
    )

    # Prepare test data: select the training features by name so the matrix has
    # the same columns in the same order as the one the model was fitted on
    # (a missing feature raises a KeyError here instead of a shape error later).
    X_test = test_enc.drop(columns=[target_col], errors='ignore')
    X_test = X_test[X.columns]

    # Make predictions and transform them back from log space
    y_test_pred_log = model.predict(X_test.values)
    y_test_pred = np.expm1(y_test_pred_log.squeeze())

    # RMSLE on the validation split. Targets and predictions are both in log1p
    # space already, so the RMSE of the log values is the RMSLE.
    y_val_pred_log = model.predict(X_val_array)
    val_rmsle = np.sqrt(mean_squared_error(y_val_log, y_val_pred_log))
    print(f"Final Validation RMSLE: {val_rmsle:.4f}")

    # Feature importance
    feat_importances = pd.DataFrame(
        {'feature': X_train.columns, 'importance': model.feature_importances_}
    ).sort_values('importance', ascending=False)

    print("Top 10 important features:")
    print(feat_importances.head(10))

    return model, y_test_pred, feat_importances, label_encoders

In [None]:
# Train TabNet on the normalized frames. The call returns the fitted model, the
# test-set predictions (already transformed back with expm1), the feature-importance
# table and the label encoders fitted for the categorical columns.
tabnet_model, tabnet_predictions, tabnet_feature_importances, label_encoders = train_simplified_tabnet(
    nn_train_df=nn_train_df, 
    nn_test_df=nn_test_df,
    target_col='Calories',
    cat_features=['Sex']
)

In [None]:
# Preview the raw test frame captured before preprocessing; the submission cells
# read their 'id' values from it.
orig_test_df.head()

In [None]:
print("\nGenerating predictions for test set using TabNet...")

# nn_test_df is already normalized, but its categorical columns still hold the raw
# labels. TabNet was trained on integer codes, so encode a copy with the encoders
# returned by train_simplified_tabnet (re-using them guarantees the same mapping
# as during training).
test_df_normalized = nn_test_df.copy()
for cat_col, encoder in label_encoders.items():
    if cat_col in test_df_normalized.columns:
        test_df_normalized[cat_col] = encoder.transform(test_df_normalized[cat_col].astype(str))

# Feed the features in the training column order (every training column except the target)
feature_cols = [col for col in nn_train_df.columns if col != 'Calories']

# Use the trained TabNet model to make predictions (still in log space)
test_predictions_log = tabnet_model.predict(test_df_normalized[feature_cols].values)

# Transform back from log space to original scale
test_predictions = np.expm1(test_predictions_log.squeeze())  # squeeze() removes extra dimension

# Ensure predictions are non-negative
test_predictions = np.maximum(0, test_predictions)

# Create the submission dataframe
submission = pd.DataFrame({
    'id': orig_test_df['id'],
    'Calories': test_predictions
})

# Save to CSV
submission.to_csv('tabnet_submission.csv', index=False)
print(f"TabNet submission file created: tabnet_submission.csv with {submission.shape[0]} rows")

# Display the first few rows
print("\nFirst few rows of the TabNet submission file:")
display(submission.head())

In [None]:
import sys
import xgboost as xgb
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_squared_log_error
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd

print("Training XGBoost with robust RMSLE optimization...")
X = train_df.drop('Calories', axis=1)
y = train_df['Calories']

# Create train/validation split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Identify categorical features
cat_features = ['Sex']

# Apply one-hot encoding for categorical features. The validation frame is
# re-indexed to the training columns so both matrices always share the same
# layout, even if a category were missing from one of the splits.
X_train_encoded = pd.get_dummies(X_train, columns=cat_features, drop_first=True)
X_val_encoded = pd.get_dummies(X_val, columns=cat_features, drop_first=True).reindex(
    columns=X_train_encoded.columns, fill_value=0
)

# Log-transform targets: minimizing squared error on log1p(y) optimizes RMSLE on y
y_train_log = np.log1p(np.maximum(0, y_train))
y_val_log = np.log1p(np.maximum(0, y_val))

# XGBoost configuration - parameters chosen to be similar to the CatBoost setup
xgb_model = XGBRegressor(
    n_estimators=3000,
    learning_rate=0.01,        # Reduced learning rate for stability
    max_depth=8,               # Similar depth as CatBoost
    objective='reg:squarederror',  # MSE objective for log-transformed data
    # The targets are already log-transformed, so plain RMSE on them *is* the
    # RMSLE of the original targets ('rmsle' would take the logarithm a second time).
    eval_metric='rmse',
    # Early stopping is configured on the estimator (not passed to fit()):
    # training stops once the validation RMSE has not improved for 100 rounds.
    early_stopping_rounds=100,
    random_state=42,
    verbosity=1,
    reg_lambda=5,              # L2 regularization similar to l2_leaf_reg
    min_child_weight=10,       # Similar to min_data_in_leaf
    subsample=0.8,             # Add some subsampling for robustness
    colsample_bytree=0.8       # Feature subsampling
)

# Train the model on log-transformed targets; the eval_set drives early stopping.
# verbose=100 logs every 100th round instead of flooding the output with one
# line per boosting round.
eval_set = [(X_val_encoded, y_val_log)]
xgb_model.fit(
    X_train_encoded,
    y_train_log,
    eval_set=eval_set,
    verbose=100,
)

# Make predictions (on log scale) and transform back
val_predictions_log = xgb_model.predict(X_val_encoded)
val_predictions = np.expm1(val_predictions_log)  # expm1 is inverse of log1p

# expm1 can return values slightly below zero for negative log-predictions; clip them
val_predictions = np.maximum(0, val_predictions)

# Calculate RMSLE directly
def rmsle(y_true, y_pred):
    """Root mean squared logarithmic error; negative inputs are clipped to 0 first."""
    y_true = np.maximum(0, y_true)
    y_pred = np.maximum(0, y_pred)
    return np.sqrt(mean_squared_log_error(y_true, y_pred))

# Evaluate the model
val_mse = mean_squared_error(y_val, val_predictions)
val_rmse = np.sqrt(val_mse)
val_rmsle = rmsle(y_val, val_predictions)
val_r2 = r2_score(y_val, val_predictions)

print(f"Validation MSE: {val_mse:.2f}")
print(f"Validation RMSE: {val_rmse:.2f}")
print(f"Validation RMSLE: {val_rmsle:.4f}")  # This is your target metric
print(f"Validation R²: {val_r2:.4f}")

# Feature importance, sorted from most to least important.
# Note: importance_df is a list of (name, importance) tuples, not a DataFrame.
importance = xgb_model.feature_importances_
feature_names = X_train_encoded.columns
importance_df = sorted(zip(feature_names, importance), key=lambda x: x[1], reverse=True)

print("\nFeature Importance:")
# A separate loop variable keeps the `importance` array above from being overwritten
for name, score in importance_df:
    print(f"{name}: {score}")

In [None]:
print("\nGenerating predictions for test set...")

# One-hot encode the test features exactly like the training split
cat_features = ['Sex']
X_test = test_df
X_test_encoded = pd.get_dummies(X_test, columns=cat_features, drop_first=True)

# Column layout the model was fitted on
train_columns = X_train_encoded.columns

# Any dummy column the test frame lacks is added as an all-zero column ...
missing_cols = set(train_columns) - set(X_test_encoded.columns)
for col in missing_cols:
    X_test_encoded[col] = 0

# ... and the frame is then restricted to the training columns, in training order
X_test_encoded = X_test_encoded[train_columns]

# Predict in log space
test_predictions_log = xgb_model.predict(X_test_encoded)

# Invert the log1p transform and clip at zero so no prediction is negative
test_predictions = np.maximum(0, np.expm1(test_predictions_log))

# Assemble the submission: ids from the raw test frame, predictions on the original scale
submission = pd.DataFrame({'id': orig_test_df['id'], 'Calories': test_predictions})

# Write the file and report its size
submission.to_csv('submission.csv', index=False)
print(f"Submission file created: submission.csv with {submission.shape[0]} rows")

# Show the head of the submission as a sanity check
print("\nFirst few rows of the submission file:")
display(submission.head())

In [None]:
# Submit the XGBoost predictions (submission.csv) to the competition.
# Relies on `os` having been imported by an earlier cell.

# Check if running on Kaggle or locally
# (this probes /kaggle/working, whereas the data-loading cell probes /kaggle/input)
IN_KAGGLE = os.path.exists('/kaggle/working')

if IN_KAGGLE:
    # If running on Kaggle, use the built-in submission mechanism
    print("Running on Kaggle - please use the 'Submit' button in the UI to submit your results")
else:
    # If running locally, use the Kaggle API to submit
    print("Submitting via Kaggle API...")
    
    # Ensure Kaggle API is installed
    try:
        import kaggle
    except ImportError:
        print("Kaggle API not found. Installing...")
        !pip install kaggle
        import kaggle
    
    # Submit the file
    # Note: Make sure you have Kaggle API credentials set up (~/.kaggle/kaggle.json)
    competition_name = "playground-series-s5e5"
    submission_message = "xgboost with feature engineering"
    
    # Command to submit 
    # (the message is embedded in double quotes; it is a fixed string here, but it
    # would need shell escaping if it ever contained quotes or came from user input)
    submission_command = f"kaggle competitions submit -c {competition_name} -f submission.csv -m \"{submission_message}\""
    
    print(f"Running command: {submission_command}")
    # IPython expands {submission_command} before handing the line to the shell
    !{submission_command}
    
    # Check your submissions (optional)
    print("\nYour recent submissions:")
    !kaggle competitions submissions -c {competition_name}
    
print("\nDone! 🎉")

In [None]:
#submit tabnet
# Submits tabnet_submission.csv; apart from the file name and the submission
# message this cell mirrors the XGBoost submission cell.
# Relies on `os` having been imported by an earlier cell.

# Check if running on Kaggle or locally
# (this probes /kaggle/working, whereas the data-loading cell probes /kaggle/input)
IN_KAGGLE = os.path.exists('/kaggle/working')

if IN_KAGGLE:
    # If running on Kaggle, use the built-in submission mechanism
    print("Running on Kaggle - please use the 'Submit' button in the UI to submit your results")
else:
    # If running locally, use the Kaggle API to submit
    print("Submitting via Kaggle API...")
    
    # Ensure Kaggle API is installed
    try:
        import kaggle
    except ImportError:
        print("Kaggle API not found. Installing...")
        !pip install kaggle
        import kaggle
    
    # Submit the file
    # Note: Make sure you have Kaggle API credentials set up (~/.kaggle/kaggle.json)
    competition_name = "playground-series-s5e5"
    submission_message = "pytorch tabnet"
    
    # Command to submit 
    # (the message is embedded in double quotes; it is a fixed string here, but it
    # would need shell escaping if it ever contained quotes or came from user input)
    submission_command = f"kaggle competitions submit -c {competition_name} -f tabnet_submission.csv -m \"{submission_message}\""
    
    print(f"Running command: {submission_command}")
    # IPython expands {submission_command} before handing the line to the shell
    !{submission_command}
    
    # Check your submissions (optional)
    print("\nYour recent submissions:")
    !kaggle competitions submissions -c {competition_name}
    
print("\nDone! 🎉")