### Description

This is the May 2025 calories prediction competition.

### Files
1. train.csv
2. test.csv
3. sample_submission.csv

### Evaluation

The evaluation metric is the RMSLE.

Submission File
For each id in the test set, you must predict the number of calories burned (the `Calories` column). The file should contain a header and have the following format:

- id,Calories
- 750000,88.283
- 750001,88.283
- 750002,88.283
- etc.

## Package Importing

In [22]:
# general python libraries
import time
import sys
import datetime
import math
import numpy as np

# dataframe and data manipulation library
import pandas as pd

# visualisation and EDA libraries
import matplotlib.pyplot as  plt
import seaborn as sns

# machine learning libraries
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn import metrics
import lightgbm as lgb
from sklearn.preprocessing import TargetEncoder

## Data Importing

In [23]:
# Name of the label column in train.csv that the model learns to predict.
TARGET_COLUMN = 'Calories'

In [24]:
# Load the three competition files from the raw-data folder, using `id` as the index.
folder_path = '../data/raw'
csv_paths = (
    f'{folder_path}/train.csv',
    f'{folder_path}/test.csv',
    f'{folder_path}/sample_submission.csv',
)
df_train, df_test, df_sample_submission = (
    pd.read_csv(csv_path, index_col='id') for csv_path in csv_paths
)

In [25]:
# Preview the training data (features plus the Calories target), indexed by id.
df_train

Unnamed: 0_level_0,Sex,Age,Height,Weight,Duration,Heart_Rate,Body_Temp,Calories
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,male,36,189.0,82.0,26.0,101.0,41.0,150.0
1,female,64,163.0,60.0,8.0,85.0,39.7,34.0
2,female,51,161.0,64.0,7.0,84.0,39.8,29.0
3,male,20,192.0,90.0,25.0,105.0,40.7,140.0
4,female,38,166.0,61.0,25.0,102.0,40.6,146.0
...,...,...,...,...,...,...,...,...
749995,male,28,193.0,97.0,30.0,114.0,40.9,230.0
749996,female,64,165.0,63.0,18.0,92.0,40.5,96.0
749997,male,60,162.0,67.0,29.0,113.0,40.9,221.0
749998,male,45,182.0,91.0,17.0,102.0,40.3,109.0


In [26]:
# NOTE(review): this repeats the preview from the previous cell; probably safe to delete.
df_train

Unnamed: 0_level_0,Sex,Age,Height,Weight,Duration,Heart_Rate,Body_Temp,Calories
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,male,36,189.0,82.0,26.0,101.0,41.0,150.0
1,female,64,163.0,60.0,8.0,85.0,39.7,34.0
2,female,51,161.0,64.0,7.0,84.0,39.8,29.0
3,male,20,192.0,90.0,25.0,105.0,40.7,140.0
4,female,38,166.0,61.0,25.0,102.0,40.6,146.0
...,...,...,...,...,...,...,...,...
749995,male,28,193.0,97.0,30.0,114.0,40.9,230.0
749996,female,64,165.0,63.0,18.0,92.0,40.5,96.0
749997,male,60,162.0,67.0,29.0,113.0,40.9,221.0
749998,male,45,182.0,91.0,17.0,102.0,40.3,109.0


In [27]:
# Column dtypes: Sex is the only non-numeric (object) column.
df_train.dtypes

Sex            object
Age             int64
Height        float64
Weight        float64
Duration      float64
Heart_Rate    float64
Body_Temp     float64
Calories      float64
dtype: object

## Data Cleaning

In [28]:
def feature_engineering(df):
    """Return a new frame with ``Sex`` replaced by a binary ``is_female`` flag.

    The input frame is left untouched. Assigning the new column directly onto
    ``df`` would modify the caller's frame (e.g. the raw test data) and raise
    ``SettingWithCopyWarning`` whenever ``df`` is a slice of another frame.

    Parameters
    ----------
    df : pd.DataFrame
        Frame containing a ``Sex`` column holding ``'male'`` / ``'female'``.

    Returns
    -------
    pd.DataFrame
        New frame without ``Sex`` and with ``is_female`` appended
        (0 for ``'male'``, 1 for ``'female'``, NaN for any other value).

    Raises
    ------
    KeyError
        If ``df`` has no ``Sex`` column.
    """
    # Encode sex as binary flag
    gender_mapping = {
        'male': 0,
        'female': 1
    }

    # assign() builds a new frame, so the caller's object is never modified.
    return (
        df
        .assign(is_female=df['Sex'].map(gender_mapping))
        .drop(columns=['Sex'])
    )

In [29]:
def preprocessing(X: pd.DataFrame):
    """Run the feature-engineering step on ``X`` and return its result."""
    engineered = feature_engineering(X)
    return engineered



In [30]:
def target_encoding(X_train, y_train, X_test, categorical_columns=None):
    """Append target-encoded (``<column>_TE``) versions of categorical columns.

    Parameters
    ----------
    X_train : pd.DataFrame
        Training features; must contain every column in ``categorical_columns``.
    y_train : array-like
        Training target used to fit the encoder.
    X_test : pd.DataFrame
        Features to encode with the encoder fitted on the training data.
    categorical_columns : list of str, optional
        Columns to encode. When omitted, the legacy default list is used.
        NOTE(review): that list names podcast-style columns that do not appear
        in the calories data previewed in this notebook; pass the columns
        explicitly before calling this here.

    Returns
    -------
    (pd.DataFrame, pd.DataFrame)
        Copies of ``X_train`` and ``X_test`` with the ``*_TE`` columns added.
        The input frames are not modified.

    Raises
    ------
    KeyError
        If a requested column is missing from either frame.
    """
    if categorical_columns is None:
        # Legacy default kept so existing callers behave as before.
        categorical_columns = ["Genre","Publication_Day","Episode_Sentiment","Publication_Time","Podcast_Name"]
    categorical_columns = list(categorical_columns)

    missing_columns = [
        column_name
        for column_name in categorical_columns
        if column_name not in X_train.columns or column_name not in X_test.columns
    ]
    if missing_columns:
        raise KeyError(f"Columns not found for target encoding: {missing_columns}")

    # Work on copies so the caller's frames are never mutated.
    X_train = X_train.copy()
    X_test = X_test.copy()

    if not categorical_columns:
        return X_train, X_test

    categorical_encoded_columns = [column_name + '_TE' for column_name in categorical_columns]

    encoder = TargetEncoder(categories='auto', smooth='auto', cv=5, random_state=42)

    # fit_transform() applies the encoder's internal cross-fitting to the
    # training rows. fit() followed by transform() on the same rows would encode
    # each training row with statistics that include its own target (leakage).
    X_train[categorical_encoded_columns] = encoder.fit_transform(X_train[categorical_columns], y_train)
    X_test[categorical_encoded_columns] = encoder.transform(X_test[categorical_columns])

    return X_train, X_test

In [31]:
def postprocessing(X:pd.DataFrame):
    """Drop the columns that should not reach the model and return the result.

    The exclusion list is currently empty, so the returned frame holds the same
    data as ``X`` (as a new object produced by ``DataFrame.drop``).
    """
    # Candidates that are deliberately left in for now: 'Age', 'Height',
    # 'Weight', 'Duration', 'Heart_Rate', 'Body_Temp'. 'Sex' is already removed
    # during feature engineering and 'Calories' is the target variable.
    excluded_columns = []

    return X.drop(columns=excluded_columns)

## Model fitting

### Train Test Split

Splitting data into groupings for model fitting

In [32]:
from sklearn.metrics import root_mean_squared_log_error

def get_rmsle(preds,eval_data):
    """Custom LightGBM evaluation metric: root mean squared logarithmic error.

    LightGBM's scikit-learn API calls a custom ``eval_metric`` positionally as
    ``func(y_true, y_pred)``, so despite the parameter names the first argument
    receives the labels and the second the predictions. Clipping only the first
    argument therefore left negative predictions untouched. RMSLE is symmetric
    in its two inputs, so both arrays are floored at zero here and the result
    is correct whichever order the caller uses.

    Parameters
    ----------
    preds : array-like
        First array handed over by the caller (the labels when LightGBM calls it).
    eval_data : array-like
        Second array handed over by the caller (the predictions when LightGBM calls it).

    Returns
    -------
    tuple
        ``('rmsle', value, False)`` - metric name, metric value and the
        ``is_higher_better`` flag, the format LightGBM expects.
    """
    # log1p is undefined below -1 and RMSLE is only meaningful for non-negative
    # values, so floor both inputs at zero.
    first = np.clip(np.asarray(preds, dtype=float), 0, None)
    second = np.clip(np.asarray(eval_data, dtype=float), 0, None)

    # sqrt(mean((log(1 + a) - log(1 + b))^2)), i.e. RMSLE for 1-D, unweighted inputs.
    rmsle = float(np.sqrt(np.mean(np.square(np.log1p(first) - np.log1p(second)))))

    # Metric must have the following format to be accepted
    # eval_name, eval_result, is_higher_better
    return 'rmsle', rmsle, False

In [33]:
from sklearn.model_selection import KFold, cross_validate, cross_val_score
from sklearn.metrics import root_mean_squared_log_error
from xgboost.callback import EarlyStopping
from optuna.integration import XGBoostPruningCallback

NUMBER_OF_SPLITS = 4
CV_RANDOM_STATE = 42

# Seeded so fold membership (and therefore the CV score) is reproducible
# across kernel restarts.
outer_kfold = KFold(n_splits=NUMBER_OF_SPLITS, shuffle=True, random_state=CV_RANDOM_STATE)

list_train_rmse = []
list_test_rmse = []

# The hyperparameters are the same for every fold, so they are defined once
# before the loop. Later cells (final fit, MLflow logging) reuse this dict.
#
# NOTE(review): several keys are LightGBM aliases of each other
# (reg_alpha/lambda_l1, reg_lambda/lambda_l2, subsample/bagging_fraction,
# subsample_freq/bagging_freq, colsample_bytree/feature_fraction). The recorded
# training log shows roughly 56% of rows used for bagging, which matches
# bagging_fraction rather than subsample; the other pairs presumably resolve
# the same way - TODO confirm and drop the shadowed keys.
hyperparameters = {
    'boosting_type': 'gbdt',
    'class_weight': None,
    'colsample_bytree': 0.5,
    'importance_type': 'gain',
    'learning_rate': 0.11994985788993823,
    'max_depth': 6,
    'min_child_samples': 10,
    'min_child_weight': 0.001,
    'min_split_gain': 0.0,
    'n_estimators': 100,
    'n_jobs': None,
    'num_leaves': 64,
    'objective': 'regression_l1',
    'random_state': 64,
    'reg_alpha': 0.1,
    'reg_lambda': 1.0,
    'subsample': 0.9,
    'subsample_for_bin': 200000,
    'subsample_freq': 3,
    'lambda_l1': 0.0016838659225225001,
    'lambda_l2': 6.816048559942313,
    'bagging_freq': 5,
    'bagging_fraction': 0.5600550938420944,
    'feature_fraction': 0.7571066302658794,
    'verbosity': 7,
}

# Split features and target once; each fold then selects its rows by position.
X_all = df_train.drop(columns=[TARGET_COLUMN])
y_all = df_train[TARGET_COLUMN]

for fold_number, (infold_training_indices, infold_test_indices) in enumerate(outer_kfold.split(df_train), 1):

    # KFold.split yields positional indices, so rows are selected with .iloc.
    # (.loc only worked because the id index happens to be 0..N-1.)
    # .copy() gives each fold an independent frame to preprocess.
    X_train = X_all.iloc[infold_training_indices].copy()
    y_train = y_all.iloc[infold_training_indices]

    X_train = preprocessing(X_train)
    X_train = postprocessing(X_train)

    # Held-out part of the training data, used for in-fold validation
    X_test = X_all.iloc[infold_test_indices].copy()
    y_test = y_all.iloc[infold_test_indices]

    X_test = preprocessing(X_test)
    X_test = postprocessing(X_test)

    model = lgb.LGBMRegressor(**hyperparameters)

    model.fit(
        X_train,
        y_train,
        eval_set=[(X_test, y_test)],
        eval_metric=get_rmsle,
    )

    # RMSLE rejects negative values, so predictions are floored at zero -
    # the same treatment the submission receives.
    y_train_preds = np.clip(model.predict(X_train), 0, None)
    train_rmse = root_mean_squared_log_error(y_true=y_train, y_pred=y_train_preds)
    list_train_rmse.append(train_rmse)

    y_test_preds = np.clip(model.predict(X_test), 0, None)
    test_rmse = root_mean_squared_log_error(y_true=y_test, y_pred=y_test_preds)
    list_test_rmse.append(test_rmse)

    print(f'--- Fold {fold_number} Completed ---')
    print('train_rmse, test_rmse - ',train_rmse,test_rmse)

print('--- Training_Completed ---')
# The values collected above are RMSLE scores (lower is better), not a negated RMSE.
print('The average cross-validated test RMSLE is ', sum(list_test_rmse)/len(list_test_rmse))

[LightGBM] [Debug] Dataset::GetMultiBinFromAllFeatures: sparse rate 0.071265
[LightGBM] [Debug] init for col-wise cost 0.000001 seconds, init for row-wise cost 0.004584 seconds
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001384 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Debug] Using Dense Multi-Val Bin
[LightGBM] [Info] Total Bins 360
[LightGBM] [Info] Number of data points in the train set: 562500, number of used features: 7
[LightGBM] [Debug] Use subset for bagging
[LightGBM] [Info] Start training from score 76.000000
[LightGBM] [Debug] Re-bagging, using 314587 data to train
[LightGBM] [Debug] Trained a tree with leaves = 61 and depth = 6
[LightGBM] [Debug] Trained a tree with leaves = 54 and depth = 6
[LightGBM] [Debug] Trained a tree with leaves = 59 and depth = 6
[LightGBM] [Debug] Trained a tree with leaves = 60 and depth = 6
[LightGBM] [

In [34]:
# Mean of the per-fold test scores. They are RMSLE values (lower is better),
# so the label no longer calls them neg_root_mean_squared_error.
print('The average cross-validated test RMSLE is ', sum(list_test_rmse)/len(list_test_rmse))

The average test cross neg_root_mean_squared_error is  0.06587730618542707


In [35]:
# Training on entire dataset
# drop() returns a new frame, so preprocessing never writes into a slice of df_train.
X_train = df_train.drop(columns=[TARGET_COLUMN])
y_train = df_train[TARGET_COLUMN]

X_train = preprocessing(X_train)
X_train = postprocessing(X_train)

# Apply the same preprocessing to the competition test set. Work on a copy so
# the raw df_test loaded from disk is not modified.
X_test = df_test.copy()

X_test = preprocessing(X_test)
X_test = postprocessing(X_test)

# `hyperparameters` is the dict defined in the cross-validation cell above.
model = lgb.LGBMRegressor(
    **hyperparameters,
)

# No eval_set here: every labelled row is used for fitting.
model.fit(
    X_train,
    y_train,
)

[LightGBM] [Debug] Dataset::GetMultiBinFromAllFeatures: sparse rate 0.071025
[LightGBM] [Debug] init for col-wise cost 0.000001 seconds, init for row-wise cost 0.003536 seconds
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001159 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Debug] Using Dense Multi-Val Bin
[LightGBM] [Info] Total Bins 360
[LightGBM] [Info] Number of data points in the train set: 750000, number of used features: 7
[LightGBM] [Debug] Use subset for bagging
[LightGBM] [Info] Start training from score 77.000000
[LightGBM] [Debug] Re-bagging, using 419610 data to train
[LightGBM] [Debug] Trained a tree with leaves = 61 and depth = 6
[LightGBM] [Debug] Trained a tree with leaves = 55 and depth = 6
[LightGBM] [Debug] Trained a tree with leaves = 60 and depth = 6
[LightGBM] [Debug] Trained a tree with leaves = 61 and depth = 6
[LightGBM] [

# Test Set Prediction

In [36]:
# Predict the target for the competition test set with the model fitted on the full training data.
y_preds = model.predict(X_test)



In [37]:
# Preview the sample submission to confirm the expected index and column layout.
df_sample_submission 

Unnamed: 0_level_0,Calories
id,Unnamed: 1_level_1
750000,88.283
750001,88.283
750002,88.283
750003,88.283
750004,88.283
...,...
999995,88.283
999996,88.283
999997,88.283
999998,88.283


In [38]:
import datetime

# Pieces of the submission file name: run date (YYYY-MM-DD), the model's class
# name and a free-text tag describing this run.
date = datetime.date.today().isoformat()
model_type = model.__class__.__name__
comment = 'parameters_from_online_notebook_gender_fixed'

In [39]:
# Start from the sample submission so ids and column layout match, then
# overwrite the target with the model predictions floored at zero.
non_negative_preds = np.clip(y_preds, 0, np.inf)
df_submission = df_sample_submission.copy()
df_submission[TARGET_COLUMN] = non_negative_preds

In [40]:
# Sanity check: after clipping, the smallest submitted prediction must not be negative.
df_submission.Calories.min()

np.float64(0.8894645380584698)

In [41]:
# Save the submission in the submissions folder, named by date, model type and comment
submission_path = f'../submissions/{date}-{model_type}-{comment}.csv'
df_submission.to_csv(submission_path)

In [38]:
# CHECKLIST BEFORE RUNNING
# 1. Is this a new run (start_run run_id empty) or are you inserting into an old run (start_run populated)?
# 2. Do you know the Kaggle leaderboard metric? If not, set it to 999
# 3. Is this a leaderboard model? If not, disable the model logging at the end
# NOTE: this cell originally took about 2 minutes because the whole training
# frame was logged as the input example; a small sample is used now.

import mlflow
from mlflow.models import infer_signature

# Set our tracking server uri for logging
mlflow.set_tracking_uri("http://localhost:5000")

# Create a new MLflow Experiment
mlflow.set_experiment("Kaggle S5E5")

# A handful of rows is enough to document the expected model input. Passing
# all of X_train makes MLflow store every training row with the model.
INPUT_EXAMPLE_ROWS = 5
input_example = X_train.head(INPUT_EXAMPLE_ROWS)

# Start an MLflow run
with mlflow.start_run():

    # Log the hyperparameters
    mlflow.log_params(hyperparameters)

    # Log the loss metrics: mean cross-validation RMSLE and the manually
    # entered public leaderboard score for this submission
    mlflow.log_metric("cv_score", sum(list_test_rmse)/len(list_test_rmse))
    mlflow.log_metric("kaggle leaderboard", 0.16078)

    # Infer the model signature from the training features and target
    signature = infer_signature(
        model_input=X_train,
        model_output=y_train,
    )

    # Log the model
    model_info = mlflow.sklearn.log_model(
        sk_model=model,
        artifact_path="model",
        signature=signature,
        input_example=input_example,
    )

2025/05/05 01:58:56 INFO mlflow.tracking.fluent: Experiment with name 'Kaggle S5E5' does not exist. Creating a new experiment.


🏃 View run agreeable-boar-940 at: http://localhost:5000/#/experiments/4/runs/23f97be3a90a4950848fe4bf77852242
🧪 View experiment at: http://localhost:5000/#/experiments/4


KeyboardInterrupt: 

array([ 27.63488177, 108.07915831,  86.98847049, ...,  64.58244601,
       181.35049496,  76.35856799])