# Linear Regression Notebook

In this notebook I will train a very simple model end to end.

### Description

This is the May 2025 calories prediction competition.

### Files
1. train.csv
2. test.csv
3. sample_submission.csv

### Evaluation

The evaluation metric is the RMSLE.

Submission File
For each id in the test set, you must predict the number of minutes listened. The file should contain a header and have the following format:

- id,Listening_Time_minutes
- 26570,0.2
- 26571,0.1
- 26572,0.9
- etc.

## Package Importing

In [74]:
# general python libraries
import numpy as np

# dataframe and data manipulation library
import pandas as pd

# machine learning libraries
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.metrics import root_mean_squared_log_error
import optuna

from scripts.data_processing import preprocessing, postprocessing

## Data Importing

In [2]:
TARGET_COLUMN = 'Calories'

In [3]:
folder_path = '../data/raw'
df_train = pd.read_csv(f'{folder_path}/train.csv', index_col='id')
df_test = pd.read_csv(f'{folder_path}/test.csv', index_col='id')
df_sample_submission = pd.read_csv(f'{folder_path}/sample_submission.csv',index_col='id')

## Data Cleaning

In [4]:
X_train = df_train.loc[:,df_train.columns != TARGET_COLUMN]
y_train = df_train.loc[:,TARGET_COLUMN]

X_train = preprocessing(X_train)
X_train = postprocessing(X_train)



## Model fitting

### Train Test Split

Splitting data into groupings for model fitting

In [113]:
from xgboost.callback import EarlyStopping
from optuna.integration import XGBoostPruningCallback

def objective(trial: optuna.Trial, X = X_train, y = y_train) -> int:

    train_x, test_x, train_y, test_y = train_test_split(X, y, test_size=0.2,random_state=42)
    
    param = {
        'tree_method':'hist',
        'objective':'reg:squaredlogerror',  # squared log error objective
        'eval_metric':'rmsle',  
        'lambda': trial.suggest_float('lambda', 1e-3, 10.0, log=True),
        'alpha': trial.suggest_float('alpha', 1e-3, 10.0, log=True),
        'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.3,0.4,0.5,0.6,0.7,0.8,0.9, 1.0]),
        'subsample': trial.suggest_categorical('subsample', [0.4,0.5,0.6,0.7,0.8,1.0]),
        'learning_rate': trial.suggest_categorical('learning_rate', [0.008,0.01,0.012,0.014,0.016,0.018, 0.02]),
        'n_estimators': 10000,
        'max_depth': trial.suggest_categorical('max_depth', [5,7,9,11,13,15,17]),
        'random_state': 42,
        'min_child_weight': trial.suggest_float('min_child_weight', 1, 10, log=True),
    }

    callbacks = [
        XGBoostPruningCallback(trial, observation_key="validation_0-rmsle"),
        EarlyStopping(rounds=50, min_delta=2e-4, save_best=True, maximize=False),
    ]

    model = xgb.XGBRegressor(
        **param,
        callbacks=callbacks
    )  


    model.fit(
        train_x, train_y,
        eval_set=[(test_x, test_y)],
        verbose=True
    )

    
    preds = model.predict(test_x)

    rmsle = root_mean_squared_log_error(test_y, preds)

    return rmsle

In [None]:
pruner = optuna.pruners.PercentilePruner(
    percentile=30.0,
    n_startup_trials=5,
    n_warmup_steps=800,
    interval_steps=500,
    n_min_trials=5
)
study = optuna.create_study(direction="minimize", pruner=pruner)

[I 2025-05-01 20:47:45,455] A new study created in memory with name: no-name-e9cdc3b6-ed8d-4629-ba12-264495a27c07


In [115]:
study.optimize(
    objective, 
    n_trials=50, 
    show_progress_bar=True
)
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)

  0%|          | 0/50 [00:00<?, ?it/s]

[0]	validation_0-rmsle:1.00798
[1]	validation_0-rmsle:0.98904
[2]	validation_0-rmsle:0.96929
[3]	validation_0-rmsle:0.94970
[4]	validation_0-rmsle:0.93101
[5]	validation_0-rmsle:0.91321
[6]	validation_0-rmsle:0.89342
[7]	validation_0-rmsle:0.87475
[8]	validation_0-rmsle:0.85482
[9]	validation_0-rmsle:0.83356
[10]	validation_0-rmsle:0.81626
[11]	validation_0-rmsle:0.79689
[12]	validation_0-rmsle:0.77728
[13]	validation_0-rmsle:0.75718
[14]	validation_0-rmsle:0.73533
[15]	validation_0-rmsle:0.73261
[16]	validation_0-rmsle:0.71311
[17]	validation_0-rmsle:0.69345
[18]	validation_0-rmsle:0.67342
[19]	validation_0-rmsle:0.65549
[20]	validation_0-rmsle:0.64189
[21]	validation_0-rmsle:0.62309
[22]	validation_0-rmsle:0.61068
[23]	validation_0-rmsle:0.59292
[24]	validation_0-rmsle:0.57417
[25]	validation_0-rmsle:0.55800
[26]	validation_0-rmsle:0.54747
[27]	validation_0-rmsle:0.53314
[28]	validation_0-rmsle:0.52380
[29]	validation_0-rmsle:0.50925
[30]	validation_0-rmsle:0.49711
[31]	validation_0-

# Optuna Visualisations

In [116]:
optuna.visualization.plot_optimization_history(study)

In [117]:
optuna.visualization.plot_param_importances(study)

In [119]:
optuna.visualization.plot_slice(study)

In [37]:
from optuna.integration import XGBoostPruningCallback

In [120]:
study.best_trial.params

{'lambda': 0.003202392566928705,
 'alpha': 0.007312397635381366,
 'colsample_bytree': 0.6,
 'subsample': 0.8,
 'learning_rate': 0.016,
 'max_depth': 5,
 'min_child_weight': 1.0063799781302378}