# LightGBM Regression Notebook

In this notebook I will train a very simple model end to end.

### Description

This is the May 2025 calories prediction competition.

### Files
1. train.csv
2. test.csv
3. sample_submission.csv

### Evaluation

The evaluation metric is the Root Mean Squared Logarithmic Error (RMSLE).

Submission File
For each id in the test set, you must predict the value of the `Calories` target. The file should contain a header and have the following format:

- id,Calories
- 750000,88.283
- 750001,88.283
- 750002,88.283
- etc.

## Package Importing

In [None]:
# general python libraries
import time
import sys
import datetime
import math
import numpy as np

# dataframe and data manipulation library
import pandas as pd

# visualisation and EDA libraries
import matplotlib.pyplot as  plt
import seaborn as sns

# machine learning libraries
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn import metrics
import lightgbm as lgb
from sklearn.preprocessing import TargetEncoder

import optuna
from scripts.data_processing import preprocessing, postprocessing

## Data Importing

In [17]:
# Name of the column the model is trained to predict; every other column
# of the training frame is treated as a feature.
TARGET_COLUMN: str = "Calories"

In [2]:
# Directory holding the raw competition CSV files (relative to this notebook).
folder_path = '../data/raw'

# Read the three competition files in a fixed order; each one is indexed by
# its `id` column.
df_train, df_test, df_sample_submission = (
    pd.read_csv(f'{folder_path}/{file_stem}.csv', index_col='id')
    for file_stem in ('train', 'test', 'sample_submission')
)

## Data Cleaning

In [None]:
# Target series: the single column named by TARGET_COLUMN.
y_train = df_train[TARGET_COLUMN]

# Feature frame: every column except the target, run through the project's
# preprocessing step and then its postprocessing step.
X_train = postprocessing(
    preprocessing(df_train.loc[:, df_train.columns != TARGET_COLUMN])
)



## Model fitting

### Train Test Split

Splitting the training data into K folds so the model can be fitted and scored with cross-validation.

In [48]:
from sklearn.model_selection import KFold, cross_validate, cross_val_score
from sklearn.metrics import root_mean_squared_log_error

# Number of cross-validation folds.
NUMBER_OF_SPLITS = 4

outer_kfold = KFold(n_splits=NUMBER_OF_SPLITS)
# NOTE(review): this encoder is created but never used in this cell --
# confirm whether it is still needed.
encoder = TargetEncoder(categories='auto', smooth='auto', cv=5, random_state=42)

# LightGBM settings shared by every fold. Defined once, outside the loop,
# because they do not change between folds; later cells reuse this dict.
hyperparameters = {
    "max_depth": -1,
    "num_leaves": 1024,
    "colsample_bytree": 0.7,
    "learning_rate": 0.03,
    "max_bin": 1024,
    "verbosity": 0
}

# Per-fold RMSLE on the rows used for fitting and on the held-out rows.
list_train_rmse = []
list_test_rmse = []

# Boolean mask selecting every column except the target.
feature_column_mask = df_train.columns != TARGET_COLUMN

for fold_number, (infold_training_indices, infold_test_indices) in enumerate(outer_kfold.split(df_train), 1):

    # KFold.split yields *positional* row numbers, so rows are selected with
    # .iloc. The frame is indexed by `id`; label-based .loc would only pick
    # the right rows if the ids happened to be exactly 0..n-1 in file order.

    # Rows used to fit the model in this fold.
    X_train = df_train.iloc[infold_training_indices, feature_column_mask]
    y_train = df_train[TARGET_COLUMN].iloc[infold_training_indices]

    X_train = preprocessing(X_train)
    X_train = postprocessing(X_train)

    # Held-out rows used to score the model in this fold.
    X_test = df_train.iloc[infold_test_indices, feature_column_mask]
    y_test = df_train[TARGET_COLUMN].iloc[infold_test_indices]

    X_test = preprocessing(X_test)
    X_test = postprocessing(X_test)

    model = lgb.LGBMRegressor(**hyperparameters)

    # The held-out fold is passed as eval_set; early stopping is currently
    # disabled (the callback below is commented out).
    model.fit(
        X=X_train,
        y=y_train,
        eval_set=[(X_test, y_test)],
        # callbacks=[lgb.early_stopping(stopping_rounds=25,verbose=False)]
    )

    # Score the fold with RMSLE on both the fitting rows and the held-out rows.
    y_train_preds = model.predict(X_train)
    train_rmse = root_mean_squared_log_error(y_true=y_train, y_pred=y_train_preds)
    list_train_rmse.append(train_rmse)

    y_test_preds = model.predict(X_test)
    test_rmse = root_mean_squared_log_error(y_true=y_test, y_pred=y_test_preds)
    list_test_rmse.append(test_rmse)

    print(f'--- Fold {fold_number} Completed ---')
    print('train_rmse, test_rmse - ',train_rmse,test_rmse)

print('--- Training_Completed ---')
# The value printed here is the plain mean of the per-fold RMSLE scores
# (it is not negated, despite the wording of the label).
print('The average test cross neg_root_mean_squared_error is ', sum(list_test_rmse)/len(list_test_rmse))

--- Fold 1 Completed ---
train_rmse, test_rmse -  0.16796673299204476 0.16874951371008445
--- Fold 2 Completed ---
train_rmse, test_rmse -  0.16801194196200547 0.16841133419459423
--- Fold 3 Completed ---
train_rmse, test_rmse -  0.16779847260368308 0.16980278352793612
--- Fold 4 Completed ---
train_rmse, test_rmse -  0.1680743483153584 0.16785474329836667
--- Training_Completed ---
The average test cross neg_root_mean_squared_error is  0.16870459368274537


In [36]:
# Mean of the per-fold held-out scores collected by the cross-validation loop.
average_test_score = sum(list_test_rmse) / len(list_test_rmse)
print('The average test cross neg_root_mean_squared_error is ', average_test_score)

The average test cross neg_root_mean_squared_error is  0.16870459368274537


In [47]:
# Refit on the entire training set, then prepare the competition test set
# for prediction.
X_train = df_train.loc[:, df_train.columns != TARGET_COLUMN]
y_train = df_train.loc[:, TARGET_COLUMN]

X_train = preprocessing(X_train)
X_train = postprocessing(X_train)

# Run the competition test features through the same two processing steps.
# A copy is used so the raw df_test is never aliased: if the helpers modify
# their argument in place, re-running this cell still starts from raw data.
X_test = df_test.copy()

X_test = preprocessing(X_test)
X_test = postprocessing(X_test)

# Uses the `hyperparameters` dict defined in the cross-validation cell.
model = lgb.LGBMRegressor(
    **hyperparameters
)

model.fit(X_train, y_train)

# Test Set Prediction

In [38]:
# Generate predictions from the fitted model for the rows currently held in X_test.
y_preds = model.predict(X_test)

In [39]:
# Display the sample submission frame to check the expected layout (id index, one target column).
df_sample_submission 

Unnamed: 0_level_0,Calories
id,Unnamed: 1_level_1
750000,88.283
750001,88.283
750002,88.283
750003,88.283
750004,88.283
...,...
999995,88.283
999996,88.283
999997,88.283
999998,88.283


In [43]:
import datetime

# Today's local date as YYYY-MM-DD; used as the prefix of the submission file name.
date = datetime.date.today().isoformat()

# Class name of the fitted estimator.
model_type = model.__class__.__name__

# Free-text tag describing this submission.
comment = 'initial_lgm_model_with_all_columns'

In [44]:
# Copy the sample submission and overwrite its target column with the
# model's predictions (assigned positionally).
df_submission = df_sample_submission.assign(**{TARGET_COLUMN: y_preds})

In [45]:
# File name is built from the run date, the estimator class name and the
# free-text comment defined in the cell above.
submission_path = f'../submissions/{date}-{model_type}-{comment}.csv'

# Write the submission (index included) to the submissions folder.
df_submission.to_csv(submission_path)

In [33]:
# CHECKLIST BEFORE RUNNING
# 1. Is this a new run (start_run run_id empty) or are you inserting into an old run (start_run populated)?
# 2. Do you know the Kaggle leaderboard metric? If not, set KAGGLE_LEADERBOARD_SCORE to 999.
# 3. Is this a leaderboard model? If not, disable the model logging at the end.
# NOTE: this cell was reported to take about 2 minutes when the full training
# frame was logged as the input example.

import mlflow
from mlflow.models import infer_signature

# Public leaderboard score of the matching submission (999 when unknown).
KAGGLE_LEADERBOARD_SCORE = 0.16078

# Number of leading training rows stored alongside the model as its input
# example. An input example only needs a few representative rows; logging
# the whole training set would serialise all of it into the run's artifacts.
INPUT_EXAMPLE_ROWS = 5

# Set our tracking server uri for logging
mlflow.set_tracking_uri("http://localhost:5000")

# Select (or create) the MLflow experiment for this competition
mlflow.set_experiment("Kaggle S5E5")

# Start an MLflow run
with mlflow.start_run():

    # Log the hyperparameters
    mlflow.log_params(hyperparameters)

    # Log the mean cross-validation score and the leaderboard score
    mlflow.log_metric("cv_score", sum(list_test_rmse)/len(list_test_rmse))
    mlflow.log_metric("kaggle leaderboard", KAGGLE_LEADERBOARD_SCORE)

    # Infer the model signature from the training features and target
    signature = infer_signature(
        model_input=X_train,
        model_output=y_train,
    )

    # Log the model with a small slice of the training features as its example input
    model_info = mlflow.sklearn.log_model(
        sk_model=model,
        artifact_path="model",
        signature=signature,
        input_example=X_train[:INPUT_EXAMPLE_ROWS],
    )



🏃 View run useful-grub-263 at: http://localhost:5000/#/experiments/3/runs/c83d144de8724a6eb6c67760a7d94569
🧪 View experiment at: http://localhost:5000/#/experiments/3
