# LightGBM Notebook

In this notebook I will train a LightGBM model end to end.

### Description

This is the April 2025 Kaggle competition on predicting podcast listening time.

The goal is to analyze and predict the average listening duration of podcast episodes based on various features.

### Files
1. train.csv
2. test.csv
3. sample_submission.csv

### Evaluation

The evaluation metric is the RMSE.

### Submission File

For each id in the test set, you must predict the number of minutes listened. The file should contain a header and have the following format:

- id,Listening_Time_minutes
- 26570,0.2
- 26571,0.1
- 26572,0.9
- etc.

## Package Importing

In [None]:
# general python libraries
import time
import sys
import datetime
import math
import numpy as np

# dataframe and data manipulation library
import pandas as pd

# visualisation and EDA libraries
import matplotlib.pyplot as  plt
import seaborn as sns

# machine learning libraries
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn import metrics
import lightgbm as lgb
from sklearn.preprocessing import TargetEncoder

## Data Importing

In [None]:
# Location of the raw competition files, relative to this notebook.
folder_path = '../data/raw'

# All three CSVs share the same layout convention: the `id` column is the index.
df_train, df_test, df_sample_submission = (
    pd.read_csv(f'{folder_path}/{file_name}.csv', index_col='id')
    for file_name in ('train', 'test', 'sample_submission')
)

In [None]:
# Show the dtype pandas inferred for each training column on load.
df_train.dtypes

In [None]:
# Name of the column the model is trained to predict.
TARGET_COLUMN = 'Listening_Time_minutes'

# Intended dtype per raw column (string-like columns as 'category', numeric
# columns as 'float64'; Episode_Sentiment is left as plain 'object').
column_dtypes = dict(
    Podcast_Name='category',
    Episode_Title='category',
    Episode_Length_minutes='float64',
    Genre='category',
    Host_Popularity_percentage='float64',
    Publication_Day='category',
    Publication_Time='category',
    Guest_Popularity_percentage='float64',
    Number_of_Ads='float64',
    Episode_Sentiment='object',
    Listening_Time_minutes='float64',
)

## Data Cleaning

In [None]:
def feature_engineering(df):
    """Derive model features from the raw podcast columns.

    Two columns are added and one is removed:

    * ``Episode_Number_categorical`` - the second space-separated token of
      ``Episode_Title`` (``'12'`` for ``'Episode 12'``), stored as a category.
      Titles that are missing or have no second token become NaN.
    * ``is_weekend`` - 1.0 when ``Publication_Day`` is Saturday or Sunday,
      otherwise 0.0.
    * ``Episode_Title`` is dropped.

    The input frame is not modified; a new frame is returned.

    Parameters
    ----------
    df : pd.DataFrame
        Frame containing at least ``Episode_Title`` and ``Publication_Day``.

    Returns
    -------
    pd.DataFrame
        Copy of ``df`` without ``Episode_Title`` and with the two new columns
        appended (in the order listed above).
    """
    # `.str.get(1)` returns NaN for missing titles or single-token titles,
    # where indexing the list inside `.apply` would raise.
    episode_number = (
        df['Episode_Title']
            .str.split(' ')  # 'Episode 12' -> ['Episode', '12']
            .str.get(1)
            .astype('category')
    )

    is_weekend = df['Publication_Day'].isin(['Saturday', 'Sunday']).astype('float64')

    # drop() + assign() both return new frames, so the caller's frame is never
    # written to (no SettingWithCopyWarning, no hidden side effects).
    return (
        df
            .drop(columns='Episode_Title')
            .assign(
                Episode_Number_categorical=episode_number,
                is_weekend=is_weekend,
            )
    )

In [None]:
def preprocessing(X: pd.DataFrame) -> pd.DataFrame:
    """Cast the categorical columns to ``category`` and run feature engineering.

    Parameters
    ----------
    X : pd.DataFrame
        Raw feature frame. Must contain the columns listed in ``CAT_COLUMNS``
        plus whatever ``feature_engineering`` reads.

    Returns
    -------
    pd.DataFrame
        A new frame; ``X`` itself is left unchanged.
    """
    CAT_COLUMNS = ["Genre","Publication_Day","Episode_Sentiment","Publication_Time","Podcast_Name"]

    # astype() with a mapping returns a new frame. Assigning into X directly
    # would write into the caller's object (frequently a slice of df_train or
    # df_test itself), which is what produced SettingWithCopyWarning.
    X = X.astype({column_name: 'category' for column_name in CAT_COLUMNS})

    return feature_engineering(X)



In [None]:
def target_encoding(X_train, y_train, X_test):
    """Append target-encoded versions of the categorical columns.

    For every column in ``categorical_columns`` a new ``<name>_TE`` column is
    added to both frames. The original categorical columns are kept.

    Parameters
    ----------
    X_train : pd.DataFrame
        Training features; the encoder is fitted on these rows.
    y_train : array-like
        Target values aligned with ``X_train``.
    X_test : pd.DataFrame
        Held-out features, encoded with the encoder learned on ``X_train``.

    Returns
    -------
    tuple[pd.DataFrame, pd.DataFrame]
        New ``(X_train, X_test)`` frames with the ``_TE`` columns appended.
        The frames passed in are not modified.
    """
    categorical_columns = ["Genre","Publication_Day","Episode_Sentiment","Publication_Time","Podcast_Name"]
    categorical_encoded_columns = [column_name + '_TE' for column_name in categorical_columns]

    # Work on copies so the caller's frames are never mutated.
    X_train = X_train.copy()
    X_test = X_test.copy()

    encoder = TargetEncoder(categories='auto', smooth='auto', cv=5, random_state=42)

    # fit_transform() is required for the training rows: it applies the
    # encoder's internal cross-fitting (cv=5), so each row is encoded from
    # folds that do not contain its own target. fit() followed by transform()
    # encodes the training rows with statistics that include their own
    # target, which leaks the label into the features.
    X_train[categorical_encoded_columns] = encoder.fit_transform(X_train[categorical_columns], y_train)

    # Unseen rows use the encodings learned from the full training data.
    X_test[categorical_encoded_columns] = encoder.transform(X_test[categorical_columns])

    # TODO: interaction features (e.g. Publication_Day x Publication_Time) were
    # prototyped here previously and could be target-encoded the same way.

    return X_train, X_test

In [None]:
def postprocessing(X:pd.DataFrame):
    """Final column-pruning hook applied just before the model sees the data.

    No columns are currently listed for removal, so the result is a new frame
    holding the same columns as ``X``.
    """
    return X.drop(columns=[])

## Model fitting

### Train Test Split

Splitting the data into K folds so the model can be fitted and validated on each fold in turn.

In [None]:
from sklearn.model_selection import KFold, cross_validate, cross_val_score
from sklearn.metrics import root_mean_squared_error

NUMBER_OF_SPLITS = 4

outer_kfold = KFold(n_splits=NUMBER_OF_SPLITS)

# Identical for every fold, so defined once. Kept at notebook level because
# the final-fit and MLflow cells below reuse `hyperparameters`.
hyperparameters = {
    "max_depth": -1,
    "num_leaves": 1024,
    "colsample_bytree": 0.7,
    "learning_rate": 0.03,
    "max_bin": 1024,
    "verbosity":0
}

# Split features / target once instead of re-slicing df_train in every fold.
features_all = df_train.loc[:, df_train.columns != TARGET_COLUMN]
target_all = df_train[TARGET_COLUMN]

list_train_rmse = []
list_test_rmse = []

for fold_number, (infold_training_indices, infold_test_indices) in enumerate(outer_kfold.split(df_train), 1):

    # KFold.split yields *positional* indices, while df_train is indexed by
    # the `id` label. `.iloc` is therefore the correct accessor; `.loc` only
    # gives the same rows when the ids happen to be exactly 0..n-1.
    X_train = preprocessing(features_all.iloc[infold_training_indices])
    y_train = target_all.iloc[infold_training_indices]

    # Held-out part of this fold, used for in-fold validation.
    X_test = preprocessing(features_all.iloc[infold_test_indices])
    y_test = target_all.iloc[infold_test_indices]

    # The encoder is fitted on the training part of the fold only.
    X_train, X_test = target_encoding(X_train=X_train, X_test=X_test, y_train=y_train)

    X_train = postprocessing(X_train)
    X_test = postprocessing(X_test)

    model = lgb.LGBMRegressor(
        **hyperparameters
    )

    # NOTE: early stopping monitors the same held-out rows that are scored
    # below, so the reported test RMSE is slightly optimistic.
    model.fit(
        X=X_train,
        y=y_train,
        eval_set=[(X_test,y_test)],
        callbacks=[lgb.early_stopping(stopping_rounds=25,verbose=False)]
    )

    y_train_preds = model.predict(X_train)
    train_rmse = root_mean_squared_error(y_true=y_train,y_pred=y_train_preds)
    list_train_rmse.append(train_rmse)

    y_test_preds = model.predict(X_test)
    test_rmse = root_mean_squared_error(y_true=y_test,y_pred=y_test_preds)
    list_test_rmse.append(test_rmse)

    print(f'--- Fold {fold_number} Completed ---')
    print('train_rmse, test_rmse - ',train_rmse,test_rmse)

print('--- Training_Completed ---')
# The value printed is the plain (positive) mean RMSE over the folds.
print('The average test cross neg_root_mean_squared_error is ', sum(list_test_rmse)/len(list_test_rmse))

In [None]:
# Mean of the per-fold held-out RMSE values collected by the CV loop.
average_test_rmse = sum(list_test_rmse) / len(list_test_rmse)
print('The average cross neg_root_mean_squared_error is ', average_test_rmse)

In [None]:
# Training on entire dataset
X_train = df_train.loc[:,df_train.columns != TARGET_COLUMN]
y_train = df_train.loc[:,TARGET_COLUMN]

X_train = preprocessing(X_train)

# Pre-processing of training data in kfold for in-fold validation
X_test = df_test

X_test = preprocessing(X_test)

X_train, X_test = target_encoding(X_train=X_train, X_test=X_test, y_train=y_train)

model = lgb.LGBMRegressor(
    **hyperparameters
)

model.fit(X_train,y_train)

In [None]:
# CHECKLIST BEFORE RUNNING
# 1. is this a new run (start_run run_id empty) or are you inserting into an old run (start run populated)
# 2. Do you know the kaggle leaderboard metric? If not set to 999
# 3. Is this a leaderboard model? If not then disable the model logging at the end
# NOTE: this cell originally took about 2 minutes to run; logging only a few
# example rows (see INPUT_EXAMPLE_ROWS) should shorten that - TODO confirm.

import mlflow
from mlflow.models import infer_signature

# Public leaderboard score for this submission (checklist item 2).
KAGGLE_LEADERBOARD_SCORE = 13.10705

# Number of training rows stored with the model as its input example.
INPUT_EXAMPLE_ROWS = 5

# Set our tracking server uri for logging
mlflow.set_tracking_uri("http://localhost:5000")

# Create a new MLflow Experiment
mlflow.set_experiment("Kaggle S5E4")

# Start an MLflow run
with mlflow.start_run():

    # Log the hyperparameters
    mlflow.log_params(hyperparameters)

    # Log the loss metric (mean held-out RMSE from the CV loop; the metric
    # name is kept as-is so runs remain comparable with earlier ones)
    mlflow.log_metric("cv_neg_root_mean_squared_error", sum(list_test_rmse)/len(list_test_rmse))
    mlflow.log_metric("kaggle leaderboard", KAGGLE_LEADERBOARD_SCORE)

    # Infer the model signature
    signature = infer_signature(
        model_input=X_train,
        model_output=y_train,
    )

    # Log the model. The input example is only meant to illustrate valid
    # model input, so a handful of rows is passed rather than the whole
    # training frame.
    model_info = mlflow.sklearn.log_model(
        sk_model=model,
        artifact_path="model",
        signature=signature,
        input_example=X_train.head(INPUT_EXAMPLE_ROWS),
    )

## Test Set Prediction

In [None]:
# Predict listening time for the preprocessed competition test features
# using the model fitted on the full training data.
y_preds = model.predict(X_test)

In [None]:
# Peek at the expected submission layout (first rows only, not the whole frame).
df_sample_submission.head()

In [None]:
import datetime

# Pieces that make up the submission file name: run date, model class, free-text note.
date = datetime.datetime.now().strftime('%Y-%m-%d')
model_type = model.__class__.__name__
comment = 'converted_episode_number_to_a_categorical_variable'

In [None]:
# Start from the sample submission (keeps its id index) and overwrite the
# target column with the model predictions, leaving the sample frame untouched.
df_submission = df_sample_submission.assign(Listening_Time_minutes=y_preds)

In [None]:
# Save the submission as <date>-<model class>-<comment>.csv in the submissions folder.
submission_path = f'../submissions/{date}-{model_type}-{comment}.csv'
df_submission.to_csv(submission_path)