# XGBoost Notebook

In this notebook I will train an XGBoost model end to end.

### Description

This notebook targets the April 2025 podcast listening time prediction competition.

The goal is to analyze and predict the average listening duration of podcast episodes based on various features.

### Files
1. train.csv
2. test.csv
3. sample_submission.csv

### Evaluation

The evaluation metric is the RMSE.

### Submission File
For each id in the test set, you must predict the number of minutes listened. The file should contain a header and have the following format:

- id,Listening_Time_minutes
- 26570,0.2
- 26571,0.1
- 26572,0.9
- etc.

## Package Importing

In [1]:
# general python libraries
import time
import sys
import datetime
import math
import numpy as np

# dataframe and data manipulation library
import pandas as pd

# visualisation and EDA libraries
import matplotlib.pyplot as  plt
import ydata_profiling
import seaborn as sns

# machine learning libraries
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn import metrics
import xgboost as xgb

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import mlflow

# Where runs are recorded and which experiment they are grouped under
MLFLOW_TRACKING_URI = "http://localhost:5000"
MLFLOW_EXPERIMENT_NAME = "testing_mlflow_features_tests"

mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
mlflow.set_experiment(MLFLOW_EXPERIMENT_NAME)

# Turn on automatic logging for both the scikit-learn and the XGBoost integrations;
# XGBoost models are stored in JSON format.
mlflow.sklearn.autolog()
mlflow.xgboost.autolog(model_format="json")

## Data Importing

In [3]:
# Directory holding the raw competition files
folder_path = '../data/raw'

# Read the three competition CSVs in one pass: training set, test set, sample submission
df_train, df_test, sample = (
    pd.read_csv(f'{folder_path}/{csv_name}')
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

In [4]:
# Name of the column to predict; it is separated from the features before model fitting
TARGET_COLUMN = 'Listening_Time_minutes'

## Data Cleaning

### Plan for data cleaning for first model
0.  id
    - drop
1.  Podcast_Name
    - drop
2.  Episode_Title
    - parse out episode number
3.  Episode_Length_minutes
    - impute with mean
4.  Genre
    - drop
5.  Host_Popularity_percentage
    - impute with mean
6.  Publication_Day
    - drop
7.  Publication_Time
    - drop
8.  Guest_Popularity_percentage
    - impute with mean
9.  Number_of_Ads
    - impute one missing value with mean
10.  Episode_Sentiment
    - drop
11.  Listening_Time_minutes - target

In [5]:
def feature_engineering(df):
    """Derive a numeric ``Episode_Number`` feature from ``Episode_Title``.

    The title is split on single spaces and its second token (the ``'12'`` in
    ``'Episode 12'``) is converted to ``float64``. Titles that are missing,
    have no second token, or whose second token is not numeric produce ``NaN``
    instead of raising.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing an ``Episode_Title`` column. It is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame with ``Episode_Number`` added and ``Episode_Title`` removed.
    """
    episode_number = pd.to_numeric(
        df['Episode_Title']
            .str.split(' ')  # 'Episode 12' -> ['Episode', '12']
            .str.get(1),     # second token; NaN when the title is missing or has no second token
        errors='coerce',     # non-numeric tokens become NaN rather than raising
    ).astype('float64')

    # assign() builds a new frame, so the caller's DataFrame is left untouched.
    return df.assign(Episode_Number=episode_number).drop(columns='Episode_Title')

In [6]:
def preprocessing(df):
    """Turn a raw competition frame into the numeric feature matrix used by the model.

    Steps, in order:
      1. Drop the columns the first model does not use.
      2. Replace ``Episode_Title`` with the numeric ``Episode_Number`` feature
         (see ``feature_engineering``).
      3. Fill missing values:
         - ``Number_of_Ads``: median of the column, then cast to ``float64``
         - ``Guest_Popularity_percentage``: median of the column
         - ``Episode_Length_minutes``: constant ``0``

    No other column is imputed here.

    The medians are computed from the frame that is passed in, so a
    training fold, a validation fold and the test set are each filled with
    their own statistics.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame containing the columns listed above, without the target.

    Returns
    -------
    pandas.DataFrame
        New frame; the input frame is not modified by the steps in this function.
    """
    # Drop non-important columns
    columns_to_drop = ['Podcast_Name', 'Genre','Publication_Time','Episode_Sentiment','id', 'Publication_Day']

    df = df.drop(columns=columns_to_drop)

    df = feature_engineering(df)

    # Impute missing values with the column median
    median_number_of_ads = df['Number_of_Ads'].median()
    df['Number_of_Ads'] = df['Number_of_Ads'].fillna(median_number_of_ads)
    df['Number_of_Ads'] = df['Number_of_Ads'].astype('float64')

    median_guest_popularity = df['Guest_Popularity_percentage'].median()
    df['Guest_Popularity_percentage'] = df['Guest_Popularity_percentage'].fillna(median_guest_popularity)

    # Missing episode lengths are filled with 0 (not with the mean)
    df['Episode_Length_minutes'] = df['Episode_Length_minutes'].fillna(0)

    return df



## Model fitting

### Train Test Split

Split the training data into K folds so that each model is fitted on K-1 folds and validated on the remaining one.

In [7]:
from sklearn.model_selection import KFold, cross_validate
from sklearn.metrics import root_mean_squared_error

NUMBER_OF_SPLITS = 5

kfold_splits = KFold(n_splits=NUMBER_OF_SPLITS)

# All folds are fitted inside a single MLflow run.
with mlflow.start_run(nested=True):

    # Per-fold scores, one entry per fold
    list_train_rmse = []
    list_test_rmse = []

    for fold_number, (infold_training_indices, infold_test_indices) in enumerate(kfold_splits.split(df_train), 1):

        # KFold.split yields *positional* row indices, so rows are selected with
        # .iloc; label-based .loc only matches when the index is a default RangeIndex.
        fold_train = df_train.iloc[infold_training_indices]
        fold_test = df_train.iloc[infold_test_indices]

        # Pre-processing of the in-fold training data
        X_train = preprocessing(fold_train.drop(columns=TARGET_COLUMN))
        y_train = fold_train[TARGET_COLUMN].to_numpy()

        # Pre-processing of the held-out fold used for in-fold validation
        X_test = preprocessing(fold_test.drop(columns=TARGET_COLUMN))
        y_test = fold_test[TARGET_COLUMN].to_numpy()

        # Defining XGBoost Parameters
        xgboost_model=xgb.XGBRegressor(
            random_state=42,
            objective='reg:squarederror',
            learning_rate=0.015,
            max_depth=6,
            n_estimators=700
        )

        # eval_set holds only the training data, so the RMSE printed during
        # fitting is the training RMSE.
        xgboost_model.fit(
            X_train,
            y_train,
            verbose=200,
            eval_set=[(X_train,y_train)]
        )

        y_train_preds = xgboost_model.predict(X_train)
        train_rmse = root_mean_squared_error(y_true=y_train,y_pred=y_train_preds)
        list_train_rmse.append(train_rmse)

        y_test_preds = xgboost_model.predict(X_test)
        test_rmse = root_mean_squared_error(y_true=y_test,y_pred=y_test_preds)
        list_test_rmse.append(test_rmse)

        # Release the per-fold frames and arrays before the next iteration
        del(
            fold_train,
            fold_test,
            X_train,
            y_train,
            X_test,
            y_test,
            y_train_preds,
            y_test_preds
        )

        print(f'train_rmse, test_rmse = {train_rmse},{test_rmse}')

    # Average over the scores actually collected
    print('Final average of all in-fold training RMSE: ',sum(list_train_rmse)/len(list_train_rmse))
    print('Final average of all in-fold validation RMSE: ',sum(list_test_rmse)/len(list_test_rmse))

[0]	validation_0-rmse:26.83523
[200]	validation_0-rmse:13.38999
[400]	validation_0-rmse:13.27233
[600]	validation_0-rmse:13.22266
[699]	validation_0-rmse:13.20348
train_rmse, test_rmse = 13.203475699962622,13.360496883786533
[0]	validation_0-rmse:26.83430
[200]	validation_0-rmse:13.39242
[400]	validation_0-rmse:13.27420
[600]	validation_0-rmse:13.22675
[699]	validation_0-rmse:13.20887
train_rmse, test_rmse = 13.208865678587012,13.351232074272852
[0]	validation_0-rmse:26.82972
[200]	validation_0-rmse:13.41247
[400]	validation_0-rmse:13.29425
[600]	validation_0-rmse:13.24410
[699]	validation_0-rmse:13.22547
train_rmse, test_rmse = 13.225469216551893,13.261866581090741
[0]	validation_0-rmse:26.82598
[200]	validation_0-rmse:13.41370
[400]	validation_0-rmse:13.29479
[600]	validation_0-rmse:13.24522
[699]	validation_0-rmse:13.22618
train_rmse, test_rmse = 13.226178820105082,13.26349226755659
[0]	validation_0-rmse:26.83510
[200]	validation_0-rmse:13.41017
[400]	validation_0-rmse:13.29107
[600

# Final Training and Test Set Prediction

In [8]:
# Hyper-parameters of the final model (same values as in the cross-validation loop)
final_model_params = {
    'random_state': 42,
    'objective': 'reg:squarederror',
    'learning_rate': 0.015,
    'max_depth': 6,
    'n_estimators': 700,
}

xgboost_model = xgb.XGBRegressor(**final_model_params)

In [9]:
# Full training set: every column except the target goes through preprocessing
X_train = preprocessing(df_train.drop(columns=TARGET_COLUMN))

# Target values as a plain NumPy array
y_train = df_train[TARGET_COLUMN].to_numpy()

In [10]:
# Fit the final model on the complete training set.
# NOTE(review): no eval_set is passed here, so verbose=10 presumably prints no
# per-iteration metrics — confirm against the XGBoost docs.
xgboost_model.fit(X_train,y_train,verbose=10)

2025/04/21 20:22:25 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '622b234e0e014a61a9c9797fe45d1297', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current xgboost workflow


🏃 View run redolent-cub-387 at: http://localhost:5000/#/experiments/1/runs/622b234e0e014a61a9c9797fe45d1297
🧪 View experiment at: http://localhost:5000/#/experiments/1


In [11]:
# Apply the same preprocessing to the competition test set.
# The median fill values inside preprocessing() are computed from df_test itself.
X_test = preprocessing(df_test)

In [12]:
# Predict the target for every row of the preprocessed test set
y_preds = xgboost_model.predict(X_test)

In [13]:
# Submission frame: one prediction column, indexed by the test-set id
df_submission = pd.DataFrame(
    {'Listening_Time_minutes': y_preds},
    index=df_test['id'],
)

In [16]:
# Persist the predictions in the submissions folder; the id index is written as the first column
submission_path = '../submissions/xgboost_impute_with_median_for_guest_popularity.csv'
df_submission.to_csv(submission_path)

In [15]:
# Display the submission frame as a final sanity check (the rendered output is truncated)
df_submission

Unnamed: 0_level_0,Listening_Time_minutes
id,Unnamed: 1_level_1
750000,56.517708
750001,18.227524
750002,49.621071
750003,77.622375
750004,49.313473
...,...
999995,12.012521
999996,58.677685
999997,6.646736
999998,71.224922
