In [89]:
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error
import numpy as np
import pandas as pd
import lightgbm as lgb

from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

import warnings
warnings.filterwarnings('ignore')

In [90]:
"""
    We are going to be using pipelines for all preprocessing steps.

    Missing values handling:
    - Episode Length will be imputed with median,
    - Guest popularity will be imputed with 0,
    - Number of ads will be imputed with a median

    Categorical data processing:
    - Drop Episode Title, Id, Podcast name, Episode Sentiment as they are very weak predictors
    - Ordinal encoding: Publication Day, Publication Time,
    - One-hot encoding: Genre


    Numerical data processing:
    - Adding features from eda notebook
    - Drop values that do not make sense (like listening time greater than episode length)
    - Clip all numerical features to their 1st-99th percentile range
    - Add was_guest_present feature where guest popularity is greater than 0
    
"""

'\n    We are going to be using pipelines for all preprocessing steps.\n\n    Missing values handling:\n    - Episode Length will be imputed with median,\n    - Guest popularity will be imputed with 0,\n    - Number of ads will be imputed with a median\n\n    Categorical data processing:\n    - Drop Episode Title, Id, Podcast name, Episode Sentiment as they are very weak\n    - Ordinal encoding: Publication Day, Publication Time,\n    - One-hot encoding: Genre\n\n\n    Numerical data processing:\n    - Adding features from eda notebook\n    - Drop values that do not make sense (like listening time greater than episode length)\n    - Clip 1% percentile for all numerical features\n    - Add was_guest_present feature where guest popularity is greater than 0\n\n'

In [91]:
# method for adding new numerical features
def add_ratio_features(df):
    """Return a copy of ``df`` extended with ratio and interaction features.

    Reads ``Number_of_Ads``, ``Episode_Length_minutes``,
    ``Host_Popularity_percentage`` and ``Guest_Popularity_percentage``.
    The input frame is not modified.

    Columns added, in this order: ``ads_per_minute``, ``was_guest_present``,
    ``popularity_sum``, ``popularity_product``, ``host_to_guest_ratio``,
    ``host_popularity_to_ads_ratio``, ``guest_popularity_to_ads_ratio``,
    ``popularity_sum_per_minute``.

    Zero denominators are swapped for NaN before dividing, so the affected
    ratios are NaN. ``popularity_product``, the three popularity ratios and
    ``popularity_sum_per_minute`` are clipped to the range [0, 1000000].
    """
    out = df.copy()

    host = out["Host_Popularity_percentage"]
    guest = out["Guest_Popularity_percentage"]
    ads = out["Number_of_Ads"]

    # Denominators with zeros masked out as NaN.
    minutes_nz = out["Episode_Length_minutes"].replace(0, np.nan)
    guest_nz = guest.replace(0, np.nan)
    ads_nz = ads.replace(0, np.nan)

    def bounded(series):
        # Shared clipping range for the engineered features.
        return series.clip(lower=0, upper=1000000)

    out["ads_per_minute"] = ads / minutes_nz
    # A NaN popularity compares as False, i.e. "no guest".
    out["was_guest_present"] = guest.gt(0).astype(int)
    out["popularity_sum"] = host + guest
    out["popularity_product"] = bounded(host * guest)
    out["host_to_guest_ratio"] = bounded(host / guest_nz)
    out["host_popularity_to_ads_ratio"] = bounded(host / ads_nz)
    out["guest_popularity_to_ads_ratio"] = bounded(guest / ads_nz)
    out["popularity_sum_per_minute"] = bounded(out["popularity_sum"] / minutes_nz)

    return out

# drop values that do not make sense
def drop_values(df, max_ads=3, max_popularity=100):
    """Return a copy of ``df`` without rows whose values are implausible.

    A row is removed when at least one of the following holds:

    - ``Listening_Time_minutes`` > ``Episode_Length_minutes``
    - ``Number_of_Ads`` > ``max_ads``
    - ``Guest_Popularity_percentage`` > ``max_popularity``
    - ``Host_Popularity_percentage`` > ``max_popularity``

    Comparisons against NaN evaluate to False, so rows with missing values
    in these columns are kept.

    Rows are filtered with a boolean mask instead of being dropped by index
    label. Dropping by label would also delete valid rows that share an
    index label with an invalid one (e.g. after concatenating frames).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the five columns named above.
    max_ads : numeric, default 3
        Largest accepted ``Number_of_Ads``.
    max_popularity : numeric, default 100
        Largest accepted host/guest popularity percentage.

    Returns
    -------
    pandas.DataFrame
        New frame with the surviving rows and their original index labels.
        The input frame is not modified.

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    """
    invalid = (
        (df['Listening_Time_minutes'] > df['Episode_Length_minutes'])
        | (df['Number_of_Ads'] > max_ads)
        | (df['Guest_Popularity_percentage'] > max_popularity)
        | (df['Host_Popularity_percentage'] > max_popularity)
    )
    # Positional selection: only rows flagged as invalid are removed.
    return df.loc[~invalid].copy()

# clip 1% percentile for all numerical features
def clip_percentiles(df):
    """Return a copy of ``df`` with numeric columns clipped to [q01, q99].

    For every column selected by ``select_dtypes(include=[np.number])`` the
    0.01 and 0.99 quantiles are computed from ``df`` itself and used as the
    per-column lower/upper clipping bounds. Non-numeric columns are left
    unchanged, and a frame without numeric columns is returned as a plain
    copy. The input frame is not modified.
    """
    clipped = df.copy()
    num_cols = clipped.select_dtypes(include=[np.number]).columns
    if num_cols.empty:
        return clipped

    numeric = clipped[num_cols]
    lo, hi = (numeric.quantile(q) for q in (0.01, 0.99))
    # axis=1 aligns the per-column bounds with the column labels.
    clipped[num_cols] = numeric.clip(lo, hi, axis=1)
    return clipped

    

In [92]:
# feature engineering
# NOTE(review): `prepare_champion_features` is not defined in any cell visible
# in this notebook, so on a fresh kernel this cell raises NameError unless the
# function is defined or imported beforehand -- TODO restore its definition.
# The step is labelled 'add_ratio_features' although it wraps
# `prepare_champion_features`, not the `add_ratio_features` helper.
feature_eng = Pipeline(
    steps=[
        (
            'add_ratio_features',
            FunctionTransformer(func=prepare_champion_features, validate=False),
        ),
    ]
)


In [93]:
# categorical transformer
# Ordinal-encodes Publication_Day / Publication_Time, one-hot encodes Genre and
# passes every other column through unchanged (remainder='passthrough').
# Categories that were not seen during fit no longer raise at transform time:
# the ordinal encoder maps them to -1 and the one-hot encoder emits all zeros.
# NOTE(review): no explicit `categories` are given, so the ordinal codes follow
# the encoder's automatic category ordering rather than calendar/time-of-day
# order -- confirm this is intended.
# NOTE(review): this transformer is not referenced by any other visible cell.
categorical_transformer = ColumnTransformer([
    (
        'ordinal',
        OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1),
        ['Publication_Day', 'Publication_Time'],
    ),
    ('onehot', OneHotEncoder(handle_unknown='ignore'), ['Genre']),
], remainder='passthrough')

In [94]:
# final pipeline
# Chains the feature-engineering step with a LightGBM regressor.
# random_state pins LightGBM's seeds so reruns do not differ because of
# seeded randomness inside the model.
pipeline = Pipeline([
    ('feature_eng', feature_eng),
    ('model', lgb.LGBMRegressor(
        objective='regression',
        metric='rmse',
        num_leaves=35,
        n_estimators=400,
        learning_rate=0.2,
        random_state=42,
    ))
])
pipeline

0,1,2
,steps,"[('feature_eng', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,steps,"[('add_ratio_features', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,func,<function pre...t 0x13a3d8ae0>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,
,kw_args,
,inv_kw_args,

0,1,2
,boosting_type,'gbdt'
,num_leaves,35
,max_depth,-1
,learning_rate,0.2
,n_estimators,400
,subsample_for_bin,200000
,objective,'regression'
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001


In [95]:
# Load train, test datasets
# Both CSV files are read from the current working directory.
original_train_ds, original_test_ds = (
    pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv')
)

In [None]:
# Columns fed to the champion model. The last two are presumably produced by
# `prepare_champion_features` -- that function is not defined in the visible
# cells, so verify against its definition.
champion_features = [
    'Episode_Length_minutes',
    'Number_of_Ads',
    'Host_Popularity_percentage',
    'Guest_Popularity_percentage',
    'popularity_product',
    'host_guest_popularity_difference',
]

# Clean -> engineer -> discard rows lacking episode length or the target.
train_clean = (
    original_train_ds
    .pipe(drop_values)
    .pipe(prepare_champion_features)
    .dropna(subset=['Episode_Length_minutes', 'Listening_Time_minutes'])
)

X_champion = train_clean.loc[:, champion_features].copy()
y_champion = train_clean.loc[:, 'Listening_Time_minutes'].copy()

# Median-impute the remaining gaps. The array returned by the imputer is
# wrapped back into a DataFrame so column names and index labels are kept.
imputer_champion = SimpleImputer(strategy='median')
X_champion_imputed = pd.DataFrame(
    data=imputer_champion.fit_transform(X_champion),
    index=X_champion.index,
    columns=X_champion.columns,
)

# NOTE(review): the pipeline's 'feature_eng' step calls
# `prepare_champion_features` again on this already-prepared, imputed frame
# -- confirm the double application is intended.
pipeline.fit(X_champion_imputed, y_champion)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002494 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1279
[LightGBM] [Info] Number of data points in the train set: 660292, number of used features: 6
[LightGBM] [Info] Start training from score 45.717289


0,1,2
,steps,"[('feature_eng', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,steps,"[('add_ratio_features', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,func,<function pre...t 0x13a3d8ae0>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,
,kw_args,
,inv_kw_args,

0,1,2
,boosting_type,'gbdt'
,num_leaves,35
,max_depth,-1
,learning_rate,0.2
,n_estimators,400
,subsample_for_bin,200000
,objective,'regression'
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001


In [None]:
# Build the champion feature frame for the test set from a copy of the raw data.
test_df = prepare_champion_features(original_test_ds.copy())

champion_features = [
    'Episode_Length_minutes',
    'Number_of_Ads',
    'Host_Popularity_percentage',
    'Guest_Popularity_percentage',
    'popularity_product',
    'host_guest_popularity_difference',
]
X_test = test_df.loc[:, champion_features].copy()

# Reuse the imputer fitted on the training features (transform only).
X_test_imputed = pd.DataFrame(
    data=imputer_champion.transform(X_test),
    index=X_test.index,
    columns=X_test.columns,
)

pred_minutes = pipeline.predict(X_test_imputed)
# Bound each prediction to [0, imputed episode length of that row].
predictions = np.clip(
    pred_minutes,
    0,
    X_test_imputed['Episode_Length_minutes'].to_numpy(),
)

# Summary statistics of the predictions, followed by the training target's
# distribution for a side-by-side sanity check.
print(f"Predictions mean: {predictions.mean()}")
print(f"Predictions std: {predictions.std()}")
print(f"Predictions min: {predictions.min()}")
print(f"Predictions max: {predictions.max()}")
print(original_train_ds['Listening_Time_minutes'].describe())


Predictions mean: 45.5861374516778
Predictions std: 23.72041508614082
Predictions min: 0.0
Predictions max: 119.44
count    750000.000000
mean         45.437406
std          27.138306
min           0.000000
25%          23.178350
50%          43.379460
75%          64.811580
max         119.970000
Name: Listening_Time_minutes, dtype: float64


In [112]:
print(predictions)
# Pair every test id with its prediction and write the result to
# 'submission.csv' without the index column.
output = pd.DataFrame(
    data={
        'id': original_test_ds['id'],
        'Listening_Time_minutes': predictions,
    }
)
output.to_csv('submission.csv', index=False)

[56.77274436 18.15575519 50.91791608 ...  7.31005495 69.6485056
 57.49825601]
