In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import glob
import os
import optuna
import sys

In [2]:
train  = pd.read_csv('final_train.csv',low_memory=True)
test  = pd.read_csv('final_test.csv',low_memory=True)
samp  = pd.read_csv('sample_submission.csv',low_memory=True)
more_train_data  = pd.read_csv('podcast_dataset.csv',low_memory=True)

In [4]:
more_train_data.head()

Unnamed: 0,Podcast_Name,Episode_Title,Episode_Length_minutes,Genre,Host_Popularity_percentage,Publication_Day,Publication_Time,Guest_Popularity_percentage,Number_of_Ads,Episode_Sentiment,Listening_Time_minutes
0,Healthy Living,Episode 77,99.25,Health,21.37,Thursday,Night,70.22,3,Positive,55.158695
1,Mystery Matters,Episode 6,19.43,True Crime,47.19,Friday,Evening,75.15,1,Neutral,7.686559
2,Current Affairs,Episode 1,117.03,News,96.33,Sunday,Night,57.95,3,Neutral,110.064645
3,Mystery Matters,Episode 38,16.97,True Crime,25.73,Monday,Night,24.19,0,Positive,12.00038
4,Humor Hub,Episode 73,83.48,Comedy,76.69,Tuesday,Afternoon,42.31,3,Positive,


In [6]:
train.drop(columns=['id'],inplace = True)

In [8]:
test.drop(columns=['id'],inplace = True)

In [9]:
test.head()

Unnamed: 0,Podcast_Name,Episode_Title,Episode_Length_minutes,Genre,Host_Popularity_percentage,Publication_Day,Publication_Time,Guest_Popularity_percentage,Number_of_Ads,Episode_Sentiment,Episode_Title_num,Ad_Density,Popularity_Diff,Popularity_Interaction,Host_Popularity_squared,Popularity_Average
0,11,71,78.96,2,38.11,2,1,3.0,1.0,0,73,0.012664,35.11,114.33,1452.3721,20.555
1,36,16,27.87,5,71.29,3,2,3.0,0.0,0,23,0.0,68.29,213.87,5082.2641,37.145
2,24,3,69.1,1,67.89,0,1,3.0,0.0,1,11,0.0,64.89,203.67,4609.0521,35.445
3,4,71,115.39,1,23.4,3,2,3.0,2.0,1,73,0.017332,20.4,70.2,547.56,13.2
4,27,46,72.32,4,58.1,6,2,3.0,2.0,0,50,0.027654,55.1,174.3,3375.61,30.55


In [10]:
train.shape,test.shape

((750000, 17), (250000, 16))

In [11]:
def data_process(df):
    df['Episode_Title_num'] = df['Episode_Title'].astype(str).str.replace('Episode ', '').astype(int)
    df['Guest_Popularity_percentage'].fillna(df['Guest_Popularity_percentage'].median(), inplace=True)
    df['Episode_Length_minutes'].fillna(df['Episode_Length_minutes'].median(), inplace=True)
    df['Number_of_Ads'].fillna(df['Number_of_Ads'].median(), inplace=True)
    df['Guest_Popularity_percentage'] = df['Guest_Popularity_percentage'].clip(upper=3)
    df['Episode_Sentiment'] = df['Episode_Sentiment'].replace({'Neutral': 0, 'Positive': 1, 'Negative': -1})

    df['Ad_Density'] = df['Number_of_Ads'] / (df['Episode_Length_minutes'] + 1e-3)
    df['Popularity_Diff'] = df['Host_Popularity_percentage'] - df['Guest_Popularity_percentage']
    df['Popularity_Interaction'] = df['Host_Popularity_percentage'] * df['Guest_Popularity_percentage']
    df['Host_Popularity_squared'] = df['Host_Popularity_percentage'] ** 2
    df['Popularity_Average'] = (df['Host_Popularity_percentage'] + df['Guest_Popularity_percentage'])/2
    
    return df


In [12]:
more_train = data_process(more_train_data)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Guest_Popularity_percentage'].fillna(df['Guest_Popularity_percentage'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Episode_Length_minutes'].fillna(df['Episode_Length_minutes'].median(), inplace=True)
The behavior will change in pandas 3.0. This

In [14]:
more_train.head()

Unnamed: 0,Podcast_Name,Episode_Title,Episode_Length_minutes,Genre,Host_Popularity_percentage,Publication_Day,Publication_Time,Guest_Popularity_percentage,Number_of_Ads,Episode_Sentiment,Listening_Time_minutes,Episode_Title_num,Ad_Density,Popularity_Diff,Popularity_Interaction,Host_Popularity_squared,Popularity_Average
0,Healthy Living,Episode 77,99.25,Health,21.37,Thursday,Night,3.0,3,1,55.158695,77,0.030226,18.37,64.11,456.6769,12.185
1,Mystery Matters,Episode 6,19.43,True Crime,47.19,Friday,Evening,3.0,1,0,7.686559,6,0.051464,44.19,141.57,2226.8961,25.095
2,Current Affairs,Episode 1,117.03,News,96.33,Sunday,Night,3.0,3,0,110.064645,1,0.025634,93.33,288.99,9279.4689,49.665
3,Mystery Matters,Episode 38,16.97,True Crime,25.73,Monday,Night,3.0,0,1,12.00038,38,0.0,22.73,77.19,662.0329,14.365
4,Humor Hub,Episode 73,83.48,Comedy,76.69,Tuesday,Afternoon,3.0,3,1,,73,0.035936,73.69,230.07,5881.3561,39.845


In [15]:
from sklearn.preprocessing import LabelEncoder

In [16]:
cat_cols = more_train.select_dtypes(exclude=['number']).columns.tolist()

In [17]:
cat_cols

['Podcast_Name',
 'Episode_Title',
 'Genre',
 'Publication_Day',
 'Publication_Time']

In [18]:
more_train[cat_cols] = more_train[cat_cols].fillna(more_train[cat_cols].mode().iloc[0])
print(cat_cols)
for col in cat_cols:
    le = LabelEncoder()
    more_train[col] = le.fit_transform(more_train[col])

['Podcast_Name', 'Episode_Title', 'Genre', 'Publication_Day', 'Publication_Time']


In [20]:
more_train.head()

Unnamed: 0,Podcast_Name,Episode_Title,Episode_Length_minutes,Genre,Host_Popularity_percentage,Publication_Day,Publication_Time,Guest_Popularity_percentage,Number_of_Ads,Episode_Sentiment,Listening_Time_minutes,Episode_Title_num,Ad_Density,Popularity_Diff,Popularity_Interaction,Host_Popularity_squared,Popularity_Average
0,20,75,99.25,3,21.37,4,3,3.0,3,1,55.158695,77,0.030226,18.37,64.11,456.6769,12.185
1,34,56,19.43,9,47.19,0,1,3.0,1,0,7.686559,6,0.051464,44.19,141.57,2226.8961,25.095
2,7,0,117.03,6,96.33,3,3,3.0,3,0,110.064645,1,0.025634,93.33,288.99,9279.4689,49.665
3,34,32,16.97,9,25.73,1,3,3.0,0,1,12.00038,38,0.0,22.73,77.19,662.0329,14.365
4,22,71,83.48,1,76.69,5,0,3.0,3,1,,73,0.035936,73.69,230.07,5881.3561,39.845


In [21]:
total_train = pd.concat([train, more_train], ignore_index=True)

In [22]:
total_train

Unnamed: 0,Podcast_Name,Episode_Title,Episode_Length_minutes,Genre,Host_Popularity_percentage,Publication_Day,Publication_Time,Guest_Popularity_percentage,Number_of_Ads,Episode_Sentiment,Listening_Time_minutes,Episode_Title_num,Ad_Density,Popularity_Diff,Popularity_Interaction,Host_Popularity_squared,Popularity_Average
0,34,98,63.84,9,74.81,4,3,3.0,0.0,1,31.419980,98,0.000000,71.81,224.43,5596.5361,38.905
1,24,19,119.80,1,66.95,2,0,3.0,2.0,-1,88.012410,26,0.016694,63.95,200.85,4482.3025,34.975
2,40,8,73.90,2,69.97,5,1,3.0,0.0,-1,44.925310,16,0.000000,66.97,209.91,4895.8009,36.485
3,10,40,67.17,8,57.22,1,2,3.0,2.0,1,46.278240,45,0.029775,54.22,171.66,3274.1284,30.110
4,31,85,110.51,3,80.07,1,0,3.0,3.0,0,75.610310,86,0.027147,77.07,240.21,6411.2049,41.535
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
802495,21,9,24.81,4,66.15,1,1,3.0,1.0,0,20.573795,17,0.040305,63.15,198.45,4375.8225,34.575
802496,30,89,92.15,5,89.61,2,3,3.0,2.0,-1,76.198459,9,0.021704,86.61,268.83,8029.9521,46.305
802497,4,17,112.27,1,26.33,2,3,3.0,0.0,0,107.602135,24,0.000000,23.33,78.99,693.2689,14.665
802498,2,84,62.87,0,41.47,6,0,3.0,0.0,0,17.220998,85,0.000000,38.47,124.41,1719.7609,22.235


In [23]:
total_train.head()

Unnamed: 0,Podcast_Name,Episode_Title,Episode_Length_minutes,Genre,Host_Popularity_percentage,Publication_Day,Publication_Time,Guest_Popularity_percentage,Number_of_Ads,Episode_Sentiment,Listening_Time_minutes,Episode_Title_num,Ad_Density,Popularity_Diff,Popularity_Interaction,Host_Popularity_squared,Popularity_Average
0,34,98,63.84,9,74.81,4,3,3.0,0.0,1,31.41998,98,0.0,71.81,224.43,5596.5361,38.905
1,24,19,119.8,1,66.95,2,0,3.0,2.0,-1,88.01241,26,0.016694,63.95,200.85,4482.3025,34.975
2,40,8,73.9,2,69.97,5,1,3.0,0.0,-1,44.92531,16,0.0,66.97,209.91,4895.8009,36.485
3,10,40,67.17,8,57.22,1,2,3.0,2.0,1,46.27824,45,0.029775,54.22,171.66,3274.1284,30.11
4,31,85,110.51,3,80.07,1,0,3.0,3.0,0,75.61031,86,0.027147,77.07,240.21,6411.2049,41.535


In [24]:
total_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 802500 entries, 0 to 802499
Data columns (total 17 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   Podcast_Name                 802500 non-null  int64  
 1   Episode_Title                802500 non-null  int64  
 2   Episode_Length_minutes       802500 non-null  float64
 3   Genre                        802500 non-null  int64  
 4   Host_Popularity_percentage   802500 non-null  float64
 5   Publication_Day              802500 non-null  int64  
 6   Publication_Time             802500 non-null  int64  
 7   Guest_Popularity_percentage  802500 non-null  float64
 8   Number_of_Ads                802500 non-null  float64
 9   Episode_Sentiment            802500 non-null  int64  
 10  Listening_Time_minutes       797105 non-null  float64
 11  Episode_Title_num            802500 non-null  int64  
 12  Ad_Density                   802500 non-null  float64
 13 

In [25]:
total_train.duplicated().sum()

6598

In [27]:
total_train.drop_duplicates(inplace =True)

In [30]:
total_train.isna().sum()

Podcast_Name                      0
Episode_Title                     0
Episode_Length_minutes            0
Genre                             0
Host_Popularity_percentage        0
Publication_Day                   0
Publication_Time                  0
Guest_Popularity_percentage       0
Number_of_Ads                     0
Episode_Sentiment                 0
Listening_Time_minutes         5132
Episode_Title_num                 0
Ad_Density                        0
Popularity_Diff                   0
Popularity_Interaction            0
Host_Popularity_squared           0
Popularity_Average                0
dtype: int64

In [31]:
final_train_data = total_train.dropna()

In [33]:
final_train_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 790770 entries, 0 to 799999
Data columns (total 17 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   Podcast_Name                 790770 non-null  int64  
 1   Episode_Title                790770 non-null  int64  
 2   Episode_Length_minutes       790770 non-null  float64
 3   Genre                        790770 non-null  int64  
 4   Host_Popularity_percentage   790770 non-null  float64
 5   Publication_Day              790770 non-null  int64  
 6   Publication_Time             790770 non-null  int64  
 7   Guest_Popularity_percentage  790770 non-null  float64
 8   Number_of_Ads                790770 non-null  float64
 9   Episode_Sentiment            790770 non-null  int64  
 10  Listening_Time_minutes       790770 non-null  float64
 11  Episode_Title_num            790770 non-null  int64  
 12  Ad_Density                   790770 non-null  float64
 13  Popu

In [34]:
final_train_data.to_csv('combined_all_train_data.csv',index = False)

## Creating Models

In [48]:
# Models
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

In [43]:
target = 'Listening_Time_minutes'
X = final_train_data.drop(columns=[target])
y = final_train_data[target]

In [44]:
# Define models
models = [
    ("LinearRegression", LinearRegression()),
    ("RandomForest", RandomForestRegressor()),
    ("GradientBoosting", GradientBoostingRegressor()),
    ("XGBoost", XGBRegressor()),
    ("LightGBM", LGBMRegressor()),
    ("CatBoost", CatBoostRegressor()),
    ("DecisionTree", DecisionTreeRegressor())
]

In [45]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [46]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(632616, 16)
(158154, 16)
(632616,)
(158154,)


In [49]:
rmse_results = {}

for name, model in models:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    rmse_results[name] = rmse
    print(f"{name}: RMSE = {rmse:.4f}")

LinearRegression: RMSE = 13.3390
RandomForest: RMSE = 12.8336
GradientBoosting: RMSE = 13.1779
XGBoost: RMSE = 13.0765
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.011172 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2256
[LightGBM] [Info] Number of data points in the train set: 632616, number of used features: 16
[LightGBM] [Info] Start training from score 45.454456
LightGBM: RMSE = 13.1171
Learning rate set to 0.113438
0:	learn: 24.9074863	total: 239ms	remaining: 3m 58s
1:	learn: 23.0014609	total: 315ms	remaining: 2m 37s
2:	learn: 21.3785759	total: 390ms	remaining: 2m 9s
3:	learn: 19.9889958	total: 469ms	remaining: 1m 56s
4:	learn: 18.8226266	total: 544ms	remaining: 1m 48s
5:	learn: 17.8383453	total: 618ms	remaining: 1m 42s
6:	learn: 17.0182424	total: 700ms	remaining: 1m 39s
7:	learn: 16.3381932	total: 781ms	remaining: 1m 36s
8:	

In [50]:
print("\nSorted RMSEs:")
for name, rmse in sorted(rmse_results.items(), key=lambda x: x[1]):
    print(f"{name}: RMSE = {rmse:.4f}")


Sorted RMSEs:
RandomForest: RMSE = 12.8336
CatBoost: RMSE = 13.0740
XGBoost: RMSE = 13.0765
LightGBM: RMSE = 13.1171
GradientBoosting: RMSE = 13.1779
LinearRegression: RMSE = 13.3390
DecisionTree: RMSE = 18.2099


In [52]:
model = LGBMRegressor()
model.fit(X_train,y_train)
y_light = model.predict(test)
y_light

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.039149 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2256
[LightGBM] [Info] Number of data points in the train set: 632616, number of used features: 16
[LightGBM] [Info] Start training from score 45.454456


array([55.66883279, 18.07421325, 49.91377815, ...,  6.58852712,
       74.5139234 , 57.1997729 ])

In [53]:
samp['Listening_Time_minutes'] = y_light

In [54]:
samp.to_csv('light.csv',index=False)

In [55]:
model = CatBoostRegressor()
model.fit(X_train,y_train)
y_cat = model.predict(test)
y_cat

Learning rate set to 0.113438
0:	learn: 24.9074863	total: 97.5ms	remaining: 1m 37s
1:	learn: 23.0014609	total: 196ms	remaining: 1m 38s
2:	learn: 21.3785759	total: 293ms	remaining: 1m 37s
3:	learn: 19.9889958	total: 386ms	remaining: 1m 36s
4:	learn: 18.8226266	total: 487ms	remaining: 1m 36s
5:	learn: 17.8383453	total: 582ms	remaining: 1m 36s
6:	learn: 17.0182424	total: 670ms	remaining: 1m 35s
7:	learn: 16.3381932	total: 772ms	remaining: 1m 35s
8:	learn: 15.7763626	total: 869ms	remaining: 1m 35s
9:	learn: 15.3017402	total: 964ms	remaining: 1m 35s
10:	learn: 14.9193918	total: 1.06s	remaining: 1m 35s
11:	learn: 14.6063117	total: 1.16s	remaining: 1m 35s
12:	learn: 14.3505834	total: 1.25s	remaining: 1m 35s
13:	learn: 14.1503142	total: 1.36s	remaining: 1m 35s
14:	learn: 13.9769882	total: 1.45s	remaining: 1m 35s
15:	learn: 13.8399913	total: 1.54s	remaining: 1m 35s
16:	learn: 13.7284304	total: 1.64s	remaining: 1m 34s
17:	learn: 13.6371237	total: 1.73s	remaining: 1m 34s
18:	learn: 13.5633335	tot

array([54.79297476, 18.19082933, 50.23596862, ...,  6.93524067,
       73.76793199, 58.03722738])

In [56]:
samp['Listening_Time_minutes']= y_cat

In [57]:
samp.to_csv('catboost_model_more_train.csv',index = False)

In [62]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from xgboost import XGBRegressor

In [60]:
model_cat = CatBoostRegressor(
        iterations=1500,
        learning_rate=0.08777255350163136,
        depth=10,
        l2_leaf_reg=0.1259643500248322,
        bootstrap_type='Bayesian',
        random_strength=4.276181166674371e-08,
        bagging_temperature=0.35995482350907326,
        od_type='Iter',
        od_wait=39,
        verbose=200,
        allow_writing_files=False,
        # task_type='GPU',
        # cat_features=CATS,
        random_seed=42
    )

In [61]:
model_cat.fit(X_train,y_train)

0:	learn: 25.3543744	total: 203ms	remaining: 5m 3s
200:	learn: 12.9428963	total: 34.2s	remaining: 3m 41s
400:	learn: 12.7071818	total: 1m 7s	remaining: 3m 6s
600:	learn: 12.5025272	total: 1m 41s	remaining: 2m 31s
800:	learn: 12.3239489	total: 2m 12s	remaining: 1m 55s
1000:	learn: 12.1527529	total: 2m 43s	remaining: 1m 21s
1200:	learn: 11.9971531	total: 3m 13s	remaining: 48.2s
1400:	learn: 11.8506671	total: 3m 44s	remaining: 15.8s
1499:	learn: 11.7811234	total: 3m 58s	remaining: 0us


<catboost.core.CatBoostRegressor at 0x2567eaa9290>

In [63]:
Y_cat_taken = model_cat.predict(test)

In [65]:
Y_cat_taken

array([54.20831977, 18.43022495, 48.97779287, ...,  7.83141086,
       79.60391183, 60.00223796])

In [64]:
samp['Listening_Time_minutes']=Y_cat_taken

In [66]:
samp.to_csv('kaggle_cat.csv',index=False)

## Using Kfold-Cross-Validation

In [67]:
best_params = {
    'max_depth': 12,
    'learning_rate': 0.025491448564195527,
    'subsample': 0.8253613764200876,
    'colsample_bytree': 0.9565745021788857,
    'reg_lambda': 3.8512759107248455,
    'reg_alpha': 0.009389539205819732,
    'n_estimators': 608,
    'objective': 'reg:squarederror',
    'eval_metric': 'rmse',
    'tree_method': 'hist',
    'verbosity': 0
}


In [68]:
# Set up K-Fold CV
kf = KFold(n_splits=5, shuffle=True, random_state=42)
fold = 1
rmse_scores = []


In [69]:
# K-Fold training
for train_idx, val_idx in kf.split(X):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    model = XGBRegressor(**best_params)
    model.fit(X_train, y_train)
    
    y_pred = model.predict(X_val)
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    rmse_scores.append(rmse)
    
    print(f"Fold {fold} RMSE: {rmse:.4f}")
    fold += 1

Fold 1 RMSE: 12.8660
Fold 2 RMSE: 12.8084
Fold 3 RMSE: 12.9346
Fold 4 RMSE: 12.8957
Fold 5 RMSE: 12.9060


In [70]:
print("\nCross-Validation Results:")
print(f"Mean RMSE: {np.mean(rmse_scores):.4f}")
print(f"Std RMSE: {np.std(rmse_scores):.4f}")


Cross-Validation Results:
Mean RMSE: 12.8821
Std RMSE: 0.0429


In [71]:
model = XGBRegressor(**best_params)

In [72]:
model.fit(X_train,y_train)

In [73]:
model

In [74]:
y_xg = model.predict(test)

In [75]:
y_xg

array([53.77386  , 20.241589 , 49.070843 , ...,  6.8848567, 75.21902  ,
       58.460274 ], dtype=float32)

In [76]:
samp

Unnamed: 0,id,Listening_Time_minutes
0,750000,54.208320
1,750001,18.430225
2,750002,48.977793
3,750003,76.073736
4,750004,50.070036
...,...,...
249995,999995,10.379807
249996,999996,57.550497
249997,999997,7.831411
249998,999998,79.603912


In [77]:
samp['Listening_Time_minutes'] = y_xg

In [78]:
samp.to_csv('kfold_xgboost.csv',index = False)

## Blending the csv file.

In [79]:
file1 = pd.read_csv('kaggle_cat.csv')
file2 = pd.read_csv('kfold_xgboost.csv')
file3 = pd.read_csv('catboost_model_more_train.csv')
file4 = pd.read_csv('light.csv')
file5 = pd.read_csv('model_xgboost.csv')

In [81]:
file1.shape,file2.shape,file3.shape,file4.shape,file5.shape

((250000, 2), (250000, 2), (250000, 2), (250000, 2), (250000, 2))

In [83]:
blend_df = pd.DataFrame({
    'cat_kaggle': file1['Listening_Time_minutes'],
    'xgb_kfold': file2['Listening_Time_minutes'],
    'cat_more': file3['Listening_Time_minutes'],
    'lightgbm': file4['Listening_Time_minutes'],
    'xgb_model': file5['Listening_Time_minutes'],
})

# Simple average blending (equal weight)
blend_df['blended_avg'] = blend_df.mean(axis=1)

In [84]:
blend_df

Unnamed: 0,cat_kaggle,xgb_kfold,cat_more,lightgbm,xgb_model,blended_avg
0,54.208320,53.773860,54.792975,55.668833,55.765343,54.841866
1,18.430225,20.241589,18.190829,18.074213,21.044462,19.196264
2,48.977793,49.070843,50.235969,49.913778,50.389328,49.717542
3,76.073736,73.362820,77.396142,80.170135,71.491820,75.698931
4,50.070036,48.736420,48.667758,48.743176,48.444000,48.932278
...,...,...,...,...,...,...
249995,10.379807,10.804342,10.682098,11.682393,11.594829,11.028694
249996,57.550497,60.044170,57.774455,57.414442,59.366726,58.430058
249997,7.831411,6.884857,6.935241,6.588527,6.707707,6.989549
249998,79.603912,75.219020,73.767932,74.513923,73.080400,75.237037


In [87]:
samp['Listening_Time_minutes'] = blend_df['blended_avg']

In [89]:
samp.to_csv('using_mean_blending.csv',index = False)