# Podcast Listening Time Prediction
## Advanced Machine Learning Pipeline

## 1. Setup and Data Loading

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import KFold, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.linear_model import ElasticNet
import warnings
warnings.filterwarnings('ignore')

# Global notebook configuration: draw every figure with the ggplot style and
# seed NumPy's global random generator so np.random draws repeat across runs.
plt.style.use('ggplot')
np.random.seed(42)

In [None]:
# Read the competition CSV files: training rows, test rows, and the
# submission template that is filled in at the end of the notebook.
train_df = pd.read_csv("/kaggle/input/playground-series-s5e4/train.csv")
test_df = pd.read_csv("/kaggle/input/playground-series-s5e4/test.csv")
sample_submission = pd.read_csv("/kaggle/input/playground-series-s5e4/sample_submission.csv")

# Report the (rows, columns) of both frames as a quick sanity check.
for label, frame in (("Train shape:", train_df), ("Test shape:", test_df)):
    print(label, frame.shape)

# Last expression of the cell: preview the first training rows.
train_df.head()

## 2. Data Exploration

In [None]:
# Histogram (with a KDE overlay) of the regression target.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(train_df['Listening_Time_minutes'], kde=True, ax=ax)
ax.set_title('Distribution of Listening Time (minutes)')
plt.show()

In [None]:
# Pairwise correlations between all numeric columns of the training frame,
# drawn as an annotated heatmap with the colour scale centred on zero.
numeric_cols = train_df.select_dtypes(include=np.number).columns
corr_matrix = train_df[numeric_cols].corr()

fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', center=0, ax=ax)
ax.set_title('Feature Correlations')
plt.show()

## 3. Feature Engineering

In [None]:
# Separate the target from the predictors. The identifier and the raw title
# text are dropped from the feature frames; the title is only used below to
# derive a length feature from the original frames.
y = train_df['Listening_Time_minutes']
drop_cols = ['id', 'Episode_Title', 'Listening_Time_minutes']
X = train_df.drop(columns=drop_cols)
X_test = test_df.drop(columns=['id', 'Episode_Title'])

# Position of each publication slot in the day, used as an ordinal code.
# Values missing from this list map to NaN.
time_order = ['Early Morning', 'Morning', 'Afternoon', 'Evening', 'Night', 'Late Night']
time_mapping = dict(zip(time_order, range(len(time_order))))

# Apply the same derived features to the train and test frames so the two
# cannot drift apart. Column insertion order matters: it determines the
# order of `numerical_cols` below.
for features, raw in ((X, train_df), (X_test, test_df)):
    # Ordinal encoding for Publication_Time
    features['Publication_Time_Ordinal'] = features['Publication_Time'].map(time_mapping)

    # Text-based feature: number of characters in the episode title
    features['Episode_Title_Length'] = raw['Episode_Title'].str.len()

    # Interaction feature
    # NOTE(review): confirm that 'Duration_minutes' and
    # 'Episode_Sentiment_Score' are real columns of train.csv/test.csv;
    # a KeyError is raised here if either is absent.
    features['Duration_Sentiment_Interaction'] = (
        features['Duration_minutes'] * features['Episode_Sentiment_Score']
    )

# Split the columns by how the preprocessor treats them. Everything that is
# neither categorical nor the ordinal time code is handled as numerical.
categorical_cols = ['Podcast_Name', 'Genre', 'Publication_Day', 'Episode_Sentiment', 'Publication_Time']
non_numerical = set(categorical_cols) | {'Publication_Time_Ordinal'}
numerical_cols = [col for col in X.columns if col not in non_numerical]

## 4. Preprocessing Pipeline

In [None]:
# Numerical pipeline: fill gaps with the column median, then standardise.
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Categorical pipeline: fill gaps with the most frequent level, then one-hot
# encode. Levels unseen during fit are encoded as all zeros instead of
# raising, and the output is dense.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

# Ordinal time code: Series.map() leaves NaN for missing or unmapped
# Publication_Time values. Passing the column through untouched would hand
# those NaNs to estimators that reject them (e.g. ElasticNet), so impute it
# with the same strategy the categorical branch uses for Publication_Time.
# When the column has no NaN the values are unchanged.
ordinal_transformer = SimpleImputer(strategy='most_frequent')

# Column transformer. Output column order is: numerical columns, one-hot
# columns, then the ordinal time code. Columns not listed are dropped.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols),
        ('time_ordinal', ordinal_transformer, ['Publication_Time_Ordinal'])
    ])

## 5. Model Training and Evaluation

In [None]:
# Local import (notebook cell): clone() gives each pipeline its own unfitted
# copy of the shared preprocessor definition.
from sklearn.base import clone

# Candidate regressors, all seeded for repeatability.
models = {
    'RandomForest': RandomForestRegressor(random_state=42, n_jobs=-1),
    'XGBoost': XGBRegressor(random_state=42, n_jobs=-1, eval_metric='rmse'),
    'LightGBM': LGBMRegressor(random_state=42, n_jobs=-1),
    'ElasticNet': ElasticNet(random_state=42)
}

# Hyperparameter grids, keyed by model name. The 'model__' prefix addresses
# the final pipeline step. Models without an entry are only cross-validated
# with their default settings.
param_grids = {
    'RandomForest': {
        'model__n_estimators': [100, 200],
        'model__max_depth': [None, 10, 20],
        'model__min_samples_split': [2, 5]
    },
    'XGBoost': {
        'model__n_estimators': [100, 200],
        'model__learning_rate': [0.01, 0.1],
        'model__max_depth': [3, 6]
    }
}

# Shuffled 5-fold cross-validation shared by every model so scores compare
# like with like.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# name -> {'model': fitted pipeline, 'rmse': CV RMSE[, 'params': best grid point]}
results = {}

for name, model in models.items():
    print(f"\n=== Training {name} ===")

    # Preprocess -> keep the columns a small random forest deems important
    # -> fit the candidate model. The preprocessor is cloned so that fitting
    # one stored pipeline never mutates a transformer owned by another.
    pipeline = Pipeline(steps=[
        ('preprocessor', clone(preprocessor)),
        ('feature_selector', SelectFromModel(RandomForestRegressor(n_estimators=50, random_state=42))),
        ('model', model)
    ])

    if name in param_grids:
        # Exhaustive grid search. The scorer returns negative RMSE (higher is
        # better), hence the sign flip below. With the default refit=True the
        # best configuration is refitted on all of X, y.
        grid_search = GridSearchCV(
            pipeline,
            param_grids[name],
            cv=kf,
            scoring='neg_root_mean_squared_error',
            n_jobs=-1,
            verbose=1
        )
        grid_search.fit(X, y)
        best_model = grid_search.best_estimator_
        best_score = -grid_search.best_score_
        best_params = grid_search.best_params_

        print(f"Best RMSE: {best_score:.4f}")
        print(f"Best parameters: {best_params}")

        results[name] = {
            'model': best_model,
            'rmse': best_score,
            'params': best_params
        }
    else:
        # No grid for this model: estimate RMSE by plain cross-validation.
        scores = cross_val_score(
            pipeline,
            X,
            y,
            cv=kf,
            scoring='neg_root_mean_squared_error',
            n_jobs=-1
        )
        avg_rmse = -scores.mean()

        print(f"Avg RMSE: {avg_rmse:.4f}")

        # cross_val_score fits clones only, so fit the pipeline itself on the
        # full data to make it usable for prediction later.
        pipeline.fit(X, y)
        results[name] = {
            'model': pipeline,
            'rmse': avg_rmse
        }

## 6. Model Comparison

In [None]:
# Tabulate each model's cross-validated RMSE, best (lowest) first.
model_names = list(results)
rmse_values = [results[model_name]['rmse'] for model_name in model_names]
model_comparison = pd.DataFrame({'Model': model_names, 'RMSE': rmse_values})
model_comparison = model_comparison.sort_values('RMSE')

# Horizontal bar chart of the table above.
plt.figure(figsize=(10, 6))
sns.barplot(data=model_comparison, x='RMSE', y='Model', palette='viridis')
plt.title('Model Comparison by RMSE')
plt.xlabel('Root Mean Squared Error')
plt.ylabel('Model')
plt.show()

## 7. Ensemble Prediction

In [None]:
# Rank the models by cross-validated RMSE and keep the three best.
ranked_models = sorted(results.items(), key=lambda item: item[1]['rmse'])
top_models = ranked_models[:3]

# One prediction vector per selected model, in ranking order.
test_predictions = [entry['model'].predict(X_test) for _, entry in top_models]

# Weight each model by the inverse of its RMSE (lower error -> larger
# weight) and normalise the weights so they sum to one.
inverse_rmse = [1 / entry['rmse'] for _, entry in top_models]
weights = np.array(inverse_rmse) / sum(inverse_rmse)

# Final ensemble: weighted average of the per-model predictions.
final_preds = np.average(test_predictions, axis=0, weights=weights)

## 8. Feature Importance

In [None]:
# Fitted pipeline of the best-scoring model. Every stage needed to name the
# final estimator's inputs is read from this one pipeline.
best_model_name = top_models[0][0]
best_pipeline = results[best_model_name]['model']
final_estimator = best_pipeline.named_steps['model']

# Names of all columns produced by the preprocessor, in ColumnTransformer
# order: numerical columns, one-hot columns, then the ordinal time code.
fitted_onehot = (best_pipeline.named_steps['preprocessor']
                 .named_transformers_['cat']
                 .named_steps['onehot'])
all_feature_names = (list(numerical_cols) +
                     list(fitted_onehot.get_feature_names_out(categorical_cols)) +
                     ['Publication_Time_Ordinal'])

# The final estimator is trained on the output of SelectFromModel, which
# keeps only a subset of the preprocessed columns. Its importances therefore
# line up with the selected names only; pairing them with the full name list
# would raise a length-mismatch ValueError in the DataFrame constructor.
selected_mask = best_pipeline.named_steps['feature_selector'].get_support()
feature_names = [
    feature for feature, keep in zip(all_feature_names, selected_mask) if keep
]

# Display feature importance from best model. Models without a
# feature_importances_ attribute are skipped.
if hasattr(final_estimator, 'feature_importances_'):
    importances = final_estimator.feature_importances_
    importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': importances})
    importance_df = importance_df.sort_values('Importance', ascending=False)

    plt.figure(figsize=(12, 8))
    sns.barplot(x='Importance', y='Feature',
                data=importance_df.head(20),
                palette='rocket')
    plt.title(f'Top 20 Features by Importance ({best_model_name})')
    plt.tight_layout()
    plt.show()

## 9. Generate Submission

In [None]:
# Overwrite the template's target column with the ensemble predictions and
# write the result, without the index, to the working directory.
sample_submission = sample_submission.assign(Listening_Time_minutes=final_preds)
sample_submission.to_csv('submission.csv', index=False)
print("Submission file saved!")

# Last expression of the cell: preview the first submitted rows.
sample_submission.head()