In [None]:
# Spotify Popularity Prediction — Clean Pipeline

This notebook predicts Spotify track popularity with a structured, presentation-ready workflow:
- Imports & setup
- Load data
- EDA (distribution + correlations)
- Robust preprocessing (numeric/categorical)
- Random Forest with small grid search
- Evaluation (metrics + plots)
- Feature importances


In [None]:
# 1) Imports & Setup
import os
import warnings
from pathlib import Path
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Plot styling shared by every figure in the notebook.
sns.set(style="whitegrid", context="talk")
plt.rcParams["figure.dpi"] = 120

# Dataset location. Set the SPOTIFY_DATASET_PATH environment variable to run the
# notebook on another machine; the original local path is kept as the fallback
# so an existing setup keeps working unchanged.
DATASET_PATH = Path(
    os.environ.get(
        "SPOTIFY_DATASET_PATH",
        "/Users/anooptejthotapalli/Downloads/dataset-3-1.csv",
    )
).expanduser()
RANDOM_STATE = 42  # seed shared by the train/test split and the random forest
TEST_SIZE = 0.2    # fraction of rows held out for the final evaluation

# Fail fast before any work is done; is_file() also rejects a directory at that path.
assert DATASET_PATH.is_file(), f"Dataset not found at {DATASET_PATH}"


In [None]:
# 2) Load Data
# Read the CSV pointed to by DATASET_PATH, report its size, and preview three rows.
df = pd.read_csv(DATASET_PATH)
n_rows, n_cols = df.shape
print("Data shape:", (n_rows, n_cols))
df.head(n=3)


In [None]:
# 3) Light Cleaning & EDA
# Work on a copy so the raw frame loaded above stays untouched.
df_model = df.copy()

# Cast the boolean `explicit` flag to 0/1 so it is handled as a numeric feature.
# Only a genuinely boolean column is converted: astype(int) raises when the
# column holds missing values or free-form strings, and an already-numeric
# column needs no conversion.
if 'explicit' in df_model.columns and pd.api.types.is_bool_dtype(df_model['explicit']):
    df_model['explicit'] = df_model['explicit'].astype(int)

# EDA runs on the cleaned copy so the converted `explicit` flag shows up in the
# correlation heatmap next to the other numeric columns.
numeric_cols_eda = df_model.select_dtypes(include=[np.number]).columns.tolist()

fig, ax = plt.subplots(1, 2, figsize=(12, 4))

# Left panel: distribution of the target.
sns.histplot(df_model['popularity'], bins=30, kde=True, ax=ax[0])
ax[0].set_title('Popularity Distribution')
ax[0].set_xlabel('popularity')
ax[0].set_ylabel('count')

# Right panel: pairwise correlations between numeric columns (target included).
sns.heatmap(df_model[numeric_cols_eda].corr(), cmap='coolwarm', center=0, ax=ax[1])
ax[1].set_title('Correlation Heatmap (numeric)')
plt.tight_layout()
plt.show()


In [None]:
# 4) Features/Target & Column Types
assert 'popularity' in df_model.columns, "Target 'popularity' not found."

# Rows with a missing target cannot be used to fit or score a regressor.
df_labeled = df_model.dropna(subset=['popularity'])

# The target plus identifier / free-text columns are excluded from the features;
# errors='ignore' tolerates datasets where some of them are absent.
X = df_labeled.drop(
    columns=['popularity', 'track_id', 'artists', 'album_name', 'track_name'],
    errors='ignore',
)

# pandas labels header-less CSV columns 'Unnamed: <n>' (typically a row index
# written out by an earlier export). Such a column describes file order, not the
# track, so it must not be fed to the model as a numeric feature.
unnamed_mask = X.columns.astype(str).str.startswith('Unnamed:')
X = X.loc[:, ~unnamed_mask]
y = df_labeled['popularity']

# Split the remaining columns by dtype; each group gets its own preprocessing.
numeric_features = X.select_dtypes(include=[np.number]).columns.tolist()
categorical_features = X.select_dtypes(exclude=[np.number]).columns.tolist()

print('Numeric features:', len(numeric_features))
print('Categorical features:', len(categorical_features))
numeric_features[:10]


In [None]:
# 5) Preprocessing & Model Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor

# Numeric columns: fill gaps with the column mean, then standardise.
numeric_steps = [
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
]
numeric_transformer = Pipeline(steps=numeric_steps)

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode; categories unseen during fit are ignored at prediction time.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ohe', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Route each column group through its transformer; columns in neither list are dropped.
column_routes = [
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_routes, remainder='drop')

# Full pipeline: preprocessing followed by the random forest regressor.
forest = RandomForestRegressor(random_state=RANDOM_STATE, n_jobs=-1)
pipe = Pipeline(steps=[('preprocessor', preprocessor), ('model', forest)])

from sklearn.model_selection import train_test_split
# Hold out TEST_SIZE of the rows for the final evaluation; the fixed seed keeps
# the split identical across runs.
split_options = {'test_size': TEST_SIZE, 'random_state': RANDOM_STATE}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

from sklearn.model_selection import GridSearchCV
# Small search space over the forest size and tree depth; the `model__` prefix
# addresses the 'model' step of the pipeline.
param_grid = dict(
    model__n_estimators=[100, 200],
    model__max_depth=[None, 8, 16],
)

# 3-fold cross-validated search scored by negative MSE (higher is better).
search_options = dict(
    scoring='neg_mean_squared_error',
    cv=3,
    n_jobs=-1,
    verbose=1,
)
grid = GridSearchCV(estimator=pipe, param_grid=param_grid, **search_options)
grid.fit(X_train, y_train)

# Keep the winning pipeline for evaluation and feature-importance inspection.
best_model = grid.best_estimator_

print('Best Params:', grid.best_params_)


In [None]:
# 6) Evaluation
from sklearn.metrics import mean_squared_error, r2_score

# Score the tuned pipeline on the held-out split.
y_pred = best_model.predict(X_test)

# Take the square root explicitly instead of passing `squared=False`: that
# keyword was deprecated in scikit-learn 1.4 and later removed, so it raises
# TypeError on current releases. sqrt(MSE) is the same value on every version.
rmse = float(np.sqrt(mean_squared_error(y_test, y_pred)))
r2 = r2_score(y_test, y_pred)

print(f"Random Forest RMSE: {rmse:.2f}")
print(f"Random Forest R^2:  {r2:.2f}")

fig, ax = plt.subplots(1, 2, figsize=(12, 5))
ax_scatter, ax_resid = ax

# Left panel: predictions against ground truth, with the y = x reference line
# spanning the combined range of both series.
lower = min(y_test.min(), y_pred.min())
upper = max(y_test.max(), y_pred.max())
lims = [lower, upper]
ax_scatter.scatter(y_test, y_pred, alpha=0.3)
ax_scatter.plot(lims, lims, 'r--')
ax_scatter.set_title('Predicted vs Actual')
ax_scatter.set_xlabel('Actual popularity')
ax_scatter.set_ylabel('Predicted popularity')

# Right panel: distribution of the errors (actual minus predicted).
residuals = y_test - y_pred
sns.histplot(residuals, bins=30, kde=True, ax=ax_resid)
ax_resid.set_title('Residuals Distribution')
ax_resid.set_xlabel('Residual (Actual - Predicted)')

plt.tight_layout()
plt.show()


In [None]:
# 7) Feature Importances
from typing import List
from sklearn.compose import ColumnTransformer

def get_feature_names_from_column_transformer(ct: "ColumnTransformer") -> List[str]:
    """Return the output feature names of a fitted ColumnTransformer, in output order.

    Walks ``ct.transformers_`` and, for each ``(name, transformer, columns)``
    entry, collects the names that entry contributes:

    * ``'drop'`` entries (remainder or not) contribute nothing.
    * Entries with an empty column selection are skipped.
    * Other string entries (``'passthrough'``) contribute their columns as-is.
    * Estimators are asked via their own ``get_feature_names_out(columns)``. If
      that is unavailable, the last step of a pipeline (``named_steps``) is asked
      instead. If neither is available, the input column names are used unchanged.

    Args:
        ct: A fitted ColumnTransformer (any object exposing ``transformers_``).

    Returns:
        The collected feature names, each converted to ``str``. Column
        selections given as slices or boolean masks are not resolved to labels;
        they are stringified as-is.
    """
    feature_names: list = []
    for _name, transformer, cols in ct.transformers_:
        # A bare label (or any non-iterable selector) stands for a single column;
        # a string must not be iterated or it would be split into characters.
        if isinstance(cols, str) or not hasattr(cols, '__iter__'):
            cols = [cols]
        else:
            cols = list(cols)

        if isinstance(transformer, str):
            # 'drop' emits no columns regardless of the entry's name;
            # 'passthrough' forwards its columns unchanged.
            if transformer != 'drop':
                feature_names.extend(cols)
            continue

        # Nothing was selected, so there is nothing to name (and nothing to ask
        # the transformer about).
        if not cols:
            continue

        names = None
        if hasattr(transformer, 'get_feature_names_out'):
            try:
                # Covers bare transformers (e.g. an encoder that expands one
                # column into several) as well as pipelines that can chain names
                # through all of their steps.
                names = transformer.get_feature_names_out(cols)
            except AttributeError:
                # Some step cannot report names; fall back below.
                names = None
        if names is None and hasattr(transformer, 'named_steps'):
            last = list(transformer.named_steps.values())[-1]
            if hasattr(last, 'get_feature_names_out'):
                names = last.get_feature_names_out(cols)

        feature_names.extend(cols if names is None else names)
    return [str(f) for f in feature_names]

# Pull the fitted preprocessing and forest steps out of the tuned pipeline.
fitted_steps = best_model.named_steps
pre = fitted_steps['preprocessor']
rf = fitted_steps['model']

# Pair each importance with its post-preprocessing feature name, largest first.
feature_names = get_feature_names_from_column_transformer(pre)
importances = pd.Series(rf.feature_importances_, index=feature_names)
importances = importances.sort_values(ascending=False)

TOP_N = 15
top_importances = importances.head(TOP_N)

# Horizontal bar chart of the TOP_N most important features.
fig_imp, ax_imp = plt.subplots(figsize=(8, 6))
sns.barplot(x=top_importances.values, y=top_importances.index, orient='h', ax=ax_imp)
ax_imp.set_title(f'Top {TOP_N} Feature Importances')
ax_imp.set_xlabel('Importance')
ax_imp.set_ylabel('Feature')
fig_imp.tight_layout()
plt.show()

print('Top 5 features:\n', importances.head(5))
