In [None]:
# Spotify Popularity Prediction

This notebook builds a predictive model for Spotify track popularity (0-100) using audio and metadata features.

It includes:
- Data loading and cleaning
- Exploratory data analysis (EDA)
- Feature engineering (e.g., artist count, title signals)
- Modeling (Linear Regression, Random Forest, optional XGBoost)
- Evaluation (MAE, RMSE, R²)
- Visualizations (correlations, feature importances, SHAP, partial dependence)

Set `DATASET_PATH` in the first code cell below if your dataset CSV is stored somewhere other than the default location.


In [None]:
# Paths
import os
from pathlib import Path

# Location of the dataset CSV. It can be overridden without editing the notebook
# by setting the SPOTIFY_DATASET_PATH environment variable; when the variable is
# unset, the original default location is used.
DATASET_PATH = Path(
    os.environ.get(
        'SPOTIFY_DATASET_PATH',
        '/Users/anooptejthotapalli/Downloads/dataset-3-1.csv',
    )
).expanduser()
# Raise explicitly rather than `assert`: assertions are skipped when Python runs
# with -O, which would defer the failure to a less clear error in read_csv.
if not DATASET_PATH.exists():
    raise FileNotFoundError(f"Dataset not found at {DATASET_PATH}")

# Basics
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
pd.set_option('display.max_columns', None)

# Load
df = pd.read_csv(DATASET_PATH)
print(df.shape)
df.head()


In [None]:
# Clean column names: fix possible line-wrapped header cells
# Sometimes CSVs from certain sources wrap header lines; we'll normalize columns.
df.columns = [c.strip().replace('\n', '').replace('  ', ' ') for c in df.columns]

# Rename common wrapped columns if present.
# DataFrame.rename ignores keys that are not existing columns, so no
# pre-filtering of the mapping is needed.
rename_map = {
    'danceabi lity': 'danceability',
    'mod e': 'mode',
    'time_signatur e': 'time_signature',
}
df = df.rename(columns=rename_map)

# Drop every unnamed index column, not just the first one found: any leftover
# 'Unnamed: N' column is numeric and would otherwise be picked up as a feature.
unnamed_cols = [col for col in df.columns if col.lower().startswith('unnamed')]
df = df.drop(columns=unnamed_cols)

print('Columns:', list(df.columns))
df.head()


In [None]:
# Basic EDA
# Numeric columns feed both the summary table and the correlation matrix.
numeric_cols = list(df.select_dtypes(include=[np.number]).columns)
non_target_numeric = [col for col in numeric_cols if col != 'popularity']

numeric_frame = df[numeric_cols]
print(numeric_frame.describe().T)

# Correlation heatmap
corr_fig, corr_ax = plt.subplots(figsize=(12, 10))
sns.heatmap(numeric_frame.corr(), cmap='coolwarm', center=0, ax=corr_ax)
corr_ax.set_title('Correlation Heatmap (Numeric Features)')
plt.show()

# Popularity distribution
pop_fig, pop_ax = plt.subplots(figsize=(6, 4))
sns.histplot(df['popularity'], bins=30, kde=True, ax=pop_ax)
pop_ax.set_title('Popularity Distribution')
plt.show()


In [None]:
# Feature engineering
from sklearn.preprocessing import OneHotEncoder

# Derived features
if 'artists' in df.columns:
    # The artists string is split on ';', so the count is separators + 1
    # (vectorized equivalent of splitting and taking the length).
    df['num_artists'] = df['artists'].astype(str).str.count(';') + 1
else:
    df['num_artists'] = 1

if 'track_name' in df.columns:
    # Fill missing titles with '' first: astype(str) alone turns NaN into the
    # literal text 'nan', which would report a title length of 3.
    lower_titles = df['track_name'].fillna('').astype(str).str.lower()
    df['title_len'] = lower_titles.str.len()
    # Raw string so that `\.` and `\b` are regex escapes rather than (invalid)
    # Python string escapes. Word boundaries stop substrings such as 'defeat',
    # 'without' or 'left.' from being flagged; 'featuring' is listed explicitly
    # because the bounded 'feat' no longer matches it as a prefix.
    feat_pattern = r'\b(?:feat|featuring|with)\b|\bft\.'
    df['title_has_feat'] = lower_titles.str.contains(feat_pattern, regex=True).astype(int)
else:
    df['title_len'] = 0
    df['title_has_feat'] = 0

categorical_cols = []
if 'track_genre' in df.columns:
    categorical_cols.append('track_genre')

# One-hot encode the categorical columns (keep low cardinality)
if categorical_cols:
    enc = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
    ohe = enc.fit_transform(df[categorical_cols])
    # Pair each encoded column with its own category list so the naming stays
    # correct if more columns are ever added to categorical_cols.
    ohe_cols = [
        f"{col}__{cat}"
        for col, cats in zip(categorical_cols, enc.categories_)
        for cat in cats
    ]
    ohe_df = pd.DataFrame(ohe, columns=ohe_cols, index=df.index)
    df = pd.concat([df.drop(columns=categorical_cols), ohe_df], axis=1)

# Everything except the target and the identifier/text columns is a feature.
non_feature_cols = ['popularity', 'track_id', 'artists', 'album_name', 'track_name']
features = [c for c in df.columns if c not in non_feature_cols]
X = df[features]
y = df['popularity']
X.shape, y.shape


In [None]:
# Train/test split
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts
X_train.shape, X_test.shape


In [None]:
# Baseline and tree-based models
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

models = {
    'LinearRegression': LinearRegression(),
    'RandomForest': RandomForestRegressor(n_estimators=300, random_state=42, n_jobs=-1)
}

# Fit every candidate on the training split and score it on the held-out split.
results = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    mae = mean_absolute_error(y_test, preds)
    # RMSE as sqrt(MSE): the `squared=False` keyword was deprecated in
    # scikit-learn 1.4 and removed in 1.6, where passing it raises TypeError.
    rmse = mean_squared_error(y_test, preds) ** 0.5
    r2 = r2_score(y_test, preds)
    results[name] = {'MAE': mae, 'RMSE': rmse, 'R2': r2}
    print(f"{name}: MAE={mae:.3f} RMSE={rmse:.3f} R2={r2:.3f}")

# The model with the highest R2 on the test split is carried into later cells.
best_model_name = max(results, key=lambda k: results[k]['R2'])
best_model = models[best_model_name]
print('Best model:', best_model_name)


In [None]:
# Feature importance (for tree models)
importances = None
if not hasattr(best_model, 'feature_importances_'):
    print('Best model has no feature_importances_ attribute.')
else:
    # Label each importance with its training column, then keep the 20 largest.
    importances = pd.Series(best_model.feature_importances_, index=X_train.columns)
    ranked = importances.sort_values(ascending=False)
    top20 = ranked.head(20)
    imp_fig, imp_ax = plt.subplots(figsize=(8, 6))
    sns.barplot(x=top20.values, y=top20.index, ax=imp_ax)
    imp_ax.set_title(f'Top 20 Feature Importances ({best_model_name})')
    imp_fig.tight_layout()
    plt.show()


In [None]:
# SHAP explanations (optional; can be heavy on very large datasets)
!pip -q install shap >/dev/null
import shap

# Use a sample for speed
sample_idx = np.random.RandomState(42).choice(X_test.index, size=min(2000, len(X_test)), replace=False)
X_sample = X_test.loc[sample_idx]

explainer = None
if best_model_name == 'RandomForest':
    explainer = shap.TreeExplainer(best_model)
    shap_values = explainer.shap_values(X_sample)
    shap.summary_plot(shap_values, X_sample, show=False)
    plt.tight_layout()
    plt.show()
else:
    # KernelExplainer fallback
    explainer = shap.KernelExplainer(best_model.predict, shap.sample(X_train, 200))
    shap_values = explainer.shap_values(X_sample)
    shap.summary_plot(shap_values, X_sample, show=False)
    plt.tight_layout()
    plt.show()


In [None]:
# Evaluation visualizations
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Re-score the selected model on the held-out split.
preds = best_model.predict(X_test)
mae = mean_absolute_error(y_test, preds)
# RMSE as sqrt(MSE): the `squared=False` keyword was deprecated in
# scikit-learn 1.4 and removed in 1.6, where passing it raises TypeError.
rmse = mean_squared_error(y_test, preds) ** 0.5
r2 = r2_score(y_test, preds)
print(f"Final ({best_model_name}) -> MAE={mae:.3f} RMSE={rmse:.3f} R2={r2:.3f}")

# Prediction vs Actual
plt.figure(figsize=(6,6))
plt.scatter(y_test, preds, alpha=0.3)
# Shared limits so the dashed y = x reference line spans both axes' data range.
lims = [min(y_test.min(), preds.min()), max(y_test.max(), preds.max())]
plt.plot(lims, lims, 'r--')
plt.xlabel('Actual Popularity')
plt.ylabel('Predicted Popularity')
plt.title('Prediction vs Actual')
plt.tight_layout()
plt.show()

# Residuals
residuals = y_test - preds
plt.figure(figsize=(6,4))
sns.histplot(residuals, bins=30, kde=True)
plt.title('Residuals Distribution')
plt.xlabel('Residual (Actual - Predicted)')
plt.tight_layout()
plt.show()


In [None]:
# Partial Dependence (selected features)
from sklearn.inspection import PartialDependenceDisplay

# Keep only the candidate features that are actually present in the training data.
pdp_candidates = ['danceability','energy','acousticness','valence','tempo']
selected_features = [name for name in pdp_candidates if name in X_train.columns]

if not selected_features:
    print('Selected features not found for PDP.')
else:
    fig, ax = plt.subplots(figsize=(10, 6))
    PartialDependenceDisplay.from_estimator(best_model, X_train, selected_features, ax=ax)
    fig.suptitle('Partial Dependence Plots')
    fig.tight_layout()
    plt.show()


In [None]:
# Save model
import joblib
MODEL_PATH = Path('../models/best_spotify_popularity_model.pkl')
# Create the target directory first; dumping into a missing '../models'
# directory would otherwise fail with FileNotFoundError.
MODEL_PATH.parent.mkdir(parents=True, exist_ok=True)
joblib.dump(best_model, MODEL_PATH)
print('Saved model to', MODEL_PATH.resolve())
