# Day 4: Spotify Track Data Analysis

Exploratory Data Analysis and Classification Modeling on Spotify Track Datasets.

We analyze track popularity, artist metrics, and other features to understand patterns in music data, then build classification models to predict popularity categories.

## Import Libraries

In [None]:
import pandas as pd
import numpy as np
import os

# Visualization
import plotly.express as px
import plotly.graph_objects as go
import plotly.subplots as sp
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import seaborn as sns

# Machine Learning
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import (
    classification_report, confusion_matrix,
    accuracy_score, precision_score, recall_score, f1_score
)

import joblib
import warnings
warnings.filterwarnings('ignore')

# Create the folders that later cells write into (saved models and figures).
# exist_ok=True makes re-running this cell a no-op when they already exist.
for output_dir in ('../models', '../viz'):
    os.makedirs(output_dir, exist_ok=True)

## Load and Explore Data

In [None]:
# Read the two CSV exports; both frames are referenced by later cells.
track_data = pd.read_csv('../data/track_data_final.csv')
spotify_clean = pd.read_csv('../data/spotify_data_clean.csv')

# Preview each frame the same way: banner, (rows, columns), first five rows.
previews = [
    ('=== track_data_final.csv ===', track_data),
    ('\n=== spotify_data_clean.csv ===', spotify_clean),
]
for banner, frame in previews:
    print(banner)
    print(f'Shape: {frame.shape}')
    display(frame.head())

In [None]:
# Column names, dtypes and non-null counts for both frames.
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in display() would render a stray "None" after it.
print('=== track_data_final info ===')
track_data.info()
print('\n=== spotify_data_clean info ===')
spotify_clean.info()

In [None]:
# Descriptive statistics for track_data, as produced by describe().
print('=== track_data_final - Statistical Summary ===')
track_summary = track_data.describe()
display(track_summary)

# Number of null entries in each column.
print('\n=== Missing Values ===')
track_missing = track_data.isnull().sum()
display(track_missing)

In [None]:
# Descriptive statistics for spotify_clean, as produced by describe().
print('=== spotify_data_clean - Statistical Summary ===')
clean_summary = spotify_clean.describe()
display(clean_summary)

# Number of null entries in each column.
print('\n=== Missing Values ===')
clean_missing = spotify_clean.isnull().sum()
display(clean_missing)

## Data Cleaning

We'll work primarily with `track_data_final.csv` for modeling since it has duration in milliseconds. We'll also use `spotify_data_clean.csv` for supplementary analysis.

In [None]:
# Keep only rows where both artist-level metrics are present. The .copy()
# makes df an independent frame, so the column assignments below never write
# through to track_data.
df = track_data.dropna(subset=['artist_popularity', 'artist_followers']).copy()

# Store the explicit flag as an integer column; it is used later in the
# correlation matrix and as a model feature.
df['explicit'] = df['explicit'].astype(int)

# Release year. Full-date parsing is tried first. Values it cannot parse
# (coerced to NaT) fall back to a leading four-digit year, so year-only or
# year-month strings still yield a year instead of NaN.
# NOTE(review): assumes album_release_date may mix precisions
# (YYYY / YYYY-MM / YYYY-MM-DD) - confirm against the data.
release_dates = df['album_release_date']
parsed_year = pd.to_datetime(release_dates, errors='coerce').dt.year
leading_year = pd.to_numeric(
    release_dates.astype(str).str.extract(r'^(\d{4})(?:-|$)', expand=False),
    errors='coerce',
)
df['release_year'] = parsed_year.fillna(leading_year)

# Duration in minutes (60,000 ms per minute) for readability.
df['track_duration_min'] = df['track_duration_ms'] / 60000

# Bucket track_popularity into four labelled classes for classification.
# pd.cut intervals are right-inclusive by default, giving (-1, 25], (25, 50],
# (50, 75], (75, 100]; the -1 lower edge puts a popularity of 0 in 'Low'.
df['popularity_category'] = pd.cut(
    df['track_popularity'],
    bins=[-1, 25, 50, 75, 100],
    labels=['Low', 'Medium', 'High', 'Very High']
)

print(f'Cleaned dataset shape: {df.shape}')
display(df['popularity_category'].value_counts())

## Exploratory Data Analysis

### Track Popularity Distribution

In [None]:
# Two side-by-side panels: raw popularity histogram (left) and number of
# tracks per popularity category (right).
fig, (ax_hist, ax_cats) = plt.subplots(1, 2, figsize=(14, 5))

# Left panel: distribution of the raw popularity score in 40 bins.
ax_hist.hist(df['track_popularity'], bins=40, color='#1DB954', edgecolor='black', alpha=0.8)
ax_hist.set_title('Distribution of Track Popularity')
ax_hist.set_xlabel('Track Popularity')
ax_hist.set_ylabel('Count')

# Right panel: category counts; sort_index() keeps the bars in category order
# rather than ordered by frequency.
cat_counts = df['popularity_category'].value_counts().sort_index()
bar_colors = ['#e74c3c', '#f39c12', '#2ecc71', '#3498db']
ax_cats.bar(cat_counts.index.astype(str), cat_counts.values, color=bar_colors, edgecolor='black')
ax_cats.set_title('Tracks by Popularity Category')
ax_cats.set_xlabel('Category')
ax_cats.set_ylabel('Count')

plt.tight_layout()
plt.savefig('../viz/popularity_distribution.png', dpi=150, bbox_inches='tight')
plt.show()
print('Saved: viz/popularity_distribution.png')

### Artist Popularity vs Track Popularity

In [None]:
# Scatter track popularity against each artist-level metric, one panel each.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# (x-axis column, marker colour, human-readable label) per panel.
panels = [
    ('artist_popularity', '#1DB954', 'Artist Popularity'),
    ('artist_followers', '#e74c3c', 'Artist Followers'),
]
for ax, (column, marker_color, label) in zip(axes, panels):
    # Small, semi-transparent markers keep dense regions readable.
    ax.scatter(df[column], df['track_popularity'], alpha=0.3, s=10, color=marker_color)
    ax.set_title(f'{label} vs Track Popularity')
    ax.set_xlabel(label)
    ax.set_ylabel('Track Popularity')

plt.tight_layout()
plt.savefig('../viz/artist_vs_track_popularity.png', dpi=150, bbox_inches='tight')
plt.show()
print('Saved: viz/artist_vs_track_popularity.png')

### Correlation Heatmap

In [None]:
# Columns included in the pairwise correlation matrix.
numeric_cols = [
    'track_popularity',
    'track_duration_ms',
    'artist_popularity',
    'artist_followers',
    'track_number',
    'album_total_tracks',
    'explicit',
]
corr = df[numeric_cols].corr()

# Annotated heatmap; center=0 puts the midpoint of the diverging colormap at
# zero correlation.
heatmap_style = dict(annot=True, fmt='.2f', cmap='RdYlGn', center=0,
                     square=True, linewidths=0.5)
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr, ax=ax, **heatmap_style)
ax.set_title('Correlation Heatmap of Numeric Features')
plt.tight_layout()
plt.savefig('../viz/correlation_heatmap.png', dpi=150, bbox_inches='tight')
plt.show()
print('Saved: viz/correlation_heatmap.png')

### Top 15 Artists by Average Track Popularity

In [None]:
# Mean popularity and track count per artist.
artist_stats = df.groupby('artist_name')['track_popularity'].agg(['mean', 'count'])

# Require at least 3 tracks so a single track cannot put an artist on the
# chart, then keep the 15 highest means.
eligible = artist_stats[artist_stats['count'] >= 3]
top_artists = eligible.sort_values('mean', ascending=False).head(15)

# barh draws the first row at the bottom, so reverse the order to put the
# highest-ranked artist at the top of the chart.
bottom_up = top_artists.iloc[::-1]

plt.figure(figsize=(12, 6))
plt.barh(bottom_up.index, bottom_up['mean'].values, color='#1DB954', edgecolor='black')
plt.xlabel('Average Track Popularity')
plt.title('Top 15 Artists by Average Track Popularity (min 3 tracks)')
plt.tight_layout()
plt.savefig('../viz/top_artists.png', dpi=150, bbox_inches='tight')
plt.show()
print('Saved: viz/top_artists.png')

### Box Plot: Track Popularity by Album Type

In [None]:
# Restrict the plot to the (up to) five most frequent album types.
album_types = df['album_type'].value_counts().head(5).index
subset = df[df['album_type'].isin(album_types)]

# One box per album type showing the spread of track popularity.
fig, ax = plt.subplots(figsize=(10, 5))
sns.boxplot(data=subset, x='album_type', y='track_popularity', palette='Set2', ax=ax)
ax.set_title('Track Popularity by Album Type')
ax.set_xlabel('Album Type')
ax.set_ylabel('Track Popularity')
plt.tight_layout()
plt.savefig('../viz/popularity_by_album_type.png', dpi=150, bbox_inches='tight')
plt.show()
print('Saved: viz/popularity_by_album_type.png')

### Explicit vs Non-Explicit Track Popularity

In [None]:
# Compare popularity spread between the two values of the integer
# `explicit` flag (0 and 1).
fig, ax = plt.subplots(figsize=(8, 5))
flag_palette = ['#2ecc71', '#e74c3c']
sns.boxplot(data=df, x='explicit', y='track_popularity', palette=flag_palette, ax=ax)

# Replace the 0/1 tick labels with readable names.
ax.set_xticks([0, 1])
ax.set_xticklabels(['Non-Explicit', 'Explicit'])
ax.set_title('Track Popularity: Explicit vs Non-Explicit')
ax.set_xlabel('Explicit')
ax.set_ylabel('Track Popularity')
plt.tight_layout()
plt.savefig('../viz/explicit_vs_popularity.png', dpi=150, bbox_inches='tight')
plt.show()
print('Saved: viz/explicit_vs_popularity.png')

### Genre Analysis

In [None]:
# Parse genres (stored as string representations of lists)
import ast

def parse_genres(genre_str):
    """Parse one raw ``artist_genres`` cell into a list of genre names.

    Accepted inputs:
      * missing values (None / NaN) and the placeholders 'N/A', '[]', ''
        (surrounding whitespace ignored) -> ``[]``
      * a Python-literal list such as "['pop', 'rock']" -> its elements
      * a quoted or other scalar literal such as "'pop'" -> a one-item list
      * any other text -> split on commas, so "pop, rock" gives two genres
      * an already-parsed list-like value -> its elements

    Returns:
        list[str]: genre names with surrounding whitespace stripped and empty
        entries removed. The result is always a list, so ``Series.explode()``
        handles every row uniformly.
    """
    if pd.api.types.is_list_like(genre_str):
        # Already a sequence of genres; only normalise the elements below.
        items = genre_str
    else:
        if not isinstance(genre_str, str):
            if pd.isna(genre_str):
                return []
            genre_str = str(genre_str)

        text = genre_str.strip()
        if text in ('N/A', '[]', ''):
            return []

        try:
            # literal_eval only evaluates Python literals, never arbitrary code.
            parsed = ast.literal_eval(text)
        except (ValueError, SyntaxError, TypeError, RecursionError):
            # Not a Python literal: treat it as plain comma-separated text.
            parsed = text.split(',')

        if parsed is None:
            return []
        # A scalar literal (e.g. "'pop'") is wrapped so a list is always returned.
        items = parsed if pd.api.types.is_list_like(parsed) else [parsed]

    cleaned = (str(item).strip() for item in items)
    return [name for name in cleaned if name]

# Turn every artist_genres cell into a list, then flatten to one genre per row.
genre_lists = df['artist_genres'].apply(parse_genres)
all_genres = genre_lists.explode()

# The 20 most frequent genres (value_counts sorts by count, descending).
genre_counts = all_genres.value_counts().head(20)

# barh draws the first entry at the bottom, so reverse the order to put the
# most common genre at the top.
bottom_up = genre_counts.iloc[::-1]

fig, ax = plt.subplots(figsize=(12, 6))
ax.barh(bottom_up.index, bottom_up.values, color='#3498db', edgecolor='black')
ax.set_xlabel('Number of Tracks')
ax.set_title('Top 20 Genres by Track Count')
plt.tight_layout()
plt.savefig('../viz/top_genres.png', dpi=150, bbox_inches='tight')
plt.show()
print('Saved: viz/top_genres.png')

## Key Observations

- **Popularity Distribution**: Track popularity is right-skewed with many tracks having low popularity scores. Most tracks fall in the Low-Medium range.
- **Artist-Track Correlation**: Artist popularity has a moderate positive correlation with track popularity; more popular artists tend to have more popular tracks.
- **Artist Followers**: High follower count shows some correlation with track popularity, but the relationship is noisy.
- **Album Type**: Tracks from albums tend to have wider popularity ranges than singles or compilations.
- **Explicit Content**: Explicit and non-explicit tracks show similar popularity distributions.
- **Genre Trends**: Pop, hip-hop, and related genres dominate the track count.

## Classification Models

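We predict the popularity category of each track using numeric features. The categories are the `track_popularity` bins defined during data cleaning: Low (0–25), Medium (above 25 up to 50), High (above 50 up to 75), and Very High (above 75 up to 100).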

### Feature Preparation

In [None]:
# Features for modeling
feature_cols = ['track_duration_ms', 'artist_popularity', 'artist_followers',
                'track_number', 'album_total_tracks', 'explicit']

model_df = df[feature_cols + ['popularity_category']].dropna()

# Encode target
le = LabelEncoder()
model_df['target'] = le.fit_transform(model_df['popularity_category'])

X = model_df[feature_cols]
y = model_df['target']

print(f'Features shape: {X.shape}')
print(f'Target distribution:\n{model_df["popularity_category"].value_counts().sort_index()}')
print(f'\nTarget encoding: {dict(zip(le.classes_, le.transform(le.classes_)))}')

### Train-Test Split & Scaling

In [None]:
# 80/20 hold-out split. stratify=y keeps class proportions the same in both
# parts; the fixed seed makes the split reproducible.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Fit the scaler on the training rows only, then apply that same
# transformation to both partitions.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

n_train, n_test = len(X_train), len(X_test)
print(f'Train: {n_train} samples')
print(f'Test:  {n_test} samples')

### Model 1: Logistic Regression

In [None]:
# Multiclass logistic regression on the standardised features.
# `multi_class` is intentionally not passed: the parameter is deprecated in
# recent scikit-learn releases (1.5+) and scheduled for removal, and the
# default already fits a multinomial model when the target has more than two
# classes, so the resulting model is the same.
lr_model = LogisticRegression(max_iter=1000, random_state=42)
lr_model.fit(X_train_scaled, y_train)
lr_pred = lr_model.predict(X_test_scaled)

print('=== Logistic Regression ===')
print(f'Accuracy: {accuracy_score(y_test, lr_pred):.4f}')
print('\nClassification Report:')
# target_names maps the integer codes back to the category labels.
print(classification_report(y_test, lr_pred, target_names=le.classes_))

### Model 2: Random Forest Classifier

In [None]:
# Random forest trained on the unscaled feature columns (X_train / X_test).
rf_params = {'n_estimators': 100, 'max_depth': 10, 'random_state': 42}
rf_model = RandomForestClassifier(**rf_params)
rf_model.fit(X_train, y_train)
rf_pred = rf_model.predict(X_test)

# Compute the metrics first, then print the report.
rf_accuracy = accuracy_score(y_test, rf_pred)
rf_report = classification_report(y_test, rf_pred, target_names=le.classes_)

print('=== Random Forest Classifier ===')
print(f'Accuracy: {rf_accuracy:.4f}')
print('\nClassification Report:')
print(rf_report)

### Model Comparison

In [None]:
# Build comparison table
scoreboard = pd.DataFrame({
    'Model': ['Logistic Regression', 'Random Forest'],
    'Accuracy': [
        accuracy_score(y_test, lr_pred),
        accuracy_score(y_test, rf_pred)
    ],
    'Precision (weighted)': [
        precision_score(y_test, lr_pred, average='weighted'),
        precision_score(y_test, rf_pred, average='weighted')
    ],
    'Recall (weighted)': [
        recall_score(y_test, lr_pred, average='weighted'),
        recall_score(y_test, rf_pred, average='weighted')
    ],
    'F1 Score (weighted)': [
        f1_score(y_test, lr_pred, average='weighted'),
        f1_score(y_test, rf_pred, average='weighted')
    ]
})

display(scoreboard)

# Bar chart comparison
metrics = ['Accuracy', 'Precision (weighted)', 'Recall (weighted)', 'F1 Score (weighted)']
x = np.arange(len(metrics))
width = 0.35

fig, ax = plt.subplots(figsize=(12, 5))
bars1 = ax.bar(x - width/2, scoreboard[metrics].iloc[0].values, width, label='Logistic Regression', color='#3498db')
bars2 = ax.bar(x + width/2, scoreboard[metrics].iloc[1].values, width, label='Random Forest', color='#1DB954')

ax.set_ylabel('Score')
ax.set_title('Model Comparison')
ax.set_xticks(x)
ax.set_xticklabels(metrics, rotation=15)
ax.legend()
ax.set_ylim(0, 1)
ax.bar_label(bars1, fmt='%.3f', fontsize=8)
ax.bar_label(bars2, fmt='%.3f', fontsize=8)

plt.tight_layout()
plt.savefig('../viz/model_comparison.png', dpi=150, bbox_inches='tight')
plt.show()
print('Saved: viz/model_comparison.png')

### Confusion Matrices

In [None]:
# One confusion-matrix heatmap per model, side by side.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

model_outputs = {
    'Logistic Regression': lr_pred,
    'Random Forest': rf_pred,
}
class_names = le.classes_

for ax, (name, pred) in zip(axes, model_outputs.items()):
    # Rows are the actual classes, columns the predicted classes.
    cm = confusion_matrix(y_test, pred)
    sns.heatmap(
        cm,
        annot=True,
        fmt='d',
        cmap='Blues',
        ax=ax,
        xticklabels=class_names,
        yticklabels=class_names,
    )
    ax.set_title(f'Confusion Matrix: {name}')
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Actual')

plt.tight_layout()
plt.savefig('../viz/confusion_matrices.png', dpi=150, bbox_inches='tight')
plt.show()
print('Saved: viz/confusion_matrices.png')

### Feature Importance (Random Forest)

In [None]:
# Pair each importance score from the fitted forest with its feature name.
# Ascending sort puts the most important feature at the top of the horizontal
# bar chart.
importances = pd.Series(rf_model.feature_importances_, index=feature_cols).sort_values()

fig, ax = plt.subplots(figsize=(10, 5))
importances.plot(kind='barh', color='#1DB954', edgecolor='black', ax=ax)
ax.set_title('Feature Importance (Random Forest)')
ax.set_xlabel('Importance')
plt.tight_layout()
plt.savefig('../viz/feature_importance.png', dpi=150, bbox_inches='tight')
plt.show()
print('Saved: viz/feature_importance.png')

## Save Models

In [None]:
# Save both models
joblib.dump(lr_model, '../models/spotify_lr_model.joblib')
joblib.dump(rf_model, '../models/spotify_rf_model.joblib')

# Also save the scaler and label encoder for reuse
joblib.dump(scaler, '../models/spotify_scaler.joblib')
joblib.dump(le, '../models/spotify_label_encoder.joblib')

print('Models saved to ../models/')
print('  - spotify_lr_model.joblib')
print('  - spotify_rf_model.joblib')
print('  - spotify_scaler.joblib')
print('  - spotify_label_encoder.joblib')

## Summary

| Metric | Logistic Regression | Random Forest |
|--------|-------------------|---------------|
| Accuracy | see output above | see output above |
| Precision (weighted) | see output above | see output above |
| Recall (weighted) | see output above | see output above |
| F1 Score (weighted) | see output above | see output above |

Both models were trained to classify Spotify tracks into popularity categories (Low, Medium, High, Very High). Artist popularity and artist followers are the most predictive features for track popularity classification.