In [22]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA

import matplotlib.pyplot as plt
import seaborn as sns

In [23]:
# Load the two raw popularity splits and order their columns alphabetically so
# that both frames share the same column layout.
# NOTE: the '../' prefix behaves differently from the equivalent .py script,
# presumably because the notebook runs from a different working directory.
high_popularity = pd.read_csv('../Data/raw/high_popularity_spotify_data.csv').sort_index(axis=1)
low_popularity = pd.read_csv('../Data/raw/low_popularity_spotify_data.csv').sort_index(axis=1)

# Stack both splits into a single frame with a fresh 0..n-1 index.
# NOTE(review): iloc[1:] discards the first low-popularity row, and the column
# labels are then overwritten with the high-popularity ones -- confirm that
# both steps are intended and that the two files really share a schema.
low_popularity = low_popularity.iloc[1:]
low_popularity.columns = high_popularity.columns
spotify_data = pd.concat([high_popularity, low_popularity], ignore_index=True)

# Genre-indexed view of the full data, with "r&b" relabelled as "RnB".
genre_index = spotify_data.set_index(['playlist_genre']).rename(index={"r&b": "RnB"})

# Numeric columns only, indexed by playlist genre (not by track name).
num_data = spotify_data.select_dtypes(include='number')
spotify_data_numerics = num_data.set_index(genre_index.index)

In [24]:
def perform_pca(df, numeric_columns, n_components=2, random_state=0):
    """Project the selected columns of ``df`` onto principal components.

    The selected columns are mean-imputed, standardised and reduced with PCA
    inside one scikit-learn ``Pipeline`` that is fitted on ``df`` itself.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least the columns listed in ``numeric_columns``.
    numeric_columns : list-like of column labels
        Columns of ``df`` fed into the pipeline.
    n_components : int, default 2
        Number of principal components requested from ``PCA``.
    random_state : int or None, default 0
        Seed forwarded to ``PCA``. It only matters when scikit-learn picks a
        stochastic SVD solver; fixing it keeps the projection reproducible
        across kernel restarts. Pass ``None`` for unseeded behaviour.

    Returns
    -------
    X_pca : numpy.ndarray
        Output of ``Pipeline.fit_transform`` on the selected columns.
    pca_model : sklearn.decomposition.PCA
        The fitted PCA step (exposes e.g. ``explained_variance_ratio_``).
    """
    X = df[numeric_columns]

    pipe = Pipeline([
        # Fill gaps with the column mean so scaling and PCA never see NaN.
        ("imputer", SimpleImputer(strategy="mean")),
        # Standardise features before PCA so no column dominates by scale.
        ("scaler", StandardScaler()),
        # Seeded so the solver choice cannot make results vary between runs.
        ("pca", PCA(n_components=n_components, random_state=random_state)),
    ])

    X_pca = pipe.fit_transform(X)
    pca_model = pipe.named_steps["pca"]

    return X_pca, pca_model

In [None]:
def plot_popularity_vs_pca(df, numeric_columns):
    """Scatter PC1 against PC2, colouring each point by track popularity.

    Runs ``perform_pca`` on ``numeric_columns`` of ``df`` and shows the
    resulting two-component projection. ``df`` must contain a
    ``track_popularity`` column, which drives the hue. The axis labels report
    the share of variance explained by each component. The figure is displayed
    with ``plt.show()`` and nothing is returned.
    """
    components, fitted_pca = perform_pca(df, numeric_columns)
    variance_share = fitted_pca.explained_variance_ratio_

    projection = pd.DataFrame({
        "PC1": components[:, 0],
        "PC2": components[:, 1],
        "popularity": df["track_popularity"].values,
    })

    fig, ax = plt.subplots(figsize=(10, 8))
    # Colour comes from seaborn's hue legend; a separate colorbar was tried
    # previously and is intentionally not drawn here.
    sns.scatterplot(
        data=projection, x="PC1", y="PC2",
        hue="popularity", palette="viridis",
        s=40, alpha=0.7,
        ax=ax,
    )

    ax.set_title("Popularity vs PCA Components (PC1 vs PC2)", fontsize=16, weight="bold")
    ax.set_xlabel(f"PC1 ({variance_share[0]*100:.1f}% variance)")
    ax.set_ylabel(f"PC2 ({variance_share[1]*100:.1f}% variance)")
    fig.tight_layout()
    plt.show()



In [27]:
#plot_popularity_vs_pca(spotify_data, spotify_data_numerics.columns)

In [19]:
def plot_genre_vs_pca(df, numeric_columns):
    """Scatter PC1 against PC2, colouring each point by playlist genre.

    Runs ``perform_pca`` on ``numeric_columns`` of ``df`` and shows the
    two-component projection with one colour per ``playlist_genre`` value.
    ``df`` must contain both ``playlist_genre`` and ``track_popularity``.
    The figure is displayed with ``plt.show()`` and nothing is returned.
    """
    components, fitted_pca = perform_pca(df, numeric_columns)
    variance_share = fitted_pca.explained_variance_ratio_

    projection = pd.DataFrame({
        "PC1": components[:, 0],
        "PC2": components[:, 1],
        "genre": df["playlist_genre"].values,
        # Carried alongside the projection but not used by the plot below.
        "popularity": df["track_popularity"].values,
    })

    fig, ax = plt.subplots(figsize=(12, 8))
    sns.scatterplot(
        data=projection, x="PC1", y="PC2",
        hue="genre", palette="tab20",
        alpha=0.7, s=40,
        ax=ax,
    )

    ax.set_title("PCA Projection Colored by Genre", fontsize=16, weight="bold")
    ax.set_xlabel(f"PC1 ({variance_share[0]*100:.1f}% variance)")
    ax.set_ylabel(f"PC2 ({variance_share[1]*100:.1f}% variance)")
    fig.tight_layout()
    plt.show()


In [21]:
def plot_pc1_vs_popularity(df, numeric_columns):
    """Plot track popularity against the first principal component.

    Runs ``perform_pca`` on ``numeric_columns`` of ``df`` and draws a seaborn
    regression plot of ``df["track_popularity"]`` against PC1. The fitted PCA
    model is not needed here and is discarded. The figure is displayed with
    ``plt.show()`` and nothing is returned.
    """
    components, _ = perform_pca(df, numeric_columns)
    first_component = components[:, 0]

    fig, ax = plt.subplots(figsize=(8, 6))
    # Semi-transparent markers keep dense regions of the scatter readable.
    sns.regplot(
        x=first_component,
        y=df["track_popularity"],
        scatter_kws={"alpha": 0.3},
        ax=ax,
    )

    ax.set_title("Track Popularity vs PC1", fontsize=14, weight="bold")
    ax.set_xlabel("PC1")
    ax.set_ylabel("Popularity")
    fig.tight_layout()
    plt.show()
