In [1]:
import pandas as pd
import numpy as np

import ast
import torch

from utils.interactive_plot import plot_embeddings_interactive
from utils.kl_divergence import compute_kls
from utils.dimension_reduction import get_tsne_embeddings

# Load the three tables used throughout the notebook (paths are relative to the
# working directory the notebook is run from).
audio = pd.read_csv("data/audio.csv")
lyrics = pd.read_csv("data/lyrics.csv")
full = pd.read_csv("data/full.csv")

# Bucket chart longevity on a square-root scale: bucket k holds tracks with
# k**2 <= wks_on_chart < (k + 1)**2. `np.sqrt` is applied to the whole column at once
# instead of element by element through a Python lambda.
full["sqrt_wks"] = np.sqrt(full["wks_on_chart"])
# `astype(int)` truncates towards zero and raises if the column contains NaN.
full["wks_bucketed"] = full["sqrt_wks"].astype(int)

# Columns copied from `full` onto the audio and lyric tables.
to_merge = ['track_name', 'wks_on_chart', 'wks_bucketed', 'key', 'key_mode']

# Left joins keep every row of `audio` / `lyrics`; tracks without a match in `full`
# get NaN in the merged columns.
# NOTE(review): the join key is `track_name` only — if `full` contains repeated track
# names the left-hand rows will be duplicated; confirm the key is unique in `full`.
audio = audio.merge(full[to_merge], on='track_name', how='left')
lyrics = lyrics.merge(full[to_merge], on='track_name', how='left')

2024-04-09 11:06:11.451410: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
def list_to_tensor(lst):
    """
    Parse a stringified flat list of numbers (e.g. "[0.1, 0.2, 0.3]") into a 1-D tensor.

    Parameters
    ----------
    lst : str or None
        Text such as a CSV cell holding a bracketed, comma-separated list of numbers.
        `None` and float NaN (pandas' marker for a missing cell) are treated as missing.

    Returns
    -------
    torch.Tensor or None
        Tensor built from the parsed floats, or `None` when the input is missing.

    Raises
    ------
    ValueError
        If a non-empty token cannot be converted to float.
    """
    if lst is None:
        return None
    # NaN is the only float that is not equal to itself.
    if isinstance(lst, float) and lst != lst:
        return None

    # Drop outer whitespace first so the bracket strip actually sees the brackets.
    inner = lst.strip().strip("[]")
    # Strip every token before the emptiness test: a trailing comma ("[1, 2, ]") or an
    # empty list ("[]") yields blank / whitespace-only tokens that float() would reject.
    tokens = (token.strip() for token in inner.split(","))
    return torch.tensor([float(token) for token in tokens if token])

def _parse_embedding(value, parser):
    """Apply `parser` to a string cell; keep existing tensors; map anything else to None."""
    if isinstance(value, str):
        return parser(value)
    # Tensors are passed through unchanged so re-running this cell does not fail on
    # values that were already parsed; missing cells (NaN / None) become None.
    return value if torch.is_tensor(value) else None


# Convert the string-serialised latent vectors into tensors. Both columns go through the
# same missing-value handling (previously only the audio column guarded against nulls).
audio['latent_audio'] = audio['latent_audio'].apply(
    lambda x: _parse_embedding(x, list_to_tensor)
)
lyrics['latent_embedding'] = lyrics['latent_embedding'].apply(
    lambda x: _parse_embedding(x, lambda text: torch.tensor(ast.literal_eval(text)))
)

In [3]:
# KL-divergence summary of `wks_on_chart` for the audio table; the helper prints which
# cluster is closest to the overall distribution and a per-cluster ranking (see output).
# NOTE(review): how clusters are assigned and which distributions are compared is defined
# in utils.kl_divergence — confirm there before interpreting the numbers.
compute_kls(audio, "wks_on_chart")

The cluster most similar to the overall distribution is: 0
KL Divergences by cluster:
Cluster 0: 0.2110428290889822
Cluster 6: 0.25427667087705697
Cluster 7: 0.27770737279481517
Cluster 9: 0.2882718861480844
Cluster 12: 0.34802911098494427
Cluster 1: 0.4009614888979761
Cluster 5: 0.44810921873213877
Cluster 4: 0.46431604351138084
Cluster 14: 0.4757839799037803
Cluster 10: 0.5734805843645955
Cluster 8: 0.5971418191334149
Cluster 13: 0.856317033055617
Cluster 11: 1.1325952436656401
Cluster 2: 1.2576571152366296
Cluster 3: 1.702261692267202


In [4]:
# Same KL-divergence summary of `wks_on_chart`, this time for the lyrics table.
# NOTE(review): cluster ids printed here are presumably independent of the ids in the
# audio run — confirm in utils.kl_divergence before comparing them.
compute_kls(lyrics, "wks_on_chart")

The cluster most similar to the overall distribution is: 3
KL Divergences by cluster:
Cluster 3: 0.23379734932819404
Cluster 14: 0.28937617946640015
Cluster 0: 0.3260878037577486
Cluster 10: 0.3322540618378864
Cluster 7: 0.34538143848009495
Cluster 8: 0.36509220740021675
Cluster 1: 0.3816692846274304
Cluster 6: 0.42861129278253285
Cluster 9: 0.44747640680663886
Cluster 5: 0.5096761503616904
Cluster 12: 0.5670417761316591
Cluster 13: 1.1485117979548258
Cluster 2: 1.2639310676053295
Cluster 4: 1.5725192807267954
Cluster 11: 1.801134277872217


In [5]:
# Reduce the parsed latent vectors to plotting coordinates; the plotting code further
# down reads columns 0 and 1 of the result.
# NOTE(review): the warning printed by this cell ("n_jobs value ... overridden to 1 by
# setting random_state") looks like umap-learn's message rather than t-SNE's — confirm
# which reducer utils.dimension_reduction.get_tsne_embeddings actually runs.
audio_embeddings = get_tsne_embeddings(audio, "latent_audio")
lyric_embeddings = get_tsne_embeddings(lyrics, "latent_embedding")

  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")
  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


In [33]:
import matplotlib.pyplot as plt
import plotly.graph_objects as go


def plot_embeddings_interactive_test(df, embeddings, feature, emphasis=None):
    """
    Interactive scatter plot of song embeddings, with one trace per value of `feature`.

    This lets us look for patterns among different buckets of billboard scores. Seeing
    everything overlaid is a bit dizzying, so categories listed in `emphasis` are drawn
    fully opaque while all others are faded to 0.1 opacity. The plot is interactive to
    make encoding EDA less painful: hovering a point shows its `track_name`.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per point; must contain `feature` and 'track_name'.
    embeddings : array-like
        Point coordinates, row-aligned with `df` by position; columns 0 and 1 are plotted.
    feature : str
        Column of `df` used to group and colour the points (values are compared as strings).
    emphasis : iterable or str, optional
        Category value(s) to highlight. Entries are converted with `str()` so that, for
        example, `[5, 6]` and `["5", "6"]` behave the same. Defaults to no emphasis.

    Returns
    -------
    None
        The figure is displayed with `fig.show()`.
    """
    # Build the set of highlighted labels once. A bare string is treated as a single
    # category rather than being iterated character by character.
    if emphasis is None:
        emphasised = set()
    elif isinstance(emphasis, str):
        emphasised = {emphasis}
    else:
        emphasised = {str(item) for item in emphasis}

    # The string form of the feature column is loop-invariant, so compute it once.
    labels = df[feature].astype(str)
    unique = labels.unique()

    # One cividis colour per category, in order of first appearance in `df`.
    colors = plt.cm.cividis(np.linspace(0, 1, len(unique)))
    color_map = {
        category: f'rgb({int(col[0]*255)}, {int(col[1]*255)}, {int(col[2]*255)})'
        for category, col in zip(unique, colors)
    }

    fig = go.Figure()

    # Ensure the embeddings are a NumPy array so they can be indexed with a boolean mask.
    embeddings_2d_np = np.asarray(embeddings)

    # Add one trace per category of `feature`.
    for category in unique:
        # Plain positional boolean array: selects the same rows in `df` and in the
        # embeddings without relying on pandas index alignment.
        mask = (labels == category).to_numpy()
        selected_embeddings = embeddings_2d_np[mask]
        fig.add_trace(go.Scatter(
            x=selected_embeddings[:, 0],
            y=selected_embeddings[:, 1],
            mode='markers',
            marker=dict(
                color=color_map[category],
                opacity=1.0 if category in emphasised else 0.1,
            ),
            name=category,
            text=df.loc[mask, 'track_name'],
        ))

    fig.update_layout(title='Interactive Plot of Song Embeddings',
                      xaxis_title='TSNE 1',
                      yaxis_title='TSNE 2',
                      legend_title_text='Legend',
                      hovermode='closest')

    fig.show()

In [34]:
# Plot the audio embeddings coloured by `wks_bucketed` (the integer part of
# sqrt(wks_on_chart)), highlighting buckets 5-7, i.e. roughly 25-63 weeks on the chart.
# NOTE(review): emphasis entries are matched against the column values cast to str, so a
# float column would render as "5.0" and not match "5" — confirm the dtype of
# `wks_bucketed` after the left merge.
plot_embeddings_interactive_test(audio, audio_embeddings, "wks_bucketed", emphasis=["5", "6", "7"])