In [1]:
import pandas as pd
import numpy as np
from scipy.stats import entropy

audio = pd.read_csv("data/audio.csv")
lyrics = pd.read_csv("data/lyrics.csv")
full = pd.read_csv("data/full.csv")
audio = audio.merge(full[['track_name', 'wks_on_chart']], on='track_name', how='left')
lyrics = lyrics.merge(full[['track_name', 'wks_on_chart']], on='track_name', how='left')

In [7]:
def compute_kls(df):
    max_weeks = df['wks_on_chart'].max()
    bins = np.arange(0, max_weeks + 1)
    overall_distribution, _ = np.histogram(df['wks_on_chart'], bins=bins, density=True)
    
    clusters = df['cluster_assignment'].unique()
    kl_divergences = {}

    for cluster in clusters:
        cluster_data = df[df['cluster_assignment'] == cluster]['wks_on_chart']
        cluster_distribution, _ = np.histogram(cluster_data, bins=bins, density=True)
        
        cluster_distribution += 1e-10
        overall_distribution += 1e-10
        
        kl_divergence = entropy(cluster_distribution, overall_distribution)
        kl_divergences[cluster] = kl_divergence

    most_similar_cluster = min(kl_divergences, key=kl_divergences.get)
    print(f"The cluster most similar to the overall distribution is: {most_similar_cluster}")

    print("KL Divergences by cluster:")
    for cluster, divergence in sorted(kl_divergences.items(), key=lambda x: x[1]):
        print(f"Cluster {cluster}: {divergence}")

In [8]:
compute_kls(audio)

The cluster most similar to the overall distribution is: 0
KL Divergences by cluster:
Cluster 0: 0.2110428290889822
Cluster 6: 0.25427667087705697
Cluster 7: 0.27770737279481517
Cluster 9: 0.2882718861480844
Cluster 12: 0.34802911098494427
Cluster 1: 0.4009614888979761
Cluster 5: 0.44810921873213877
Cluster 4: 0.46431604351138084
Cluster 14: 0.4757839799037803
Cluster 10: 0.5734805843645955
Cluster 8: 0.5971418191334149
Cluster 13: 0.856317033055617
Cluster 11: 1.1325952436656401
Cluster 2: 1.2576571152366296
Cluster 3: 1.702261692267202


In [9]:
compute_kls(lyrics)

The cluster most similar to the overall distribution is: 3
KL Divergences by cluster:
Cluster 3: 0.23379734932819404
Cluster 14: 0.28937617946640015
Cluster 0: 0.3260878037577486
Cluster 10: 0.3322540618378864
Cluster 7: 0.34538143848009495
Cluster 8: 0.36509220740021675
Cluster 1: 0.3816692846274304
Cluster 6: 0.42861129278253285
Cluster 9: 0.44747640680663886
Cluster 5: 0.5096761503616904
Cluster 12: 0.5670417761316591
Cluster 13: 1.1485117979548258
Cluster 2: 1.2639310676053295
Cluster 4: 1.5725192807267954
Cluster 11: 1.801134277872217
