In [None]:
import pandas as pd

In [None]:
# Load the raw track table from the CSV file (path is relative to the working directory)
data = pd.read_csv('spotify2019.csv')

In [None]:
# Preview the first 10 rows of the loaded frame
data.head(10)

In [None]:
# Count the missing values in every column and print the per-column totals
missing_per_column = data.isna().sum()
print(missing_per_column)

In [None]:
# Discard every row that contains at least one missing value
data = data.dropna()

In [None]:
# Flag every row whose 'track_id' occurs more than once; keep=False marks all
# occurrences, so the printed count includes the first appearance of each track.
# (The printed message is Indonesian for "number of duplicates".)
shares_track_id = data.duplicated(subset=['track_id'], keep=False)
duplicates = data.loc[shares_track_id]
print(f"Jumlah duplikat: {len(duplicates)}")

In [None]:
# Drop duplicates by 'track_id', keeping only the first occurrence of each track
data = data.drop_duplicates(subset=['track_id'], keep='first')

In [None]:
# Numerical features: a snapshot of the float64/int64 columns taken before the
# feature transformations in the following cells. This snapshot is what the
# boxplot and the IQR outlier filter below operate on; it is rebuilt from the
# transformed frame right before scaling.
data_numeric = data.select_dtypes(include=['float64', 'int64'])

In [None]:
# Replace the raw tempo with the index of its quantile bin: q=5 builds five
# quantile-based bins and labels=False returns integer bin codes instead of intervals.
data['tempo'] = pd.qcut(data['tempo'], q=5, labels=False) # tempo is binned into 5 quantile bins

In [None]:
# Convert track length from milliseconds to minutes, then rename the column so
# its name matches the new unit.
# NOTE: this cell is meant to run once per kernel session; after the rename
# the 'duration_ms' column no longer exists, so a second run raises KeyError.
MS_PER_MINUTE = 60_000
data['duration_ms'] = data['duration_ms'] / MS_PER_MINUTE  # vectorized division, no per-row Python lambda
data = data.rename(columns={'duration_ms': 'duration_min'})

In [None]:
import numpy as np

# Cyclic encoding of the musical key: each key value is mapped to an angle on
# a 12-step circle and stored as its (sin, cos) coordinates on the unit circle.
key_angle = 2 * np.pi * data['key'] / 12
data['key_sin'] = np.sin(key_angle)
data['key_cos'] = np.cos(key_angle)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Horizontal boxplots of every column in data_numeric on a single axes,
# used to eyeball the spread and outliers of each numeric feature.
fig, ax = plt.subplots(figsize=(15, 6))
sns.boxplot(data=data_numeric, orient='h', ax=ax)
ax.set_title('Boxplot')
plt.show()

In [None]:
# Per-column quartiles and the fences 1.5 * IQR beyond them
Q1 = data_numeric.quantile(0.25)
Q3 = data_numeric.quantile(0.75)
IQR = Q3 - Q1
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR

# A row counts as an outlier when at least one of its numeric values lies
# outside the fences of its column.
below_fence = data_numeric < lower_fence
above_fence = data_numeric > upper_fence
has_outlier = (below_fence | above_fence).any(axis=1)

# Keep only the rows without outliers and renumber the index from 0
data = data.loc[~has_outlier].reset_index(drop=True)
data

In [None]:
# Pairwise correlation between the numeric features of the filtered frame
data_num = data.select_dtypes(include=['float64', 'int64'])
correlation_matrix = data_num.corr()

# Annotated heatmap of the correlation matrix (two decimals per cell)
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f", ax=ax)
ax.set_title('Correlation Matrix')
plt.show()

In [None]:
# Histogram (with KDE overlay) of every numeric feature in data_num.
# The grid height is derived from the number of columns, so adding or removing
# a feature can never index past the last subplot (a fixed 5x3 grid raises
# IndexError as soon as there are more than 15 numeric columns).
n_hist_cols = 3
n_features = len(data_num.columns)
n_hist_rows = max(1, -(-n_features // n_hist_cols))  # ceiling division, at least one row

fig, axes = plt.subplots(
    nrows=n_hist_rows,
    ncols=n_hist_cols,
    figsize=(20, 4 * n_hist_rows),  # 4 inches per row, i.e. 20x20 for a 5-row grid
    squeeze=False,  # always return a 2-D array of axes, even for a single row
)
flat_axes = axes.ravel()

# One feature per subplot, filled row by row
for ax, col in zip(flat_axes, data_num.columns):
    sns.histplot(data=data_num, x=col, kde=True, ax=ax)

# Hide the trailing grid cells that have no feature to show
for ax in flat_axes[n_features:]:
    ax.set_visible(False)

plt.tight_layout()
plt.show()

In [None]:
from sklearn.preprocessing import StandardScaler

# Rebuild the numeric view from the current (filtered, feature-engineered) frame
data_numeric = data.select_dtypes(include=['float64', 'int64'])

# Standardize the numeric columns: fit the scaler on them, then apply it,
# leaving the fitted scaler available as `scaler`.
scaler = StandardScaler()
scaler.fit(data_numeric)
data_scaled = scaler.transform(data_numeric)

In [None]:
from sklearn.mixture import GaussianMixture
from sklearn.decomposition import PCA
import seaborn as sns
import matplotlib.pyplot as plt

# Gaussian mixture clustering on the standardized features:
# 7 components, fixed seed so the assignment is reproducible.
gmm = GaussianMixture(n_components=7, random_state=42)
gmm_labels = gmm.fit_predict(data_scaled)

# Attach the cluster label to a copy of the data; `data` itself is left untouched
data_clustered_gmm = data.assign(cluster=gmm_labels)


In [None]:
# Project the standardized features onto their first two principal components
pca = PCA(n_components=2, random_state=42)
pca_result = pca.fit_transform(data_scaled)
data_clustered_gmm['PCA1'] = pca_result[:, 0]
data_clustered_gmm['PCA2'] = pca_result[:, 1]

# Scatter plot of the 2-D projection, coloured by GMM cluster
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    data=data_clustered_gmm,
    x='PCA1',
    y='PCA2',
    hue='cluster',
    palette='Set2',
    alpha=0.7,
    ax=ax,
)
ax.set_title("GMM Clustering Result")
ax.set_xlabel("PCA 1")
ax.set_ylabel("PCA 2")
ax.legend(title='Cluster')
ax.grid(True)
fig.tight_layout()
plt.show()

In [None]:
# Cluster summary: mean of every numeric feature per cluster, rounded to 2 decimals

summary_columns = data_num.columns
per_cluster_means = data_clustered_gmm.groupby('cluster')[summary_columns].mean()
cluster_summary = per_cluster_means.round(2)
display(cluster_summary)

In [None]:
# Top 100 tracks by popularity in each cluster

# Order by cluster, then by descending popularity, so the first rows of each
# group are its most popular tracks.
ranked_by_cluster = data_clustered_gmm.sort_values(
    by=['cluster', 'popularity'],
    ascending=[True, False],
)
top_100_popular_each_cluster = (
    ranked_by_cluster
    .groupby('cluster')
    .head(100)
    .reset_index(drop=True)
)

# Show the 10 most popular tracks of every cluster, in ascending cluster order
for cluster_id, cluster_rows in top_100_popular_each_cluster.groupby('cluster'):
    print(f"\n=== Cluster {cluster_id} ===")
    display(cluster_rows.head(10))