1. Sandhika Surya Ardianto (5025211022)
2. Muhammad Zikri Ramadhan (5025211085)

In [None]:
import pandas as pd

In [None]:
# Load the raw track table; the path is relative to the kernel's working directory.
data = pd.read_csv(filepath_or_buffer='spotify2019.csv')

In [None]:
# Preview the first ten rows as a quick sanity check of the load.
data.head(n=10)

In [None]:
# Count missing values per column (isna is the canonical spelling of isnull).
missing_per_column = data.isna().sum()
print(missing_per_column)

In [None]:
# Discard every row that contains at least one missing value.
# The result is rebound to `data` instead of using inplace=True: in-place
# mutation brings no speed benefit and makes the frame's state depend on how
# many times (and in which order) cells were executed.
data = data.dropna()

In [None]:
# keep=False flags every row whose track_id occurs more than once,
# i.e. all members of a duplicated group, not only the later repeats.
shares_track_id = data.duplicated(subset=['track_id'], keep=False)
duplicates = data.loc[shares_track_id]
print(f"Jumlah duplikat: {len(duplicates)}")

In [None]:
# Drop duplicates by 'track_id': the first row seen for each id is kept.
is_repeat = data.duplicated(subset=['track_id'], keep='first')
data = data[~is_repeat]

In [None]:
# Numerical features only (float64 / int64 columns).
# This is a snapshot of `data` at this point in the notebook; the boxplot and
# the IQR outlier filter further down both read it.
numeric_dtypes = ['float64', 'int64']
data_numeric = data.select_dtypes(include=numeric_dtypes)

In [None]:
# Bin tempo into 5 quantile-based bins; labels=False stores the integer bin
# index in place of the raw tempo value.
tempo_bin = pd.qcut(data['tempo'], q=5, labels=False)
data['tempo'] = tempo_bin

In [None]:
# Express track length in minutes instead of milliseconds and rename the
# column to match. Plain Series division is vectorised, so no per-row lambda
# is needed.
# The membership check makes the cell safe to re-run: once the column has been
# renamed there is nothing left to convert, where the unguarded version would
# raise KeyError on 'duration_ms'.
MS_PER_MINUTE = 60000
if 'duration_ms' in data.columns:
    data['duration_ms'] = data['duration_ms'] / MS_PER_MINUTE
    data = data.rename(columns={'duration_ms': 'duration_min'})

In [None]:
import numpy as np

# Map the musical key onto the unit circle so the encoding is cyclic:
# key -> angle -> (sin, cos) coordinates.
# NOTE(review): the divisor 12 assumes `key` takes 12 distinct values (0-11) - confirm against the dataset.
key_angle = 2 * np.pi * data['key'] / 12
data['key_sin'] = np.sin(key_angle)
data['key_cos'] = np.cos(key_angle)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Horizontal boxplots of every column in the numeric snapshot, drawn on an
# explicitly created axes.
fig, ax = plt.subplots(figsize=(15, 6))
sns.boxplot(data=data_numeric, orient='h', ax=ax)
ax.set_title('Boxplot')
plt.show()

In [None]:
# Per-column quartiles and inter-quartile range of the numeric snapshot.
# NOTE(review): data_numeric was captured before tempo was binned and the
# duration column was converted, so the fences below are computed on those
# earlier values - confirm this is intended.
Q1 = data_numeric.quantile(0.25)
Q3 = data_numeric.quantile(0.75)
IQR = Q3 - Q1
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR

# A row is removed when at least one of its numeric values lies outside the
# 1.5 * IQR fences; the index is renumbered from 0 afterwards.
is_outlier = (data_numeric.lt(lower_fence) | data_numeric.gt(upper_fence)).any(axis=1)
data = data[~is_outlier].reset_index(drop=True)
data

In [None]:
# Pairwise correlation between the numeric features of the filtered data.
data_num = data.select_dtypes(include=['float64', 'int64'])
correlation_matrix = data_num.corr()

# Annotated heatmap, coefficients shown with two decimals.
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f", ax=ax)
ax.set_title('Correlation Matrix')
plt.show()

In [None]:
# Histogram (with KDE overlay) of every numeric feature, three panels per row.
# The grid is sized from the number of columns: a fixed 5x3 grid only has room
# for 15 features and raises IndexError as soon as data_num has more.
hist_cols = 3
n_features = len(data_num.columns)
hist_rows = max(1, -(-n_features // hist_cols))  # ceiling division, at least one row

fig, axes = plt.subplots(
    nrows=hist_rows,
    ncols=hist_cols,
    figsize=(20, 4 * hist_rows),  # 4 inches per row, i.e. 20x20 for five rows
    squeeze=False,  # always a 2-D array, even when there is a single row
)
panels = axes.ravel()  # row-major order: panel i sits at [i // 3, i % 3]

for panel, col in zip(panels, data_num.columns):
    sns.histplot(data=data_num, x=col, kde=True, ax=panel)

# Hide trailing panels that have no feature to show.
for panel in panels[n_features:]:
    panel.set_visible(False)

plt.tight_layout()
plt.show()

In [None]:
from sklearn.preprocessing import StandardScaler

# Re-select the numeric columns from the current `data` (after binning, unit
# conversion, key encoding and outlier removal) and standardise them.
data_numeric = data.select_dtypes(include=['float64', 'int64'])

scaler = StandardScaler()
scaler.fit(data_numeric)
data_scaled = scaler.transform(data_numeric)

In [None]:
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

# Elbow method: fit KMeans for k = 1..10 and record the within-cluster sum of
# squares (the model's inertia_) for each k.
cluster_range = range(1, 11)
wcss = []
for k in cluster_range:
    kmeans = KMeans(n_clusters=k, random_state=42).fit(data_scaled)
    wcss.append(kmeans.inertia_)

# WCSS against k; the bend in this curve suggests a reasonable cluster count.
fig, ax = plt.subplots()
ax.plot(cluster_range, wcss, marker='o')
ax.set_title('Elbow Method')
ax.set_xlabel('Number of clusters')
ax.set_ylabel('WCSS')
ax.grid()
plt.show()

In [None]:
# Final model with 7 clusters.
# NOTE(review): 7 is presumably read off the elbow plot - confirm.
kmeans = KMeans(n_clusters=7, random_state=42).fit(data_scaled)
cluster_labels = kmeans.labels_  # one label per row of data_scaled

In [None]:
# Copy of the cleaned data with the cluster label of each song appended;
# `data` itself is left unchanged.
data_clustered = data.assign(cluster=cluster_labels)

In [None]:
from sklearn.decomposition import PCA
import seaborn as sns

# Project the scaled features onto their first two principal components so
# the clusters can be drawn in 2-D.
pca = PCA(n_components=2, random_state=42)
pca_result = pca.fit_transform(data_scaled)
first_component, second_component = pca_result.T
data_clustered['PCA1'] = first_component
data_clustered['PCA2'] = second_component

# Scatter of the songs in PCA space, coloured by cluster.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    data=data_clustered,
    x='PCA1', y='PCA2',
    hue='cluster',
    palette='Set2',
    legend='full',
    alpha=0.7,
    ax=ax,
)
ax.set_title('Clustering Visualization (PCA)')
ax.set_xlabel('PCA 1')
ax.set_ylabel('PCA 2')
ax.grid(True)
plt.show()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Number of songs assigned to each cluster, ordered by cluster id.
cluster_counts = data_clustered['cluster'].value_counts().sort_index()

# Bar chart of the cluster sizes.
fig, ax = plt.subplots(figsize=(8, 5))
sns.barplot(x=cluster_counts.index, y=cluster_counts.values, palette='Set2', ax=ax)

# Write the count slightly above each bar (text is placed at x = 0, 1, 2, ...).
for bar_position, n_songs in enumerate(cluster_counts.values):
    ax.text(bar_position, n_songs + 5, str(n_songs), ha='center', va='bottom')

ax.set_xlabel('Cluster')
ax.set_ylabel('Count')
ax.set_title('Song count per Clusters')
ax.grid(True, axis='y')
fig.tight_layout()
plt.show()


In [None]:
import numpy as np
import pandas as pd

from sklearn.metrics import pairwise_distances_argmin_min

# Centroids of the fitted KMeans model, one row per cluster.
centroids = kmeans.cluster_centers_

# Euclidean distance from every song to the centroid of its own cluster.
# Indexing the centroid array with the label column yields one centroid row
# per song, lined up positionally with data_scaled. The whole computation is
# a single vectorised call and, unlike a per-row `.loc[i, 'cluster']` lookup,
# does not rely on data_clustered carrying a 0..n-1 index.
own_centroids = centroids[data_clustered['cluster'].to_numpy()]
distances = np.linalg.norm(data_scaled - own_centroids, axis=1)

data_clustered['distance_to_centroid'] = distances


In [None]:
# For each cluster keep only the 100 songs closest to its centroid:
# sort by (cluster, distance) and take the first 100 rows of every group.
by_cluster_then_distance = data_clustered.sort_values(by=['cluster', 'distance_to_centroid'])
top_100_nearest = (
    by_cluster_then_distance
    .groupby('cluster')
    .head(100)
    .reset_index(drop=True)
)

# Show the selected songs in PCA space, coloured by cluster.
ax = sns.scatterplot(
    data=top_100_nearest,
    x='PCA1', y='PCA2',
    hue='cluster',
    palette='Set2',
    alpha=0.8,
)
ax.set_title("Top 100 songs in each cluster")
ax.grid(True)
plt.show()

In [None]:
# Show the first ten rows of every cluster, in ascending cluster order
# (groupby iterates its keys sorted by default).
for c, cluster_rows in data_clustered.groupby('cluster'):
    print(f"\n=== Cluster {c} ===")
    display(cluster_rows.head(10))


In [None]:
# Cluster summary: mean of every numeric feature per cluster, rounded to two
# decimals. The feature list is the column set of data_num.
feature_columns = data_num.columns
features_by_cluster = data_clustered.groupby('cluster')[feature_columns]
cluster_summary = features_by_cluster.mean().round(2)
display(cluster_summary)

In [None]:
# Top 100 songs by popularity in each cluster: sort clusters ascending and
# popularity descending, then keep the first 100 rows of every group.
popularity_ranked = data_clustered.sort_values(
    by=['cluster', 'popularity'],
    ascending=[True, False],
)
top_100_popular_each_cluster = (
    popularity_ranked
    .groupby('cluster')
    .head(100)
    .reset_index(drop=True)
)

# Display the ten most popular songs of every cluster, in ascending cluster order.
for c, cluster_rows in top_100_popular_each_cluster.groupby('cluster'):
    print(f"\n=== Cluster {c} ===")
    display(cluster_rows.head(10))