# Cluster-based Song Recommendation (KMeans + Cosine Similarity)
Bu notebook, `dataset_clean.csv` verisini kullanarak:
1) Audio feature'lar ile KMeans clustering yapar (k seçimi için silhouette)
2) Seçilen bir şarkı için *aynı cluster içinden* cosine similarity ile en benzer şarkıları önerir.

**Not:** Artist bilgisi kullanılmaz; sadece `track_name` üzerinden seçim yapılır.

In [7]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.metrics.pairwise import cosine_similarity

# Location of the cleaned dataset. NOTE(review): this is a machine-specific absolute
# path — point it at your local copy of dataset_clean.csv before running.
DATA_PATH = "/Users/berkebilgin/Desktop/VSCODE/Python/470/dataset_clean.csv"  # set the path here.
# Seed passed as random_state to every KMeans fit in this notebook.
RANDOM_STATE = 42


In [8]:
# 1) Load the dataset
# Reads the CSV at DATA_PATH into `df`, the frame every later cell works on.
df = pd.read_csv(DATA_PATH)

# Quick sanity check: (rows, columns) plus a preview of the first rows.
print("Dataset loaded:", df.shape)
display(df.head())


âœ… Dataset loaded: (71493, 21)


Unnamed: 0.1,Unnamed: 0,track_id,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,...,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre
0,20001,3nqQXoyQOWXiESFLlDF1hG,Sam Smith;Kim Petras,Unholy (feat. Kim Petras),Unholy (feat. Kim Petras),100,156943,False,0.714,0.472,...,-7.375,1,0.0864,0.013,5e-06,0.266,0.238,131.121,4,dance
1,51664,2tTmW7RDtMQtBk7m2rYeSw,Bizarrap;Quevedo,"Quevedo: Bzrp Music Sessions, Vol. 52","Quevedo: Bzrp Music Sessions, Vol. 52",99,198937,False,0.621,0.782,...,-5.548,1,0.044,0.0125,0.033,0.23,0.55,128.033,4,hip-hop
2,81210,4uUG5RXrOk84mYEfFvj3cK,David Guetta;Bebe Rexha,I'm Good (Blue),I'm Good (Blue),98,175238,True,0.561,0.965,...,-3.673,0,0.0343,0.00383,7e-06,0.371,0.304,128.04,4,pop
3,89411,5ww2BF9slyYgNOk37BlC4u,Manuel Turizo,La Bachata,La Bachata,98,162637,False,0.835,0.679,...,-5.329,0,0.0364,0.583,2e-06,0.218,0.85,124.98,4,reggaeton
4,68305,6Sq7ltF9Qa7SNFBsV5Cogx,Bad Bunny;Chencho Corleone,Un Verano Sin Ti,Me Porto Bonito,97,178567,True,0.911,0.712,...,-5.105,0,0.0817,0.0901,2.7e-05,0.0933,0.425,92.005,4,latino


In [9]:
# 2) Audio features used for clustering
audio_features = [
    'danceability', 'energy', 'key', 'loudness', 'mode',
    'speechiness', 'acousticness', 'instrumentalness',
    'liveness', 'valence', 'tempo'
]

# Fail fast if any expected feature column is absent from df.
missing = [c for c in audio_features if c not in df.columns]
if missing:
    # Message text repaired from mis-encoded (mojibake) characters.
    raise ValueError(f"Bu feature kolonları df içinde yok: {missing}")

# Build the feature matrix as a copy so the fill below never touches df itself.
X = df[audio_features].copy()

# Fill any NaN with the median of its own column.
X = X.fillna(X.median(numeric_only=True))

# Standardise the features; the fitted `scaler` is reused later by
# recommend_similar_songs, so it must stay in the notebook namespace.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


In [10]:
# 3) Choose k with the silhouette score: try k = 2..10
k_range = range(2, 11)
sil_scores = []

for k in k_range:
    km = KMeans(n_clusters=k, random_state=RANDOM_STATE, n_init=10)
    labels = km.fit_predict(X_scaled)
    # The score is computed on every row of X_scaled; silhouette_score's
    # `sample_size` parameter can subsample if this loop becomes too slow.
    sil = silhouette_score(X_scaled, labels)
    # Store plain Python floats so later printing does not depend on NumPy's scalar repr.
    sil_scores.append(float(sil))

# The highest silhouette wins; np.argmax returns the first maximum, so ties go to the smaller k.
optimal_k = k_range[int(np.argmax(sil_scores))]
print("🎯 Optimal k (silhouette):", optimal_k)
# Built from Python floats: under NumPy >= 2 a dict of np.float64 values would
# print as {2: np.float64(0.2094), ...} instead of {2: 0.2094, ...}.
print("Silhouette scores:", {k_val: round(score, 4) for k_val, score in zip(k_range, sil_scores)})


ðŸŽ¯ Optimal k (silhouette): 2
Silhouette scores: {2: 0.2094, 3: 0.14, 4: 0.136, 5: 0.1352, 6: 0.1361, 7: 0.1584, 8: 0.1481, 9: 0.1356, 10: 0.1328}


In [16]:
# 4) Final KMeans model + cluster labels
# Refit with the selected k; note this uses n_init=20, whereas the k search used n_init=10.
kmeans = KMeans(n_clusters=optimal_k, random_state=RANDOM_STATE, n_init=20)
# Writes the labels into df in place as a new "cluster" column
# (the recommender reads this column by default).
df["cluster"] = kmeans.fit_predict(X_scaled)

# Preview the first few cluster assignments.
df[["track_name", "cluster"]].head()


Unnamed: 0,track_name,cluster
0,Unholy (feat. Kim Petras),1
1,"Quevedo: Bzrp Music Sessions, Vol. 52",1
2,I'm Good (Blue),1
3,La Bachata,1
4,Me Porto Bonito,1


In [12]:
# 5) Similar-song recommender (same-cluster candidates ranked by cosine similarity)
def recommend_similar_songs(
    df: pd.DataFrame,
    song_name: str,
    top_n: int = 5,
    feature_cols=audio_features,
    cluster_col: str = "cluster",
    track_col: str = "track_name",
    min_cluster_size_for_filter: int = 3,
    fallback_global: bool = True
):
    """Recommend the songs most similar to ``song_name``.

    The target is the first row whose ``track_col`` equals ``song_name`` exactly.
    Candidates are the rows in the target's cluster; they are scaled with the
    notebook-level ``scaler`` and ranked by cosine similarity to the target.

    This function reads the module-level ``scaler`` (already fitted) and
    ``cosine_similarity``; both must exist in the namespace when it is called.

    Args:
        df: Frame holding the track names, the cluster labels and the feature columns.
        song_name: Exact track name to look up.
        top_n: Maximum number of recommendations to return.
        feature_cols: Feature columns fed to ``scaler``. Assumes they match the
            columns the scaler was fitted on — TODO confirm when overriding the default.
        cluster_col: Column holding the cluster label.
        track_col: Column holding the track name.
        min_cluster_size_for_filter: If the target's cluster has fewer rows than
            this, the cluster filter is dropped (when ``fallback_global`` is true).
        fallback_global: Whether to fall back to the whole frame for tiny clusters.

    Returns:
        A DataFrame with columns ``[track_col, cluster_col, "similarity"]``,
        sorted by descending similarity, at most ``top_n`` rows, with a fresh
        0..n-1 index.

    Raises:
        ValueError: If no row matches ``song_name`` exactly. The message lists up
            to ten case-insensitive substring matches as suggestions.
    """
    # A list is required for column selection; a tuple would be read as a single key.
    feature_cols = list(feature_cols)

    # Target song: first exact match.
    target_rows = df[df[track_col] == song_name]
    if target_rows.empty:
        # regex=False: titles routinely contain regex metacharacters such as
        # "(" or "?", which must be matched literally rather than compiled.
        name_hits = df[track_col].astype(str).str.contains(
            str(song_name), case=False, na=False, regex=False
        )
        suggestions = df.loc[name_hits, track_col].head(10).tolist()
        # Message text repaired from mis-encoded (mojibake) characters.
        raise ValueError(f"Şarkı bulunamadı: {song_name}. Yakın eşleşmeler: {suggestions}")

    target = target_rows.iloc[[0]]
    target_cluster = int(target[cluster_col].values[0])

    # Candidate pool: rows in the same cluster as the target.
    cand = df[df[cluster_col] == target_cluster]

    # Fall back to every row when the cluster is too small to be useful.
    if len(cand) < min_cluster_size_for_filter and fallback_global:
        cand = df

    # NaNs are filled with medians taken over the whole frame, not just the candidates.
    medians = df[feature_cols].median(numeric_only=True)

    def _scale(frame: pd.DataFrame):
        """Fill NaNs, cast to float and apply the notebook-level scaler."""
        # DataFrames (not bare arrays) keep the column names attached, which avoids
        # scikit-learn's feature-name warning when the scaler was fitted on a named frame.
        return scaler.transform(frame[feature_cols].fillna(medians).astype(float))

    # Scale the candidate rows themselves rather than indexing a scaled copy of the
    # whole frame by index label: labels only equal positions on a default RangeIndex.
    target_vec = _scale(target)
    X_cand = _scale(cand)

    sims = cosine_similarity(target_vec, X_cand)[0]
    # assign() returns a new frame, so the caller's df is never modified.
    cand = cand.assign(similarity=sims)

    # Drop the target itself; this removes every row that shares its track name.
    target_name = target.iloc[0][track_col]
    cand = cand[cand[track_col] != target_name]

    # Keep the top_n most similar candidates.
    out = cand.sort_values("similarity", ascending=False).head(top_n)

    return out[[track_col, cluster_col, "similarity"]].reset_index(drop=True)



In [13]:
# 6) Usage example
# Note: you must supply a track_name that exists verbatim in the dataset.
# E.g. df["track_name"].value_counts().head(10) shows the most frequent track names.

# Use the first row's track name as the demo input.
example_song = df["track_name"].iloc[0]
example_song


'Unholy (feat. Kim Petras)'

In [14]:
# Top 5 recommendations for the example song (the result is displayed as the cell output).
recommend_similar_songs(df, song_name=example_song, top_n=5)




Unnamed: 0,track_name,cluster,similarity
0,Shower,1,0.951394
1,Duydum Ki,1,0.945811
2,Super Gremlin Freestyle,1,0.931188
3,Moving On,1,0.927364
4,Mary Had a Lil' lamb (Trap Remix),1,0.922406
