tag_clustering.ipynb
- playlist_processing.ipynb를 통해 얻은 각 노래의 태그들을 DBSCAN으로 클러스터링하여 같은 분위기의 노래끼리 군집화한다.

- input: processed_song.json
- output: developing

In [17]:
import json
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import normalize
from sklearn.cluster import DBSCAN

In [2]:
# Load the song JSON file into a flat DataFrame.
def load_music_df(PATH = './json_data/processed_song.json'):
    """Read the JSON file at ``PATH`` and flatten it into a DataFrame.

    The file is opened as UTF-8 text, parsed with ``json.load`` and passed
    to ``pd.json_normalize``, so nested objects become dotted column names.
    """
    with open(PATH, mode='r', encoding='UTF8') as fp:
        records = json.load(fp)
    return pd.json_normalize(records)

# Pick the k most frequent tags across all songs.
def extract_common_tags(df: pd.DataFrame, k=100):
    """Return the ``k`` most frequent tags found in ``df['tags']``.

    Each value of the ``tags`` column is split on whitespace and every
    resulting token is counted once per occurrence.

    Args:
        df: Frame with a ``tags`` column of whitespace-separated tag strings.
        k: Maximum number of tags to return.

    Returns:
        A list of tags ordered from most to least frequent. Tags with equal
        counts keep the order in which they were first encountered.
    """
    counts = Counter()
    # Iterate the column directly: ``iterrows`` would build a Series per row
    # only to read a single field from it.
    for tag_string in df['tags']:
        counts.update(tag_string.split())
    return [tag for tag, _ in counts.most_common(k)]

# Keep only the tags that belong to the given list of common tags.
def only_common_tags(df: pd.DataFrame, tags: list):
    """Replace the ``tags`` column with a filtered ``new_tags`` column.

    Every ``tags`` string is split on whitespace, tokens that are not in
    ``tags`` are discarded, and the survivors are re-joined with single
    spaces (original order kept). Rows with no surviving tag get ``''``.

    Args:
        df: Frame with a ``tags`` column of whitespace-separated tag strings.
            It is modified in place: ``new_tags`` is added, ``tags`` is dropped.
        tags: Tags to keep.

    Returns:
        The same ``df`` object, for call chaining.
    """
    # Build the lookup set once so each membership test is O(1).
    allowed = set(tags)
    # Assign the whole column positionally. This always creates ``new_tags``
    # (even for an empty frame) and stays correct when index labels repeat,
    # unlike a per-row ``df.loc[idx, ...]`` write.
    df['new_tags'] = [
        ' '.join(tag for tag in tag_string.split() if tag in allowed)
        for tag_string in df['tags']
    ]
    df.drop('tags', axis=1, inplace=True)
    return df

# Cluster the filtered tag strings with DBSCAN on TF-IDF features.
def clustering_with_dbscan(df: pd.DataFrame, eps=0.1, min_df=5, ngram_range=(1, 5)):
    """Assign a DBSCAN cluster label to every row based on ``new_tags``.

    The ``new_tags`` strings are vectorized with TF-IDF and clustered with
    DBSCAN using cosine distance. The labels are written to ``df['label']``.

    Args:
        df: Frame with a ``new_tags`` column of whitespace-separated tags.
            It is modified in place by adding the ``label`` column.
        eps: DBSCAN neighborhood radius (cosine distance).
        min_df: ``min_df`` passed to ``TfidfVectorizer``.
        ngram_range: ``ngram_range`` passed to ``TfidfVectorizer``.

    Returns:
        The same ``df`` object with the added ``label`` column.
    """
    documents = df['new_tags'].to_list()
    vectorizer = TfidfVectorizer(min_df=min_df, ngram_range=ngram_range)
    # Keep the TF-IDF matrix sparse: DBSCAN accepts sparse input, and
    # densifying a documents-by-ngrams matrix wastes memory.
    tfidf = vectorizer.fit_transform(documents)

    # DBSCAN clustering model
    model = DBSCAN(eps=eps, metric="cosine")
    df['label'] = model.fit_predict(tfidf)
    return df

In [3]:
# Full pipeline: load songs -> pick the common tags -> filter -> cluster.
df = load_music_df()
tags = extract_common_tags(df)
df = clustering_with_dbscan(only_common_tags(df, tags))

In [11]:
# Rebuild the TF-IDF features at notebook level so that `vectorizer` and
# `vector` are available to the cells below.
text = df['new_tags'].to_list()
vectorizer = TfidfVectorizer(ngram_range=(1, 5), min_df=5)
vector = np.array(vectorizer.fit_transform(text).toarray())

In [18]:
# Cluster the dense TF-IDF vectors with cosine-distance DBSCAN.
model = DBSCAN(metric="cosine", eps=0.1)
result = model.fit_predict(vector)

In [21]:
# Display the label array returned by fit_predict (one entry per row of
# `vector`); DBSCAN uses -1 for samples it treats as noise.
result

array([  0,  -1,   1, ...,  -1,  -1, 119])

In [7]:
def get_cluster_details(cluster_model, cluster_data, feature_names,
                       cluster_num, top_n_features=10,
                       label_col='labels', text_col='tags'):
    """Summarize each cluster by its strongest center features and members.

    Args:
        cluster_model: Fitted model exposing a 2-D ``cluster_centers_`` array
            (one row per cluster, one column per feature).
        cluster_data: DataFrame holding the cluster label and text columns.
        feature_names: Sequence mapping a feature column index to its name.
        cluster_num: Number of clusters to describe; ids ``0`` to
            ``cluster_num - 1`` are reported.
        top_n_features: How many of the highest-weighted features to keep.
        label_col: Column of ``cluster_data`` holding the cluster labels.
        text_col: Column of ``cluster_data`` whose values are listed per
            cluster.

    Returns:
        A dict keyed by cluster id. Each value is a dict with the keys
        ``cluster``, ``top_features``, ``top_featrues_value`` (historical
        spelling, kept for existing callers), ``top_features_value`` (same
        content under the corrected name) and ``filenames``.
    """
    centers = cluster_model.cluster_centers_
    # Feature indices per cluster, sorted by descending center weight.
    ranked_feature_idx = centers.argsort()[:, ::-1]

    cluster_details = {}
    # A separate loop name avoids shadowing the ``cluster_num`` parameter.
    for cluster_id in range(cluster_num):
        top_ftr_idx = ranked_feature_idx[cluster_id, :top_n_features]
        top_ftr_val = centers[cluster_id, top_ftr_idx].tolist()
        members = cluster_data.loc[cluster_data[label_col] == cluster_id, text_col]

        cluster_details[cluster_id] = {
            'cluster': cluster_id,
            'top_features': [feature_names[idx] for idx in top_ftr_idx],
            'top_featrues_value': top_ftr_val,
            # Independent copy so mutating one list does not affect the other.
            'top_features_value': list(top_ftr_val),
            'filenames': members.values.tolist(),
        }

    return cluster_details
# Describe every cluster of the fitted `kmeans` model using the vectorizer's
# feature names (`kmeans` and `n_clusters` are not defined in the cells shown
# here).
feature_names = vectorizer.get_feature_names_out()
details = get_cluster_details(
    cluster_model=kmeans,
    cluster_data=df,
    feature_names=feature_names,
    cluster_num=n_clusters,
)

In [8]:
# Print each cluster id followed by its top-ranked feature names.
for cluster_id, info in details.items():
    print(cluster_id, info['top_features'])

0 ['기분전환', '감성', '인디', '발라드', '아이돌', '힙합', '휴식', '드라이브', '힐링', '겨울']
1 ['새벽', '잔잔한', '감성', '휴식', '새벽감성', '힐링', '인디', '위로', '잠들기전', '몽환']
2 ['운동', '댄스', '아이돌', '스트레스', '드라이브', '기분전환', '신나는', 'kpop', '걸그룹', '케이팝']
3 ['설렘', '사랑', '힐링', '휴식', '까페', '기분전환', '새벽', '잔잔한', '인디', '발라드']
4 ['알앤비', '힙합', '소울', '감성힙합', '감성', '트렌디', 'rnb', '새벽', '드라이브', '그루브']
5 ['회상', '추억', '이별', '슬픔', '발라드', '새벽', '감성', '힐링', '잔잔한', '휴식']
6 ['설렘', '달달', '사랑', '연애', '고백', '데이트', '달달한', '벚꽃', '두근두근', '연인']
7 ['이별', '슬픔', '발라드', '감성', '새벽', '비오는날', '사랑', '잔잔한', '눈물', '가을']
8 ['카페', '감성', '인디', '잔잔한', '휴식', '새벽', '기분전환', '어쿠스틱', '가을', '버스']
9 ['여름', '시원한', '더위', '드라이브', '청량한', '신나는', '여행', '트로피컬', '기분전환', '댄스']


In [9]:
# Print the number of rows labelled with each cluster id from 0 to 9.
for i in range(10):
    print(f"Cluster {i}: {len(df.loc[df['labels'] == i])}")

Cluster 0: 7049
Cluster 1: 1756
Cluster 2: 1681
Cluster 3: 1473
Cluster 4: 1647
Cluster 5: 1854
Cluster 6: 1542
Cluster 7: 2116
Cluster 8: 1872
Cluster 9: 580
