tag_clustering.ipynb
- playlist_processing.ipynb를 통해 각 노래마다의 태그들에 대해 K-means 클러스터링을 진행해 같은 분위기의 노래를 군집화한다.

- input: processed_song.json
- output: CLUSTER table in ./flask_api/db.db (work in progress)

In [139]:
import json, sqlite3
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import normalize
from sklearn.cluster import KMeans

In [144]:
class Music_Labeler:
    """Cluster songs by their tags with K-means and store each cluster's top tags.

    Attributes set by the methods of this class:
        df: song DataFrame loaded from the JSON file. The ``tags`` column is
            replaced by ``new_tags`` in ``__init__``; ``clustering_with_kmeans``
            adds a ``label`` column.
        tags: list of the ``most_common_k`` most frequent tags, most frequent
            first.
        cluster_details: dict keyed by cluster number, filled by
            ``clustering_with_kmeans``.
    """

    def __init__(
            self, 
            PATH = './json_data/processed_song.json',
            most_common_k = 100,
        ):
        """Load the song data and keep only the most common tags of each song.

        Args:
            PATH: path of a UTF-8 JSON file that ``pd.json_normalize`` turns
                into a DataFrame with a ``tags`` column of whitespace-separated
                tag strings.
            most_common_k: number of most frequent tags to keep.
        """
        with open(PATH, 'r', encoding="UTF8") as f:
            data = json.load(f)
        self.df = pd.json_normalize(data)

        # Split every song's tag string once and reuse it for counting and filtering.
        tag_lists = [tags.split() for tags in self.df['tags']]

        # Take only the most common k tags over the whole data set.
        tag_counts = Counter(tag for tags in tag_lists for tag in tags)
        self.tags = [tag for tag, _ in tag_counts.most_common(most_common_k)]

        # Drop tags outside the most common k from each song; a set keeps the
        # membership test O(1) instead of scanning the list for every tag.
        allowed = set(self.tags)
        self.df['new_tags'] = [
            ' '.join(tag for tag in tags if tag in allowed)
            for tags in tag_lists
        ]
        self.df.drop('tags', axis=1, inplace=True)

    def run(self):
        """Cluster the songs with the default settings and write the result to the DB."""
        self.clustering_with_kmeans()
        self.result2db()

    def clustering_with_kmeans(self, n_clusters = 50):
        """Cluster songs on the TF-IDF vectors of their filtered tags.

        Adds a ``label`` column (cluster number) to ``self.df`` and fills
        ``self.cluster_details`` with, for every cluster, its number
        (``cluster``), up to 10 tags with the largest cluster-center weights
        (``top_features``) and those weights (``top_features_value``).

        Args:
            n_clusters: number of K-means clusters.
        """
        text = self.df['new_tags'].to_list()
        vectorizer = TfidfVectorizer(min_df = 5)
        X = normalize(vectorizer.fit_transform(text))

        # K-means model with @n_clusters clusters; fixed seed for repeatable labels.
        model = KMeans(n_clusters=n_clusters, random_state=10)
        self.df['label'] = model.fit_predict(X)

        # Feature indices of every cluster center, sorted by descending weight.
        centers = model.cluster_centers_
        center_feature_idx = centers.argsort()[:, ::-1]
        feature_names = vectorizer.get_feature_names_out()

        # Details (top tags) of each cluster.
        self.cluster_details = {}
        for cluster_num in range(n_clusters):
            top_ftr_idx = center_feature_idx[cluster_num, :10]
            self.cluster_details[cluster_num] = {
                'cluster': cluster_num,
                'top_features': [feature_names[idx] for idx in top_ftr_idx],
                'top_features_value': centers[cluster_num, top_ftr_idx].tolist(),
            }

    def result2db(self, DB = './flask_api/db.db'):
        """Replace the contents of the CLUSTER table with the cluster top tags.

        One ``(cluster number, tag)`` row is inserted per top tag in
        ``self.cluster_details``. The delete and the inserts form a single
        transaction: on any SQLite error nothing is committed, so the previous
        table contents are preserved. Errors are reported with ``print`` and
        not raised.

        Args:
            DB: path of the SQLite database file holding the CLUSTER table.
        """
        # Build the rows before touching the DB so malformed details never
        # leave an open connection or a half-finished transaction behind.
        rows = [
            (key, tag)
            for key, details in self.cluster_details.items()
            for tag in details['top_features']
        ]

        try:
            conn = sqlite3.connect(DB)
        except sqlite3.Error:
            print("DB Connection Error!")
            return

        try:
            cur = conn.cursor()
            cur.execute("DELETE FROM CLUSTER")
            cur.executemany("INSERT INTO CLUSTER VALUES (?, ?)", rows)
            conn.commit()
        except sqlite3.Error:
            # Undo the DELETE and any partial inserts instead of committing them.
            conn.rollback()
            print("Failed to execute query")
        finally:
            conn.close()
        

In [None]:
# Build the labeler with its default arguments, then cluster the songs
# and write the result to the database.
labeler = Music_Labeler()
labeler.run()