music_clustering.ipynb

- song_meta, train.json의 음악, 플레이리스트 정보를 사용해 각 음악의 태그에 대해 k-means 클러스터링 수행
- 각 음악별 가장 많이 등장하는 10개의 태그를 저장
- 모든 음악의 태그 중 @most_common_k개 만큼의 태그만 사용해 클러스터링 수행
- @n_clusters개의 클러스터로 k-means 클러스터링 수행
- 각 클러스터 별 상위 10개의 태그는 DB의 CLUSTER 테이블에, 클러스터 라벨이 부여된 음악 정보는 MUSIC 테이블에 저장

In [1]:
import json, sqlite3
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import normalize
from sklearn.cluster import KMeans

In [4]:
class Genre_Filter:
    """Filter the song catalogue and attach playlist-derived tags to each song.

    The constructor loads two JSON files into DataFrames; ``run`` narrows the
    songs by genre, release date, artist, album name and song name, then
    collects the tags of every playlist that contains each remaining song.
    """

    # Genres a song must have at least one of. The original author's note
    # lists them as: ballad, dance, R&B, indie, POP, idol.
    _SELECT_GENRES = ('GN0100', 'GN0200', 'GN0400', 'GN0500', 'GN0900', 'GN2500')

    # Genres that disqualify a song. The original author's note lists (among
    # others): rock/metal, adult contemporary, folk/blues, POP, electronica,
    # rap/hip-hop, R&B/soul, folk/blues/country.
    _EXCLUDED_GENRES = (
        'GN0600', 'GN0700', 'GN0800', 'GN1000', 'GN1100', 'GN1200', 'GN1300', 'GN1400', 'GN1500',
        'GN1600', 'GN1700', 'GN1800', 'GN1900', 'GN2000', 'GN2100', 'GN2200', 'GN2300', 'GN2400',
        'GN2600', 'GN2700', 'GN2800', 'GN2900', 'GN3000',
    )

    # Detailed genres that disqualify a song, e.g. ballads from the 80s-90s.
    _EXCLUDED_DETAIL_GENRES = (
        'GN0102', 'GN0103', 'GN0104', 'GN0202', 'GN0203', 'GN0204', 'GN0504', 'GN0507', 'GN0508',
    )

    # Songs must be issued strictly after this date (compared as 'YYYYMMDD' strings).
    _MIN_ISSUE_DATE = '20000101'

    _MAX_TAGS = 10       # tags remembered per song while scanning playlists
    _MIN_TAGS = 10       # a song needs at least this many tags to be kept
    _MIN_PLAYLISTS = 10  # ...and must appear in at least this many playlists

    def __init__(self,
                 song_meta_path = './json_data/song_meta.json',
                 playlist_path = './json_data/train.json'):
        """Load the song metadata and the playlist dataset.

        Args:
            song_meta_path: JSON file with one record per song.
            playlist_path: JSON file with one record per playlist.
        """
        self.song_df = self._load_frame(song_meta_path)
        self.playlist_df = self._load_frame(playlist_path)

    @staticmethod
    def _load_frame(path):
        """Read a UTF-8 JSON file and flatten it into a DataFrame."""
        with open(path, 'r', encoding='UTF8') as f:
            return pd.json_normalize(json.load(f))

    def run(self):
        """Filter the songs, tag them from the playlists and return the result.

        Returns:
            DataFrame with columns ``id``, ``song_name``, ``artist_name_basket``,
            ``tags`` (space-joined tag names) and ``count`` (number of playlist
            entries containing the song), sorted by ``count`` descending. Only
            songs with at least 10 tags and a ``count`` of at least 10 are kept.
            The same frame is stored in ``self.song_df``.
        """
        songs = self._filter_songs(self.song_df)
        # reset_index returns a fresh frame, so the column assignments below
        # never write into a slice of the original catalogue.
        songs = songs[['id', 'song_name', 'artist_name_basket']].reset_index(drop=True)

        tag_counts, play_counts = self._collect_tags(songs['id'].tolist(), self.playlist_df)
        songs['tags'] = [' '.join(counts) for counts in tag_counts]
        songs['count'] = pd.Series(play_counts, index=songs.index, dtype='int64')

        has_enough_tags = pd.Series(
            [len(counts) >= self._MIN_TAGS for counts in tag_counts],
            index=songs.index, dtype=bool,
        )
        songs = songs[has_enough_tags & (songs['count'] >= self._MIN_PLAYLISTS)]

        self.song_df = songs.sort_values(by='count', ascending=False)
        return self.song_df

    @classmethod
    def _filter_songs(cls, songs):
        """Return the rows of *songs* that pass every genre/date/name filter."""
        songs = songs[cls._has_any(songs['song_gn_gnr_basket'], cls._SELECT_GENRES)]
        songs = songs[~cls._has_any(songs['song_gn_gnr_basket'], cls._EXCLUDED_GENRES)]
        songs = songs[~cls._has_any(songs['song_gn_dtl_gnr_basket'], cls._EXCLUDED_DETAIL_GENRES)]

        # Drop outdated music.
        songs = songs[songs['issue_date'] > cls._MIN_ISSUE_DATE]

        # Drop other outliers: 'Various Artists' entries and albums whose name
        # matches one of the listed patterns.
        single_artist = pd.Series(
            ['Various Artists' not in artists for artists in songs['artist_name_basket']],
            index=songs.index, dtype=bool,
        )
        songs = songs[single_artist]
        songs = songs[~songs['album_name'].str.contains('70|80|추억의|베스트|Live|모음|컬렉션', na=False)]
        # NOTE(review): the dots are regex wildcards, so this also drops names
        # such as "Very ..." or "Verse ..."; kept as-is to preserve the
        # existing selection - confirm whether literal "Inst."/"Ver." was meant.
        songs = songs[~songs['song_name'].str.contains('Inst.|Ver.', na=False)]
        return songs

    @staticmethod
    def _has_any(baskets, codes):
        """Boolean mask: True where a basket contains at least one of *codes*."""
        return pd.Series(
            [any(code in basket for code in codes) for basket in baskets],
            index=baskets.index, dtype=bool,
        )

    @classmethod
    def _collect_tags(cls, song_ids, playlists):
        """Accumulate playlist tags for every song in *song_ids*.

        Returns two lists aligned with *song_ids*: a tag -> occurrence-count
        dict per song, and the number of playlist entries containing the song.
        """
        # One dict lookup per playlist entry instead of scanning the whole id
        # column each time. Song ids are assumed to be unique.
        row_of = {song_id: pos for pos, song_id in enumerate(song_ids)}
        tag_counts = [{} for _ in song_ids]
        play_counts = [0] * len(song_ids)
        if playlists.empty:
            return tag_counts, play_counts

        for song_list, tags in zip(playlists['songs'], playlists['tags']):
            for song in song_list:
                pos = row_of.get(song)
                if pos is None:
                    continue
                play_counts[pos] += 1

                counts = tag_counts[pos]
                for tag in tags:
                    counts[tag] = counts.get(tag, 0) + 1

                # NOTE(review): trimming after every playlist forgets the
                # counts of dropped tags, so the survivors are not necessarily
                # the globally most frequent ones. Kept to preserve the
                # existing output.
                if len(counts) > cls._MAX_TAGS:
                    ranked = sorted(counts.items(), key=lambda item: -item[1])
                    tag_counts[pos] = dict(ranked[:cls._MAX_TAGS])
        return tag_counts, play_counts
        

In [10]:
class Music_Labeler:
    """Cluster songs by their tags with k-means and store the result in SQLite.

    Only the ``most_common_k`` most frequent tags across all songs are kept;
    each song's remaining tags are vectorised with TF-IDF and clustered into
    ``n_clusters`` groups.
    """

    def __init__(
            self, 
            df,
            n_clusters = 20, 
            most_common_k = 100,
        ):
        """Prepare the song frame and keep only the most common tags.

        Args:
            df: DataFrame with a ``tags`` column of space-separated tag names.
                It is copied, so the caller's frame is left untouched.
            n_clusters: number of k-means clusters.
            most_common_k: number of most frequent tags to keep.
        """
        self.df = df.copy()
        self.n_clusters = n_clusters

        # Take only the most common k tags over all songs.
        tag_lists = [tags.split() for tags in self.df['tags']]
        tag_counter = Counter(tag for tags in tag_lists for tag in tags)
        self.tags = [tag for tag, _ in tag_counter.most_common(most_common_k)]

        # Per song, drop every tag that is not among the most common k.
        # Membership is tested against a set to avoid scanning the list per tag.
        allowed = set(self.tags)
        self.df['new_tags'] = [
            ' '.join(tag for tag in tags if tag in allowed) for tags in tag_lists
        ]
        self.df.drop('tags', axis=1, inplace=True)

    def run(self, DB = '../flask_api/db.db'):
        """Cluster the songs, write the result to *DB* and return it.

        Returns:
            The labelled DataFrame, or None when writing to the database failed.
        """
        self.clustering_with_kmeans()
        return self.result2db(DB)

    def clustering_with_kmeans(self):
        """Assign a cluster label to every song and record each cluster's top tags.

        Adds a ``label`` column to ``self.df`` and fills ``self.cluster_details``
        with, per cluster, its number, its 10 highest-weighted features and
        their centre values.
        """
        n_clusters = self.n_clusters

        vectorizer = TfidfVectorizer(min_df = 5)
        X = normalize(vectorizer.fit_transform(self.df['new_tags'].to_list()))

        # K-means model with @n_clusters clusters; fixed seed for repeatable labels.
        model = KMeans(n_clusters=n_clusters, random_state=10)
        self.df['label'] = model.fit_predict(X)

        # Feature indices of every cluster centre, highest weight first.
        centers = model.cluster_centers_
        ranked_features = centers.argsort()[:, ::-1]
        feature_names = vectorizer.get_feature_names_out()

        self.cluster_details = {}
        for cluster_num in range(n_clusters):
            top_ftr_idx = ranked_features[cluster_num, :10]
            self.cluster_details[cluster_num] = {
                'cluster': cluster_num,
                'top_features': [feature_names[idx] for idx in top_ftr_idx],
                'top_features_value': centers[cluster_num, top_ftr_idx].tolist(),
            }

    def result2db(self, DB = '../flask_api/db.db'):
        """Replace the contents of the CLUSTER and MUSIC tables with the result.

        Both tables are rewritten in a single transaction: if any statement
        fails nothing is committed, so the database never holds a half-written
        result. The connection is always closed.

        Args:
            DB: path of the SQLite database file.

        Returns:
            ``self.df`` on success, None when connecting or a query failed
            (a message is printed in that case).
        """
        cluster_rows = [
            (cluster_num, tag)
            for cluster_num, details in self.cluster_details.items()
            for tag in details['top_features']
        ]
        # tolist() yields plain Python scalars, which sqlite3 can bind directly.
        music_rows = [
            (song_id, song_name, ' '.join(artists), new_tags, count, label)
            for song_id, song_name, artists, new_tags, count, label in zip(
                self.df['id'].tolist(),
                self.df['song_name'].tolist(),
                self.df['artist_name_basket'].tolist(),
                self.df['new_tags'].tolist(),
                self.df['count'].tolist(),
                self.df['label'].tolist(),
            )
        ]

        try:
            conn = sqlite3.connect(DB)
        except sqlite3.Error:
            print("DB Connection Error!")
            return None

        try:
            cur = conn.cursor()
            cur.execute("DELETE FROM CLUSTER")
            cur.executemany("INSERT INTO CLUSTER VALUES (?, ?)", cluster_rows)
            cur.execute("DELETE FROM MUSIC")
            cur.executemany("INSERT INTO MUSIC VALUES (?, ?, ?, ?, ?, ?)", music_rows)
            conn.commit()
        except sqlite3.Error:
            conn.rollback()
            print("Failed to execute query")
            return None
        finally:
            conn.close()

        return self.df

In [5]:
# Build the filtered, tagged song table. The instance is not called `filter`
# so the builtin of that name stays usable in later cells.
genre_filter = Genre_Filter()
df = genre_filter.run()

In [None]:
import getpass
import os

import pymysql

# Connect to the MySQL database. The password is read from the environment
# (or prompted for) so that it is not stored in the notebook; host and user
# keep their previous values as defaults and can be overridden the same way.
conn = pymysql.connect(
    host = os.environ.get('FLASK_DB_HOST', 'flask-db.cuqw33e66jfm.ap-northeast-2.rds.amazonaws.com'),
    user = os.environ.get('FLASK_DB_USER', 'admin'),
    password = os.environ.get('FLASK_DB_PASSWORD') or getpass.getpass('MySQL password: '),
    db = 'recommendation',
    charset = 'utf8',
    port = 3306
)
cur = conn.cursor()

In [12]:
# Cluster the songs into 10 groups and write the result to the database.
labeler = Music_Labeler(df, n_clusters = 10)
labeler.run()
# Take the labelled frame from the labeler itself rather than from run()'s
# return value, so `df` is never rebound to None.
df = labeler.df

In [10]:
# Open the SQLite file at the same path Music_Labeler.result2db writes to by
# default. This rebinds `conn` and `cur`, replacing any earlier connection
# held under those names.
conn = sqlite3.connect('../flask_api/db.db')
# Cursor used by the following cell to query the CLUSTER table.
cur = conn.cursor()

In [17]:
# Collect the stored tags for cluster labels 0 through 9; result[i] holds the
# rows fetched for label i.
result = []
for i in range(10):
    tag_rows = cur.execute("SELECT tag FROM CLUSTER WHERE label = ?", [i]).fetchall()
    result.append(tag_rows)