genre_filtering.ipynb
- 'song_meta.json'의 약 700,000 음악의 메타 데이터에서 발라드, R&B, 아이돌, 10년도 이후 노래로 조건을 정해 80,000 개로 필터링

- input: song_meta.json
- output: cleaned_song_meta.json

In [15]:
# Load original 'song_meta.json' to @df
import json
import pandas as pd

with open('./json_data/song_meta.json', 'r', encoding='UTF8') as f:
    data = json.load(f)

df = pd.json_normalize(data)

In [16]:
df

Unnamed: 0,song_gn_dtl_gnr_basket,issue_date,album_name,album_id,artist_id_basket,song_name,song_gn_gnr_basket,artist_name_basket,id
0,[GN0901],20140512,불후의 명곡 - 7080 추억의 얄개시대 팝송베스트,2255639,[2727],Feelings,[GN0900],[Various Artists],0
1,"[GN1601, GN1606]",20080421,"Bach : Partitas Nos. 2, 3 & 4",376431,[29966],"Bach : Partita No. 4 In D Major, BWV 828 - II....",[GN1600],[Murray Perahia],1
2,[GN0901],20180518,Hit,4698747,[3361],Solsbury Hill (Remastered 2002),[GN0900],[Peter Gabriel],2
3,"[GN1102, GN1101]",20151016,Feeling Right (Everything Is Nice) (Feat. Popc...,2644882,[838543],Feeling Right (Everything Is Nice) (Feat. Popc...,[GN1100],[Matoma],3
4,"[GN1802, GN1801]",20110824,그남자 그여자,2008470,[560160],그남자 그여자,[GN1800],[Jude Law],4
...,...,...,...,...,...,...,...,...,...
707984,[GN2001],19991219,The Best Best Of The Black President,65254,[166499],Coffin For Head Of State,[GN2000],[Fela Kuti],707984
707985,[GN0901],19860000,True Colors,44141,[11837],Change Of Heart,[GN0900],[Cyndi Lauper],707985
707986,"[GN0105, GN0101]",20160120,행보 2015 윤종신 / 작사가 윤종신 Live Part.1,2662866,[437],스치듯 안녕,[GN0100],[윤종신],707986
707987,"[GN1807, GN1801]",20131217,명상의 시간을 위한 뉴에이지 음악,2221722,[729868],숲의 빛,[GN1800],[Nature Piano],707987


In [19]:
# filtering dataset on a specific condition
def cleaning_dataset(df) -> pd.DataFrame:
    
    # select genre
    # 발라드, 댄스, R&B, 인디, 아이돌
    select_genre_list = ['GN0100','GN0200','GN0400','GN0500','GN2500']
    filter = df.song_gn_gnr_basket.apply(lambda x: any(genre for genre in select_genre_list if genre in x))
    df = df[filter]
    
    # except genre
    # 록/메탈, 성인가요, 포크/블루스, POP, 록/메탈, 일렉트로니카, 랩/힙합, R&B/SOUL, 포크/블루스/컨트리 ... 
    except_genre_list = ['GN600', 'GN700', 'GN800', 'GN900', 'GN1000', 'GN1100', 'GN1200', 'GN1300', 'GN1400', 
                         'GN1500', 'GN1600', 'GN1700', 'GN1800', 'GN1900', 'GN2000', 'GN2100', 'GN2200', 'GN2300', 
                         'GN2400', 'GN2600', 'GN2700', 'GN2800', 'GN2900', 'GN3000']
    
    # except detail genre, e.g., ballad in 80s ~ 90s
    except_dt_genre_list = ['GN0102', 'GN0103', 'GN0104', 'GN0202', 'GN0203', 'GN0204', 'GN0504', 'GN0507', 'GN0508']

    filter = df.song_gn_gnr_basket.apply(lambda x: any(genre for genre in except_genre_list if genre in x))
    df = df[~filter]
    
    filter = df.song_gn_dtl_gnr_basket.apply(lambda x: any(genre for genre in except_dt_genre_list if genre in x))
    df = df[~filter]

    # except outdated music (target_date: 2000.01.01)
    target_date = '20000101'
    df = df[df['issue_date'] > target_date]

    # except other outliers
    filter = df.artist_name_basket.apply(lambda x: 'Various Artists' not in x)
    df = df[filter]
    df = df[~df['album_name'].str.contains('70|80|추억의|베스트|Live|모음|컬렉션', na=False)]
    df = df[~df['song_name'].str.contains('Inst.|Ver.', na=False)]

    df.reset_index(drop=True, inplace=True)
    return df

new_df = cleaning_dataset(df)

In [21]:
new_df

Unnamed: 0,song_gn_dtl_gnr_basket,issue_date,album_name,album_id,artist_id_basket,song_name,song_gn_gnr_basket,artist_name_basket,id
0,"[GN0105, GN0101]",20170320,Pastel Reflection,10047088,[753752],"사랑, 그대라는 멜로디",[GN0100],[진호],9
1,"[GN2503, GN0205, GN2501, GN2506, GN0201]",20160226,Melting,2669407,[750053],Girl Crush,"[GN2500, GN0200]",[마마무 (Mamamoo)],17
2,"[GN0805, GN0501, GN0502, GN0801, GN0509]",20150205,내가 부른 그림 2,2303168,[230399],무얼 기다리나 (Feat. 조원선),"[GN0500, GN0800]",[이영훈],19
3,"[GN0805, GN0501, GN0502, GN0801, GN0509]",20120629,남몰래 듣기,2133128,[681291],찾고 있니,"[GN0500, GN0800]",[이호석],35
4,"[GN0509, GN0501, GN0304, GN0505, GN0301]",20140828,Clarity,2278112,[588331],Walk Alone,"[GN0500, GN0300]",[LHA],47
...,...,...,...,...,...,...,...,...,...
86149,"[GN0509, GN0502, GN0801, GN0501]",20170302,우릴 좋아하게 되고 말거야,10042593,[994512],어지러워,"[GN0500, GN0800]",[헤일],707922
86150,"[GN0105, GN0101]",20101018,미니 앨범,1044156,[1128],내안의 그대,[GN0100],[변진섭],707925
86151,"[GN0105, GN0101]",20171012,너를 그리워해,10101402,[1758500],너를 그리워해,[GN0100],[토요],707957
86152,"[GN0401, GN0402]",20110902,I Decide (Piano Remix),2010118,[202407],I Decide (Piano Remix),[GN0400],[헤리티지(Heritage)],707969


In [22]:
new_df.to_json('./json_data/cleaned_song_meta.json', orient='records')

In [23]:
with open('./json_data/cleaned_song_meta.json', 'r', encoding='UTF8') as f:
    data = json.load(f)
df = pd.json_normalize(data)

In [24]:
df

Unnamed: 0,song_gn_dtl_gnr_basket,issue_date,album_name,album_id,artist_id_basket,song_name,song_gn_gnr_basket,artist_name_basket,id
0,"[GN0105, GN0101]",20170320,Pastel Reflection,10047088,[753752],"사랑, 그대라는 멜로디",[GN0100],[진호],9
1,"[GN2503, GN0205, GN2501, GN2506, GN0201]",20160226,Melting,2669407,[750053],Girl Crush,"[GN2500, GN0200]",[마마무 (Mamamoo)],17
2,"[GN0805, GN0501, GN0502, GN0801, GN0509]",20150205,내가 부른 그림 2,2303168,[230399],무얼 기다리나 (Feat. 조원선),"[GN0500, GN0800]",[이영훈],19
3,"[GN0805, GN0501, GN0502, GN0801, GN0509]",20120629,남몰래 듣기,2133128,[681291],찾고 있니,"[GN0500, GN0800]",[이호석],35
4,"[GN0509, GN0501, GN0304, GN0505, GN0301]",20140828,Clarity,2278112,[588331],Walk Alone,"[GN0500, GN0300]",[LHA],47
...,...,...,...,...,...,...,...,...,...
86149,"[GN0509, GN0502, GN0801, GN0501]",20170302,우릴 좋아하게 되고 말거야,10042593,[994512],어지러워,"[GN0500, GN0800]",[헤일],707922
86150,"[GN0105, GN0101]",20101018,미니 앨범,1044156,[1128],내안의 그대,[GN0100],[변진섭],707925
86151,"[GN0105, GN0101]",20171012,너를 그리워해,10101402,[1758500],너를 그리워해,[GN0100],[토요],707957
86152,"[GN0401, GN0402]",20110902,I Decide (Piano Remix),2010118,[202407],I Decide (Piano Remix),[GN0400],[헤리티지(Heritage)],707969
