In [1]:
from datetime import timedelta, datetime
import glob
from itertools import chain
import json
import os
import re

import numpy as np
import pandas as pd

from wordcloud import WordCloud
import nltk
from nltk.corpus import stopwords
from konlpy.tag import Twitter
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

from pandas.plotting import register_matplotlib_converters
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm

# Use the NanumGothic font family for all matplotlib text
# (presumably so Korean labels render correctly -- confirm the font is installed).
plt.rc('font', family='NanumGothic')

# Let pandas display up to 500 rows/columns before truncating,
# and print floats with three decimal places.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.float_format', '{:.3f}'.format)

In [2]:
# Turn off pandas' chained-assignment check, so assigning into a
# filtered frame neither warns nor raises.
pd.set_option('mode.chained_assignment', None)

## 3. 학습 데이터 : `train.json`

### 데이터 불러오기

In [22]:
# Load the training playlists from train.json (UTF-8) into a DataFrame.
train = pd.read_json(
    'train.json',
    typ='frame',
    encoding='utf-8',
)

In [23]:
# Display the loaded training frame.
train

Unnamed: 0,tags,id,plylst_title,songs,like_cnt,updt_date
0,[락],61281,여행같은 음악,"[525514, 129701, 383374, 562083, 297861, 13954...",71,2013-12-19 18:36:19.000
1,"[추억, 회상]",10532,요즘 너 말야,"[432406, 675945, 497066, 120377, 389529, 24427...",1,2014-12-02 16:19:42.000
2,"[까페, 잔잔한]",76951,"편하게, 잔잔하게 들을 수 있는 곡.-","[83116, 276692, 166267, 186301, 354465, 256598...",17,2017-08-28 07:09:34.000
3,"[연말, 눈오는날, 캐럴, 분위기, 따듯한, 크리스마스캐럴, 겨울노래, 크리스마스,...",147456,크리스마스 분위기에 흠뻑 취하고 싶을때,"[394031, 195524, 540149, 287984, 440773, 10033...",33,2019-12-05 15:15:18.000
4,[댄스],27616,추억의 노래 ㅋ,"[159327, 553610, 5130, 645103, 294435, 100657,...",9,2011-10-25 13:54:56.000
...,...,...,...,...,...,...
115066,"[록메탈, 밴드사운드, 록, 락메탈, 메탈, 락, extreme]",120325,METAL E'SM #2,"[429629, 441511, 612106, 516359, 691768, 38714...",3,2020-04-17 04:31:11.000
115067,[일렉],106976,빠른 리스너를 위한 따끈따끈한 최신 인기 EDM 모음!,"[321330, 216057, 534472, 240306, 331098, 23288...",13,2015-12-24 17:23:19.000
115068,"[담시, 가족, 눈물, 그리움, 주인공, 나의_이야기, 사랑, 친구]",11343,#1. 눈물이 앞을 가리는 나의_이야기,"[50512, 249024, 250608, 371171, 229942, 694943...",4,2019-08-16 20:59:22.000
115069,"[잔잔한, 버스, 퇴근버스, Pop, 풍경, 퇴근길]",131982,퇴근 버스에서 편히 들으면서 하루를 마무리하기에 좋은 POP,"[533534, 608114, 343608, 417140, 609009, 30217...",4,2019-10-25 23:40:42.000


In [24]:
# Summarize the columns of train: dtypes, non-null counts and memory usage.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 115071 entries, 0 to 115070
Data columns (total 6 columns):
tags            115071 non-null object
id              115071 non-null int64
plylst_title    115071 non-null object
songs           115071 non-null object
like_cnt        115071 non-null int64
updt_date       115071 non-null object
dtypes: int64(2), object(4)
memory usage: 5.3+ MB


- 학습 데이터에는 **총 115,071개 플레이리스트 정보가 담겨 있으며**, 포함하는 값은 아래와 같습니다.
  + `tags` : 플레이리스트에 매핑된 태그
  + `id` : 플레이리스트 아이디
  + `plylst_title` : 플레이리스트 명
  + `songs` : 플레이리스트 내 수록된 곡 아이디
  + `like_cnt` : 플레이리스트 좋아요 횟수
  + `updt_date` : 플레이리스트 업데이트 일시

In [35]:
# Unnest train into one row per (playlist, song) pair: plylst_song_map.
# Each playlist's id and updt_date are repeated once for every song it holds.

# Number of songs in each playlist, in row order.
songs_per_plylst = train['songs'].map(len).values.astype(np.int64)

plylst_song_map = pd.DataFrame(
    {
        'id': np.repeat(train['id'].values, songs_per_plylst),
        # Flatten with chain.from_iterable rather than np.concatenate:
        # concatenate upcasts every song id to float as soon as one playlist
        # has an empty song list, which would turn ids into '525514.0'.
        'songs': list(chain.from_iterable(train['songs'])),
        'updt_date': np.repeat(train['updt_date'].values, songs_per_plylst),
    },
    columns=['id', 'songs', 'updt_date'],
).astype(str)  # all three columns are kept as strings

# Drop the helper array; only plylst_song_map is needed afterwards.
del songs_per_plylst

In [36]:
# Display the unnested playlist-song frame.
plylst_song_map

Unnamed: 0,id,songs,updt_date
0,61281,525514,2013-12-19 18:36:19.000
1,61281,129701,2013-12-19 18:36:19.000
2,61281,383374,2013-12-19 18:36:19.000
3,61281,562083,2013-12-19 18:36:19.000
4,61281,297861,2013-12-19 18:36:19.000
...,...,...,...
5285866,100389,111365,2020-04-18 20:35:06.000
5285867,100389,51373,2020-04-18 20:35:06.000
5285868,100389,640239,2020-04-18 20:35:06.000
5285869,100389,13759,2020-04-18 20:35:06.000


In [39]:
# updt_date is a string that starts with the four-digit year,
# so the year is its first four characters parsed as an integer.
plylst_song_map['updt_year'] = (
    plylst_song_map['updt_date'].str[:4].astype(np.int64)
)
plylst_song_map

Unnamed: 0,id,songs,updt_date,updt_year
0,61281,525514,2013-12-19 18:36:19.000,2013
1,61281,129701,2013-12-19 18:36:19.000,2013
2,61281,383374,2013-12-19 18:36:19.000,2013
3,61281,562083,2013-12-19 18:36:19.000,2013
4,61281,297861,2013-12-19 18:36:19.000,2013
...,...,...,...,...
5285866,100389,111365,2020-04-18 20:35:06.000,2020
5285867,100389,51373,2020-04-18 20:35:06.000,2020
5285868,100389,640239,2020-04-18 20:35:06.000,2020
5285869,100389,13759,2020-04-18 20:35:06.000,2020


In [47]:
# Keep only the playlist-song rows whose update year is 2004.
is_2004 = plylst_song_map['updt_year'].eq(2004)
plst2004 = plylst_song_map.loc[is_2004]
del is_2004
plst2004

Unnamed: 0,id,songs,updt_date,updt_year
2448042,10350,279334,2004-12-31 15:22:50.000,2004
2448043,10350,48316,2004-12-31 15:22:50.000,2004
2448044,10350,163468,2004-12-31 15:22:50.000,2004
2448045,10350,374850,2004-12-31 15:22:50.000,2004
2448046,10350,220219,2004-12-31 15:22:50.000,2004
2448047,10350,199522,2004-12-31 15:22:50.000,2004
2448048,10350,139815,2004-12-31 15:22:50.000,2004
2448049,10350,689793,2004-12-31 15:22:50.000,2004
2448050,10350,70506,2004-12-31 15:22:50.000,2004
2448051,10350,24423,2004-12-31 15:22:50.000,2004


In [57]:
# Count how often each song occurs in the 2004 rows (most frequent first)
# and label the counts with the year.
df = (
    plst2004['songs']
    .value_counts()
    .rename_axis('song_id')
    .reset_index(name='count')
)
df['year'] = 2004
df

Unnamed: 0,song_id,count,year
0,623836,2,2004
1,241793,2,2004
2,178796,2,2004
3,639607,2,2004
4,651332,2,2004
5,682996,2,2004
6,691353,2,2004
7,48677,2,2004
8,289113,2,2004
9,325455,2,2004


In [61]:
# For every update year from 2004 through 2020, count how often each song
# occurs and write the counts to top<year>.csv.
# After the loop, df holds the counts of the last year processed (2020).
for year in range(2004, 2021):
    tmp = plylst_song_map.loc[plylst_song_map['updt_year'].eq(year)]
    df = (
        tmp['songs']
        .value_counts()
        .rename_axis('song_id')
        .reset_index(name='count')
    )
    df['year'] = year
    df.to_csv('top{}.csv'.format(year), index=False, encoding='utf-8')

In [62]:
# Display df as left behind by the export loop, i.e. the counts for 2020.
df

Unnamed: 0,song_id,count,year
0,215411,1132,2020
1,680366,1121,2020
2,678762,1106,2020
3,235773,1092,2020
4,648628,1074,2020
...,...,...,...
220968,264753,1,2020
220969,626593,1,2020
220970,628289,1,2020
220971,584592,1,2020


In [65]:
# Check the data: read back one of the exported files.
pd.read_csv('top2010.csv', encoding='utf-8')

Unnamed: 0,song_id,count,year
0,482903,38,2010
1,109574,37,2010
2,287003,35,2010
3,117595,34,2010
4,453055,32,2010
...,...,...,...
47173,676889,1,2010
47174,562264,1,2010
47175,601544,1,2010
47176,408946,1,2010


<br>