# CF 성능 확인을 위한 코드
inspired by: https://gist.github.com/tgsmith61591/ce7d614d7a0442f94cd5ae5d1e51d3c2

### load library

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [14]:
from scipy import sparse

In [2]:
from tqdm.notebook import tqdm

### load dataframe

In [176]:
song_meta = pd.read_json("../data/song_meta.json")
train_original = pd.read_json("../data/train.json")

In [177]:
train_original['num_songs'] = train_original['songs'].map(lambda x: len(x))

In [178]:
# sample 코드 돌리는 것 편의상
train_original.sort_values(by='id', inplace=True)

In [179]:
# sample 1000개 추출
train = train_original.iloc[:1000].copy()

In [180]:
DTYPE = np.float64
NUM_PLYSTS = train.id.max()
NUM_SONGS = 707989

### simple EDA

In [25]:
train_original['num_songs'].describe()

count    115071.000000
mean         45.935735
std          43.950335
min           1.000000
25%          19.000000
50%          30.000000
75%          54.000000
max         200.000000
Name: num_songs, dtype: float64

# cf score 1

Standard CF model scoring method

# cf score 2

melon music continuation 문제에 맞춘 score method

1. train/test data split
2. test data set에서 각 playlist마다 일정 부분 masking
3. train data로 test data의 각 playlist마다 100개 곡 예측
4. nDCG score 계산

### cf_train_test_split

In [11]:
### u, i, r 각각 길이가 같은지 확인 ###
def check_consistent_length(u, i, r):
    from sklearn.utils import validation as skval
    
    skval.check_consistent_length(u, i, r)
    return np.asarray(u), np.asarray(i), np.asarray(r, dtype=DTYPE)

In [94]:
### item based or user based 변경 및 csr matrix 생성 함수 ###
def to_sparse_csr(u, i, r, axis=0, dtype=DTYPE):
    if axis not in (0,1):
        raise ValueError('axis must be an int in (0, 1)')
    
    rows = u if axis == 0 else i
    cols = i if axis == 0 else u
    data = r
    
    data, rows, cols = (np.asarray(x) for x in (data, rows, cols))
    shape = (NUM_PLYSTS+1, NUM_SONGS)
    
    return sparse.csr_matrix((data, (rows, cols)), shape=shape, dtype=dtype)

In [137]:
### test set에서 가릴 위치를 담은 test_mask 생성 ###
### 단, 각 plyst마다 seed가 될 곡 하나는 꼭 남겨두는 mask ###
def get_stratified_te_mask(u, i, mask_size, random_state):
    from sklearn.utils.validation import check_random_state
    random_state = check_random_state(random_state)
    n_events = u.shape[0]
    
    test_mask = random_state.rand(n_events) <= (1-mask_size)
    
    for array in (u, i):
        present = array[test_mask]
        test_vals = array[~test_mask]
        missing = np.unique(test_vals[np.where(~np.in1d(test_vals, present))[0]])
        
        if missing.shape[0] == 0:
            continue
            
        array_mask_missing = np.in1d(array, missing)
        where_missing = np.where(array_mask_missing)[0]
        
        added = set()
        for idx, val in zip(where_missing, array[where_missing]):
            if val in added:
                continue
            
            test_mask[idx] = True
            added.add(val)
    
    return test_mask

In [138]:
### 최종적으로 CF를 만들고 성능을 확인할 train csr matrix, test csr matrix ###
def make_sparse_tr(users, items, ratings):
    train_csr = to_sparse_csr(u=users, i=items, r=ratings, axis=0)
    return train_csr
    
def make_sparse_te(users, items, ratings, test_mask):
    test_csr = to_sparse_csr(u=users[test_mask], i=items[test_mask], r=ratings[test_mask], axis=0)
    return test_csr

In [139]:
def input_for_csr(dataframe):
    if 'id' not in dataframe.columns:
        raise NameError("dataframe must have column name 'id'.")
    if 'songs' not in dataframe.columns:
        raise NameError("dataframe must have column name 'songs'.")
        
    data, rows, cols = [], [], []

    for plyst_id, songs in dataframe[['id','songs']].values:
        for song_id in songs:
            rows.append(plyst_id)
            cols.append(song_id)
            data.append(1)
    return rows, cols, data # u, i, r

In [190]:
### 최종으로 쓸 function ###
def cf_train_test_split(data, test_size=0.3, mask_size=0.7, random_state=None):
    from sklearn.model_selection import train_test_split
    train_set, test_set = train_test_split(data, test_size=test_size, shuffle=True, random_state=random_state)
    print('train set: ', len(train_set), 'plysts')
    print('[train] min songs in one plyst: ', train_set.num_songs.min())
    print('test set: ', len(test_set), 'plysts')
    print('[test] min songs in one plyst: ', test_set.num_songs.min())
    
    # check_consistent_length - input: u, i, r
    # output: users, items, ratings
    u, i, r = input_for_csr(train_set)
    tr_rows, tr_cols, tr_data = check_consistent_length(u, i, r)
    u, i, r = input_for_csr(test_set)
    te_rows, te_cols, te_data = check_consistent_length(u, i, r)
    
    test_mask = get_stratified_te_mask(u=te_rows, i=te_cols, mask_size=mask_size, random_state=random_state)
    _, counts = np.unique(test_mask, return_counts=True)
    print('---')
    print('masked: ', counts[0])
    print('unmasked: ', counts[1])
    
    return make_sparse_tr(tr_rows, tr_cols, tr_data), make_sparse_te(te_rows, te_cols, te_data, test_mask)

In [191]:
tr_csr, te_csr = cf_train_test_split(train, test_size=0.3, mask_size=0.7, random_state=42)

train set:  700 plysts
[train] min songs in one plyst:  3
test set:  300 plysts
[test] min songs in one plyst:  4
---
masked:  1900
unmasked:  10893


### nDCG score

In [192]:
class CustomEvaluator:
    
    def _idcg(self, l):
        return sum((1.0 / np.log(i + 2) for i in range(l)))

    def __init__(self):
        self._idcgs = [self._idcg(i) for i in range(101)]

    def _ndcg(self, gt, rec):
        dcg = 0.0
        for i, r in enumerate(rec):
            if r in gt:
                dcg += 1.0 / np.log(i + 2)

        return dcg / self._idcgs[len(gt)]

    def _eval(self, gt_fname, rec_fname):
        gt_playlists = load_json(gt_fname)
        gt_dict = {g["id"]: g for g in gt_playlists}
        rec_playlists = load_json(rec_fname)
        
        music_ndcg = 0.0
        tag_ndcg = 0.0

        for rec in rec_playlists:
            gt = gt_dict[rec["id"]]
            music_ndcg += self._ndcg(gt["songs"], rec["songs"][:100])
            tag_ndcg += self._ndcg(gt["tags"], rec["tags"][:10])

        music_ndcg = music_ndcg / len(rec_playlists)
        tag_ndcg = tag_ndcg / len(rec_playlists)
        score = music_ndcg * 0.85 + tag_ndcg * 0.15

        return music_ndcg, tag_ndcg, score

    def evaluate(self, gt_fname, rec_fname):
        try:
            music_ndcg, tag_ndcg, score = self._eval(gt_fname, rec_fname)
            print(f"Music nDCG: {music_ndcg:.6}")
            print(f"Tag nDCG: {tag_ndcg:.6}")
            print(f"Score: {score:.6}")
        except Exception as e:
            print(e)

In [None]:
# # example
# evaluator = CustomEvaluator()
# evaluator.evaluate("arena_data/answers/val.json", "arena_data/results/results.json")