# Recommendation

Make recommendations for a (partial) playlist by scoring each song $m$ as follows:
1. If the playlist has no seed songs, score song $m$ by its popularity, i.e., the total number of its occurrences in all playlists in the MPD.
2. Otherwise, first score each playlist $n$ in the training set against all seed songs (denoted by $\widetilde m$) of the (partial) playlist as $\sum_{\widetilde m} \mathbf{p}_n^\top \mathbf{s}_{\widetilde m}$, where $\mathbf{p}_n$ is the latent feature vector of training playlist $n$; then take the $100$ top-scored training playlists $\left\{\mathbf{p}_k \right\}_{k=1}^{100}$ and score song $m$ by
$$
b_m + \sum_{k} \mathbf{p}_k^\top \mathbf{s}_m
$$
where $\mathbf{s}_m$ and $b_m$ are the latent feature vector and bias of song $m$, respectively.

Lastly, we take the 500 top-scored songs (excluding the seed songs) as the recommendation.

In [None]:
import os
import gzip
import torch
import numpy as np
import pickle as pkl
from tqdm import tqdm
from scipy.sparse import isspmatrix_csr

In [None]:
# Dimension of the latent feature vectors
feature_dim = 200

In [None]:
# All input and output files live in one directory; the parameter file and the
# submission file are tagged with the latent feature dimension.
data_dir = 'data'
fparam, fchallenge, fmftrain, fsong2pop, fsubmit = (
    os.path.join(data_dir, fname)
    for fname in (
        'mf-%d-mpd.npy' % feature_dim,
        'challenge_set.pkl.gz',
        'mftrain_mpd.pkl.gz',
        'song2pop_mpd.pkl.gz',
        'submit-%d.csv' % feature_dim,
    )
)

In [None]:
# Load the per-song popularity table. The context manager closes the gzip
# handle as soon as unpickling finishes.
# NOTE: pickle can execute arbitrary code; only load files from a trusted source.
with gzip.open(fsong2pop, 'rb') as fd:
    song2pop = pkl.load(fd)

# Song ids ordered from most to least popular; ties are broken by song id so
# that the ordering is deterministic.
song_sorted = sorted(song2pop, key=lambda sid: (-song2pop[sid], sid))

In [None]:
# Load the training matrix and the song id -> column index mapping. The context
# manager closes the gzip handle as soon as unpickling finishes.
# NOTE: pickle can execute arbitrary code; only load files from a trusted source.
with gzip.open(fmftrain, 'rb') as fd:
    Y, song2index = pkl.load(fd)

# Inverse mapping, used to turn predicted indices back into song ids
index2song = {ix: sid for sid, ix in song2index.items()}
assert isspmatrix_csr(Y)

Load trained parameters

In [None]:
# The flat parameter vector is laid out as [ S (D*M) | b (M) | P (N*D) ],
# where Y has N rows and M columns and D is the latent feature dimension.
w = np.load(fparam, allow_pickle=False)
N, M = Y.shape
D = feature_dim

song_end = D * M           # end of the song-feature segment
bias_end = song_end + M    # end of the bias segment

S = w[:song_end].reshape(D, M)
b = w[song_end:bias_end].reshape(1, M)
P = w[bias_end:].reshape(N, D)

Transfer parameters to GPU if available

In [None]:
if torch.cuda.is_available():
    # Move all three parameter arrays to the first CUDA device as tensors;
    # without CUDA they stay numpy arrays.
    device = torch.device('cuda:0')
    S, b, P = (torch.from_numpy(arr).to(device) for arr in (S, b, P))

Load challenge set

In [None]:
# Load the challenge set; the context manager closes the gzip handle as soon
# as unpickling finishes.
# NOTE: pickle can execute arbitrary code; only load files from a trusted source.
with gzip.open(fchallenge, 'rb') as fd:
    task_dict = pkl.load(fd)

Make prediction

In [None]:
pred_dict = dict()
NUM_REC = 500  # number of songs recommended per playlist
k = 100        # number of top-scored training playlists used to score songs

# S, b and P are torch tensors only when CUDA is available (they are numpy
# arrays otherwise), so the backend is chosen once rather than per query.
use_gpu = torch.cuda.is_available()

for task in range(1, 11):
    for query in tqdm(task_dict[task]):
        pid = query['pid']
        tracks = query['tracks']
        if len(tracks) == 0:
            # No seed songs: fall back to the most popular songs.
            pred_dict[pid] = song_sorted[:NUM_REC]
            continue

        seed_indices = [song2index[sid] for sid in tracks]
        if use_gpu:
            # Sum of the latent vectors of all seed songs, shape (D, 1)
            seed_vec = S[:, seed_indices].sum(dim=1).view(D, 1)
            # Indices of the k training playlists that score highest on the seeds
            _, pix = torch.mm(P, seed_vec).view(-1).topk(k)
            # Sum of the latent vectors of those playlists, shape (1, D)
            phi = P[pix, :].sum(dim=0).view(1, D)
            scores = torch.mm(phi, S).view(-1) + b.view(-1)
            _, ix = torch.sort(-scores, dim=-1)
            indices = ix[:2 * NUM_REC].cpu().numpy()
        else:
            seed_vec = S[:, seed_indices].sum(axis=1).reshape(D, 1)
            pix = np.argpartition(np.dot(P, seed_vec).reshape(-1), -k)[-k:]
            phi = P[pix, :].sum(axis=0).reshape(1, D)
            # numpy arrays need np.dot here; torch.mm only accepts tensors
            scores = np.dot(phi, S).reshape(-1) + b.reshape(-1)
            indices = np.argsort(-scores)[:2 * NUM_REC]

        # 2 * NUM_REC candidates are fetched so that NUM_REC can remain after the
        # seed songs are removed (assumes a query has at most NUM_REC seeds).
        seed_set = set(tracks)
        songs = [index2song[ix] for ix in indices if index2song[ix] not in seed_set]
        pred_dict[pid] = songs[:NUM_REC]

Generate submission file

In [None]:
def gen_submission(fout, pred_dict):
    """Write the predictions to a CSV submission file.

    The file starts with a team-info header line, followed by one line per
    playlist in ascending ``pid`` order of the form
    ``pid,spotify:track:<id>,spotify:track:<id>,...``. Lines are separated by
    ``\\n`` and no trailing newline is written.

    Args:
        fout: Output path; must end with ``.csv``.
        pred_dict: Mapping from playlist id to the sequence of recommended
            track ids; must contain exactly 10,000 playlists.

    Raises:
        AssertionError: If ``pred_dict`` does not have 10,000 entries or
            ``fout`` does not end with ``.csv``.
    """
    assert len(pred_dict) == int(1e4)
    assert fout.endswith('.csv')
    header = 'team_info,main,dchen,u5708856@anu.edu.au'
    prefix = 'spotify:track'
    lines = [header]
    for pid in sorted(pred_dict):
        tracks = pred_dict[pid]
        line = [str(pid)] + ['%s:%s' % (prefix, track) for track in tracks]
        lines.append(','.join(line))
    with open(fout, 'w') as fd:
        # One write of the joined text; writelines() on a str would iterate it
        # character by character.
        fd.write('\n'.join(lines))
    print('{:,} lines written.'.format(len(lines)))

In [None]:
# Write the predictions for all challenge playlists to the submission CSV
gen_submission(fout=fsubmit, pred_dict=pred_dict)

Compress submission file

In [None]:
# IPython shell escape: run gzip on the submission file whose path is held in
# the Python variable fsubmit
!gzip $fsubmit