In [1]:
from scipy.sparse import *
import numpy as np
import pandas as pd
import sys
import math

from recsys.preprocess import *
from recsys.utility import *

In [2]:
def load_things(location, has_test = True):
    global train, test, playlists, tracks, target_tracks, target_playlists, tracks_in_playlist, tracks_target_only

    train = pd.read_csv(os.path.join(location, 'train.csv'))
    target_playlists = pd.read_csv(os.path.join(location, 'target_playlists.csv'))
    target_tracks = pd.read_csv(os.path.join(location, 'target_tracks.csv'))

    playlists = pd.read_csv('data/playlists_final.csv', delimiter='\t')
    tracks = pd.read_csv('data/tracks_final.csv', delimiter='\t')

    #tracks['tags'] = tracks['tags'].apply(lambda x: np.array(eval(x)))
    tracks.index = tracks.track_id

    tracks_in_playlist = get_playlist_track_list2(train)
    tracks_target_only = tracks[tracks.track_id.isin(target_tracks.track_id)]
    
    if has_test:
        test = pd.read_csv(os.path.join(location, 'test.csv'))

In [3]:
def load_similarity(location):
    row = []
    col = []
    data = []
    content = None
    with open(os.path.join(location, 'similarity_bpr.txt'), 'r') as f:
        content = f.readlines()

    row = list(map(int, content[1].strip().split(' ')))
    col = list(map(int, content[2].strip().split(' ')))
    data = list(map(float, content[3].strip().split(' ')))

    coo = coo_matrix((data, (row, col)), shape=(100000, 100000))
    csr = coo.tocsr()

    return csr

In [4]:
def from_num_to_id(df, row_num, column = 'track_id'):
    """ df must have a 'track_id' column """
    return df.iloc[row_num][column]

def from_id_to_num(df, tr_id, column='track_id'):
    """ df must have a 'track_id' column """
    return np.where(df[column].values == tr_id)[0][0]

In [5]:
def build_id_to_num_map(df, column):
    a = pd.Series(np.arange(len(df)))
    a.index = df[column]
    return a

def build_num_to_id_map(df, column):
    a = pd.Series(df[column])
    a.index = np.arange(len(df))
    return a

In [6]:
def load_URM():
    tr_map = build_id_to_num_map(tracks, 'track_id')
    pl_map = build_id_to_num_map(playlists, 'playlist_id')
    
    train_new = pd.DataFrame()
    train_new['track_id'] = train['track_id'].apply(lambda x : tr_map[x])
    train_new['playlist_id'] = train['playlist_id'].apply(lambda x : pl_map[x])
    
    rows = train_new['playlist_id'].values
    cols = train_new['track_id'].values
    values = np.ones(len(train_new))
    
    M = coo_matrix((values, (rows, cols)))
    return M.tocsr()
    

# Set location of the folder with all data

In [7]:
location = 'test7/'

In [8]:
load_things(location, True)

## Load URM

In [9]:
%%time
M = load_URM()

CPU times: user 18 s, sys: 116 ms, total: 18.1 s
Wall time: 18.1 s


In [None]:
from implicit.nearest_neighbours import bm25_weight
from implicit.nearest_neighbours import tfidf_weight

In [None]:
M = bm25_weight(M)

In [None]:
M = tfidf_weight(M)

In [None]:
M = M.tocsr()

In [None]:
M[0,:].data

In [None]:
M = M.tocsc()
max_pl_length = M.sum(0).A.max()
for i in range(M.shape[1]):
    n_playlist = M.indptr[i+1] - M.indptr[i]
    if n_playlist >= 1:
        M.data[M.indptr[i]:M.indptr[i+1]] = np.repeat(math.sqrt((500) / (n_playlist)), n_playlist)
    else:
        print("argh")
        M.data[M.indptr[i]:M.indptr[i+1]] = np.repeat(math.sqrt((500) / (n_playlist+5)), n_playlist)

## Prediction #1: dot product

Best result so far: 

- 1.5 0.7 0 0 0.1 0 0.01   --> 0.0735
- 1.5 0.7 0 0 0.1 0 0 --> 0.0710
- 1 0.7 0.01 0 0.3 0 0 --> 0.0669
- 1.4 0.7 0.01 0 0.3 0 0.5 --> 0.0659
- 1.4 0.7 0.01 0 0.3 0.4 0.01 --> 0.0638
- 1.5 0.7 0 0 0.1 0 0 --> 0.07108
- 1.5 0.7 0 0 0.1 0 0.02 --> 0.07346
- 1.5 0.7 0 0 0.1 0 0.1 --> 0.734
- 1.7 0.7 0 0 0.1 0 0.5 --> 0.0585

== dataset 2

- 1.5 0.7 0.1 0 0.1 0 0.01 --> 0.07207

(restoring tag idf)
- 1.5 0.7 0.1 0 0.1 0 0.01 --> 0.0742
- 1.5 0.7 0 0 0.3 0 0.01 --> 0.07166
- 1.5 0.5 0.01 0.01 0.1 0 0.01 --> 0.07165

== dataset 3
- 1.5 0.7 0.1 0 0.1 0 0.01 --> 0.0722

(using tag IDF log(100000/#tag_freq)

== dataset 5
- 1.5 0.7 0.1 0.1 0.1 0.1 0.1 --> 0.07548  (URM changed)
- 1 0.7 0.1 0.1 0.1 0.1 0.1 --> 0.0739   (URM changed)
- 1 0.7 0.1 0.1 0.1 0.1 0.1 --> 0.0715

== dataset 7
- 1.5 0.7 0.1 0.1 0.1 0.1 0.1 --> 0.0723
- 1.5 0.7 0.1 0.1 0.1 0.1 0.1 --> 0.07467 (URM changed)
- 1.5 0.7 0.1 0.1 0.1 0.1 0.1 --> 0.07585 (URM changed v.2)


In [14]:
S = load_similarity(location)
S2 = S.copy()
S2 = S2.transpose().tocsr()

In [17]:
pl2id_map = build_num_to_id_map(playlists, 'playlist_id')
tr2id_map = build_num_to_id_map(tracks, 'track_id')
pl2num_map = build_id_to_num_map(playlists, 'playlist_id')

M = M.tocsr()
predictions = {}
for pl_id in target_playlists['playlist_id'].values:
    pl_num = pl2num_map[pl_id]
    r = M[pl_num,:].dot(S2)
    idx = r.data.argsort()
    ranking = np.flip(r.indices[idx], 0)
    
    count = 0
    i = 0
    pred = []
    while count < 5 and i < len(ranking):
        tr_id = tr2id_map[ranking[i]]
        if tr_id not in tracks_in_playlist.loc[pl_id]['track_ids'] and tr_id in target_tracks['track_id'].values:
            pred.append(tr_id)
            count +=1
        i+=1
    i=0
    if (len(pred) < 5):
        print("aaaargh len < 5")
        print("{0}".format(pl_num))
    while len(pred) < 5 and i < len(ranking):
        pred.append(tr2id_map[ranking[i]])
        i+=1
    while(len(pred) < 5):
        pred.append(0)
    predictions[pl_id] = np.array(pred)

pred = pd.DataFrame()
pred['playlist_id'] = predictions.keys()
pred['track_ids'] = list(predictions.values())

In [None]:
pl2id_map = build_num_to_id_map(playlists, 'playlist_id')
tr2id_map = build_num_to_id_map(tracks, 'track_id')
pl2num_map = build_id_to_num_map(playlists, 'playlist_id')

M = M.tocsr()
predictions = {}
for pl_id in target_playlists['playlist_id'].values:
    pl_num = pl2num_map[pl_id]
    p1 = get_pred(M, S2, pl_num)
    p2 = get_pred(M, S2, W_sparse[:,pl_num].argmax(), shrink=0.1)
    
    #print("{} {} {} {}".format(p1[0], p2[0], np.intersect1d(p1[0], p2[0]), test[test['playlist_id'] == pl_id]['track_id'].values))
    
    predictions[pl_id] = blend([p1,p2])

pred = pd.DataFrame()
pred['playlist_id'] = predictions.keys()
pred['track_ids'] = list(predictions.values())

In [18]:
print(evaluate(test, pred))

0.023407608490743758


Unnamed: 0,playlist_id,track_ids
0,4112384,"[2902714, 1434261, 1187176, 1387554, 1205585]"
1,8028161,"[3033014, 65103, 3039869, 1585080, 1952328]"
2,8536068,"[3542379, 258341, 2427054, 2557632, 1805887]"
3,2785288,"[1762722, 3828446, 3668606, 2834644, 3166665]"
4,1179658,"[705903, 3436154, 3332967, 3075639, 1713721]"
5,8511632,"[2009159, 1077228, 2863395, 3718993, 1866341]"
6,11173900,"[612601, 1434491, 3866410, 2306694, 1272967]"
7,4456463,"[2398985, 1291070, 3081288, 1899264, 949534]"
8,10682385,"[1864599, 3498040, 187052, 325804, 2149194]"
9,5849106,"[570940, 2557632, 1563309, 1337470, 346924]"


In [None]:
print(evaluate(test, pred))

In [None]:
print(evaluate(test, pred))

In [None]:
print(evaluate(test, pred))

In [None]:
print(evaluate(test, pred))

In [None]:
print(evaluate(test, pred))

In [None]:
print(evaluate(test, pred))

In [None]:
print(evaluate(test, pred))

In [None]:
def blend(predictions):
    preds = predictions[0][0]
    vals = predictions[0][1]
    
    for i,p in enumerate(predictions[1][0]):
        if p in preds:
            idx, = np.where(preds==p)[0]
            vals[idx] *= (1 + 1/(i+1))**2
    
    preds = np.array(preds)
    vals = np.array(vals)
    
    
    idx = vals.argsort()
    idx = np.flip(idx, 0)
    return preds[idx[:5]]
    
    
    

In [None]:
def get_pred(URM, SIM, pl_num, shrink=1):
    global tr2id_map, tracks_in_playlist
    r = URM[pl_num,:].dot(SIM)
    idx = r.data.argsort()
    ranking = np.flip(r.indices[idx], 0)
    
    values = []
    count = 0
    i = 0
    pred = []
    while count < 10 and i < len(ranking):
        tr_id = tr2id_map[ranking[i]]
        if tr_id not in tracks_in_playlist.loc[pl_id]['track_ids']:
            pred.append(tr_id)
            values.append(r[:,ranking[i]].data[0])
            count +=1
        i+=1
    i=0
    if (len(pred) < 5):
        print("aaaargh len < 5")
        print("{0}".format(pl_num))
    while len(pred) < 5 and i < len(ranking):
        pred.append(tr2id_map[ranking[i]])
        values.append(r[:,ranking[i]].data[0])
        i+=1
    while(len(pred) < 5):
        pred.append(0)
        values.append(0)
    return (np.array(pred), np.array(values)*shrink)

In [None]:
print(evaluate(test, pred))

In [None]:
print(evaluate(test, pred))

In [None]:
print(evaluate(test, pred))

In [None]:
print(evaluate(test, pred))

In [None]:
S2[S2>5] = 5

In [None]:
len(pred)

In [None]:
pred['track_ids'] = pred['track_ids'].apply(lambda x : ' '.join(map(str, x)))

In [None]:
pred.to_csv(os.path.join(location,'results.csv'), index=False)

## Prediction #2: min distance

Best result so far: 

- 1.5 0.7 0 0 0.1 0 0.01 --> 0.06
- 1.5 0.7 0 0 0.1 0 0 --> 0.0631
- 1 0.7 0.01 0 0.3 0 0 --> 0.0628
- 1.4 0.7 0.01 0 0.3 0 0.5 --> 0.0639
- 1.4 0.7 0.01 0 0.3 0.4 0.01 --> 0.0589
- 1.5 0.7 0 0 0.1 0 0 --> 0.06317
- 1.5 0.7 0 0 0.1 0 0.02 --> 0.0644
- 1.5 0.7 0 0 0.1 0 0.1 --> 0.6441
- 1.7 0.7 0 0 0.1 0 0.5 --> 0.6509

== dataset 2

- 1.5 0.7 0.1 0 0.1 0 0.01 --> 0.0632

(restoring tag idf)
- 1.5 0.7 0.1 0 0.1 0 0.01 --> 0.06777
- 1.5 0.7 0 0 0.3 0 0.01 --> 0.0677
- 1.5 0.5 0.01 0.01 0.1 0 0.01 --> 0.06784

(using tag IDF log(100000/#tag_freq)

== dataset 5
- 1.5 0.7 0.1 0.1 0.1 0.1 0.1 --> 0.0704

In [None]:
S = load_similarity(location)
S2 = S.copy()
S2 = S2.transpose().tocsr()

In [None]:
pl2id_map = build_num_to_id_map(playlists, 'playlist_id')
tr2id_map = build_num_to_id_map(tracks, 'track_id')
pl2num_map = build_id_to_num_map(playlists, 'playlist_id')

M = M.tocsr()
predictions = {}
for pl_id in target_playlists['playlist_id'].values:
    pl_num = pl2num_map[pl_id]
    
    tmp_a = M[pl_num,:].nonzero()[1]
    tmp_c = S2[tmp_a,:]
    tmp_b = tmp_c.data.argsort()
    ranking = np.flip(tmp_c.indices[tmp_b], 0)
    
    count = 0
    i = 0
    pred = []
    while count < 5 and i < len(ranking):
        tr_id = tr2id_map[ranking[i]]
        if tr_id not in tracks_in_playlist.loc[pl_id]['track_ids']:
            pred.append(tr_id)
            count +=1
        i+=1
    i=0
    if (len(pred) < 5):
        print("aaaargh len < 5")
        print("{0}".format(pl_num))
    while len(pred) < 5:
        pred.append(0)
        i+=1
    predictions[pl_id] = np.array(pred)
pred = pd.DataFrame()
pred['playlist_id'] = predictions.keys()
pred['track_ids'] = list(predictions.values())
evaluate(test, pred)

In [None]:
0.06932

# Fabio part

In [None]:
def reduce_train(train, to_keep=0.8):
    # shuffle train index
    train.drop(range(int(len(train)*to_keep))).head()
    
#train = reduce_train(train, to_keep=0.5)

In [None]:
location = 'test1/'

In [None]:
load_things(location, True)

In [None]:
tracks_num = pd.DataFrame(tracks, copy=True)
tracks_num['track_id_tmp'] = tracks['track_id']
tracks_num['track_id'] = np.arange(len(tracks))

playlists_num = pd.DataFrame(playlists, copy=True)
playlists_num['playlist_id_tmp'] = playlists['playlist_id']
playlists_num['playlist_id'] = np.arange(len(playlists))

train_num = pd.DataFrame(train, copy=True)
train_num['playlist_id_tmp'] = train['playlist_id']
train_num['track_id_tmp'] = train['track_id']


track_to_num = build_id_to_num_map(tracks, 'track_id')
playlist_to_num = build_id_to_num_map(playlists, 'playlist_id')
num_to_tracks = build_num_to_id_map(tracks, 'track_id')


train_num['track_id'] = train['track_id'].apply(lambda x : track_to_num[x])
train_num['playlist_id'] = train['playlist_id'].apply(lambda x : playlist_to_num[x])

tracks_num.tags = tracks_num.tags.apply(lambda s: np.array(eval(s), dtype=int))
playlists_num.title = playlists_num.title.apply(lambda s: np.array(eval(s), dtype=int))

target_playlists_num = pd.DataFrame()
target_playlists_num['playlist_id_tmp'] = target_playlists['playlist_id']
target_playlists_num['playlist_id'] = target_playlists['playlist_id'].apply(lambda x : playlist_to_num[x])

target_tracks_num = pd.DataFrame()
target_tracks_num['track_id_tmp'] = target_tracks['track_id']
target_tracks_num['track_id'] = target_tracks['track_id'].apply(lambda x : track_to_num[x])

playlist_tracks = pd.DataFrame(train_num['playlist_id'].drop_duplicates())
playlist_tracks.index = train_num['playlist_id'].unique()
playlist_tracks['track_ids'] = train_num.groupby('playlist_id').apply(lambda x : x['track_id'].values)
playlist_tracks = playlist_tracks.sort_values('playlist_id')

track_playlists = pd.DataFrame(train_num['track_id'].drop_duplicates())
track_playlists.index = train_num['track_id'].unique()
track_playlists['playlist_ids'] = train_num.groupby('track_id').apply(lambda x : x['playlist_id'].values)
track_playlists = track_playlists.sort_values('track_id')

def transform_album_1(alb):
    ar = eval(alb)
    if len(ar) == 0 or (len(ar) > 0 and ar[0] == None):
        ar = [-1]
    return ar[0]

def transform_album_2(alb):
    global next_album_id
    if alb == -1:
        alb = next_album_id
        next_album_id += 1
    return alb
    
tracks_num.album = tracks_num.album.apply(lambda alb: transform_album_1(alb))

last_album = tracks_num.album.max()
next_album_id = last_album + 1

tracks_num.album = tracks_num.album.apply(lambda alb: transform_album_2(alb))

In [None]:
# User Rating Matrix URM
def get_URM(tracks, playlists, playlist_tracks, track_playlists, normalized=False):
    URM = lil_matrix((len(playlists), len(tracks)))
    num_playlists = len(playlist_tracks)

    i = 0
    
    for row in track_playlists.itertuples():
        track_id = row.track_id
        #row.playlist_ids.sort()
        nq = len(row.playlist_ids)
        for pl_id in row.playlist_ids:
            URM[pl_id,track_id] = math.log((num_playlists - nq + 0.5)/(nq + 0.5)) if normalized else 1
        if i % 10000 == 0:
            print(i)
        i += 1
    
    return URM

In [None]:
%%time
URM = get_URM(tracks_num, playlists_num, playlist_tracks, track_playlists, normalized=True)

In [None]:
URM

In [None]:
# Count distinct title tokens
token_playlists = {}
for row in playlists_num.itertuples():
    for token in row.title:
        if token in token_playlists:
            token_playlists[token].append(row.playlist_id)
        else:
            token_playlists[token] = [row.playlist_id]

In [None]:
# User Title Matrix UTM
def get_UTM(playlists, token_playlists, normalized=False):
    unique_tokens = list(token_playlists.keys())
    UTM = lil_matrix((len(playlists), max(unique_tokens)+1))
    
    num_playlists = len(playlists)
    i = 0
    
    for token,playlist_ids in token_playlists.items():
        nq = len(playlist_ids)
        for playlist_id in playlist_ids:
            UTM[playlist_id,token] = math.log((num_playlists - nq + 0.5)/(nq + 0.5)) if normalized else 1
        if i % 2000 == 0:
            print(i)
        i += 1
    
    return UTM

In [None]:
UTM = get_UTM(playlists_num, token_playlists, normalized=True)

In [None]:
UTM = UTM.tocsc()

In [None]:
res = UTM.dot(UTM.transpose())

In [None]:
res

In [None]:
import scipy.sparse as sps
def check_matrix(X, format='csc', dtype=np.float32):
    if format == 'csc' and not isinstance(X, sps.csc_matrix):
        return X.tocsc().astype(dtype)
    elif format == 'csr' and not isinstance(X, sps.csr_matrix):
        return X.tocsr().astype(dtype)
    elif format == 'coo' and not isinstance(X, sps.coo_matrix):
        return X.tocoo().astype(dtype)
    elif format == 'dok' and not isinstance(X, sps.dok_matrix):
        return X.todok().astype(dtype)
    elif format == 'bsr' and not isinstance(X, sps.bsr_matrix):
        return X.tobsr().astype(dtype)
    elif format == 'dia' and not isinstance(X, sps.dia_matrix):
        return X.todia().astype(dtype)
    elif format == 'lil' and not isinstance(X, sps.lil_matrix):
        return X.tolil().astype(dtype)
    else:
        return X.astype(dtype)

In [None]:
import scipy

class ISimilarity(object):
    """Abstract interface for the similarity metrics"""

    def __init__(self, shrinkage=10):
        self.shrinkage = shrinkage

    def compute(self, X):
        pass


class Cosine(ISimilarity):
    def compute(self, X):
        # convert to csc matrix for faster column-wise operations
        X = check_matrix(X, 'csc', dtype=np.float32)

        # 1) normalize the columns in X
        # compute the column-wise norm
        # NOTE: this is slightly inefficient. We must copy X to compute the column norms.
        # A faster solution is to  normalize the matrix inplace with a Cython function.
        Xsq = X.copy()
        Xsq.data **= 2
        norm = np.sqrt(Xsq.sum(axis=0))
        norm = np.asarray(norm).ravel()
        norm += 1e-6
        # compute the number of non-zeros in each column
        # NOTE: this works only if X is instance of sparse.csc_matrix
        col_nnz = np.diff(X.indptr)
        # then normalize the values in each column
        X.data /= np.repeat(norm, col_nnz)
        print("Normalized")

        # 2) compute the cosine similarity using the dot-product
        dist = X * X.T
        print("Computed")
        
        # zero out diagonal values
        dist = dist - sps.dia_matrix((dist.diagonal()[scipy.newaxis, :], [0]), shape=dist.shape)
        print("Removed diagonal")
        
        # and apply the shrinkage
        if self.shrinkage > 0:
            dist = self.apply_shrinkage(X, dist)
            print("Applied shrinkage")    
        
        return dist

    def apply_shrinkage(self, X, dist):
        # create an "indicator" version of X (i.e. replace values in X with ones)
        X_ind = X.copy()
        X_ind.data = np.ones_like(X_ind.data)
        # compute the co-rated counts
        co_counts = X_ind * X_ind.T
        # remove the diagonal
        co_counts = co_counts - sps.dia_matrix((co_counts.diagonal()[scipy.newaxis, :], [0]), shape=co_counts.shape)
        # compute the shrinkage factor as co_counts_ij / (co_counts_ij + shrinkage)
        # then multiply dist with it
        co_counts_shrink = co_counts.copy()
        co_counts_shrink.data += self.shrinkage
        co_counts.data /= co_counts_shrink.data
        dist.data *= co_counts.data
        return dist


In [None]:
distance = Cosine()

In [None]:
SIM = distance.compute(M)

In [None]:
SIM

In [None]:
k = 50
SIM = check_matrix(SIM, 'csr')
values, rows, cols = [], [], []
nitems = SIM.shape[0]
for i in range(nitems):
    if (i % 10000 == 0):
        print("Item %d of %d" % (i, nitems))

    this_item_weights = SIM[i,:].toarray()[0]
    top_k_idx = np.argsort(this_item_weights) [-k:]

    values.extend(this_item_weights[top_k_idx])
    rows.extend(np.arange(nitems)[top_k_idx])
    cols.extend(np.ones(k) * i)
W_sparse = sps.csc_matrix((values, (rows, cols)), shape=(nitems, nitems), dtype=np.float32)

In [None]:
WM = W_sparse.dot(M)

In [None]:
WM.data

In [None]:
W_sparse.max()

In [None]:
WM.max(1)

In [None]:
M.max(1)

In [None]:
W_sparse

In [None]:
W_sparse[:,1].max()

In [None]:
M[0,:].data

In [None]:
WM = WM + M

In [None]:
M

In [None]:
distance = Cosine(5)
pl_weights = distance.compute(UTM)        

In [None]:
pl_weights[3,:].data

In [None]:
k = 5
pl_weights = check_matrix(pl_weights, 'csr')
values, rows, cols = [], [], []
nitems = pl_weights.shape[0]
for i in range(nitems):
    if (i % 10000 == 0):
        print("Item %d of %d" % (i, nitems))

    this_item_weights = pl_weights[i,:].toarray()[0]
    top_k_idx = np.argsort(this_item_weights) [-k:]

    values.extend(this_item_weights[top_k_idx])
    rows.extend(np.arange(nitems)[top_k_idx])
    cols.extend(np.ones(k) * i)
W_sparse = sps.csc_matrix((values, (rows, cols)), shape=(nitems, nitems), dtype=np.float32)

In [None]:
M.shape

In [None]:
M2 = 1.0*URM>0

In [None]:
M2 = M2.astype('float32')

In [None]:
URM2 = W_sparse.dot(M2)

In [None]:
k = 5
URM2 = check_matrix(URM2, 'csr')
values, rows, cols = [], [], []
nitems = URM2.shape[0]
for i in range(nitems):
    if (i % 10000 == 0):
        print("Item %d of %d" % (i, nitems))

    this_item_weights = URM2[i,:].toarray()[0]
    top_k_idx = np.argsort(this_item_weights) [-k:]

    values.extend(this_item_weights[top_k_idx])
    rows.extend(np.arange(nitems)[top_k_idx])
    cols.extend(np.ones(k) * i)
URM2 = sps.csc_matrix((values, (rows, cols)), shape=(nitems, nitems), dtype=np.float32)

In [None]:
URM3 = URM2 + M2

In [None]:
URM3

In [None]:
1007372 + 3439780

In [None]:
M = URM3

In [None]:
M2 = M.copy()

In [None]:
M2 = M2.transpose().tocsr()

In [None]:
dist = Cosine()

In [None]:
S = dist.compute(M2)

In [None]:
S2 = S.copy()

In [None]:
S2

In [None]:
%%time
k = 100
S2 = check_matrix(S2, 'csr')
values, rows, cols = [], [], []
nitems = S2.shape[0]
for i in range(nitems):
    if (i % 10000 == 0):
        print("Item %d of %d" % (i, nitems))

    this_item_weights = S2[i,:].toarray()[0]
    top_k_idx = np.argsort(this_item_weights) [-k:]

    values.extend(this_item_weights[top_k_idx])
    rows.extend(np.arange(nitems)[top_k_idx])
    cols.extend(np.ones(k) * i)

S2 = sps.csc_matrix((values, (rows, cols)), shape=(nitems, nitems), dtype=np.float32)

In [None]:
t = pd.read_csv('data/train_final.csv', sep='\t')

In [None]:
ttracks = pd.read_csv('data/target_tracks.csv', sep='\t')

In [None]:
tracks = pd.read_csv('data/tracks_final.csv', sep='\t')

In [None]:
train = pd.read_csv('data/train_final.csv', sep='\t')

In [None]:
tracks['album_clear'] = tracks['album'].apply(clear_album)

In [None]:
tracks['tags_clear'] = tracks['tags'].apply(lambda x : eval(x))

In [None]:
tracks['tag_len'] = tracks['tags_clear'].apply(lambda x : len(x))

In [None]:
track_playlist_table = train.groupby('track_id').apply(lambda x : x['playlist_id'].values)

In [None]:
track_playlist_table = pd.DataFrame(track_playlist_table, columns=['playlist_ids'])
track_playlist_table['track_id'] = track_playlist_table.index

In [None]:
track_playlist_table['pl_len'] = track_playlist_table['playlist_ids'].apply(lambda x : len(x))

In [None]:
track_playlist_table['pl_len'].describe()

In [None]:
tracks['tag_len'].describe()

In [None]:
sum(tracks['tag_len']== 0)

In [None]:
sum(tracks['album_clear'].isnull())

In [None]:
tttracks = tracks[tracks.track_id.isin(ttracks.track_id)].copy()

In [None]:
sum(tttracks['tag_len'] == 0)

In [None]:
758/32000

In [None]:
2789/100000

All target tracks has author

8k over 32k tracks has null album, but this is ok according to general distribution (25% of tracks has no album)

758 tracks has no tag, but this is normal according to distribution (2% of the tracks has empty tag list)
all the tracks that has empty tag list also have empty duration and empty playcount

In [None]:
def clear_album(album_string):
    l = album_string[1:-1]
    if l == '' or l == 'None':
        return None
    else:
        return int(l)

In [None]:
tttracks['album_clear'] = tttracks['album'].apply(clear_album)

In [None]:
sum(tttracks['duration'] == -1)

In [None]:
train_original = pd.read_csv('data/train_final.csv', sep='\t')

In [None]:
train, test, target_playlists, target_tracks = train_test_split(train, test_size = 0.4, min_playlist_tracks=10)

In [None]:
import myslim

In [None]:
M = load_URM()

In [None]:
import implicit
from implicit.nearest_neighbours import bm25_weight, tfidf_weight

In [None]:
model = implicit.als.AlternatingLeastSquares(factors=1000, regularization=20, iterations=1)

In [None]:
M2 = bm25_weight(M) * 5

In [None]:
%%time
model.fit(M2)

In [None]:
A = model.user_factors
B = model.item_factors.transpose()

In [None]:
A

In [None]:
pl2id_map = build_num_to_id_map(playlists, 'playlist_id')
tr2id_map = build_num_to_id_map(tracks, 'track_id')
pl2num_map = build_id_to_num_map(playlists, 'playlist_id')

M = M.tocsr()
predictions = {}
for pl_id in target_playlists['playlist_id'].values:
    pl_num = pl2num_map[pl_id]
    r = A[pl_num,:].dot(B)
    idx = r.argsort()
    ranking = np.flip(idx, 0)
    
    count = 0
    i = 0
    pred = []
    while count < 5 and i < len(ranking):
        tr_id = tr2id_map[ranking[i]]
        if tr_id not in tracks_in_playlist.loc[pl_id]['track_ids']:
            pred.append(tr_id)
            count +=1
        i+=1
    i=0
    if (len(pred) < 5):
        print("aaaargh len < 5")
        print("{0}".format(pl_num))
    while len(pred) < 5 and i < len(ranking):
        pred.append(tr2id_map[ranking[i]])
        i+=1
    while(len(pred) < 5):
        pred.append(0)
    predictions[pl_id] = np.array(pred)

pred = pd.DataFrame()
pred['playlist_id'] = predictions.keys()
pred['track_ids'] = list(predictions.values())

In [None]:
print(evaluate(test, pred))